In [25]:
import sys
sys.path.append('/workspaces/quantum/')
from colors import Bcolors as bc
print(sys.executable)

import pandas as pd

/workspaces/quantum/venv/bin/python


# Looking at the Data

Load the data:

In [26]:
train = pd.read_csv('../data/train.csv')

# Dataset Shape
num_rows = train.shape[0]
num_columns = train.shape[1]
print(f"train.csv has {bc.BLUE} {num_rows} rows {bc.ENDC} and {bc.PURPLE} {num_columns} columns{bc.ENDC}.")

train.csv has [34m 891 rows [0m and [35m 12 columns[0m.


In [27]:
train.info()
train.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Preprocessing the Data

## 1. Missing Values ⭕️
### Approach 1
You only have two passengers without an `Embarked` location so lets drop them.

In [28]:
train = train.dropna(subset=['Embarked'])
num_rows = train.shape[0]
num_columns = train.shape[1]
# Dataset Shape
num_rows = train.shape[0]
num_columns = train.shape[1]
print(f"Now, train.csv has {bc.BLUE} {num_rows} rows {bc.ENDC} and {bc.PURPLE} {num_columns} columns{bc.ENDC}.")

Now, train.csv has [34m 889 rows [0m and [35m 12 columns[0m.


### Approach 2
You have very little information about the cabin, so lets drop it

In [29]:
train = train.drop("Cabin", axis=1)
num_rows = train.shape[0]
num_columns = train.shape[1]
print(f"\nAnd now, train.csv has {bc.BLUE}{num_rows} rows {bc.ENDC} and {bc.PURPLE}{num_columns} columns{bc.ENDC}.")


And now, train.csv has [34m889 rows [0m and [35m11 columns[0m.


### Approach 3
The age category is omitted often. But intuition tells us it might be important.

In [30]:
mean = train["Age"].mean()
train["Age"] = train["Age"].fillna(mean)
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


## 2. Identifiers ℹ️

### Perfect Identifiers 1️⃣

In [31]:
# Perfect identifiers
num_unique_passengerIDs = train["PassengerId"].nunique()
num_unique_names = train["Name"].nunique()
print(f"\nThere are {bc.GREEN}{num_unique_passengerIDs}{bc.ENDC} different (unique) {bc.GREEN}PassengerIDs{bc.ENDC} in the data.")
print(f"\nThere are {bc.GREEN}{num_unique_names}{bc.ENDC} different (unique) {bc.GREEN}names{bc.ENDC} in the data.")



There are [32m889[0m different (unique) [32mPassengerIDs[0m in the data.

There are [32m889[0m different (unique) [32mnames[0m in the data.


### Imperfect Identifiers ⭕️

In [32]:
# Imperfect identifier
num_unique_tickets = train["Ticket"].nunique()
print(f"\nThere are {bc.GREEN}{num_unique_tickets}{bc.ENDC} different (unique) {bc.GREEN}ticket numbers{bc.ENDC} in the data")


There are [32m680[0m different (unique) [32mticket numbers[0m in the data


In [33]:
train = train.drop("PassengerId", axis=1)
train = train.drop("Name", axis=1)
train = train.drop("Ticket", axis=1)

num_rows = train.shape[0]
num_columns = train.shape[1]
print(f"\nNow, the train dataset has {bc.BLUE}{num_rows} rows {bc.ENDC} and {bc.PURPLE}{num_columns} columns{bc.ENDC}.")


Now, the train dataset has [34m889 rows [0m and [35m8 columns[0m.


# 3. Handling Text and Categorical Attributes

In [34]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

for col in ['Sex', 'Embarked']:
    le.fit(train[col])
    train[col] = le.transform(train[col])
    
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.925,2
3,1,1,0,35.0,1,0,53.1,2
4,0,3,1,35.0,0,0,8.05,2


# Feature Scaling

In [35]:
max_age = train["Age"].max()
max_fare = train["Fare"].max()
print(f"The {bc.OKCYAN}maximum age{bc.ENDC} is {bc.OKCYAN}{max_age}{bc.ENDC}")
print(f"The {bc.OKCYAN}maximum fare{bc.ENDC} is {bc.OKCYAN}{max_fare}{bc.ENDC}")


The [96mmaximum age[0m is [96m80.0[0m
The [96mmaximum fare[0m is [96m512.3292[0m


In [48]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(train)   # the scaler returns a NumPy-array instead of a Pandas DataFrame!
train = scaler.transform(train)

print(f"The minimum value is {bc.OKCYAN}{train.min()}{bc.ENDC} and the maximum value is {bc.OKCYAN}{train.max()}{bc.ENDC}")

The minimum value is [96m0.0[0m and the maximum value is [96m1.0[0m


# 5. Training and Testing
Includes...
- Data preparation and cleaning
- Separating input from labels (outputs) and training sets from testing sets

In [37]:
from sklearn.model_selection import train_test_split

input_data = train[:, 1:8]
labels = train[:, 0]
train_input, test_input, train_labels, test_labels = train_test_split(input_data, labels, test_size=0.2)

num_training_rows = train_input.shape[0]
num_testing_rows = test_input.shape[0]
num_training_columns = train_input.shape[1]
print(f"We have {bc.BLUE}{num_training_rows} training rows{bc.ENDC} and {bc.BLUE}{num_testing_rows} testing rows{bc.ENDC}")
print()
print(f"There are {bc.PURPLE}{num_training_columns} input columns{bc.ENDC}")

We have [34m711 training rows[0m and [34m178 testing rows[0m

There are [35m7 input columns[0m


# 6. Save Preprocessed Data to the File System 💾

In [38]:
import numpy as np 

with open('train.npy', 'wb') as f:
    np.save(f, train_input)
    np.save(f, train_labels)
    
with open('test.npy', 'wb') as f:
    np.save(f, test_input)
    np.save(f, test_labels)

# 7. Baseline 🔰

## Random Classifier 🪙

In [39]:
import random
random.seed(a=None, version=2)

def classify(passenger):
    return random.randint(0, 1)

# The classification runner
def run(f_classify, x):
    return list(map(f_classify, x))

# Run the classifier
result = run(classify, train_input)
print(result)

[0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 

### Evaluate the Classifier
#### 📙 Accuracy measure
$$ \text{Accuracy} = \frac{\sum{\text{True Positives}}+\sum{\text{True Negatives}}}{\text{Total Population}}

In [40]:
def evaluate(predictions, actual):
    correct = list(filter(
        lambda item: item[0] == item[1],
        list(zip(predictions, actual))
    ))
    return f"\n{bc.GREEN}{len(correct)}{bc.ENDC} correct predictions out of {bc.BLUE}{len(actual)}{bc.ENDC}. Accuracy {bc.GREEN}{100*len(correct)/len(actual):.0f}% {bc.ENDC}."

print(evaluate(run(classify, train_input), train_labels))


[32m357[0m correct predictions out of [34m711[0m. Accuracy [32m50% [0m.


## ii) Always predict a passenger died 💯

In [41]:
def predict_death(item):
    return 0

print(f"Always predict a passenger died:", 
      evaluate(run(predict_death, train_input), train_labels))

Always predict a passenger died: 
[32m437[0m correct predictions out of [34m711[0m. Accuracy [32m61% [0m.


### Confusion matrix of the predict death classifier
Confusion matrices provide more detailed measures of classifier performance compared to just accuracy.
- Precision
- Recall
- Specificity
- Negative predictive value (NPV)

In [42]:
# Classifier evaluation
from sklearn.metrics import confusion_matrix
predictions = run(predict_death, train_input)
print(confusion_matrix(train_labels, predictions))

[[437   0]
 [274   0]]


#### 📙Precision = accuracy of positive predictions
$$ \text{Precision} = \frac{\sum{\text{True Positives}}}{\sum{\text{All Predicted Positives}}} $$

*Note: Since we didn't have a single positive prediction in the predict_death classifier, our precision is undefined. Scikit-Learn will interpret this as a precision score of 0.0. We will observe a similar result for the recall and specificity scores.

In [43]:
from sklearn.metrics import precision_score
print(f"The {bc.GREEN}precision score{bc.ENDC} of the predict_death classifier is {bc.GREEN}{precision_score(train_labels, predictions, zero_division=0)}{bc.ENDC}")

The [32mprecision score[0m of the predict_death classifier is [32m0.0[0m


#### 📙Recall = accuracy of actual positives
$$ \text{Recall} = \frac{\sum{\text{True Positives}}}{\sum{\text{All Actual Positives}}} $$

In [44]:
from sklearn.metrics import recall_score
print(f"The {bc.GREEN}recall score{bc.ENDC} of the predict_death classifier is {bc.GREEN}{recall_score(train_labels, predictions)}{bc.ENDC}")

The [32mrecall score[0m of the predict_death classifier is [32m0.0[0m


#### 📙Specificity = accuracy of actual negatives
$$ \text{Specificity} = \frac{\sum{\text{True Negatives}}}{\sum{\text{All Actual Negatives}}} $$

The function `specificty` (defined below) takes the confusion matrix as a parameter
- True negatives = `(matrix[0][0])`
- False positives = `(matrix[0][1])`

Note: The specificity score for the a model that always predits death will be 1.0 since it only predicts deaths.

In [45]:
# Specificity 
def specificity(matrix):
    return matrix[0][0]/(matrix[0][0]+matrix[0][1]) if (matrix[0][0]+matrix[0][1] > 0) else 0

cm = confusion_matrix(train_labels, predictions)
print(f"The {bc.GREEN}specificity score{bc.ENDC} of the predict_death classifier is {bc.GREEN}{specificity(cm):.2f}{bc.ENDC}")

The [32mspecificity score[0m of the predict_death classifier is [32m1.00[0m


#### 📙NPV = accuracy of negative predictions
$$ \text{NPV} = \frac{\sum{\text{True Negatives}}}{\sum{\text{All Predicted Negatives}}} $$

In [46]:
# NPV
def npv(matrix):
    return matrix[0][0]/(matrix[0][0]+matrix[1][0]) if (matrix[0][0]+matrix[1][0] > 0) else 0

print(f"The {bc.GREEN}NPV score{bc.ENDC} of the predict_death classifier is {bc.GREEN}{npv(cm):.2f}{bc.ENDC}")

The [32mNPV score[0m of the predict_death classifier is [32m0.61[0m


# Confusion matrix of the Random Classifier

In [47]:
#Scores of the random classifier
# from sklearn.metrics import precision_score
# from sklearn.metrics import recall_score

# def specificity(matrix):
#    return matrix[0][0]/(matrix[0][0]+matrix[0][1]) if (matrix[0][0]+matrix[0][1] > 0) else 0

# def npv(matrix):
#    return matrix[0][0]/(matrix[0][0]+matrix[1][0]) if (matrix[0][0]+matrix[1][0] > 0) else 0

random_predictions = run(classify, train_input)
random_cm = confusion_matrix(train_labels, random_predictions)

print('The precision score of the random classifier is {} {} {} \n'
      .format(bc.GREEN, precision_score(train_labels, random_predictions, zero_division=0), bc.ENDC))

print('The recall score of the random classifier is {} {} {} \n'
      .format(bc.GREEN, recall_score(train_labels, random_predictions), bc.ENDC))

print('The specificity score of the random classifier is {} {:.2f} {} \n'
      .format(bc.GREEN,specificity(random_cm), bc.ENDC))

print('The NPV score of the random classifier is {} {:.2f} {}'
      .format(bc.GREEN,npv(random_cm), bc.ENDC))

The precision score of the random classifier is [32m 0.3545706371191136 [0m 

The recall score of the random classifier is [32m 0.46715328467153283 [0m 

The specificity score of the random classifier is [32m 0.47 [0m 

The NPV score of the random classifier is [32m 0.58 [0m
