In [86]:
import seaborn as sns #this is the plotting library I'll be using 
import pandas as pd #"as pd" means that we can use the abbreviation in commands
import matplotlib.pyplot as plt #we need Matplotlib for setting the labels in the Seaborn graphs
import math
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Read the Titanic passenger table from the CSV one directory up and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='../titanic.csv')
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Cabin
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,C85
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,C123
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,


In [87]:
#Creating dummy (indicator) variables for the qualitative 'Sex' variable.
#dtype=int pins the indicators to integer 0/1; newer pandas versions would
#otherwise return True/False booleans instead of numbers.
dummies = pd.get_dummies(df['Sex'], dtype=int)
dummies.head(1)

Unnamed: 0,female,male
0,0,1


In [88]:
#Adding dummies to the df by concatenating them to the dataframe as extra columns.
#Any dummy columns already present in df are dropped first, so re-running this
#cell does not append duplicate 'female'/'male' columns.
df = pd.concat(
    [df.drop(columns=dummies.columns, errors='ignore'), dummies],
    axis=1,  #the axis=1 means: add it to the columns (axis=0 is rows)
)
df.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Cabin,female,male
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,,0,1


In [89]:
# Keep only the passenger id, the candidate predictors and the 'Survived' target.
df_subset = df.loc[:, [
    'PassengerId',
    'Pclass',
    'female',
    'Age',
    'SibSp',
    'Parch',
    'Survived',
]]
df_subset.head(n=10)

Unnamed: 0,PassengerId,Pclass,female,Age,SibSp,Parch,Survived
0,1,3,0,22.0,1,0,0
1,2,1,1,38.0,1,0,1
2,3,3,1,26.0,0,0,1
3,4,1,1,35.0,1,0,1
4,5,3,0,35.0,0,0,0
5,6,3,0,,0,0,0
6,7,1,0,54.0,0,0,0
7,8,3,0,2.0,3,1,0
8,9,3,1,27.0,0,2,1
9,10,2,1,14.0,1,0,1


In [90]:
# Remove every row that has a missing value in any of the selected columns
# (axis=0 and how='any' are dropna's defaults, spelled out here for clarity).
df_subset = df_subset.dropna(axis=0, how='any')
df_subset.head(n=10)


Unnamed: 0,PassengerId,Pclass,female,Age,SibSp,Parch,Survived
0,1,3,0,22.0,1,0,0
1,2,1,1,38.0,1,0,1
2,3,3,1,26.0,0,0,1
3,4,1,1,35.0,1,0,1
4,5,3,0,35.0,0,0,0
6,7,1,0,54.0,0,0,0
7,8,3,0,2.0,3,1,0
8,9,3,1,27.0,0,2,1
9,10,2,1,14.0,1,0,1
10,11,3,1,4.0,1,1,1


In [91]:
# Build the target (y) and the feature matrix (X), then split both into train and test sets.
from sklearn.preprocessing import normalize #function for normalizing data; only referenced in the commented-out line below
y = df_subset[['Survived']] #'Survived' is our Y-variable; the double brackets keep it as a one-column DataFrame
X = df_subset[['Pclass', 'female', 'Age', 'SibSp', 'Parch']] #the predictors; PassengerId is left out
# X = normalize(X)  <- normalization is not applied right now.
# NOTE(review): the original note said normalization is "baked into the knn already";
# KNeighborsClassifier does not appear to rescale features itself, so unscaled 'Age'
# probably dominates the distances -- confirm and consider scaling the features.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) #hold out 30% of the rows as test data; random_state=1 makes the split repeatable
X_train.head()

Unnamed: 0,Pclass,female,Age,SibSp,Parch
641,1,1,24.0,0,0
433,3,0,17.0,0,0
202,3,0,34.0,0,0
585,1,1,18.0,0,2
544,1,0,50.0,1,0


In [92]:
from sklearn.neighbors import KNeighborsClassifier #the object class we need

knn = KNeighborsClassifier(n_neighbors=6) #create a KNN-classifier with 6 neighbors (scikit-learn's default would be 5)
# y_train is a one-column DataFrame; flatten it to a 1-D array so fit() receives the
# shape it expects and no longer emits a DataConversionWarning.
knn = knn.fit(X_train, y_train.values.ravel()) #this fits the k-nearest neighbor model with the train data
knn.score(X_test, y_test) #mean accuracy of the predictions on the test data

  after removing the cwd from sys.path.


0.813953488372093

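Accuracy: roughly 81% of the passengers in the test set (survivors and non-survivors together) are predicted correctly. Note: k=5 was noted as the optimal number of neighbors, whereas the classifier above is fit with `n_neighbors=6`.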

In [93]:
from sklearn.metrics import confusion_matrix

# Predict the class of every test passenger, then tabulate actual vs. predicted classes.
y_test_pred = knn.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_test_pred)  # rows: actual class, columns: predicted class
cm

array([[124,  10],
       [ 30,  51]])

In [104]:
# Count how many test passengers fall in each actual class. y_test is a one-column
# DataFrame, so its 'Survived' column is selected first to get a Series to count.
y_test.loc[:, 'Survived'].value_counts()

0    134
1     81
Name: Survived, dtype: int64

In [95]:
# Wrap the raw confusion matrix in a labelled DataFrame so it is easier to read:
# rows are the actual classes, columns (suffix "_p") are the predicted classes.
conf_matrix = pd.DataFrame(
    data=cm,
    index=['dead', 'survivor'],
    columns=['dead_p', 'survivor_p'],
)
conf_matrix

Unnamed: 0,dead_p,survivor_p
dead,124,10
survivor,30,51


The way to read this is that, of the 81 actual survivors, 51 are correctly predicted as 'survivors' and 30 are instead predicted as 'dead'. The precision and recall for the survivors are:


$\text{precision} = \frac{51}{51+10} \approx .84$

$\text{recall} = \frac{51}{51 + 30} \approx .63$


We might improve our scores by trying out different values of k.