In [15]:
import seaborn as sns #this is the plotting library I'll be using 
import pandas as pd #"as pd" means that we can use the abbreviation in commands
import matplotlib.pyplot as plt #we need Matplotlib for setting the labels in the Seaborn graphs

df = pd.read_csv("titanic.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Cabin
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,C85
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,C123
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,


In [16]:
dummies = pd.get_dummies(df['Sex'])
dummies.head()

Unnamed: 0,female,male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1


In [17]:
df = pd.concat([df, dummies], axis=1) #the axis=1 means: add it to the columns (axis=0 is rows)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Cabin,female,male
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,C85,1,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,,1,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,C123,1,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,,0,1


In [18]:
df = df.dropna()
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Cabin,female,male
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,C85,1,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,C123,1,0
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,E46,0,1
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,G6,1,0
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,C103,1,0


In [22]:
df_subset = df[['Pclass', 'female', 'Age', 'SibSp', 'Parch', 'Survived']]
df_subset.head(5)

Unnamed: 0,Pclass,female,Age,SibSp,Parch,Survived
1,1,1,38.0,1,0,1
3,1,1,35.0,1,0,1
6,1,0,54.0,0,0,0
10,3,1,4.0,1,1,1
11,1,1,58.0,0,0,1


In [24]:
df_subset = df_subset.dropna()

In [38]:
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize

Y = df_subset['Survived'] # define the target variable (dependent variable) as y
X = df_subset[['Pclass', 'female', 'Age', 'SibSp', 'Parch']]
X = normalize(X)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1) #test_size=0.3 indicates the percentage of the data that should be held over for testing

In [39]:
from sklearn.neighbors import KNeighborsClassifier #the object class we need

knn = KNeighborsClassifier(n_neighbors=3) #create a KNN-classifier with 5 neighbors (default)
knn = knn.fit(X_train, y_train) #this fits the k-nearest neigbor model with the train data
knn.score(X_test, y_test) #calculate the fit on the test data

0.6964285714285714

69% of the survivers is predicted accurately.

In [33]:
from sklearn.metrics import confusion_matrix
y_test_pred = knn.predict(X_test) #the predicted values
cm = confusion_matrix(y_test, y_test_pred) #creates a "confusion matrix"
cm

array([[ 9, 10],
       [ 9, 28]])

In [41]:
#In order to read it easily , let's make a dataframe out of it, and add labels to it.
conf_matrix = pd.DataFrame(cm, index=['Dead', 'Survived'], columns = ['Dead_p', 'Survived_p']) 
conf_matrix

Unnamed: 0,Dead_p,Survived_p
Dead,9,10
Survived,9,28


The way to read this is that of the survivers, 9 are correctly predicted as survived. With dead, 9 are correctly predicted dead. 

$precision = \frac{10}{10 + 28} = .26$

$recall = \frac{10}{10 + 9} = .53$

We might improve our scores by trying out different values of k.  