# K-Nearest Neighbors: Titanic Dataset

In [1]:
# basic imports
import numpy as np
import pandas as pd

In [2]:
# read the train and test CSV files (paths are relative to the working directory)
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
# inspect the training DataFrame read from train.csv
train

In [None]:
# inspect the test DataFrame read from test.csv
test

In [3]:
# pick the feature columns; train rows are stacked first, test rows after them,
# and each feature is shaped as a single (n, 1) column
sex = np.concatenate([train['Sex'].values, test['Sex'].values]).reshape((-1, 1))
age = np.concatenate([train['Age'].values, test['Age'].values]).reshape((-1, 1))

# labels are taken from the training set only
survived = train['Survived'].values

In [None]:
# inspect the stacked Sex column (train rows followed by test rows), shape (n, 1)
sex

In [None]:
# inspect the stacked Age column (train rows followed by test rows), shape (n, 1)
# presumably contains NaN for passengers with no recorded age; a later cell imputes them
age

In [None]:
# inspect the Survived labels (training rows only)
survived

In [4]:
# binary-encode sex as a single (n, 1) integer column: 1 for 'male', 0 otherwise
# (this is a 0/1 indicator, not a one-hot encoding).
# The column is rebuilt from the source frames rather than from the current
# value of `sex`, so re-running this cell cannot silently turn an
# already-encoded column into all zeros.
sex = (np.append(train.Sex.values, test.Sex.values) == 'male').astype(int).reshape(-1, 1)

In [None]:
# inspect the encoded sex column: 1 for 'male', 0 for anything else
sex

In [6]:
# fill in missing values in age with the mean of the observed ages.
# `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22; its
# replacement is `sklearn.impute.SimpleImputer` (available since 0.20), which
# expects the actual missing-value marker (np.nan) rather than the string 'NaN'.
# NOTE(review): `age` holds train and test rows together, so the mean is
# computed over both sets.
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
age = imputer.fit_transform(age)

In [None]:
# inspect the Age column after mean imputation
age

In [7]:
# rescale age linearly onto the interval [0, 1]
# (min and max are learned from the whole stacked column, train and test rows alike)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(age)
age = scaler.transform(age)

In [None]:
# inspect the Age column after min-max scaling with feature_range=(0, 1)
age

In [8]:
# assemble the feature matrix and the label vector
X = np.concatenate((sex, age), axis=1)  # column 0: encoded sex, column 1: scaled age
y = survived
m = y.shape[0]  # number of labelled rows

In [None]:
# inspect the feature matrix: column 0 is encoded sex, column 1 is scaled age
X

In [None]:
# inspect the label vector (same array as `survived`)
y

In [None]:
# inspect the number of labelled examples; X[:m] are the training rows, X[m:] the test rows
m

In [9]:
# build a 5-nearest-neighbours classifier and fit it on the labelled rows
from sklearn.neighbors import KNeighborsClassifier
X_train = X[:m]  # the first m rows of X belong to the training set
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [10]:
# accuracy on the training rows themselves
# (these are the same rows the model was fit on, so this is not a held-out estimate)
from sklearn.metrics import accuracy_score
train_predictions = knn.predict(X[:m])
accuracy_score(y, train_predictions)

0.7991021324354658

In [11]:
# predict labels for the unlabelled rows (everything after the first m rows)
X_test = X[m:]
predictions = knn.predict(X_test)

In [None]:
# inspect the predicted Survived labels for the test rows X[m:]
predictions

In [12]:
# pair each test passenger's ID with its predicted label.
# The header is spelled 'PassengerId', exactly like the dataset column it is
# read from (not 'PassengerID').
# `predictions` itself is left untouched, so this cell can be re-run safely
# without stacking the ID column onto an already-stacked array.
submission = pd.DataFrame({
    'PassengerId': test.PassengerId.values,
    'Survived': predictions,
})

# save predictions to csv file (for Kaggle); index=False keeps the file to the
# two columns above
submission.to_csv('titanic_predictions.csv', index=False)