In [5]:
from datasets import titanic_data

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

from tools import roc

import pandas as pd

import plotly.express as px
from plotly.offline import init_notebook_mode
# Enable inline rendering of plotly figures in this notebook.
# NOTE(review): per the plotly docs, connected=True loads plotly.js from a CDN
# instead of embedding it, so figures need network access to render — confirm
# that is acceptable for readers of this notebook.
init_notebook_mode(connected=True)

In [3]:
# Unpack the six values returned by the project's `titanic_data` helper.
# Presumably: the full feature/label set plus a train/test split of it —
# TODO confirm against `datasets.titanic_data`.
(
    original_X, original_y,
    train_X, train_y,
    test_X, test_y,
) = titanic_data()

## Select the best k by 5-fold cross-validation

In [6]:
# Neighbourhood sizes to try, and the number of CV folds. The section heading
# promises 5-fold CV, so pin it explicitly rather than relying on
# cross_val_score's default (which has differed between scikit-learn versions).
K_CANDIDATES = range(1, 60)
CV_FOLDS = 5

# For each candidate k, cross-validate a fresh KNN classifier and keep the mean
# fold score. The score is classification accuracy (requested explicitly via
# `scoring`), not R^2, which is why the column is labelled 'accuracy (mean)'.
# NOTE(review): the search runs on original_X / original_y; if those include
# the rows later held out as test_X / test_y, the chosen k has already seen
# the test data — confirm.
result = pd.DataFrame(
    [
        {
            'k': n_neighbors,
            'accuracy (mean)': cross_val_score(
                KNeighborsClassifier(n_neighbors=n_neighbors),
                original_X,
                original_y,
                cv=CV_FOLDS,
                scoring='accuracy',
            ).mean(),
        }
        for n_neighbors in K_CANDIDATES
    ]
)

# Last expression of the cell: the figure is displayed as the cell output.
px.line(
    result,
    x='k',
    y='accuracy (mean)',
    title='Mean 5-fold CV accuracy by number of neighbours (k)',
)

## Estimate test accuracy (1 − test error) by 5-fold cross-validation

In [12]:
# Cross-validate a KNN classifier with k = 12 and report the mean fold score.
# NOTE(review): 12 is hard-coded; presumably it was read off the k-vs-accuracy
# plot — confirm it still matches the curve after a re-run.
model = KNeighborsClassifier(n_neighbors=12)
fold_scores = cross_val_score(model, original_X, original_y)

# Displayed as the cell output.
fold_scores.mean()

0.8136902893729208

## Fit KNN Classification model

In [13]:
# Build the k = 12 classifier and fit it on the training split in one step;
# `fit` returns the estimator itself, so `model` is the fitted classifier.
model = KNeighborsClassifier(n_neighbors=12).fit(train_X, train_y)

## Confusion matrix and ROC

In [14]:
# Predict with the fitted model and flatten the predictions to 1-D.
# NOTE(review): the model was fit on train_X but is scored here on
# original_X / original_y; if that set contains the training rows, these
# counts are optimistic. Consider test_X / test_y — confirm.
predicted = model.predict(original_X).ravel()

# Flattening the confusion matrix yields (tn, fp, fn, tp) in that order;
# the four-way unpacking assumes a binary (2x2) problem.
tn, fp, fn, tp = confusion_matrix(original_y, predicted).ravel()

# Displayed as the cell output.
f"TP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}"

'TP: 243, TN: 490, FP: 59, FN: 99'

In [15]:
# Build the ROC figure with the project's `tools.roc` helper and render it.
# NOTE(review): like the confusion matrix, this is computed on
# original_X / original_y rather than the held-out test split — confirm
# that is intended.
roc_figure = roc(model, original_X, original_y)
roc_figure.show()