# KNN and Decision Tree models

In [1]:
# load packages
import pandas as pd
import re
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import random

In [2]:
# Load the most recent preprocessed data set from the "data" directory.

from datetime import datetime
import os

# Candidate files are named "preprocessed_2*.csv". Sorting the names in
# reverse order puts the lexicographically largest one first.
# NOTE(review): this is only the newest file if the date embedded in the
# file name sorts lexicographically (e.g. YYYY-MM-DD) — confirm.
files = sorted(
    (f for f in os.listdir("data") if f.startswith("preprocessed_2") and f.endswith(".csv")),
    reverse=True,
)
if not files:
    # Fail with an actionable message instead of a bare IndexError on files[0].
    raise FileNotFoundError('no "preprocessed_2*.csv" file found in the "data" directory')
latest = files[0]
df = pd.read_csv(f"data/{latest}")

# Drop the first column, which holds the index that was written out with the CSV.
# Assigning the result (rather than inplace=True) keeps the cell easy to re-run.
df = df.drop(columns=df.columns[0])
df.head()

Unnamed: 0,Survived,Pclass,SibSp,Parch,Age_true,AgeGroup,FareGroup,CabinLvl,Embarked_C,Embarked_Q,Embarked_S,Title_Master,Title_Mr,Title_Mrs,Title_Ms,Title_Noble
0,0,3,1,0,1,2,0,0,0,0,1,0,1,0,0,0
1,1,1,1,0,1,4,4,5,1,0,0,0,0,1,0,0
2,1,3,0,0,1,3,1,0,0,0,1,0,0,0,1,0
3,1,1,1,0,1,4,4,5,0,0,1,0,0,1,0,0
4,0,3,0,0,1,4,1,0,0,0,1,0,1,0,0,0


In [3]:
# preview the first five rows of the loaded frame
df.head(n=5)

Unnamed: 0,Survived,Pclass,SibSp,Parch,Age_true,AgeGroup,FareGroup,CabinLvl,Embarked_C,Embarked_Q,Embarked_S,Title_Master,Title_Mr,Title_Mrs,Title_Ms,Title_Noble
0,0,3,1,0,1,2,0,0,0,0,1,0,1,0,0,0
1,1,1,1,0,1,4,4,5,1,0,0,0,0,1,0,0
2,1,3,0,0,1,3,1,0,0,0,1,0,0,0,1,0
3,1,1,1,0,1,4,4,5,0,0,1,0,0,1,0,0
4,0,3,0,0,1,4,1,0,0,0,1,0,1,0,0,0


### KNN

In [4]:
# import relevant packages for modeling
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# split off the label column; every remaining column is used as a feature
X = df.drop(columns="Survived")
y = df["Survived"]

# hold out 30 % of the rows for testing (shuffled split, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    shuffle=True,
    random_state=42,
)


In [5]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
import random

# Seed Python's built-in random module.
# NOTE(review): the CV splitter below is seeded through its own random_state,
# and scikit-learn presumably does not draw from `random` — confirm whether
# this seed has any effect on the search.
random.seed(10)

# base estimator whose hyper-parameters are tuned
knn = KNeighborsClassifier()

# search space: neighbourhood size 1..20, Minkowski power 1..4,
# and both neighbour-weighting schemes
parameters = dict(
    n_neighbors=range(1, 21),
    p=range(1, 5),
    weights=['uniform', 'distance'],
)

# 10-fold stratified cross validation with a fixed shuffle seed
stratified_10_fold_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# build the grid search (optimising the F1 score) and run it on the training
# split; fit() returns the fitted search object itself
grid_search_estimator = GridSearchCV(
    estimator=knn,
    param_grid=parameters,
    scoring='f1',
    cv=stratified_10_fold_cv,
    return_train_score=False,
).fit(X_train, y_train)

# show the scores of every hyper-parameter combination
results = pd.DataFrame(grid_search_estimator.cv_results_)
display(results)

# report the winning configuration
print(
    "best score is {} with params {}".format(
        grid_search_estimator.best_score_,
        grid_search_estimator.best_params_,
    )
)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_neighbors,param_p,param_weights,params,split0_test_score,split1_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.005826,0.003131,0.009572,0.002944,1,1,uniform,"{'n_neighbors': 1, 'p': 1, 'weights': 'uniform'}",0.730769,0.678571,...,0.652174,0.666667,0.615385,0.636364,0.444444,0.651163,0.666667,0.642402,0.072093,145
1,0.002022,0.003637,0.006525,0.002425,1,1,distance,"{'n_neighbors': 1, 'p': 1, 'weights': 'distance'}",0.730769,0.678571,...,0.652174,0.666667,0.615385,0.636364,0.444444,0.651163,0.666667,0.642402,0.072093,145
2,0.002526,0.003165,0.013008,0.003199,1,2,uniform,"{'n_neighbors': 1, 'p': 2, 'weights': 'uniform'}",0.730769,0.678571,...,0.666667,0.680000,0.650000,0.636364,0.432432,0.651163,0.619048,0.642683,0.075813,143
3,0.001585,0.002647,0.008071,0.003024,1,2,distance,"{'n_neighbors': 1, 'p': 2, 'weights': 'distance'}",0.730769,0.678571,...,0.666667,0.680000,0.650000,0.636364,0.432432,0.651163,0.619048,0.642683,0.075813,143
4,0.004755,0.003978,0.010509,0.003602,1,3,uniform,"{'n_neighbors': 1, 'p': 3, 'weights': 'uniform'}",0.730769,0.678571,...,0.666667,0.680000,0.650000,0.636364,0.432432,0.636364,0.636364,0.642935,0.075448,138
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155,0.005018,0.003449,0.003224,0.003369,20,2,distance,"{'n_neighbors': 20, 'p': 2, 'weights': 'distan...",0.800000,0.808511,...,0.761905,0.666667,0.540541,0.744186,0.666667,0.595745,0.809524,0.706041,0.088780,80
156,0.006555,0.003420,0.012932,0.005349,20,3,uniform,"{'n_neighbors': 20, 'p': 3, 'weights': 'uniform'}",0.717949,0.755556,...,0.650000,0.585366,0.514286,0.555556,0.611111,0.608696,0.731707,0.634134,0.074943,154
157,0.007032,0.002947,0.016682,0.003839,20,3,distance,"{'n_neighbors': 20, 'p': 3, 'weights': 'distan...",0.818182,0.808511,...,0.790698,0.666667,0.594595,0.714286,0.700000,0.583333,0.837209,0.718015,0.087576,52
158,0.001422,0.003001,0.017207,0.002708,20,4,uniform,"{'n_neighbors': 20, 'p': 4, 'weights': 'uniform'}",0.717949,0.755556,...,0.666667,0.585366,0.514286,0.555556,0.611111,0.608696,0.731707,0.635800,0.075461,151


best score is 0.7613308775847476 with params {'n_neighbors': 17, 'p': 1, 'weights': 'distance'}


Scoring = Accuracy  
best score is 0.8345366103430619 with params {'n_neighbors': 17, 'p': 1, 'weights': 'distance'}

Scoring = f1_micro  
best score is 0.8345366103430619 with params {'n_neighbors': 17, 'p': 1, 'weights': 'distance'}

Scoring = f1_macro  
best score is 0.8168984976668607 with params {'n_neighbors': 17, 'p': 1, 'weights': 'distance'}

Scoring = f1_weighted  
best score is 0.8312945759310859 with params {'n_neighbors': 17, 'p': 1, 'weights': 'distance'}

Scoring = f1  
best score is 0.7613308775847476 with params {'n_neighbors': 17, 'p': 1, 'weights': 'distance'}

In [7]:
# Seed Python's built-in random module (kept from the original cell).
random.seed(10)

# Fit KNN on the training split with the hyper-parameters selected by the
# grid search instead of hard-coded copies of them, so this cell stays in sync
# when the input file or the parameter grid changes.
# For the recorded run this resolves to n_neighbors=17, p=1, weights='distance'.
knn = KNeighborsClassifier(**grid_search_estimator.best_params_)
knn.fit(X_train, y_train)

# Get the predictions for the held-out test split
y_pred = knn.predict(X_test)

In [8]:
# per-class precision / recall / F1 of the fitted KNN on the held-out test split
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.82      0.83      0.83       157
           1       0.76      0.74      0.75       111

    accuracy                           0.79       268
   macro avg       0.79      0.79      0.79       268
weighted avg       0.79      0.79      0.79       268



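We do not scale the data, because after preprocessing all features are categorical (binned or one-hot encoded) and no continuous numerical columns remain. Note, however, that the integer-coded columns (e.g. `Pclass`, `FareGroup`, `CabinLvl`) span different ranges, so they do not contribute equally to the KNN distance.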

### Decision Trees