In [4]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score

df_userdata = pd.read_csv('../assets/userdata_min_40.csv')
df_userdata= df_userdata.drop(['user_id'], axis = 1)

y= df_userdata['frans']
X = df_userdata.drop(['thais','aziatisch','frans','indonesisch','hollands','italiaans','mexicaans','amerikaans','mediterraan'], axis = 1)

X_main, X_test, y_main, y_test = train_test_split(X, y, test_size = 0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_main, y_main, test_size = 0.25, random_state=42, stratify=y_main)
df_userdata


Unnamed: 0,thais,aziatisch,frans,indonesisch,hollands,italiaans,mexicaans,amerikaans,mediterraan,tag_end,...,kippenbouillon van tablet,(arachide)olie,tomatenblokjes,zoete puntpaprika,(olijf)olie,kruimige aardappelen,magere gerookte spekreepjes,takje rozemarijn,boter,aardappelen
0,0,0,1,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,1,...,0,0,1,0,0,1,0,0,0,0
3,0,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,1,...,0,0,1,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,0,0,0,0,0,0,0,0,1,1,...,0,0,1,0,0,0,0,1,0,0
696,0,0,0,0,0,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,1
697,0,0,0,0,0,1,0,0,1,1,...,0,0,0,1,1,0,0,0,0,0
698,0,0,1,0,0,1,0,0,1,1,...,0,0,0,0,1,0,0,0,1,0


In [5]:
model = KNeighborsClassifier()
model.fit(X_train,y_train)
y_proba = model.predict(X_val)
y_proba

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0], dtype=int64)

In [6]:
print('Acc_score', accuracy_score(y_proba,y_val))
print('prec_score', precision_score(y_proba,y_val))
print('rec_score', recall_score(y_proba,y_val))

Acc_score 0.9928571428571429
prec_score 0.9565217391304348
rec_score 1.0


In [7]:
param_dic ={'n_neighbors': np.arange(1,20),
           'metric':['euclidean','manhattan'],
           'weights': ['uniform', 'distance'],
           'leaf_size': np.arange(1,10)}


In [8]:
grid = GridSearchCV(model,param_dic, cv = 5)
grid.fit(X_train,y_train)

In [9]:
grid.best_params_

{'leaf_size': 1,
 'metric': 'euclidean',
 'n_neighbors': 11,
 'weights': 'uniform'}

In [10]:
grid.best_estimator_

In [11]:
grid.best_score_

0.9714285714285715

In [12]:
model = KNeighborsClassifier(leaf_size=1, metric='euclidean', n_neighbors=4)

In [13]:
model.fit(X_train,y_train)
y_proba = model.predict(X_test)

In [14]:
print('Acc_score',accuracy_score(y_proba,y_test))
print('prec_score',precision_score(y_proba,y_test))
print('rec_score',recall_score(y_proba,y_test))

Acc_score 0.9571428571428572
prec_score 0.7391304347826086
rec_score 1.0


In [15]:
from sklearn.metrics import confusion_matrix, classification_report

print(confusion_matrix(y_test, y_proba))
print(classification_report(y_test, y_proba))

[[117   0]
 [  6  17]]
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       117
           1       1.00      0.74      0.85        23

    accuracy                           0.96       140
   macro avg       0.98      0.87      0.91       140
weighted avg       0.96      0.96      0.95       140

