# K-nearest neighbor

In [1]:
# Data import
from preprocessing.preprocessing import download_brfss_dataset
# One-time setup: uncomment the call below and pass your Kaggle username and
# API token. Do not commit real credentials to the notebook.
#download_brfss_dataset("username", "token")

In [1]:
#imports

from preprocessing.preprocessing import get_preprocessed_brfss_train_test_split_one_hot_encoded, get_preprocessed_brfss_train_test_split, get_preprocessed_brfss_dataset

import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV


The following cell loads the already preprocessed data and makes it available for this notebook.

In [8]:
# Load the full preprocessed dataset as a (features, target) pair.
train_dataset, target = get_preprocessed_brfss_dataset()

# squeeze() collapses a single-column frame into a Series
# (assumes the helper returns the target as a one-column DataFrame -- TODO confirm).
target = target.squeeze()

# Show the first rows of the target so the cell output is reproduced on a fresh run.
target.head()

0    3.0
1    3.0
2    3.0
3    3.0
4    3.0
Name: DIABETE3, dtype: float64

The following cell loads the already preprocessed data, split into training and test sets, each separated into features (data) and target.
Only the first 5000 rows of the training set are kept for the experiments below.

In [5]:
# Fetch the preprocessed train/test split (features and target for each part).
(data_train, data_test,
 target_train, target_test) = get_preprocessed_brfss_train_test_split()

# Keep only the first 5000 training rows; the test set is left untouched.
data_train, target_train = data_train.head(5000), target_train.head(5000)

Now we can apply a K-nearest neighbor classifier to the training data and evaluate how it performs on the test data using the accuracy score.

In [11]:
# Sweep k = 1..9: fit on the training subset, predict the test set and
# report the test accuracy for each k.
for n_neighbors in range(1, 10):
    knn_estimator = KNeighborsClassifier(n_neighbors).fit(data_train, target_train)
    diabetes_test_prediction = knn_estimator.predict(data_test)
    test_accuracy = accuracy_score(target_test, diabetes_test_prediction)
    print("k= {} acc: {}".format(n_neighbors, test_accuracy))

k= 1 acc: 0.7655227092535961
k= 2 acc: 0.6503114735530637
k= 3 acc: 0.807894438781289
k= 4 acc: 0.7589874277947672
k= 5 acc: 0.8235587269226413
k= 6 acc: 0.7967833276701778
k= 7 acc: 0.8309548080190282
k= 8 acc: 0.8142258466417488
k= 9 acc: 0.8346471854117115


We can see that k=9 performs best. However, the accuracy score alone is not sufficient, because the data is unbalanced: the share of entries without diabetes is around 84% (TODO: verify this figure).

To check whether a different distance metric or number of neighbors would perform better, we use a grid search for hyperparameter tuning.

In [7]:
# Base estimator for the search. The initial n_neighbors=7 is only a starting
# value: the grid below sets n_neighbors (and metric) for every candidate.
knn_estimator = KNeighborsClassifier(7)

# Hyperparameter grid: k = 7..14 combined with three distance metrics.
params = {
    'n_neighbors': range(7, 15),
    'metric': ('minkowski', 'cosine', 'sqeuclidean')
}

# 5-fold cross-validated grid search on the training subset, scored by accuracy.
grid_search_estimator = GridSearchCV(knn_estimator, params, scoring='accuracy', cv=5, return_train_score=False)
# ravel() flattens the target values into the 1-D array passed to fit().
grid_search_estimator.fit(data_train, target_train.values.ravel())

# One row per parameter combination, with per-fold and mean test scores.
results = pd.DataFrame(grid_search_estimator.cv_results_)
display(results)

print("best score is {} with params {}".format(grid_search_estimator.best_score_, grid_search_estimator.best_params_))

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_metric,param_n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.007805,0.0047,0.062354,0.002345,minkowski,7,"{'metric': 'minkowski', 'n_neighbors': 7}",0.827,0.829,0.828,0.836,0.821,0.8282,0.004792,23
1,0.005343,0.000165,0.057128,0.001219,minkowski,8,"{'metric': 'minkowski', 'n_neighbors': 8}",0.825,0.83,0.825,0.835,0.829,0.8288,0.003709,21
2,0.005938,0.000667,0.06405,0.004413,minkowski,9,"{'metric': 'minkowski', 'n_neighbors': 9}",0.83,0.838,0.834,0.841,0.83,0.8346,0.004363,15
3,0.006075,0.001018,0.060789,0.002419,minkowski,10,"{'metric': 'minkowski', 'n_neighbors': 10}",0.831,0.836,0.836,0.844,0.833,0.836,0.004427,13
4,0.005734,0.000362,0.06062,0.001441,minkowski,11,"{'metric': 'minkowski', 'n_neighbors': 11}",0.836,0.841,0.839,0.843,0.836,0.839,0.002757,3
5,0.00556,0.000201,0.058892,0.002331,minkowski,12,"{'metric': 'minkowski', 'n_neighbors': 12}",0.837,0.84,0.839,0.842,0.838,0.8392,0.00172,1
6,0.005718,0.000462,0.061785,0.002873,minkowski,13,"{'metric': 'minkowski', 'n_neighbors': 13}",0.837,0.84,0.839,0.842,0.837,0.839,0.001897,3
7,0.005349,9.5e-05,0.060219,0.002833,minkowski,14,"{'metric': 'minkowski', 'n_neighbors': 14}",0.837,0.839,0.841,0.841,0.837,0.839,0.001789,9
8,0.005319,0.000344,0.101337,0.009748,cosine,7,"{'metric': 'cosine', 'n_neighbors': 7}",0.83,0.827,0.822,0.839,0.835,0.8306,0.005953,19
9,0.005144,0.000117,0.096499,0.004489,cosine,8,"{'metric': 'cosine', 'n_neighbors': 8}",0.826,0.824,0.826,0.834,0.834,0.8288,0.004308,20


best score is 0.8392 with params {'metric': 'minkowski', 'n_neighbors': 12}




Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_metric,param_n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.005559,0.000425,0.065221,0.002731,minkowski,7,"{'metric': 'minkowski', 'n_neighbors': 7}",0.827,0.829,0.828,0.836,0.821,0.8282,0.004792,23
1,0.005458,0.000124,0.06029,0.002532,minkowski,8,"{'metric': 'minkowski', 'n_neighbors': 8}",0.825,0.83,0.825,0.835,0.829,0.8288,0.003709,21
2,0.005244,0.000141,0.059838,0.003112,minkowski,9,"{'metric': 'minkowski', 'n_neighbors': 9}",0.83,0.838,0.834,0.841,0.83,0.8346,0.004363,15
3,0.005911,0.000353,0.061408,0.002083,minkowski,10,"{'metric': 'minkowski', 'n_neighbors': 10}",0.831,0.836,0.836,0.844,0.833,0.836,0.004427,13
4,0.005406,0.000104,0.061058,0.002294,minkowski,11,"{'metric': 'minkowski', 'n_neighbors': 11}",0.836,0.841,0.839,0.843,0.836,0.839,0.002757,3
5,0.005248,0.000211,0.059559,0.001252,minkowski,12,"{'metric': 'minkowski', 'n_neighbors': 12}",0.837,0.84,0.839,0.842,0.838,0.8392,0.00172,1
6,0.00536,0.000112,0.061208,0.001564,minkowski,13,"{'metric': 'minkowski', 'n_neighbors': 13}",0.837,0.84,0.839,0.842,0.837,0.839,0.001897,3
7,0.005363,0.000123,0.061985,0.001812,minkowski,14,"{'metric': 'minkowski', 'n_neighbors': 14}",0.837,0.839,0.841,0.841,0.837,0.839,0.001789,9
8,0.005172,0.000163,0.096308,0.004012,cosine,7,"{'metric': 'cosine', 'n_neighbors': 7}",0.83,0.827,0.822,0.839,0.835,0.8306,0.005953,19
9,0.005188,9.7e-05,0.095892,0.003365,cosine,8,"{'metric': 'cosine', 'n_neighbors': 8}",0.826,0.824,0.826,0.834,0.834,0.8288,0.004308,20


best score is 0.8392 with params {'metric': 'minkowski', 'n_neighbors': 12}


Unfortunately, the best combination only increases the accuracy by 0.0046, which is still not satisfactory.

Next we try to balance the data using oversampling.

In [10]:
# Reload the train/test split, this time passing True to the helper
# (presumably enabling oversampling, per the surrounding text -- TODO confirm
# the parameter's meaning). This overwrites the split variables from the
# earlier cell.
(data_train, data_test,
 target_train, target_test) = get_preprocessed_brfss_train_test_split(True)

# Keep only the first 5000 training rows; the test set is left untouched.
data_train, target_train = data_train.head(5000), target_train.head(5000)

In [12]:
# Base estimator with default settings; the grid below sets n_neighbors and
# metric for every candidate.
knn_estimator = KNeighborsClassifier()

# Hyperparameter grid: k = 1..14 combined with three distance metrics.
params = {
    'n_neighbors': range(1, 15),
    'metric': ('minkowski', 'cosine', 'sqeuclidean')
}

# 5-fold cross-validated grid search on the current training subset, scored by accuracy.
grid_search_estimator = GridSearchCV(knn_estimator, params, scoring='accuracy', cv=5, return_train_score=False)
# ravel() flattens the target values into the 1-D array passed to fit().
grid_search_estimator.fit(data_train, target_train.values.ravel())

# One row per parameter combination, with per-fold and mean test scores.
results = pd.DataFrame(grid_search_estimator.cv_results_)
display(results)

print("best score is {} with params {}".format(grid_search_estimator.best_score_, grid_search_estimator.best_params_))

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_metric,param_n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.006089,0.001251,0.098946,0.070602,minkowski,1,"{'metric': 'minkowski', 'n_neighbors': 1}",0.485,0.464,0.499,0.502,0.481,0.4862,0.013673,1
1,0.00537,0.00023,0.065916,0.013969,minkowski,2,"{'metric': 'minkowski', 'n_neighbors': 2}",0.411,0.388,0.411,0.416,0.383,0.4018,0.013526,6
2,0.005251,0.000118,0.057692,0.000711,minkowski,3,"{'metric': 'minkowski', 'n_neighbors': 3}",0.4,0.389,0.391,0.416,0.38,0.3952,0.012189,8
3,0.005355,0.000174,0.056282,0.001613,minkowski,4,"{'metric': 'minkowski', 'n_neighbors': 4}",0.382,0.372,0.366,0.394,0.383,0.3794,0.009666,11
4,0.005423,0.000185,0.060603,0.002457,minkowski,5,"{'metric': 'minkowski', 'n_neighbors': 5}",0.385,0.352,0.374,0.373,0.375,0.3718,0.010796,14
5,0.005374,0.000222,0.056627,0.001708,minkowski,6,"{'metric': 'minkowski', 'n_neighbors': 6}",0.381,0.343,0.354,0.376,0.368,0.3644,0.014066,17
6,0.005275,7.4e-05,0.059939,0.002567,minkowski,7,"{'metric': 'minkowski', 'n_neighbors': 7}",0.356,0.336,0.345,0.344,0.376,0.3514,0.013851,21
7,0.00531,0.000166,0.059707,0.002193,minkowski,8,"{'metric': 'minkowski', 'n_neighbors': 8}",0.355,0.337,0.341,0.336,0.372,0.3482,0.013703,24
8,0.00539,8.8e-05,0.058563,0.00222,minkowski,9,"{'metric': 'minkowski', 'n_neighbors': 9}",0.346,0.317,0.333,0.332,0.367,0.339,0.016745,28
9,0.005231,0.000114,0.06031,0.002026,minkowski,10,"{'metric': 'minkowski', 'n_neighbors': 10}",0.336,0.318,0.323,0.327,0.354,0.3316,0.012659,32


best score is 0.4862 with params {'metric': 'minkowski', 'n_neighbors': 1}


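We can see that oversampling doesn't improve the classification. Note, however, that these cross-validation accuracies are computed on the oversampled training data, so they are not directly comparable to the accuracies obtained above.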