# K-nearest neighbor
In this notebook we want to train a K-nearest-neighbor classifier that should predict whether a patient has diabetes or not.

In [1]:
# Data import
from preprocessing.preprocessing import download_brfss_dataset
#download_brfss_dataset("username", "token") -> insert kaggle username and api-token



In [2]:
#imports

from preprocessing.preprocessing import *
from preprocessing.preprocessing_label_encoding import *
from preprocessing.preproccessing_one_hot_encoding import *

import pandas as pd

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV

from visualization.general_plots import plot_class_frequencies


In [3]:
#ideas to use
#plot_class_frequencies(target)

# Hyperparameter tuning and preprocessing
We will evaluate different parameters for the classifier (hyperparameter tuning) as well as different preprocessing steps.

In detail, we will vary the following classifier parameters:<br>


| **Parameter** |                      **Range of values**                       |
|:-------------:|:--------------------------------------------------------------:|
|  n_neighbors  |                              1-15                              |
| metric        | 'euclidean',<br>'minkowski',<br>'cosine',<br>'sqeuclidean',<br>'manhattan' |

And we will vary the following preprocessing steps:

|              **Preprocessing**              |                           **Description**                           | **Name train data** | **Name train target** | **Name test data** | **Name test target** |
|:-------------------------------------------:|:-------------------------------------------------------------------:|-----------------|-------------------|--------------------|----------------------|
|     Label Encoding<br>    |                   Label encoding                  | data_train      | target_train      | data_test          | target_test          |
|     Label Encoding<br>+<br>Oversampling     |                   Label encoding and oversampling                   | data_train_os   | target_train_os   | data_test_os       | target_test_os       |
|    Label Encoding<br>+<br>Undersampling     |                  Label encoding and undersampling                   | data_train_us   | target_train_us   | data_test_us       | target_test_us       |
|            One Hot Encoding (1)             |           One hot encoding for all columns except yes/no            | data_train_oh   | target_train_oh   | data_test_oh       | target_test_oh       |
|            One Hot Encoding (2)             |          One hot encoding for all columns including yes/no          | data_train_a_oh | target_train_a_oh | data_test_a_oh     | target_test_a_oh     |
|  One Hot Encoding (1)<br>+<br>Oversampling  |   One hot encoding for all columns except yes/no and oversampling   | data_train_oh_os | target_train_oh_os | data_test_oh_os    | target_test_oh_os    |
| One Hot Encoding (2) <br>+<br>Oversampling  | One hot encoding for all columns including yes/no and oversampling  | data_train_a_oh_os | target_train_a_oh_os | data_test_a_oh_os  | target_test_a_oh_os  |
| One Hot Encoding (1)<br>+<br>Undersampling  |  One hot encoding for all columns except yes/no and undersampling   | data_train_oh_us | target_train_oh_us | data_test_oh_us    | target_test_oh_us    |
| One Hot Encoding (2) <br>+<br>Undersampling | One hot encoding for all columns including yes/no and undersampling | data_train_a_oh_us | target_train_a_oh_us | data_test_a_oh_us  | target_test_a_oh_us  |

So we start with loading the data:

In [4]:
# Load every preprocessing variant and split each one into train and test parts.
# Each split receives the dataset/target pair of its own variant; passing the plain
# label-encoded `dataset`/`target` everywhere would make all variants identical.
#
# NOTE(review): for the over-/undersampled variants the loaders return data that
# presumably is already resampled, so the split (and therefore the test part) is
# taken from resampled data — confirm that this is intended.

#label encoding
dataset, target = get_preprocessed_brfss_dataset_label_encoded()
data_train, data_test, target_train, target_test = get_train_test_split(dataset, target)

#label encoding oversampling
dataset_os, target_os = get_preprocessed_brfss_dataset_label_encoded_oversampled()
data_train_os, data_test_os, target_train_os, target_test_os = get_train_test_split(dataset_os, target_os)

#label encoding undersampling
dataset_us, target_us = get_preprocessed_brfss_dataset_label_encoded_undersampled()
data_train_us, data_test_us, target_train_us, target_test_us = get_train_test_split(dataset_us, target_us)

#one hot encoding (1) - target not one hot encoded
dataset_oh, target_oh = get_preprocessed_brfss_dataset_one_hot_encoded()
data_train_oh, data_test_oh, target_train_oh, target_test_oh = get_train_test_split(dataset_oh, target_oh)

#one hot encoding (2) - target not one hot encoded
dataset_a_oh, target_a_oh = get_preprocessed_brfss_dataset_one_hot_encoded_all_columns()
data_train_a_oh, data_test_a_oh, target_train_a_oh, target_test_a_oh = get_train_test_split(dataset_a_oh, target_a_oh)

#one hot encoding (1) - target not one hot encoded + oversampling
dataset_oh_os, target_oh_os = get_preprocessed_brfss_dataset_one_hot_encoded_oversampled()
data_train_oh_os, data_test_oh_os, target_train_oh_os, target_test_oh_os = get_train_test_split(dataset_oh_os, target_oh_os)

#one hot encoding (2) - target not one hot encoded + oversampling
dataset_a_oh_os, target_a_oh_os = get_preprocessed_brfss_dataset_one_hot_encoded_all_columns_oversampled()
data_train_a_oh_os, data_test_a_oh_os, target_train_a_oh_os, target_test_a_oh_os = get_train_test_split(dataset_a_oh_os, target_a_oh_os)

#one hot encoding (1) - target not one hot encoded + undersampling
dataset_oh_us, target_oh_us = get_preprocessed_brfss_dataset_one_hot_encoded_undersampled()
data_train_oh_us, data_test_oh_us, target_train_oh_us, target_test_oh_us = get_train_test_split(dataset_oh_us, target_oh_us)

#one hot encoding (2) - target not one hot encoded + undersampling
# Named "*_a_oh_us" to match the naming scheme of the other variants.
dataset_a_oh_us, target_a_oh_us = get_preprocessed_brfss_dataset_one_hot_encoded_all_columns_undersampled()
data_train_a_oh_us, data_test_a_oh_us, target_train_a_oh_us, target_test_a_oh_us = get_train_test_split(dataset_a_oh_us, target_a_oh_us)

# Aliases for the former "*_a_oh_os_us" names so that cells still using them keep working.
dataset_a_oh_os_us, target_a_oh_os_us = dataset_a_oh_us, target_a_oh_us
data_train_a_oh_os_us, data_test_a_oh_os_us = data_train_a_oh_us, data_test_a_oh_us
target_train_a_oh_os_us, target_test_a_oh_os_us = target_train_a_oh_us, target_test_a_oh_us

Now we can apply a K-nearest neighbor classifier to the train data and evaluate how it performs on the test data by using the accuracy score.

We will do so for each combination that is listed above by applying a grid search estimator. The following estimators are structured by the different styles of preprocessing.

# Label Encoding

In [None]:
# Baseline on the label-encoded data: fit one classifier per k in 1..9 on the
# training split and print its accuracy on the test split.
for n_neighbors in range(1, 10):
    knn_estimator = KNeighborsClassifier(n_neighbors=n_neighbors)
    # ravel() flattens the target values to the 1-D array handed to fit()
    train_labels = target_train.values.ravel()
    knn_estimator.fit(data_train, train_labels)
    diabetes_test_prediction = knn_estimator.predict(data_test)
    test_accuracy = accuracy_score(target_test, diabetes_test_prediction)
    print("k= {} acc: {}".format(n_neighbors, test_accuracy))

k= 1 acc: 0.7846814553136959
k= 2 acc: 0.7489393388401129


In [None]:
# Grid search over the number of neighbors and the distance metric on the
# label-encoded training data, scored by 5-fold cross-validated accuracy.
knn_estimator = KNeighborsClassifier()

# Restrict the search to the first MAX_GRID_SEARCH_ROWS training rows.
# DataFrame.head() returns a new object instead of truncating in place, so the
# result has to be assigned and passed to fit() for the limit to take effect.
# Data and target are cut to the same leading rows to stay aligned.
# NOTE(review): assumes get_train_test_split shuffles the rows, so the first
# rows are a representative sample — confirm.
MAX_GRID_SEARCH_ROWS = 50000
data_train_subset = data_train.head(MAX_GRID_SEARCH_ROWS)
target_train_subset = target_train.head(MAX_GRID_SEARCH_ROWS)

# Reduced grid to keep the runtime manageable. The full set of metrics planned
# for this notebook is: 'euclidean', 'minkowski', 'cosine', 'sqeuclidean', 'manhattan'.
# NOTE: with the classifier's default p=2, 'minkowski' is the Euclidean distance,
# so both metrics below are expected to produce the same scores.
params = {
    'n_neighbors': range(1, 6),
    'metric': ('euclidean', 'minkowski'),
}

grid_search_estimator = GridSearchCV(knn_estimator, params, scoring='accuracy', cv=5, return_train_score=False)
grid_search_estimator.fit(data_train_subset, target_train_subset.values.ravel())

# One row per parameter combination with its cross-validation scores
results = pd.DataFrame(grid_search_estimator.cv_results_)
display(results)

print("best score is {} with params {}".format(grid_search_estimator.best_score_, grid_search_estimator.best_params_))