# K-nearest neighbor
In this notebook we want to train a K-nearest-neighbor classifier that should predict whether a patient has diabetes or not.

In [10]:
#imports
from preprocessing.preprocessing_label_encoding import *
from preprocessing.preprocessing_one_hot_encoding import *

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import *

import warnings
warnings.filterwarnings('ignore')

# Hyperparameter tuning and preprocessing
We will evaluate different parameters for the classifier (hyperparameter tuning) as well as different preprocessing steps based on the accuracy and F1-Score.

Specifically, we vary the following classifier parameters:<br>

| **Parameter** |                  **Range of values**                   |
|:-------------:|:------------------------------------------------------:|
| n_neighbors   |                  5 to 15 (inclusive)                   |
| metric        | 'euclidean',<br>'cosine',<br>'manhattan',<br>'jaccard' |

And we will vary for preprocessing:


|              **Preprocessing**              |                           **Description**                           |
|:-------------------------------------------:|:-------------------------------------------------------------------:|
|               Label Encoding                |                           Label encoding                            |
|     Label Encoding<br>+<br>Oversampling     |                   Label encoding and oversampling                   |
|    Label Encoding<br>+<br>Undersampling     |                  Label encoding and undersampling                   |
|            One Hot Encoding (1)             |           One hot encoding for all columns except yes/no            |
|            One Hot Encoding (2)             |          One hot encoding for all columns including yes/no          |
|  One Hot Encoding (1)<br>+<br>Oversampling  |   One hot encoding for all columns except yes/no and oversampling   |
| One Hot Encoding (2) <br>+<br>Oversampling  | One hot encoding for all columns including yes/no and oversampling  |
| One Hot Encoding (1)<br>+<br>Undersampling  |  One hot encoding for all columns except yes/no and undersampling   |
| One Hot Encoding (2) <br>+<br>Undersampling | One hot encoding for all columns including yes/no and undersampling |

We start by fitting a K-nearest-neighbor classifier on the training data and evaluating it on the validation data using accuracy and F1 score.

We do this for every combination listed above using two nested for loops (one over the number of neighbors, one over the distance metric). The sections below are organized by preprocessing variant.

At the end we test the best approach against the actual test data.

# Label Encoding

In [11]:
# Train/validation split of the label-encoded BRFSS data
data_train, data_validation, target_train, target_validation = get_preprocessed_brfss_dataset_label_encoded_train_test_split()

# Distance metrics to compare
# NOTE(review): scikit-learn documents 'jaccard' as a metric for boolean vectors;
# if the label-encoded features are not 0/1 the distances may not mean what is
# intended — confirm (warnings are silenced in the import cell).
distance_metrics = ('euclidean', 'cosine', 'manhattan', 'jaccard')

# The flattened training labels are the same for every run, so build them once
train_labels = target_train.values.ravel()

# Try every k in 5..15 with every metric and score each model on the validation split
for k in range(5, 16):
    for metric_name in distance_metrics:
        knn_estimator = KNeighborsClassifier(n_neighbors=k, metric=metric_name)
        knn_estimator.fit(data_train, train_labels)
        validation_prediction = knn_estimator.predict(data_validation)

        validation_accuracy = accuracy_score(target_validation, validation_prediction)
        print("k= {} metric: {} -> acc: {}".format(k, metric_name, validation_accuracy))

        # Per-class precision / recall / F1
        validation_report = classification_report(target_validation, validation_prediction)
        print("\n" + validation_report)

k= 9 metric: euclidean -> acc: 0.8454262089267933

              precision    recall  f1-score   support

         1.0       0.44      0.19      0.27      5759
         2.0       0.00      0.00      0.00       393
         3.0       0.87      0.97      0.92     39277
         4.0       0.00      0.00      0.00       769

    accuracy                           0.85     46198
   macro avg       0.33      0.29      0.30     46198
weighted avg       0.79      0.85      0.81     46198

k= 9 metric: minkowski -> acc: 0.8454262089267933

              precision    recall  f1-score   support

         1.0       0.44      0.19      0.27      5759
         2.0       0.00      0.00      0.00       393
         3.0       0.87      0.97      0.92     39277
         4.0       0.00      0.00      0.00       769

    accuracy                           0.85     46198
   macro avg       0.33      0.29      0.30     46198
weighted avg       0.79      0.85      0.81     46198



# Label Encoding + Oversampling

In [None]:
# Train/validation split of the label-encoded BRFSS data (oversampled variant)
data_train, data_validation, target_train, target_validation = get_preprocessed_brfss_dataset_label_encoded_train_test_split_oversampled()

# Distance metrics to compare
# NOTE(review): scikit-learn documents 'jaccard' as a metric for boolean vectors;
# if the label-encoded features are not 0/1 the distances may not mean what is
# intended — confirm (warnings are silenced in the import cell).
distance_metrics = ('euclidean', 'cosine', 'manhattan', 'jaccard')

# The flattened training labels are the same for every run, so build them once
train_labels = target_train.values.ravel()

# Try every k in 5..15 with every metric and score each model on the validation split
for k in range(5, 16):
    for metric_name in distance_metrics:
        knn_estimator = KNeighborsClassifier(n_neighbors=k, metric=metric_name)
        knn_estimator.fit(data_train, train_labels)
        validation_prediction = knn_estimator.predict(data_validation)

        validation_accuracy = accuracy_score(target_validation, validation_prediction)
        print("k= {} metric: {} -> acc: {}".format(k, metric_name, validation_accuracy))

        # Per-class precision / recall / F1
        validation_report = classification_report(target_validation, validation_prediction)
        print("\n" + validation_report)

# Label Encoding + Undersampling

In [None]:
# Train/validation split of the label-encoded BRFSS data (undersampled variant)
data_train, data_validation, target_train, target_validation = get_preprocessed_brfss_dataset_label_encoded_train_test_split_undersampled()

# Distance metrics to compare
# NOTE(review): scikit-learn documents 'jaccard' as a metric for boolean vectors;
# if the label-encoded features are not 0/1 the distances may not mean what is
# intended — confirm (warnings are silenced in the import cell).
distance_metrics = ('euclidean', 'cosine', 'manhattan', 'jaccard')

# The flattened training labels are the same for every run, so build them once
train_labels = target_train.values.ravel()

# Try every k in 5..15 with every metric and score each model on the validation split
for k in range(5, 16):
    for metric_name in distance_metrics:
        knn_estimator = KNeighborsClassifier(n_neighbors=k, metric=metric_name)
        knn_estimator.fit(data_train, train_labels)
        validation_prediction = knn_estimator.predict(data_validation)

        validation_accuracy = accuracy_score(target_validation, validation_prediction)
        print("k= {} metric: {} -> acc: {}".format(k, metric_name, validation_accuracy))

        # Per-class precision / recall / F1
        validation_report = classification_report(target_validation, validation_prediction)
        print("\n" + validation_report)

# One Hot Encoding (1)
Yes/no columns are not one-hot encoded.

In [None]:
# One-hot encoded BRFSS data (yes/no columns left as they are)
# NOTE(review): unlike the other loaders used in this notebook, this function name
# has no `_train_test_split` suffix — confirm it really returns the four-way
# train/validation split that is unpacked here.
data_train, data_validation, target_train, target_validation = get_preprocessed_brfss_dataset_one_hot_encoded()

# Distance metrics to compare
distance_metrics = ('euclidean', 'cosine', 'manhattan', 'jaccard')

# The flattened training labels are the same for every run, so build them once
train_labels = target_train.values.ravel()

# Try every k in 5..15 with every metric and score each model on the validation split
for k in range(5, 16):
    for metric_name in distance_metrics:
        knn_estimator = KNeighborsClassifier(n_neighbors=k, metric=metric_name)
        knn_estimator.fit(data_train, train_labels)
        validation_prediction = knn_estimator.predict(data_validation)

        validation_accuracy = accuracy_score(target_validation, validation_prediction)
        print("k= {} metric: {} -> acc: {}".format(k, metric_name, validation_accuracy))

        # Per-class precision / recall / F1
        validation_report = classification_report(target_validation, validation_prediction)
        print("\n" + validation_report)

# One Hot Encoding (2)
All columns, including yes/no, are one-hot encoded.

In [None]:
# One-hot encoded BRFSS data (every column, including yes/no)
# NOTE(review): unlike the other loaders used in this notebook, this function name
# has no `_train_test_split` suffix — confirm it really returns the four-way
# train/validation split that is unpacked here.
data_train, data_validation, target_train, target_validation = get_preprocessed_brfss_dataset_one_hot_encoded_all_columns()

# Distance metrics to compare
distance_metrics = ('euclidean', 'cosine', 'manhattan', 'jaccard')

# The flattened training labels are the same for every run, so build them once
train_labels = target_train.values.ravel()

# Try every k in 5..15 with every metric and score each model on the validation split
for k in range(5, 16):
    for metric_name in distance_metrics:
        knn_estimator = KNeighborsClassifier(n_neighbors=k, metric=metric_name)
        knn_estimator.fit(data_train, train_labels)
        validation_prediction = knn_estimator.predict(data_validation)

        validation_accuracy = accuracy_score(target_validation, validation_prediction)
        print("k= {} metric: {} -> acc: {}".format(k, metric_name, validation_accuracy))

        # Per-class precision / recall / F1
        validation_report = classification_report(target_validation, validation_prediction)
        print("\n" + validation_report)

# One Hot Encoding (1) + Oversampling

In [None]:
# Train/validation split of the one-hot encoded BRFSS data (oversampled variant)
data_train, data_validation, target_train, target_validation = get_preprocessed_brfss_dataset_one_hot_encoded_train_test_split_oversampled()

# Distance metrics to compare
distance_metrics = ('euclidean', 'cosine', 'manhattan', 'jaccard')

# The flattened training labels are the same for every run, so build them once
train_labels = target_train.values.ravel()

# Try every k in 5..15 with every metric and score each model on the validation split
for k in range(5, 16):
    for metric_name in distance_metrics:
        knn_estimator = KNeighborsClassifier(n_neighbors=k, metric=metric_name)
        knn_estimator.fit(data_train, train_labels)
        validation_prediction = knn_estimator.predict(data_validation)

        validation_accuracy = accuracy_score(target_validation, validation_prediction)
        print("k= {} metric: {} -> acc: {}".format(k, metric_name, validation_accuracy))

        # Per-class precision / recall / F1
        validation_report = classification_report(target_validation, validation_prediction)
        print("\n" + validation_report)

# One Hot Encoding (2) + Oversampling

In [None]:
# Train/validation split of the fully one-hot encoded BRFSS data (oversampled variant)
data_train, data_validation, target_train, target_validation = get_preprocessed_brfss_dataset_one_hot_encoded_all_columns_train_test_split_oversampled()

# Distance metrics to compare
distance_metrics = ('euclidean', 'cosine', 'manhattan', 'jaccard')

# The flattened training labels are the same for every run, so build them once
train_labels = target_train.values.ravel()

# Try every k in 5..15 with every metric and score each model on the validation split
for k in range(5, 16):
    for metric_name in distance_metrics:
        knn_estimator = KNeighborsClassifier(n_neighbors=k, metric=metric_name)
        knn_estimator.fit(data_train, train_labels)
        validation_prediction = knn_estimator.predict(data_validation)

        validation_accuracy = accuracy_score(target_validation, validation_prediction)
        print("k= {} metric: {} -> acc: {}".format(k, metric_name, validation_accuracy))

        # Per-class precision / recall / F1
        validation_report = classification_report(target_validation, validation_prediction)
        print("\n" + validation_report)

# One Hot Encoding (1) + Undersampling

In [None]:
# Train/validation split of the one-hot encoded BRFSS data (undersampled variant)
data_train, data_validation, target_train, target_validation = get_preprocessed_brfss_dataset_one_hot_encoded_train_test_split_undersampled()

# Distance metrics to compare
distance_metrics = ('euclidean', 'cosine', 'manhattan', 'jaccard')

# The flattened training labels are the same for every run, so build them once
train_labels = target_train.values.ravel()

# Try every k in 5..15 with every metric and score each model on the validation split
for k in range(5, 16):
    for metric_name in distance_metrics:
        knn_estimator = KNeighborsClassifier(n_neighbors=k, metric=metric_name)
        knn_estimator.fit(data_train, train_labels)
        validation_prediction = knn_estimator.predict(data_validation)

        validation_accuracy = accuracy_score(target_validation, validation_prediction)
        print("k= {} metric: {} -> acc: {}".format(k, metric_name, validation_accuracy))

        # Per-class precision / recall / F1
        validation_report = classification_report(target_validation, validation_prediction)
        print("\n" + validation_report)

# One Hot Encoding (2) + Undersampling

In [None]:
# Train/validation split of the fully one-hot encoded BRFSS data (undersampled variant)
data_train, data_validation, target_train, target_validation = get_preprocessed_brfss_dataset_one_hot_encoded_all_columns_train_test_split_undersampled()

# Distance metrics to compare
distance_metrics = ('euclidean', 'cosine', 'manhattan', 'jaccard')

# The flattened training labels are the same for every run, so build them once
train_labels = target_train.values.ravel()

# Try every k in 5..15 with every metric and score each model on the validation split
for k in range(5, 16):
    for metric_name in distance_metrics:
        knn_estimator = KNeighborsClassifier(n_neighbors=k, metric=metric_name)
        knn_estimator.fit(data_train, train_labels)
        validation_prediction = knn_estimator.predict(data_validation)

        validation_accuracy = accuracy_score(target_validation, validation_prediction)
        print("k= {} metric: {} -> acc: {}".format(k, metric_name, validation_accuracy))

        # Per-class precision / recall / F1
        validation_report = classification_report(target_validation, validation_prediction)
        print("\n" + validation_report)

# Best Approach
tbd