# Nearest centroid classifier
In this notebook we train a nearest-centroid classifier (scikit-learn's `NearestCentroid`) to predict whether or not a patient has diabetes.

In [1]:
#imports
from preprocessing.preprocessing import *
from preprocessing.preprocessing_label_encoding import *
from preprocessing.preprocessing_one_hot_encoding import *

import pandas as pd

from sklearn.neighbors import NearestCentroid
from sklearn.metrics import *

import warnings
# NOTE(review): this suppresses every warning for the rest of the notebook,
# including deprecation notices from libraries — consider narrowing the filter
# to the specific warning categories that are known to be noise.
warnings.filterwarnings('ignore')



# Hyperparameter tuning and preprocessing
We will evaluate different parameters for the classifier (hyperparameter tuning) as well as different preprocessing steps.

Specifically, we vary the following classifier parameter:<br>


| **Parameter** |                      **Values**                       |
|:-------------:|:-----------------------------------------------------:|
| metric        | 'euclidean',<br>'cosine',<br>'manhattan',<br>'jaccard' |

And we will vary for preprocessing:

|              **Preprocessing**              |                           **Description**                           |
|:-------------------------------------------:|:-------------------------------------------------------------------:|
|               Label Encoding                |                           Label encoding                            |
|     Label Encoding<br>+<br>Oversampling     |                   Label encoding and oversampling                   |
|    Label Encoding<br>+<br>Undersampling     |                  Label encoding and undersampling                   |
|            One Hot Encoding (1)             |           One hot encoding for all columns except yes/no            |
|            One Hot Encoding (2)             |          One hot encoding for all columns including yes/no          |
|  One Hot Encoding (1)<br>+<br>Oversampling  |   One hot encoding for all columns except yes/no and oversampling   |
| One Hot Encoding (2) <br>+<br>Oversampling  | One hot encoding for all columns including yes/no and oversampling  |
| One Hot Encoding (1)<br>+<br>Undersampling  |  One hot encoding for all columns except yes/no and undersampling   |
| One Hot Encoding (2) <br>+<br>Undersampling | One hot encoding for all columns including yes/no and undersampling |

We start by fitting a nearest-centroid classifier on the training data and evaluating it on the validation data, using accuracy and F1 score as performance measures.

We do this for every combination listed above, looping over the metrics with a for loop. The sections below are organised by preprocessing variant.

At the end we test the best approach against the actual test data.

# Label Encoding

In [None]:
# Load the label-encoded train/validation split.
data_train, data_validation, target_train, target_validation = get_preprocessed_brfss_dataset_label_encoded_train_test_split()

# Distance metrics to compare.
params = ('euclidean', 'cosine', 'manhattan', 'jaccard')

for metric in params:
    nearest_centroid = NearestCentroid(metric=metric)
    try:
        nearest_centroid.fit(data_train, target_train.values.ravel())
    except ValueError as error:
        # Recent scikit-learn releases only accept 'euclidean' and 'manhattan';
        # report the rejected metric instead of aborting the remaining ones.
        print("metric: {} -> skipped ({})".format(metric, error))
        continue

    # Predictions are made on the validation split, not on the test data.
    validation_prediction = nearest_centroid.predict(data_validation)
    accuracy = accuracy_score(target_validation, validation_prediction)
    # F1 complements accuracy, as announced in the notebook introduction.
    # Macro averaging avoids assuming which label is the positive class.
    f1 = f1_score(target_validation, validation_prediction, average='macro')
    print("metric: {} -> acc: {}, f1: {}".format(metric, accuracy, f1))

# Label Encoding + Oversampling

In [None]:
# Load the label-encoded, oversampled train/validation split.
data_train, data_validation, target_train, target_validation = get_preprocessed_brfss_dataset_label_encoded_train_test_split_oversampled()

# Distance metrics to compare.
params = ('euclidean', 'cosine', 'manhattan', 'jaccard')

for metric in params:
    nearest_centroid = NearestCentroid(metric=metric)
    try:
        nearest_centroid.fit(data_train, target_train.values.ravel())
    except ValueError as error:
        # Recent scikit-learn releases only accept 'euclidean' and 'manhattan';
        # report the rejected metric instead of aborting the remaining ones.
        print("metric: {} -> skipped ({})".format(metric, error))
        continue

    # Predictions are made on the validation split, not on the test data.
    validation_prediction = nearest_centroid.predict(data_validation)
    accuracy = accuracy_score(target_validation, validation_prediction)
    # F1 complements accuracy, as announced in the notebook introduction.
    # Macro averaging avoids assuming which label is the positive class.
    f1 = f1_score(target_validation, validation_prediction, average='macro')
    print("metric: {} -> acc: {}, f1: {}".format(metric, accuracy, f1))

# Label Encoding + Undersampling

In [None]:
# Load the label-encoded, undersampled train/validation split.
data_train, data_validation, target_train, target_validation = get_preprocessed_brfss_dataset_label_encoded_train_test_split_undersampled()

# Distance metrics to compare.
params = ('euclidean', 'cosine', 'manhattan', 'jaccard')

for metric in params:
    nearest_centroid = NearestCentroid(metric=metric)
    try:
        nearest_centroid.fit(data_train, target_train.values.ravel())
    except ValueError as error:
        # Recent scikit-learn releases only accept 'euclidean' and 'manhattan';
        # report the rejected metric instead of aborting the remaining ones.
        print("metric: {} -> skipped ({})".format(metric, error))
        continue

    # Predictions are made on the validation split, not on the test data.
    validation_prediction = nearest_centroid.predict(data_validation)
    accuracy = accuracy_score(target_validation, validation_prediction)
    # F1 complements accuracy, as announced in the notebook introduction.
    # Macro averaging avoids assuming which label is the positive class.
    f1 = f1_score(target_validation, validation_prediction, average='macro')
    print("metric: {} -> acc: {}, f1: {}".format(metric, accuracy, f1))

# One Hot Encoding (1)
Yes/no columns are not one-hot encoded.

In [None]:
# Load the one-hot-encoded data (yes/no columns left as they are).
# NOTE(review): unlike the other loaders used in this notebook, this function
# name has no `_train_test_split` suffix — confirm it really returns the
# four-part train/validation split unpacked here.
data_train, data_validation, target_train, target_validation = get_preprocessed_brfss_dataset_one_hot_encoded()

# Distance metrics to compare.
params = ('euclidean', 'cosine', 'manhattan', 'jaccard')

for metric in params:
    nearest_centroid = NearestCentroid(metric=metric)
    try:
        nearest_centroid.fit(data_train, target_train.values.ravel())
    except ValueError as error:
        # Recent scikit-learn releases only accept 'euclidean' and 'manhattan';
        # report the rejected metric instead of aborting the remaining ones.
        print("metric: {} -> skipped ({})".format(metric, error))
        continue

    # Predictions are made on the validation split, not on the test data.
    validation_prediction = nearest_centroid.predict(data_validation)
    accuracy = accuracy_score(target_validation, validation_prediction)
    # F1 complements accuracy, as announced in the notebook introduction.
    # Macro averaging avoids assuming which label is the positive class.
    f1 = f1_score(target_validation, validation_prediction, average='macro')
    print("metric: {} -> acc: {}, f1: {}".format(metric, accuracy, f1))

# One Hot Encoding (2)
All columns, including the yes/no columns, are one-hot encoded.

In [None]:
# Load the data with every column one-hot encoded.
# NOTE(review): unlike the other loaders used in this notebook, this function
# name has no `_train_test_split` suffix — confirm it really returns the
# four-part train/validation split unpacked here.
data_train, data_validation, target_train, target_validation = get_preprocessed_brfss_dataset_one_hot_encoded_all_columns()

# Distance metrics to compare.
params = ('euclidean', 'cosine', 'manhattan', 'jaccard')

for metric in params:
    nearest_centroid = NearestCentroid(metric=metric)
    try:
        nearest_centroid.fit(data_train, target_train.values.ravel())
    except ValueError as error:
        # Recent scikit-learn releases only accept 'euclidean' and 'manhattan';
        # report the rejected metric instead of aborting the remaining ones.
        print("metric: {} -> skipped ({})".format(metric, error))
        continue

    # Predictions are made on the validation split, not on the test data.
    validation_prediction = nearest_centroid.predict(data_validation)
    accuracy = accuracy_score(target_validation, validation_prediction)
    # F1 complements accuracy, as announced in the notebook introduction.
    # Macro averaging avoids assuming which label is the positive class.
    f1 = f1_score(target_validation, validation_prediction, average='macro')
    print("metric: {} -> acc: {}, f1: {}".format(metric, accuracy, f1))

# One Hot Encoding (1) + Oversampling

In [None]:
# Load the one-hot-encoded (yes/no columns excluded), oversampled split.
data_train, data_validation, target_train, target_validation = get_preprocessed_brfss_dataset_one_hot_encoded_train_test_split_oversampled()

# Distance metrics to compare.
params = ('euclidean', 'cosine', 'manhattan', 'jaccard')

for metric in params:
    nearest_centroid = NearestCentroid(metric=metric)
    try:
        nearest_centroid.fit(data_train, target_train.values.ravel())
    except ValueError as error:
        # Recent scikit-learn releases only accept 'euclidean' and 'manhattan';
        # report the rejected metric instead of aborting the remaining ones.
        print("metric: {} -> skipped ({})".format(metric, error))
        continue

    # Predictions are made on the validation split, not on the test data.
    validation_prediction = nearest_centroid.predict(data_validation)
    accuracy = accuracy_score(target_validation, validation_prediction)
    # F1 complements accuracy, as announced in the notebook introduction.
    # Macro averaging avoids assuming which label is the positive class.
    f1 = f1_score(target_validation, validation_prediction, average='macro')
    print("metric: {} -> acc: {}, f1: {}".format(metric, accuracy, f1))

# One Hot Encoding (2) + Oversampling

In [None]:
# Load the fully one-hot-encoded, oversampled train/validation split.
data_train, data_validation, target_train, target_validation = get_preprocessed_brfss_dataset_one_hot_encoded_all_columns_train_test_split_oversampled()

# Distance metrics to compare.
params = ('euclidean', 'cosine', 'manhattan', 'jaccard')

for metric in params:
    nearest_centroid = NearestCentroid(metric=metric)
    try:
        nearest_centroid.fit(data_train, target_train.values.ravel())
    except ValueError as error:
        # Recent scikit-learn releases only accept 'euclidean' and 'manhattan';
        # report the rejected metric instead of aborting the remaining ones.
        print("metric: {} -> skipped ({})".format(metric, error))
        continue

    # Predictions are made on the validation split, not on the test data.
    validation_prediction = nearest_centroid.predict(data_validation)
    accuracy = accuracy_score(target_validation, validation_prediction)
    # F1 complements accuracy, as announced in the notebook introduction.
    # Macro averaging avoids assuming which label is the positive class.
    f1 = f1_score(target_validation, validation_prediction, average='macro')
    print("metric: {} -> acc: {}, f1: {}".format(metric, accuracy, f1))

# One Hot Encoding (1) + Undersampling

In [None]:
# Load the one-hot-encoded (yes/no columns excluded), undersampled split.
data_train, data_validation, target_train, target_validation = get_preprocessed_brfss_dataset_one_hot_encoded_train_test_split_undersampled()

# Distance metrics to compare.
params = ('euclidean', 'cosine', 'manhattan', 'jaccard')

for metric in params:
    nearest_centroid = NearestCentroid(metric=metric)
    try:
        nearest_centroid.fit(data_train, target_train.values.ravel())
    except ValueError as error:
        # Recent scikit-learn releases only accept 'euclidean' and 'manhattan';
        # report the rejected metric instead of aborting the remaining ones.
        print("metric: {} -> skipped ({})".format(metric, error))
        continue

    # Predictions are made on the validation split, not on the test data.
    validation_prediction = nearest_centroid.predict(data_validation)
    accuracy = accuracy_score(target_validation, validation_prediction)
    # F1 complements accuracy, as announced in the notebook introduction.
    # Macro averaging avoids assuming which label is the positive class.
    f1 = f1_score(target_validation, validation_prediction, average='macro')
    print("metric: {} -> acc: {}, f1: {}".format(metric, accuracy, f1))

# One Hot Encoding (2) + Undersampling

In [None]:
# Load the fully one-hot-encoded, undersampled train/validation split.
data_train, data_validation, target_train, target_validation = get_preprocessed_brfss_dataset_one_hot_encoded_all_columns_train_test_split_undersampled()

# Distance metrics to compare.
params = ('euclidean', 'cosine', 'manhattan', 'jaccard')

for metric in params:
    nearest_centroid = NearestCentroid(metric=metric)
    try:
        nearest_centroid.fit(data_train, target_train.values.ravel())
    except ValueError as error:
        # Recent scikit-learn releases only accept 'euclidean' and 'manhattan';
        # report the rejected metric instead of aborting the remaining ones.
        print("metric: {} -> skipped ({})".format(metric, error))
        continue

    # Predictions are made on the validation split, not on the test data.
    validation_prediction = nearest_centroid.predict(data_validation)
    accuracy = accuracy_score(target_validation, validation_prediction)
    # F1 complements accuracy, as announced in the notebook introduction.
    # Macro averaging avoids assuming which label is the positive class.
    f1 = f1_score(target_validation, validation_prediction, average='macro')
    print("metric: {} -> acc: {}, f1: {}".format(metric, accuracy, f1))

# Best Approach
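**TODO:** evaluate the best-performing combination of metric and preprocessing on the held-out test data.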
