In [1]:
# Data import
from preprocessing.preprocessing import download_brfss_dataset
# download_brfss_dataset("username", "token")  # uncomment and insert your Kaggle username and API token (do not commit real credentials)
#imports
from preprocessing.preprocessing import get_preprocessed_brfss_train_test_split_one_hot_encoded, get_preprocessed_brfss_train_test_split, get_preprocessed_brfss_dataset

import pandas as pd
from sklearn.neighbors import NearestCentroid
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV

We start by loading the already preprocessed data, which is split into a training and a test set, each consisting of feature data and target values.

In [2]:
# Load the preprocessed BRFSS split: feature data and target values for the
# training and the test set.
(
    data_train,
    data_test,
    target_train,
    target_test,
) = get_preprocessed_brfss_train_test_split()

# Show the first rows of the training targets as the cell output.
target_train.head()

Unnamed: 0,DIABETE3
434107,3.0
110299,3.0
218193,3.0
394642,3.0
354955,3.0


With the data at hand, we create a nearest centroid estimator and train it on the training data to see how it performs on the test data.

In [3]:
# Baseline: nearest centroid classifier with its default settings.
nearest_centroid = NearestCentroid()

# ravel() flattens the target values into the 1-D label array handed to fit().
train_labels = target_train.values.ravel()
nearest_centroid.fit(data_train, train_labels)

# Score the fitted estimator on the held-out test split.
predictions = nearest_centroid.predict(data_test)
test_accuracy = accuracy_score(target_test, predictions)
print("nearest_centroid: acc: {}".format(test_accuracy))

nearest_centroid: acc: 0.3455770755464945


The accuracy score is very low. To see whether the estimator performs better with a different distance metric, we run a cross-validated grid search over several metrics.

In [4]:
# Cross-validated grid search over the distance metric of the nearest centroid
# classifier; every candidate metric is evaluated on the training data only.
params = {
    'metric': ('euclidean', 'minkowski', 'cosine', 'sqeuclidean', 'manhattan')
}

# 5-fold cross-validation scored by accuracy; scores on the training folds are
# not recorded (return_train_score=False).
grid_search_estimator = GridSearchCV(
    nearest_centroid,
    params,
    scoring='accuracy',
    cv=5,
    return_train_score=False,
)
grid_search_estimator.fit(data_train, target_train.values.ravel())

# One row per candidate metric with timings, per-fold and mean test scores.
results = pd.DataFrame(grid_search_estimator.cv_results_)
display(results)

print("best score is {} with params {}".format(grid_search_estimator.best_score_, grid_search_estimator.best_params_))



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_metric,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.648981,0.025958,0.051737,0.00632,euclidean,{'metric': 'euclidean'},0.249844,0.28354,0.342394,0.376133,0.34734,0.31985,0.046154,2
1,0.635718,0.026952,0.126896,0.001323,minkowski,{'metric': 'minkowski'},0.249844,0.28354,0.342394,0.376133,0.34734,0.31985,0.046154,2
2,0.652428,0.042118,0.121776,0.007259,cosine,{'metric': 'cosine'},0.437281,0.431504,0.4414,0.423916,0.431298,0.43308,0.005942,1
3,0.639262,0.027631,0.12758,0.003676,sqeuclidean,{'metric': 'sqeuclidean'},0.249844,0.28354,0.342394,0.376133,0.34734,0.31985,0.046154,2
4,3.131539,0.0287,0.108409,0.011255,manhattan,{'metric': 'manhattan'},0.074456,0.080983,0.076084,0.0775,0.072093,0.076223,0.002983,5


best score is 0.4330797125888323 with params {'metric': 'cosine'}


We can see that the cosine metric performs best (mean accuracy of about 0.43), but the accuracy is still too low.

Therefore, we try the same with a balanced data set obtained by oversampling.

In [5]:
# Reload the split with the flag set to True; per the accompanying text this
# yields the oversampled (balanced) data. Note that this rebinds the four
# variables that previously held the unbalanced split.
(
    data_train,
    data_test,
    target_train,
    target_test,
) = get_preprocessed_brfss_train_test_split(True)

In [6]:
# Repeat the cross-validated metric grid search, this time on the training
# data loaded in the previous cell (the oversampled split).
# NOTE(review): if the balancing duplicates rows before this CV split, copies
# of the same sample may land in both the training and the validation folds --
# confirm how get_preprocessed_brfss_train_test_split(True) balances the data.
params = {
    'metric': ('euclidean', 'minkowski', 'cosine', 'sqeuclidean', 'manhattan')
}

# 5-fold cross-validation scored by accuracy; scores on the training folds are
# not recorded (return_train_score=False).
grid_search_estimator = GridSearchCV(
    nearest_centroid,
    params,
    scoring='accuracy',
    cv=5,
    return_train_score=False,
)
grid_search_estimator.fit(data_train, target_train.values.ravel())

# One row per candidate metric with timings, per-fold and mean test scores.
results = pd.DataFrame(grid_search_estimator.cv_results_)
display(results)

print("best score is {} with params {}".format(grid_search_estimator.best_score_, grid_search_estimator.best_params_))



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_metric,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,6.271346,0.248693,0.443944,0.018117,euclidean,{'metric': 'euclidean'},0.258485,0.260951,0.259118,0.26164,0.259958,0.26003,0.001155,1
1,5.834755,0.113323,0.807827,0.014279,minkowski,{'metric': 'minkowski'},0.258485,0.260951,0.259118,0.26164,0.259958,0.26003,0.001155,1
2,6.201809,0.278082,0.899676,0.04264,cosine,{'metric': 'cosine'},0.258857,0.258622,0.259482,0.258286,0.259409,0.258931,0.000458,4
3,6.756401,0.141814,0.825689,0.022557,sqeuclidean,{'metric': 'sqeuclidean'},0.258485,0.260951,0.259118,0.26164,0.259958,0.26003,0.001155,1
4,20.894546,0.172316,0.675569,0.008895,manhattan,{'metric': 'manhattan'},0.227888,0.227014,0.227809,0.22737,0.226077,0.227232,0.000658,5


best score is 0.26003037901471193 with params {'metric': 'euclidean'}


We see that oversampling does not lead to better accuracy: the best mean cross-validation score drops to about 0.26 (euclidean metric).