In [1]:
# Data import
from preprocessing.preprocessing import download_brfss_dataset
#download_brfss_dataset("username", "token") -> insert kaggle username and api-token
#imports
from preprocessing.preprocessing import get_preprocessed_brfss_train_test_split_one_hot_encoded, get_preprocessed_brfss_train_test_split, get_preprocessed_brfss_dataset

import pandas as pd
from sklearn.neighbors import NearestCentroid
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV

   GenHealth  PhysHealth  MentHealth  Healthcare  MedCost  Checkup  HighBP  \
0        5.0        15.0        18.0         1.0      2.0      1.0     1.0   
1        3.0        88.0        88.0         2.0      1.0      4.0     3.0   
2        4.0        15.0        88.0         1.0      2.0      1.0     3.0   
3        5.0        30.0        30.0         1.0      1.0      1.0     1.0   
4        5.0        20.0        88.0         1.0      2.0      1.0     3.0   

   HighChol  HeartAttack  AngiCoro  Stroke  Asthma  Arthritis  Kidney  Sex  \
0       1.0          2.0       2.0     2.0     1.0        1.0     2.0  2.0   
1       2.0          2.0       2.0     2.0     2.0        2.0     2.0  2.0   
2       1.0          7.0       2.0     1.0     2.0        1.0     2.0  2.0   
3       1.0          2.0       2.0     2.0     2.0        1.0     2.0  2.0   
4       2.0          2.0       2.0     2.0     2.0        1.0     2.0  2.0   

   Income  SodiumSalt   Age  Height   Weight  BMI  Education  

We start by loading the already preprocessed data, which is split into a training and a test set, each separated into feature data and target values.

In [2]:
# Load the preprocessed BRFSS split: feature frames and target frames for train and test.
(
    data_train,
    data_test,
    target_train,
    target_test,
) = get_preprocessed_brfss_train_test_split()

# Preview the first rows of the training target (rendered as the cell output).
target_train.head()

Unnamed: 0,DIABETE3
434107,3.0
110299,3.0
218193,3.0
394642,3.0
354955,3.0


With the data at hand, we create a nearest centroid estimator and train it on the training data to see how it performs.

In [3]:
# Baseline: a NearestCentroid classifier with default parameters, fitted on the training split.
# The target is flattened with ravel() before being handed to fit().
nearest_centroid = NearestCentroid().fit(data_train, target_train.values.ravel())

# Score the baseline on the held-out test split.
predictions = nearest_centroid.predict(data_test)
print(f"nearest_centroid: acc: {accuracy_score(target_test, predictions)}")

nearest_centroid: acc: 0.15057197870653527


The accuracy score is really low. To see whether the estimator performs better with a different distance metric, we run a cross-validated grid search over several metrics.

In [4]:
# Candidate distance metrics to compare for the nearest centroid classifier.
# NOTE(review): recent scikit-learn releases may only accept 'euclidean' and
# 'manhattan' for NearestCentroid - confirm against the installed version.
params = {
    'metric': ('euclidean', 'minkowski', 'cosine', 'sqeuclidean', 'manhattan')
}

# 5-fold cross-validated grid search, scored by accuracy. Train scores are not
# computed (return_train_score=False).
grid_search_estimator = GridSearchCV(nearest_centroid, params, scoring='accuracy', cv=5, return_train_score=False)
grid_search_estimator.fit(data_train, target_train.values.ravel())

# Tabulate the cross-validation results (timings, per-fold scores, mean/std, rank).
results = pd.DataFrame(grid_search_estimator.cv_results_)
display(results)

print(f"best score is {grid_search_estimator.best_score_} with params {grid_search_estimator.best_params_}")



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_metric,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.058831,0.00498,0.011605,0.000648,euclidean,{'metric': 'euclidean'},0.146888,0.14921,0.147766,0.146449,0.149495,0.147962,0.001216,2
1,0.052294,0.00392,0.014411,0.000776,minkowski,{'metric': 'minkowski'},0.146888,0.14921,0.147766,0.146449,0.149495,0.147962,0.001216,2
2,0.050737,0.001637,0.01191,0.00049,cosine,{'metric': 'cosine'},0.370823,0.362782,0.36768,0.372253,0.366213,0.36795,0.003364,1
3,0.050477,0.00097,0.013757,0.000381,sqeuclidean,{'metric': 'sqeuclidean'},0.146888,0.14921,0.147766,0.146449,0.149495,0.147962,0.001216,2
4,0.18025,0.005372,0.011906,0.000155,manhattan,{'metric': 'manhattan'},0.132178,0.108633,0.109384,0.111847,0.109754,0.114359,0.008973,5


best score is 0.3679504083507075 with params {'metric': 'cosine'}


We can see that the cosine metric performs better, but the accuracy is still too low.

Therefore, we try the same with a balanced data set obtained by applying oversampling.

In [5]:
# Reload the split, this time passing True to the loader.
# NOTE(review): per the surrounding narrative, True requests the oversampled
# (balanced) variant - confirm against the preprocessing module.
# The names from the first load are rebound here, so the unbalanced split is
# no longer available after this cell runs.
(
    data_train,
    data_test,
    target_train,
    target_test,
) = get_preprocessed_brfss_train_test_split(True)

In [6]:
# Candidate distance metrics to compare for the nearest centroid classifier.
# NOTE(review): recent scikit-learn releases may only accept 'euclidean' and
# 'manhattan' for NearestCentroid - confirm against the installed version.
params = {
    'metric': ('euclidean', 'minkowski', 'cosine', 'sqeuclidean', 'manhattan')
}

# 5-fold cross-validated grid search, scored by accuracy, on whatever
# data_train / target_train currently hold.
# NOTE(review): if the training data was oversampled before this point, copies
# of the same row may land in both the fitting and validation folds - confirm
# how get_preprocessed_brfss_train_test_split(True) balances the data.
grid_search_estimator = GridSearchCV(nearest_centroid, params, scoring='accuracy', cv=5, return_train_score=False)
grid_search_estimator.fit(data_train, target_train.values.ravel())

# Tabulate the cross-validation results (timings, per-fold scores, mean/std, rank).
results = pd.DataFrame(grid_search_estimator.cv_results_)
display(results)

print(f"best score is {grid_search_estimator.best_score_} with params {grid_search_estimator.best_params_}")



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_metric,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.409747,0.009358,0.051765,0.000694,euclidean,{'metric': 'euclidean'},0.289597,0.28894,0.2899,0.28934,0.28901,0.289357,0.00036,3
1,0.397329,0.002604,0.069561,0.00114,minkowski,{'metric': 'minkowski'},0.289597,0.28894,0.2899,0.28934,0.28901,0.289357,0.00036,3
2,0.406972,0.006148,0.060112,0.006748,cosine,{'metric': 'cosine'},0.306257,0.307463,0.30566,0.307682,0.307229,0.306858,0.000772,1
3,0.434645,0.038477,0.070327,0.003606,sqeuclidean,{'metric': 'sqeuclidean'},0.289597,0.28894,0.2899,0.28934,0.28901,0.289357,0.00036,3
4,1.128352,0.012005,0.059367,0.000574,manhattan,{'metric': 'manhattan'},0.292831,0.293368,0.293407,0.293906,0.295337,0.29377,0.000854,2


best score is 0.3068581306919539 with params {'metric': 'cosine'}


We see that oversampling does not lead to better accuracy: the best cross-validated score drops from about 0.37 to about 0.31.