Introduction
data description (where from, what cols)
goal/hypothesis

In [1]:
import pandas as pd 
import numpy as np 
from matplotlib import pyplot
import seaborn as sns 

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, f1_score 
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectFpr, chi2

import pickle 

### BINARY ANALYSIS 
Loading data

In [None]:
# Read the binary-target health-indicator CSV (path is relative to the
# notebook's working directory) into a DataFrame.
diabetes_bin = pd.read_csv(
    filepath_or_buffer='diabetes_binary_health_indicators_BRFSS2015.csv'
)

data exploration
correlation

In [None]:
# Print the correlation between the `Diabetes_binary` column and every column
# after the first one (Series.corr uses Pearson correlation by default).
# assumes the first column is the target itself, hence the [1:] slice — TODO confirm
cols = diabetes_bin.columns.tolist()
target_bin = diabetes_bin["Diabetes_binary"]
for col in cols[1:]:
    corr_value = target_bin.corr(diabetes_bin[col])
    print(f' correlation for {col} : {corr_value}')

 
# Split the data first, then fit feature selection on the training split only


In [None]:

# Hold out 20% of the rows as a test set. The split is shuffled with a fixed
# seed and stratified on the target column.
# assumes the target is the first column, so iloc[:, 1:] is the feature matrix — TODO confirm
features_bin = diabetes_bin.iloc[:, 1:]
labels_bin = diabetes_bin['Diabetes_binary']
X_train_bin, X_test_bin, y_train_bin, y_test_bin = train_test_split(
    features_bin,
    labels_bin,
    test_size=0.20,
    random_state=777,
    shuffle=True,
    stratify=labels_bin,
)

feature selection
# Create a SelectFpr selector (chi2 test) and fit-transform it on the training data

In [None]:
# Fit the chi2-based SelectFpr filter on the training split only, and keep the
# fitted selector so the same column subset can be applied to the test split.
# Fix: the selector used to be discarded after fit_transform, which left
# X_test_bin with the original columns while the models below are trained on
# the reduced training matrix.
# Note: both names are overwritten in place, so re-running this cell without
# re-running the split cell would select from already-selected data.
selector_bin = SelectFpr(chi2, alpha=0.01).fit(X_train_bin, y_train_bin)
X_train_bin = selector_bin.transform(X_train_bin)
X_test_bin = selector_bin.transform(X_test_bin)

# Random Forest (random_state = 777, oob_score = True)
# grid search (n_jobs = 4, cv = 5, return_train_score = True, scoring = ['f1_weighted','precision_weighted','recall_weighted','roc_auc'], refit = 'f1_weighted')

In [None]:
# Random-forest hyperparameter grid. `params_rf` is also used by the
# multiclass random-forest search further down, so the name must stay.
params_rf = {
    'n_estimators': np.arange(1000, 4000, 1000),  # 1000, 2000, 3000
    'max_depth': [13, 14, 15, 16, None],
    'max_features': ['sqrt', 0.5],
}
clf_rf_bin = RandomForestClassifier(random_state=777, oob_score=True)

# 5-fold grid search scored by weighted F1; the best configuration is refit
# on the full training matrix because refit names that metric.
gcv_rf_bin = GridSearchCV(
    clf_rf_bin,
    param_grid=params_rf,
    n_jobs=4,
    cv=5,
    return_train_score=True,
    scoring=['f1_weighted'],
    refit='f1_weighted',
)
gcv_rf_bin.fit(X_train_bin, y_train_bin)

# Report the mean CV scores at the winning grid point.
rf_bin_results = gcv_rf_bin.cv_results_
rf_bin_best = gcv_rf_bin.best_index_
print(f'best validation F1 score: {rf_bin_results["mean_test_f1_weighted"][rf_bin_best]}')
print(f'best training F1 score: {rf_bin_results["mean_train_f1_weighted"][rf_bin_best]}')
print(f'best hyperparameters: {gcv_rf_bin.best_params_}')

Confusion matrix on the training set

In [None]:
# Confusion matrix of the refit best random forest on the training data,
# drawn as an annotated heatmap (rows: true class, columns: predicted class).
rf_bin_model = gcv_rf_bin.best_estimator_
y_pred_rf_bin = rf_bin_model.predict(X_train_bin)
conf_mat_rf_bin = confusion_matrix(y_train_bin, y_pred_rf_bin)
axes = sns.heatmap(conf_mat_rf_bin, annot=True, fmt='d', cmap="crest", cbar=False)
axes.set_xlabel('Predicted')
axes.set_ylabel('True')

Hist gradient Boosting

In [None]:
# Hyperparameter grid for histogram gradient boosting. `params_hgb` is also
# used by the multiclass search further down, so the name must stay.
params_hgb = {
    'max_iter': np.arange(1000, 4000, 1000),  # 1000, 2000, 3000
    'max_depth': [13, 14, 15, 16, None],
    'min_samples_leaf': [40, 60, 80, 100],
    'learning_rate': [0.001, 0.01, 0.1],
}

# NOTE(review): categorical_features lists column *positions*. X_train_bin at
# this point is the output of the SelectFpr cell, so confirm these positions
# still refer to the intended columns after selection.
hgb_bin_categorical = [0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
clf_hgb_bin = HistGradientBoostingClassifier(
    random_state=777,
    categorical_features=hgb_bin_categorical,
)

# 5-fold grid search scored by weighted F1, refit on the best configuration.
gcv_hgb_bin = GridSearchCV(
    clf_hgb_bin,
    param_grid=params_hgb,
    n_jobs=4,
    cv=5,
    return_train_score=True,
    scoring=['f1_weighted'],
    refit='f1_weighted',
)
gcv_hgb_bin.fit(X_train_bin, y_train_bin)

# Report the mean CV scores at the winning grid point.
hgb_bin_results = gcv_hgb_bin.cv_results_
hgb_bin_best = gcv_hgb_bin.best_index_
print(f'best validation F1 score: {hgb_bin_results["mean_test_f1_weighted"][hgb_bin_best]}')
print(f'best training F1 score: {hgb_bin_results["mean_train_f1_weighted"][hgb_bin_best]}')
print(f'best hyperparameters: {gcv_hgb_bin.best_params_}')

In [None]:
# Confusion matrix of the refit best gradient-boosting model on the training
# data, drawn as an annotated heatmap (rows: true class, columns: predicted).
hgb_bin_model = gcv_hgb_bin.best_estimator_
y_pred_hgb_bin = hgb_bin_model.predict(X_train_bin)
conf_mat_hgb_bin = confusion_matrix(y_train_bin, y_pred_hgb_bin)
axes = sns.heatmap(conf_mat_hgb_bin, annot=True, fmt='d', cmap="crest", cbar=False)
axes.set_xlabel('Predicted')
axes.set_ylabel('True')

In [None]:
# Score the refit best gradient-boosting model on the held-out test split:
# draw the confusion-matrix heatmap, then compute the weighted F1 score
# (the last expression, so it is the cell's displayed value).
hgb_bin_final = gcv_hgb_bin.best_estimator_
y_pred_test_bin = hgb_bin_final.predict(X_test_bin)
conf_mat_test_bin = confusion_matrix(y_test_bin, y_pred_test_bin)
axes = sns.heatmap(conf_mat_test_bin, annot=True, fmt='d', cmap="crest", cbar=False)
axes.set_xlabel('Predicted')
axes.set_ylabel('True')

f1_score(y_true=y_test_bin, y_pred=y_pred_test_bin, average='weighted')

Multiclass 

In [None]:
# Read the three-class (`Diabetes_012`) health-indicator CSV from the working
# directory and preview its first five rows.
diabetes_mc = pd.read_csv(
    filepath_or_buffer='diabetes_012_health_indicators_BRFSS2015.csv'
)
diabetes_mc.head(n=5)

Splitting data

In [None]:
# Hold out 20% of the rows as a test set. The split is shuffled with a fixed
# seed and stratified on the target column.
# assumes the target is the first column, so iloc[:, 1:] is the feature matrix — TODO confirm
features_mc = diabetes_mc.iloc[:, 1:]
labels_mc = diabetes_mc['Diabetes_012']
X_train_mc, X_test_mc, y_train_mc, y_test_mc = train_test_split(
    features_mc,
    labels_mc,
    test_size=0.20,
    random_state=777,
    shuffle=True,
    stratify=labels_mc,
)

feature selection

In [None]:
# Fit the chi2-based SelectFpr filter on the training split only, and keep the
# fitted selector so the same column subset can be applied to the test split.
# Fix: the selector used to be discarded after fit_transform, which left
# X_test_mc with the original columns while the models below are trained on
# the reduced training matrix.
# Note: both names are overwritten in place, so re-running this cell without
# re-running the split cell would select from already-selected data.
selector_mc = SelectFpr(chi2, alpha=0.01).fit(X_train_mc, y_train_mc)
X_train_mc = selector_mc.transform(X_train_mc)
X_test_mc = selector_mc.transform(X_test_mc)

Random Forest

In [None]:
# Multiclass random forest: 5-fold grid search over `params_rf` (defined in
# the binary random-forest cell), scored by weighted F1 and refit on the best
# configuration.
clf_rf_mc = RandomForestClassifier(random_state=777, oob_score=True)
gcv_rf_mc = GridSearchCV(
    clf_rf_mc,
    param_grid=params_rf,
    n_jobs=4,
    cv=5,
    return_train_score=True,
    scoring=['f1_weighted'],
    refit='f1_weighted',
)
gcv_rf_mc.fit(X_train_mc, y_train_mc)

# Report the mean CV scores at the winning grid point.
rf_mc_results = gcv_rf_mc.cv_results_
rf_mc_best = gcv_rf_mc.best_index_
print(f'best validation F1 score: {rf_mc_results["mean_test_f1_weighted"][rf_mc_best]}')
print(f'best training F1 score: {rf_mc_results["mean_train_f1_weighted"][rf_mc_best]}')
print(f'best hyperparameters: {gcv_rf_mc.best_params_}')

In [None]:
# Confusion matrix of the refit best multiclass random forest on the training
# data, drawn as an annotated heatmap (rows: true class, columns: predicted).
rf_mc_model = gcv_rf_mc.best_estimator_
y_pred_rf_mc = rf_mc_model.predict(X_train_mc)
conf_mat_rf_mc = confusion_matrix(y_train_mc, y_pred_rf_mc)
axes = sns.heatmap(conf_mat_rf_mc, annot=True, fmt='d', cmap="crest", cbar=False)
axes.set_xlabel('Predicted')
axes.set_ylabel('True')

Hist Gradient Boost

In [None]:
# NOTE(review): categorical_features lists column *positions*. X_train_mc at
# this point is the output of the SelectFpr cell, so confirm these positions
# still refer to the intended columns after selection.
hgb_mc_categorical = [0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
clf_hgb_mc = HistGradientBoostingClassifier(
    random_state=777,
    categorical_features=hgb_mc_categorical,
)

# 5-fold grid search over `params_hgb` (defined in the binary gradient-boosting
# cell), scored by weighted F1 and refit on the best configuration.
gcv_hgb_mc = GridSearchCV(
    clf_hgb_mc,
    param_grid=params_hgb,
    n_jobs=4,
    cv=5,
    return_train_score=True,
    scoring=['f1_weighted'],
    refit='f1_weighted',
)
gcv_hgb_mc.fit(X_train_mc, y_train_mc)

# Report the mean CV scores at the winning grid point.
hgb_mc_results = gcv_hgb_mc.cv_results_
hgb_mc_best = gcv_hgb_mc.best_index_
print(f'best validation F1 score: {hgb_mc_results["mean_test_f1_weighted"][hgb_mc_best]}')
print(f'best training F1 score: {hgb_mc_results["mean_train_f1_weighted"][hgb_mc_best]}')
print(f'best hyperparameters: {gcv_hgb_mc.best_params_}')

In [None]:
# Confusion matrix of the refit best multiclass gradient-boosting model on the
# training data, drawn as an annotated heatmap (rows: true, columns: predicted).
hgb_mc_model = gcv_hgb_mc.best_estimator_
y_pred_hgb_mc = hgb_mc_model.predict(X_train_mc)
conf_mat_hgb_mc = confusion_matrix(y_train_mc, y_pred_hgb_mc)
axes = sns.heatmap(conf_mat_hgb_mc, annot=True, fmt='d', cmap="crest", cbar=False)
axes.set_xlabel('Predicted')
axes.set_ylabel('True')

# Predict on the test data, then compute the weighted F1 score and plot the confusion matrix

In [None]:
# Score the refit best multiclass gradient-boosting model on the held-out test
# split: draw the confusion-matrix heatmap, then compute the weighted F1 score
# (the last expression, so it is the cell's displayed value).
hgb_mc_final = gcv_hgb_mc.best_estimator_
y_pred_test_mc = hgb_mc_final.predict(X_test_mc)
conf_mat_test_mc = confusion_matrix(y_test_mc, y_pred_test_mc)
axes = sns.heatmap(conf_mat_test_mc, annot=True, fmt='d', cmap="crest", cbar=False)
axes.set_xlabel('Predicted')
axes.set_ylabel('True')

f1_score(y_true=y_test_mc, y_pred=y_pred_test_mc, average='weighted')