In [None]:
# This notebook contains the script that builds a simplified ML model, intended for deployment in a more user-friendly web app

# 0.1 Imports

In [1]:
import pandas as pd
import pickle
import random
import warnings

from imblearn.over_sampling import SMOTE


from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import KFold,cross_val_score, RepeatedStratifiedKFold,StratifiedKFold, StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler
from time import time


# Silence every warning category for the rest of the session.
# NOTE(review): this also hides any deprecation warnings emitted by the
# libraries used in this notebook.
warnings.simplefilter("ignore")

# 0.2 Functions

In [2]:
def compare_models(model, X_train, y_train, X_test, y_test, **keyargs):
    """Fit ``model`` on the training data and print train/test metrics.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` (returning the fitted estimator)
        and ``predict(X)``.
    X_train, y_train
        Features and labels passed to ``fit`` and used for the train score.
    X_test, y_test
        Features and labels used for the test score, the confusion matrix
        and the classification report.
    **keyargs
        Accepted so existing call sites keep working; currently unused.

    Returns
    -------
    None
        All results are printed; nothing is returned.
    """
    fitted = model.fit(X_train, y_train)

    test_pred = fitted.predict(X_test)
    test_accuracy = accuracy_score(y_test, test_pred) * 100

    print("MODEL PERFORMANCE:")
    print("train accuracy score: ", accuracy_score(y_train, fitted.predict(X_train)) * 100)
    print("test accuracy score: ", test_accuracy)

    print(f"\n Confusion Matrix : \n {confusion_matrix(y_test, test_pred)} \n")
    # Label spelling fixed (was "Classiication").
    print(f"Classification Report : \n {classification_report(y_test, test_pred)}")

    return None

# 0.3 Load data

In [3]:
# Read the raw survey CSV from the local Dataset folder (path is relative to
# the notebook's working directory).
df_raw = pd.read_csv(
    './Dataset/diabetes_binary_health_indicators_BRFSS2015.csv',
)

# 2 Feature engineering

Dropping features that have low relevance and are hard to adapt to other countries

In [4]:
# Remove the columns that the simplified model does not use.
# NOTE: this rebinds df_raw, so re-running the cell without reloading the data
# raises KeyError because the columns are already gone.
df_raw = df_raw.drop(
    columns=[
        'Income',
        'Education',
        'NoDocbcCost',
        'AnyHealthcare',
        'CholCheck',
        'MentHlth',
        'HeartDiseaseorAttack',
    ]
)

Joining features with high correlation to simplify the dashboard for users

In [5]:
# Merge the two diet indicators into one column holding the larger of the two
# values per row. The row-wise maximum is computed with a vectorised
# DataFrame.max instead of a Python lambda applied to every row.
# NOTE(review): DataFrame.max skips NaN by default; this assumes the two
# columns contain no missing values — confirm against the dataset.
df_raw['fruit_veggi'] = df_raw[['Fruits', 'Veggies']].max(axis=1)
df_raw = df_raw.drop(columns=['Fruits', 'Veggies'])

In [6]:
def _physhlth_bucket(days):
    """Map a PhysHlth value to a cluster code from 1 to 6."""
    if days == 0:
        return 1
    # Exclusive upper bounds for codes 2..5; anything not below 21 gets code 6.
    for upper_bound, code in ((6, 2), (11, 3), (16, 4), (21, 5)):
        if days < upper_bound:
            return code
    return 6


# Replace the raw PhysHlth column with its cluster code.
df_raw['PhysHlth_clust'] = df_raw['PhysHlth'].apply(_physhlth_bucket)

df_raw = df_raw.drop(columns=['PhysHlth'])

# 3 Data preparation

## 3.10 train/test split

In [7]:
# Work on an independent copy so the data-preparation steps below do not touch df_raw.
df2 = df_raw.copy(deep=True)

In [8]:
# Single stratified 80/20 shuffle split on the target column, with a fixed seed.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so the generator yields exactly one (train, test) index pair.
str_train_set_index, str_test_set_index = next(sss.split(df2, df2['Diabetes_binary']))
str_train_set = df2.iloc[str_train_set_index]
str_test_set = df2.iloc[str_test_set_index]

In [9]:
# Display the training rows whose target value is 0.0.
str_train_set.loc[str_train_set['Diabetes_binary'] == 0.0]

Unnamed: 0,Diabetes_binary,HighBP,HighChol,BMI,Smoker,Stroke,PhysActivity,HvyAlcoholConsump,GenHlth,DiffWalk,Sex,Age,fruit_veggi,PhysHlth_clust
153147,0.0,0.0,0.0,28.0,1.0,0.0,1.0,0.0,2.0,0.0,1.0,2.0,1.0,1
176137,0.0,1.0,0.0,23.0,1.0,0.0,1.0,0.0,2.0,0.0,1.0,13.0,1.0,1
175578,0.0,1.0,1.0,29.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,9.0,1.0,1
177887,0.0,1.0,1.0,39.0,0.0,0.0,0.0,0.0,4.0,0.0,1.0,7.0,0.0,1
182143,0.0,0.0,1.0,16.0,1.0,0.0,1.0,0.0,5.0,1.0,0.0,7.0,1.0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
217091,0.0,0.0,1.0,31.0,0.0,0.0,1.0,0.0,3.0,0.0,1.0,4.0,1.0,1
134394,0.0,0.0,1.0,29.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,9.0,1.0,2
209330,0.0,0.0,0.0,26.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,3.0,1.0,6
179576,0.0,0.0,1.0,26.0,0.0,0.0,1.0,0.0,3.0,0.0,0.0,13.0,1.0,6


In [10]:
# Separate the target column from the feature columns for each split.
y_train = str_train_set['Diabetes_binary']
X_train = str_train_set.drop(columns='Diabetes_binary')

y_test = str_test_set['Diabetes_binary']
X_test = str_test_set.drop(columns='Diabetes_binary')

In [11]:
# Display the test-split features (target column removed).
str_test_set.drop(columns='Diabetes_binary')

Unnamed: 0,HighBP,HighChol,BMI,Smoker,Stroke,PhysActivity,HvyAlcoholConsump,GenHlth,DiffWalk,Sex,Age,fruit_veggi,PhysHlth_clust
128677,1.0,1.0,28.0,0.0,0.0,1.0,0.0,3.0,0.0,1.0,5.0,1.0,1
225051,1.0,0.0,36.0,1.0,0.0,0.0,0.0,3.0,1.0,0.0,10.0,1.0,1
27174,0.0,0.0,27.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,5.0,1.0,1
132371,0.0,0.0,22.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,11.0,1.0,1
164896,1.0,1.0,26.0,1.0,0.0,1.0,0.0,4.0,0.0,0.0,13.0,1.0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
48574,0.0,0.0,25.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,6.0,1.0,1
230352,1.0,1.0,31.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,9.0,1.0,2
134904,0.0,0.0,25.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1
1376,1.0,0.0,22.0,0.0,0.0,0.0,0.0,5.0,1.0,0.0,12.0,1.0,6


## 3.20 SMOTE

In [12]:
# Oversample the training split with SMOTE. Only the training data is
# resampled; the test split is left untouched.
# random_state is fixed so the synthetic samples, and every metric computed
# from them, are identical on each Restart & Run All.
oversample = SMOTE(random_state=42)
X_train, y_train = oversample.fit_resample(X_train, y_train)

## 3.30 Standardization

Use pickle to save the fitted standardization parameters (the scaler) so they can be reused for future predictions

In [13]:
# Fit the scaler on the training features only, then apply the same fitted
# parameters to both splits.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
# To persist the fitted scaler for later predictions, uncomment:
# with open('parameter/sc.pkl', 'wb') as fh: pickle.dump(sc, fh)
X_test = sc.transform(X_test)

# 4 Fine tuning

In [14]:
# Candidate values for each LogisticRegression hyper-parameter explored by the
# random search. Key order is kept as-is because values are drawn in this order.
param = dict(
    max_iter=[100, 1500, 1700, 2500, 3000, 3500, 10000],
    penalty=['l2', 'l1'],
    C=[0.05, 0.25, 0.5, 1, 2, 4, 20],
    multi_class=['auto', 'ovr'],
)

# Number of random configurations evaluated by the search loop.
MAX_EVAL = 25

In [15]:
# Random search: draw MAX_EVAL configurations from `param`, fit one liblinear
# LogisticRegression per draw and record its accuracy.
# NOTE(review): every configuration is scored on X_test, so the held-out split
# is also used for model selection; consider cross-validating on the training
# data instead.
random.seed(42)  # make the sampled configurations reproducible

# Rows are collected in a list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the frame on every call.
trial_records = []

for i in range( MAX_EVAL ):
    start = time()
    # choose one value per hyper-parameter at random
    hp = { k: random.choice( v ) for k, v in param.items() }

    # model
    fine_model = LogisticRegression(solver = 'liblinear',
                                  max_iter = hp['max_iter'], 
                                  penalty = hp['penalty'], 
                                  C = hp['C'], 
                                  multi_class = hp['multi_class']).fit(X_train, y_train)

    # performance
    y_lr_pred = fine_model.predict(X_test)
    score_lr= accuracy_score(y_test,y_lr_pred)*100
    stop = time()  # stop - start is the wall-clock time of this trial (not stored)

    # The key is named 'precision' because later cells sort on that name, but
    # the stored value is the test accuracy in percent.
    hp['precision'] = score_lr
    trial_records.append(hp)

final_result = pd.DataFrame(trial_records)

In [16]:
# Display the trials ordered from best to worst score. The sorted frame is only
# shown, not stored; the 'precision' column holds test accuracy in percent.
final_result.sort_values(by='precision', ascending=False)

Unnamed: 0,C,max_iter,multi_class,penalty,precision
0,1.0,2500.0,ovr,l1,72.829943
10,20.0,3500.0,auto,l2,72.829943
23,4.0,2500.0,ovr,l1,72.829943
22,4.0,3000.0,auto,l1,72.829943
21,20.0,2500.0,auto,l2,72.829943
16,2.0,2500.0,ovr,l1,72.829943
15,2.0,1500.0,auto,l2,72.829943
14,20.0,3500.0,ovr,l1,72.829943
1,0.5,1700.0,auto,l2,72.829943
12,20.0,10000.0,ovr,l1,72.829943


In [17]:
# Select the best-scoring trial. sort_values returns a new frame, so the
# ordering has to be applied here: head(1) on final_result itself would simply
# return the first random trial, whatever its score.
best_trial = (
    final_result
    .sort_values(by='precision', ascending=False, kind='mergesort')  # stable: ties keep trial order
    .head(1)
    .reset_index(drop=True)  # row label becomes 0, so param_tuned[...][0] still works
)

# Each value is a one-element Series, as before.
param_tuned = {
    name: best_trial[name]
    for name in ('max_iter', 'penalty', 'C', 'multi_class')
}

In [18]:
# Refit a liblinear LogisticRegression with the selected hyper-parameters.
# Values are read positionally (.iloc[0]) so the lookup does not depend on the
# row label of the selected trial. They are cast to plain Python types because
# max_iter may be stored as a float (e.g. 2500.0), which scikit-learn's
# parameter validation rejects.
final_model = LogisticRegression(solver = 'liblinear',
                              max_iter = int(param_tuned['max_iter'].iloc[0]),
                              penalty = str(param_tuned['penalty'].iloc[0]),
                              C = float(param_tuned['C'].iloc[0]),
                              multi_class = str(param_tuned['multi_class'].iloc[0])).fit(X_train, y_train)

## 4.1 Fine tuned performance

In [19]:
# Score the tuned model on the held-out test split and on the training data
# it was fitted on (resampled and standardized).
y_lr_pred = final_model.predict(X_test)
score_lr = accuracy_score(y_test, y_lr_pred)*100

print("MODEL PERFORMANCE:")
print("train accuracy score: ",accuracy_score(y_train,final_model.predict(X_train))*100)
print("test accuracy score: ",score_lr)

print(f"\n Confusion Matrix : \n {confusion_matrix(y_test,y_lr_pred)} \n")
# Label spelling fixed (was "Classiication").
print(f"Classification Report : \n {classification_report(y_test, y_lr_pred)}")

MODEL PERFORMANCE:
train accuracy score:  74.99270039560993
test accuracy score:  72.82797224850205

 Confusion Matrix : 
 [[31596 12071]
 [ 1715  5354]] 

Classiication Report : 
               precision    recall  f1-score   support

         0.0       0.95      0.72      0.82     43667
         1.0       0.31      0.76      0.44      7069

    accuracy                           0.73     50736
   macro avg       0.63      0.74      0.63     50736
weighted avg       0.86      0.73      0.77     50736



As we can see above, the model performance was not significantly impaired by our simplifying feature engineering

## 4.2 Saving final model

In [None]:
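# To persist the trained model, uncomment the line below (the context manager closes the file handle):
# with open('model/Diabetes_predictor.pkl', 'wb') as fh: pickle.dump(final_model, fh)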