In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [9]:
df = pd.read_csv('../Data/train_preprocessed.csv')

In [10]:
df

Unnamed: 0,Age,Glucose,Blood Pressure,BMI,Oxygen Saturation,Cholesterol,Triglycerides,HbA1c,Smoking,Alcohol,Physical Activity,Diet Score,Family History,Stress Level,Sleep Hours,target
0,0.020409,-0.949270,0.921535,1.119114,0.532989,-0.087165,-0.409764,-1.131017,1.620501,1.779035,-0.825064,-0.148625,-0.879636,0.375813,-1.092823,6
1,1.581744,-1.228016,-0.863656,-0.232136,-0.737363,1.477347,1.618654,0.575192,1.620501,1.779035,-0.521054,-0.027763,1.136834,1.151285,0.917099,5
2,-0.522665,0.207498,-1.158036,0.384132,-0.493990,-0.118711,0.129825,0.180868,-0.617093,-0.562103,-0.750307,-0.472753,-0.879636,-0.291454,0.210683,6
3,-0.590549,-0.161827,1.246579,-0.246103,0.883338,2.321952,-0.315228,-0.312037,-0.617093,1.779035,-0.356589,-0.203562,1.136834,-0.273420,-0.235032,5
4,-0.929970,-1.308992,-0.403687,0.860736,-0.777479,-0.344891,0.725275,-0.494032,1.620501,-0.562103,-0.595810,-1.077061,1.136834,-1.801822,0.429335,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20395,-0.481934,-0.480023,-0.275918,1.134826,-1.031550,0.824103,2.020123,0.014039,-0.617093,-0.562103,-1.153994,-0.274980,-0.879636,-0.075043,0.168634,6
20396,0.020409,-0.425519,-0.634693,0.144957,-0.400386,-0.273764,1.665308,0.044371,-0.617093,-0.562103,-0.944675,0.840243,-0.879636,0.177436,0.605939,0
20397,0.088293,0.847782,-0.488525,-0.640654,0.423337,1.170218,0.794232,-0.190706,1.620501,-0.562103,-0.635680,0.180998,1.136834,-2.667465,0.370467,0
20398,-1.201507,-0.488328,-1.176435,-0.179762,-0.285385,1.257714,-0.608247,-1.017270,-0.617093,-0.562103,0.326189,-0.159612,-0.879636,0.822160,0.538661,2


In [11]:
X = df.drop(columns=['target'])
y = df['target']

In [15]:
y.value_counts()

target
5    5696
3    5133
6    3086
4    2431
1    1630
0    1437
2     987
Name: count, dtype: int64

### Perform Stratified K-Fold (Unbalanced Dataset) ####
---

In [26]:
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

In [34]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [35]:
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "DecisionTree": DecisionTreeClassifier(),
    "RandomForest": RandomForestClassifier(),
    "GradientBoosting": GradientBoostingClassifier(),
    "SVM": SVC(),
}


In [36]:
scoring = {
    'accuracy': 'accuracy',
    'precision_macro': make_scorer(precision_score, average='macro'),
    'recall_macro': make_scorer(recall_score, average='macro'),
    'f1_macro': make_scorer(f1_score, average='macro')
}


In [37]:
results = {}
for name, model in models.items():
    cv_results = cross_validate(model, X, y, cv=skf, scoring=scoring, return_train_score=True)

    results[name] = {}

    for metric in scoring.keys():
        train_scores = cv_results['train_'+metric]            
        train_mean_score = train_scores.mean() 
        results[name]['train_'+metric] = train_mean_score
        val_scores = cv_results['test_'+metric]
        val_mean_score = val_scores.mean()            
        results[name]['val_'+metric] = val_mean_score       

In [39]:
results_df = pd.DataFrame (results)

In [42]:
results_df

Unnamed: 0,LogisticRegression,DecisionTree,RandomForest,GradientBoosting,SVM
train_accuracy,0.855282,1.0,1.0,0.905172,0.890417
val_accuracy,0.853382,0.761471,0.860441,0.866765,0.865637
train_precision_macro,0.810809,1.0,1.0,0.890149,0.867172
val_precision_macro,0.808086,0.682552,0.838049,0.828674,0.827712
train_recall_macro,0.78212,1.0,1.0,0.851406,0.827886
val_recall_macro,0.779591,0.683967,0.771138,0.795948,0.793537
train_f1_macro,0.792965,1.0,1.0,0.86696,0.842605
val_f1_macro,0.790243,0.682992,0.788647,0.808627,0.806156


In [41]:
results_df.T

Unnamed: 0,train_accuracy,val_accuracy,train_precision_macro,val_precision_macro,train_recall_macro,val_recall_macro,train_f1_macro,val_f1_macro
LogisticRegression,0.855282,0.853382,0.810809,0.808086,0.78212,0.779591,0.792965,0.790243
DecisionTree,1.0,0.761471,1.0,0.682552,1.0,0.683967,1.0,0.682992
RandomForest,1.0,0.860441,1.0,0.838049,1.0,0.771138,1.0,0.788647
GradientBoosting,0.905172,0.866765,0.890149,0.828674,0.851406,0.795948,0.86696,0.808627
SVM,0.890417,0.865637,0.867172,0.827712,0.827886,0.793537,0.842605,0.806156


DecisionTree and RandomForest show clear signs of overfitting. Val_scores and considerably lower than Train_scores.  
LR, GB, SVM all generalize quite well. LR is a good baseline model but GB and SVM have the best balance of performance + generalization.  
These 2 will be fine_tuned.

### Fine Tuning ###
---

In [45]:
from sklearn.model_selection import RandomizedSearchCV

In [49]:
skf = StratifiedKFold (n_splits = 3, shuffle=True, random_state = 42)

In [50]:
svm = SVC()
gb = GradientBoostingClassifier()

In [51]:
param_svm = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto']
}

param_gb = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7]
}

In [52]:
search_svm = RandomizedSearchCV (svm, param_svm , cv=skf, n_iter=15, scoring = 'recall_macro', n_jobs=-1)

In [53]:
search_svm.fit(X,y)

0,1,2
,estimator,SVC()
,param_distributions,"{'C': [0.1, 1, ...], 'gamma': ['scale', 'auto'], 'kernel': ['linear', 'rbf', ...]}"
,n_iter,15
,scoring,'recall_macro'
,n_jobs,-1
,refit,True
,cv,StratifiedKFo... shuffle=True)
,verbose,0
,pre_dispatch,'2*n_jobs'
,random_state,

0,1,2
,C,1
,kernel,'rbf'
,degree,3
,gamma,'auto'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [55]:
print(" SVM Best params:", search_svm.best_params_)
print(" SVM Best recall_macro", search_svm.best_score_)

 SVM Best params: {'kernel': 'rbf', 'gamma': 'auto', 'C': 1}
 SVM Best recall_macro 0.7934512926833838


In [56]:
search_gb = RandomizedSearchCV (gb, param_gb , cv=skf, n_iter=15, scoring = 'recall_macro', n_jobs=-1)

In [57]:
search_gb.fit(X,y)

0,1,2
,estimator,GradientBoostingClassifier()
,param_distributions,"{'learning_rate': [0.01, 0.05, ...], 'max_depth': [3, 5, ...], 'n_estimators': [100, 200, ...]}"
,n_iter,15
,scoring,'recall_macro'
,n_jobs,-1
,refit,True
,cv,StratifiedKFo... shuffle=True)
,verbose,0
,pre_dispatch,'2*n_jobs'
,random_state,

0,1,2
,loss,'log_loss'
,learning_rate,0.05
,n_estimators,300
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,3
,min_impurity_decrease,0.0


In [58]:
print(" GB Best params:", search_gb.best_params_)
print(" GB Best recall_macro:", search_gb.best_score_)

 GB Best params: {'n_estimators': 300, 'max_depth': 3, 'learning_rate': 0.05}
 GB Best recall_macro: 0.8004874934438119


In [59]:
final_model_gb = GradientBoostingClassifier (**search_gb.best_params_)
final_model_svm = SVC (**search_svm.best_params_)

In [60]:
final_model_svm.fit(X,y)

0,1,2
,C,1
,kernel,'rbf'
,degree,3
,gamma,'auto'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [61]:
final_model_gb.fit(X,y)

0,1,2
,loss,'log_loss'
,learning_rate,0.05
,n_estimators,300
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,3
,min_impurity_decrease,0.0


In [62]:
import joblib

In [63]:
joblib.dump(final_model_svm, '../Models/fina_model_svm.pkl')

['../Models/fina_model_svm.pkl']

In [64]:
joblib.dump(final_model_gb, '../Models/fina_model_gb.pkl')

['../Models/fina_model_gb.pkl']