# Case 1 - Santander - Tunning Hiper-Parametros do Modelo Original
## Marcio de Lima

<img style="float: left;" src="https://guardian.ng/wp-content/uploads/2016/08/Heart-diseases.jpg" width="350px"/>

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns #for plotting
from sklearn.ensemble import RandomForestClassifier #for the model
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz #plot tree
from sklearn.metrics import roc_curve, auc #for model evaluation
from sklearn.metrics import classification_report #for model evaluation
from sklearn.metrics import confusion_matrix #for model evaluation
from sklearn.model_selection import train_test_split #for data splitting
import eli5 #for purmutation importance
from eli5.sklearn import PermutationImportance
import shap #for SHAP values
from pdpbox import pdp, info_plots #for partial plots
np.random.seed(123) #ensure reproducibility

pd.options.mode.chained_assignment = None  #hide any pandas warnings

#Marcio de Lima
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV


<a id='section2'></a>

# The Data

In [4]:
dt = pd.read_csv("../dados/heart.csv")

In [5]:
dt.columns = ['age', 'sex', 'chest_pain_type', 'resting_blood_pressure', 'cholesterol', 'fasting_blood_sugar', 'rest_ecg', 'max_heart_rate_achieved',
       'exercise_induced_angina', 'st_depression', 'st_slope', 'num_major_vessels', 'thalassemia', 'target']

In [6]:
dt['sex'][dt['sex'] == 0] = 'female'
dt['sex'][dt['sex'] == 1] = 'male'

dt['chest_pain_type'][dt['chest_pain_type'] == 1] = 'typical angina'
dt['chest_pain_type'][dt['chest_pain_type'] == 2] = 'atypical angina'
dt['chest_pain_type'][dt['chest_pain_type'] == 3] = 'non-anginal pain'
dt['chest_pain_type'][dt['chest_pain_type'] == 4] = 'asymptomatic'

dt['fasting_blood_sugar'][dt['fasting_blood_sugar'] == 0] = 'lower than 120mg/ml'
dt['fasting_blood_sugar'][dt['fasting_blood_sugar'] == 1] = 'greater than 120mg/ml'

dt['rest_ecg'][dt['rest_ecg'] == 0] = 'normal'
dt['rest_ecg'][dt['rest_ecg'] == 1] = 'ST-T wave abnormality'
dt['rest_ecg'][dt['rest_ecg'] == 2] = 'left ventricular hypertrophy'

dt['exercise_induced_angina'][dt['exercise_induced_angina'] == 0] = 'no'
dt['exercise_induced_angina'][dt['exercise_induced_angina'] == 1] = 'yes'

dt['st_slope'][dt['st_slope'] == 1] = 'upsloping'
dt['st_slope'][dt['st_slope'] == 2] = 'flat'
dt['st_slope'][dt['st_slope'] == 3] = 'downsloping'

dt['thalassemia'][dt['thalassemia'] == 1] = 'normal'
dt['thalassemia'][dt['thalassemia'] == 2] = 'fixed defect'
dt['thalassemia'][dt['thalassemia'] == 3] = 'reversable defect'

In [7]:
dt['sex'] = dt['sex'].astype('object')
dt['chest_pain_type'] = dt['chest_pain_type'].astype('object')
dt['fasting_blood_sugar'] = dt['fasting_blood_sugar'].astype('object')
dt['rest_ecg'] = dt['rest_ecg'].astype('object')
dt['exercise_induced_angina'] = dt['exercise_induced_angina'].astype('object')
dt['st_slope'] = dt['st_slope'].astype('object')
dt['thalassemia'] = dt['thalassemia'].astype('object')

In [8]:
dt = pd.get_dummies(dt, drop_first=True)

# The Model

The next part fits a random forest model to the data,

In [9]:
X_train, X_test, y_train, y_test = train_test_split(dt.drop('target', 1), dt['target'], test_size = .2, random_state=10) #split the data

In [10]:
model = RandomForestClassifier(max_depth=5)
model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=5, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [11]:
y_predict = model.predict(X_test)
y_pred_quant = model.predict_proba(X_test)
y_pred_bin = model.predict(X_test)

In [12]:
confusion_matrix = confusion_matrix(y_test, y_pred_bin)
confusion_matrix

array([[28,  7],
       [ 3, 23]])

In [13]:
total=sum(sum(confusion_matrix))

sensitivity = confusion_matrix[0,0]/(confusion_matrix[0,0]+confusion_matrix[1,0])
print('Sensitivity : ', sensitivity )

specificity = confusion_matrix[1,1]/(confusion_matrix[1,1]+confusion_matrix[0,1])
print('Specificity : ', specificity)

Sensitivity :  0.9032258064516129
Specificity :  0.7666666666666667


In [14]:
print('Accuracy of RandomForest Regression Classifier on train set: {:.2f}'.format(model.score(X_train, y_train)*100))
print('Accuracy of RandomForest Regression Classifier on test set: {:.2f}'.format(model.score(X_test, y_test)*100))

Accuracy of RandomForest Regression Classifier on train set: 92.15
Accuracy of RandomForest Regression Classifier on test set: 83.61


In [15]:
print(classification_report(y_test, model.predict(X_test)))

              precision    recall  f1-score   support

           0       0.90      0.80      0.85        35
           1       0.77      0.88      0.82        26

    accuracy                           0.84        61
   macro avg       0.83      0.84      0.83        61
weighted avg       0.85      0.84      0.84        61



<a id='section4'></a>

# Tunning Model - Version 1

In [16]:
def rodarTunning(X_train, y_train, X_test, y_test, rf_classifier):
    
    param_grid = {'n_estimators': [50, 75, 100, 125, 150, 175],
                  'min_samples_split':[2,4,6,8,10],
                  'min_samples_leaf': [1, 2, 3, 4],
                  'max_depth': [5, 10, 15, 20, 25]}

    grid_obj = GridSearchCV(rf_classifier,
                            iid=True,
                            return_train_score=True,
                            param_grid=param_grid,
                            scoring='roc_auc',
                            cv=10)

    grid_fit = grid_obj.fit(X_train, y_train)
    rf_opt = grid_fit.best_estimator_

    print('='*20)
    print("best params: " + str(grid_obj.best_estimator_))
    print("best params: " + str(grid_obj.best_params_))
    print('best score:', grid_obj.best_score_)
    print('='*20)
    
    print(classification_report(y_test, rf_opt.predict(X_test)))

    print('New Accuracy of Model on train set: {:.2f}'.format(rf_opt.score(X_train, y_train)*100))
    print('New Accuracy of Model on test set: {:.2f}'.format(rf_opt.score(X_test, y_test)*100))

    return rf_opt

In [17]:
rf_classifier = RandomForestClassifier(class_weight = "balanced", random_state=7)
rf_opt = rodarTunning(X_train, y_train, X_test, y_test, rf_classifier)

best params: RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='gini', max_depth=10, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=3,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=125, n_jobs=None, oob_score=False,
                       random_state=7, verbose=0, warm_start=False)
best params: {'max_depth': 10, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 125}
best score: 0.9244156669776504
              precision    recall  f1-score   support

           0       0.86      0.86      0.86        35
           1       0.81      0.81      0.81        26

    accuracy                           0.84        61
   macro avg       0.83      0.83      0.83        61
weighted avg       0.84      0.84      0.84        61

New Accuracy of Model on train set: 95

# Tunning Model - Version 2

### Dados com escalas diferentes - Aplicando MinMaxScaler

In [18]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0, 5))
df_HR = dt
HR_col = list(df_HR.columns)
HR_col.remove('target')
for col in HR_col:
    df_HR[col] = df_HR[col].astype(float)
    df_HR[[col]] = scaler.fit_transform(df_HR[[col]])
df_HR['target'] = pd.to_numeric(df_HR['target'], downcast='float')

In [19]:
X_train, X_test, y_train, y_test = train_test_split(df_HR.drop('target', 1), df_HR['target'], test_size = .2, random_state=10) #split the data
rf_classifier = RandomForestClassifier(class_weight = "balanced", random_state=7)
rf_opt2 = rodarTunning(X_train, y_train, X_test, y_test, rf_classifier)

best params: RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='gini', max_depth=5, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=3,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=150, n_jobs=None, oob_score=False,
                       random_state=7, verbose=0, warm_start=False)
best params: {'max_depth': 5, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 150}
best score: 0.9263678470290041
              precision    recall  f1-score   support

         0.0       0.85      0.83      0.84        35
         1.0       0.78      0.81      0.79        26

    accuracy                           0.82        61
   macro avg       0.82      0.82      0.82        61
weighted avg       0.82      0.82      0.82        61

New Accuracy of Model on train set: 92.9

# Tunning Model - Version 3

## Avaliando outros modelos

In [20]:
from sklearn import svm, tree, linear_model, neighbors
from sklearn import naive_bayes, ensemble, discriminant_analysis, gaussian_process
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

def testarModelos(X_train, X_test, y_train, y_test):

    models = []
    models.append(('Logistic Regression', LogisticRegression(solver='liblinear', random_state=7,
                                                             class_weight='balanced')))
    models.append(('SVM', SVC(gamma='auto', random_state=7)))
    models.append(('KNN', KNeighborsClassifier()))
    models.append(('Decision Tree Classifier',
                   DecisionTreeClassifier(random_state=7)))
    models.append(('Gaussian NB', GaussianNB()))
    models.append(('Xgboost', XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic',
                    silent=True, nthread=1)))
    models.append(('RandomForestClassifier', RandomForestClassifier(max_depth=5)))

    acc_results = []
    auc_results = []
    names = []

    col = ['Algorithm', 'ROC AUC Mean', 'ROC AUC STD', 
           'Accuracy Mean', 'Accuracy STD']
    df_results = pd.DataFrame(columns=col)
    i = 0

    for name, model in models:
        kfold = model_selection.KFold(
            n_splits=10, random_state=7)  # 10-fold cross-validation

        cv_acc_results = model_selection.cross_val_score(  # accuracy scoring
            model, X_train, y_train, cv=kfold, scoring='accuracy')

        cv_auc_results = model_selection.cross_val_score(  # roc_auc scoring
            model, X_train, y_train, cv=kfold, scoring='roc_auc')

        acc_results.append(cv_acc_results)
        auc_results.append(cv_auc_results)
        names.append(name)
        df_results.loc[i] = [name,
                             round(cv_auc_results.mean()*100, 2),
                             round(cv_auc_results.std()*100, 2),
                             round(cv_acc_results.mean()*100, 2),
                             round(cv_acc_results.std()*100, 2)
                             ]
        i += 1
    return df_results.sort_values(by=['ROC AUC Mean'], ascending=False)


In [21]:
#Sem MinMaxScaler
X_train, X_test, y_train, y_test = train_test_split(dt.drop('target', 1), dt['target'], test_size = .2, random_state=10) 
df_results = testarModelos(X_train, X_test, y_train, y_test)
print(df_results)

#Com MinMaxScaler
X_train, X_test, y_train, y_test = train_test_split(df_HR.drop('target', 1), df_HR['target'], test_size = .2, random_state=10) 
df_results = testarModelos(X_train, X_test, y_train, y_test)
print(df_results)

                  Algorithm  ROC AUC Mean  ROC AUC STD  Accuracy Mean  \
0       Logistic Regression         91.73         4.36          82.28   
5                   Xgboost         91.52         6.94          82.33   
6    RandomForestClassifier         88.46         6.77          81.47   
4               Gaussian NB         87.91         7.34          79.78   
1                       SVM         86.97         6.30          79.40   
2                       KNN         86.65         8.03          80.63   
3  Decision Tree Classifier         77.31         8.60          76.45   

   Accuracy STD  
0          5.97  
5          8.16  
6          6.99  
4          6.71  
1          6.35  
2          4.98  
3          6.69  
                  Algorithm  ROC AUC Mean  ROC AUC STD  Accuracy Mean  \
0       Logistic Regression         91.73         4.36          82.28   
5                   Xgboost         91.52         6.94          82.33   
4               Gaussian NB         87.91         7.

In [22]:
X_train, X_test, y_train, y_test = train_test_split(dt.drop('target', 1), dt['target'], test_size = .2, random_state=10) 
rf_classifier = XGBClassifier(learning_rate=0.02, objective='binary:logistic')
rf_opt3 = rodarTunning(X_train, y_train, X_test, y_test, rf_classifier)

best params: XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.02, max_delta_step=0, max_depth=10,
              min_child_weight=1, min_samples_leaf=1, min_samples_split=2,
              missing=None, n_estimators=175, n_jobs=1, nthread=None,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
              subsample=1, verbosity=1)
best params: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 175}
best score: 0.9106463784149734
              precision    recall  f1-score   support

         0.0       0.83      0.86      0.85        35
         1.0       0.80      0.77      0.78        26

    accuracy                           0.82        61
   macro avg       0.82      0.81      0.81        61
weighted avg       0.82      0.82      0.82        61

New

## Tunning 1 demonstrou melhor acurária e maior acertos nas 2 target (0 e 1)
#### Tivemos um aumento de 3% no Treinamento e o mesmo resultado no Teste, mas pela métrica de matriz de confusão e relatório de classificação o acerto entre as classes foi equalizado, mais genérico. 
#### Não foi muita diferença na aplicação de escala no dataset, desta forma, foi ignorada. 
#### O modelo XGBClassifier aparece como promissor, mas para o case, vamos seguir com a decisão do Data Science (Autor) com o RandomForestClassifier

In [23]:
#Save Modelo Tunning Version 1 - Marcio de Lima
import pickle

filename = 'modelo/tunning_model.pkl'
pickle.dump(rf_opt, open(filename, 'wb'))