In [None]:
import time
import pickle
from tabulate import tabulate
from IPython.display import display

import pandas as pd
pd.set_option('display.colheader_justify', 'center')
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold, GridSearchCV

from sklearn.ensemble import HistGradientBoostingClassifier, AdaBoostClassifier,RandomForestClassifier

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

from sklearn.metrics import roc_auc_score, f1_score, roc_curve, RocCurveDisplay, brier_score_loss
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report

from sklearn.utils import class_weight
from sklearn.pipeline import Pipeline

import xgboost as xgb
from xgboost import XGBClassifier

import warnings
warnings.filterwarnings("ignore")

### Helper functions

In [None]:
# Seed passed as `random_state` to the splits, CV splitters and estimators below.
randomState = 42
# Path of the CSV that getData() reads by default (relative to the working directory).
url='Datasets/diabetes_dataset.csv'

def getData(url = url):
    """Read the CSV located at ``url`` and return it as a DataFrame.

    The default is whatever the notebook-level ``url`` held when this cell
    was executed.
    """
    return pd.read_csv(url)

def cleanData(url = url):
    """Return a de-duplicated copy of the dataset with normalised names.

    Parameters
    ----------
    url : pandas.DataFrame or str
        Normally the DataFrame to clean (the notebook calls
        ``cleanData(df)``). The parameter keeps its historical name for
        backward compatibility. Anything that is not a DataFrame is treated
        as a CSV location and loaded with ``pd.read_csv`` first.

    Returns
    -------
    pandas.DataFrame
        A new frame; the argument is left untouched so the cell can be
        re-run safely.

    Notes
    -----
    The previous version ignored its argument and mutated the notebook-level
    ``df`` in place. It also slept for two seconds; the sleeps had no effect
    on the result and were removed.
    """
    frame = url if isinstance(url, pd.DataFrame) else pd.read_csv(url)

    # Drop duplicates; .copy() detaches the result so the writes below never
    # touch (or warn about) the caller's frame.
    print('Dropping duplicates...\n')
    frame = frame.drop_duplicates().copy()

    # Rename the target column, then lower-case every column name.
    print('Converting to lower case columns and data...')
    frame.columns = frame.columns.str.replace('Diabetes_binary','diabetes').str.lower()

    # Normalise string cells. Per the original author's note this loop does
    # not run on the current dataset because it has no object-dtype columns.
    for col in frame.select_dtypes(object).columns:
        frame[col] = frame[col].str.lower().str.replace(' ', '_')

    return frame

def splitData(df):
    """Split ``df`` into train / validation / test parts (60 / 20 / 20).

    The ``diabetes`` column is the target; every other column is a feature.
    Both splits are stratified on the target so each part keeps the class
    proportions of the (heavily imbalanced) full dataset, matching the
    stratified CV splitters used for tuning.

    Returns
    -------
    tuple
        ``(dfTrainFull, yTrainFull, dfTrain, yTrain, dfVal, yVal, dfTest, yTest)``
        where the "full" training set is train + validation.
    """
    target = df.diabetes
    data = df.drop(columns=['diabetes'])

    # 20% held out for the final test ...
    dfTrainFull, dfTest, yTrainFull, yTest = train_test_split(
        data, target, test_size=0.2, random_state=randomState, stratify=target)
    # ... then 25% of the remaining 80% (= 20% overall) for validation.
    dfTrain, dfVal, yTrain, yVal = train_test_split(
        dfTrainFull, yTrainFull, test_size=0.25, random_state=randomState, stratify=yTrainFull)

    print(  f'Dataset has been split in: Training set with {len(yTrain)} samples, '
            f'Validation set with {len(yVal)} samples and Test set with {len(yTest)} samples')

    return dfTrainFull, yTrainFull, dfTrain, yTrain, dfVal, yVal, dfTest, yTest

def printHelper(f1Score, auc):
    """Print the test-set weighted F1 score and AUC between two dashed rules."""
    report = (
        '\n---------------------------------',
        f'Test set weighted f1-score: {f1Score}',
        f'Test set auc: {auc}',
        '---------------------------------\n',
    )
    for line in report:
        print(line)

def printResults(results):
    """Print one ``name:  v1 ± v2`` line per entry of ``results``.

    ``results`` maps a measure name to an iterable of values; the values are
    converted with ``str`` and joined with `` ± ``.
    """
    print('\n-----------------------------------------')
    for label, values in results.items():
        joined = " ± ".join(map(str, values))
        print(f'{label:<20}:  {joined:<6}')
    print('-----------------------------------------')

    
def getMeasures(model):
    """Score a fitted pipeline on the held-out test set and plot the results.

    Reads the notebook-level ``dfTest`` / ``yTest``. Prints the weighted F1,
    the AUC and a classification report, then draws the ROC curve and the raw
    and row-normalised confusion matrices.

    Parameters
    ----------
    model : fitted Pipeline
        Must expose ``predict``, ``predict_proba`` and a step named
        ``classifier`` (its class name labels the ROC curve).

    Returns
    -------
    tuple of list
        ``([auc], [f1Score])``, each rounded to 3 decimals and wrapped in a
        one-element list so the values can be stored next to the
        ``[mean, std]`` pairs that printResults joins.
    """
    yTestpredProb = model.predict_proba(dfTest)[:,1]
    yTestpred = model.predict(dfTest)
    auc = round(roc_auc_score(yTest, yTestpredProb),3)
    f1Score = round(f1_score(yTest, yTestpred, average='weighted'),3)
    modelName = type(model.named_steps.classifier).__name__

    printHelper(f1Score, auc)

    print(classification_report(yTest, yTestpred))

    # ROC curve with the two headline scores written on the plot.
    _, ax1 = plt.subplots(figsize=(14, 6))
    fpr, tpr, _ = roc_curve(yTest.values, yTestpredProb)
    RocCurveDisplay(fpr=fpr, tpr=tpr).plot(ax=ax1, name=f'ROC_AUC {modelName}')
    ax1.text(0.4,0.5,f'auc = {auc}', size=14, fontweight='semibold')
    ax1.text(0.4,0.4,f'Weighted f1  = {f1Score}', size=14, fontweight='semibold')
    ax1.legend(loc=4, prop={'size': 20})

    print()

    # Confusion matrices: raw counts (left) and normalised per true class (right).
    _, ax2 = plt.subplots(1,2, figsize=(16, 6))
    ax2[0].grid(False)
    ax2[1].grid(False)

    classLabels = ['No-Diabetes', 'Diabetes']
    cm = confusion_matrix(yTest, yTestpred)
    cmprob = confusion_matrix(yTest, yTestpred, normalize='true')

    ConfusionMatrixDisplay(cm, display_labels=classLabels).plot(ax=ax2[0])
    ax2[0].set_title("Confusion Matrix", size=16)
    ConfusionMatrixDisplay(cmprob, display_labels=classLabels).plot(ax=ax2[1])
    ax2[1].set_title("Normalized Confusion Matrix", size=16)

    return [auc], [f1Score]

def getResults(model, params):
    """Display the best-ranked candidates of a fitted multi-metric grid search.

    Parameters
    ----------
    model : fitted GridSearchCV
        Its ``cv_results_`` must contain the ``auc`` / ``f1_weighted`` train
        score and test rank columns listed below.
    params : list of str
        Names of the ``param_*`` columns to show next to the scores.

    Searches that tuned ``reg_lambda`` show the top 20 rows; all others show
    the rows ranked below 30 on both metrics. Rows are ordered by AUC rank,
    then F1 rank.
    """
    shownCols = [
        "mean_train_auc",
        "std_train_auc",
        "rank_test_auc",
        "mean_train_f1_weighted",
        "std_train_f1_weighted",
        "rank_test_f1_weighted",
    ] + params

    table = pd.DataFrame(model.cv_results_)[shownCols]
    rankOrder = ["rank_test_auc", "rank_test_f1_weighted"]

    if 'param_classifier__reg_lambda' in params:
        display(table.sort_values(by=rankOrder).head(20))
    else:
        topRows = table.query('rank_test_auc < 30 & rank_test_f1_weighted < 30')
        display(topRows.sort_values(by=rankOrder))

def getBestModelResults(model):
    """Cross-validate ``model`` on the full training set and summarise the scores.

    Uses the notebook-level ``dfTrainFull`` / ``yTrainFull`` and the
    ``outerCV`` splitter, scoring weighted F1 and ROC AUC on both the
    validation and the training folds.

    Returns
    -------
    dict
        ``{measure: [mean, std]}`` (rounded to 4 decimals) for
        ``val_f1_weighted``, ``val_roc_auc``, ``train_f1_weighted`` and
        ``train_roc_auc``, in that order.

    Notes
    -----
    Two defects of the previous version are fixed here:

    * columns were selected in one order and relabelled in another, so the
      train F1 was reported as ``val_roc_auc`` and the validation AUC as
      ``train_f1_weighted``;
    * the standard deviation was computed after the mean row had been
      appended, so the mean was counted as an extra fold.
    """
    cv_results = cross_validate(model,
                                dfTrainFull,
                                yTrainFull,
                                cv=outerCV,
                                scoring=['f1_weighted','roc_auc'],
                                n_jobs=-1,
                                return_train_score=True,
    )

    # Explicit source -> label mapping so a label can never drift from its column.
    labels = {
        'test_f1_weighted': 'val_f1_weighted',
        'test_roc_auc': 'val_roc_auc',
        'train_f1_weighted': 'train_f1_weighted',
        'train_roc_auc': 'train_roc_auc',
    }
    foldScores = pd.DataFrame(cv_results)[list(labels)].rename(columns=labels)

    print("Scores after hyperparameters tuning:\n")

    # Both statistics are computed from the per-fold rows only.
    means = foldScores.mean().round(4)
    stds = foldScores.std().round(4)

    results = {}
    for col in foldScores.columns:
        print('{:<20}:  {:<6} +/- {:<6}'.format(col, means[col], stds[col]))
        results[col] = [means[col], stds[col]]

    return results

def reload(module):
    """Re-import ``module`` in place so edits to its source take effect."""
    # Imported locally: the notebook never imports importlib at the top, so
    # the previous version raised NameError on first use.
    import importlib

    importlib.reload(module)

### Load binary unbalanced data

In [None]:
# Load the dataset from the default `url` and preview its first rows.
df = getData()
df.head()

## EDA

#### Preparing and cleaning data

In [None]:
# Rebind `df` to the frame returned by cleanData, then display it.
df = cleanData(df)
df

#### Checking Correlations

In [None]:
# Heatmap of the absolute pairwise correlations between all columns.
sns.set_theme()
corr_matrix = df.corr().abs()

_, heatmapAx = plt.subplots(figsize=(17, 17))
_ = sns.heatmap(corr_matrix, cmap="Blues", annot=True, ax=heatmapAx)

In [None]:
# Bar chart of each feature's absolute correlation with the target.
# `corr_matrix` is rebound here to a Series indexed by feature name.
sns.set_theme()
dfMatrix = df.drop(columns='diabetes')
corr_matrix = dfMatrix.corrwith(df.diabetes).abs()

_, barAx = plt.subplots(figsize=(14, 5))
_ = corr_matrix.plot(kind='bar', grid=True, ax=barAx)
_ = plt.show()

In [None]:
# Feature-to-target correlations, strongest first.
corr_matrix.sort_values(ascending=False)

In [None]:
# Remove four features before modelling (presumably the weakest correlations
# listed in the cell above — confirm against its output).
# Rebinding with errors='ignore' keeps the cell re-runnable: the previous
# in-place drop raised KeyError when the cell was executed a second time.
df = df.drop(columns=['nodocbccost', 'fruits', 'anyhealthcare', 'veggies'], errors='ignore')

#### The target is heavily imbalanced: No-Diabetes = 194377, Diabetes = 35097.

In [None]:
# One histogram per remaining column; assigning to `_` hides the axes repr.
_= df.hist(figsize=(16,16))

In [None]:
# These names are read as globals later on: getBestModelResults uses
# dfTrainFull / yTrainFull and getMeasures uses dfTest / yTest.
dfTrainFull, yTrainFull, dfTrain, yTrain, dfVal, yVal, dfTest, yTest = splitData(df)

#### Preprocessing

This part will be used for the Logistic Regression classifier only

In [None]:
# Columns handed to the one-hot encoder.
# NOTE(review): 'age' appears in neither list, although later cells build an
# 'age' column for prediction — confirm whether leaving it out is intentional.
categoricalCols = ['highbp', 'highchol', 'cholcheck','smoker','stroke',
                   'heartdiseaseorattack', 'physactivity', 'hvyalcoholconsump',
                   'genhlth','diffwalk', 'sex', 'education']

# Columns treated as numeric.
numericalCols = ['bmi', 'menthlth', 'physhlth', 'income']

In [None]:
# Creating preprocessors
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

catPreprocessor = OneHotEncoder(handle_unknown="ignore")
numPreprocessor = StandardScaler()

# Transforming the data: one-hot encode the categorical columns and
# standardise the numerical ones. The scaler was created before but never
# added to the transformer, so the numerical columns reached the model
# unscaled. Columns in neither list are passed through unchanged.
preprocessor = ColumnTransformer([
    ('one-hot-encoder', catPreprocessor, categoricalCols),
    ('standard-scaler', numPreprocessor, numericalCols)],
    remainder="passthrough")

## ML Models

#### Setting up some variables

In [None]:
# Inner cross-validation (for hyperparameter tuning): 5 stratified, shuffled folds.
innerCV = StratifiedKFold(n_splits=5, shuffle=True, random_state=randomState)
# Outer cross-validation (for testing the tuned model): 3 stratified, shuffled folds.
outerCV = StratifiedKFold(n_splits=3, shuffle=True, random_state=randomState)

# Metrics computed by every grid search. The keys become the `_auc` and
# `_f1_weighted` suffixes of the `cv_results_` columns that getResults reads.
scoring = {"auc": "roc_auc", "f1_weighted": "f1_weighted"}

### DecisionTreeClassifier

#### Will use nested CrossValidation

Inner cross-validation (for hyperparameter tuning)

In [None]:
%%time

# Model
model_DT = Pipeline([("classifier", DecisionTreeClassifier(class_weight='balanced', random_state = randomState))])

# Gridsearch params
# max_leaf_nodes starts at 2: scikit-learn rejects values below 2, so the
# previous candidate of 1 made every one of its fits fail (scored as NaN,
# with the warning hidden by the global warnings filter).
param_grid = {
    'classifier__max_depth': (1,3,5,7,10),
    'classifier__max_leaf_nodes': (2, 5,10,15,20),
    'classifier__max_features': (1,3,5,7,10,15)
}

# Gridsearch (refit=False: the chosen model is built by hand in a later cell)
model_grid_search_DT = GridSearchCV(model_DT,
                                    param_grid=param_grid,
                                    scoring=scoring,
                                    n_jobs=-1,
                                    cv=innerCV,
                                    return_train_score=True,
                                    refit=False)

_ = model_grid_search_DT.fit(dfTrainFull, yTrainFull)

In [None]:
# Parameter columns of cv_results_ to show next to the score and rank columns.
paramsDT = ["param_classifier__max_depth",
            'param_classifier__max_leaf_nodes',
            "param_classifier__max_features"]

getResults(model_grid_search_DT, paramsDT)

#### Selecting best parameters
We will choose max_depth=7, max_features=5 and max_leaf_nodes=20. Reaching a compromise between F1 Score and AUC.

In [None]:
%%time

bestParamsDT = ['max_depth=7', 'max_leaf_nodes=20', 'max_features=5']

# Final tree built with the selected hyperparameters. max_features is 5 to
# match bestParamsDT above and the written selection; the previous value of
# 15 contradicted both.
modelDT = Pipeline([("classifier", DecisionTreeClassifier(class_weight='balanced',
                                                        random_state = randomState,
                                                        max_depth=7,
                                                        max_leaf_nodes=20,
                                                        max_features=5))])

_ = modelDT.fit(dfTrainFull, yTrainFull)

Outer cross-validation (for testing the tuned model)

In [None]:
# Outer-CV summary for the tuned tree: a dict of [mean, std] pairs.
results = getBestModelResults(modelDT)

#### Curves and error measures

In [None]:
# Add the held-out test scores (each a one-element list) to the CV summary.
results['test_roc'], results['test_f1-score'] = getMeasures(modelDT)

### DecisionTreeClassifier results:

In [None]:
# Print every collected measure for the decision tree.
printResults(results)

### Logistic Regression

In [None]:
%%time

# Logistic regression behind the shared preprocessor; only C is tuned.
model_LR = Pipeline(steps=[
    ("processor", preprocessor),
    ("classifier", LogisticRegression(class_weight='balanced',
                                      max_iter=1000,
                                      random_state=randomState)),
])

param_grid = dict(classifier__C=(1e-3, 1e-2, 0.1, 1, 5, 10, 20))
scoring = dict(auc="roc_auc", f1_weighted="f1_weighted")

# refit=False: the chosen model is built by hand in a later cell.
model_grid_search_LR = GridSearchCV(
    estimator=model_LR,
    param_grid=param_grid,
    scoring=scoring,
    cv=innerCV,
    n_jobs=-1,
    return_train_score=True,
    refit=False,
)
_ = model_grid_search_LR.fit(dfTrainFull, yTrainFull)

In [None]:
# Parameter column of cv_results_ to show next to the score and rank columns.
paramsLR = ["param_classifier__C"]

getResults(model_grid_search_LR, paramsLR)

#### Selecting best parameters
Choosing C=0.1; in this case it has the best AUC and the 2nd best F1 score.

In [None]:
%%time

# Final logistic regression. The "processor" step is included so this model
# sees the same preprocessed features as the pipeline that was grid-searched;
# the previous version dropped it and fitted on the raw columns.
# NOTE(review): the markdown above this cell says C=0.1 while the code uses
# C=0.01 — confirm which value was actually selected.
modelLR = Pipeline([("processor", preprocessor),
                    ("classifier", LogisticRegression(max_iter=1000,
                                                      C=0.01,
                                                      class_weight='balanced',
                                                      random_state=randomState))])

_ = modelLR.fit(dfTrainFull, yTrainFull)

Outer cross-validation (for testing the tuned model)

In [None]:
# Outer-CV summary for the logistic regression: a dict of [mean, std] pairs.
results = getBestModelResults(modelLR)

#### Curves and error measures

In [None]:
# Add the held-out test scores (each a one-element list) to the CV summary.
results['test_roc'], results['test_f1-score'] = getMeasures(modelLR)

### Logistic Regression results:

In [None]:
# Print every collected measure for the logistic regression.
printResults(results)

### Random Forest

In [None]:
%%time

# Small (10-tree) class-weighted forest; depth, leaf count and feature count are tuned.
model_RF = Pipeline(steps=[
    ("classifier", RandomForestClassifier(class_weight='balanced',
                                          n_estimators=10,
                                          random_state=randomState)),
])

param_grid = dict(
    classifier__max_depth=(5, 10, 15, 20, 25),
    classifier__max_leaf_nodes=(5, 10, 15, 20, 30),
    classifier__max_features=(3, 5, 7, 10),
)

scoring = dict(auc="roc_auc", f1_weighted="f1_weighted")

# refit=False: the chosen model is built by hand in a later cell.
model_grid_search_RF = GridSearchCV(
    estimator=model_RF,
    param_grid=param_grid,
    scoring=scoring,
    cv=innerCV,
    n_jobs=-1,
    return_train_score=True,
    refit=False,
)
_ = model_grid_search_RF.fit(dfTrainFull, yTrainFull)

In [None]:
# Parameter columns of cv_results_ to show next to the score and rank columns.
paramsRF = ["param_classifier__max_depth",
            'param_classifier__max_leaf_nodes',
            "param_classifier__max_features"]

getResults(model_grid_search_RF, paramsRF)

#### Selecting best parameters
Choosing max_depth = 10, max_leaf_nodes=30 and max_features=10. It's a good AUC and a good F1 Score

In [None]:
%%time

# Final forest with the selected max_depth / max_leaf_nodes / max_features.
modelRF = Pipeline(steps=[
    ("classifier", RandomForestClassifier(
        class_weight='balanced',
        n_estimators=10,
        max_depth=10,
        max_features=10,
        max_leaf_nodes=30,
        random_state=randomState,
    )),
])

_ = modelRF.fit(dfTrainFull, yTrainFull)

Outer cross-validation(for testing the tuned model)

In [None]:
# Outer-CV summary for the random forest: a dict of [mean, std] pairs.
results = getBestModelResults(modelRF)

#### Curves and error measures

In [None]:
# Add the held-out test scores (each a one-element list) to the CV summary.
results['test_roc'], results['test_f1-score'] = getMeasures(modelRF)

### Random Forest results:

In [None]:
# Print every collected measure for the random forest.
printResults(results)

### XGBoost Classifier

In [None]:
# Ratio of negative (0) to positive (1) training labels, rounded to two
# decimals; passed to XGBoost as scale_pos_weight in the cells below.
imbalanceRatio = round(yTrainFull.eq(0).sum() / yTrainFull.eq(1).sum(), 2)
imbalanceRatio

In [None]:
%%time
# 58 min 15 s
# Inner cross-validation (for hyperparameter tuning), reduced to 3 folds.
# NOTE(review): this rebinds the notebook-level `innerCV`, so every later
# cell that reads it (the AdaBoost search does) also runs 3-fold — confirm
# that is intended.
innerCV = StratifiedKFold(n_splits=3, shuffle=True, random_state=randomState)

model_XGB = Pipeline(steps=[
    ("classifier", XGBClassifier(n_estimators=10,
                                 tree_method='gpu_hist',
                                 scale_pos_weight=imbalanceRatio,
                                 random_state=randomState)),
])

param_grid = dict(
    classifier__max_depth=(2, 5, 8, 10),
    classifier__learning_rate=(0.01, 0.1, 0.5, 0.8),
    classifier__min_child_weight=(1, 10, 20),
    classifier__reg_lambda=(1, 3, 5, 8),
)

scoring = dict(auc="roc_auc", f1_weighted="f1_weighted")

# refit=False: the chosen model is built by hand in a later cell.
model_grid_search_XGB = GridSearchCV(
    estimator=model_XGB,
    param_grid=param_grid,
    scoring=scoring,
    cv=innerCV,
    n_jobs=-1,
    return_train_score=True,
    verbose=4,
    refit=False,
)

# NOTE(review): unlike the other searches this one fits on dfTrain / yTrain
# rather than the full training set — presumably to save time; confirm.
_ = model_grid_search_XGB.fit(dfTrain, yTrain)

In [None]:
# Parameter columns of cv_results_ to show next to the score and rank columns.
# Including reg_lambda makes getResults show the top 20 rows instead of
# filtering by rank.
paramsXGB = ["param_classifier__max_depth",
             'param_classifier__learning_rate',
             "param_classifier__min_child_weight",
             "param_classifier__reg_lambda"]

getResults(model_grid_search_XGB, paramsXGB)

#### Selecting best parameters
Choosing max_depth = 8, learning_rate = 0.5, min_child_weight = 20 and reg_lambda = 8. It's a good compromise between AUC and F1 score.

In [None]:
%%time

# Final XGBoost model.
# NOTE(review): the markdown above this cell lists max_depth = 8,
# learning_rate = 0.5 and min_child_weight = 20, which differ from the values
# used here (5, 0.8 and 10) — confirm which set was actually selected.
modelXGB = Pipeline(steps=[
    ("classifier", XGBClassifier(
        n_estimators=10,
        max_depth=5,
        learning_rate=0.8,
        min_child_weight=10,
        reg_lambda=8,
        tree_method='gpu_hist',
        scale_pos_weight=imbalanceRatio,
        random_state=randomState,
    )),
])

_ = modelXGB.fit(dfTrainFull, yTrainFull)

Outer cross-validation(for testing the tuned model)

In [None]:
# Outer-CV summary for XGBoost; the returned dict is not stored, so it is
# shown as the cell output.
getBestModelResults(modelXGB)

#### Curves and measures of error

In [None]:
# Test-set scores and plots for XGBoost; the returned lists are discarded.
_ = getMeasures(modelXGB)

### XGBoost results:
| Measure           |    mean +/- std   |
|:-                 |:-:                |
| val f1_weighted   | 0.7505 +/- 0.0032 |
| val roc_auc       | 0.7591 +/- 0.0033 |
| train f1_weighted | 0.8114 +/- 0.0008 |
| train roc_auc     | 0.835  +/- 0.0008 |
|test f1_weighted   | 0.749             |
|test roc_auc       | 0.813             |

### AdaBoostClassifier

In [None]:
%%time
model_AB = Pipeline([("classifier", AdaBoostClassifier(base_estimator = DecisionTreeClassifier(),
                                                       random_state=randomState)
                      )])

# max_leaf_nodes starts at 2: scikit-learn trees reject values below 2, so
# the previous candidate of 1 made every one of its fits fail (scored as NaN,
# with the warning hidden by the global warnings filter).
param_grid = {
    'classifier__learning_rate' : (0.1, 0.5, 1, 2),
    'classifier__base_estimator__class_weight': [None, 'balanced'],
    'classifier__base_estimator__max_depth': [1, 3, 5],
    'classifier__base_estimator__max_leaf_nodes': [2, 3, 5],

}

scoring = {"auc": "roc_auc", "f1_weighted": "f1_weighted"}

# refit=False: the chosen model is built by hand in a later cell.
model_grid_search_AB = GridSearchCV(model_AB,
                                 param_grid=param_grid,
                                 scoring=scoring,
                                 n_jobs=-1,
                                 cv=innerCV,
                                 return_train_score=True,
                                 verbose=4,
                                 refit=False)

_ = model_grid_search_AB.fit(dfTrainFull, yTrainFull)

In [None]:
# Parameter columns of cv_results_ to show next to the score and rank columns.
paramsAB = ['param_classifier__learning_rate',
            'param_classifier__base_estimator__class_weight',
            'param_classifier__base_estimator__max_depth',
            'param_classifier__base_estimator__max_leaf_nodes'
           ]

getResults(model_grid_search_AB, paramsAB)

#### Selecting best parameters
Choosing max_depth = 3, max_leaf_nodes = 3, learning_rate = 0.8 and class_weight = None.

In [None]:
%%time

# Final AdaBoost model over class-weighted decision trees.
# NOTE(review): the markdown above this cell lists max_depth = 3,
# max_leaf_nodes = 3, learning_rate = 0.8 and class_weight = None, which
# differ from the values used here (and 0.8 is not among the searched
# learning rates) — confirm which set was actually selected.
modelAB = Pipeline(steps=[
    ("classifier", AdaBoostClassifier(
        base_estimator=DecisionTreeClassifier(class_weight='balanced',
                                              max_depth=1,
                                              max_leaf_nodes=5),
        learning_rate=1.0,
        random_state=randomState,
    )),
])

_ = modelAB.fit(dfTrainFull, yTrainFull)

Outer cross-validation(for testing the tuned model)

In [None]:
# Outer-CV summary for AdaBoost; the returned dict is not stored, so it is
# shown as the cell output.
getBestModelResults(modelAB)

#### Curves and error measures

In [None]:
# Test-set scores and plots for AdaBoost; the returned lists are discarded.
_ = getMeasures(modelAB)

### AdaBoostClassifier results:
| Measure           |    mean +/- std   |
|:-                 |:-:                |
| val f1_weighted   | 0.7551 +/- 0.0015 |
| val roc_auc       | 0.7554 +/- 0.001  |
| train f1_weighted | 0.8116 +/- 0.001  |
| train roc_auc     | 0.8126 +/- 0.0006 |
|test f1_weighted   | 0.756             |
|test roc_auc       | 0.813             |

## Comparing models

In [None]:
# Overlay the test-set ROC curve of every fitted model on one figure and
# write each model's AUC and weighted F1 in a column of text, one row per model.
models = [modelDT, modelLR, modelRF, modelXGB, modelAB]

fig, ax = plt.subplots(figsize=(15, 10))
delta = 0  # running vertical offset of the text rows

for model in models:

    modelName = type(model.named_steps.classifier).__name__
    yTestpredProb_ = model.predict_proba(dfTest)[:,1]
    yTestpred_ = model.predict(dfTest)

    auc = round(roc_auc_score(yTest, yTestpredProb_),3)
    f1Score = round(f1_score(yTest, yTestpred_, average='weighted'),3)

    fpr, tpr, _ = roc_curve(yTest.values, yTestpredProb_)
    roc_display1 = RocCurveDisplay(fpr=fpr, tpr=tpr)
    roc_display1.plot(ax=ax, name=modelName)

    ax.text(0.65, 0.155 - delta, f'auc = {auc}', size=14 )
    ax.text(0.78, 0.155 - delta, f'weighted f1  = {f1Score}', size=14)
    delta += 0.043

# Title and legend only need to be set once, after every curve is drawn.
ax.set_title('ROC_AUC and F1 SCORE', size= 16)
ax.legend(loc='lower center', prop={'size': 14})

#### In this case the best model is the AdaBoostClassifier.

## Saving Models with Pickle

In [None]:
import os

# Create the output folder on first run.
if not os.path.exists('models'):
    os.mkdir('models')

# Each pipeline is pickled to models/<classifier class name>.bin, so two
# pipelines with the same classifier class would overwrite each other.
models = [modelDT, modelLR, modelRF, modelXGB, modelAB]
for model in models:
    modelName = type(model.named_steps.classifier).__name__
    print(f'Saving pickle file for {modelName}')
    outputFile = f'models/{modelName}.bin'
    with open(outputFile, 'wb') as f:
        pickle.dump(model, f)

## Testing model predictions

In [None]:
# First example respondent, written with human-readable answers; the next
# cells convert it to the numeric encoding before prediction.
# NOTE(review): 'genhlth' is given as 'Yes', which the `binary` mapping below
# turns into 1 — confirm that is the intended general-health code.
inputData = {
    'highbp': 'No',
    'highchol': 'Yes',
     'cholcheck': 'Yes',
     'bmi': 27,
     'smoker': 'Yes',
     'stroke': 'No',
     'heartdiseaseorattack': 'No',
     'physactivity': 'No',
     'fruits': 'Yes',
     'veggies': 'Yes',
     'hvyalcoholconsump': 'No',
     'anyhealthcare': 'No',
     'nodocbccost': 'No',
     'genhlth': 'Yes',
     'menthlth': 0,
     'physhlth': 0,
     'diffwalk': 'No',
     'sex': 'Female',
     'age': 25,
     'education': 'Some high school',
     'income': '$25,000 to $35,000'
}
inputData

In [None]:
# Turn the input dict into a one-row DataFrame and drop the same four
# features that were removed from the training data.
data = pd.DataFrame(inputData, index=[1])
data.drop(columns=['nodocbccost', 'fruits', 'anyhealthcare', 'veggies'], inplace=True)
# Mappings from human-readable answers to numeric codes. They are merged into
# one dict and applied to every column by `data.replace` in a later cell.
sex = {'Male':0, 'Female':1}
binary = {'Yes': 1, 'No': 0}
education = {'Never attended school or only kindergarten' : 1,
             'Elementary school' : 2,
             'Some high school' : 3,
             'High school graduate' : 4,
             'Some college or technical school' : 5,
             'College graduate' : 6 }
# The last key contains two spaces ('or  more'); inputs must match it exactly.
income = {'less than $10,000'  : 1,
          '$10,000 to $15,000' : 2,
          '$15,000 to $20,000' : 3, 
          '$20,000 to $25,000' : 4,
          '$25,000 to $35,000' : 5,
          '$35,000 to $50,000' : 6,
          '$50,000 to $75,000' : 7,
          '$75,000 or  more'   : 8}


def getAgeRange(age):
    """Map an age in years to the 1-13 age-category code.

    Categories: 18-24 -> 1, then five-year bands 25-29 -> 2, 30-34 -> 3, ...,
    75-79 -> 12, and 80 or older -> 13.

    Parameters
    ----------
    age : number or one-element array-like
        The notebook passes ``data.age.values`` (a length-1 array), so a
        single-element array or Series is unwrapped to a scalar first.
        Fractional ages fall into the band that contains them (24.5 -> 1).

    Returns
    -------
    int
        The category code.

    Raises
    ------
    ValueError
        If ``age`` is below 18 or NaN, or if an array with more than one
        element is passed. The previous if/elif chain left the result
        unassigned in the first two cases (and for fractional ages between
        bands) and failed with UnboundLocalError.
    """
    from bisect import bisect_right

    # Lower bound of each category, in order; position + 1 is the code.
    lowerBounds = (18, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80)

    # Unwrap a scalar from a length-1 array/Series; .item() raises ValueError
    # for arrays holding more than one element.
    age = np.asarray(age).item()

    # Written as "not >=" so that NaN is rejected as well.
    if not age >= lowerBounds[0]:
        raise ValueError(f'age must be at least {lowerBounds[0]}, got {age!r}')

    # Number of lower bounds that are <= age == category code.
    return bisect_right(lowerBounds, age)

# Age category of the example; `data.age.values` is a one-element array.
ageRange = getAgeRange(data.age.values)


In [None]:
# Replace the human-readable answers with their numeric codes (the merged
# mapping is applied to every column), then store the age category.
data.replace( binary | sex | education | income , inplace=True)
data.age = ageRange

# Load every file found in the models folder into a dict keyed by the file
# name up to its first dot.
# NOTE(review): pickle.load can execute arbitrary code — only load files this
# notebook wrote itself.
# `path` is used for the listing only; open() below repeats the 'models/' literal.
path = 'models/'
files = os.listdir(path)
models = {}

for file in files:
    filename = file.split('.')[0]
    with open('models/' + file, 'rb') as f:
        models[filename] = pickle.load(f)

# Make predictions: positive-class probability and predicted label per model.
for name, model in models.items():
    yProb_ = model.predict_proba(data)[:,1]
    yPred_ = model.predict(data)
    print(f'{name:<22} : {yProb_} = {yPred_}')

In [None]:
# Second example respondent, written with human-readable answers.
# NOTE(review): 'menthlth' and 'physhlth' are given as 'No' here but as the
# number 0 in the first example — confirm the intended encoding.
inputData2 = {
    'highbp': 'No',
    'highchol': 'Yes',
    'cholcheck': 'Yes',
    'bmi': 30,
    'smoker': 'Yes',
    'stroke': 'No',
    'heartdiseaseorattack': 'No',
    'physactivity': 'Yes',
    'fruits': 'Yes',
    'veggies': 'Yes',
    'hvyalcoholconsump': 'No',
    'anyhealthcare': 'Yes',
    'nodocbccost': 'No',
    'genhlth': 'Yes',
    'menthlth': 'No',
    'physhlth': 'No',
    'diffwalk': 'No',
    'sex': 'Male',
    'age': 57,
    'education': 'High school graduate',
    'income': '$75,000 or  more'}

In [None]:
# Build the model input from inputData2. The previous version of this cell
# re-used `data` and `ageRange` from the first example, so inputData2 was
# never scored and the first example's predictions were printed again.
data = pd.DataFrame(inputData2, index=[1])
data = data.drop(columns=['nodocbccost', 'fruits', 'anyhealthcare', 'veggies'])
ageRange = getAgeRange(data.age.values)

# Replace the human-readable answers with their numeric codes
data = data.replace(binary | sex | education | income)
data.age = ageRange

# Get models from the folder and load them into a dictionary.
# NOTE(review): pickle.load can execute arbitrary code — only load files this
# notebook wrote itself.
path = 'models/'
files = os.listdir(path)
models = {}

for file in files:
    filename = file.split('.')[0]
    with open('models/' + file, 'rb') as f:
        models[filename] = pickle.load(f)

# Make predictions: positive-class probability and predicted label per model.
for name, model in models.items():
    yProb_ = model.predict_proba(data)[:,1]
    yPred_ = model.predict(data)
    print(f'{name:<22} : {yProb_} = {yPred_}')