# Results evaluation (diagnosis)
by: masood janbackloo

## Modules

In [None]:
# Standard modules
import warnings
import numpy as np
import pandas as pd
warnings.filterwarnings('ignore')

# Graphical modules
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Models
from sklearn.svm import SVC
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Bootstrap
from sklearn.utils import resample
# Evaluation
from scipy import stats
from sklearn.metrics import classification_report, confusion_matrix, brier_score_loss
from sklearn.metrics import f1_score,roc_auc_score,recall_score, precision_score, accuracy_score, balanced_accuracy_score

# Import models
import joblib

# Import data & models

In [None]:
# Models
# NOTE: joblib.load unpickles the file, so only load .sav files from a trusted source
def _load_model(algorithm, variant):
    """Load the saved estimator for one algorithm / training-variant pair."""
    return joblib.load('model_{}_{}.sav'.format(algorithm, variant))

# Logistic regression
mdl_lre_ove = _load_model('lre', 'oversampled')
mdl_lre_bal = _load_model('lre', 'balanced')
# Support vector machine
mdl_svm_ove = _load_model('svm', 'oversampled')
mdl_svm_bal = _load_model('svm', 'balanced')
# Random forest
mdl_rfc_ove = _load_model('rfc', 'oversampled')
mdl_rfc_bal = _load_model('rfc', 'balanced')
# Gradient boosting
mdl_bst_ove = _load_model('bst', 'oversampled')
mdl_bst_bal = _load_model('bst', 'balanced')
# XGBoost
mdl_xgb_ove = _load_model('xgb', 'oversampled')
mdl_xgb_bal = _load_model('xgb', 'balanced')
# Dummy models (currently not loaded)
#mdl_dmf = joblib.load('results_modelsDevelopment/model_dummy_mf_oversampled.sav')
#mdl_dst = joblib.load('results_modelsDevelopment/model_dummy_st_oversampled.sav')
# Test set: features and labels
x_test = pd.read_csv('x_test.csv')
y_test = pd.read_csv('y_test.csv')
# BOCV-5 results
bocv5_results = pd.read_csv('BO5CV_best_results.csv')
# Backtest: features only; every backtest sample is given the label 0
x_resp = pd.read_csv('x_resp.csv')
y_resp = pd.Series(np.zeros(len(x_resp)))

# Auxiliary functions & Parameters

In [None]:
# Bootstrap parameters
# Number of bootstrap resamples to draw
REPETITIONS = 999
# One random_state seed per repetition: 0, 1, ..., REPETITIONS - 1
RS_GENERATOR = range(REPETITIONS)

In [None]:
def calculate_metrics(model, x_test, y_true):
    """Compute F1-score, ROC AUC and accuracy of a fitted binary classifier.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``predict``; ``predict_proba`` or ``decision_function``
        is used for the ROC AUC when available.
    x_test : features passed to the model.
    y_true : ground-truth binary labels aligned with ``x_test``.

    Returns
    -------
    dict
        ``'F1-Score'``, ``'AUC ROC'`` and ``'Accuracy'`` as percentages
        rounded to 3 decimals.
    """
    # Hard class labels drive the threshold-dependent metrics (F1, accuracy)
    y_pred = model.predict(x_test)

    # ROC AUC is a ranking metric and needs continuous scores. Feeding it the
    # thresholded labels collapses the curve to a single operating point.
    if hasattr(model, 'predict_proba'):
        # Column 1 holds the positive-class probability
        y_score = model.predict_proba(x_test)[:, 1]
    elif hasattr(model, 'decision_function'):
        y_score = model.decision_function(x_test)
    else:
        # Last resort: no continuous output is available for this model
        y_score = y_pred

    # Calculate metrics
    f1s = f1_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_score)
    acc = accuracy_score(y_true, y_pred)
    # return results
    return {'F1-Score':round(100*f1s,3),'AUC ROC':round(100*auc,3),'Accuracy':round(100*acc,3)}

In [None]:
def calculate_confusion_matrix_results(model, x_test, y_true):
    """Return the four binary confusion-matrix counts of a fitted classifier.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    x_test : features passed to the model.
    y_true : ground-truth labels encoded as 0 (negative) / 1 (positive).

    Returns
    -------
    dict
        Counts under the keys ``'TN'``, ``'TP'``, ``'FN'`` and ``'FP'``.
    """
    # Generate a prediction using the model
    y_pred = model.predict(x_test)

    # Pin the label set so the matrix is always 2x2. Without it, data that
    # contains a single class yields a 1x1 matrix and the unpacking fails.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    # Return
    return {'TN':tn,'TP':tp,'FN':fn,'FP':fp}

In [None]:
def calculate_1_minus_brier_score_loss(model, x_test, y_true):
    """Return the Brier score of the positive-class probabilities and its complement.

    The model must expose ``predict_proba``; column 1 of its output is scored
    against ``y_true`` with label 1 as the positive class. Both values are
    rounded to 3 decimals.
    """
    # Probability assigned to the positive class for every sample
    positive_proba = model.predict_proba(x_test)[:, 1]
    brier = brier_score_loss(y_true, positive_proba, pos_label=1)

    scores = {'Brier Score': round(brier, 3)}
    scores['1 - Brier Score'] = round(1 - brier, 3)
    return scores

In [None]:
def bootstrap_resampling(x_test, y_test, rs_number, sample_size=None):
    """Draw one bootstrap sample (with replacement) of features and labels.

    Parameters
    ----------
    x_test : pandas object holding the features; its index must also label
        the rows of ``y_test``.
    y_test : pandas object holding the labels, looked up by index label.
    rs_number : seed passed to ``resample`` as ``random_state``.
    sample_size : int or None, default None
        Number of rows to draw. ``None`` lets ``resample`` draw as many rows
        as ``x_test`` has, so callers that want a same-size bootstrap can
        omit the argument.

    Returns
    -------
    tuple
        ``(bootstrap_x, bootstrap_y)`` with matching row order.
    """
    # Generate X sample
    bootstrap_x = resample(x_test, replace=True, n_samples=sample_size, random_state=rs_number)
    # Reuse the sampled index labels so each X row keeps its own Y value
    bootstrap_y = y_test.loc[bootstrap_x.index]
    # Return
    return bootstrap_x, bootstrap_y

In [None]:
def bootstrap_confidence_interval(values):
    """Summarise bootstrap values as ``(lower, mean, upper)``.

    The bounds are the 2.5th and 97.5th percentiles of ``values``, clamped to
    the range [0, 1]; the mean is not clamped. All three numbers are rounded
    to 3 decimals.
    """
    low_pct, high_pct = np.percentile(values, [2.5, 97.5])
    # Keep the bounds inside the valid range
    clipped_low = max(0.0, low_pct)
    clipped_high = min(1.0, high_pct)
    return (round(clipped_low, 3), round(np.mean(values), 3), round(clipped_high, 3))

In [None]:
def calculate_metrics_best_model(model, x_test, y_true):
    """Compute the metrics reported for the selected model.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``predict``; ``predict_proba`` or ``decision_function``
        is used for the ROC AUC when available.
    x_test : features passed to the model.
    y_true : ground-truth binary labels (0 = negative, 1 = positive).

    Returns
    -------
    tuple
        ``(f1score, roc_auc, sensitivity, specificity, precision)`` as raw
        fractions, not percentages.
    """
    # Hard class labels drive the threshold-dependent metrics
    y_pred = model.predict(x_test)

    # ROC AUC is a ranking metric and needs continuous scores. Feeding it the
    # thresholded labels collapses the curve to a single operating point.
    if hasattr(model, 'predict_proba'):
        # Column 1 holds the positive-class probability
        y_score = model.predict_proba(x_test)[:, 1]
    elif hasattr(model, 'decision_function'):
        y_score = model.decision_function(x_test)
    else:
        # Last resort: no continuous output is available for this model
        y_score = y_pred

    # Calculate several metrics with SKlearn
    f1score = f1_score(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, y_score)
    # Sensitivity is the recall of the positive class, specificity that of the negative class
    sensitivity = recall_score(y_true, y_pred, pos_label=1)
    specificity = recall_score(y_true, y_pred, pos_label=0)
    precision = precision_score(y_true, y_pred)

    # Return
    return f1score, roc_auc, sensitivity, specificity, precision

In [None]:
def calculate_backtest_accuracy(model, x_resp, y_resp):
    """Return the accuracy of ``model`` on the backtest set.

    The model's predictions for ``x_resp`` are compared against ``y_resp``;
    the result is the raw fraction returned by ``accuracy_score``.
    """
    predicted_labels = model.predict(x_resp)
    return accuracy_score(y_resp, predicted_labels)

In [None]:
def calculate_backtest_real_accuracy(mdl_object, x_resp, y_resp):
    """Return the backtest accuracy of a model as a percentage.

    Parameters
    ----------
    mdl_object : fitted estimator exposing ``predict``.
    x_resp : backtest features.
    y_resp : backtest labels; flattened to 1-D before scoring.

    Returns
    -------
    dict
        ``{'ACC (%)': value}`` with the accuracy in percent, rounded to
        2 decimals.
    """
    y_true = np.asarray(y_resp).ravel()
    acc = accuracy_score(y_true, mdl_object.predict(x_resp))
    # Scale first, then round: rounding the fraction first would truncate the
    # percentage to whole points and leave float noise after multiplying.
    # The builtin round() also works when the score is a plain Python float,
    # which has no .round() method.
    return {'ACC (%)': round(100 * float(acc), 2)}

In [None]:
def get_metrics_using_bootstrap_best_model(model, x_test, y_test, rs_generator,
                                           target_col='COVID-19 Exam result'):
    """Collect bootstrap distributions of the selected model's metrics.

    The bootstrap is stratified: negative and positive cases are resampled
    separately (each stratum keeps its original size) and then merged, so
    every sample preserves the class balance of the test set.

    Parameters
    ----------
    model : fitted estimator evaluated with ``calculate_metrics_best_model``.
    x_test : pandas.DataFrame of features, sharing its index with ``y_test``.
    y_test : pandas.DataFrame holding the 0/1 labels in ``target_col``.
    rs_generator : iterable of seeds; one bootstrap sample is drawn per seed.
    target_col : str, default ``'COVID-19 Exam result'``
        Name of the label column in ``y_test``.

    Returns
    -------
    dict
        Lists of per-sample values under the keys ``'F1S'``, ``'ROC'``,
        ``'SEN'``, ``'SPE'`` and ``'PRE'``.
    """
    # Metrics list
    f1s_list = []
    roc_list = []
    sen_list = []
    spe_list = []
    pre_list = []

    # Bootstrap Stratified
    # Get indexes for positive and negative cases
    neg_idx = y_test.index[y_test[target_col] == 0]
    pos_idx = y_test.index[y_test[target_col] == 1]

    # Split X set for positive and negative cases
    x_test_neg = x_test.loc[neg_idx, :]
    x_test_pos = x_test.loc[pos_idx, :]

    # Split Y set for positive and negative cases
    y_test_neg = y_test.loc[neg_idx, :]
    y_test_pos = y_test.loc[pos_idx, :]

    # Loop to generate a sample and generate metrics
    for rs in rs_generator:
        # Bootstrap resampling - negative
        x_sample_neg, y_sample_neg = bootstrap_resampling(x_test_neg, y_test_neg, rs_number=rs, sample_size=len(y_test_neg))
        # Bootstrap resampling - positive
        x_sample_pos, y_sample_pos = bootstrap_resampling(x_test_pos, y_test_pos, rs_number=rs, sample_size=len(y_test_pos))
        # Merge them into one (DataFrame.append was removed in pandas 2.0)
        x_sample = pd.concat([x_sample_neg, x_sample_pos])
        y_sample = pd.concat([y_sample_neg, y_sample_pos])
        # Calculate the metrics
        f1s, roc, sen, spe, pre = calculate_metrics_best_model(model, x_sample, y_sample)
        # Append results
        f1s_list.append(f1s)
        roc_list.append(roc)
        sen_list.append(sen)
        spe_list.append(spe)
        pre_list.append(pre)

    # Return
    return {'F1S':f1s_list,'ROC':roc_list,'SEN':sen_list,'SPE':spe_list,'PRE':pre_list}

In [None]:
def get_metrics_using_bootstrap_for_backtest(model, x_resp, y_resp, rs_generator):
    """Collect the bootstrap distribution of the backtest accuracy.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    x_resp : backtest features (pandas object sharing its index with ``y_resp``).
    y_resp : backtest labels.
    rs_generator : iterable of seeds; one bootstrap sample is drawn per seed.

    Returns
    -------
    list
        One accuracy value per seed.
    """
    # Metrics list
    acc_list = []

    # Loop to generate a sample and generate metrics
    for rs in rs_generator:
        # Draw a sample as large as the backtest set itself; the sample size
        # is passed explicitly because bootstrap_resampling expects it.
        x_sample, y_sample = bootstrap_resampling(x_resp, y_resp, rs_number=rs, sample_size=len(x_resp))
        acc = calculate_backtest_accuracy(model, x_sample, y_sample)
        # Append results
        acc_list.append(acc)

    # Return
    return acc_list

In [None]:
def eval_pvalue(pval):
    """Translate a p-value into a verdict using a 0.05 threshold.

    Values strictly greater than 0.05 fail to reject H0; anything else
    (including exactly 0.05) rejects it.
    """
    verdict = 'Different distributions (reject H0)'
    if pval > 0.05:
        verdict = 'No significant difference between distributions (fail to reject H0)'
    return verdict

# EVALUATION 0 - Model Development results

In [None]:
# Express the CV scores as percentages rounded to 3 decimals.
# A derived frame is built instead of overwriting bocv5_results, so re-running
# this cell does not multiply the scores by 100 a second time.
bocv5_results_pct = bocv5_results.assign(
    mean_test_score=np.round(100 * bocv5_results['mean_test_score'], 3),
    std_test_score=np.round(100 * bocv5_results['std_test_score'], 3),
)

# Show
bocv5_results_pct[['mean_test_score','std_test_score','ai_algorithm']]

Unnamed: 0,mean_test_score,std_test_score,ai_algorithm
0,83.136,2.661,lre
1,95.561,2.56,svm
2,95.561,2.56,rfc
3,95.561,2.56,bst
4,95.561,2.56,xgb


# EVALUATION 1 - Calculate several metrics for test set 

**Metrics**
- F1 Score (related to precision and recall)
- AUC ROC Score
- Accuracy

## Test Overall results

In [None]:
# Fitted models keyed by the label shown in the results table
models_by_label = {
    'LRE_ov': mdl_lre_ove,
    'LRE_bal': mdl_lre_bal,
    'SVM_ove': mdl_svm_ove,
    'SVM_bal': mdl_svm_bal,
    'RFC_ove': mdl_rfc_ove,
    'RFC_bal': mdl_rfc_bal,
    'BST_ove': mdl_bst_ove,
    'BST_bal': mdl_bst_bal,
    'XGB_ove': mdl_xgb_ove,
    'XGB_bal': mdl_xgb_bal,
}

# One record of test-set metrics per model, tagged with the model label
test_results = [
    {**calculate_metrics(estimator, x_test, y_test), 'Model': label}
    for label, estimator in models_by_label.items()
]

# Transform into a dataframe
df_test_results = pd.DataFrame(test_results)
df_test_results

Unnamed: 0,F1-Score,AUC ROC,Accuracy,Model
0,51.282,80.769,78.889,LRE_ov
1,50.0,77.885,80.0,LRE_bal
2,46.154,69.872,84.444,SVM_ove
3,53.333,76.923,84.444,SVM_bal
4,78.261,86.218,94.444,RFC_ove
5,54.545,72.436,88.889,RFC_bal
6,72.727,82.051,93.333,BST_ove
7,60.0,73.718,91.111,BST_bal
8,61.538,79.487,88.889,XGB_ove
9,57.143,73.077,90.0,XGB_bal


# EVALUATION 2 - Get Confusion Matrix results from test set

In [None]:
# Fitted models keyed by the label shown in the results table
models_by_label = {
    'LRE_ov': mdl_lre_ove,
    'LRE_bal': mdl_lre_bal,
    'SVM_ove': mdl_svm_ove,
    'SVM_bal': mdl_svm_bal,
    'RFC_ove': mdl_rfc_ove,
    'RFC_bal': mdl_rfc_bal,
    'BST_ove': mdl_bst_ove,
    'BST_bal': mdl_bst_bal,
    'XGB_ove': mdl_xgb_ove,
    'XGB_bal': mdl_xgb_bal,
}

# One record of confusion-matrix counts per model, tagged with the model label
test_results = [
    {**calculate_confusion_matrix_results(estimator, x_test, y_test), 'Model': label}
    for label, estimator in models_by_label.items()
]

# Transform into a dataframe
df_cm_results = pd.DataFrame(test_results)
df_cm_results

Unnamed: 0,TN,TP,FN,FP,Model
0,61,10,2,17,LRE_ov
1,63,9,3,15,LRE_bal
2,70,6,6,8,SVM_ove
3,68,8,4,10,SVM_bal
4,76,9,3,2,RFC_ove
5,74,6,6,4,RFC_bal
6,76,8,4,2,BST_ove
7,76,6,6,2,BST_bal
8,72,8,4,6,XGB_ove
9,75,6,6,3,XGB_bal


# EVALUATION 3 - Get a probabilistic metric approach

In [None]:
# Fitted models keyed by the label shown in the results table
models_by_label = {
    'LRE_ov': mdl_lre_ove,
    'LRE_bal': mdl_lre_bal,
    'SVM_ove': mdl_svm_ove,
    'SVM_bal': mdl_svm_bal,
    'RFC_ove': mdl_rfc_ove,
    'RFC_bal': mdl_rfc_bal,
    'BST_ove': mdl_bst_ove,
    'BST_bal': mdl_bst_bal,
    'XGB_ove': mdl_xgb_ove,
    'XGB_bal': mdl_xgb_bal,
}

# One record of Brier-score results per model, tagged with the model label
test_results = [
    {**calculate_1_minus_brier_score_loss(estimator, x_test, y_test), 'Model': label}
    for label, estimator in models_by_label.items()
]

# Transform into a dataframe
df_brier_results = pd.DataFrame(test_results)
df_brier_results

Unnamed: 0,Brier Score,1 - Brier Score,Model
0,0.139,0.861,LRE_ov
1,0.146,0.854,LRE_bal
2,0.104,0.896,SVM_ove
3,0.084,0.916,SVM_bal
4,0.074,0.926,RFC_ove
5,0.073,0.927,RFC_bal
6,0.061,0.939,BST_ove
7,0.084,0.916,BST_bal
8,0.079,0.921,XGB_ove
9,0.079,0.921,XGB_bal


# EVALUATION 4 - Backtest Accuracy

In [None]:
# Fitted models keyed by the label shown in the results table
models_by_label = {
    'LRE_ov': mdl_lre_ove,
    'LRE_bal': mdl_lre_bal,
    'SVM_ove': mdl_svm_ove,
    'SVM_bal': mdl_svm_bal,
    'RFC_ove': mdl_rfc_ove,
    'RFC_bal': mdl_rfc_bal,
    'BST_ove': mdl_bst_ove,
    'BST_bal': mdl_bst_bal,
    'XGB_ove': mdl_xgb_ove,
    'XGB_bal': mdl_xgb_bal,
}

# One record of backtest accuracy per model, tagged with the model label
test_results = [
    {**calculate_backtest_real_accuracy(estimator, x_resp, y_resp), 'Model': label}
    for label, estimator in models_by_label.items()
]

# Transform into a dataframe
df_backtest_results = pd.DataFrame(test_results)
df_backtest_results

Unnamed: 0,ACC (%),Model
0,85.0,LRE_ov
1,85.0,LRE_bal
2,85.0,SVM_ove
3,85.0,SVM_bal
4,92.0,RFC_ove
5,92.0,RFC_bal
6,92.0,BST_ove
7,100.0,BST_bal
8,96.0,XGB_ove
9,96.0,XGB_bal


# Overall Commentary over previous results
As you can see, **Random Forest** performs better than the other models (considering the best results across the CM, metrics and Brier score evaluations):
- Metrics: Best F1-Score and AUC
- CM: Best TN/TP/FP/FN
- Best Brier Score
- Loses the backtest by one misclassified sample

The only evaluation where RF was not the best, by one sample, was the backtest. So the selected model will be RF.

# FINAL RESULT - Bootstrap CI 95% for selected AI algorithm with several metrics
- F1-Score
- AUC ROC
- Sensitivity
- Specificity
- Precision

## Get BCI 95% for RF

In [None]:
# Bootstrap the test set and collect the metric values of the selected model
selected_model_bci_results = get_metrics_using_bootstrap_best_model(
    model=mdl_rfc_ove,
    x_test=x_test,
    y_test=y_test,
    rs_generator=RS_GENERATOR,
)

In [None]:
# Empty single-row frame with one column per bootstrapped metric
bci_columns = list(selected_model_bci_results)
df_bci_best_model = pd.DataFrame(columns=bci_columns, index=[0])

In [None]:
# Summarise each bootstrapped metric as a (lower bound, mean, upper bound) tuple
for metric_name in df_bci_best_model.columns:
    bootstrap_values = selected_model_bci_results[metric_name]
    df_bci_best_model.loc[0, metric_name] = bootstrap_confidence_interval(bootstrap_values)

# Show results
df_bci_best_model

Unnamed: 0,F1S,ROC,SEN,SPE,PRE
0,"(0.571, 0.777, 0.923)","(0.737, 0.859, 0.959)","(0.5, 0.744, 0.921)","(0.936, 0.975, 1.0)","(0.615, 0.83, 1.0)"


In [None]:
# Point estimates from Evaluation 1 for the selected model (compare F1S and ROC)
is_selected_model = df_test_results['Model'].eq('RFC_ove')
df_test_results.loc[is_selected_model]

Unnamed: 0,F1-Score,AUC ROC,Accuracy,Model
4,78.261,86.218,94.444,RFC_ove


# **Export results**

In [None]:
# Output file name -> result table, written in this order without the index column
exports = {
    'eval_1_testSet_metrics.csv': df_test_results,
    'eval_2_confusionMatrix_metrics.csv': df_cm_results,
    'eval_3_testSet_brierscore.csv': df_brier_results,
    'eval_4_backtest.csv': df_backtest_results,
    'best_model_bci_result.csv': df_bci_best_model,
}
for csv_name, result_frame in exports.items():
    result_frame.to_csv(csv_name, index=False)