# This is a notebook which can evaluate multiple prediction files. It implements a number of features:

- TopK
- Removing recent referrals
- Thresholding 
- Tables and visualization
- Config files for standardized processes. 



In [83]:
# Load (or reload) the autoreload extension. Mode 2 picks up edits to imported
# modules live; presumably wanted for the local project modules imported below.
%reload_ext autoreload
%autoreload 2
# Ignore warnings: swap warnings.warn for a no-op so calls made through it are discarded.
import warnings


def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``; accepts any arguments and returns None."""
    return None


warnings.warn = warn

#imports
import pandas as pd
import os, sys
import importlib
import yaml, json
from pathlib import Path

# Make the local ``modules`` directory (relative to the current working
# directory) importable. The membership check keeps the cell idempotent:
# re-running it does not stack duplicate entries on sys.path.
_modules_dir = os.path.join(Path.cwd(), 'modules')
if _modules_dir not in sys.path:
    sys.path.append(_modules_dir)
import Evaluate, Helper, Present, Score, Synthetic


<H3> Imports</H3>

### File configuration
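<p>Edit the configuration file paths below. <code>prediction_config_files</code> lists one configuration file per prediction file to evaluate. <code>config_file</code> holds the remaining settings (referral, visualization, experiment, etc.). Set <code>generate_data</code> to <code>True</code> to generate synthetic event and prediction data before the referrals are loaded.</p>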

In [84]:
# Main configuration file (referral, visualization, experiment, ... settings).
config_file = 'config/config.yaml'

# One configuration file per prediction file to evaluate.
prediction_config_files = [
    'config/predictions/pred1.yaml',
    'config/predictions/pred2.yaml',
]

# When True, synthetic event and prediction data are generated before loading.
generate_data = False


### Load Configuration and Referrals

In [85]:
# Load the main configuration together with every prediction configuration.
# The helper returns six values; presumably the referral, evaluation,
# generation, AWS and visualization settings plus the list of per-prediction
# configs (TODO confirm against Helper.load_configuration).
(c_r, c_e, c_gen, c_aws, c_visual, predictions) = Helper.load_configuration(
    config_file,
    prediction_config_files,
)

# Optionally generate synthetic event and prediction data first.
if generate_data:
    Synthetic.generate_synthetic_event_data(c_gen)
    Synthetic.generate_synthetic_prediction_data(c_gen)

# Read the referral data from the location described by the referral config.
referrals = Helper.read_file(
    directory=c_r['dir'],
    file=c_r['file'],
    file_format=c_r['file_format'],
    aws=c_r['aws'],
    bucket=c_r['bucket'],
)


### Evaluate Models
For each prediction configuration file provided, the prediction file is loaded, its models are evaluated against the referrals, and the results are presented.

In [86]:
# Evaluate and present each prediction configuration in turn.
for c_p in predictions:
    # Read the prediction file described by this configuration.
    prediction = Helper.read_file(
        directory=c_p['dir'],
        file=c_p['file'],
        file_format=c_p['file_format'],
        aws=c_p['aws'],
        bucket=c_p['bucket'],
    )

    # TODO: implement a check that the prediction and evaluation files match.

    # Score the models, then hand the evaluations to the presentation layer.
    all_model_evaluations = Evaluate.evaluate(c_p, c_e, c_r, referrals, prediction)
    Present.present_evaluation(c_p, c_r, c_e, c_visual, all_model_evaluations)

Unnamed: 0,Experiment Name,Model,Window,Eval Date,Num Samples,TN_@k=50,FP_@k=50,FN_@k=50,TP_@k=50,TN_@k=60,FP_@k=60,FN_@k=60,TP_@k=60,TN_@p>=0.5,FP_@p>=0.5,FN_@p>=0.5,TP_@p>=0.5,TN_@p>=0.6,FP_@p>=0.6,FN_@p>=0.6,TP_@p>=0.6,precision_@k=50,precision_@k=60,precision_@p>=0.5,precision_@p>=0.6,recall_@k=50,recall_@k=60,recall_@p>=0.5,recall_@p>=0.6,accuracy_@k=50,accuracy_@k=60,accuracy_@p>=0.5,accuracy_@p>=0.6,balanced_acc_@k=50,balanced_acc_@k=60,balanced_acc_@p>=0.5,balanced_acc_@p>=0.6,f1_score_@k=50,f1_score_@k=60,f1_score_@p>=0.5,f1_score_@p>=0.6,log_loss,roc_auc_score,brier_score_loss,Prediction Source,Referral Source,Result Output,Start Time,End Time,Total Time
0,Version 1.0,lin_reg,2017/01-2017/04,2017-01-01,495,442.0,49.0,3.0,1.0,433.0,58.0,2.0,2.0,236.0,255.0,2.0,2.0,299.0,192.0,2.0,2.0,0.02,0.03333,0.00778,0.01031,0.25,0.5,0.5,0.5,0.25,0.5,0.5,0.5,0.5751,0.69094,0.49033,0.55448,0.03704,0.0625,0.01533,0.0202,0.98694,0.52749,0.32991,./data/simulated_prediction_data/predictions.csv,./data/simulated_prediction_data/referrals.csv,./results/results.csv,2020-06-03 02:17:23.033965-04:00,2020-06-03 02:17:23.082834-04:00,00:00:00.048869
1,Version 1.0,lin_reg,2017/01-2017/07,2017-01-01,495,442.0,49.0,3.0,1.0,433.0,58.0,2.0,2.0,236.0,255.0,2.0,2.0,299.0,192.0,2.0,2.0,0.02,0.03333,0.00778,0.01031,0.25,0.5,0.5,0.5,0.25,0.5,0.5,0.5,0.5751,0.69094,0.49033,0.55448,0.03704,0.0625,0.01533,0.0202,0.98694,0.52749,0.32991,./data/simulated_prediction_data/predictions.csv,./data/simulated_prediction_data/referrals.csv,./results/results.csv,2020-06-03 02:17:23.086844-04:00,2020-06-03 02:17:23.102781-04:00,00:00:00.015937
2,Version 1.0,lin_reg,2017/01-2018/01,2017-01-01,495,439.0,49.0,6.0,1.0,430.0,58.0,5.0,2.0,235.0,253.0,3.0,4.0,298.0,190.0,3.0,4.0,0.02,0.03333,0.01556,0.02062,0.14286,0.28571,0.57143,0.57143,0.14286,0.28571,0.57143,0.57143,0.52122,0.58343,0.52649,0.59104,0.03509,0.0597,0.0303,0.0398,0.98211,0.59456,0.3278,./data/simulated_prediction_data/predictions.csv,./data/simulated_prediction_data/referrals.csv,./results/results.csv,2020-06-03 02:17:23.103802-04:00,2020-06-03 02:17:23.118747-04:00,00:00:00.014945
3,Version 1.0,rand_forest,2017/01-2017/04,2017-01-01,495,443.0,48.0,2.0,2.0,433.0,58.0,2.0,2.0,250.0,241.0,2.0,2.0,304.0,187.0,2.0,2.0,0.04,0.03333,0.00823,0.01058,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.70112,0.69094,0.50458,0.55957,0.07407,0.0625,0.01619,0.02073,0.98813,0.64358,0.32953,./data/simulated_prediction_data/predictions.csv,./data/simulated_prediction_data/referrals.csv,./results/results.csv,2020-06-03 02:17:23.119748-04:00,2020-06-03 02:17:23.133704-04:00,00:00:00.013956
4,Version 1.0,rand_forest,2017/01-2017/07,2017-01-01,495,443.0,48.0,2.0,2.0,433.0,58.0,2.0,2.0,250.0,241.0,2.0,2.0,304.0,187.0,2.0,2.0,0.04,0.03333,0.00823,0.01058,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.70112,0.69094,0.50458,0.55957,0.07407,0.0625,0.01619,0.02073,0.98813,0.64358,0.32953,./data/simulated_prediction_data/predictions.csv,./data/simulated_prediction_data/referrals.csv,./results/results.csv,2020-06-03 02:17:23.135703-04:00,2020-06-03 02:17:23.149666-04:00,00:00:00.013963
5,Version 1.0,rand_forest,2017/01-2018/01,2017-01-01,495,440.0,48.0,5.0,2.0,430.0,58.0,5.0,2.0,249.0,239.0,3.0,4.0,302.0,186.0,4.0,3.0,0.04,0.03333,0.01646,0.01587,0.28571,0.28571,0.57143,0.42857,0.28571,0.28571,0.57143,0.42857,0.59368,0.58343,0.54084,0.52371,0.07018,0.0597,0.032,0.03061,0.98659,0.613,0.32879,./data/simulated_prediction_data/predictions.csv,./data/simulated_prediction_data/referrals.csv,./results/results.csv,2020-06-03 02:17:23.150663-04:00,2020-06-03 02:17:23.165623-04:00,00:00:00.014960
6,Version 1.0,xg_boost,2017/01-2017/04,2017-01-01,495,441.0,50.0,4.0,0.0,431.0,60.0,4.0,0.0,242.0,249.0,3.0,1.0,292.0,199.0,3.0,1.0,0.0,0.0,0.004,0.005,0.0,0.0,0.25,0.25,0.0,0.0,0.25,0.25,0.44908,0.4389,0.37144,0.42235,0.0,0.0,0.00787,0.0098,1.04143,0.31976,0.34027,./data/simulated_prediction_data/predictions.csv,./data/simulated_prediction_data/referrals.csv,./results/results.csv,2020-06-03 02:17:23.166621-04:00,2020-06-03 02:17:23.180583-04:00,00:00:00.013962
7,Version 1.0,xg_boost,2017/01-2017/07,2017-01-01,495,441.0,50.0,4.0,0.0,431.0,60.0,4.0,0.0,242.0,249.0,3.0,1.0,292.0,199.0,3.0,1.0,0.0,0.0,0.004,0.005,0.0,0.0,0.25,0.25,0.0,0.0,0.25,0.25,0.44908,0.4389,0.37144,0.42235,0.0,0.0,0.00787,0.0098,1.04143,0.31976,0.34027,./data/simulated_prediction_data/predictions.csv,./data/simulated_prediction_data/referrals.csv,./results/results.csv,2020-06-03 02:17:23.181582-04:00,2020-06-03 02:17:23.196556-04:00,00:00:00.014974
8,Version 1.0,xg_boost,2017/01-2018/01,2017-01-01,495,439.0,49.0,6.0,1.0,429.0,59.0,6.0,1.0,242.0,246.0,3.0,4.0,291.0,197.0,4.0,3.0,0.02,0.01667,0.016,0.015,0.14286,0.14286,0.57143,0.42857,0.14286,0.14286,0.57143,0.42857,0.52122,0.51098,0.53367,0.51244,0.03509,0.02985,0.03113,0.02899,1.02345,0.51727,0.3369,./data/simulated_prediction_data/predictions.csv,./data/simulated_prediction_data/referrals.csv,./results/results.csv,2020-06-03 02:17:23.196556-04:00,2020-06-03 02:17:23.212498-04:00,00:00:00.015942
9,Version 1.0,sgmm,2017/01-2017/04,2017-01-01,495,441.0,50.0,4.0,0.0,431.0,60.0,4.0,0.0,257.0,234.0,3.0,1.0,299.0,192.0,3.0,1.0,0.0,0.0,0.00426,0.00518,0.0,0.0,0.25,0.25,0.0,0.0,0.25,0.25,0.44908,0.4389,0.38671,0.42948,0.0,0.0,0.00837,0.01015,0.96311,0.41599,0.32183,./data/simulated_prediction_data/predictions.csv,./data/simulated_prediction_data/referrals.csv,./results/results.csv,2020-06-03 02:17:23.213495-04:00,2020-06-03 02:17:23.228457-04:00,00:00:00.014962


Unnamed: 0,Experiment Name,Model,Window,Eval Date,Num Samples,TN_@k=50,FP_@k=50,FN_@k=50,TP_@k=50,TN_@k=60,FP_@k=60,FN_@k=60,TP_@k=60,TN_@p>=0.5,FP_@p>=0.5,FN_@p>=0.5,TP_@p>=0.5,TN_@p>=0.6,FP_@p>=0.6,FN_@p>=0.6,TP_@p>=0.6,precision_@k=50,precision_@k=60,precision_@p>=0.5,precision_@p>=0.6,recall_@k=50,recall_@k=60,recall_@p>=0.5,recall_@p>=0.6,accuracy_@k=50,accuracy_@k=60,accuracy_@p>=0.5,accuracy_@p>=0.6,balanced_acc_@k=50,balanced_acc_@k=60,balanced_acc_@p>=0.5,balanced_acc_@p>=0.6,f1_score_@k=50,f1_score_@k=60,f1_score_@p>=0.5,f1_score_@p>=0.6,log_loss,roc_auc_score,brier_score_loss,Prediction Source,Referral Source,Result Output,Start Time,End Time,Total Time
0,Version 1.0,lin_reg,2017/01-2017/04,2017-01-01,495,442.0,49.0,3.0,1.0,433.0,58.0,2.0,2.0,236.0,255.0,2.0,2.0,299.0,192.0,2.0,2.0,0.02,0.03333,0.00778,0.01031,0.25,0.5,0.5,0.5,0.25,0.5,0.5,0.5,0.5751,0.69094,0.49033,0.55448,0.03704,0.0625,0.01533,0.0202,0.98694,0.52749,0.32991,./data/simulated_prediction_data/predictions2.csv,./data/simulated_prediction_data/referrals.csv,./results/results.csv,2020-06-03 02:17:23.696768-04:00,2020-06-03 02:17:23.741648-04:00,00:00:00.044880
1,Version 1.0,lin_reg,2017/01-2017/07,2017-01-01,495,442.0,49.0,3.0,1.0,433.0,58.0,2.0,2.0,236.0,255.0,2.0,2.0,299.0,192.0,2.0,2.0,0.02,0.03333,0.00778,0.01031,0.25,0.5,0.5,0.5,0.25,0.5,0.5,0.5,0.5751,0.69094,0.49033,0.55448,0.03704,0.0625,0.01533,0.0202,0.98694,0.52749,0.32991,./data/simulated_prediction_data/predictions2.csv,./data/simulated_prediction_data/referrals.csv,./results/results.csv,2020-06-03 02:17:23.743648-04:00,2020-06-03 02:17:23.759613-04:00,00:00:00.015965
2,Version 1.0,lin_reg,2017/01-2018/01,2017-01-01,495,439.0,49.0,6.0,1.0,430.0,58.0,5.0,2.0,235.0,253.0,3.0,4.0,298.0,190.0,3.0,4.0,0.02,0.03333,0.01556,0.02062,0.14286,0.28571,0.57143,0.57143,0.14286,0.28571,0.57143,0.57143,0.52122,0.58343,0.52649,0.59104,0.03509,0.0597,0.0303,0.0398,0.98211,0.59456,0.3278,./data/simulated_prediction_data/predictions2.csv,./data/simulated_prediction_data/referrals.csv,./results/results.csv,2020-06-03 02:17:23.759613-04:00,2020-06-03 02:17:23.774560-04:00,00:00:00.014947
3,Version 1.0,rand_forest,2017/01-2017/04,2017-01-01,495,443.0,48.0,2.0,2.0,433.0,58.0,2.0,2.0,250.0,241.0,2.0,2.0,304.0,187.0,2.0,2.0,0.04,0.03333,0.00823,0.01058,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.70112,0.69094,0.50458,0.55957,0.07407,0.0625,0.01619,0.02073,0.98813,0.64358,0.32953,./data/simulated_prediction_data/predictions2.csv,./data/simulated_prediction_data/referrals.csv,./results/results.csv,2020-06-03 02:17:23.775557-04:00,2020-06-03 02:17:23.790525-04:00,00:00:00.014968
4,Version 1.0,rand_forest,2017/01-2017/07,2017-01-01,495,443.0,48.0,2.0,2.0,433.0,58.0,2.0,2.0,250.0,241.0,2.0,2.0,304.0,187.0,2.0,2.0,0.04,0.03333,0.00823,0.01058,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.70112,0.69094,0.50458,0.55957,0.07407,0.0625,0.01619,0.02073,0.98813,0.64358,0.32953,./data/simulated_prediction_data/predictions2.csv,./data/simulated_prediction_data/referrals.csv,./results/results.csv,2020-06-03 02:17:23.790525-04:00,2020-06-03 02:17:23.806475-04:00,00:00:00.015950
5,Version 1.0,rand_forest,2017/01-2018/01,2017-01-01,495,440.0,48.0,5.0,2.0,430.0,58.0,5.0,2.0,249.0,239.0,3.0,4.0,302.0,186.0,4.0,3.0,0.04,0.03333,0.01646,0.01587,0.28571,0.28571,0.57143,0.42857,0.28571,0.28571,0.57143,0.42857,0.59368,0.58343,0.54084,0.52371,0.07018,0.0597,0.032,0.03061,0.98659,0.613,0.32879,./data/simulated_prediction_data/predictions2.csv,./data/simulated_prediction_data/referrals.csv,./results/results.csv,2020-06-03 02:17:23.806475-04:00,2020-06-03 02:17:23.821457-04:00,00:00:00.014982
6,Version 1.0,xg_boost,2017/01-2017/04,2017-01-01,495,441.0,50.0,4.0,0.0,431.0,60.0,4.0,0.0,242.0,249.0,3.0,1.0,292.0,199.0,3.0,1.0,0.0,0.0,0.004,0.005,0.0,0.0,0.25,0.25,0.0,0.0,0.25,0.25,0.44908,0.4389,0.37144,0.42235,0.0,0.0,0.00787,0.0098,1.04143,0.31976,0.34027,./data/simulated_prediction_data/predictions2.csv,./data/simulated_prediction_data/referrals.csv,./results/results.csv,2020-06-03 02:17:23.822432-04:00,2020-06-03 02:17:23.838390-04:00,00:00:00.015958
7,Version 1.0,xg_boost,2017/01-2017/07,2017-01-01,495,441.0,50.0,4.0,0.0,431.0,60.0,4.0,0.0,242.0,249.0,3.0,1.0,292.0,199.0,3.0,1.0,0.0,0.0,0.004,0.005,0.0,0.0,0.25,0.25,0.0,0.0,0.25,0.25,0.44908,0.4389,0.37144,0.42235,0.0,0.0,0.00787,0.0098,1.04143,0.31976,0.34027,./data/simulated_prediction_data/predictions2.csv,./data/simulated_prediction_data/referrals.csv,./results/results.csv,2020-06-03 02:17:23.839401-04:00,2020-06-03 02:17:23.855346-04:00,00:00:00.015945
8,Version 1.0,xg_boost,2017/01-2018/01,2017-01-01,495,439.0,49.0,6.0,1.0,429.0,59.0,6.0,1.0,242.0,246.0,3.0,4.0,291.0,197.0,4.0,3.0,0.02,0.01667,0.016,0.015,0.14286,0.14286,0.57143,0.42857,0.14286,0.14286,0.57143,0.42857,0.52122,0.51098,0.53367,0.51244,0.03509,0.02985,0.03113,0.02899,1.02345,0.51727,0.3369,./data/simulated_prediction_data/predictions2.csv,./data/simulated_prediction_data/referrals.csv,./results/results.csv,2020-06-03 02:17:23.856352-04:00,2020-06-03 02:17:23.872299-04:00,00:00:00.015947
9,Version 1.0,sgmm,2017/01-2017/04,2017-01-01,495,441.0,50.0,4.0,0.0,431.0,60.0,4.0,0.0,257.0,234.0,3.0,1.0,299.0,192.0,3.0,1.0,0.0,0.0,0.00426,0.00518,0.0,0.0,0.25,0.25,0.0,0.0,0.25,0.25,0.44908,0.4389,0.38671,0.42948,0.0,0.0,0.00837,0.01015,0.96311,0.41599,0.32183,./data/simulated_prediction_data/predictions2.csv,./data/simulated_prediction_data/referrals.csv,./results/results.csv,2020-06-03 02:17:23.873295-04:00,2020-06-03 02:17:23.890258-04:00,00:00:00.016963
