# This is a notebook which can evaluate multiple files. It implements a number of features:

- TopK
- Removing recent referrals
- Thresholding 
- Tables and visualization
- Config files for standardized processes. 



In [298]:
# Load (or reload) IPython's autoreload extension.
%reload_ext autoreload
# Mode 2: re-import modules before each cell runs, so edits to the
# project's .py modules are picked up without restarting the kernel.
%autoreload 2
# Suppress warnings: swap warnings.warn for a do-nothing stand-in so that
# later calls to warnings.warn(...) emit nothing.
import warnings


def warn(*args, **kwargs):
    """Accept any arguments, discard them, and return None."""
    return None


warnings.warn = warn

#imports
import pandas as pd
import os, sys
import importlib
import yaml, json
from pathlib import Path

# Put <current working directory>/modules on the import search path.
sys.path.append(str(Path.cwd() / 'modules'))
import Evaluate, Helper, Present, Score, Synthetic


<H3> Imports</H3>

### File configuration
<p> Just edit the configuration file paths below. <code>prediction_config_files</code> lists the config files for the different prediction files. <code>config_file</code> contains the other configs (referral, visualization, experiment, etc.). </p>

In [299]:
# Main configuration file.
config_file = 'config/config.yaml'

# One configuration file per prediction file to evaluate.
prediction_config_files = [
    'config/predictions/pred1.yaml',
    'config/predictions/pred2.yaml',
]

# Switch controlling whether synthetic data is generated before evaluation.
generate_data = True


### Load Configuration and Referrals

In [300]:
'''
Transferred referral data loading inside the c_p for loop below, because referral_data is being changed for date format adjustment. 
As referral_data.copy() is not allowed for memory issues. The overhead shouldn't be much.
'''

# Load the configuration sections from the main config file and the
# per-prediction config files.
# NOTE(review): presumably c_r = referral, c_e = evaluation/experiment,
# c_gen = data generation, c_aws = AWS, c_visual = visualization -- confirm
# against Helper.load_configuration.
(
    c_r,
    c_e,
    c_gen,
    c_aws,
    c_visual,
    predictions,
) = Helper.load_configuration(config_file, prediction_config_files)

# When requested, generate the synthetic event data first, then the
# synthetic prediction data, both driven by the generation config.
if generate_data:
    Synthetic.generate_synthetic_event_data(c_gen)
    Synthetic.generate_synthetic_prediction_data(c_gen)

### Evaluate Models
For each prediction configuration file provided, the notebook evaluates the models and presents the results.

In [301]:
def _read_from_config(cfg):
    """Read the file described by a config mapping via ``Helper.read_file``."""
    return Helper.read_file(
        directory=cfg['dir'],
        file=cfg['file'],
        file_format=cfg['file_format'],
        aws=cfg['aws'],
        bucket=cfg['bucket'],
    )


for c_p in predictions:
    # Both inputs are read fresh on every pass; the referral file is
    # deliberately re-read here rather than loaded once and copied.
    prediction = _read_from_config(c_p)
    referrals = _read_from_config(c_r)

    #TBD Implement a check that there is a match between prediction and evaluation file.

    # Score the models for this prediction config, then render the results.
    all_model_evaluations = Evaluate.evaluate(c_p, c_e, c_r, referrals, prediction)
    Present.present_evaluation(c_p, c_r, c_e, c_visual, all_model_evaluations)

Columns from dataframe (referral/prediction): ['PERSON_ID', 'MYR']
Columns from config(c_r/c_p): ['PERSON_ID', 'MYR']
Columns from dataframe (referral/prediction): ['PERSON_ID', 'MYR', 'lin_reg', 'rand_forest', 'xg_boost', 'sgmm']
Columns from config(c_r/c_p): ['PERSON_ID', 'MYR']
Number of Unique IDs in Referral: 25
Number of Unique IDs in Prediction: 500
Number of Intersected IDs: 15


Unnamed: 0,Experiment Name,Model,Window,Eval Date,Num Samples,TN_@k=50,FP_@k=50,FN_@k=50,TP_@k=50,TN_@k=60,FP_@k=60,FN_@k=60,TP_@k=60,TN_@p>=0.5,FP_@p>=0.5,FN_@p>=0.5,TP_@p>=0.5,TN_@p>=0.6,FP_@p>=0.6,FN_@p>=0.6,TP_@p>=0.6,precision_@k=50,precision_@k=60,precision_@p>=0.5,precision_@p>=0.6,recall_@k=50,recall_@k=60,recall_@p>=0.5,recall_@p>=0.6,accuracy_@k=50,accuracy_@k=60,accuracy_@p>=0.5,accuracy_@p>=0.6,balanced_acc_@k=50,balanced_acc_@k=60,balanced_acc_@p>=0.5,balanced_acc_@p>=0.6,f1_score_@k=50,f1_score_@k=60,f1_score_@p>=0.5,f1_score_@p>=0.6,log_loss,roc_auc_score,brier_score_loss,Prediction Source,Referral Source,Result Output,Start Time,End Time,Total Time
0,Version 1.0,lin_reg,201701-201704,201701,495,441.0,50.0,4.0,0.0,432.0,59.0,3.0,1.0,235.0,256.0,3.0,1.0,298.0,193.0,3.0,1.0,0.0,0.01667,0.00389,0.00515,0.0,0.25,0.25,0.25,0.0,0.25,0.25,0.25,0.44908,0.56492,0.36431,0.42846,0.0,0.03125,0.00766,0.0101,1.0133,0.31237,0.33334,./data/simulated_prediction_data/predictions.csv,./data/simulated_prediction_data/referrals.csv,./results/results.csv,2020-06-03 22:32:17.753429-04:00,2020-06-03 22:32:17.800290-04:00,00:00:00.046861
1,Version 1.0,lin_reg,201701-201707,201701,495,441.0,50.0,4.0,0.0,432.0,59.0,3.0,1.0,235.0,256.0,3.0,1.0,298.0,193.0,3.0,1.0,0.0,0.01667,0.00389,0.00515,0.0,0.25,0.25,0.25,0.0,0.25,0.25,0.25,0.44908,0.56492,0.36431,0.42846,0.0,0.03125,0.00766,0.0101,1.0133,0.31237,0.33334,./data/simulated_prediction_data/predictions.csv,./data/simulated_prediction_data/referrals.csv,./results/results.csv,2020-06-03 22:32:17.803298-04:00,2020-06-03 22:32:17.820237-04:00,00:00:00.016939
2,Version 1.0,lin_reg,201701-201801,201701,495,438.0,50.0,7.0,0.0,429.0,59.0,6.0,1.0,233.0,255.0,5.0,2.0,296.0,192.0,5.0,2.0,0.0,0.01667,0.00778,0.01031,0.0,0.14286,0.28571,0.28571,0.0,0.14286,0.28571,0.28571,0.44877,0.51098,0.38159,0.44614,0.0,0.02985,0.01515,0.0199,1.01966,0.32216,0.33557,./data/simulated_prediction_data/predictions.csv,./data/simulated_prediction_data/referrals.csv,./results/results.csv,2020-06-03 22:32:17.821239-04:00,2020-06-03 22:32:17.834200-04:00,00:00:00.012961
3,Version 1.0,rand_forest,201701-201704,201701,495,441.0,50.0,4.0,0.0,431.0,60.0,4.0,0.0,250.0,241.0,2.0,2.0,303.0,188.0,3.0,1.0,0.0,0.0,0.00823,0.00529,0.0,0.0,0.5,0.25,0.0,0.0,0.5,0.25,0.44908,0.4389,0.50458,0.43355,0.0,0.0,0.01619,0.01036,1.01271,0.42006,0.33323,./data/simulated_prediction_data/predictions.csv,./data/simulated_prediction_data/referrals.csv,./results/results.csv,2020-06-03 22:32:17.835214-04:00,2020-06-03 22:32:17.850158-04:00,00:00:00.014944
4,Version 1.0,rand_forest,201701-201707,201701,495,441.0,50.0,4.0,0.0,431.0,60.0,4.0,0.0,250.0,241.0,2.0,2.0,303.0,188.0,3.0,1.0,0.0,0.0,0.00823,0.00529,0.0,0.0,0.5,0.25,0.0,0.0,0.5,0.25,0.44908,0.4389,0.50458,0.43355,0.0,0.0,0.01619,0.01036,1.01271,0.42006,0.33323,./data/simulated_prediction_data/predictions.csv,./data/simulated_prediction_data/referrals.csv,./results/results.csv,2020-06-03 22:32:17.850158-04:00,2020-06-03 22:32:17.865117-04:00,00:00:00.014959
5,Version 1.0,rand_forest,201701-201801,201701,495,438.0,50.0,7.0,0.0,428.0,60.0,7.0,0.0,249.0,239.0,3.0,4.0,301.0,187.0,5.0,2.0,0.0,0.0,0.01646,0.01058,0.0,0.0,0.57143,0.28571,0.0,0.0,0.57143,0.28571,0.44877,0.43852,0.54084,0.45126,0.0,0.0,0.032,0.02041,1.01261,0.46107,0.33318,./data/simulated_prediction_data/predictions.csv,./data/simulated_prediction_data/referrals.csv,./results/results.csv,2020-06-03 22:32:17.866114-04:00,2020-06-03 22:32:17.890049-04:00,00:00:00.023935
6,Version 1.0,xg_boost,201701-201704,201701,495,441.0,50.0,4.0,0.0,431.0,60.0,4.0,0.0,243.0,248.0,2.0,2.0,293.0,198.0,2.0,2.0,0.0,0.0,0.008,0.01,0.0,0.0,0.5,0.5,0.0,0.0,0.5,0.5,0.44908,0.4389,0.49745,0.54837,0.0,0.0,0.01575,0.01961,1.03231,0.52138,0.33703,./data/simulated_prediction_data/predictions.csv,./data/simulated_prediction_data/referrals.csv,./results/results.csv,2020-06-03 22:32:17.891046-04:00,2020-06-03 22:32:17.908997-04:00,00:00:00.017951
7,Version 1.0,xg_boost,201701-201707,201701,495,441.0,50.0,4.0,0.0,431.0,60.0,4.0,0.0,243.0,248.0,2.0,2.0,293.0,198.0,2.0,2.0,0.0,0.0,0.008,0.01,0.0,0.0,0.5,0.5,0.0,0.0,0.5,0.5,0.44908,0.4389,0.49745,0.54837,0.0,0.0,0.01575,0.01961,1.03231,0.52138,0.33703,./data/simulated_prediction_data/predictions.csv,./data/simulated_prediction_data/referrals.csv,./results/results.csv,2020-06-03 22:32:17.909995-04:00,2020-06-03 22:32:17.927949-04:00,00:00:00.017954
8,Version 1.0,xg_boost,201701-201801,201701,495,438.0,50.0,7.0,0.0,428.0,60.0,7.0,0.0,242.0,246.0,3.0,4.0,292.0,196.0,3.0,4.0,0.0,0.0,0.016,0.02,0.0,0.0,0.57143,0.57143,0.0,0.0,0.57143,0.57143,0.44877,0.43852,0.53367,0.58489,0.0,0.0,0.03113,0.03865,1.0288,0.56382,0.33549,./data/simulated_prediction_data/predictions.csv,./data/simulated_prediction_data/referrals.csv,./results/results.csv,2020-06-03 22:32:17.928946-04:00,2020-06-03 22:32:17.953879-04:00,00:00:00.024933
9,Version 1.0,sgmm,201701-201704,201701,495,441.0,50.0,4.0,0.0,431.0,60.0,4.0,0.0,259.0,232.0,1.0,3.0,299.0,192.0,3.0,1.0,0.0,0.0,0.01277,0.00518,0.0,0.0,0.75,0.25,0.0,0.0,0.75,0.25,0.44908,0.4389,0.63875,0.42948,0.0,0.0,0.0251,0.01015,0.95412,0.5835,0.31913,./data/simulated_prediction_data/predictions.csv,./data/simulated_prediction_data/referrals.csv,./results/results.csv,2020-06-03 22:32:17.954876-04:00,2020-06-03 22:32:17.973825-04:00,00:00:00.018949


Columns from dataframe (referral/prediction): ['PERSON_ID', 'MYR']
Columns from config(c_r/c_p): ['PERSON_ID', 'MYR']
Columns from dataframe (referral/prediction): ['PERSON_ID', 'MYR', 'lin_reg', 'rand_forest', 'xg_boost', 'sgmm']
Columns from config(c_r/c_p): ['PERSON_ID', 'MYR']
Number of Unique IDs in Referral: 25
Number of Unique IDs in Prediction: 500
Number of Intersected IDs: 15


Unnamed: 0,Experiment Name,Model,Window,Eval Date,Num Samples,TN_@k=50,FP_@k=50,FN_@k=50,TP_@k=50,TN_@k=60,FP_@k=60,FN_@k=60,TP_@k=60,TN_@p>=0.5,FP_@p>=0.5,FN_@p>=0.5,TP_@p>=0.5,TN_@p>=0.6,FP_@p>=0.6,FN_@p>=0.6,TP_@p>=0.6,precision_@k=50,precision_@k=60,precision_@p>=0.5,precision_@p>=0.6,recall_@k=50,recall_@k=60,recall_@p>=0.5,recall_@p>=0.6,accuracy_@k=50,accuracy_@k=60,accuracy_@p>=0.5,accuracy_@p>=0.6,balanced_acc_@k=50,balanced_acc_@k=60,balanced_acc_@p>=0.5,balanced_acc_@p>=0.6,f1_score_@k=50,f1_score_@k=60,f1_score_@p>=0.5,f1_score_@p>=0.6,log_loss,roc_auc_score,brier_score_loss,Prediction Source,Referral Source,Result Output,Start Time,End Time,Total Time
0,Version 1.0,lin_reg,201701-201704,201701,495,441.0,50.0,4.0,0.0,432.0,59.0,3.0,1.0,235.0,256.0,3.0,1.0,298.0,193.0,3.0,1.0,0.0,0.01667,0.00389,0.00515,0.0,0.25,0.25,0.25,0.0,0.25,0.25,0.25,0.44908,0.56492,0.36431,0.42846,0.0,0.03125,0.00766,0.0101,1.0133,0.31237,0.33334,./data/simulated_prediction_data/predictions2.csv,./data/simulated_prediction_data/referrals.csv,./results/results.csv,2020-06-03 22:32:18.772324-04:00,2020-06-03 22:32:18.827202-04:00,00:00:00.054878
1,Version 1.0,lin_reg,201701-201707,201701,495,441.0,50.0,4.0,0.0,432.0,59.0,3.0,1.0,235.0,256.0,3.0,1.0,298.0,193.0,3.0,1.0,0.0,0.01667,0.00389,0.00515,0.0,0.25,0.25,0.25,0.0,0.25,0.25,0.25,0.44908,0.56492,0.36431,0.42846,0.0,0.03125,0.00766,0.0101,1.0133,0.31237,0.33334,./data/simulated_prediction_data/predictions2.csv,./data/simulated_prediction_data/referrals.csv,./results/results.csv,2020-06-03 22:32:18.830167-04:00,2020-06-03 22:32:18.844149-04:00,00:00:00.013982
2,Version 1.0,lin_reg,201701-201801,201701,495,438.0,50.0,7.0,0.0,429.0,59.0,6.0,1.0,233.0,255.0,5.0,2.0,296.0,192.0,5.0,2.0,0.0,0.01667,0.00778,0.01031,0.0,0.14286,0.28571,0.28571,0.0,0.14286,0.28571,0.28571,0.44877,0.51098,0.38159,0.44614,0.0,0.02985,0.01515,0.0199,1.01966,0.32216,0.33557,./data/simulated_prediction_data/predictions2.csv,./data/simulated_prediction_data/referrals.csv,./results/results.csv,2020-06-03 22:32:18.845136-04:00,2020-06-03 22:32:18.860087-04:00,00:00:00.014951
3,Version 1.0,rand_forest,201701-201704,201701,495,441.0,50.0,4.0,0.0,431.0,60.0,4.0,0.0,250.0,241.0,2.0,2.0,303.0,188.0,3.0,1.0,0.0,0.0,0.00823,0.00529,0.0,0.0,0.5,0.25,0.0,0.0,0.5,0.25,0.44908,0.4389,0.50458,0.43355,0.0,0.0,0.01619,0.01036,1.01271,0.42006,0.33323,./data/simulated_prediction_data/predictions2.csv,./data/simulated_prediction_data/referrals.csv,./results/results.csv,2020-06-03 22:32:18.861085-04:00,2020-06-03 22:32:18.874073-04:00,00:00:00.012988
4,Version 1.0,rand_forest,201701-201707,201701,495,441.0,50.0,4.0,0.0,431.0,60.0,4.0,0.0,250.0,241.0,2.0,2.0,303.0,188.0,3.0,1.0,0.0,0.0,0.00823,0.00529,0.0,0.0,0.5,0.25,0.0,0.0,0.5,0.25,0.44908,0.4389,0.50458,0.43355,0.0,0.0,0.01619,0.01036,1.01271,0.42006,0.33323,./data/simulated_prediction_data/predictions2.csv,./data/simulated_prediction_data/referrals.csv,./results/results.csv,2020-06-03 22:32:18.875058-04:00,2020-06-03 22:32:18.892011-04:00,00:00:00.016953
5,Version 1.0,rand_forest,201701-201801,201701,495,438.0,50.0,7.0,0.0,428.0,60.0,7.0,0.0,249.0,239.0,3.0,4.0,301.0,187.0,5.0,2.0,0.0,0.0,0.01646,0.01058,0.0,0.0,0.57143,0.28571,0.0,0.0,0.57143,0.28571,0.44877,0.43852,0.54084,0.45126,0.0,0.0,0.032,0.02041,1.01261,0.46107,0.33318,./data/simulated_prediction_data/predictions2.csv,./data/simulated_prediction_data/referrals.csv,./results/results.csv,2020-06-03 22:32:18.893024-04:00,2020-06-03 22:32:18.912946-04:00,00:00:00.019922
6,Version 1.0,xg_boost,201701-201704,201701,495,441.0,50.0,4.0,0.0,431.0,60.0,4.0,0.0,243.0,248.0,2.0,2.0,293.0,198.0,2.0,2.0,0.0,0.0,0.008,0.01,0.0,0.0,0.5,0.5,0.0,0.0,0.5,0.5,0.44908,0.4389,0.49745,0.54837,0.0,0.0,0.01575,0.01961,1.03231,0.52138,0.33703,./data/simulated_prediction_data/predictions2.csv,./data/simulated_prediction_data/referrals.csv,./results/results.csv,2020-06-03 22:32:18.913949-04:00,2020-06-03 22:32:18.929900-04:00,00:00:00.015951
7,Version 1.0,xg_boost,201701-201707,201701,495,441.0,50.0,4.0,0.0,431.0,60.0,4.0,0.0,243.0,248.0,2.0,2.0,293.0,198.0,2.0,2.0,0.0,0.0,0.008,0.01,0.0,0.0,0.5,0.5,0.0,0.0,0.5,0.5,0.44908,0.4389,0.49745,0.54837,0.0,0.0,0.01575,0.01961,1.03231,0.52138,0.33703,./data/simulated_prediction_data/predictions2.csv,./data/simulated_prediction_data/referrals.csv,./results/results.csv,2020-06-03 22:32:18.930898-04:00,2020-06-03 22:32:18.946855-04:00,00:00:00.015957
8,Version 1.0,xg_boost,201701-201801,201701,495,438.0,50.0,7.0,0.0,428.0,60.0,7.0,0.0,242.0,246.0,3.0,4.0,292.0,196.0,3.0,4.0,0.0,0.0,0.016,0.02,0.0,0.0,0.57143,0.57143,0.0,0.0,0.57143,0.57143,0.44877,0.43852,0.53367,0.58489,0.0,0.0,0.03113,0.03865,1.0288,0.56382,0.33549,./data/simulated_prediction_data/predictions2.csv,./data/simulated_prediction_data/referrals.csv,./results/results.csv,2020-06-03 22:32:18.947859-04:00,2020-06-03 22:32:18.963812-04:00,00:00:00.015953
9,Version 1.0,sgmm,201701-201704,201701,495,441.0,50.0,4.0,0.0,431.0,60.0,4.0,0.0,259.0,232.0,1.0,3.0,299.0,192.0,3.0,1.0,0.0,0.0,0.01277,0.00518,0.0,0.0,0.75,0.25,0.0,0.0,0.75,0.25,0.44908,0.4389,0.63875,0.42948,0.0,0.0,0.0251,0.01015,0.95412,0.5835,0.31913,./data/simulated_prediction_data/predictions2.csv,./data/simulated_prediction_data/referrals.csv,./results/results.csv,2020-06-03 22:32:18.964814-04:00,2020-06-03 22:32:18.979775-04:00,00:00:00.014961
