# This is a notebook that can evaluate multiple files. It implements a number of features:

- TopK
- Removing recent referrals
- Thresholding 
- Tables and visualization
- Config files for standardized processes. 



In [18]:
%reload_ext autoreload
%autoreload 2
#ignore warnings.
def warn(*args, **kwargs):
    """No-op replacement for ``warnings.warn``: accept any arguments, do nothing."""
    return None
import warnings
warnings.warn = warn

#imports
import pandas as pd
import os, sys
import importlib
import yaml, json
from pathlib import Path

#path append
sys.path.append(os.path.join(Path.cwd(), 'modules'))
import Evaluate, Helper, Present, Score, Synthetic


<H3> Imports</H3>

### File configuration
<p> Just edit the configuration file paths below. <code>prediction_config_files</code> lists one config file per prediction file. <code>config_file</code> contains the remaining settings (referral, visualization, experiment, etc.). </p>

In [19]:
# Paths to the YAML configuration files for this run.
# Several prediction configs may be listed; each one is handed to the loader below.
config_file = 'config/config.yaml'
prediction_config_files = [
    'config/predictions/pred1.yaml',
    'config/predictions/pred2.yaml',
]
# Switch to True to generate synthetic data before evaluating.
generate_data = False


### Load Configuration and Referrals

In [20]:
# NOTE(review): an earlier note in this cell stated that referral data is loaded
# inside the per-prediction loop, because the referral frame gets modified for
# date-format adjustment and copying it was ruled out for memory reasons.
# The evaluation cell of this notebook reads the referrals once, before that
# loop -- confirm which behaviour is intended.

# Load the main configuration plus one config per prediction file.
(
    c_r,
    c_e,
    c_gen,
    c_aws,
    c_visual,
    predictions,
) = Helper.load_configuration(config_file, prediction_config_files)

# Optionally generate synthetic event and prediction data first.
if generate_data:
    Synthetic.generate_synthetic_event_data(c_gen)
    Synthetic.generate_synthetic_prediction_data(c_gen)

### Precheck Prediction and Referral Files

In [21]:
import csv

# Pre-flight check: the columns listed under c_r['columns'] and c_p['columns']
# must be present in the referral and prediction files.
# Only the header row of each file is read, so this is the only check possible
# without loading the whole data into memory.

def _read_csv_header(path):
    """Return the first row of the CSV file at ``path`` as a list of column names.

    Only the header line is consumed; the rest of the file is never read.

    Raises:
        ValueError: if the file contains no rows at all.
    """
    # The csv module asks for newline='' so that embedded line breaks in
    # quoted fields are parsed correctly on every platform.
    with open(path, "r", newline="") as handle:
        header = next(csv.reader(handle), None)
    if header is None:
        # Fail with a readable message instead of a bare StopIteration.
        raise ValueError(f"{path} is empty; expected a CSV header row")
    return header


referral_file = c_r['dir'] + c_r['file']
assert Helper.column_exists(_read_csv_header(referral_file), c_r['columns']), "Column mismatch for referral"
print("Referral Columns match")

for c_p in predictions:
    prediction_file = c_p['dir'] + c_p['file']
    assert Helper.column_exists(_read_csv_header(prediction_file), c_p['columns']), "Column mismatch for prediction"
print("Prediction Columns match")

Columns from dataframe (referral/prediction): ['PERSON_ID', 'MYR']
Columns from config(c_r/c_p): ['PERSON_ID', 'MYR']
Referral Columns match
Columns from dataframe (referral/prediction): ['PERSON_ID', 'MYR', 'lin_reg', 'rand_forest', 'xg_boost', 'sgmm']
Columns from config(c_r/c_p): ['PERSON_ID', 'MYR']
Columns from dataframe (referral/prediction): ['PERSON_ID', 'MYR', 'lin_reg', 'rand_forest', 'xg_boost', 'sgmm']
Columns from config(c_r/c_p): ['PERSON_ID', 'MYR']
Prediction Columns match


### Empty results from previous run

In [22]:
# Start from a clean slate: remove the results file left by a previous run, if any.
result_file = c_e['dir'] + c_e['file']

if not os.path.exists(result_file):
    print("The result directory is currently empty. Generating result file.")
else:
    os.remove(result_file)

### Evaluate Models
For each prediction configuration file provided, the models are evaluated. The results for all prediction files in the current run are saved in the results CSV file.

In [23]:
def _read_configured_file(cfg):
    """Read the data file described by the config mapping ``cfg`` via ``Helper.read_file``."""
    return Helper.read_file(
        directory=cfg['dir'],
        file=cfg['file'],
        file_format=cfg['file_format'],
        aws=cfg['aws'],
        bucket=cfg['bucket'],
    )


# Referrals are read once, up front, and reused for every prediction file.
referrals = _read_configured_file(c_r)

# Repeat the evaluation for each configured prediction file.
for c_p in predictions:
    prediction = _read_configured_file(c_p)

    # Evaluation dates to run for this prediction file.
    date_list = Helper.eval_date_extract(c_p, prediction)

    # Evaluate every date, then pass the results on for processing
    # (per the original note, they are processed and saved in CSV).
    for eval_date in date_list:
        # The active date is stored on the shared evaluation config passed below.
        c_e['eval_date'] = eval_date
        all_model_evaluations = Evaluate.evaluate(c_p, c_e, c_r, referrals, prediction)
        Present.process_evaluation_data(c_p, c_r, c_e, c_visual, eval_date, all_model_evaluations)

### Visualization 

In [29]:
# Present the evaluation results: tables and comparison plots, which are also saved.
# NOTE(review): presumably this reads the results written by the evaluation
# step (only c_e and c_visual are passed in) -- confirm in Present.
Present.present_evaluation(c_e, c_visual)

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy_@k=50,accuracy_@p>=0.5,precision_@k=50,precision_@p>=0.5
Model,Prediction Source,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
lin_reg,predictions.csv,0.892968,0.495665,0.009167,0.008205
lin_reg,predictions2.csv,0.892968,0.495665,0.009167,0.008205
rand_forest,predictions.csv,0.891957,0.503481,0.004167,0.006002
rand_forest,predictions2.csv,0.891957,0.503481,0.004167,0.006002
sgmm,predictions.csv,0.892628,0.504056,0.0075,0.00837
sgmm,predictions2.csv,0.892628,0.504056,0.0075,0.00837
xg_boost,predictions.csv,0.892799,0.497701,0.008333,0.009045
xg_boost,predictions2.csv,0.892799,0.497701,0.008333,0.009045
