# Create predictions and analyze the results

- Author: Marios Kokkodis 
- email: marios.kokkodis@gmail.com 

> Python notes: tested on PY38


#### Steps to evaluate

To evaluate each algorithm in detail, we need to:

1. Find the best hyperparameters $\times$ features for each fold based on validation performance (see [train.py](../python/train.py) for implementation details)
2. Based on the best validation performance, use the appropriate trained model to get predictions for the test set.
3. Create the necessary output file for the evaluation plots.

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
import sys
from os import path
# Put the project's ../python/ directory on the import path so the helper module below resolves.
if '../python/' not in sys.path: sys.path.insert(1, '../python/')
import custom_util_functions
from importlib import reload
# Re-import the helper module so re-running this cell picks up edits made to it.
reload(custom_util_functions)
from custom_util_functions import print_border_line,get_ranking_performance,get_within_opening_perf
from custom_util_functions import write_best_validation_models
# Column names passed as `names=` when reading the per-algorithm validation result CSVs.
header = ['model_key', 'fold' , 'algorithm', 'validation_average_auc', 'validation_auc_positive']


### Step 1: Choose best specification based on validation performance for each algorithm and fold

In [2]:
# Algorithms whose validation results are scanned for the best specification.
focalModels = ['logit','svm','xg','rf','lstm','hmm','sahoo']
# The context manager guarantees the selection file is flushed and closed even if
# reading a results file or selecting a model raises part-way through the loop.
with open("../../data/evaluation_results/model_selection.csv","w") as fout:
    fout.write('fold,algorithm,model_key\n')
    for algorithm in focalModels:
        # Column names are supplied explicitly through `header`.
        d = pd.read_csv("../../data/results/"+algorithm+".csv", names = header)
        if algorithm == 'sahoo':
            print("Algorithm:",algorithm, "\n  >>total specifications tested on validation set:", len(d))
        else:
            print("Algorithm:",algorithm, "\n  >>total specifications tested on validation set range between", len(d),
                  "and",len(d) * 3)
        # Appends this algorithm's selected model(s) to the open selection file.
        write_best_validation_models(d,algorithm,fout)




Algorithm: logit 
  >>total specifications tested on validation set range between 10 and 30
Algorithm: svm 
  >>total specifications tested on validation set range between 10 and 30
Algorithm: xg 
  >>total specifications tested on validation set range between 180 and 540
Algorithm: rf 
  >>total specifications tested on validation set range between 90 and 270
Algorithm: lstm 
  >>total specifications tested on validation set range between 180 and 540
Algorithm: hmm 
  >>total specifications tested on validation set range between 20 and 60
Algorithm: sahoo 
  >>total specifications tested on validation set: 20


### Step 2: Use the best specifications to make predictions on the test set

In [3]:
d = pd.read_csv("../../data/evaluation_results/model_selection.csv")
l = []
for _,row in d.iterrows():

    a = row['algorithm']
    if 'hmm' == a:
        fold,states = row['model_key'].split("::")
        !python ../python/train.py -a $a -f $fold -s $states    -P -o
    elif a in ['logit','svm']:
        fold = row['model_key']
        !python ../python/train.py -a $a -f $fold -P -o
    elif a == 'rf':
        fold, max_depth,n_estimators = row['model_key'].split("::")
        !python ../python/train.py -a $a -f $fold -P -o -d $max_depth -e $n_estimators
    elif a == 'lstm':
        fold,batch_size,epochs,stacked  = row['model_key'].split("_")
        if stacked == 'True':
            !python ../python/../python/train_lstm.py -a $a -f $fold -P -o  -e $epochs -b $batch_size -C
        else:
            !python ../python/../python/train_lstm.py -a $a -f $fold -P -o  -e $epochs -b $batch_size
    elif a == 'xg':
        fold, n_estimators, subsample,  max_depth = row['model_key'].split("::")
        !python ../python/train.py -a $a -f $fold -P -o  -d $max_depth -e $n_estimators -S $subsample
    elif a == 'sahoo':
        fold,states = row['model_key'].split("_")
        !python ../python/train_sahoo.py -f $fold  -s $states -P -o








Raw predictions for  logit  fold 0 posted in: ../../data/raw_predictions/
Raw predictions for  logit  fold 1 posted in: ../../data/raw_predictions/
Raw predictions for  logit  fold 2 posted in: ../../data/raw_predictions/
Raw predictions for  logit  fold 3 posted in: ../../data/raw_predictions/
Raw predictions for  logit  fold 4 posted in: ../../data/raw_predictions/
Raw predictions for  logit  fold 5 posted in: ../../data/raw_predictions/
Raw predictions for  logit  fold 6 posted in: ../../data/raw_predictions/
Raw predictions for  logit  fold 7 posted in: ../../data/raw_predictions/
Raw predictions for  logit  fold 8 posted in: ../../data/raw_predictions/
Raw predictions for  logit  fold 9 posted in: ../../data/raw_predictions/
Raw predictions for  svm  fold 0 posted in: ../../data/raw_predictions/
Raw predictions for  svm  fold 1 posted in: ../../data/raw_predictions/
Raw predictions for  svm  fold 2 posted in: ../../data/raw_predictions/
Raw predictions for  svm  fold 

### Step 3: Create the AUC-N file (to be used in [R2-plots.Rmd](../R/R2-plots.Rmd))

In [4]:
# Algorithms grouped by evaluation setting; one AUC-N output file is written per group.
models = {}
models['single_assessment'] = ['hmm','logit','rf','svm','xg','lstm']
models['many_assessment'] = ['cnn','svdbinary','svdbinaryexplicit','svdtrinary','svdtrinaryexplicit','sahoo','hmm']
# Column whose value must be at least n for a row to enter the AUC computed at n.
n_variable = 'employer_total_tasks_so_far'
for m in ['single_assessment','many_assessment']:
    # The context manager closes the output file even if scoring raises mid-loop.
    with open("../../data/evaluation_results/auc_"+m+".csv","w") as fout:
        fout.write('algorithm,n,score,fold\n')
        for algorithm in models[m]:
            print("parsing results for ",algorithm)
            for fold in range(10):
                predictions_path = "../../data/raw_predictions/"+algorithm+str(fold)+".csv"
                # Algorithm/fold combinations without a predictions file are skipped.
                if not path.exists(predictions_path): continue
                res = pd.read_csv(predictions_path)
                # Score with 'label_2' when the file has it, otherwise fall back to 'label_1'.
                score_col = 'label_2' if 'label_2' in res.columns else 'label_1'
                maxHiresInTest = res[n_variable].max()-3
                # Evaluate at most 20 thresholds, fewer when the data does not reach that far.
                thr = min(maxHiresInTest, 20)
                for n in range(thr):
                    curDf = res[(res[n_variable]>=n)]
                    # AUC needs both a positive and a negative example in the slice.
                    if len(np.unique(curDf.hire_positive_truth)) < 2: continue
                    curScore = roc_auc_score(curDf['hire_positive_truth'].to_numpy(),curDf[score_col].to_numpy())
                    fout.write(algorithm+","+str(n)+","+str(curScore)+","+str(fold)+'\n')

parsing results for  hmm
parsing results for  logit
parsing results for  rf
parsing results for  svm
parsing results for  xg
parsing results for  lstm
parsing results for  cnn
parsing results for  svdbinary
parsing results for  svdbinaryexplicit
parsing results for  svdtrinary
parsing results for  svdtrinaryexplicit
parsing results for  sahoo
parsing results for  hmm


### Step 4: Ranking performance and Lift

In [5]:
# Write one row per (algorithm, fold, percentile) for the single-assessment algorithms.
# The context manager closes the file even if reading or ranking a fold raises.
with open("../../data/evaluation_results/ranking_performance.csv","w") as fout:
    fout.write('algorithm,prc,score,fold\n')
    for algorithm in models['single_assessment']:
        print("parsing results for ",algorithm)
        for fold in range(10):
            curRes = pd.read_csv("../../data/raw_predictions/"+algorithm+str(fold)+".csv")
            # Files without a 'label_2' column reuse 'label_1' under that name.
            if 'label_2' not in curRes.columns: curRes['label_2'] = curRes['label_1']
            g = get_ranking_performance(curRes)
            # iterrows is kept so the text formatting of the written values is unchanged.
            for ind,row in g.iterrows():
                fout.write(algorithm+","+str(row['percentile'])+","+str(row['rate'])+","+str(fold)+'\n')


parsing results for  hmm
parsing results for  logit
parsing results for  rf
parsing results for  svm
parsing results for  xg
parsing results for  lstm


### Step 5: Within-task (opening) evaluation

In [6]:
#create opening -> total apps index using one algorithm (any):
opTotalApps = {}
for fold in range(10):
    focalDf = pd.read_csv("../../data/raw_predictions/logit"+str(fold)+".csv")
    grouped = focalDf[['task_id','application']].groupby(['task_id'],as_index=False).count()
    opTotalApps[fold] = { row['task_id']:row['application'] for _,row in grouped.iterrows()}
len(opTotalApps)


10

In [7]:
# Threshold passed to get_within_opening_perf (original note: top 25% of options
# within each opening). It does not change across algorithms, so it is set once.
top_n_thr = 17
# The context manager closes the file even if a fold fails to load or evaluate.
with open("../../data/evaluation_results/within_openings.csv","w") as fout:
    fout.write('algorithm,n,score,fold,metric,employer_total_tasks_so_far\n')
    for algorithm in models['single_assessment']:
        print("parsing results for ",algorithm)
        for fold in range(10):
            curRes = pd.read_csv("../../data/raw_predictions/"+algorithm+str(fold)+".csv")
            # Harmonise column names across prediction files: 'choicesetId' stands in for
            # a missing 'task_id', and 'label_1' for a missing 'label_2'.
            if 'task_id' not in curRes.columns: curRes['task_id'] = curRes['choicesetId']
            if 'label_2' not in curRes.columns: curRes['label_2'] = curRes['label_1']
            nRes = get_within_opening_perf(curRes,opTotalApps[fold],top_n_thr)
            # The result nests as metric -> employer_total_tasks_so_far -> n -> score,
            # matching the column order of the header written above.
            for curT,nResIn in nRes.items():
                for ed,vdic in nResIn.items():
                    for k,v in vdic.items():
                        fout.write(algorithm+","+str(k)+","+str(v)+","+str(fold)+","+curT+","+str(ed)+'\n')









parsing results for  hmm
parsing results for  logit
parsing results for  rf
parsing results for  svm
parsing results for  xg
parsing results for  lstm
