#### **This notebook compares classification algorithms (with oversampling) for the replier classifier, using the combined user features.**

In [1]:
import pandas as pd
import numpy as np


import importlib

#### packages
import helper.strategy_helper as st_hp
import helper.visualization as vz_hp

#### **Load files**

In [2]:
# Path (relative to the notebook's working directory) of the gzipped pickle
# holding the replier-classifier feature table.
all_feature = './../data/replier_classifier_features.pkl.gz'

# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
df_stat = pd.read_pickle(all_feature)

#### **Different Algorithms**

In [3]:
import helper.stat_helper as stat_hp
importlib.reload(stat_hp)

def train_test(df_stat, algo_list, filename=None):
    """Cross-validate every algorithm in ``algo_list`` and combine the scores.

    For each algorithm name this calls
    ``stat_hp.run_oversample_model_with_best_threshold`` on ``df_stat`` with
    ``'replier_label'`` as the target column and ``'list_age'`` excluded, then
    concatenates the per-algorithm results with ``pd.concat``.

    Parameters
    ----------
    df_stat
        Feature table, passed through unchanged to the helper.
    algo_list
        Iterable of model-type names understood by the helper.
    filename : str or path-like, optional
        Destination for the pickled combined results. When ``None`` (the
        default) nothing is written to disk.

    Returns
    -------
    The concatenated cross-validation results. (Earlier versions returned
    nothing, so existing callers that ignore the return value are unaffected.)

    Raises
    ------
    ValueError
        If ``algo_list`` yields no algorithms.
    """
    all_cv_scores = []
    for algo in algo_list:
        cv_results_tuned_model = stat_hp.run_oversample_model_with_best_threshold(
            df_stat,
            columns_not_include=['list_age'],
            model_type=algo,
            y_column='replier_label',
            # The helper is deliberately not given a filename: the combined
            # results are written once, below.
            filename=None,
        )
        all_cv_scores.append(cv_results_tuned_model)

    # Checked after the loop so that generators are handled as well as lists.
    if not all_cv_scores:
        raise ValueError('algo_list is empty: no algorithms to evaluate')

    # Assumes the helper returns pandas objects (required by pd.concat) -- TODO confirm.
    df_scores = pd.concat(all_cv_scores)

    # Only persist when a destination was supplied; previously the default
    # filename=None made to_pickle fail after all the training had finished.
    if filename is not None:
        df_scores.to_pickle(filename)

    return df_scores

In [4]:
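# One-off training run, kept disabled because it is slow: uncomment to
# regenerate the results file that the following cell reads from disk.

# filename ='./../results/replier_classifier_different_algorithm_oversample.pkl.gz'

# algo_list = ['logistic', 'random', 'ada', 'tree', 'naive']

# importlib.reload(stat_hp)

# train_test(df_stat, algo_list, filename=filename)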

In [5]:
# Cached cross-validation results for the algorithm comparison
# (the file that train_test writes when given this filename).
filename ='./../results/replier_classifier_different_algorithm_oversample.pkl.gz'

# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
df_result = pd.read_pickle(filename)

In [6]:
# List the available columns (timings, estimator, train/test metrics, threshold, algorithm).
df_result.columns

Index(['fit_time', 'score_time', 'estimator', 'test_precision',
       'train_precision', 'test_recall', 'train_recall', 'test_f1', 'train_f1',
       'test_roc_auc', 'train_roc_auc', 'threshold', 'algorithm'],
      dtype='object')

In [7]:
# Preview the first rows; each row carries its own fit time, metrics, threshold and algorithm.
df_result.head()

Unnamed: 0,fit_time,score_time,estimator,test_precision,train_precision,test_recall,train_recall,test_f1,train_f1,test_roc_auc,train_roc_auc,threshold,algorithm
0,126.046771,0.301372,TunedThresholdClassifierCV(estimator=Pipeline(...,0.276804,0.293181,0.442634,0.468057,0.340607,0.360532,0.936596,0.940206,0.959596,logistic
1,121.280656,0.40981,TunedThresholdClassifierCV(estimator=Pipeline(...,0.279137,0.2717,0.505867,0.493807,0.359759,0.350532,0.94003,0.938991,0.949495,logistic
2,125.552725,0.318391,TunedThresholdClassifierCV(estimator=Pipeline(...,0.269176,0.269056,0.494133,0.492992,0.348506,0.348121,0.943401,0.93868,0.949495,logistic
3,124.444878,0.317353,TunedThresholdClassifierCV(estimator=Pipeline(...,0.285539,0.281788,0.454368,0.464472,0.350692,0.350769,0.930268,0.941545,0.959596,logistic
4,84.748517,0.269426,TunedThresholdClassifierCV(estimator=Pipeline(...,0.261637,0.26076,0.531291,0.521349,0.350613,0.347642,0.942552,0.93858,0.939394,logistic


In [8]:
# Average every cross-validated test metric per algorithm; the result is
# indexed by algorithm with one column per metric.
columns = ['algorithm'] + [
    'test_' + metric for metric in ('precision', 'recall', 'f1', 'roc_auc')
]
df_mean = df_result.loc[:, columns].groupby(['algorithm']).mean()

In [10]:
# Show the per-algorithm mean test metrics in a fixed column order
# (algorithm is the index of df_mean).
df_mean[
    ['test_precision', 'test_recall', 'test_f1', 'test_roc_auc']
]

Unnamed: 0_level_0,test_precision,test_recall,test_f1,test_roc_auc
algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ada,0.473784,0.546415,0.505505,0.961757
logistic,0.274459,0.485658,0.350035,0.93857
naive,0.058188,0.503781,0.103773,0.865603
random,0.706654,0.719166,0.712803,0.963493
tree,0.554962,0.51369,0.533294,0.754838


#### **Oversampling: Standard Error**

In [11]:
import helper.stat_helper as stat_hp
importlib.reload(stat_hp)

<module 'helper.stat_helper' from '/N/slate/potem/project/infoOps-strategy/package/helper/stat_helper.py'>

In [12]:
# Report each test metric per algorithm through stat_hp.print_standard_error,
# which is given the metric's values and a display label and returns two values.
#
# Group by the plain column name rather than a one-element list: the group key
# is then the algorithm name itself on every pandas version. With a list key,
# pandas >= 2.0 yields 1-tuples while older versions yield the bare string, so
# the previous `grp[0]` printed only the first character on older versions.
df_grp = df_result.groupby('algorithm')

for grp, df_values in df_grp:
    print('Algorithm :', grp)

    # NOTE(review): the second returned value is stored as std_*, while the
    # helper's printed text calls it a standard error -- confirm which it is.
    mean_precision, std_prec = stat_hp.print_standard_error(
        df_values['test_precision'], 'precision'
    )
    mean_recall, std_recall = stat_hp.print_standard_error(
        df_values['test_recall'], 'recall'
    )
    mean_f1, std_f1 = stat_hp.print_standard_error(
        df_values['test_f1'], 'f1'
    )
    mean_auc, std_auc = stat_hp.print_standard_error(
        df_values['test_roc_auc'], 'AUC'
    )

    print('\n ******************** \n\n')

Algorithm : ada
Mean precision: 0.474 ± standard error 0.015161990242015333
Mean recall: 0.546 ± standard error 0.015042039163403928
Mean f1: 0.506 ± standard error 0.006219580412010177
Mean AUC: 0.962 ± standard error 0.0015037476462861445

 ******************** 


Algorithm : logistic
Mean precision: 0.274 ± standard error 0.0037010574807436122
Mean recall: 0.486 ± standard error 0.014687079547993677
Mean f1: 0.350 ± standard error 0.002731640986141713
Mean AUC: 0.939 ± standard error 0.0021367848408948074

 ******************** 


Algorithm : naive
Mean precision: 0.058 ± standard error 0.002706529979451899
Mean recall: 0.504 ± standard error 0.030188878615573057
Mean f1: 0.104 ± standard error 0.003718219167286422
Mean AUC: 0.866 ± standard error 0.0014663885880985296

 ******************** 


Algorithm : random
Mean precision: 0.707 ± standard error 0.004066695483501059
Mean recall: 0.719 ± standard error 0.0064412585711308016
Mean f1: 0.713 ± standard error 0.004629005104659367
M