In [25]:
import numpy as np
import pandas as pd
import os
import itertools
from scipy.stats import ttest_ind

Define some functions

In [26]:
def ttest_onetail(a, b, axis=0, equal_var=False, alternative='greater'):
    """One-tailed two-sample t-test built on top of the two-sided `ttest_ind`.

    Parameters
    ----------
    a, b : array-like
        The two samples to compare.
    axis : int
        Forwarded unchanged to `ttest_ind`.
    equal_var : bool
        Forwarded unchanged to `ttest_ind`.
    alternative : {'greater', '>', 'smaller', '<'}
        Tail to test: 'greater' / '>' treats a positive t statistic
        (mean of `a` above mean of `b`) as evidence, 'smaller' / '<' a
        negative one.

    Returns
    -------
    float
        The one-tailed p-value, or 1.0 when the t statistic is NaN.

    Raises
    ------
    ValueError
        If `alternative` is not one of the supported spellings.

    Notes
    -----
    The NaN check and the sign test below need a scalar t statistic, so the
    inputs must reduce to a single test (e.g. two 1-D samples).
    """
    if alternative in ['greater', '>']:
        expect_positive_t = True
    elif alternative in ['smaller', '<']:
        expect_positive_t = False
    else:
        # Fail loudly: previously this case printed a message and then
        # crashed on an unassigned local variable.
        raise ValueError('Unknown parameter: alternative="%s"' % alternative)

    # `ttest_ind` is imported by name at the top of the notebook; the bare
    # `scipy` package is not imported, so `scipy.stats.ttest_ind` would fail.
    t_raw, p_raw = ttest_ind(a, b, axis=axis, equal_var=equal_var)

    if np.isnan(t_raw):
        return 1.0

    in_tested_tail = (t_raw > 0) if expect_positive_t else (t_raw < 0)
    # Half of the two-sided p-value lies in each tail; when the statistic
    # falls in the opposite tail the one-sided p-value is the complement.
    if in_tested_tail:
        return 0.5 * p_raw
    return 1 - 0.5 * p_raw

In [6]:
# Shared constants: metric names, classifier names, resampling strategies
# and the grid of resampling multipliers (1.25 to 10.0 in steps of 0.25).
available_metrics = ['prc_auc', 'roc_auc']
available_classifiers = [
    'logreg_l1',
    'logreg_l2',
    'knn',
    'dtree',
]
available_resamplers = ['bootstrap', 'rus'] + ['smote%d' % k for k in (1, 3, 5, 7)]
available_multipliers = [1.25 + 0.25 * step for step in range(36)]

In [10]:
# Load metafeatures and key the frame by dataset name: the 'dataset' column
# becomes the index and is removed from the regular columns.
df_metafs = pd.read_csv('./1_metafeatures/metafeatures.csv', index_col=0).set_index('dataset')

Load the raw test results, then construct data frames with the per-metric scores and their aggregations. The frames are saved to `*.csv` files for further use in `recsyst-train-test.ipynb`.

In [17]:
# Read every raw result file (exp*.csv) and stack them into df_scores_raw.
results_dir = './1_test_results'
decimals = 7  # number of decimals the numeric columns are rounded to
non_numeric_cols = ['dataset', 'resample_strategy']

frames = []
# os.listdir returns names in arbitrary order; sorting keeps the row order of
# df_scores_raw reproducible between runs and machines.
for fname in sorted(os.listdir(results_dir)):
    if not (fname.startswith('exp') and fname.endswith('.csv')):
        continue
    print(fname)
    df = pd.read_csv(os.path.join(results_dir, fname), index_col=0)
    numeric_cols = [col for col in df.columns if col not in non_numeric_cols]
    # Round values
    df[numeric_cols] = df[numeric_cols].round(decimals)
    # Strip the directory prefix and the file extension from dataset names
    df['dataset'] = df['dataset'].map(lambda x: x.replace("data_csv/", "").replace(".csv", ""))
    frames.append(df)

if not frames:
    raise IOError('No exp*.csv result files found in %s' % results_dir)

# Concatenate once (appending inside the loop re-copies all previous rows on
# every file) and renumber the rows 0..n-1.
df_scores_raw = pd.concat(frames, axis=0, ignore_index=True)

exp_real_new_results_data2_0.csv
exp_real_new_results_data2_1.csv
exp_real_new_results_data2_14.csv
exp_real_new_results_data2_16_1.csv
exp_real_new_results_data2_19_0.csv
exp_real_new_results_data2_19_1.csv
exp_real_new_results_data2_19_2.csv
exp_real_new_results_data2_2.csv
exp_real_new_results_data2_26-data2_27-data2_28-data2_29-data2_30.csv
exp_real_new_results_data2_3.csv
exp_real_new_results_data2_31-data2_32-data2_33-data2_34-data2_35.csv
exp_real_new_results_data2_36.csv
exp_real_new_results_data2_37-data2_39-data2_40-data2_41-data2_42-data2_43-data2_44-data2_45-data2_46-data2_48-data2_49-data2_51.csv
exp_real_new_results_data2_4-6.csv
exp_real_new_results_data2_52.csv
exp_real_new_results_data2_54.csv
exp_real_new_results_data2_55.csv
exp_real_new_results_data2_56.csv
exp_real_new_results_data2_59.csv
exp_real_new_results_data2_63.csv
exp_real_new_results_data2_64.csv
exp_real_new_results_data2_66.csv
exp_real_new_results_data2_68-data2_69-data2_70-data2_71.csv
exp_real_new_re

In [28]:
print(df_scores_raw.shape)

(4393816, 13)


In [29]:
print(df_scores_raw.head(4))

   dataset  logreg_l2_prc_auc  logreg_l2_roc_auc  knn_prc_auc  knn_roc_auc  \
0  data2_0           0.820567           0.878103     0.801836     0.875671   
1  data2_0           0.840532           0.894214     0.814318     0.875418   
2  data2_0           0.860166           0.890566     0.843584     0.871264   
3  data2_0           0.885521           0.930388     0.854214     0.892492   

   logreg_l1_prc_auc  logreg_l1_roc_auc  dtree_prc_auc  dtree_roc_auc  \
0           0.821153           0.878103       0.710047       0.737461   
1           0.840532           0.894214       0.742842       0.759196   
2           0.860190           0.890566       0.721557       0.748100   
3           0.883737           0.927754       0.789055       0.815230   

  resample_strategy  resample_multiplier  min_maj_ratio_final  q_iteration  
0           nothing                  1.0             0.507792          1.0  
1           nothing                  1.0             0.507792          2.0  
2           

In [30]:
# Construct data frames for metrics and their aggregations. Save them to *.csv files
# Slow: one t-test is run per (dataset, resampler, multiplier) group and metric.
#
# For every classifier/metric pair three frames indexed like df_metafs are written:
#   df_scores_nores_<metric>.csv - mean/std of the metric without resampling
#   df_scores_res_<metric>.csv   - per (resampler, multiplier): mean, std, pval, pval_winmax
#   df_scores_agg_<metric>.csv   - per (resampler, aggregation): stats at the selected multiplier
winsize = 3  # width, in multipliers, of the rolling window behind 'pval_winmax'
scores_dir = '2_df_scores'

dict_scores_nores = {}
dict_scores_res = {}
dict_scores_agg = {}

# pval - p-value for t-test which compares current resample strategy and default (no resampling)
stats_nothing_names = ['mean', 'std']
stats_resampler_mult_names = ['mean', 'std', 'pval', 'pval_winmax']
stats_resampler_aggmult_names = ['mean', 'std', 'pval', 'mult']
aggregations_names = ['max_mean', 'min_pval', 'min_pval_winmax']

# The column layouts do not depend on the metric, so they are built once.
columns_res = pd.MultiIndex.from_product(
    [available_resamplers, available_multipliers, stats_resampler_mult_names])
columns_agg = pd.MultiIndex.from_product(
    [available_resamplers, aggregations_names, stats_resampler_aggmult_names])

if not os.path.isdir(scores_dir):
    os.makedirs(scores_dir)

for clf, met in itertools.product(available_classifiers, available_metrics):
    metric = clf + '_' + met
    print(metric)

    df_scores_nores = pd.DataFrame(index=df_metafs.index, columns=stats_nothing_names)
    df_scores_res = pd.DataFrame(index=df_metafs.index, columns=columns_res)
    df_scores_agg = pd.DataFrame(index=df_metafs.index, columns=columns_agg)

    # Baseline: scores obtained without any resampling.
    df_nothing = df_scores_raw.loc[df_scores_raw['resample_strategy'] == 'nothing', ['dataset', metric]]
    grouped = df_nothing.groupby('dataset')[metric]
    df_scores_nores['mean'] = grouped.mean()
    df_scores_nores['std'] = grouped.std()
    # Baseline scores per dataset, looked up by every t-test below instead of
    # re-filtering df_nothing for each group.
    baseline_by_dataset = dict((name, scores) for name, scores in grouped)
    no_baseline = df_nothing[metric].iloc[:0]  # used when a dataset has no baseline rows

    for res in available_resamplers:
        df_r = df_scores_raw.loc[df_scores_raw['resample_strategy'] == res,
                                 ['dataset', 'resample_multiplier', metric]]
        grouped = df_r.groupby(['dataset', 'resample_multiplier'])[metric]
        if grouped.ngroups == 0:
            # No results for this resampler: its columns stay NaN.
            continue

        # dataset x multiplier tables. Reindexing the columns makes each table
        # line up label-by-label with available_multipliers, with NaN for
        # multipliers that are missing from the raw results.
        df_mean = grouped.mean().unstack('resample_multiplier').reindex(columns=available_multipliers)
        df_std = grouped.std().unstack('resample_multiplier').reindex(columns=available_multipliers)

        keys = []
        pvals = []
        for key, scores in grouped:
            keys.append(key)
            pvals.append(ttest_onetail(scores, baseline_by_dataset.get(key[0], no_baseline)))
        pval_index = pd.MultiIndex.from_tuples(keys, names=['dataset', 'resample_multiplier'])
        df_pval = pd.Series(pvals, index=pval_index).unstack('resample_multiplier').reindex(
            index=df_mean.index, columns=available_multipliers)

        # Rolling maximum of the p-value over `winsize` consecutive multipliers.
        # The first winsize-1 columns have no full window and are back-filled.
        # Rolling over the transposed frame avoids the deprecated `axis=1` argument.
        df_pval_winmax = df_pval.T.rolling(window=winsize, center=False).max().T.bfill(axis=1)

        per_mult_stats = [('mean', df_mean), ('std', df_std),
                          ('pval', df_pval), ('pval_winmax', df_pval_winmax)]
        for stat, df_stat in per_mult_stats:
            df_scores_res[[(res, mult, stat) for mult in available_multipliers]] = df_stat

        row_idx = np.arange(df_mean.shape[0])
        for agg in aggregations_names:
            # rel_idx_best: per dataset, the column position of the selected multiplier.
            # NaN cells are replaced by -inf/+inf so that they are never selected.
            if agg == 'max_mean':
                rel_idx_best = np.argmax(df_mean.fillna(-np.inf).values, axis=1)
            elif agg == 'min_pval':
                rel_idx_best = np.argmin(df_pval.fillna(np.inf).values, axis=1)
            elif agg == 'min_pval_winmax':
                rel_idx_best = np.argmin(df_pval_winmax.fillna(np.inf).values, axis=1)
            else:
                raise ValueError('Aggregation %s is not supported' % agg)

            # Wrap the picked values in Series indexed by dataset so that the
            # assignment aligns on dataset name, not on row position.
            df_scores_agg[(res, agg, 'mean')] = pd.Series(
                df_mean.values[row_idx, rel_idx_best], index=df_mean.index)
            df_scores_agg[(res, agg, 'std')] = pd.Series(
                df_std.values[row_idx, rel_idx_best], index=df_mean.index)
            df_scores_agg[(res, agg, 'pval')] = pd.Series(
                df_pval.values[row_idx, rel_idx_best], index=df_mean.index)
            df_scores_agg[(res, agg, 'mult')] = pd.Series(
                [available_multipliers[i] for i in rel_idx_best], index=df_mean.index)

    df_scores_nores.to_csv(scores_dir + '/df_scores_nores_' + metric + '.csv')
    df_scores_res.to_csv(scores_dir + '/df_scores_res_' + metric + '.csv')
    df_scores_agg.to_csv(scores_dir + '/df_scores_agg_' + metric + '.csv')

logreg_l1_prc_auc
logreg_l1_roc_auc
logreg_l2_prc_auc
logreg_l2_roc_auc
knn_prc_auc
knn_roc_auc
dtree_prc_auc
dtree_roc_auc
