In [1]:
%matplotlib inline
import logging
from utils import logging as lg
lg.set_logging(logging.ERROR)

from scipy import stats
from skimage.measure import block_reduce
import numpy as np

import logging

import pandas as pd
import pickle
from notebook_utils import plot 
import seaborn as sns
import glob
sns.set(color_codes=True, font_scale=2, style="whitegrid", palette="bright")

from model import base, provider, heatmap_evaluation
import config
import matplotlib.pyplot as plt
import tensorflow as tf
import yaml
tf.logging.set_verbosity(tf.logging.ERROR)

In [2]:
def hypothesis_testing(base, model_paths, methods=['sensitivity', 'guided_backprop', 'lrp_deep_taylor', 'lrp_alpha1_5_beta_5'],
                       significance_level=0.025, metric_name='adjusted_rel_dist_in_data_region'):

    def highlight(s):
        return ['font-weight: bold' if v < significance_level else '' for v in s]

    results = []
    print('paired t-test p value against `%s` architecture\n highlighted when the value <= %f' % (base, significance_level))
    df = plot.get_stats_from_models(model_paths)

    architectures = set(df.architecture.values)
    
    models = list(filter(lambda x: x != base, set(df.architecture.values)))
    for m in models: 
        dd = [('architecture', m)]
        for med in methods:
            baseline = df[(df.architecture == base) & (df.method==med)][metric_name]
            comparing_model = df[(df.architecture == m) & (df.method==med)][metric_name]
            s = stats.ttest_rel(comparing_model, baseline)
            pv = s.pvalue/2.0
            if s.statistic < 0 :
                pv = 1-pv
            dd.append((med, pv))
        results.append(dict(dd))
        
    df_res = pd.DataFrame(results)[['architecture'] + methods].style.apply(highlight, subset=methods)
    return df_res


exp2_models = [
    '../final-models-group/shallow-m*-maj-seq-12-fold-*',
    '../final-models-group/shallow-f*-maj-seq-12-fold-*',

    '../final-models-group/deep-*-maj-seq-12-fold-*',
    '../final-models-group/deep_v2-*-maj-seq-12-fold-*',
    '../final-models-group/convdeep-*-maj-seq-12-fold-*',

]
hypothesis_testing(base='shallow', model_paths=exp2_models)

paired t-test p value against `shallow` architecture
 highlighted when the value <= 0.025000


Unnamed: 0,architecture,sensitivity,guided_backprop,lrp_deep_taylor,lrp_alpha1_5_beta_5
0,deep_v2,0.00348509,0.117184,1.31124e-13,7.94346e-12
1,convdeep,0.000380271,0.022609,5.00673e-13,1.42664e-11
2,deep,0.0535331,0.451835,2.31764e-11,3.94069e-11


In [3]:
exp2_models = [
    '../final-models-group/shallow-m*-maj-seq-12-fold-*',
    '../final-models-group/shallow-f*-maj-seq-12-fold-*',

    '../final-models-group/deep-*-maj-seq-12-fold-*',
    '../final-models-group/deep_v2-*-maj-seq-12-fold-*',
    '../final-models-group/convdeep-*-maj-seq-12-fold-*',

]
hypothesis_testing(base='shallow', model_paths=exp2_models)

paired t-test p value against `shallow` architecture
 highlighted when the value <= 0.025000


Unnamed: 0,architecture,sensitivity,guided_backprop,lrp_deep_taylor,lrp_alpha1_5_beta_5
0,deep_v2,0.00348509,0.117184,1.31124e-13,7.94346e-12
1,convdeep,0.000380271,0.022609,5.00673e-13,1.42664e-11
2,deep,0.0535331,0.451835,2.31764e-11,3.94069e-11


In [4]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.multicomp import MultiComparison
from statsmodels.formula.api import ols
import statsmodels.api as sm
                


  from pandas.core import datetools


In [19]:
def pair_tests(models, methods=['sensitivity', 'guided_backprop', 'lrp_deep_taylor', 'lrp_alpha1_5_beta_5'],
               alpha=0.05, metric_name = 'adjusted_rel_dist_in_data_region'):
    df = plot.get_stats_from_models(models)
    res = []
    for m in methods:
        df_method = df.loc[df['method'] == m]
        df_removed_na = df_method.dropna()
        print('============= Hypothesis Testing  =============')
        print('for %s method with significant level at %.2f' % (m, alpha))
        if len(df_removed_na) != len(df_method):
            print("Have %d/%d rows after removing nan" % (len(df_removed_na), len(df_method)))
        print('='*20, 'ANOVA', '='*20)
        mod = ols('%s ~ architecture' % metric_name, data=df_removed_na).fit()
        aov_table = sm.stats.anova_lm(mod, typ=2)
        print(aov_table)

        esq_sm = aov_table['sum_sq'][0]/(aov_table['sum_sq'][0]+aov_table['sum_sq'][1])
        print('effective size(eta squared) : %f' % esq_sm)
        print('')
        print('-'*50)
        mc = MultiComparison(   df_removed_na[metric_name],     df_removed_na['architecture'])
        result = mc.tukeyhsd(alpha=alpha)
        print(result.summary())
        res.append((m, result.summary()))
        print('\n')
    return res

In [28]:
x = pair_tests(exp2_models)

for sensitivity method with significant level at 0.05
                sum_sq    df          F        PR(>F)
architecture  0.017522   3.0  16.995769  8.038269e-08
Residual      0.017870  52.0        NaN           NaN
effective size(eta squared) : 0.495083

--------------------------------------------------
Multiple Comparison of Means - Tukey HSD,FWER=0.05
 group1   group2 meandiff  lower   upper  reject
------------------------------------------------
convdeep   deep  -0.0353  -0.0539 -0.0167  True 
convdeep deep_v2  -0.019  -0.0376 -0.0005  True 
convdeep shallow  -0.047  -0.0656 -0.0284  True 
  deep   deep_v2  0.0163  -0.0023  0.0348 False 
  deep   shallow -0.0117  -0.0303  0.0069 False 
deep_v2  shallow  -0.028  -0.0466 -0.0094  True 
------------------------------------------------


for guided_backprop method with significant level at 0.05
                sum_sq    df         F    PR(>F)
architecture  0.021354   3.0  5.070579  0.003727
Residual      0.072995  52.0       NaN     

TypeError: list indices must be integers or slices, not tuple

In [20]:
deep_vs_lstm_models = [
    '../final-models-group/deep-m*-maj-seq-12-fold-*',
    '../final-models-group/deep-f*-maj-seq-12-fold-*',

    '../final-models-group/deep_persisted_dropout-*-maj-seq-12-fold-*',
    
    '../final-models-group/rlstm-*-maj-seq-12-fold-*',
    '../final-models-group/rlstm_persisted_dropout-*-maj-seq-12-fold-*']
pair_tests(deep_vs_lstm_models)

### Testing for method sensitivity ###
Have 56/56 rows after removing nan
                sum_sq    df          F        PR(>F)
architecture  0.014660   3.0  44.870051  1.862269e-14
Residual      0.005663  52.0        NaN           NaN
effective size(eta squared) : 0.721344
--------------------------------------------------
              Multiple Comparison of Means - Tukey HSD,FWER=0.05             
        group1                  group2         meandiff  lower  upper  reject
-----------------------------------------------------------------------------
         deep           deep_persisted_dropout -0.0063  -0.0168 0.0041 False 
         deep                   rlstm           0.0329   0.0224 0.0434  True 
         deep          rlstm_persisted_dropout  0.0234   0.013  0.0339  True 
deep_persisted_dropout          rlstm           0.0392   0.0288 0.0497  True 
deep_persisted_dropout rlstm_persisted_dropout  0.0298   0.0193 0.0403  True 
        rlstm          rlstm_persisted_dropout -0.

In [27]:
conv_vs_convliteral = [
    '../final-models-group/convdeep-mnist-3-digits-maj-seq-12-fold*',
    '../final-models-group/convdeep-fashion-mnist-3-items-maj-seq-12-fold*',
    '../final-models-group/convdeep_transcribe-*-maj-seq-12-fold*',    
    '../final-models-group/rlstm_persisted_dropout-*-maj-seq-12-fold*',
    '../final-models-group/convrlstm_persisted_dropout-*-maj-seq-12-fold*',
]

pair_tests(conv_vs_convliteral)

### Testing for method sensitivity ###
Have 56/56 rows after removing nan
                sum_sq    df           F        PR(>F)
architecture  0.100002   3.0  137.066087  1.115396e-24
Residual      0.012646  52.0         NaN           NaN
effective size(eta squared) : 0.887737
--------------------------------------------------
                   Multiple Comparison of Means - Tukey HSD,FWER=0.05                  
           group1                      group2           meandiff  lower   upper  reject
---------------------------------------------------------------------------------------
          convdeep              convdeep_transcribe     -0.1041  -0.1198 -0.0885  True 
          convdeep          convrlstm_persisted_dropout -0.0093  -0.0249  0.0063 False 
          convdeep            rlstm_persisted_dropout   -0.0119  -0.0275  0.0038 False 
    convdeep_transcribe     convrlstm_persisted_dropout  0.0948   0.0792  0.1105  True 
    convdeep_transcribe       rlstm_persisted_dropout  