In [1]:
%matplotlib inline
import logging
from utils import logging as lg
lg.set_logging(logging.ERROR)

from scipy import stats
from skimage.measure import block_reduce
import numpy as np

import logging

import pandas as pd
import pickle
from notebook_utils import plot 
import seaborn as sns
import glob
sns.set(color_codes=True, font_scale=2, style="whitegrid", palette="bright")

from model import base, provider, heatmap_evaluation
import config
import matplotlib.pyplot as plt
import tensorflow as tf
import yaml
tf.logging.set_verbosity(tf.logging.ERROR)

In [17]:
def hypothesis_testing(base, model_paths, methods=['sensitivity', 'guided_backprop', 'lrp_deep_taylor', 'lrp_alpha1_5_beta_5'],
                       significance_level=0.025, metric_name='adjusted_rel_dist_in_data_region'):

    def highlight(s):
        return ['font-weight: bold' if v < significance_level else '' for v in s]

    results = []
    print('paired t-test p value against `%s` architecture\n highlighted when the value <= %f' % (base, significance_level))
    df = plot.get_stats_from_models(model_paths)

    architectures = set(df.architecture.values)
    
    models = list(filter(lambda x: x != base, set(df.architecture.values)))
    for m in models: 
        dd = [('architecture', m)]
        for med in methods:
            baseline = df[(df.architecture == base) & (df.method==med)][metric_name]
            comparing_model = df[(df.architecture == m) & (df.method==med)][metric_name]
            s = stats.ttest_rel(comparing_model, baseline)
            pv = s.pvalue/2.0
            if s.statistic < 0 :
                pv = 1-pv
            dd.append((med, pv))
        results.append(dict(dd))
        
    df_res = pd.DataFrame(results)[['architecture'] + methods].style.apply(highlight, subset=methods)
    return df_res


exp2_models = [
    '../final-models-group/shallow-m*-maj-seq-12-fold-*',
    '../final-models-group/shallow-f*-maj-seq-12-fold-*',

    '../final-models-group/deep-*-maj-seq-12-fold-*',
    '../final-models-group/deep_v2-*-maj-seq-12-fold-*',
    '../final-models-group/convdeep-*-maj-seq-12-fold-*',

]
hypothesis_testing(base='shallow', model_paths=exp2_mnist_models)

paired t-test p value against `shallow` architecture
 highlighted when the value <= 0.025000


Unnamed: 0,architecture,sensitivity,guided_backprop,lrp_deep_taylor,lrp_alpha1_5_beta_5
0,deep,0.0535348,0.451833,2.31764e-11,3.94069e-11
1,convdeep,0.000380304,0.0226089,5.00673e-13,1.42664e-11
2,deep_v2,0.0034859,0.117182,1.31124e-13,7.94346e-12


In [10]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.multicomp import MultiComparison
from statsmodels.formula.api import ols
import statsmodels.api as sm
                


In [18]:
def pair_tests(models, methods=['sensitivity', 'guided_backprop', 'lrp_deep_taylor', 'lrp_alpha1_5_beta_5'], alpha=0.01):
    df = plot.get_stats_from_models(models)
    for m in methods:
        df_method = df.loc[df['method'] == m]
        df_removed_na = df_method.dropna()
        print('### Testing for method %s ###' % m)
        print("Have %d/%d rows after removing nan" % (len(df_removed_na), len(df_method)))
        print('='*20, 'ANOVA', '='*20)
        mod = ols('rel_dist_in_data_region ~ architecture', data=df_removed_na).fit()
        aov_table = sm.stats.anova_lm(mod, typ=2)
        print(aov_table)
        esq_sm = aov_table['sum_sq'][0]/(aov_table['sum_sq'][0]+aov_table['sum_sq'][1])
        print('effective size(eta squared) : %f' % esq_sm)
        print('-'*50)
        mc = MultiComparison(    df_removed_na['adjusted_rel_dist_in_data_region'],     df_removed_na['architecture'])
        result = mc.tukeyhsd(alpha=alpha)
        print(result.summary())
        print('\n')

In [19]:
pair_tests(exp2_models)

### Testing for method sensitivity ###
Have 56/56 rows after removing nan
                sum_sq    df          F        PR(>F)
architecture  0.055717   3.0  21.132725  4.379611e-09
Residual      0.045700  52.0        NaN           NaN
effective size(eta squared) : 0.549386
--------------------------------------------------
Multiple Comparison of Means - Tukey HSD,FWER=0.01
 group1   group2 meandiff  lower   upper  reject
------------------------------------------------
convdeep   deep  -0.0353  -0.0582 -0.0124  True 
convdeep deep_v2 -0.0191   -0.042  0.0039 False 
convdeep shallow  -0.047  -0.0699 -0.0241  True 
  deep   deep_v2  0.0163  -0.0067  0.0392 False 
  deep   shallow -0.0117  -0.0346  0.0112 False 
deep_v2  shallow  -0.028  -0.0509 -0.0051  True 
------------------------------------------------


### Testing for method guided_backprop ###
Have 56/56 rows after removing nan
                sum_sq    df         F    PR(>F)
architecture  0.014729   3.0  2.383742  0.079826
Resi

In [20]:
deep_vs_lstm_models = [
    '../final-models-group/deep-m*-maj-seq-12-fold-*',
    '../final-models-group/deep-f*-maj-seq-12-fold-*',

    '../final-models-group/deep_persisted_dropout-*-maj-seq-12-fold-*',
    
    
    '../final-models-group/rlstm-*-maj-seq-12-fold-*',
    '../final-models-group/rlstm_persisted_dropout-*-maj-seq-12-fold-*']
pair_tests(deep_vs_lstm_models)

### Testing for method sensitivity ###
Have 56/56 rows after removing nan
                sum_sq    df          F        PR(>F)
architecture  0.017848   3.0  19.103688  1.753310e-08
Residual      0.016194  52.0        NaN           NaN
effective size(eta squared) : 0.524293
--------------------------------------------------
              Multiple Comparison of Means - Tukey HSD,FWER=0.01             
        group1                  group2         meandiff  lower  upper  reject
-----------------------------------------------------------------------------
         deep           deep_persisted_dropout -0.0063  -0.0192 0.0066 False 
         deep                   rlstm           0.0329    0.02  0.0458  True 
         deep          rlstm_persisted_dropout  0.0234   0.0105 0.0363  True 
deep_persisted_dropout          rlstm           0.0392   0.0263 0.0521  True 
deep_persisted_dropout rlstm_persisted_dropout  0.0298   0.0169 0.0427  True 
        rlstm          rlstm_persisted_dropout -0.

In [21]:
conv_vs_convliteral = [
    '../final-models-group/convdeep-mnist-3-digits-maj-seq-12-fold*',
    '../final-models-group/convdeep-fashion-mnist-3-items-maj-seq-12-fold*',
    '../final-models-group/convdeep_transcribe-*-maj-seq-12-fold*',    
    '../final-models-group/convrlstm_persisted_dropout-*-maj-seq-12-fold*',
    '../final-models-group/convtran_rlstm_persisted_dropout-*-maj-seq-12-fold*',
]

pair_tests(conv_vs_convliteral)

### Testing for method sensitivity ###
Have 44/44 rows after removing nan
                sum_sq    df           F        PR(>F)
architecture  0.223879   3.0  123.317932  2.994343e-20
Residual      0.024206  40.0         NaN           NaN
effective size(eta squared) : 0.902428
--------------------------------------------------
                     Multiple Comparison of Means - Tukey HSD,FWER=0.01                     
           group1                        group2              meandiff  lower   upper  reject
--------------------------------------------------------------------------------------------
          convdeep                convdeep_transcribe        -0.1041  -0.1257 -0.0825  True 
          convdeep            convrlstm_persisted_dropout    -0.0093  -0.0309  0.0123 False 
          convdeep          convtran_rlstm_persisted_dropout -0.0766  -0.1197 -0.0334  True 
    convdeep_transcribe       convrlstm_persisted_dropout     0.0948   0.0732  0.1164  True 
    convdeep_transcr