In [2]:
%matplotlib inline
# --- Logging setup (runs before the heavier imports below) ---
import logging
from utils import logging as lg
# Project helper from utils.logging, called with the stdlib ERROR level.
lg.set_logging(logging.ERROR)

from scipy import stats
from skimage.measure import block_reduce
import numpy as np

# NOTE(review): redundant re-import; `logging` is already imported at the top of this cell.
import logging

import pandas as pd
import pickle
from notebook_utils import plot 
import seaborn as sns
import glob
# Global seaborn theme for every figure drawn in this notebook.
sns.set(color_codes=True, font_scale=2, style="whitegrid", palette="bright")

# NOTE(review): the module name `base` imported here is shadowed inside
# `hypothesis_testing`, whose first parameter is also called `base`.
from model import base, provider, heatmap_evaluation
import config
import matplotlib.pyplot as plt
import tensorflow as tf
import yaml
# Only show TensorFlow messages at ERROR level or above.
# NOTE(review): `tf.logging` is a TensorFlow 1.x API; confirm the pinned TF version.
tf.logging.set_verbosity(tf.logging.ERROR)

In [49]:
def hypothesis_testing(base, model_paths, methods=['sensitivity', 'guided_backprop', 'lrp_deep_taylor', 'lrp_alpha2_beta1'],
                       significance_level=0.025):
    """Run one-sided paired t-tests of every architecture against a baseline.

    For each explanation method and each architecture other than `base`, the
    `rel_dist_in_data_region` values of that architecture are compared with the
    baseline's values using `scipy.stats.ttest_rel`. The two-sided p-value is
    halved, and replaced by `1 - p/2` when the t statistic is negative, so a
    small value supports "architecture > baseline".

    Args:
        base: name of the baseline architecture, as found in the
            `architecture` column. (Inside this function the name shadows the
            `base` module imported from `model`.)
        model_paths: passed unchanged to `plot.get_stats_from_models`.
        methods: explanation method names to test; each becomes one column of
            the result. Any iterable of strings is accepted.
        significance_level: p-values less than or equal to this threshold are
            rendered in bold.

    Returns:
        A pandas Styler over a frame with one row per non-baseline
        architecture (sorted by name) and the columns
        `['architecture'] + methods`.

    Raises:
        ValueError: if `base` does not occur in the `architecture` column.
    """
    # Private list copy: keeps the shared default untouched and lets callers
    # pass a tuple without breaking the list concatenation further down.
    methods = list(methods)
    columns = ['architecture'] + methods

    def highlight(s):
        # `<=` so the styling agrees with the threshold wording printed below.
        return ['font-weight: bold' if v <= significance_level else '' for v in s]

    print('paired t-test p value against `%s` architecture\n highlighted when the value <= %f' % (base, significance_level))
    df = plot.get_stats_from_models(model_paths)

    architectures = set(df.architecture.values)
    if base not in architectures:
        # Without this check the baseline selection is empty and ttest_rel
        # fails later with an unrelated-looking length-mismatch error.
        raise ValueError('baseline architecture `%s` not found; available: %s'
                         % (base, sorted(architectures, key=str)))

    results = []
    # Sorted so the row order is reproducible; iterating the raw set is not.
    for m in sorted(architectures - {base}, key=str):
        row = {'architecture': m}
        for med in methods:
            # NOTE(review): ttest_rel pairs samples by position. This assumes
            # both selections list their folds in the same order - confirm
            # against plot.get_stats_from_models.
            baseline = df[(df.architecture == base) & (df.method == med)]['rel_dist_in_data_region']
            comparing_model = df[(df.architecture == m) & (df.method == med)]['rel_dist_in_data_region']
            s = stats.ttest_rel(comparing_model, baseline)
            # Convert the two-sided p-value into the one-sided "greater" one.
            pv = s.pvalue / 2.0
            if s.statistic < 0:
                pv = 1 - pv
            row[med] = pv
        results.append(row)

    # Passing `columns` fixes the column order and still yields a well-formed
    # (empty) frame when there is no architecture besides the baseline.
    df_res = pd.DataFrame(results, columns=columns).style.apply(highlight, subset=methods)
    return df_res


# Result-directory patterns, one per architecture (shallow, deep, convdeep).
# Each entry ends in `fold-*`; presumably plot.get_stats_from_models expands
# the wildcard into the individual folds - TODO confirm.
# NOTE(review): the variable says "exp2" but the directory is `models-for-exp3`.
exp2_mnist_models = [
     '../experiment-results/models-for-exp3/shallow-mnist-3-digits-maj-seq-12---2018-03-15--01-28-55--fold-*',
    '../experiment-results/models-for-exp3/deep-mnist-3-digits-maj-seq-12---2018-03-15--01-28-42--fold-*',
    '../experiment-results/models-for-exp3/convdeep-mnist-3-digits-maj-seq-12---2018-03-15--01-30-08--fold-*',
       
]
# Last expression of the cell: the returned Styler is rendered as the cell output.
hypothesis_testing(base='shallow', model_paths=exp2_mnist_models)

paired t-test p value against `shallow` architecture
 highlighted when the value <= 0.025000


Unnamed: 0,architecture,sensitivity,guided_backprop,lrp_deep_taylor,lrp_alpha2_beta1
0,convdeep,7.31932e-08,1.65662e-06,1.14295e-10,9.76723e-11
1,deep,1.20142e-07,0.997918,7.08453e-12,2.30055e-11


In [50]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.multicomp import MultiComparison


In [142]:
def pair_tests(models, methods=['sensitivity', 'guided_backprop', 'lrp_deep_taylor', 'lrp_alpha2_beta1'], alpha=0.01):
    """Print Tukey HSD pairwise comparisons between architectures, per method.

    For each method, the rows of that method are selected, rows lacking a
    usable value are removed, and `rel_dist_in_data_region` is compared across
    the groups of the `architecture` column with statsmodels'
    `MultiComparison.tukeyhsd`.

    Args:
        models: passed unchanged to `plot.get_stats_from_models`.
        methods: explanation method names to test, one summary table each.
        alpha: family-wise error rate handed to `tukeyhsd`.

    Returns:
        None. All results are printed.
    """
    value_col = 'rel_dist_in_data_region'
    group_col = 'architecture'

    df = plot.get_stats_from_models(models)
    for m in methods:
        df_method = df.loc[df['method'] == m]
        # Drop only rows missing a column the test actually reads. A plain
        # dropna() would also discard rows for NaNs in unrelated columns.
        df_removed_na = df_method.dropna(subset=[value_col, group_col])
        print('### Testing for method %s ###' % m)
        print("Have %d/%d rows after removing nan" % (len(df_removed_na), len(df_method)))

        # A pairwise comparison needs at least two groups; report and move on
        # instead of failing part-way through the list of methods.
        if df_removed_na[group_col].nunique() < 2:
            print('Skipped: fewer than two architectures have data for this method')
            print('\n')
            continue

        mc = MultiComparison(df_removed_na[value_col], df_removed_na[group_col])
        result = mc.tukeyhsd(alpha=alpha)
        print(result.summary())
        print('\n')

In [141]:
# Tukey HSD comparison across the architectures in `exp2_mnist_models`,
# printing one table per method.
# NOTE(review): the recorded execution counts show this call (In [141]) ran
# before the current `pair_tests` definition cell (In [142]); restart and run
# all cells to confirm the output below matches the code above.
pair_tests(exp2_mnist_models)

### Testing for method sensitivity ###
Have 30/30 rows after removing nan
Multiple Comparison of Means - Tukey HSD,FWER=0.01
 group1   group2 meandiff  lower   upper  reject
------------------------------------------------
convdeep   deep  -0.0447  -0.0618 -0.0276  True 
convdeep shallow -0.0911  -0.1082  -0.074  True 
  deep   shallow -0.0464  -0.0635 -0.0292  True 
------------------------------------------------


### Testing for method guided_backprop ###
Have 30/30 rows after removing nan
Multiple Comparison of Means - Tukey HSD,FWER=0.01
 group1   group2 meandiff  lower   upper  reject
------------------------------------------------
convdeep   deep  -0.0508  -0.0669 -0.0347  True 
convdeep shallow -0.0378  -0.0539 -0.0217  True 
  deep   shallow  0.0129  -0.0031  0.029  False 
------------------------------------------------


### Testing for method lrp_deep_taylor ###
Have 30/30 rows after removing nan
Multiple Comparison of Means - Tukey HSD,FWER=0.01
 group1   group2 meandiff

In [122]:
# Six result-directory patterns: the first three are the same strings as in
# `exp2_mnist_models`; the last three are the corresponding `fashion-mnist`
# runs for the same architecture prefixes (shallow, deep, convdeep).
exp2_models = [
    '../experiment-results/models-for-exp3/shallow-mnist-3-digits-maj-seq-12---2018-03-15--01-28-55--fold-*',
    '../experiment-results/models-for-exp3/deep-mnist-3-digits-maj-seq-12---2018-03-15--01-28-42--fold-*',
    '../experiment-results/models-for-exp3/convdeep-mnist-3-digits-maj-seq-12---2018-03-15--01-30-08--fold-*',
    
    '../experiment-results/models-for-exp3/shallow-fashion-mnist-3-items-maj-seq-12---2018-03-15--23-38-13--fold-*',
    '../experiment-results/models-for-exp3/deep-fashion-mnist-3-items-maj-seq-12---2018-03-15--23-38-13--fold-*',
    '../experiment-results/models-for-exp3/convdeep-fashion-mnist-3-items-maj-seq-12---2018-03-16--09-20-39--fold-*'    
]

In [123]:
# Same Tukey HSD comparison over all six patterns in `exp2_models`.
# NOTE(review): `pair_tests` groups by `architecture` only, and the output
# below lists three groups for six patterns, so the mnist and fashion-mnist
# rows of an architecture are pooled into one group.
pair_tests(exp2_models)

### Testing for method sensitivity ###
Have 60/60 rows after removing nan
Multiple Comparison of Means - Tukey HSD,FWER=0.01
 group1   group2 meandiff  lower   upper  reject
------------------------------------------------
convdeep   deep  -0.0363  -0.0619 -0.0107  True 
convdeep shallow -0.0773  -0.1029 -0.0517  True 
  deep   shallow  -0.041  -0.0666 -0.0154  True 
------------------------------------------------


### Testing for method guided_backprop ###
Have 60/60 rows after removing nan
Multiple Comparison of Means - Tukey HSD,FWER=0.01
 group1   group2 meandiff  lower  upper  reject
-----------------------------------------------
convdeep   deep  -0.0352  -0.0756 0.0051 False 
convdeep shallow -0.0296  -0.0699 0.0107 False 
  deep   shallow  0.0056  -0.0347 0.0459 False 
-----------------------------------------------


### Testing for method lrp_deep_taylor ###
Have 58/60 rows after removing nan
Multiple Comparison of Means - Tukey HSD,FWER=0.01
 group1   group2 meandiff  lowe

In [124]:
# Result-directory patterns for the `deep`, `rlstm` and
# `rlstm_persisted_dropout` runs, first the `mnist` ones and then the
# `fashion-mnist` ones.
deep_vs_lstm_models = [
    '../experiment-results/models-for-exp3/deep-mnist-3-digits-maj-seq-12---2018-03-15--01-28-42--fold-*',
    '../experiment-results/models-for-exp3/rlstm-mnist-3-digits-maj-seq-12---2018-03-15--01-29-53--fold-*',
    '../experiment-results/models-for-exp3/rlstm_persisted_dropout-mnist-3-digits-maj-seq-12---2018-03-15--01-31-42--fold-*',
    
    '../experiment-results/models-for-exp3/deep-fashion-mnist-3-items-maj-seq-12---2018-03-15--23-38-13--fold-*',
    '../experiment-results/models-for-exp3/rlstm-fashion-mnist-3-items-maj-seq-12---2018-03-15--23-38-13--fold-*',
    '../experiment-results/models-for-exp3/rlstm_persisted_dropout-fashion-mnist-3-items-maj-seq-12---2018-03-15--23-38-22--fold-*'
]
# Prints one Tukey HSD table per method; `pair_tests` returns None, so the
# cell has no rich output besides the printed text.
pair_tests(deep_vs_lstm_models)

### Testing for method sensitivity ###
Have 60/60 rows after removing nan
      Multiple Comparison of Means - Tukey HSD,FWER=0.01      
group1          group2         meandiff  lower   upper  reject
--------------------------------------------------------------
 deep           rlstm          -0.0263  -0.0419 -0.0108  True 
 deep  rlstm_persisted_dropout -0.0369  -0.0524 -0.0213  True 
rlstm  rlstm_persisted_dropout -0.0105  -0.0261  0.005  False 
--------------------------------------------------------------


### Testing for method guided_backprop ###
Have 60/60 rows after removing nan
      Multiple Comparison of Means - Tukey HSD,FWER=0.01      
group1          group2         meandiff  lower   upper  reject
--------------------------------------------------------------
 deep           rlstm          -0.0594  -0.0801 -0.0387  True 
 deep  rlstm_persisted_dropout -0.0734  -0.0942 -0.0527  True 
rlstm  rlstm_persisted_dropout  -0.014  -0.0347  0.0067 False 
---------------------------

In [125]:
# Result-directory patterns for the `convdeep` and `convdeep_transcribe`
# runs, first the `mnist` ones and then the `fashion-mnist` ones.
# NOTE(review): the variable says "convliteral" while the paths and the
# printed group name say `convdeep_transcribe`.
conv_vs_convliteral = [
    '../experiment-results/models-for-exp3/convdeep-mnist-3-digits-maj-seq-12---2018-03-15--01-30-08--fold-*',
    '../experiment-results/models-for-exp3/convdeep_transcribe-mnist-3-digits-maj-seq-12---2018-03-15--13-43-20--fold-*',
    
    '../experiment-results/models-for-exp3/convdeep-fashion-mnist-3-items-maj-seq-12---2018-03-16--09-20-39--fold-*',
    '../experiment-results/models-for-exp3/convdeep_transcribe-fashion-mnist-3-items-maj-seq-12---2018-03-19--08-39-23--fold-*'
]

# With only two architectures each printed table holds a single comparison.
pair_tests(conv_vs_convliteral)

### Testing for method sensitivity ###
Have 40/40 rows after removing nan
    Multiple Comparison of Means - Tukey HSD,FWER=0.01    
 group1         group2       meandiff  lower  upper reject
----------------------------------------------------------
convdeep convdeep_transcribe -0.0027  -0.0315 0.026 False 
----------------------------------------------------------


### Testing for method guided_backprop ###
Have 40/40 rows after removing nan
     Multiple Comparison of Means - Tukey HSD,FWER=0.01    
 group1         group2       meandiff  lower  upper  reject
-----------------------------------------------------------
convdeep convdeep_transcribe  0.0015  -0.0397 0.0427 False 
-----------------------------------------------------------


### Testing for method lrp_deep_taylor ###
Have 28/40 rows after removing nan
     Multiple Comparison of Means - Tukey HSD,FWER=0.01    
 group1         group2       meandiff  lower  upper  reject
---------------------------------------------------