In [3]:
import numpy as np
import pickle
import os
import pandas as pd
from scipy.ndimage import gaussian_filter1d
from scipy import stats
import glob

In [4]:
## path to the results from disease_metric_estimation results
curr_dir = os.getcwd()
folder_name = 'sir_feat'
gamma = 1 ## change it to recovery time (1/gamma) defined in the pipeline.sh
dup = 'True' ## variable to include or exclude duplicate sample points (put same as in pipeline.sh)

## Hypothesis testing - Disease Metrics

In [5]:
disease_df =  pd.DataFrame()
for network in ['ER','SF','SW']:
    master_df = pd.DataFrame()
    for algo in ['OG','RW','MHRW']:
        file_ls = glob.glob(f"{curr_dir}/{folder_name}/{network}*_{gamma}_{algo}.pkl")
        for filepath in file_ls:
            df = pickle.load(open(filepath,"rb"))
            if dup!='True': df['scaler'] = df['sample_size']
            else :
                if algo != 'OG' : df['scaler'] = 10 ## sample size
                else : df['scaler'] = 10000 ## network size 
            df['node_idx'] = df['node_idx']/df['scaler']
            master_df = pd.concat([master_df,df],axis=0)
    master_df['network'] = network
    disease_df = pd.concat([disease_df,master_df],axis=0)

#### Step 1: Normality test; KS test

In [8]:
norm_dis_df = []
for network in ['ER','SF','SW']:
    for algo in ['OG','RW','MHRW']:
        sub_df = disease_df[(disease_df['network']==network) & (disease_df['algo']==algo)]
        for col in ['node_idx','second_inf','inf_time']:
            test_stat = stats.ks_1samp(sub_df[col],stats.norm.cdf)
            norm_dis_df.append([network,algo,col,test_stat.pvalue, test_stat.statistic])
norm_dis_df = pd.DataFrame(norm_dis_df,columns = ['network','algo','col','p_value','ks_stats'])
norm_dis_df['Hypothesis'] = norm_dis_df['p_value'].apply(lambda x : 'Normal' if x >0.05 else 'Not Normal')  

In [9]:
norm_dis_df

Unnamed: 0,network,algo,col,p_value,ks_stats,Hypothesis
0,ER,OG,node_idx,1.067092e-05,0.503989,Not Normal
1,ER,OG,second_inf,1.050266e-07,0.585627,Not Normal
2,ER,OG,inf_time,2.200396e-13,0.751615,Not Normal
3,ER,RW,node_idx,2.396492e-10,0.539828,Not Normal
4,ER,RW,second_inf,4.643698e-12,0.580841,Not Normal
5,ER,RW,inf_time,3.80192e-24,0.785924,Not Normal
6,ER,MHRW,node_idx,2.955265e-09,0.539828,Not Normal
7,ER,MHRW,second_inf,6.368367e-08,0.5,Not Normal
8,ER,MHRW,inf_time,2.755386e-32,0.899003,Not Normal
9,SF,OG,node_idx,1.067092e-05,0.503989,Not Normal


#### Step 2: Two sample comparison test; Mann-Whitney U test

In [12]:
one_tail_dis_df = []
for network in ['ER','SF','SW']:
    sub_df = disease_df[(disease_df['network']==network)]
    for col in ['node_idx','second_inf','inf_time']:
        for algo1 in ['OG','RW','MHRW']:
            for algo2 in ['OG','RW','MHRW']:
                if algo1 != algo2:
                    U1, p = stats.mannwhitneyu(sub_df[sub_df['algo']==algo1][col],
                                                sub_df[sub_df['algo']==algo2][col]
                                               , method="asymptotic", alternative = 'less') 
                    # print(algo1,algo2, U1, p)
                    one_tail_dis_df.append([network,col,algo1,algo2, U1, p])
one_tail_dis_df = pd.DataFrame(one_tail_dis_df,columns = ['network','col','algo1','algo2','manwhit_stats','p_value'])
one_tail_dis_df['Hypothesis'] = one_tail_dis_df['p_value'].apply(lambda x : 'same distribution' if x > 0.05 else 'algo1<algo2')  

In [13]:
one_tail_dis_df

Unnamed: 0,network,col,algo1,algo2,manwhit_stats,p_value,Hypothesis
0,ER,node_idx,OG,RW,361.0,0.289739,same distribution
1,ER,node_idx,OG,MHRW,408.0,0.841085,same distribution
2,ER,node_idx,RW,OG,431.0,0.715732,same distribution
3,ER,node_idx,RW,MHRW,739.5,0.979207,same distribution
4,ER,node_idx,MHRW,OG,296.0,0.163236,same distribution
5,ER,node_idx,MHRW,RW,412.5,0.021422,algo1<algo2
6,ER,second_inf,OG,RW,191.0,0.000515,algo1<algo2
7,ER,second_inf,OG,MHRW,359.0,0.552558,same distribution
8,ER,second_inf,RW,OG,601.0,0.999514,same distribution
9,ER,second_inf,RW,MHRW,758.0,0.987767,same distribution
