In [1]:
import numpy as np
import pickle
import pandas as pd
from scipy.ndimage import gaussian_filter1d
from scipy import stats

# Significance testing

## Clusterin Coefficient

In [2]:
## collating clustering coeff data in dataframe form
df_dict = {}
for network in ['SF','SW','ER']:
    df_dict[network] = {}
    if network == 'ER':
        avg_clust_dict = {'net':[],'RW':[],'MHRW':[]}
        global_clust_dict = {'net':[],'RW':[],'MHRW':[]}
        for network_idx in ['ER_1','ER_2','ER_3']:
        # network = 'ER_1'
            tmp_dict1  = pickle.load(open(f"/home/neha/results/{network_idx}_avg_clust_coeff.pkl","rb"))
            tmp_dict2  = pickle.load(open(f"/home/neha/results/{network_idx}_global_clust_coeff.pkl","rb"))
            for key in avg_clust_dict.keys():
                avg_clust_dict[key].extend(tmp_dict1[key])
                global_clust_dict[key].extend(tmp_dict2[key])
            # break
    else :
        avg_clust_dict = pickle.load(open(f"/home/neha/results/{network}_combined_avg_clust_coeff.pkl","rb"))
        global_clust_dict = pickle.load(open(f"/home/neha/results/{network}_combined_global_clust_coeff.pkl","rb"))

    dict_of_dict = {'avg':avg_clust_dict,'global':global_clust_dict}
    for key,value in dict_of_dict.items():
        avg_dict= {}
        avg_dict['OG'] = value['net']
        for algo in ['RW','MHRW']:
            ls_avg = [np.mean(value[algo][i:i+100]) for i in range(0,len(value[algo]),100)]
            avg_dict[algo] = ls_avg

        df = pd.DataFrame([(key, value) for key, values in avg_dict.items() for value in values], columns=['Network', 'Value'])
        df_dict[network][key] = df
        



#### Normality test
- Sample size >1000, so using Kolmorgorov-Smirnov Test
- Compares data to normal distribution. H0 is of normal distribution.
    - If p < 0.05 then not-normally distributed
    - If p > 0.05 then normally distributed



In [5]:
## normality test - H0: distribution it normal 
### not normal
normality_test_df = []
for key,value in df_dict.items():
    for clust,df in value.items(): 
        for network in ['OG','RW','MHRW']:
            test_stat = stats.ks_1samp(df[df['Network']==network]['Value'],stats.norm.cdf)
            normality_test_df.append([key, clust,network,test_stat.pvalue, test_stat.statistic])

normality_test_df = pd.DataFrame(normality_test_df,columns =['network','clust_coeff','algo','p_value','ks_stats'])           
normality_test_df['Hypothesis'] = normality_test_df['p_value'].apply(lambda x : 'Normal' if x >0.05 else 'Not Normal')  

In [6]:
normality_test_df

Unnamed: 0,network,clust_coeff,algo,p_value,ks_stats,Hypothesis
0,SF,avg,OG,0.0,0.5,Not Normal
1,SF,avg,RW,0.0,0.5,Not Normal
2,SF,avg,MHRW,0.0,0.5,Not Normal
3,SF,global,OG,0.0,0.5,Not Normal
4,SF,global,RW,0.0,0.5,Not Normal
5,SF,global,MHRW,0.0,0.5,Not Normal
6,SW,avg,OG,0.0,0.523693,Not Normal
7,SW,avg,RW,0.0,0.525062,Not Normal
8,SW,avg,MHRW,0.0,0.524628,Not Normal
9,SW,global,OG,0.0,0.519612,Not Normal


### Two-tailed, distribution similarity test
- H0 - distribution of two samples are same
- H1 - distribution of two samples are not differ
- Conclusion : p<0.05, reject null hypothesis

In [7]:
two_tail_clust_test_df = []
for key,value in df_dict.items():
    for clust,df in value.items(): 
        for algo1 in ['OG','RW','MHRW']:
            for algo2 in ['OG','RW','MHRW']:
                if algo1 != algo2:
                    U1, p = stats.mannwhitneyu(df[df['Network']==algo1]['Value'], df[df['Network']==algo2]['Value'], method="asymptotic") ## two-sided
                    # print(algo1,algo2, U1, p)
                    two_tail_clust_test_df.append([key,clust,algo1,algo2, U1, p])
two_tail_clust_test_df = pd.DataFrame(two_tail_clust_test_df,columns = ['network','clust_coeff','algo1','algo2','manwhit_stats','p_value'])
two_tail_clust_test_df['Hypothesis'] = two_tail_clust_test_df['p_value'].apply(lambda x : 'same distribution' if x >0.05 else 'different distribution')  

In [8]:
two_tail_clust_test_df

Unnamed: 0,network,clust_coeff,algo1,algo2,manwhit_stats,p_value,Hypothesis
0,SF,avg,OG,RW,33924014.5,0.0,different distribution
1,SF,avg,OG,MHRW,72422012.5,0.0,different distribution
2,SF,avg,RW,OG,66075985.5,0.0,different distribution
3,SF,avg,RW,MHRW,76361113.5,0.0,different distribution
4,SF,avg,MHRW,OG,27577987.5,0.0,different distribution
5,SF,avg,MHRW,RW,23638886.5,0.0,different distribution
6,SF,global,OG,RW,56741453.0,6.549693000000001e-62,different distribution
7,SF,global,OG,MHRW,75654084.0,0.0,different distribution
8,SF,global,RW,OG,43258547.0,6.549693000000001e-62,different distribution
9,SF,global,RW,MHRW,73289097.5,0.0,different distribution


### One-tailed, distribution similarity test
- H0 - distribution of two samples are same
- H1 - distribution of sample1 is stochastically greater than sample2  
- Conclusion : p<0.05, reject null hypothesis

In [9]:
one_tail_clust_test_df = []
for key,value in df_dict.items():
    for clust,df in value.items(): 
        for algo1 in ['OG','RW','MHRW']:
            for algo2 in ['OG','RW','MHRW']:
                if algo1 != algo2:
                    U1, p = stats.mannwhitneyu(df[df['Network']==algo1]['Value'], df[df['Network']==algo2]['Value']
                                               , method="asymptotic", alternative = 'less') 
                    # print(algo1,algo2, U1, p)
                    one_tail_clust_test_df.append([key,clust,algo1,algo2, U1, p])
one_tail_clust_test_df = pd.DataFrame(one_tail_clust_test_df,columns = ['network','clust_coeff','algo1','algo2','manwhit_stats','p_value'])
one_tail_clust_test_df['Hypothesis'] = one_tail_clust_test_df['p_value'].apply(lambda x : 'same distribution' if x >0.05 else 'algo1>alog2')  

In [11]:
one_tail_clust_test_df[one_tail_clust_test_df['Hypothesis'] == 'algo1>alog2']

Unnamed: 0,network,clust_coeff,algo1,algo2,manwhit_stats,p_value,Hypothesis
0,SF,avg,OG,RW,33924014.5,0.0,algo1>alog2
4,SF,avg,MHRW,OG,27577987.5,0.0,algo1>alog2
5,SF,avg,MHRW,RW,23638886.5,0.0,algo1>alog2
8,SF,global,RW,OG,43258547.0,3.274846e-62,algo1>alog2
10,SF,global,MHRW,OG,24345916.0,0.0,algo1>alog2
11,SF,global,MHRW,RW,26710902.5,0.0,algo1>alog2
12,SW,avg,OG,RW,8798885.0,0.0,algo1>alog2
13,SW,avg,OG,MHRW,14582755.0,0.0,algo1>alog2
17,SW,avg,MHRW,RW,41217379.0,5.958676e-103,algo1>alog2
18,SW,global,OG,RW,0.0,0.0,algo1>alog2


In [10]:
one_tail_clust_test_df

Unnamed: 0,network,clust_coeff,algo1,algo2,manwhit_stats,p_value,Hypothesis
0,SF,avg,OG,RW,33924014.5,0.0,algo1>alog2
1,SF,avg,OG,MHRW,72422012.5,1.0,same distribution
2,SF,avg,RW,OG,66075985.5,1.0,same distribution
3,SF,avg,RW,MHRW,76361113.5,1.0,same distribution
4,SF,avg,MHRW,OG,27577987.5,0.0,algo1>alog2
5,SF,avg,MHRW,RW,23638886.5,0.0,algo1>alog2
6,SF,global,OG,RW,56741453.0,1.0,same distribution
7,SF,global,OG,MHRW,75654084.0,1.0,same distribution
8,SF,global,RW,OG,43258547.0,3.274846e-62,algo1>alog2
9,SF,global,RW,MHRW,73289097.5,1.0,same distribution


## Disease Characterstics

In [9]:
directory = 'results'
disease_df =  pd.DataFrame()
for network in ['ER','SF','SW']:
    if network == 'ER':
        master_df = pd.DataFrame()
        for idx in range(1,4,1):
            for algo in ['OG','RW','MHRW']:
                # df = pickle.load(open(f"/home/neha/{directory}/{network}_{idx}_{algo}_SIR_agg.pkl","rb"))
                df = pickle.load(open(f"/home/neha/{directory}/{network}/{network}_{idx}_{algo}_SIR_agg.pkl","rb"))
                # df['scaler'] = df['sample_size']
                if algo != 'OG' : df['scaler'] = 500
                else : df['scaler'] = 10000
                df['node_idx'] = df['node_idx']/df['scaler']
                master_df = pd.concat([master_df,df],axis=0)
        master_df['network'] = network
    else:
        master_df = pd.DataFrame()
        for algo in ['OG','RW','MHRW']:
            # df = pickle.load(open(f"/home/neha/{directory}/{network}_combined_{algo}_SIR_agg.pkl","rb"))
            df = pickle.load(open(f"/home/neha/{directory}/{network}/{network}_combined_{algo}_SIR_agg.pkl","rb"))
            if algo != 'OG' : df['scaler'] = 500
            else : df['scaler'] = 10000
            df['node_idx'] = df['node_idx']/df['scaler']
            master_df = pd.concat([master_df,df],axis=0)
        master_df['network'] = network
    disease_df = pd.concat([disease_df,master_df],axis=0)

In [10]:
disease_df.head()

Unnamed: 0,beta,net_idx,node_idx,second_inf,inf_time,algo,walk_idx,scaler,network
0,0.0,0,0.01,0.0,0.0,OG,-1,10000,ER
1,0.0,1,0.01,0.0,0.0,OG,-1,10000,ER
2,0.0,2,0.01,0.0,0.0,OG,-1,10000,ER
3,0.0,3,0.01,0.0,0.0,OG,-1,10000,ER
4,0.0,4,0.01,0.0,0.0,OG,-1,10000,ER


### Normality check

In [4]:
norm_dis_df = []
for network in ['ER','SF','SW']:
    for algo in ['OG','RW','MHRW']:
        sub_df = disease_df[(disease_df['network']==network) & (disease_df['algo']==algo)]
        for col in ['node_idx','second_inf','inf_time']:
            test_stat = stats.ks_1samp(sub_df[col],stats.norm.cdf)
            norm_dis_df.append([network,algo,col,test_stat.pvalue, test_stat.statistic])
norm_dis_df = pd.DataFrame(norm_dis_df,columns = ['network','algo','col','p_value','ks_stats'])
norm_dis_df['Hypothesis'] = norm_dis_df['p_value'].apply(lambda x : 'Normal' if x >0.05 else 'Not Normal')  

In [5]:
norm_dis_df

Unnamed: 0,network,algo,col,p_value,ks_stats,Hypothesis
0,ER,OG,node_idx,0.0,0.504005,Not Normal
1,ER,OG,second_inf,0.0,0.577882,Not Normal
2,ER,OG,inf_time,0.0,0.729283,Not Normal
3,ER,RW,node_idx,0.0,0.500979,Not Normal
4,ER,RW,second_inf,0.0,0.599776,Not Normal
5,ER,RW,inf_time,0.0,0.722369,Not Normal
6,ER,MHRW,node_idx,0.0,0.501144,Not Normal
7,ER,MHRW,second_inf,0.0,0.58214,Not Normal
8,ER,MHRW,inf_time,0.0,0.726433,Not Normal
9,SF,OG,node_idx,0.0,0.503989,Not Normal


### Two tail test

In [6]:
two_tail_dis_df = []
for network in ['ER','SF','SW']:
    sub_df = disease_df[(disease_df['network']==network)]
    for col in ['node_idx','second_inf','inf_time']:
        for algo1 in ['OG','RW','MHRW']:
            for algo2 in ['OG','RW','MHRW']:
                if algo1 != algo2:
                    U1, p = stats.mannwhitneyu(sub_df[sub_df['algo']==algo1][col],
                                                sub_df[sub_df['algo']==algo2][col]
                                               , method="asymptotic") 
                    # print(algo1,algo2, U1, p)
                    two_tail_dis_df.append([network,col,algo1,algo2, U1, p])
two_tail_dis_df = pd.DataFrame(two_tail_dis_df,columns = ['network','col','algo1','algo2','manwhit_stats','p_value'])
two_tail_dis_df['Hypothesis'] = two_tail_dis_df['p_value'].apply(lambda x : 'same distribution' if x >0.05 else 'diff distribution')  

In [7]:
two_tail_dis_df

Unnamed: 0,network,col,algo1,algo2,manwhit_stats,p_value,Hypothesis
0,ER,node_idx,OG,RW,511266800000.0,0.0,diff distribution
1,ER,node_idx,OG,MHRW,537714600000.0,0.0,diff distribution
2,ER,node_idx,RW,OG,695774600000.0,0.0,diff distribution
3,ER,node_idx,RW,MHRW,63121050000000.0,0.0,diff distribution
4,ER,node_idx,MHRW,OG,665050600000.0,0.0,diff distribution
5,ER,node_idx,MHRW,RW,56861370000000.0,0.0,diff distribution
6,ER,second_inf,OG,RW,252508500000.0,0.0,diff distribution
7,ER,second_inf,OG,MHRW,332969200000.0,0.0,diff distribution
8,ER,second_inf,RW,OG,954532900000.0,0.0,diff distribution
9,ER,second_inf,RW,MHRW,79695450000000.0,0.0,diff distribution


### One tail test

In [11]:
one_tail_dis_df = []
for network in ['ER','SF','SW']:
    sub_df = disease_df[(disease_df['network']==network)]
    for col in ['node_idx','second_inf','inf_time']:
        for algo1 in ['OG','RW','MHRW']:
            for algo2 in ['OG','RW','MHRW']:
                if algo1 != algo2:
                    U1, p = stats.mannwhitneyu(sub_df[sub_df['algo']==algo1][col],
                                                sub_df[sub_df['algo']==algo2][col]
                                               , method="asymptotic", alternative = 'less') 
                    # print(algo1,algo2, U1, p)
                    one_tail_dis_df.append([network,col,algo1,algo2, U1, p])
one_tail_dis_df = pd.DataFrame(one_tail_dis_df,columns = ['network','col','algo1','algo2','manwhit_stats','p_value'])
one_tail_dis_df['Hypothesis'] = one_tail_dis_df['p_value'].apply(lambda x : 'same distribution' if x > 0.05 else 'algo1<algo2')  

In [13]:
one_tail_dis_df

Unnamed: 0,network,col,algo1,algo2,manwhit_stats,p_value,Hypothesis
0,ER,node_idx,OG,RW,507272900000.0,0.0,algo1<algo2
1,ER,node_idx,OG,MHRW,610654500000.0,1.0,same distribution
2,ER,node_idx,RW,OG,699768500000.0,1.0,same distribution
3,ER,node_idx,RW,MHRW,69469340000000.0,1.0,same distribution
4,ER,node_idx,MHRW,OG,592110700000.0,6.1059589999999995e-19,algo1<algo2
5,ER,node_idx,MHRW,RW,50513090000000.0,0.0,algo1<algo2
6,ER,second_inf,OG,RW,256089000000.0,0.0,algo1<algo2
7,ER,second_inf,OG,MHRW,639235600000.0,1.0,same distribution
8,ER,second_inf,RW,OG,950952300000.0,1.0,same distribution
9,ER,second_inf,RW,MHRW,92172880000000.0,1.0,same distribution
