In [1]:
import os
import json
import statistics
import pandas as pd
from pathlib import Path

In [2]:
def get_statistics(dir_path, source_target):
    """Collect the evaluation measures of every run stored under ``dir_path``.

    Every sub-directory of ``dir_path`` is treated as one run and must contain
    a ``results_summary.json`` file holding two objects,
    ``measures_at_cut_off`` and ``measures_at_max_f1_score``, each with the
    keys ``rank``, ``precision``, ``recall`` and ``f1_score``.

    Parameters
    ----------
    dir_path : str or os.PathLike
        Directory whose sub-directories each hold one run. A trailing path
        separator is optional.
    source_target : str
        Label recorded once per run in the ``source_target`` list.

    Returns
    -------
    dict
        Maps ``'source_target'`` and the eight ``at_cutoff_*`` /
        ``at_max_f1_score_*`` measure names to lists with one entry per run.
        Runs are visited in sorted directory-name order.

    Raises
    ------
    FileNotFoundError
        If ``dir_path`` does not exist or a run directory has no
        ``results_summary.json``.
    KeyError
        If a summary file lacks one of the expected keys.
    """
    measure_names = ('rank', 'precision', 'recall', 'f1_score')
    # (column prefix, key of the matching object inside results_summary.json)
    sections = (
        ('at_cutoff', 'measures_at_cut_off'),
        ('at_max_f1_score', 'measures_at_max_f1_score'),
    )

    measures_dict = {'source_target': []}
    for prefix, _ in sections:
        for measure in measure_names:
            measures_dict[prefix + '_' + measure] = []

    # Sorted so the row order does not depend on the filesystem's listing order.
    for run_name in sorted(os.listdir(dir_path)):
        run_dir = os.path.join(dir_path, run_name)
        if not os.path.isdir(run_dir):
            # Stray files next to the run directories are not runs.
            continue

        with open(os.path.join(run_dir, 'results_summary.json')) as f:
            data = json.load(f)

        measures_dict['source_target'].append(source_target)
        for prefix, section_key in sections:
            section = data[section_key]
            for measure in measure_names:
                measures_dict[prefix + '_' + measure].append(section[measure])

    return measures_dict

# Approximate BC by varying the source and target nodes used

In [3]:
# Root directory holding one 'TUS_source_<source>_target_<target>/' folder
# per source/target combination.
dir_main_path = 'network_analysis/figures/'
name_types = ['all', 'cell', 'attr']
dir_list = []  # NOTE(review): not used in this cell; kept in case a later cell reads it

# One frame per source/target directory that exists on disk; combinations
# without a directory are skipped.
frames = []
for source in name_types:
    for target in name_types:
        source_target = source + '_' + target
        cur_dir = dir_main_path + 'TUS_source_' + source + '_target_' + target + '/'

        if os.path.isdir(cur_dir):
            # Retrieve the stat dict and turn it into a per-combination frame
            stat_dict = get_statistics(cur_dir, source_target=source_target)
            frames.append(pd.DataFrame(stat_dict))

# DataFrame.append was removed in pandas 2.0; concatenate once instead of
# growing the frame inside the loop. pd.concat rejects an empty list, so fall
# back to an empty frame when no directory was found.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
df

Unnamed: 0,source_target,at_cutoff_rank,at_cutoff_precision,at_cutoff_recall,at_cutoff_f1_score,at_max_f1_score_rank,at_max_f1_score_precision,at_max_f1_score_recall,at_max_f1_score_f1_score
0,all_all,26035,0.625543,0.625543,0.625543,29626,0.605954,0.689533,0.645048
1,all_all,26035,0.608143,0.608143,0.608143,23516,0.663846,0.599616,0.630098
2,all_all,26035,0.62416,0.62416,0.62416,26459,0.630183,0.640446,0.635273
3,all_all,26035,0.630075,0.630075,0.630075,29733,0.639996,0.730901,0.682434
4,all_all,26035,0.621894,0.621894,0.621894,29633,0.61499,0.699981,0.654739
5,all_cell,26035,0.595468,0.595468,0.595468,23227,0.665906,0.594085,0.627949
6,all_cell,26035,0.597657,0.597657,0.597657,23292,0.667654,0.597311,0.630527
7,all_cell,26035,0.598425,0.598425,0.598425,23321,0.667639,0.598041,0.630926
8,all_cell,26035,0.60991,0.60991,0.60991,23314,0.667496,0.597734,0.630692
9,all_cell,26035,0.59854,0.59854,0.59854,23319,0.667953,0.598272,0.631195


In [4]:
# Average every measure over the runs sharing a source/target combination
runs_by_source_target = df.groupby(['source_target'], as_index=False)
df_summary_mean = runs_by_source_target.mean()
df_summary_mean

Unnamed: 0,source_target,at_cutoff_rank,at_cutoff_precision,at_cutoff_recall,at_cutoff_f1_score,at_max_f1_score_rank,at_max_f1_score_precision,at_max_f1_score_recall,at_max_f1_score_f1_score
0,all_all,26035.0,0.621963,0.621963,0.621963,27793.4,0.630994,0.672095,0.649518
1,all_attr,26035.0,0.621932,0.621932,0.621932,27693.8,0.632672,0.671043,0.64968
2,all_cell,26035.0,0.6,0.6,0.6,23294.6,0.66733,0.597089,0.630258
3,attr_all,26035.0,0.619389,0.619389,0.619389,29802.0,0.614431,0.690855,0.643858
4,attr_attr,26035.0,0.618659,0.618659,0.618659,31894.0,0.594735,0.72653,0.652946
5,attr_cell,26035.0,0.599263,0.599263,0.599263,23355.0,0.667252,0.598556,0.631036
6,cell_all,26035.0,0.594769,0.594769,0.594769,22947.2,0.667626,0.588446,0.625538
7,cell_attr,26035.0,0.595245,0.595245,0.595245,22946.6,0.667635,0.588439,0.625537
8,cell_cell,26035.0,0.591127,0.591127,0.591127,22827.6,0.667191,0.584997,0.623394


In [5]:
# Spread (standard deviation) of every measure per source/target combination
std_by_source_target = df.groupby(['source_target'], as_index=False).std()

# Suffix every measure column with '_std'; the grouping key keeps its name
rename_dict = {
    column: column + '_std'
    for column in std_by_source_target.columns
    if column != 'source_target'
}

df_summary_std = std_by_source_target.rename(columns=rename_dict)
df_summary_std

Unnamed: 0,source_target,at_cutoff_rank_std,at_cutoff_precision_std,at_cutoff_recall_std,at_cutoff_f1_score_std,at_max_f1_score_rank_std,at_max_f1_score_precision_std,at_max_f1_score_recall_std,at_max_f1_score_f1_score_std
0,all_all,0.0,0.008283,0.008283,0.008283,2765.021392,0.022606,0.051949,0.020681
1,all_attr,0.0,0.008276,0.008276,0.008276,2958.615166,0.025725,0.053804,0.0205
2,all_cell,0.0,0.005675,0.005675,0.005675,39.513289,0.000813,0.001717,0.001315
3,attr_all,0.0,0.003175,0.003175,0.003175,6423.254899,0.063379,0.083867,0.006418
4,attr_attr,0.0,0.009062,0.009062,0.009062,2889.344995,0.02378,0.039606,0.007074
5,attr_cell,0.0,4.2e-05,4.2e-05,4.2e-05,132.789307,0.002532,0.00115,0.000487
6,cell_all,0.0,0.006713,0.006713,0.006713,106.175327,0.00212,0.003677,0.002808
7,cell_attr,0.0,0.007173,0.007173,0.007173,106.790917,0.002115,0.003686,0.002808
8,cell_cell,0.0,0.009304,0.009304,0.009304,96.053631,0.002172,0.003402,0.002686


In [6]:
# Join the mean and std tables and keep each reported measure next to its
# standard deviation.
reported_measures = [
    'at_cutoff_f1_score',
    'at_max_f1_score_rank',
    'at_max_f1_score_precision',
    'at_max_f1_score_recall',
    'at_max_f1_score_f1_score',
]

columns_to_select = ['source_target']
for measure in reported_measures:
    columns_to_select.extend([measure, measure + '_std'])

df_merged = df_summary_mean.merge(df_summary_std, on="source_target")
df_summary = df_merged[columns_to_select]
df_summary

Unnamed: 0,source_target,at_cutoff_f1_score,at_cutoff_f1_score_std,at_max_f1_score_rank,at_max_f1_score_rank_std,at_max_f1_score_precision,at_max_f1_score_precision_std,at_max_f1_score_recall,at_max_f1_score_recall_std,at_max_f1_score_f1_score,at_max_f1_score_f1_score_std
0,all_all,0.621963,0.008283,27793.4,2765.021392,0.630994,0.022606,0.672095,0.051949,0.649518,0.020681
1,all_attr,0.621932,0.008276,27693.8,2958.615166,0.632672,0.025725,0.671043,0.053804,0.64968,0.0205
2,all_cell,0.6,0.005675,23294.6,39.513289,0.66733,0.000813,0.597089,0.001717,0.630258,0.001315
3,attr_all,0.619389,0.003175,29802.0,6423.254899,0.614431,0.063379,0.690855,0.083867,0.643858,0.006418
4,attr_attr,0.618659,0.009062,31894.0,2889.344995,0.594735,0.02378,0.72653,0.039606,0.652946,0.007074
5,attr_cell,0.599263,4.2e-05,23355.0,132.789307,0.667252,0.002532,0.598556,0.00115,0.631036,0.000487
6,cell_all,0.594769,0.006713,22947.2,106.175327,0.667626,0.00212,0.588446,0.003677,0.625538,0.002808
7,cell_attr,0.595245,0.007173,22946.6,106.790917,0.667635,0.002115,0.588439,0.003686,0.625537,0.002808
8,cell_cell,0.591127,0.009304,22827.6,96.053631,0.667191,0.002172,0.584997,0.003402,0.623394,0.002686
