In [None]:
import os
import json
import statistics
import pickle
import pandas as pd
from pathlib import Path

In [None]:
def get_statistics(dir_path, source_target):
    '''
    Collect the evaluation measures of every run stored under `dir_path`.

    Every immediate entry of `dir_path` that contains a `results_summary.json`
    file contributes one value to each list of the returned dictionary. Entries
    without such a file (stray files, unfinished runs) are skipped instead of
    aborting the whole collection.

    Arguments
    -------
        dir_path (str or path-like): directory holding one sub-directory per run.
        A trailing slash is accepted but not required.

        source_target (str): label stored in the `source_target` list once per run found

    Returns
    -------
    Dictionary of equally sized lists keyed by `source_target` and by
    `at_cutoff_<measure>` / `at_max_f1_score_<measure>` for the measures
    `rank`, `precision`, `recall` and `f1_score`.

    Raises
    -------
    KeyError if a summary file lacks one of the expected sections or measures.
    '''
    measure_names = ('rank', 'precision', 'recall', 'f1_score')
    # (section name in the JSON file, prefix of the matching output keys)
    sections = (
        ('measures_at_cut_off', 'at_cutoff'),
        ('measures_at_max_f1_score', 'at_max_f1_score'),
    )

    measures_dict = {'source_target': []}
    for _, prefix in sections:
        for measure in measure_names:
            measures_dict[prefix + '_' + measure] = []

    # os.listdir gives no ordering guarantee; sorting makes the row order reproducible
    for run_name in sorted(os.listdir(dir_path)):
        summary_path = os.path.join(dir_path, run_name, 'results_summary.json')
        if not os.path.isfile(summary_path):
            continue

        with open(summary_path) as f:
            data = json.load(f)

        measures_dict['source_target'].append(source_target)
        for section, prefix in sections:
            section_measures = data[section]
            for measure in measure_names:
                measures_dict[prefix + '_' + measure].append(section_measures[measure])

    return measures_dict

# Approximate BC by varying the source and target nodes used

In [None]:
# Root folder expected to hold one `TUS_source_<source>_target_<target>/` directory per configuration
dir_main_path = 'network_analysis/figures/'
name_types = ['all', 'cell', 'attr']
dir_list = []

# One frame per configuration, concatenated once after the loop:
# `DataFrame.append` was removed in pandas 2.0 and re-copies the whole frame on every call.
stat_frames = []

for source in name_types:
    for target in name_types:
        source_target = source + '_' + target
        cur_dir = dir_main_path + 'TUS_source_' + source + '_target_' + target + '/'

        # Configurations without an output directory are silently left out
        if os.path.isdir(cur_dir):
            # Retrieve the stat dict and turn it into a dataframe
            stat_dict = get_statistics(cur_dir, source_target=source_target)
            df_temp = pd.DataFrame(stat_dict)
            stat_frames.append(df_temp)

# pd.concat rejects an empty list, so fall back to an empty frame when nothing was found
df = pd.concat(stat_frames, ignore_index=True) if stat_frames else pd.DataFrame()
df

In [None]:
# Per-configuration mean of every measure (one row per `source_target` value)
df_summary_mean = (
    df
    .groupby(by=['source_target'], as_index=False)
    .mean()
)
df_summary_mean

In [None]:
# Per-configuration standard deviation of every measure
df_summary_std = df.groupby(['source_target'], as_index=False).std()

# Every column except the grouping key gets a `_std` suffix
rename_dict = {
    column: column + '_std'
    for column in df_summary_std.columns
    if column != 'source_target'
}

df_summary_std = df_summary_std.rename(columns=rename_dict)
df_summary_std

In [None]:
# Join the mean and standard-deviation summaries on the configuration label,
# then keep only the columns listed below (each measure followed by its `_std` counterpart)
columns_to_select = [
    'source_target',
    'at_cutoff_f1_score', 'at_cutoff_f1_score_std',
    'at_max_f1_score_rank', 'at_max_f1_score_rank_std',
    'at_max_f1_score_precision', 'at_max_f1_score_precision_std',
    'at_max_f1_score_recall', 'at_max_f1_score_recall_std',
    'at_max_f1_score_f1_score', 'at_max_f1_score_f1_score_std',
]

df_summary = df_summary_mean.merge(df_summary_std, on="source_target")
df_summary = df_summary[columns_to_select]
df_summary

# Graph Analysis

In [None]:
import matplotlib
import matplotlib.pyplot as plt

# Default size applied to every figure created after this cell
plt.rcParams.update({"figure.figsize": (19, 9)})

# Bold, size-22 text for all figure fonts
font = dict(weight='bold', size=22)
matplotlib.rc('font', **font)

In [None]:
# Pickled inputs: the bipartite graph and a dataframe of graph statistics with ground truth
g_path = 'graph_construction/combined_graphs_output/TUS/bipartite/bipartite.graph'
df_path = 'network_analysis/output/TUS_source_all_target_all/seed1/graph_stats_with_groundtruth_df.pickle'

# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles produced by a trusted run of this project.
# The context managers close the file handles, which bare `open(...)` calls would leave open.
with open(g_path, 'rb') as graph_file:
    G = pickle.load(graph_file)

# NOTE: this rebinds `df`, replacing the per-run measures frame built earlier in the notebook
with open(df_path, 'rb') as stats_file:
    df = pickle.load(stats_file)

In [None]:
# Partition the graph's nodes by their `type` attribute
cell_nodes = [node for node, attrs in G.nodes(data=True) if attrs['type'] == 'cell']
attr_nodes = [node for node, attrs in G.nodes(data=True) if attrs['type'] == 'attr']

# Partition the nodes listed in `df` by their `is_homograph` flag
homograph_nodes = df.loc[df['is_homograph'] == True, 'node'].values
identical_nodes = df.loc[df['is_homograph'] == False, 'node'].values

In [None]:
def num_neighbors_list(G, nodes):
    '''
    Count the neighbors in graph `G` of every node in `nodes`.

    The result is aligned with `nodes`: its i-th entry is the number of
    neighbors of the i-th node, so both sequences have the same length.
    '''
    return [len(G[node]) for node in nodes]

In [None]:
# Neighbor counts for each node group, computed in the same order as the groups are listed
(
    cell_nodes_num_neighbors,
    attr_nodes_num_neighbors,
    homograph_nodes_num_neighbors,
    identical_nodes_num_neighbors,
) = [
    num_neighbors_list(G, node_group)
    for node_group in (cell_nodes, attr_nodes, homograph_nodes, identical_nodes)
]

In [None]:
# Report the mean and median neighbor count of each node group.
# The last label previously read "Unambiguous Nodes Nodes" (duplicated word).
neighbor_count_groups = [
    ('Cell Nodes', cell_nodes_num_neighbors),
    ('Attribute Nodes', attr_nodes_num_neighbors),
    ('Homograph Nodes', homograph_nodes_num_neighbors),
    ('Unambiguous Nodes', identical_nodes_num_neighbors),
]

for group_label, neighbor_counts in neighbor_count_groups:
    print(
        group_label, 'mean num neighbors:', statistics.mean(neighbor_counts),
        'median:', statistics.median(neighbor_counts)
    )

In [None]:
# Histogram of the neighbor counts of cell nodes, saved as SVG.
# An explicit figure/axes pair keeps this plot independent of whichever figure
# pyplot currently treats as active, so plots from other cells can never overlay it.
fig, ax = plt.subplots()
ax.hist(cell_nodes_num_neighbors, bins=1000)
ax.set_xlim([0, 550])
ax.set_ylabel('Frequency')
ax.set_xlabel('Number of Neighbors')
ax.set_yscale('log')
fig.tight_layout()
fig.savefig('network_analysis/figures/BC_source_target_nodes_analysis/cell_nodes_num_neighbors.svg')

In [None]:
# Histogram of the neighbor counts of attribute nodes, saved as SVG.
# An explicit figure/axes pair keeps this plot independent of whichever figure
# pyplot currently treats as active, so plots from other cells can never overlay it.
fig, ax = plt.subplots()
ax.hist(attr_nodes_num_neighbors, bins=400)
ax.set_ylabel('Frequency')
ax.set_xlabel('Number of Neighbors')
ax.set_yscale('log')
fig.tight_layout()
fig.savefig('network_analysis/figures/BC_source_target_nodes_analysis/attr_nodes_num_neighbors.svg')

In [None]:
# Side-by-side histograms of the neighbor counts of homograph and unambiguous nodes, saved as SVG.
# An explicit figure/axes pair keeps this plot independent of whichever figure
# pyplot currently treats as active, so plots from other cells can never overlay it.
fig, ax = plt.subplots()
ax.hist(
    [homograph_nodes_num_neighbors, identical_nodes_num_neighbors],
    bins=400,
    label=['homographs', 'unambiguous values'],
)
ax.set_xlim([0, 550])
ax.set_ylabel('Frequency')
ax.set_xlabel('Number of Neighbors')
ax.set_yscale('log')
ax.legend()
fig.tight_layout()
fig.savefig('network_analysis/figures/BC_source_target_nodes_analysis/homographs_identical_vals_num_neighbors.svg')