In [None]:
import pickle
import pandas as pd
import numpy as np
import networkx as nx
import itertools

In [None]:
def calculate_measures(df, measure, num_true_homographs):
    '''
    Ranks the nodes by `measure` (high->low) and computes running top-k precision, recall and F1-score.

    A sorted copy of the dataframe is returned with five new columns:
    `measure`_rank (1..n position in the ranking), `measure`_dense_rank (dense rank of the score, ties share a rank),
    `measure`_precision, `measure`_recall and `measure`_f1_score, where the value in row k is the score obtained
    when the top-k nodes are predicted to be homographs. The dataframe passed in is not modified.

    df is a dataframe with a column named `measure` and an `is_homograph` column (evaluated by truthiness)

    measure a string specifying the column to be used for scoring

    num_true_homographs is a positive integer specifying the number of true homographs
    in the dataframe based on the ground truth

    Raises ValueError if num_true_homographs is not positive (recall would be undefined)
    '''
    if num_true_homographs <= 0:
        raise ValueError('num_true_homographs must be positive, got ' + str(num_true_homographs))

    # Sort by the specified measure (high->low); sort_values returns a new frame so the caller's df is untouched
    ranked = df.sort_values(by=[measure], ascending=False)
    num_nodes = ranked.shape[0]
    top_k = np.arange(1, num_nodes + 1, dtype=np.int64)

    ranked[measure + '_rank'] = top_k
    ranked[measure + '_dense_rank'] = ranked[measure].rank(method='dense', ascending=False)

    # Running count of homographs among the top-k nodes for every k.
    # bool() per value keeps the truthiness semantics of a plain `if value:` check.
    is_hit = np.fromiter((bool(flag) for flag in ranked['is_homograph']), dtype=bool, count=num_nodes)
    hits_so_far = np.cumsum(is_hit)

    precision = hits_so_far / top_k
    recall = hits_so_far / num_true_homographs

    # F1 is 0/0 while no homograph has been seen yet; define it as 0 there instead of
    # raising ZeroDivisionError (Python numbers) or producing NaN (numpy numbers).
    denominator = precision + recall
    f1_score = np.zeros(num_nodes, dtype=float)
    np.divide(2 * precision * recall, denominator, out=f1_score, where=denominator > 0)

    ranked[measure + '_precision'] = precision
    ranked[measure + '_recall'] = recall
    ranked[measure + '_f1_score'] = f1_score
    return ranked

In [None]:
df_path = 'output/TUS_source_all_target_all/seed1/graph_stats_with_groundtruth_df.pickle'
# graph_path is defined here but not read anywhere in this cell
graph_path = '../graph_construction/combined_graphs_output/TUS/bipartite/bipartite.graph'

# NOTE(security): unpickling can execute arbitrary code; only load pickles produced by a trusted pipeline.
# The context manager closes the file handle as soon as the dataframe has been loaded.
with open(df_path, 'rb') as df_file:
    df = pickle.load(df_file)

# Keep only the rows whose node_type is 'cell'
df = df[df['node_type'] == 'cell']
df

In [None]:
# Score columns of df that are each evaluated as a ranking of homograph candidates
measures = [
    'approximate_betweenness_centrality',
    'katz',
    'harmonic_closeness',
    'pagerank',
]

# Ground-truth number of rows flagged as homographs
num_true_homographs = df['is_homograph'].value_counts()[True]

# Every pass returns df re-sorted by the current measure with that measure's
# rank / precision / recall / f1_score columns added
for measure in measures:
    df = calculate_measures(df, measure, num_true_homographs)
df


In [None]:
# Report, for every measure, the F1-score obtained when exactly num_true_homographs nodes are retrieved
for measure in measures:
    at_cutoff = df[measure+'_rank'] == num_true_homographs
    f1_at_cutoff = df.loc[at_cutoff, measure+'_f1_score'].values[0]
    print('For', measure, 'at k =', num_true_homographs, 'the f1-score is:', f1_at_cutoff)

In [None]:
# Dense ranks increase by one per distinct score, so the largest dense rank
# is the number of distinct score values for that measure
for measure in measures:
    dense_rank_column = measure+'_dense_rank'
    distinct_scores = df[dense_rank_column].max()
    print('For', measure, 'there are', distinct_scores, 'distinct scores')

# Precision/Recall/F1-score Figures

In [None]:
import matplotlib
import matplotlib.pyplot as plt

# Defaults applied to figures created after this cell: a large canvas ...
plt.rcParams.update({"figure.figsize": (22, 12)})

# ... and a large base font size
font = dict(size=22)
matplotlib.rc('font', **font)

In [None]:
# Precision at every rank cut-off, one curve per centrality measure
ranks = list(range(1,df.shape[0] +1))

measure_names = ['Approximate BC', 'Katz Centrality', 'Harmonic Closeness', 'Pagerank']

# Draw on an explicitly created figure instead of pyplot's implicit "current figure", so the
# curves cannot end up on a figure left open by another cell (non-inline backends, script export).
fig, ax = plt.subplots()
for (measure, measure_name) in zip(measures, measure_names):
    precision_by_rank = df.sort_values(by=[measure+'_rank'])[measure+'_precision'].values
    ax.plot(ranks, precision_by_rank, label=measure_name, linewidth=2)

# Dashed vertical marker at k = num_true_homographs; the label is placed 5000 ranks to its right
ax.axvline(x=num_true_homographs, color='black', linestyle='--')
ax.text(num_true_homographs + 5000, 0.90,'Number of true homographs cut-off line', fontsize=26)

ax.set_xlabel('Rank')
ax.set_ylabel('Precision')
leg = ax.legend()
# Thicker legend handles than the plotted curves
for line in leg.get_lines():
    line.set_linewidth(4.0)
fig.tight_layout()
fig.savefig('figures/centrality_analysis/precision.svg')

In [None]:
# Recall at every rank cut-off, one curve per centrality measure.
# Draw on an explicitly created figure instead of pyplot's implicit "current figure", so the
# curves cannot end up on a figure left open by another cell (non-inline backends, script export).
fig, ax = plt.subplots()
for (measure, measure_name) in zip(measures, measure_names):
    by_rank = df.sort_values(by=[measure+'_rank'])
    # Take the x values from the measure's own rank column rather than the notebook-level `ranks`
    # list, which another cell rebinds to a different length.
    ax.plot(by_rank[measure+'_rank'].values, by_rank[measure+'_recall'].values, label=measure_name, linewidth=2)

# Dashed vertical marker at k = num_true_homographs; the label is placed 5000 ranks to its right
ax.axvline(x=num_true_homographs, color='black', linestyle='--')
ax.text(num_true_homographs + 5000, 0.90,'Number of true homographs cut-off line', fontsize=26)

ax.set_xlabel('Rank')
ax.set_ylabel('Recall')
leg = ax.legend()
# Thicker legend handles than the plotted curves
for line in leg.get_lines():
    line.set_linewidth(4.0)
fig.tight_layout()
fig.savefig('figures/centrality_analysis/recall.svg')

In [None]:
# F1-score at every rank cut-off, one curve per centrality measure.
# Draw on an explicitly created figure instead of pyplot's implicit "current figure", so the
# curves cannot end up on a figure left open by another cell (non-inline backends, script export).
fig, ax = plt.subplots()
for (measure, measure_name) in zip(measures, measure_names):
    by_rank = df.sort_values(by=[measure+'_rank'])
    # Take the x values from the measure's own rank column rather than the notebook-level `ranks`
    # list, which another cell rebinds to a different length.
    ax.plot(by_rank[measure+'_rank'].values, by_rank[measure+'_f1_score'].values, label=measure_name, linewidth=2)

# Dashed vertical marker at k = num_true_homographs; the label is placed 5000 ranks to its right
ax.axvline(x=num_true_homographs, color='black', linestyle='--')
ax.text(num_true_homographs + 5000, 0.70,'Number of true homographs cut-off line', fontsize=26)

# Show the full [0, 1] range of the F1-score
ax.set_ylim(0, 1)
ax.set_xlabel('Rank')
ax.set_ylabel('F1-Score')
leg = ax.legend()
# Thicker legend handles than the plotted curves
for line in leg.get_lines():
    line.set_linewidth(4.0)
fig.tight_layout()
fig.savefig('figures/centrality_analysis/f1_score.svg')

In [None]:
# Precision (zoomed in): only the first 5000 ranks of every measure.
# Clamp to the number of rows so the x and y arrays have the same length
# even when df holds fewer than 5000 nodes.
num_top_ranks = min(5000, df.shape[0])
# NOTE: this rebinds the notebook-level `ranks` used by the precision cell above to a shorter list
ranks = list(range(1, num_top_ranks + 1))

measure_names = ['Approximate BC', 'Katz Centrality', 'Harmonic Closeness', 'Pagerank']

# Draw on an explicitly created figure instead of pyplot's implicit "current figure", so the
# curves cannot end up on a figure left open by another cell (non-inline backends, script export).
fig, ax = plt.subplots()
for (measure, measure_name) in zip(measures, measure_names):
    vals = df.sort_values(by=[measure+'_rank'])[measure+'_precision'].values[0:num_top_ranks]
    ax.plot(ranks, vals, label=measure_name, linewidth=2)

ax.set_xlabel('Rank')
ax.set_ylabel('Precision')
leg = ax.legend()
# Thicker legend handles than the plotted curves
for line in leg.get_lines():
    line.set_linewidth(4.0)
fig.tight_layout()
fig.savefig('figures/centrality_analysis/precision_zoom.svg')