In [23]:
import os
import json
import pickle

import pandas as pd
import numpy as np
import shutil
import random
from tqdm import tqdm

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(rc={'figure.figsize':(25, 16)}, font_scale=2)
sns.set_style("whitegrid")

import sys
sys.path.insert(1, '../network_analysis/')
import utils

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Functions

In [20]:
def graph_stats_with_gt(df, gt_homographs):
    '''
    Returns an updated `df` that contains the `is_homograph` column
    '''
        
    is_homograph_list=[]
    for idx, row in tqdm(df.iterrows(), total=len(df)):
        if row['node_type']=='attr':
            is_homograph_list.append(np.nan)
        else:
            if row['node'] in gt_homographs:
                is_homograph_list.append(True)
            else:
                is_homograph_list.append(False)
    
    df['is_homograph']=is_homograph_list
    return df

# Analysis over Synthetic Benchmark Large 3

In [29]:
input_dir='../network_analysis/output/synthetic_example_large3/'
graph_dir='../graph_construction/combined_graphs_output/synthetic_benchmark_large3/'
with open('datasets/synthetic_benchmark_large3/selected_homographs.json') as f:
    gt_homographs=json.load(f)

num_homographs=100
modes=['homographs_traditional', 'homographs_symbolic_code', 'homographs_symbolic_numeric', 'homographs_null_equivalent']

eval_dfs={}

for mode in modes:
    graph_stats_df=pd.read_pickle(input_dir+mode+'_'+str(num_homographs)+'/graph_stats_df.pickle')
    
    # Get graph and consider only cell nodes with greater than 1 degree
    graph_stats_df = graph_stats_df[graph_stats_df['node_type']=='cell'].sort_values(by='approximate_betweenness_centrality', ascending=False)
    with open(graph_dir+mode+'_'+str(num_homographs)+'/bipartite/bipartite.graph', 'rb') as f:
        G=pickle.load(f)
    nodes_with_degree_greater_than_1 = [n for n in graph_stats_df['node'].values if G.degree[n] > 1]
    graph_stats_df = graph_stats_df.loc[graph_stats_df['node'].isin(nodes_with_degree_greater_than_1)]

    # Add ground truth in the dataframe and perform evaluation
    cur_gt_homographs=set(gt_homographs[mode])
    graph_stats_df = graph_stats_with_gt(df=graph_stats_df, gt_homographs=cur_gt_homographs)
    graph_stats_df = utils.calculate_measures(df=graph_stats_df, num_true_homographs=num_homographs)
    
    eval_dfs[mode]=graph_stats_df

with open('evaluation/synthetic_benchmark_large3/eval_dfs.pickle', 'wb') as f:
    pickle.dump(eval_dfs, f)

100%|██████████| 22969/22969 [00:01<00:00, 16809.56it/s]
100%|██████████| 32715/32715 [00:01<00:00, 16924.36it/s]
100%|██████████| 23480/23480 [00:01<00:00, 16285.55it/s]
100%|██████████| 22552/22552 [00:01<00:00, 15991.36it/s]


In [None]:
# Perform evaluation over all types (add type information over each homograph)