In [1]:
import pandas as pd
import os
import json
import pickle
from sklearn import metrics
from tqdm import tqdm

# Functions

In [2]:
def get_cluster_assignment(files_path):
    '''
    Returns a dictionary assigning each column_filename of a homograph to a cluster id

    Algorithm: Initialy each file is in a separate cluster ID. Clusters IDs merge if the pair is considered not to be homoraphs
    '''
    file_to_cluster_id={}
    cur_id=0
    for pair in os.listdir(files_path):        
        file1=pair.split('___')[0] + '.csv'
        file2=pair.split('___')[1].split('.')[0] + '.csv'

        if file1 not in file_to_cluster_id:
            file_to_cluster_id[file1]=cur_id
            cur_id+=1
        if file2 not in file_to_cluster_id:
            file_to_cluster_id[file2]=cur_id
            cur_id+=1
    
    for pair in os.listdir(files_path):
        with open(files_path+'/'+pair, 'r') as f:
            is_homograph=json.load(f)['is_homograph']
        file1=pair.split('___')[0] + '.csv'
        file2=pair.split('___')[1].split('.')[0] + '.csv'

        if is_homograph=='False':
            val_to_set=min(file_to_cluster_id[file1], file_to_cluster_id[file2])
            file_to_cluster_id[file1]=val_to_set
            file_to_cluster_id[file2]=val_to_set
    
    return file_to_cluster_id

def get_clustering_evaluation_score(labels_pred, idx_to_nodes, G, measure='adj_rand_index'):
    '''
    Given `labels_pred` which is an assignment of types for the attributes of a node, derive the groundtruth
    using `idx_to_nodes` and `G` and return the specified clustering evaluation `measure` 

    Allowed measures: ['adj_rand_index', 'adj_mutual_info', 'norm_mutual_info']
    '''
    # Derive the groundtruth labels by getting the column name for each attribute in the idx_to_nodes list
    col_names = [G.nodes[attr]['column_name'] for attr in idx_to_nodes.values()]
    col_name_to_label_dict = dict([(y,x) for x,y in enumerate(sorted(set(col_names)))])
    labels_true = [col_name_to_label_dict[x] for x in col_names]
    
    if measure == 'adj_rand_index':
        score = metrics.adjusted_rand_score(labels_true=labels_true, labels_pred=labels_pred)
    elif measure == 'adj_mutual_info':
        score = metrics.adjusted_mutual_info_score(labels_true=labels_true, labels_pred=labels_pred)
    elif measure == 'norm_mutual_info':
        score = metrics.normalized_mutual_info_score(labels_true=labels_true, labels_pred=labels_pred)

    return score

# Cluster Construction

In [3]:
input_dir='gpt_output/synthetic_benchmark_large/'
g_path = '../graph_construction/combined_graphs_output/synthetic_benchmark_large/bipartite/bipartite.graph'
G = pickle.load(open(g_path, "rb"))

node_to_idx_dict_path='../network_analysis/output/synthetic_example_large/node_to_idx_dict.json'
with open(node_to_idx_dict_path, 'r') as f:
    node_to_idx_dict=json.load(f)

In [4]:
# Populate the `hom_to_cluster_ids_dict`
hom_to_cluster_ids_dict={}
for hom in os.listdir(input_dir):
    cur_cluster_ids_dict=get_cluster_assignment(files_path=input_dir+hom)
    hom_to_cluster_ids_dict[hom]=cur_cluster_ids_dict

In [5]:
# Store the chatGPT clusters in readable form
output_dir='readable_clusters/synthetic_benchmark_large/gpt/'
for hom in tqdm(hom_to_cluster_ids_dict):
    cluster_id_to_columns_dict={}
    for idx, val in node_to_idx_dict[hom].items():
        if val not in hom_to_cluster_ids_dict[hom]:
            cur_cluster_id=0
        else:
            cur_cluster_id=hom_to_cluster_ids_dict[hom][val]
        column_name=G.nodes[val]['column_name']

        if cur_cluster_id not in cluster_id_to_columns_dict:
            cluster_id_to_columns_dict[cur_cluster_id]=[column_name]
        else:
            cluster_id_to_columns_dict[cur_cluster_id].append(column_name)

    with open(output_dir+hom+'.json', 'w') as f:
        json.dump(cluster_id_to_columns_dict, f, indent=4)

100%|██████████| 180/180 [00:00<00:00, 3140.47it/s]


In [6]:
# Store the KDE computed clusters in readable form
output_dir='readable_clusters/synthetic_benchmark_large/kde/'
with open('../network_analysis/output/synthetic_example_large/kde_labels.pickle', 'rb') as f:
    kde_labels=pickle.load(f)

for hom in tqdm(kde_labels):
    cluster_id_to_columns_dict={}
    for i in range(len(kde_labels[hom])):
        column_name=G.nodes[node_to_idx_dict[hom][str(i)]]['column_name']
        cur_cluster_id=int(kde_labels[hom][i])
        
        if cur_cluster_id not in cluster_id_to_columns_dict:
            cluster_id_to_columns_dict[cur_cluster_id]= [column_name]
        else:
            cluster_id_to_columns_dict[cur_cluster_id].append(column_name)
        
    with open(output_dir+hom+'.json', 'w') as f:
        json.dump(cluster_id_to_columns_dict, f, indent=4)

100%|██████████| 180/180 [00:00<00:00, 3314.74it/s]


# Evaluation

In [7]:
homograph_eval_dict={}
# Perform Evaluation over each homograph
for hom in tqdm(hom_to_cluster_ids_dict):
    # Build the `labels_pred` in the order it appears in `node_to_idx_dict`
    labels_pred=[]
    for idx, val in node_to_idx_dict[hom].items():
        if val not in hom_to_cluster_ids_dict[hom]:
            labels_pred.append(0)
        else:
            labels_pred.append(hom_to_cluster_ids_dict[hom][val])
    adj_rand_idx_gpt=get_clustering_evaluation_score(labels_pred=labels_pred, idx_to_nodes=node_to_idx_dict[hom], G=G, measure='adj_rand_index')
    num_meanings_gpt=len(set(labels_pred))
    homograph_eval_dict[hom]={'adj_rand_index': adj_rand_idx_gpt, 'num_meanings': num_meanings_gpt}

 24%|██▍       | 43/180 [00:00<00:00, 421.02it/s]

100%|██████████| 180/180 [00:00<00:00, 433.44it/s]


In [8]:
# Update the eval_df to contain evaluation values for gpt extraction 
eval_df=pd.read_pickle('../network_analysis/output/synthetic_example_large/homographs_clustering_eval_df.pickle')
num_meanings_gpt_list=[]
is_gpt_num_meanings_correct_list=[]
gpt_adj_rand_index=[]
for idx, row in eval_df.iterrows():
    num_meanings=homograph_eval_dict[row['node']]['num_meanings']
    is_gpt_num_meanings_correct=row['num_meanings_groundtruth']==num_meanings
    adj_rand_idx=homograph_eval_dict[row['node']]['adj_rand_index']
    num_meanings_gpt_list.append(num_meanings)
    is_gpt_num_meanings_correct_list.append(is_gpt_num_meanings_correct)
    gpt_adj_rand_index.append(adj_rand_idx)

eval_df['num_meanings_gpt']=num_meanings_gpt_list
eval_df['is_gpt_num_meanings_correct']=is_gpt_num_meanings_correct_list
eval_df['gpt_adj_rand_index']=gpt_adj_rand_index
eval_df

Unnamed: 0,node,node_type,betweenness_centrality,is_homograph,dense_rank,num_meanings_groundtruth,num_meanings_greatest,num_meanings_kde,epsilon_greatest,epsilon_kde,...,is_greatest_num_meanings_correct,greatest_adj_rand_index,greatest_adj_mutual_info,greatest_norm_mutual_info,kde_adj_rand_index,kde_adj_mutual_info,kde_norm_mutual_info,num_meanings_gpt,is_gpt_num_meanings_correct,gpt_adj_rand_index
7590,Palm,cell,1.445700e-02,True,1.0,2,2.0,2.0,0.970414,0.974673,...,True,1.0,1.0,1.0,1.0,1.0,1.0,3,False,0.333333
25825,Clementine,cell,8.975958e-03,True,2.0,2,2.0,2.0,0.812009,0.830499,...,True,1.0,1.0,1.0,1.0,1.0,1.0,1,False,0.000000
7130,Magnolia,cell,8.132084e-03,True,3.0,2,2.0,2.0,,,...,True,1.0,0.0,1.0,1.0,0.0,1.0,1,False,0.000000
3077,Jaguar,cell,8.049376e-03,True,4.0,2,2.0,2.0,0.692718,0.825997,...,True,1.0,1.0,1.0,1.0,1.0,1.0,5,False,-0.129032
7416,Timothy,cell,6.921700e-03,True,5.0,2,2.0,2.0,,,...,True,1.0,0.0,1.0,1.0,0.0,1.0,2,True,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2000,MD,cell,2.103537e-07,True,275.0,2,1.0,2.0,0.908497,0.469551,...,False,0.0,0.0,0.0,1.0,1.0,1.0,6,False,0.000000
1828,SD,cell,2.059766e-07,True,276.0,2,1.0,2.0,0.913907,0.459670,...,False,0.0,0.0,0.0,1.0,1.0,1.0,6,False,0.000000
2004,ME,cell,1.214843e-07,True,314.0,2,1.0,2.0,0.907285,0.438742,...,False,0.0,0.0,0.0,1.0,1.0,1.0,3,False,0.333333
1950,NC,cell,7.969017e-08,True,329.0,2,1.0,2.0,0.912752,0.358959,...,False,0.0,0.0,0.0,1.0,1.0,1.0,3,False,0.000000


In [9]:
kde_precision = eval_df['is_kde_num_meanings_correct'].value_counts()[True] / len(eval_df.index)
gpt_precision = eval_df['is_gpt_num_meanings_correct'].value_counts()[True] / len(eval_df.index)
print("Number of meanings AVG precision using KDE is:", kde_precision, "and using chatGPT is:", gpt_precision)

Number of meanings AVG precision using KDE is: 0.9722222222222222 and using chatGPT is: 0.45555555555555555


In [10]:
print("AVG adj rand index using KDE is:", eval_df['kde_adj_rand_index'].mean(), "and using chatGPT is:", eval_df['gpt_adj_rand_index'].mean())

AVG adj rand index using KDE is: 0.9911270945693412 and using chatGPT is: 0.43870604794489404


## Per Pair Evaluation

In [36]:
def get_pair_prediction(pair, pred_labels, node_to_idx_dict):
    '''
    Given a pair (column_filename, column_filename) return two booleans
    
    1) the gt (i.e., part of the same or not cluster)
    2) whether DomainNet predicted that they are part of the same or different cluster
    '''
    file1=pair.split('___')[0] + '.csv'
    file2=pair.split('___')[1].split('.')[0] + '.csv'

    filename_to_idx={}
    for idx, val in node_to_idx_dict.items():
        filename_to_idx[val]=int(idx)
    
    # GT relation between two files
    col1, col2 = G.nodes[file1]['column_name'], G.nodes[file2]['column_name']
    if col1 == col2:
        gt=False
    else:
        gt=True

    # DomainNet Prediction
    file1_pred=pred_labels[filename_to_idx[file1]]
    file2_pred=pred_labels[filename_to_idx[file2]]
    if file1_pred == file2_pred:
        pred=False
    else:
        pred=True

    return gt, pred

In [41]:
# Perform a per-pair evaluation between GPT and DomainNet
gpt_path='gpt_output/synthetic_benchmark_large/'
node_to_idx_dict_path='../network_analysis/output/synthetic_example_large/node_to_idx_dict.json'
with open('../network_analysis/output/synthetic_example_large/kde_labels.pickle', 'rb') as f:
    kde_labels=pickle.load(f)
with open(node_to_idx_dict_path, 'r') as f:
    node_to_idx_dict=json.load(f)
g_path = '../graph_construction/combined_graphs_output/synthetic_benchmark_large/bipartite/bipartite.graph'
G = pickle.load(open(g_path, "rb"))

pair_eval_dict={"homograph": [], "pair": [], "is_gpt_pred_correct": [], "is_domain_net_pred_correct": [], "gpt_pred": [], "domain_net_pred": [], "gt": []}
for hom in tqdm(os.listdir(gpt_path)):
    for pair in os.listdir(gpt_path+hom+'/'):
        with open(gpt_path+hom+'/'+pair, 'r') as f:
            gpt_pred=json.load(f)['is_homograph']
        if gpt_pred == 'True':
            gpt_pred=True
        else:
            gpt_pred=False
        
        # Get GT and DomainNet Predictions
        gt, domain_net_pred = get_pair_prediction(pair=pair, pred_labels=kde_labels[hom], node_to_idx_dict=node_to_idx_dict[hom])
        
        if gpt_pred==gt:
            is_gpt_pred_correct=True
        else:
            is_gpt_pred_correct=False
        if domain_net_pred==gt:
            is_domain_net_pred_correct=True
        else:
            is_domain_net_pred_correct=False
        
        pair_eval_dict['homograph'].append(hom)
        pair_eval_dict['pair'].append(pair)
        pair_eval_dict['is_gpt_pred_correct'].append(is_gpt_pred_correct)
        pair_eval_dict['is_domain_net_pred_correct'].append(is_domain_net_pred_correct)
        pair_eval_dict['gpt_pred'].append(gpt_pred)
        pair_eval_dict['domain_net_pred'].append(domain_net_pred)
        pair_eval_dict['gt'].append(gt)

pair_eval_df = pd.DataFrame.from_dict(pair_eval_dict)
pair_eval_df

 39%|███▉      | 70/180 [00:00<00:00, 601.09it/s]

100%|██████████| 180/180 [00:00<00:00, 769.01it/s]


Unnamed: 0,homograph,pair,is_gpt_pred_correct,is_domain_net_pred_correct,gpt_pred,domain_net_pred,gt
0,Irving,first_name_personal_first_name_last_name_ssn_s...,True,True,True,True,True
1,Irving,city_location_city_country_3___city_location_c...,False,True,True,False,False
2,Irving,first_name_personal_first_name_last_name_ssn_s...,True,True,True,True,True
3,Thorpe,first_name_personal_first_name_last_name_ssn_s...,False,True,True,False,False
4,Thorpe,first_name_personal_first_name_last_name_ssn_s...,True,True,True,True,True
...,...,...,...,...,...,...,...
1576,Clementine,grocery_product_grocery_country_2___grocery_pr...,False,True,True,False,False
1577,Luther,last_name_personal_first_name_last_name_ssn_st...,True,True,True,True,True
1578,Denver,first_name_personal_first_name_last_name_ssn_s...,True,True,True,True,True
1579,Denver,first_name_personal_first_name_last_name_ssn_s...,True,True,True,True,True


In [47]:
gpt_pred_accuracy=pair_eval_df['is_gpt_pred_correct'].value_counts()[True] / len(pair_eval_df)
domain_net_accuracy=pair_eval_df['is_domain_net_pred_correct'].value_counts()[True] / len(pair_eval_df)
print("AVG GPT pair evaluation accuracy:", gpt_pred_accuracy, "AVG DomainNet pair evaluation accuracy:", domain_net_accuracy)

AVG GPT pair evaluation accuracy: 0.4845034788108792 AVG DomainNet pair evaluation accuracy: 0.9955724225173941


In [48]:
with open('tus_icl_examples.pickle', 'rb') as f:
    examples=pickle.load(f)
examples

{'icl_ind_0': 'Table:\n18772|16032|"Birch Hills"|0|0|0|25|1|PNR|SK|101|"Tisdale - CN"\nTable:\n"No summary - Aucun sommaire"|"Electrical and electronic machinery and equipment (including computer hardware)"|"Computing and Information Sciences - A"\nUnionable: no',
 'icl_ind_1': 'Table:\n4693|12658|"Hwy  665"|50.0175|-92.8846|"ON -Ministry of Transportation"|Passive|0|19|CN|50|50|2|1|PNR|MB|48.46|Redditt\nTable:\n"No summary - Aucun sommaire"|"Electrical and electronic machinery and equipment (including computer hardware)"|"Computing and Information Sciences - A"\nUnionable: no',
 'icl_ind_2': 'Table:\n18772|16032|"Birch Hills"|0|0|0|25|1|PNR|SK|101|"Tisdale - CN"\nTable:\n"No summary - Aucun sommaire"|"Computer communications"|"Computing and Information Sciences - A"\nUnionable: no',
 'icl_ind_3': 'Table:\n12658|"Hwy  665"|-92.8846|"ON -Ministry of Transportation"|Passive|0|0|CN|50|55|2|1|MB|48.46|Redditt|\nTable:\n"No summary - Aucun sommaire"|"Electrical and electronic machinery and 

# Testing

In [6]:
num_pairs_per_homograph_dict={"homograph": [], "count": []}
dir="gpt_queries/synthetic_benchmark_large/gpt_formed_query/"
for hom in os.listdir(dir):
    num_pairs_per_homograph_dict['homograph'].append(hom)
    num_pairs_per_homograph_dict['count'].append(len(os.listdir(dir+hom)))
num_pairs_per_homograph=pd.DataFrame.from_dict(num_pairs_per_homograph_dict)
num_pairs_per_homograph.sort_values(by='count', ascending=False).head(50)

Unnamed: 0,homograph,count
69,Cuba,105
169,Jamaica,91
59,Georgia,78
98,ID,45
138,AZ,45
29,PA,45
171,CA,45
22,MA,45
166,TN,45
165,AL,45
