# Jupyter Notebook used to extract the statistics of each dataset used

Before running this notebook, ensure that you have already run all the experiments (i.e., executed the scripts `synthetic_benchmark.sh`, `TUS_benchmark.sh`, and `TUS_injection_cardinality.sh`).

In [5]:
import math
import pickle
import statistics

import networkx as nx
from tqdm import tqdm

import utils

# Synthetic Dataset

In [6]:
# Statistics for the synthetic benchmark: load the bipartite graph, split its
# nodes by type, and compute the cardinality of every ground-truth homograph.

# NOTE: pickle.load can execute arbitrary code. Only load files generated by
# this project's own experiment scripts.
# NOTE(review): the graph is read from 'synthetic_benchmark_bipartite' while the
# stats dataframe is read from 'synthetic_example_bipartite' -- confirm both
# refer to the same dataset.
with open('../graph_construction/combined_graphs_output/synthetic_benchmark_bipartite/bipartite/bipartite.graph', 'rb') as graph_file:
    synthetic_graph = pickle.load(graph_file)

# Partition the graph nodes by their 'type' attribute.
cell_nodes = {n for n, d in synthetic_graph.nodes(data=True) if d['type'] == 'cell'}
attribute_nodes = {n for n, d in synthetic_graph.nodes(data=True) if d['type'] == 'attr'}

# Per-node statistics dataframe and the ground truth used to label its nodes.
with open('output/synthetic_example_bipartite/graph_stats_df.pickle', 'rb') as stats_file:
    synthetic_df = pickle.load(stats_file)
with open('ground_truth/synthetic_example_groundtruth_dict.pickle', 'rb') as groundtruth_file:
    groundtruth_synthetic = pickle.load(groundtruth_file)

# Label every node through the ground truth, then keep the nodes labelled 'homograph'.
synthetic_df['is_homograph'] = synthetic_df['node'].map(groundtruth_synthetic)
homographs_list = synthetic_df.loc[synthetic_df['is_homograph'] == 'homograph', 'node'].values

# Map each homograph to the cardinality reported by the project helper.
cardinality_of_homographs = {
    homograph: utils.graph_helpers.get_cardinality_of_homograph(synthetic_graph, homograph)
    for homograph in homographs_list
}

In [7]:
# Summary statistics for the synthetic benchmark graph.
cardinality_of_homographs_list = [*cardinality_of_homographs.values()]

# Each value is wrapped in a thunk so it is only computed right before its own
# line is printed, one row at a time.
synthetic_summary = (
    ('Nodes:', lambda: synthetic_graph.number_of_nodes()),
    ('Edges:', lambda: synthetic_graph.number_of_edges()),
    ('Cell Values:', lambda: len(cell_nodes)),
    ('Attributes:', lambda: len(attribute_nodes)),
    ('Number of homographs:', lambda: len(homographs_list)),
    ('Minimum cardinality of a homograph:', lambda: min(cardinality_of_homographs_list)),
    ('Maximum cardinality of a homograph:', lambda: max(cardinality_of_homographs_list)),
)
for label, compute in synthetic_summary:
    print(label, compute())

Nodes: 17672
Edges: 19473
Cell Values: 17633
Attributes: 39
Number of homographs: 55
Minimum cardinality of a homograph: 151
Maximum cardinality of a homograph: 1966


# Table Union Search - With Injection (No Homographs)

In [11]:
# Statistics for the TUS dataset with injection (no homographs): load the
# bipartite graph, split its nodes by type, and display the node statistics.

# NOTE: pickle.load can execute arbitrary code. Only load files generated by
# this project's own experiment scripts.
with open('../graph_construction/combined_graphs_output/TUS_no_homographs/bipartite/bipartite.graph', 'rb') as graph_file:
    TUS_injected_graph = pickle.load(graph_file)

# Partition the graph nodes by their 'type' attribute.
cell_nodes = {n for n, d in TUS_injected_graph.nodes(data=True) if d['type'] == 'cell'}
attribute_nodes = {n for n, d in TUS_injected_graph.nodes(data=True) if d['type'] == 'attr'}

# Node statistics (with ground truth) for the TUS no-homographs dataset.
with open('output/TUS_no_homographs/graph_stats_with_groundtruth_df.pickle', 'rb') as stats_file:
    TUS_injected_df = pickle.load(stats_file)
TUS_injected_df

Unnamed: 0,node,node_type,approximate_betweenness_centrality,is_homograph,has_missing_key
0,Branch name_t_1934eacab8c57857____c10_0____0.csv,attr,0.000000e+00,,
1,Organisation class_t_1934eacab8c57857____c10_0...,attr,0.000000e+00,,
2,"Organisation type (location, profit/not-for-pr...",attr,3.541735e-13,,
3,PBA type_t_1934eacab8c57857____c10_0____0.csv,attr,2.062540e-12,,
4,OGM Asia Pacific,cell,0.000000e+00,False,False
...,...,...,...,...,...
168875,SOIL_CODE_t_ece0594e0480a1ed____c7_0____3.csv,attr,2.118676e-07,,
168876,SOIL_ID_t_ece0594e0480a1ed____c7_0____3.csv,attr,2.403848e-07,,
168877,MODIFIER_t_ece0594e0480a1ed____c7_0____4.csv,attr,7.175429e-10,,
168878,SOIL_CODE_t_ece0594e0480a1ed____c7_0____4.csv,attr,3.707436e-07,,


In [12]:
# Summary statistics for the TUS no-homographs graph.
# Each value is wrapped in a thunk so it is only computed right before its own
# line is printed, one row at a time.
TUS_injected_summary = (
    ('Nodes:', lambda: TUS_injected_graph.number_of_nodes()),
    ('Edges:', lambda: TUS_injected_graph.number_of_edges()),
    ('Cell Values:', lambda: len(cell_nodes)),
    ('Attributes:', lambda: len(attribute_nodes)),
)
for label, compute in TUS_injected_summary:
    print(label, compute())

Nodes: 168880
Edges: 3094278
Cell Values: 163860
Attributes: 5020


# Table Union Search

In [14]:
# Statistics for the TUS dataset: load the bipartite graph, split its nodes by
# type, and compute the cardinality of every ground-truth homograph.

# NOTE: pickle.load can execute arbitrary code. Only load files generated by
# this project's own experiment scripts.
with open('../graph_construction/combined_graphs_output/TUS/bipartite/bipartite.graph', 'rb') as graph_file:
    TUS_graph = pickle.load(graph_file)

# Partition the graph nodes by their 'type' attribute.
cell_nodes = {n for n, d in TUS_graph.nodes(data=True) if d['type'] == 'cell'}
attribute_nodes = {n for n, d in TUS_graph.nodes(data=True) if d['type'] == 'attr'}

# Node statistics (with ground truth) for the TUS dataset.
with open('output/TUS/graph_stats_with_groundtruth_df.pickle', 'rb') as stats_file:
    TUS_df = pickle.load(stats_file)

# Compare against True explicitly: 'is_homograph' presumably holds missing
# values for attribute rows, so the column cannot be used directly as a mask.
homographs_list = TUS_df.loc[TUS_df['is_homograph'] == True, 'node'].values

# Map each homograph to the cardinality reported by the project helper.
# tqdm reports progress while the homographs are processed.
cardinality_of_homographs = {
    homograph: utils.graph_helpers.get_cardinality_of_homograph(TUS_graph, homograph)
    for homograph in tqdm(homographs_list)
}

100%|██████████| 26035/26035 [08:54<00:00, 48.75it/s] 


In [15]:
# Collect the per-homograph cardinalities into a plain list for the min/max summary.
cardinality_of_homographs_list = [*cardinality_of_homographs.values()]

In [16]:
# Summary statistics for the TUS graph.
# Each value is wrapped in a thunk so it is only computed right before its own
# line is printed, one row at a time.
TUS_summary = (
    ('Nodes:', lambda: TUS_graph.number_of_nodes()),
    ('Edges:', lambda: TUS_graph.number_of_edges()),
    ('Cell Values:', lambda: len(cell_nodes)),
    ('Attributes:', lambda: len(attribute_nodes)),
    ('Number of homographs:', lambda: len(homographs_list)),
    ('Minimum cardinality of a homograph:', lambda: min(cardinality_of_homographs_list)),
    ('Maximum cardinality of a homograph:', lambda: max(cardinality_of_homographs_list)),
)
for label, compute in TUS_summary:
    print(label, compute())

Nodes: 200258
Edges: 4150563
Cell Values: 190399
Attributes: 9859
Number of homographs: 26035
Minimum cardinality of a homograph: 3
Maximum cardinality of a homograph: 22703


In [47]:
# Smallest and largest number of meanings over all homographs.

# NOTE: pickle.load can execute arbitrary code. Only load files generated by
# this project's own experiment scripts.
with open('homograph_to_num_meanings_dict.pickle', 'rb') as meanings_file:
    homograph_to_num_meanings_dict = pickle.load(meanings_file)

# The defaults keep the results an empty mapping produced with the manual scan:
# inf for the minimum and 0 for the maximum.
num_meanings = homograph_to_num_meanings_dict.values()
min_number_of_meanings = min(num_meanings, default=math.inf)
max_number_of_meanings = max(num_meanings, default=0)

print('Max number of homograph meanings:', max_number_of_meanings)
print('Min number of homograph meanings:', min_number_of_meanings)



Max number of homograph meanings: 100
Min number of homograph meanings: 2
