In [10]:
import pandas as pd
import numpy as np
import networkx as nx

import utils
import pickle
import json

import semantic_type_propagation as stp


# Synthetic Benchmark

In [20]:
# Inputs for the small synthetic benchmark: the per-node statistics dataframe
# (already annotated with groundtruth) and the pickled bipartite graph.
df_path = 'output/synthetic_example_bipartite/graph_stats_with_groundtruth_df.pickle'
g_path = '../graph_construction/combined_graphs_output/synthetic_benchmark_bipartite/bipartite/bipartite.graph'

# Open the graph file in a context manager so the handle is closed as soon as
# loading finishes (the bare `pickle.load(open(...))` form leaves it open).
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
with open(g_path, "rb") as graph_file:
    G = pickle.load(graph_file)

df = pd.read_pickle(df_path)
# stp.process_df is defined outside this notebook; it prints the number of cell
# nodes with degree > 1 and the number of unique BC ranks when it runs.
df = stp.process_df(df, G)
# Placeholder column; filled in per homograph by a later cell.
df['num_meanings_groundtruth'] = np.nan
df

There are 1230 cell nodes with a degree greater than 1
There are 100 unique ranks based on BC.


Unnamed: 0,node,node_type,betweenness_centrality,is_homograph,pagerank,dense_rank,num_meanings_groundtruth
2384,Jaguar,cell,5.434442e-02,True,0.000064,1.0,
10229,Mace,cell,3.280319e-02,True,0.000052,2.0,
300,Lincoln,cell,3.212017e-02,True,0.000052,3.0,
4228,Heather,cell,2.965270e-02,True,0.000052,4.0,
6646,Charity,cell,2.611485e-02,True,0.000052,5.0,
...,...,...,...,...,...,...,...
8917,Guinea-Bissau,cell,9.689597e-09,False,0.000019,98.0,
2633,Laos,cell,7.601714e-09,False,0.000019,99.0,
2966,Trinidad and Tobago,cell,7.601714e-09,False,0.000019,99.0,
1239,Reunion,cell,5.620724e-09,False,0.000019,100.0,


In [12]:
# For every row flagged as a homograph, record how many column names the graph
# helper reports for that cell node as its groundtruth number of meanings.
homograph_mask = df['is_homograph'] == True
for row_label, node_name in df.loc[homograph_mask, 'node'].items():
    column_names = utils.graph_helpers.get_cell_node_column_names(G, node_name)
    df.loc[row_label, 'num_meanings_groundtruth'] = len(column_names)

# Show the homographs, highest betweenness centrality first.
df[homograph_mask].sort_values(by='betweenness_centrality', ascending=False)

Unnamed: 0,node,node_type,betweenness_centrality,is_homograph,pagerank,dense_rank,num_meanings_groundtruth
2384,Jaguar,cell,0.05434442,True,6.4e-05,1.0,2.0
10229,Mace,cell,0.03280319,True,5.2e-05,2.0,2.0
300,Lincoln,cell,0.03212017,True,5.2e-05,3.0,2.0
4228,Heather,cell,0.0296527,True,5.2e-05,4.0,2.0
6646,Charity,cell,0.02611485,True,5.2e-05,5.0,2.0
5701,Leandra,cell,0.02611485,True,5.2e-05,5.0,2.0
15222,Ram,cell,0.02539456,True,5.2e-05,6.0,2.0
981,Phoenix,cell,0.02194003,True,5.2e-05,7.0,2.0
10357,Elan,cell,0.01666137,True,5.2e-05,8.0,2.0
10915,Jimmy,cell,0.01630799,True,5.2e-05,9.0,2.0


In [13]:
# JSON-encode the names of all nodes flagged as homographs (in dataframe order).
homograph_nodes = json.dumps(
    df.loc[df['is_homograph'] == True, 'node'].tolist()
)
homograph_nodes

'["Jaguar", "Mace", "Lincoln", "Heather", "Charity", "Leandra", "Ram", "Phoenix", "Elan", "Jimmy", "Crossfire", "Smitty", "Nadine", "Virginia", "Sydney", "Quinta", "Elmira", "Jamaica", "Pumpkin", "Cuba", "GT", "Garvey", "Conroy", "Reid", "Duff", "Costanza", "Berkeley", "Christophe", "Else", "Vinson", "ES", "TL", "California", "Colorado", "Georgia", "CT", "SC", "Florida", "AL", "ID", "AR", "CO", "MA", "CA", "DE", "TN", "AZ", "MN", "PA", "MD", "SD", "GA", "ME", "IL", "NE"]'

In [14]:
# Load the per-node type-propagation results.
with open('output/synthetic_example_bipartite/num_of_meanings/attr_to_type.json') as fh:
    attr_to_type = json.load(fh)

# Update the dataframe to include the number of meanings inferred by type propagation:
# for each node, count the distinct positive values in its 'attr_to_type' mapping.
df['num_meanings'] = np.nan
for node_name, node_info in attr_to_type.items():
    positive_types = {
        type_id
        for type_id in node_info['attr_to_type'].values()
        if type_id > 0
    }
    df.loc[df['node'] == node_name, 'num_meanings'] = len(positive_types)

# Show the homographs, highest betweenness centrality first.
homograph_rows = df[df['is_homograph'] == True]
homograph_rows.sort_values(by='betweenness_centrality', ascending=False)

Unnamed: 0,node,node_type,betweenness_centrality,is_homograph,pagerank,dense_rank,num_meanings_groundtruth,num_meanings
2384,Jaguar,cell,0.05434442,True,6.4e-05,1.0,2.0,2.0
10229,Mace,cell,0.03280319,True,5.2e-05,2.0,2.0,2.0
300,Lincoln,cell,0.03212017,True,5.2e-05,3.0,2.0,2.0
4228,Heather,cell,0.0296527,True,5.2e-05,4.0,2.0,2.0
6646,Charity,cell,0.02611485,True,5.2e-05,5.0,2.0,2.0
5701,Leandra,cell,0.02611485,True,5.2e-05,5.0,2.0,2.0
15222,Ram,cell,0.02539456,True,5.2e-05,6.0,2.0,2.0
981,Phoenix,cell,0.02194003,True,5.2e-05,7.0,2.0,2.0
10357,Elan,cell,0.01666137,True,5.2e-05,8.0,2.0,2.0
10915,Jimmy,cell,0.01630799,True,5.2e-05,9.0,2.0,2.0


In [15]:
# Fetch the neighbors the graph helper reports for the cell node 'Florida',
# then keep only the dataframe rows whose node is among them.
cell_node_neighbors = utils.graph_helpers.get_cell_node_neighbors(G, 'Florida')

is_neighbor = df['node'].isin(cell_node_neighbors)
df_tmp = df.loc[is_neighbor]
df_tmp

Unnamed: 0,node,node_type,betweenness_centrality,is_homograph,pagerank,dense_rank,num_meanings_groundtruth,num_meanings
10229,Mace,cell,0.032803,True,5.2e-05,2.0,2.0,2.0
6646,Charity,cell,0.026115,True,5.2e-05,5.0,2.0,2.0
5701,Leandra,cell,0.026115,True,5.2e-05,5.0,2.0,2.0
10915,Jimmy,cell,0.016308,True,5.2e-05,9.0,2.0,2.0
11501,Smitty,cell,0.013734,True,5.2e-05,11.0,2.0,2.0
10601,Nadine,cell,0.013734,True,5.2e-05,11.0,2.0,2.0
546,Virginia,cell,0.012067,True,7.3e-05,12.0,3.0,2.0
11,Sydney,cell,0.009811,True,5.2e-05,13.0,2.0,2.0
1088,Quinta,cell,0.009811,True,5.2e-05,13.0,2.0,2.0
275,Elmira,cell,0.009811,True,5.2e-05,13.0,2.0,2.0


In [16]:
# Load the pickled attribute -> type mapping. The context manager closes the
# file handle once loading finishes (`pickle.load(open(...))` left it open).
# NOTE(review): this rebinds `attr_to_type`, which an earlier cell loaded from
# the JSON file of the same stem; here it is indexed directly by attribute.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
with open('output/synthetic_example_bipartite/num_of_meanings/attr_to_type.pickle', "rb") as pickle_file:
    attr_to_type = pickle.load(pickle_file)

# Print the type recorded for each attribute the helper returns for 'Jaguar'.
attrs = utils.graph_helpers.get_attribute_of_instance(G, 'Jaguar')
for attr in attrs:
    print(attr_to_type[attr])

1
1
2


# Synthetic Benchmark Large

In [49]:
# Inputs for the large synthetic benchmark: per-node statistics and the pickled graph.
df_path = 'output/synthetic_example_large/graph_stats_df.pickle'
g_path = '../graph_construction/combined_graphs_output/synthetic_benchmark_large/bipartite/bipartite.graph'

df = pd.read_pickle(df_path)
# Open the graph file in a context manager so the handle is closed after loading.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
with open(g_path, "rb") as graph_file:
    G = pickle.load(graph_file)

# Add the is_homograph column: evaluate the groundtruth helper for every cell
# node, then map the result onto the 'node' column. Nodes that are not in the
# map (i.e. non-cell rows) become NaN, so no NaN pre-initialisation is needed.
is_homograph_map = {
    node: utils.groundtruth.is_cur_node_homograph(G, node)
    for node in df.loc[df['node_type'] == 'cell', 'node']
}
df['is_homograph'] = df['node'].map(is_homograph_map)

# Filter out nodes with degree 1
df = stp.process_df(df, G)
# Placeholder column; filled in per homograph by a later cell.
df['num_meanings_groundtruth'] = np.nan
df

There are 6502 cell nodes with a degree greater than 1
There are 611 unique ranks based on BC.


Unnamed: 0,node,node_type,betweenness_centrality,is_homograph,dense_rank,num_meanings_groundtruth
7590,Palm,cell,1.445700e-02,True,1.0,
25825,Clementine,cell,8.975958e-03,True,2.0,
7130,Magnolia,cell,8.132084e-03,True,3.0,
3077,Jaguar,cell,8.049376e-03,True,4.0,
7416,Timothy,cell,6.921700e-03,True,5.0,
...,...,...,...,...,...,...
26917,Tohono O'Odham,cell,1.877314e-11,False,611.0,
27091,Houma,cell,1.877314e-11,False,611.0,
26921,Japanese,cell,1.877314e-11,False,611.0,
27087,South American,cell,1.877314e-11,False,611.0,


In [50]:
# Report how many rows are flagged as homographs.
homograph_mask = df['is_homograph'] == True
print('There are', int(homograph_mask.sum()), 'homographs based on the groundtruth')

There are 180 homographs based on the groundtruth


In [51]:
# For every row flagged as a homograph, record how many column names the graph
# helper reports for that cell node as its groundtruth number of meanings.
homograph_mask = df['is_homograph'] == True
for row_label, node_name in df.loc[homograph_mask, 'node'].items():
    column_names = utils.graph_helpers.get_cell_node_column_names(G, node_name)
    df.loc[row_label, 'num_meanings_groundtruth'] = len(column_names)

# Show the homographs, highest betweenness centrality first.
df[homograph_mask].sort_values(by='betweenness_centrality', ascending=False)

Unnamed: 0,node,node_type,betweenness_centrality,is_homograph,dense_rank,num_meanings_groundtruth
7590,Palm,cell,1.445700e-02,True,1.0,2.0
25825,Clementine,cell,8.975958e-03,True,2.0,2.0
7130,Magnolia,cell,8.132084e-03,True,3.0,2.0
3077,Jaguar,cell,8.049376e-03,True,4.0,2.0
7416,Timothy,cell,6.921700e-03,True,5.0,2.0
...,...,...,...,...,...,...
2000,MD,cell,2.103537e-07,True,275.0,2.0
1828,SD,cell,2.059766e-07,True,276.0,2.0
2004,ME,cell,1.214843e-07,True,314.0,2.0
1950,NC,cell,7.969017e-08,True,329.0,2.0


In [52]:
# Rows whose groundtruth number of meanings is exactly 3.
df.loc[df['num_meanings_groundtruth'].eq(3)]

Unnamed: 0,node,node_type,betweenness_centrality,is_homograph,dense_rank,num_meanings_groundtruth
300,Lincoln,cell,0.004879,True,8.0,3.0
2012,Montana,cell,0.002381,True,15.0,3.0
1401,Aurora,cell,0.00169,True,20.0,3.0
546,Virginia,cell,0.001467,True,25.0,3.0


In [57]:
# Spot check: show what the graph helper returns as the column names for the
# cell node 'Timothy' in the currently loaded graph G.
utils.graph_helpers.get_cell_node_column_names(G, 'Timothy')

['plant_name', 'first_name']

# Synthetic Example Large With Nulls

In [69]:
# Output directory prefix (kept as a string with a trailing slash because file
# names are appended to it by concatenation) and the pickled graph path.
df_path = 'output/synthetic_example_large_with_nulls/'
g_path = '../graph_construction/combined_graphs_output/synthetic_benchmark_large_with_nulls/bipartite/bipartite.graph'

df = pd.read_pickle(df_path + 'graph_stats_df.pickle')
# Open the graph file in a context manager so the handle is closed after loading.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
with open(g_path, "rb") as graph_file:
    G = pickle.load(graph_file)

# Add the is_homograph column: evaluate the groundtruth helper for every cell
# node, then map the result onto the 'node' column. Nodes that are not in the
# map (i.e. non-cell rows) become NaN, so no NaN pre-initialisation is needed.
is_homograph_map = {
    node: utils.groundtruth.is_cur_node_homograph(G, node)
    for node in df.loc[df['node_type'] == 'cell', 'node']
}
df['is_homograph'] = df['node'].map(is_homograph_map)

# Persist the groundtruth-annotated frame before any filtering is applied.
df.to_pickle(df_path + 'graph_stats_with_groundtruth_df.pickle')

# Filter out nodes with degree 1
df = stp.process_df(df, G)
# Placeholder column; filled in per homograph by a later cell.
df['num_meanings_groundtruth'] = np.nan
df

There are 6503 cell nodes with a degree greater than 1
There are 614 unique ranks based on BC.


Unnamed: 0,node,node_type,betweenness_centrality,is_homograph,dense_rank,num_meanings_groundtruth
70,missing,cell,1.611175e-01,True,1.0,
2059,unknown,cell,1.244477e-01,True,2.0,
2815,undefined,cell,1.045398e-02,True,3.0,
723,Jamaica,cell,3.296411e-03,True,4.0,
7589,Palm,cell,3.005904e-03,True,5.0,
...,...,...,...,...,...,...
27226,Korean,cell,1.858687e-11,False,614.0,
27233,Chinese,cell,1.858687e-11,False,614.0,
27263,Asian Indian,cell,1.858687e-11,False,614.0,
27267,Tongan,cell,1.858687e-11,False,614.0,


In [70]:
# Select the rows flagged as homographs once, then report their count and
# JSON-encode their node names (in dataframe order).
homograph_rows = df.loc[df['is_homograph'] == True]
print('There are', len(homograph_rows), 'homographs based on the groundtruth')

json.dumps(homograph_rows['node'].tolist())

There are 183 homographs based on the groundtruth


'["missing", "unknown", "undefined", "Jamaica", "Palm", "Cuba", "Clementine", "Magnolia", "Western", "Jicama", "Lettuce", "Guava", "Suzy", "Hugo", "Viva", "Charity", "Brooklyn", "Kiwi", "Michael", "Kohlrabi", "Darcy", "Pumpkin", "Terri", "Chicago", "House", "Noah", "Virginia", "Timothy", "ES", "Yucca", "Luther", "Constantine", "Golden", "Middleton", "Starbuck", "Lincoln", "Bar", "Anastasia", "Gus", "Wade", "Ransom", "Somerset", "Jimmy", "TL", "San Francisco", "Beaufort", "Rio", "Mariner", "Spring", "Rogue", "Aurora", "Garland", "Mirabel", "Denver", "Shane", "Cinderella", "SM", "Gantz", "Montana", "CL", "Inger", "M", "Cougar", "Nebraska", "Elmira", "GT", "LS", "MG", "Valerie", "Rosie", "Franklyn", "Pathfinder", "Focus", "Kansas", "Magnum", "Adriaens", "Manuel", "Gaylord", "Maddy", "Romain", "Jaguar", "Wichita", "Pueblo", "Seminole", "Lamont", "Tracy", "Barrie", "Harland", "XK", "Equinox", "Aura", "Georgia", "Denton", "Tacoma", "Tucson", "Ford", "Orlando", "Raleigh", "Montgomery", "Katy"

In [71]:
# For every row flagged as a homograph, record how many column names the graph
# helper reports for that cell node as its groundtruth number of meanings.
homograph_mask = df['is_homograph'] == True
for row_label, node_name in df.loc[homograph_mask, 'node'].items():
    column_names = utils.graph_helpers.get_cell_node_column_names(G, node_name)
    df.loc[row_label, 'num_meanings_groundtruth'] = len(column_names)

# Show the homographs, highest betweenness centrality first.
df[homograph_mask].sort_values(by='betweenness_centrality', ascending=False)

Unnamed: 0,node,node_type,betweenness_centrality,is_homograph,dense_rank,num_meanings_groundtruth
70,missing,cell,1.611175e-01,True,1.0,17.0
2059,unknown,cell,1.244477e-01,True,2.0,11.0
2815,undefined,cell,1.045398e-02,True,3.0,8.0
723,Jamaica,cell,3.296411e-03,True,4.0,2.0
7589,Palm,cell,3.005904e-03,True,5.0,2.0
...,...,...,...,...,...,...
27428,Comanche,cell,1.487889e-05,True,213.0,2.0
2004,ME,cell,1.108179e-05,True,219.0,2.0
1959,California,cell,5.439775e-06,True,267.0,2.0
1945,LA,cell,5.320422e-06,True,272.0,2.0


In [72]:
# Load the per-node type-propagation results from the current output directory.
with open(df_path + 'num_of_meanings/attr_to_type.json') as fh:
    attr_to_type = json.load(fh)

# Update the dataframe to include the number of meanings inferred by type propagation:
# for each node, count the distinct positive values in its 'attr_to_type' mapping.
df['num_meanings'] = np.nan
for node_name, node_info in attr_to_type.items():
    positive_types = {
        type_id
        for type_id in node_info['attr_to_type'].values()
        if type_id > 0
    }
    df.loc[df['node'] == node_name, 'num_meanings'] = len(positive_types)

# Show the homographs, highest betweenness centrality first.
homograph_rows = df[df['is_homograph'] == True]
homograph_rows.sort_values(by='betweenness_centrality', ascending=False)

Unnamed: 0,node,node_type,betweenness_centrality,is_homograph,dense_rank,num_meanings_groundtruth,num_meanings
70,missing,cell,1.611175e-01,True,1.0,17.0,9.0
2059,unknown,cell,1.244477e-01,True,2.0,11.0,5.0
2815,undefined,cell,1.045398e-02,True,3.0,8.0,6.0
723,Jamaica,cell,3.296411e-03,True,4.0,2.0,4.0
7589,Palm,cell,3.005904e-03,True,5.0,2.0,2.0
...,...,...,...,...,...,...,...
27428,Comanche,cell,1.487889e-05,True,213.0,2.0,2.0
2004,ME,cell,1.108179e-05,True,219.0,2.0,2.0
1959,California,cell,5.439775e-06,True,267.0,2.0,2.0
1945,LA,cell,5.320422e-06,True,272.0,2.0,1.0
