In [81]:
import networkx
import pickle
import utils

In [82]:
def get_attributes_of_instance(G, instance_node):
    '''
    Given a graph `G` and an `instance_node` from the graph return its corresponding set of attribute nodes
    '''
    attribute_nodes = []
    for neighbor in G[instance_node]:
        if G.nodes[neighbor]['type'] == 'attr':
            attribute_nodes.append(neighbor)
    return attribute_nodes
        
def get_instances_for_attribute(G, attribute_node):
    '''
    Given a graph `G` and an `instance_node` from the graph find its cell nodes
    '''
    instances_nodes = []
    for neighbor in G[attribute_node]:
        if G.nodes[neighbor]['type'] == 'cell':
            instances_nodes.append(neighbor)
    return instances_nodes

def get_num_meanings_of_homograph(homograph, filename_column_unionable_pairs_dict, G):
    '''
    Return the number of meanings of a given homograph
    '''

    attrs = get_attributes_of_instance(G, homograph)

    # Get filename column tuples that we test for
    filename_column_tuples = []
    for attr in attrs:
        column_name = G.nodes[attr]['column_name']
        file_name = G.nodes[attr]['filename']
        filename_column_tuples.append((file_name, column_name))
    
    sets_of_unionable_vals_set = set([])
    for tup in filename_column_tuples:
        sets_of_unionable_vals_set.add(frozenset(filename_column_unionable_pairs_dict[tup]))

    return sets_of_unionable_vals_set

In [83]:
g_path = '../graph_construction/combined_graphs_output/TUS/bipartite/bipartite.graph'
unionable_pairs_dict = 'output/TUS/filename_column_tuple_to_unionable_pairs_dict.pickle'

filename_column_unionable_pairs_dict = pickle.load(open(unionable_pairs_dict, 'rb'))
G = pickle.load(open(g_path, 'rb'))

In [84]:
sets_of_unionable_vals_set = get_num_meanings_of_homograph('Asia', filename_column_unionable_pairs_dict, G)

print('There are', len(sets_of_unionable_vals_set), 'sets/meanings according to ground truth.')

col_names = set()
for s in sets_of_unionable_vals_set:
    print('size:', len(s))
    for pair in s:
        col_names.add(pair[1])

print(col_names)

There are 21 sets/meanings according to ground truth.
size: 159
size: 149
size: 157
size: 164
size: 154
size: 209
size: 234
size: 220
size: 149
size: 239
size: 226
size: 169
size: 214
size: 159
size: 144
size: 157
size: 206
size: 155
size: 209
size: 147
size: 204
{'Fund centre name', 'Continent name', 'Country/region name'}


In [91]:
# Let's pick the the first two sets and compare their differences
sets_of_unionable_vals_set = list(get_num_meanings_of_homograph('Asia', filename_column_unionable_pairs_dict, G))
set1 = sets_of_unionable_vals_set[5]
set2 = sets_of_unionable_vals_set[3]

print('Size of set 1:', len(set1))
print('Size of set 2:', len(set2))

set1_unique = set1 - set2
set2_unique = set2 - set1 
print('Number of Set 1 unique pairs:', len(set1_unique))
print('Number of Set 2 unique pairs:', len(set2_unique))

print(set1_unique)
print('\n\n')

print(set2_unique)

Size of set 1: 209
Size of set 2: 164
Number of Set 1 unique pairs: 90
Number of Set 2 unique pairs: 45
frozenset({('t_1934eacab8c57857____c4_0____4.csv', 'Fund centre name'), ('t_1934eacab8c57857____c10_1____0.csv', 'Fund centre name'), ('t_1934eacab8c57857____c4_1____2.csv', 'Country/region name'), ('t_1934eacab8c57857____c19_0____1.csv', 'Fund centre name'), ('t_1934eacab8c57857____c17_0____2.csv', 'Fund centre name'), ('t_1934eacab8c57857____c9_1____2.csv', 'Country/region name'), ('t_1934eacab8c57857____c16_0____0.csv', 'Fund centre name'), ('t_1934eacab8c57857____c13_0____3.csv', 'Fund centre name'), ('t_1934eacab8c57857____c17_0____4.csv', 'Fund centre name'), ('t_1934eacab8c57857____c17_0____1.csv', 'Country/region name'), ('t_1934eacab8c57857____c13_0____4.csv', 'Fund centre name'), ('t_1934eacab8c57857____c11_1____1.csv', 'Country/region name'), ('t_1934eacab8c57857____c17_0____0.csv', 'Country/region name'), ('t_1934eacab8c57857____c10_1____3.csv', 'Fund centre name'), ('t_1

In [86]:
attrs = get_attributes_of_instance(G, 'Asia')
col_names = set()
for attr in attrs:
    column_name = G.nodes[attr]['column_name']
    file_name = G.nodes[attr]['filename']
    col_names.add(column_name)

print(col_names)

{'Continent name', 'Country/region name'}


# Testing using 'Wigmore St'

In [88]:
sets_of_unionable_vals_set = get_num_meanings_of_homograph('Wigmore St', filename_column_unionable_pairs_dict, G)

print('There are', len(sets_of_unionable_vals_set), 'sets.')

col_names = set()
for s in sets_of_unionable_vals_set:
    print("set size:", len(s))
    for pair in s:
        col_names.add(pair[1])

print(col_names)
print(sets_of_unionable_vals_set)


There are 2 sets.
set size: 69
set size: 74
{'CommonName', 'ShortCommonName'}
{frozenset({('t_67c3f7ce5eab8804____c9_0____4.csv', 'ShortCommonName'), ('t_67c3f7ce5eab8804____c14_0____4.csv', 'ShortCommonName'), ('t_67c3f7ce5eab8804____c5_0____3.csv', 'ShortCommonName'), ('t_67c3f7ce5eab8804____c12_1____1.csv', 'ShortCommonName'), ('t_67c3f7ce5eab8804____c13_0____0.csv', 'ShortCommonName'), ('t_67c3f7ce5eab8804____c9_0____2.csv', 'ShortCommonName'), ('t_67c3f7ce5eab8804____c13_0____1.csv', 'ShortCommonName'), ('t_67c3f7ce5eab8804____c13_0____2.csv', 'ShortCommonName'), ('t_67c3f7ce5eab8804____c9_0____0.csv', 'ShortCommonName'), ('t_67c3f7ce5eab8804____c5_1____4.csv', 'ShortCommonName'), ('t_67c3f7ce5eab8804____c15_1____2.csv', 'ShortCommonName'), ('t_67c3f7ce5eab8804____c13_1____0.csv', 'ShortCommonName'), ('t_67c3f7ce5eab8804____c12_1____3.csv', 'ShortCommonName'), ('t_67c3f7ce5eab8804____c7_1____3.csv', 'ShortCommonName'), ('t_67c3f7ce5eab8804____c7_1____0.csv', 'ShortCommonName'), ('

In [89]:
sets_of_unionable_vals_set = list(sets_of_unionable_vals_set)

set1_only = sets_of_unionable_vals_set[0] - sets_of_unionable_vals_set[1]
set2_only = sets_of_unionable_vals_set[1] - sets_of_unionable_vals_set[0]
print('SET 1:', set1_only,'\n\n\n')
print('SET 2:', set2_only,'\n\n\n')

SET 1: frozenset({('t_67c3f7ce5eab8804____c9_0____4.csv', 'ShortCommonName'), ('t_67c3f7ce5eab8804____c14_0____4.csv', 'ShortCommonName'), ('t_67c3f7ce5eab8804____c5_0____3.csv', 'ShortCommonName'), ('t_67c3f7ce5eab8804____c12_1____1.csv', 'ShortCommonName'), ('t_67c3f7ce5eab8804____c13_0____0.csv', 'ShortCommonName'), ('t_67c3f7ce5eab8804____c11_1____0.csv', 'ShortCommonName'), ('t_67c3f7ce5eab8804____c9_0____2.csv', 'ShortCommonName'), ('t_67c3f7ce5eab8804____c13_0____1.csv', 'ShortCommonName'), ('t_67c3f7ce5eab8804____c13_0____2.csv', 'ShortCommonName'), ('t_67c3f7ce5eab8804____c5_1____2.csv', 'ShortCommonName'), ('t_67c3f7ce5eab8804____c9_0____0.csv', 'ShortCommonName'), ('t_67c3f7ce5eab8804____c5_1____4.csv', 'ShortCommonName'), ('t_67c3f7ce5eab8804____c15_1____2.csv', 'ShortCommonName'), ('t_67c3f7ce5eab8804____c7_1____1.csv', 'ShortCommonName'), ('t_67c3f7ce5eab8804____c14_0____2.csv', 'ShortCommonName'), ('t_67c3f7ce5eab8804____c13_1____0.csv', 'ShortCommonName'), ('t_67c3f7ce5

In [90]:
attrs = get_attributes_of_instance(G, 'Wigmore St')
col_names = set()
for attr in attrs:
    column_name = G.nodes[attr]['column_name']
    file_name = G.nodes[attr]['filename']
    col_names.add(column_name)

print(col_names)

{'ShortCommonName', 'CommonName'}
