Read datasets and senses.

In [1]:
from utils.utils import read_data, sense2dict

# Input files, both handed to read_data below.
data_path, senses_path = 'data.csv', 'senses.csv'

# read_data returns the table and the sense table; as printed below,
# sense_dict maps each sense id to the list of values (synonyms) it contains.
data, sense_dict = read_data(data_path, senses_path)

print('data:\n', data)
print('\nsense_dict:\n', sense_dict)

data:
      ID   A   B   C
0    t1  a1  b1  c3
1    t2  a1  b1  c6
2    t3  a1  b2  c1
3    t4  a1  b2  c2
4    t5  a1  b2  c3
5    t6  a1  b2  c4
6    t7  a2  b2  c2
7    t8  a2  b2  c4
8    t9  a2  b2  c4
9   t10  a2  b3  c5
10  t11  a2  b3  c3
11  t12  a2  b3  c3

sense_dict:
 {'1': ['c1', 'c2', 'c3'], '2': ['c2', 'c4'], '3': ['c1', 'c4', 'c5'], '4': ['c2', 'c3', 'c5'], '5': ['c1', 'c2', 'c5'], '6': ['c4', 'c6'], '7': ['c1', 'c4', 'c6', 'c7']}


Invert the sense table into `ssets`: a mapping from each synonym to the list of senses that contain it.

In [2]:
# Invert sense_dict (sense -> synonyms) into ssets (synonym -> senses).
# Senses are appended in the iteration order of sense_dict.
ssets = {}
for sense, synonyms in sense_dict.items():
    for value in synonyms:
        # setdefault creates the list the first time `value` is seen, so no
        # separate membership test is needed and ssets stays a plain dict.
        ssets.setdefault(value, []).append(sense)

print('ssets:\n', ssets)

ssets:
 {'c1': ['1', '3', '5', '7'], 'c2': ['1', '2', '4', '5'], 'c3': ['1', '4'], 'c4': ['2', '3', '6', '7'], 'c5': ['3', '4', '5'], 'c6': ['6', '7'], 'c7': ['7']}


Initialize attribute columns here.

In [3]:
from utils.utils import get_attribute

# Names of the two attribute columns and of the right-hand column.
col_name1, col_name2 = 'A', 'B'
right_col_name = 'C'

# Attribute values of each column as returned by get_attribute
# (the printed output below shows one entry per distinct value).
attrs1 = get_attribute(data, col_name1)
attrs2 = get_attribute(data, col_name2)

print('attr1:', attrs1)
print('attr2:', attrs2)

attr1: ['a1' 'a2']
attr2: ['b1' 'b2' 'b3']


Compute an initial assignment for every equivalence class $x$.

In [4]:
import pandas as pd
from algorithms.init_assign import init_assign

def _initial_senses_by_class(col_name, attr_values):
    """Run init_assign on every equivalence class of one attribute column.

    For each value in ``attr_values`` the equivalence class is the set of rows
    of ``data`` whose ``col_name`` equals that value, restricted to the columns
    ``col_name`` and ``right_col_name``.

    Args:
        col_name: name of the attribute column that defines the classes.
        attr_values: iterable of values of ``col_name`` to process.

    Returns:
        dict mapping each attribute value to the result of
        ``init_assign(x, ssets, sense_dict)`` for its class ``x``.
    """
    senses_by_value = {}
    for attr_value in attr_values:
        # Single .loc selection (row mask + column list) instead of chained indexing.
        x = data.loc[data[col_name] == attr_value, [col_name, right_col_name]]
        senses_by_value[attr_value] = init_assign(x, ssets, sense_dict)
        # Print the class after init_assign so anything init_assign prints
        # still appears before the class, as in the original cell.
        print('x:\n', x)
    return senses_by_value


initial_senses1 = _initial_senses_by_class(col_name1, attrs1)
initial_senses2 = _initial_senses_by_class(col_name2, attrs2)

print('initial_senses1:\n', initial_senses1)
print('initial_senses2:\n', initial_senses2)

k= 5
sorted_synonyms ['c3', 'c6', 'c1', 'c2', 'c4']
sorted_senses [['1', '4'], ['6', '7'], ['1', '3', '5', '7'], ['1', '2', '4', '5'], ['2', '3', '6', '7']]
topk:
 [['1', '4'], ['6', '7'], ['1', '3', '5', '7'], ['1', '2', '4', '5'], ['2', '3', '6', '7']]
topk:
 [['1', '4'], ['6', '7'], ['1', '3', '5', '7'], ['1', '2', '4', '5']]
topk:
 [['1', '4'], ['6', '7'], ['1', '3', '5', '7']]
topk:
 [['1', '4'], ['6', '7']]
topk:
 [['1', '4']]
x:
     A   C
0  a1  c3
1  a1  c6
2  a1  c1
3  a1  c2
4  a1  c3
5  a1  c4
k= 4
sorted_synonyms ['c4', 'c3', 'c2', 'c5']
sorted_senses [['2', '3', '6', '7'], ['1', '4'], ['1', '2', '4', '5'], ['3', '4', '5']]
topk:
 [['2', '3', '6', '7'], ['1', '4'], ['1', '2', '4', '5'], ['3', '4', '5']]
topk:
 [['2', '3', '6', '7'], ['1', '4'], ['1', '2', '4', '5']]
topk:
 [['2', '3', '6', '7'], ['1', '4']]
topk:
 [['2', '3', '6', '7']]
x:
      A   C
6   a2  c2
7   a2  c4
8   a2  c4
9   a2  c5
10  a2  c3
11  a2  c3
k= 2
sorted_synonyms ['c3', 'c6']
sorted_senses [['1', '4

Construct the dependency graph $G$.

Compute the Earth Mover's Distance between overlapping classes ($u_1$, $u_2$) as edge weights.

In [5]:
from algorithms.dependency_graph import DependencyGraph

# Build the dependency graph from the table, the initial senses and attribute
# values of both columns, the sense table, and the right-hand column name.
G = DependencyGraph(
    data,
    initial_senses1,
    initial_senses2,
    attrs1,
    attrs2,
    sense_dict,
    right_col_name,
)
print('G:', G)

attr: 0    c3
1    c6
2    c1
3    c2
4    c3
5    c4
Name: C, dtype: object
attr: 0    c1
1    c6
2    c1
3    c2
4    c3
5    c4
Name: C, dtype: object
attr: 0    c1
1    c6
2    c1
3    c2
4    c3
5    c4
Name: C, dtype: object
attr: 0    c1
1    c6
2    c1
3    c2
4    c3
5    c4
Name: C, dtype: object
attr: 0    c1
1    c6
2    c1
3    c2
4    c3
5    c4
Name: C, dtype: object
attr: 0    c1
1    c6
2    c1
3    c2
4    c3
5    c4
Name: C, dtype: object
attr: 0    c1
1    c6
2    c1
3    c2
4    c3
5    c4
Name: C, dtype: object
attr: 0    c1
1    c6
2    c1
3    c1
4    c3
5    c4
Name: C, dtype: object
attr: 0    c1
1    c6
2    c1
3    c1
4    c3
5    c4
Name: C, dtype: object
attr: 0    c1
1    c6
2    c1
3    c1
4    c1
5    c4
Name: C, dtype: object
attr: 0    c1
1    c6
2    c1
3    c1
4    c1
5    c4
Name: C, dtype: object
attr: 0    c1
1    c6
2    c1
3    c1
4    c1
5    c4
Name: C, dtype: object
attr: 0    c3
1    c6
Name: C, dtype: object
attr: 0    c2
1    c6
Name: C, 

Visit nodes in decreasing order of their total $\mathrm{EMD}$, i.e. the sum of the weights of all edges incident to the node.

Traverse the graph in breadth-first (BFS) order.

In [None]:
# Threshold passed as the fourth argument of sense_assign in the refinement cell.
# NOTE(review): the meaning of 0.2 is not stated anywhere visible — document it once confirmed.
test_threshold = 0.2

Refine the sense for each equivalence class.

In [None]:
# NOTE(review): optimal_senses, prob_table, KL_table, sense_assign, test_data,
# test_graph and sset are not defined or imported in the visible part of this
# notebook — confirm where they come from before running on a fresh kernel.
# `sset` may be intended to be the `ssets` dict built earlier — verify.
opt = optimal_senses(test_data, sset)
print('optimal_senses:', opt)

# The string below is a bare expression statement used as a comment; it is
# evaluated and discarded.
'''example output
senses: {1: '123', 2: '24', 3: '145', 4: '235', 5: '125'}
graph: {0: [2], 1: [2, 3], 2: [0, 1], 3: [1]}
optimal_senses: {0: [0], 1: [3], 2: [0, 1], 3: [3]}
'''

print('prob_table:\n', prob_table(test_data, sset))

# prob_table(test_data, sset) is evaluated a second time here rather than
# reusing the value printed above.
KLtab = KL_table(test_graph, prob_table(test_data, sset))
print('KL_table:', KLtab)

# minKL(KLtab, optimal_senses(test_data, test_senses), test_threshold)
#
# BFS(test_graph, 0)

# print('baseline:', baseline(KLtab, opt))

# sense_assign is called without and then with the threshold; as the final
# expression of the cell, only the second call's return value is displayed.
sense_assign(test_graph, KLtab, opt)

sense_assign(test_graph, KLtab, opt, test_threshold)

Data repair algorithm.

In [None]:
# NOTE(review): `repair` is not defined or imported in the visible part of this
# notebook and is called with no arguments — confirm its source and inputs.
repair()