Read datasets and senses.

In [1]:
from utils.utils import read_data

# Relative paths of the example table and of its sense table.
data_path = 'datasets/example.csv'
senses_path = 'senses/example.csv'

# read_data is a project helper. Judging by the output printed below, it
# returns the table (columns ID, A, B, C) and a dict keyed by sense id whose
# values are lists of synonyms.
data, sense_dict = read_data(data_path, senses_path)

# Echo both inputs so the following steps can be checked by eye.
print('data:\n', data)
print('\nsense_dict:\n', sense_dict)  # sense->synonym

data:
      ID   A   B   C
0    t1  a1  b1  c3
1    t2  a1  b1  c6
2    t3  a1  b2  c1
3    t4  a1  b2  c2
4    t5  a1  b2  c3
5    t6  a1  b2  c4
6    t7  a2  b2  c2
7    t8  a2  b2  c4
8    t9  a2  b2  c4
9   t10  a2  b3  c5
10  t11  a2  b3  c3
11  t12  a2  b3  c3

sense_dict:
 {'1': ['c1', 'c2', 'c3'], '2': ['c2', 'c4'], '3': ['c1', 'c4', 'c5'], '4': ['c2', 'c3', 'c5'], '5': ['c1', 'c2', 'c5'], '6': ['c4', 'c6'], '7': ['c1', 'c4', 'c6', 'c7']}


Convert the sense table (sense to synonyms) into ssets, the inverse mapping from each synonym to the senses that contain it.

In [2]:
# Invert the sense table: each synonym is mapped to the list of senses whose
# synonym list contains it, in the iteration order of sense_dict.
ssets = {}  # synonym->sense
for sense_id, synonyms in sense_dict.items():
    for term in synonyms:
        # Create the list on first sight of the synonym, then record the sense.
        ssets.setdefault(term, []).append(sense_id)

print('ssets:\n', ssets)

ssets:
 {'c1': ['1', '3', '5', '7'], 'c2': ['1', '2', '4', '5'], 'c3': ['1', '4'], 'c4': ['2', '3', '6', '7'], 'c5': ['3', '4', '5'], 'c6': ['6', '7'], 'c7': ['7']}


Initialize attribute columns here.

In [3]:
from utils.utils import get_attribute

# Column names used by the rest of the notebook: two attributes whose values
# define the equivalence classes, and the right-hand column.
col_name1, col_name2 = 'A', 'B'
right_col_name = 'C'

# get_attribute is a project helper; the output printed below shows the
# distinct values found in the requested column.
attrs1 = get_attribute(data, col_name1)
attrs2 = get_attribute(data, col_name2)

print('attrs1:', attrs1)
print('attrs2:', attrs2)

attrs1: ['a1' 'a2']
attrs2: ['b1' 'b2' 'b3']


Compute an initial sense assignment for every equivalence class $x$, i.e. the rows that share one value of attribute A or of attribute B.

In [4]:
import pandas as pd
from algorithms.init_assign import init_assign

def _initial_senses(table, col_name, labels, target_col, synonym_senses, senses):
    """Choose an initial sense for every equivalence class of one attribute.

    An equivalence class is the set of rows of ``table`` whose ``col_name``
    value equals one label; only ``col_name`` and ``target_col`` are kept.

    Args:
        table: The full data table (indexed pandas-style).
        col_name: Name of the attribute that defines the classes.
        labels: Values of ``col_name`` to process, one class per value.
        target_col: Name of the right-hand column passed along with the class.
        synonym_senses: Mapping synonym -> senses (``ssets``).
        senses: Mapping sense -> synonyms (``sense_dict``).

    Returns:
        dict mapping each label to the value returned by ``init_assign`` for
        that label's class.
    """
    chosen = {}
    for label in labels:
        # Single-step row/column selection instead of chained indexing.
        x = table.loc[table[col_name] == label, [col_name, target_col]]
        chosen[label] = init_assign(x, synonym_senses, senses)
        # Printed after init_assign so its own trace appears first.
        print('x:\n', x)
    return chosen


# The same procedure is applied to both attributes; only the column and its
# labels differ.
initial_senses1 = _initial_senses(data, col_name1, attrs1, right_col_name, ssets, sense_dict)
initial_senses2 = _initial_senses(data, col_name2, attrs2, right_col_name, ssets, sense_dict)

right attribute: ['c3', 'c6', 'c1', 'c2', 'c3', 'c4']
deviation: {'c6': 0, 'c1': 0, 'c2': 0, 'c4': 0, 'c3': 1}
k= 5
sorted_synonyms ['c6', 'c1', 'c2', 'c4', 'c3']
sorted_senses [['6', '7'], ['1', '3', '5', '7'], ['1', '2', '4', '5'], ['2', '3', '6', '7'], ['1', '4']]
topk:
 [['6', '7'], ['1', '3', '5', '7'], ['1', '2', '4', '5'], ['2', '3', '6', '7'], ['1', '4']]
topk:
 [['6', '7'], ['1', '3', '5', '7'], ['1', '2', '4', '5'], ['2', '3', '6', '7']]
topk:
 [['6', '7'], ['1', '3', '5', '7'], ['1', '2', '4', '5']]
topk:
 [['6', '7'], ['1', '3', '5', '7']]
x:
     A   C
0  a1  c3
1  a1  c6
2  a1  c1
3  a1  c2
4  a1  c3
5  a1  c4
right attribute: ['c2', 'c4', 'c4', 'c5', 'c3', 'c3']
deviation: {'c2': 0.5, 'c4': 0.5, 'c5': 0.5, 'c3': 0.5}
k= 4
sorted_synonyms ['c2', 'c4', 'c5', 'c3']
sorted_senses [['1', '2', '4', '5'], ['2', '3', '6', '7'], ['3', '4', '5'], ['1', '4']]
topk:
 [['1', '2', '4', '5'], ['2', '3', '6', '7'], ['3', '4', '5'], ['1', '4']]
topk:
 [['1', '2', '4', '5'], ['2', '3', '6

In [5]:
# Show the sense selected by init_assign for each class label of the first
# attribute (initial_senses1) and of the second attribute (initial_senses2).
print('initial_senses1:\n', initial_senses1)
print('initial_senses2:\n', initial_senses2)

initial_senses1:
 {'a1': '7', 'a2': '2'}
initial_senses2:
 {'b1': '1', 'b2': '1', 'b3': '4'}


Construct the dependency graph $G$.

Compute the Earth Mover's Distance between overlapping classes ($u_1$, $u_2$) as edge weights.

In [6]:
from algorithms.dependency_graph import DependencyGraph

# Build the dependency graph from the table, the initial senses of both
# attributes, their class labels, the sense table and the right-hand column.
# NOTE(review): the recorded output suggests the nodes are the classes
# a1, a2, b1, b2, b3 (indices 0-4) and that each edge carries an "emd"
# weight -- confirm against DependencyGraph.
G = DependencyGraph(data, initial_senses1, initial_senses2, attrs1, attrs2, sense_dict, right_col_name)
# Print the graph that was just constructed.
G.display()

attrs: ['a1' 'a2' 'b1' 'b2' 'b3']
val1: ['c3', 'c6', 'c1', 'c2', 'c3', 'c4']
val2: ['c3', 'c6']
dist1: {'c1': 1, 'c6': 1, 'c4': 1, 'c2': 1, 'c3': 2}
dist2: {'c1': 0, 'c6': 1, 'c4': 0, 'c2': 0, 'c3': 1}
EMDs: [0, 1, 1, 2, 3, 4]
emd: 11
val1: ['c3', 'c6', 'c1', 'c2', 'c3', 'c4']
val2: ['c1', 'c2', 'c3', 'c4', 'c2', 'c4', 'c4']
dist1: {'c1': 1, 'c6': 1, 'c4': 1, 'c2': 1, 'c3': 2}
dist2: {'c1': 1, 'c6': 0, 'c4': 3, 'c2': 2, 'c3': 1}
EMDs: [0, 0, 1, -1, -2, -1]
emd: 5
val1: ['c2', 'c4', 'c4', 'c5', 'c3', 'c3']
val2: ['c1', 'c2', 'c3', 'c4', 'c2', 'c4', 'c4']
dist1: {'c1': 0, 'c4': 2, 'c2': 1, 'c3': 2, 'c5': 1}
dist2: {'c1': 1, 'c4': 3, 'c2': 2, 'c3': 1, 'c5': 0}
EMDs: [0, -1, -2, -3, -2, -1]
emd: 9
val1: ['c2', 'c4', 'c4', 'c5', 'c3', 'c3']
val2: ['c5', 'c3', 'c3']
dist1: {'c5': 1, 'c4': 2, 'c2': 1, 'c3': 2}
dist2: {'c5': 1, 'c4': 0, 'c2': 0, 'c3': 2}
EMDs: [0, 0, 2, 3, 3]
emd: 8

dependency graph: {0: [2, 3], 1: [3, 4], 2: [0], 3: [0, 1], 4: [1]}

edge weight: {(0, 2): 11, (2, 0): 11, (0, 

Visit nodes in decreasing order of their total $EMD$, obtained by summing the weights of all edges incident to each node.

Traverse G with BFS and refine the sense for each equivalence class.

In [7]:
# Traverse the graph, then print it again.
# NOTE(review): BFS() presumably updates G in place, since its return value
# is discarded and display() is called again afterwards -- confirm.
G.BFS()
G.display()

attr1: a2
attr2: b2
x1:
      ID   A   B   C
6    t7  a2  b2  c2
7    t8  a2  b2  c4
8    t9  a2  b2  c4
9   t10  a2  b3  c5
10  t11  a2  b3  c3
11  t12  a2  b3  c3
x2:
    ID   A   B   C
2  t3  a1  b2  c1
3  t4  a1  b2  c2
4  t5  a1  b2  c3
5  t6  a1  b2  c4
6  t7  a2  b2  c2
7  t8  a2  b2  c4
8  t9  a2  b2  c4

synonyms: 2
vals: ['c2', 'c4', 'c4', 'c5', 'c3', 'c3']
outliers: ['c5', 'c4', 'c2', 'c3']

synonyms: 1
vals: ['c2', 'c4', 'c4', 'c5', 'c3', 'c3']
outliers: ['c5', 'c4', 'c2', 'c3']

synonyms: 1
vals: ['c1', 'c2', 'c3', 'c4', 'c2', 'c4', 'c4']
outliers: ['c1', 'c4', 'c2', 'c3']

synonyms: 2
vals: ['c1', 'c2', 'c3', 'c4', 'c2', 'c4', 'c4']
outliers: ['c1', 'c4', 'c2', 'c3']

synonyms: 2
vals: ['c2', 'c4', 'c4']
outliers: ['c4', 'c2']

synonyms: 1
vals: ['c2', 'c4', 'c4']
outliers: ['c4', 'c2']
val1: ['c2', 'c4', 'c4', 'c5', 'c3', 'c3']
val2: ['c2', 'c4', 'c4', 'c5', 'c3', 'c3']
dist1: {'c5': 1, 'c4': 2, 'c2': 1, 'c3': 2}
dist2: {'c5': 1, 'c4': 2, 'c2': 1, 'c3': 2}
EMDs: [0, 0, 0

Data repair algorithm.

In [8]:
# TODO(review): the repair step is not executed in this notebook; the call is left commented out: repair()