In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import connected_components
from itertools import compress, product
from entity_resolution_evaluation.evaluation import evaluate
#https://pypi.org/project/entity-resolution-evaluation/

def generate_combinations(_columns):
    """Return every non-empty subset of ``_columns`` as a list of lists.

    The subsets follow binary counting over a 0/1 selection mask (the last
    column toggles fastest), so N columns yield 2**N - 1 subsets, each keeping
    the original column order.
    """
    subsets = []
    for mask in product([0, 1], repeat=len(_columns)):
        # The all-zero mask is the empty subset, which is not wanted.
        if any(mask):
            subsets.append(list(compress(_columns, mask)))
    return subsets

def create_matrices(_df, _columns):
    """Build one entity-by-entity similarity matrix per column.

    Two rows are similar on a column when they hold the same non-missing
    value in it: the matching cell is 1, every other cell is 0.

    Parameters
    ----------
    _df : pandas.DataFrame
        One row per entity.
    _columns : iterable
        Labels of the columns of ``_df`` to build a matrix for.

    Returns
    -------
    dict
        Maps each column label to a dense ``(len(_df), len(_df))``
        ``numpy.uint8`` array. A row whose value is missing (NaN/None) matches
        nothing, not even itself, so its row and column are all zero.
    """
    _matrices = {}
    n_rows = len(_df)
    for _column in _columns:
        # factorize gives equal values the same integer code and marks
        # missing values with -1.
        codes, uniques = pd.factorize(_df[_column])
        present = np.flatnonzero(codes >= 0)
        # Sparse one-hot indicator (rows x distinct values), built directly:
        # no dense dummy table is materialised and the dtype no longer follows
        # the pandas version (get_dummies defaults to bool since pandas 2.0).
        indicator = csr_matrix(
            (np.ones(len(present), dtype=np.uint8), (present, codes[present])),
            shape=(n_rows, len(uniques)),
        )
        # `@` is the supported sparse matrix product; np.dot is not
        # sparse-aware. Each row has at most one 1, so entries stay in {0, 1}.
        _matrices[_column] = (indicator @ indicator.T).toarray()
    return _matrices

def set_partition(_operation):
    """Turn a similarity graph into a partition of its row indices.

    ``_operation`` is read as an undirected adjacency matrix. Each connected
    component becomes one ``frozenset`` of row positions, and the partition is
    returned as a ``set`` of those frozensets.
    """
    # Only the per-node component labels are needed, not the component count.
    _, component_of = connected_components(csgraph=_operation, directed=False, return_labels=True)

    # Collect the row positions belonging to each component label.
    members = {}
    for node, component in enumerate(component_of):
        members.setdefault(component, []).append(node)

    return {frozenset(nodes) for nodes in members.values()}

def generate_new_entity(_dict_matrices, operand="OR"):
    """Combine per-column similarity matrices into one sparse graph.

    ``operand="AND"`` multiplies the matrices element-wise, so a link survives
    only if every column has it. Any other value, including the default
    ``"OR"``, adds them, so a link in any column is enough.
    The result is returned as a ``csr_matrix``.
    """
    if operand != "AND":
        # OR, and the fallback for unrecognised operands: element-wise sum.
        return csr_matrix(sum(_dict_matrices.values()))

    # AND: start from the neutral element; the first multiplication creates a
    # fresh array, so the input matrices are never modified.
    combined = 1
    for similarity in _dict_matrices.values():
        combined *= similarity
    return csr_matrix(combined)



def resolve(_dict_matrices, operand="OR"):
    """Combine the similarity matrices with ``operand`` and return the partition
    (set of frozensets of row positions) of the resulting graph."""
    return set_partition(generate_new_entity(_dict_matrices, operand))


In [2]:
# Load the golden (reference) file used as ground truth.
goldenER = pd.read_csv("golden_ER_5000.csv")

# Rows with the same name_consignee value form one reference entity. With a
# single column, "AND" leaves that column's similarity matrix unchanged.
golden_dict_matrices = create_matrices(goldenER, ["name_consignee"])
# Reference partition against which candidate resolutions are evaluated.
consignee_goldenER = resolve(golden_dict_matrices, operand="AND")

In [3]:
# Candidate records: every column is read as text, only the first 4999 rows are kept.
# NOTE(review): 4999 presumably matches the row count of the golden file — confirm.
consignee = pd.read_csv("5000_consignee_processed.csv", dtype=str).iloc[:4999]

# Columns whose exact-match similarity can take part in a resolution rule.
column_for_resolution = [
    'name_consignee',
    'NLP_stem',
    'NLP_phonetic_daitchmokotoff',
    'NLP_phonetic_metaphone',
    'NLP_phonetic_caverphone',
    'NLP_phonetic_koelner',
    'NLP_fingerprint_positionfreq',
    'NLP_fingerprint_consonant',
    'NLP_fingerprint_LCCutter',
    'NLP_fingerprint_BWTRLEF',
    'NLP_fingerprint_LACSS',
]

# Will hold one similarity graph per rule, keyed by the rule's text.
consignee_ALL_ER = {}

# One dense similarity matrix per column, computed once up front.
consignee_dict_matrices = create_matrices(consignee, column_for_resolution)

def dict_filter(x, y):
    """Return the entries of dict ``x`` whose key appears in ``y``.

    Usage: ``small_dict = dict_filter(large_dict, new_dict_keys)``.
    The result keeps the key order of ``x``. ``y`` may be any iterable of
    hashable keys; it is consumed exactly once.
    """
    # Build the lookup set once instead of once per key of ``x``.
    wanted = set(y)
    return {key: value for key, value in x.items() if key in wanted}

# For every non-empty subset of columns, store both the AND and the OR graph,
# keyed by the column names joined with " & " or " | ". A single-column subset
# produces the same key twice, so its OR graph overwrites the AND one.
for columns in generate_combinations(column_for_resolution):
    small_dict_matrices = dict_filter(consignee_dict_matrices, columns)
    for separator, operand in ((" & ", "AND"), (" | ", "OR")):
        consignee_ALL_ER[separator.join(columns)] = generate_new_entity(small_dict_matrices, operand=operand)

In [4]:
# Number of distinct rule names stored.
len(consignee_ALL_ER)

4083

In [5]:
# Group rule names whose graphs have the same set of non-zero (row, col)
# positions. The key is the hash of that set, so later cells can look groups
# up by integer.
# NOTE(review): two different graphs whose hashes collide would be merged.
consignee_UNIQUE_ER = {}
for rule_name, graph in consignee_ALL_ER.items():
    graph_hash = hash(frozenset(zip(*graph.nonzero())))
    consignee_UNIQUE_ER.setdefault(graph_hash, []).append(rule_name)

In [7]:
# Number of distinct graphs (by hash) among all rules.
len(consignee_UNIQUE_ER)

816

In [7]:
# consignee_ALL_ER stores sparse similarity graphs, which have no set
# operations; convert each graph to its partition (set of frozensets) first.
a = set_partition(consignee_ALL_ER['NLP_fingerprint_LACSS'])
b = set_partition(consignee_ALL_ER['NLP_fingerprint_BWTRLEF'])
# List-of-lists form of the same two partitions.
aa = [list(elem) for elem in a]
bb = [list(elem) for elem in b]

# Jaccard index of the two partitions: clusters present in both / clusters present in either.
len(a & b) / len(a | b)
#https://deepai.org/machine-learning-glossary-and-terms/jaccard-index

0.7348033373063171

In [8]:
# Score the two list-of-lists partitions with the 'variation_of_information' metric.
evaluate(aa,bb,'variation_of_information')

0.22142788684490156

<img src="entity_metrics.PNG" width="800" height="400">

In [8]:
metrics = ['bmd','precision','recall','f1','variation_of_information']

# Score one representative rule per group of same-hash graphs against the golden partition.
results = {}
for er_hash, combination in consignee_UNIQUE_ER.items():
    # The partition does not depend on the metric, so build it once per group
    # rather than once per metric.
    candidate_partition = set_partition(consignee_ALL_ER[combination[0]])
    results[er_hash] = {m: evaluate(candidate_partition, consignee_goldenER, m) for m in metrics}

# One row per graph hash, one column per metric.
results = pd.DataFrame.from_dict(results).transpose()
results

Unnamed: 0,bmd,precision,recall,f1,variation_of_information
1917457792910462427,445.0,0.430375,0.987583,0.599497,0.175802
4709233299943901364,174.0,0.976699,0.832781,0.899017,0.052491
-5935520194894399993,155.0,1.000000,0.831126,0.907776,0.046908
327073812043129292,464.0,0.426481,0.989238,0.596010,0.181767
-402954592079275274,54.0,0.995656,0.948675,0.971598,0.016226
...,...,...,...,...,...
3251653819289977911,1571.0,0.042989,0.995861,0.082420,0.853855
5617004114954200066,1439.0,0.069923,0.994205,0.130657,0.731923
6175556420308098946,1458.0,0.056392,0.995033,0.106735,0.757601
-7493460618237766647,1440.0,0.069907,0.994205,0.130629,0.732423


In [9]:
# Row(s) with the smallest 'bmd'; ties are all shown.
results.loc[results['bmd'].eq(results['bmd'].min())]

Unnamed: 0,bmd,precision,recall,f1,variation_of_information
8658508909212701585,48.0,0.984759,0.962748,0.973629,0.014717


In [10]:
# Row(s) with the smallest 'variation_of_information'; ties are all shown.
results.loc[results['variation_of_information'].eq(results['variation_of_information'].min())]

Unnamed: 0,bmd,precision,recall,f1,variation_of_information
8658508909212701585,48.0,0.984759,0.962748,0.973629,0.014717


In [11]:
# Row(s) with the highest 'precision'; ties are all shown.
results.loc[results['precision'].eq(results['precision'].max())]

Unnamed: 0,bmd,precision,recall,f1,variation_of_information
-5935520194894399993,155.0,1.0,0.831126,0.907776,0.046908
-7564123882045471681,155.0,1.0,0.831126,0.907776,0.046908
-4835416919049419341,156.0,1.0,0.830298,0.907282,0.047186
-734058257703497872,153.0,1.0,0.832781,0.908762,0.046354
-5682949988691089844,154.0,1.0,0.831954,0.908269,0.046631
6900855548508373781,154.0,1.0,0.831954,0.908269,0.046631
-2170805985137001182,155.0,1.0,0.831126,0.907776,0.046908


In [12]:
# Row(s) with the highest 'recall'; ties are all shown.
results.loc[results['recall'].eq(results['recall'].max())]

Unnamed: 0,bmd,precision,recall,f1,variation_of_information
8658760148384774510,1531.0,0.050792,0.995861,0.096654,0.811766
-8210321128922505927,1532.0,0.050783,0.995861,0.096638,0.812267
-500325564094303220,1548.0,0.049832,0.995861,0.094915,0.822301
-7600839865746547303,1549.0,0.049824,0.995861,0.0949,0.822802
5962292268067600685,1554.0,0.044014,0.995861,0.084303,0.842636
-9094981574371390647,1555.0,0.044008,0.995861,0.084291,0.843137
6396093515575291387,1567.0,0.043252,0.995861,0.082903,0.850783
-3394162375200208635,1568.0,0.043245,0.995861,0.082891,0.851284
4546638767376743619,1534.0,0.045717,0.995861,0.087421,0.826856
-8415382928674300570,1535.0,0.04571,0.995861,0.087408,0.827356


In [13]:
# Row(s) with the highest 'f1'; ties are all shown.
results.loc[results['f1'].eq(results['f1'].max())]

Unnamed: 0,bmd,precision,recall,f1,variation_of_information
-8672847595432865152,50.0,0.992268,0.956126,0.973862,0.015012


In [14]:
# Rule names behind one graph hash; this key is the first row of the max-recall table above.
consignee_UNIQUE_ER[8658760148384774510]

['NLP_phonetic_daitchmokotoff | NLP_fingerprint_positionfreq | NLP_fingerprint_BWTRLEF | NLP_fingerprint_LACSS',
 'NLP_phonetic_daitchmokotoff | NLP_fingerprint_positionfreq | NLP_fingerprint_consonant | NLP_fingerprint_BWTRLEF | NLP_fingerprint_LACSS',
 'name_consignee | NLP_phonetic_daitchmokotoff | NLP_fingerprint_positionfreq | NLP_fingerprint_BWTRLEF | NLP_fingerprint_LACSS',
 'name_consignee | NLP_phonetic_daitchmokotoff | NLP_fingerprint_positionfreq | NLP_fingerprint_consonant | NLP_fingerprint_BWTRLEF | NLP_fingerprint_LACSS']

In [15]:
# Rule names behind the graph hash that achieved the highest f1 above.
consignee_UNIQUE_ER[-8672847595432865152]

['NLP_phonetic_caverphone & NLP_fingerprint_positionfreq & NLP_fingerprint_LACSS',
 'NLP_phonetic_daitchmokotoff & NLP_phonetic_caverphone & NLP_fingerprint_positionfreq',
 'NLP_phonetic_daitchmokotoff & NLP_phonetic_caverphone & NLP_fingerprint_positionfreq & NLP_fingerprint_LACSS']

In [186]:
# Six-row toy example: column 0 has repeated values, column 1 is all distinct.
toy_frame = pd.DataFrame({0: ['a', 'b', 'c', 'd', 'b', 'a'],
                          1: ['a', 'b', 'c', 'd', 'e', 'f']})
print(toy_frame)
# `test` ends up holding the per-column similarity matrices of the toy frame.
test = create_matrices(toy_frame, [0, 1])

   0  1
0  a  a
1  b  b
2  c  c
3  d  d
4  b  e
5  a  f


dict_values([array([[1, 0, 0, 0, 0, 1],
       [0, 1, 0, 0, 1, 0],
       [0, 0, 1, 0, 0, 0],
       [0, 0, 0, 1, 0, 0],
       [0, 1, 0, 0, 1, 0],
       [1, 0, 0, 0, 0, 1]], dtype=uint8), array([[1, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0],
       [0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 1]], dtype=uint8)])

In [187]:
#operation = consignee_matrices['name_consignee'] * consignee_matrices['NLP_stem']

# Element-wise product (logical AND) of all toy similarity matrices. Starting
# from the scalar 1, the first multiplication creates a new array, so the
# matrices stored in `test` are left untouched.
operation = 1
for m in test.values():
    operation *= m
operation

array([[1, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0],
       [0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 1]], dtype=uint8)

In [184]:
# NOTE(review): scratch display of `operation`; the recorded output (0) does not
# match the array computed by the AND loop above, so it comes from older kernel state.
operation

0

In [160]:
# NOTE(review): `partition` is never assigned at top level in the cells shown
# here (it is only a local inside set_partition); this relies on leftover kernel state.
partition

{frozenset({3}), frozenset({2}), frozenset({1, 4}), frozenset({0, 5})}

{frozenset({3}), frozenset({2}), frozenset({1, 4}), frozenset({0, 5})}

In [150]:


# Time the construction of two similarity matrices on the consignee frame.
#consignee_matrices = 
%timeit create_matrices(consignee, ['name_consignee','NLP_stem']) 

964 ms ± 10.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [35]:
# NOTE(review): `consignee_matrices` is only assigned in commented-out code in
# the cells shown here; this display relies on leftover kernel state.
consignee_matrices['name_consignee']

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)