In [39]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import connected_components
from itertools import compress, product
from entity_resolution_evaluation.evaluation import evaluate
#https://pypi.org/project/entity-resolution-evaluation/

def generate_combinations(_columns):
    ''' generate (2^N)-1 combinations'''
    return [list(compress(_columns,mask)) for mask in product(*[[0,1]]*len(_columns))][1:]

def create_matrices(_df, _columns):
    ''' build a dictionary with:
    - key: column name
    - value: the matrix of similarity between entities associated to this column'''
    _matrices= {}
    for _column in _columns:
        _tmp = pd.get_dummies(_df[_column], columns=[_column], sparse=True)
        _tmp = csr_matrix(_tmp)
        _tmp = np.dot(_tmp, _tmp.T)
        #_tmp.setdiag(0)
        #_tmp.eliminate_zeros()
        _tmp = _tmp.toarray()
        #np.fill_diagonal(_tmp,0)
        _matrices[_column] = _tmp
    return _matrices

def set_partition(_operation):    
    #Find connected elements in the graph and output label of clusters
    n_components, labels = connected_components(csgraph=_operation, directed=False, return_labels=True)

    #Build the partition
    partition = set(pd.Series(range(len(labels))).groupby(labels).apply(frozenset))
    
    #return set(_df.reset_index().groupby([0])['index'].apply(frozenset).tolist())
    return partition

def generate_new_entity(_dict_matrices, operand="OR"):
    if operand=="AND":
        operation = 1
        for m in _dict_matrices.values():
            operation *= m
    elif operand=="OR":
        # define the resolution operations Usage:  OR -> + , AND -> *
        #operation = _dict_matrices['name_consignee'] * _dict_matrices['NLP_stem']
        operation = sum(_dict_matrices.values())
    else:
        operation = sum(_dict_matrices.values())
    
    #return _df[_columns].apply(lambda x: "@".join(x), axis=1)
    return csr_matrix(operation)



def resolve(_dict_matrices, operand="OR"):
    operation = generate_new_entity(_dict_matrices, operand)
    return set_partition(operation)


In [40]:
goldenER = pd.read_csv("ER-Golden-50000.tsv", sep="\t")[:1000]

golden_dict_matrices = create_matrices(goldenER, ["name_consignee"])
consignee_goldenER = resolve(golden_dict_matrices, operand="AND")

In [41]:
goldenER

Unnamed: 0,identifier,name_consignee,address_1_consignee,address_2_consignee,address_3_consignee,address_4_consignee,city_consignee,zip_code_consignee,country_code_consignee,name_consignor,address_1_consignor,address_2_consignor,address_3_consignor,address_4_consignor,city_consignor,zip_code_consignor,country_code_consignor
0,202001022,WOOT SERVICES,1601 ESTES AVENUE,ELK GROVE VILLAGE,ILLINOIS 60007 IL,,,,,KORNIT DIGITAL LTD,12 HAAMAL ST. POB 11781 AFEK PARK,ROSH-HAAYIN 4809246 IL,,,,,
1,202001029,"IMPERIAL FOOD, INC.",475-A BLOY ST.,HILLSIDE US,,,,,,SHAVIT TECHNOLOGIES CO.,28 HOLLAND ST,NETANYA 42221 IS,,,,,
2,202001031,CLIPSO AMERICAS INC.,200 CORPORATE DRIVE,SUITE 4,BLAUVELT NY 10913 US,,,,,CLIPSO PRODUCTIONS,5 RUE DE LEGLISE,VIEUX-THANN 68800 FR,,,,,
3,202001035,LANGENSCHEIDT PUBLICATIONS INGRAM,1280 INGRAM DRIVE,CHAMBERSBURG PA 17202 US,,,,,,RHENUS MEDIEN GOTHA GMBH & CO.KG,LANGENSCHEIDTSTRASSE 10,GOTHA 99867 DE,,,,,
4,202001036,BOYD TECHNOLOGIES,501 PLEASANT STREET,LEE MA 01238 US,,,,,,PDM INDUSTRIES,BP 34,KERISOLE,QUIMPERE 29393 FR,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2020060333,"KONECRANES AMERICA, INC.",7300 CHIPPEWA BOULEVARD,HOUSTON TX 77086 US,,,,,,PFEIFER SEIL- U. HEBETECHNIK GMBH,DR.KARL LENZ STR. 66,MEMMINGEN DE,,,,,
996,2020060377,MOLSON COORS,V. SUAREZ ALMACEN PROMO,,"ALMACEN ANEX. 1, REXCO IND. PARK",,GUAYNABO,,PR,MICRO MATIC A/S,HOLKEBJERGVEJ 48,,,,ODENSE SV,,DK
997,2020060377,MOLSON COORS,V. SUAREZ ALMACEN PROMO,,"ALMACEN ANEX. 1, REXCO IND. PARK",,GUAYNABO,,PR,MICRO MATIC A/S,HOLKEBJERGVEJ 48,,,,ODENSE SV,,DK
998,2020060377,MOLSON COORS,V. SUAREZ ALMACEN PROMO,,"ALMACEN ANEX. 1, REXCO IND. PARK",,GUAYNABO,,PR,MICRO MATIC A/S,HOLKEBJERGVEJ 48,,,,ODENSE SV,,DK


In [42]:
consignee = pd.read_csv("50000_consignee_processed.csv", dtype=str)[:1000]

column_for_resolution = ['name_consignee', 'NLP_stem',
       'NLP_phonetic_daitchmokotoff', 'NLP_phonetic_metaphone',
       'NLP_phonetic_caverphone', 'NLP_phonetic_koelner',
       'NLP_fingerprint_positionfreq', 'NLP_fingerprint_consonant',
       'NLP_fingerprint_LCCutter', 'NLP_fingerprint_BWTRLEF',
       'NLP_fingerprint_LACSS']

consignee_ALL_ER = {}

print("matrice creation...")
consignee_dict_matrices = create_matrices(consignee, column_for_resolution)

print("dictionary filter...")
#small_dict=dict_filter(large_dict, new_dict_keys)
dict_filter = lambda x, y: dict([ (i,x[i]) for i in x if i in set(y) ])

for columns in generate_combinations(column_for_resolution):
    small_dict_matrices = dict_filter(consignee_dict_matrices, columns)
    consignee_ALL_ER[" & ".join(columns)] = generate_new_entity(small_dict_matrices, operand="AND")
    consignee_ALL_ER[" | ".join(columns)] = generate_new_entity(small_dict_matrices, operand="OR")

matrice creation...
dictionary filter...


In [22]:
consignee

Unnamed: 0,identifier,name_consignee,NLP_stem,NLP_phonetic_daitchmokotoff,NLP_phonetic_metaphone,NLP_phonetic_caverphone,NLP_phonetic_koelner,NLP_fingerprint_positionfreq,NLP_fingerprint_consonant,NLP_fingerprint_LCCutter,NLP_fingerprint_BWTRLEF,NLP_fingerprint_LACSS
0,202001022,WOOT SERVICES,woot servic,749744749754,WTSRFSS,WTSFSS1111,387388,WOVS,WT SRVCS,W66874785347,STISCVWOEE OR,8314792
1,202001029,"IMPERIAL FOOD, INC.","imperi food , inc",067987,IMPRLFTNK,AMPRFTNK11,061753268,IMDN,"IMPRL FD, NC.",I47475354663563,".L,DCINOP R",5220099
2,202001031,CLIPSO AMERICAS INC.,clipso america inc,487469587469,KLPSMRKSNK,KLPSMRKSNK,4518674868,CLNC,CLPS MRCS NC.,C5577636475337563,.OSC CNI,3021707
3,202001035,LANGENSCHEIDT PUBLICATIONS INGRAM,langenscheidt public ingram,865643,LNJNSKTTPBLKXNSNKRM,LNKNSKTPPL,56468211542686476,LANM,LNGNSCHDT PBLCTNS NGRM,L364467344538783553385667564736,MSTRLCUISIHGNNCLE T,6007809
4,202001036,BOYD TECHNOLOGIES,boyd technolog,734685735685,BTTXNLJS,PTKNLKS111,1246548,BOYD,BYD TCHNLGS,B693843466564547,SD,2500774
...,...,...,...,...,...,...,...,...,...,...,...,...
995,2020060333,"KONECRANES AMERICA, INC.","konecran america , inc",564964565964,KNKRNSMRKNK,KNKRNSMRKN,46876867468,KONC,"KNCRNS MRC, NC.",K6643736473647533563,".S,ACC RNIENMNR",5751380
996,2020060377,MOLSON COORS,molson coor,684644684649684654684659,MLSNKRS,MSNKS11111,6586478,MORS,MLSN CRS,M6576636677,SN O,6253461
997,2020060377,MOLSON COORS,molson coor,684644684649684654684659,MLSNKRS,MSNKS11111,6586478,MORS,MLSN CRS,M6576636677,SN O,6253461
998,2020060377,MOLSON COORS,molson coor,684644684649684654684659,MLSNKRS,MSNKS11111,6586478,MORS,MLSN CRS,M6576636677,SN O,6253461


In [23]:
len(list(consignee_ALL_ER.keys()))

4083

In [34]:
consignee_UNIQUE_ER = {}
for i, v in consignee_ALL_ER.items():
    v_nnz = zip(*v.nonzero())
    vh = hash(frozenset(v_nnz))
    consignee_UNIQUE_ER[vh] = [i] if vh not in consignee_UNIQUE_ER.keys() else consignee_UNIQUE_ER[vh] + [i]

In [25]:
len(list(consignee_UNIQUE_ER.keys()))

86

In [32]:
type(a)

scipy.sparse.csr.csr_matrix

In [43]:
a= consignee_ALL_ER['NLP_fingerprint_LACSS']
b= consignee_ALL_ER['NLP_fingerprint_BWTRLEF']
aa = [ list(elem) for elem in list(consignee_ALL_ER['NLP_fingerprint_LACSS']) ]
bb = [ list(elem) for elem in list(consignee_ALL_ER['NLP_fingerprint_BWTRLEF']) ]

In [46]:
type(a)

scipy.sparse.csr.csr_matrix

In [45]:
len(list(a.intersection(b))) / len(list(a.union(b)))
#https://deepai.org/machine-learning-glossary-and-terms/jaccard-index

AttributeError: intersection not found

In [48]:
evaluate(aa,bb,'variation_of_information')

TypeError: unhashable type: 'csr_matrix'

<img src="entity_metrics.PNG" width="800" height="400">

In [30]:
metrics = ['bmd','precision','recall','f1','variation_of_information']

results = {}
for er_hash, combination in consignee_UNIQUE_ER.items():
    
    results[er_hash] = { m:evaluate(set_partition(consignee_ALL_ER[combination[0]]),consignee_goldenER,m) for m in metrics}
    
results = pd.DataFrame.from_dict(results).transpose()
results

Unnamed: 0,bmd,precision,recall,f1,variation_of_information
-6193327702797796591,32.0,0.807692,1.000000,0.893617,0.055479
-6141876134287074174,12.0,1.000000,0.958730,0.978930,0.017159
-613699151210294997,4.0,1.000000,0.984127,0.992000,0.006068
-2967526185321071585,11.0,1.000000,0.961905,0.980583,0.015772
1194780099854449836,3.0,1.000000,0.987302,0.993610,0.004682
...,...,...,...,...,...
6349580182836644931,4.0,0.996795,0.987302,0.992026,0.006068
-5611511134012889664,3.0,0.996805,0.990476,0.993631,0.004682
6255416303574597041,8.0,0.981132,0.990476,0.985782,0.011614
-3769901920257834251,9.0,0.891429,0.990476,0.938346,0.019252


In [None]:
results[results['bmd'] == results['bmd'].min()]

In [None]:
results[results['variation_of_information'] == results['variation_of_information'].min()]

In [None]:
results[results['precision'] == results['precision'].max()]

In [None]:
results[results['recall'] == results['recall'].max()]

In [None]:
results[results['f1'] == results['f1'].max()]

In [None]:
consignee_UNIQUE_ER[8658760148384774510]

In [None]:
consignee_UNIQUE_ER[-8672847595432865152]

In [None]:
test = pd.DataFrame({0:['a','b','c','d','b','a'], 1:['a','b','c','d','e','f']})
print(test)
test = create_matrices(test, [0,1])

In [None]:
#operation = consignee_matrices['name_consignee'] * consignee_matrices['NLP_stem']

operation = 1
for m in test.values():
    operation *= m
operation

In [None]:
operation

In [None]:
partition

In [None]:


#consignee_matrices = 
%timeit create_matrices(consignee, ['name_consignee','NLP_stem']) 

In [None]:
consignee_matrices['name_consignee']