# Step-by-step guide to finding potential matches
1. Load the data
2. Push the data to Elasticsearch
3. Create the first similarity matrix
4. Use the Explorer to label a representative sample of the data
5. Do further scoring and add new features to the similarity matrix
6. Train a machine learning model on the data
 

## 1. Load the data

In [81]:
import numpy as np
import pandas as pd
from suricate.data.companies import getsource, gettarget, getytrue

In [82]:
# Load the source records (nrows=None: presumably "no row limit" — confirm
# against getsource) and report how many rows were read.
df_source = getsource(nrows=None)
source_row_count = len(df_source)
print('Number of rows in source data:{}'.format(source_row_count))
# Last expression of the cell: display five randomly drawn rows.
df_source.sample(5)

Number of rows in source data:1444


Unnamed: 0_level_0,name,street,city,postalcode,duns,countrycode
ix,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
7d49ff77,bundesanstalt f arbeitsschutz,1 25 friedrich henkel weg,dortmund,44149,344398474.0,DE
aeec2ddb,siemens ag,henkestr,erlangen,91052,313366613.0,DE
6dd6ba94,ergodata gmbh,lessingstrae 27,dresden langebrrck,1465,330457995.0,DE
43b5ff3f,jura zentralservice singen,16 otto hahn str,singen,78224,342653791.0,DE
dfaddbc3,hydro cleansing ltd,hcl house beddington farm road,croydon,cr04xb,,GB


In [83]:
# Load the target records (nrows=None: presumably "no row limit" — confirm
# against gettarget) and report how many rows were read.
df_target = gettarget(nrows=None)
target_row_count = len(df_target)
print('Number of rows in target data:{}'.format(target_row_count))
# Last expression of the cell: display five randomly drawn rows.
df_target.sample(5)

Number of rows in target data:3177


Unnamed: 0_level_0,name,street,city,postalcode,duns,countrycode
ix,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3206f8ea,aeroflex international ltd,school close,eastleigh,so53 4ra,216834002.0,GB
7bd39e82,simancas ediciones sa,m 6 pol municipal de dueaas,duenas,34210,,ES
b6739c31,aeroflex gmbh,2 gutenbergstr,ismaning,85737,,DE
97c71f2f,astro med gmbh,1 senefelder str,rodgau,63110,,DE
b43b6d88,key publishing ltd,po box 100,stamford,pe9,226513919.0,GB


## 2. Push the data to Elasticsearch

In [84]:
import elasticsearch
import time
from suricate.dbconnectors.esconnector import index_with_es

In [85]:
# (Re)create the Elasticsearch index that holds the target records, then
# verify that every row of df_target ended up in it.
esclient = elasticsearch.Elasticsearch()  # client with default connection settings
es_indice = 'df_target'
RESET_INDEX = True  # set to False to reuse an index that is already populated

if RESET_INDEX:
    # Drop any previous copy of the index. Only "index not found" is an
    # expected outcome here; any other failure (e.g. cluster unreachable)
    # must surface instead of being silently swallowed.
    try:
        esclient.indices.delete(index=es_indice)
    except elasticsearch.NotFoundError:
        pass

    request_body = {
        "settings": {
            "number_of_shards": 5,
            # NOTE(review): replicas are never allocated on the node holding
            # their primary, so on a single-node cluster these 5 replicas stay
            # unassigned (yellow index). Consider 0 for local runs.
            "number_of_replicas": 5
        },

        "mappings": {
            "_doc": {
                "properties": {
                    # keyword = exact value, text = analysed full-text field
                    "ix": {"type": "keyword"},
                    "name": {"type": "text"},
                    "street": {"type": "text"},
                    "city": {"type": "text"},
                    "postalcode": {"type": "text"},
                    "countrycode": {"type": "keyword"}
                }
            }
        }
    }
    esclient.indices.create(index=es_indice, body=request_body)
    index_with_es(client=esclient, df=df_target, index=es_indice, ixname="ix", reset_index=True, doc_type='_doc')
    # Explicitly refresh the index so the new documents are visible to
    # count/search immediately, rather than sleeping for a fixed duration
    # and hoping that indexing has completed.
    esclient.indices.refresh(index=es_indice)

# Sanity check: the index must contain exactly one document per target row.
catcount = esclient.count(index=es_indice)['count']
assert catcount == df_target.shape[0], (
    'Expected {} docs in index {!r}, found {}'.format(df_target.shape[0], es_indice, catcount)
)
print('Number of docs in created index:{}'.format(catcount))

Number of docs in created index:3177


## 3. Create the first similarity matrix

In [86]:
from suricate.dbconnectors import EsConnector

In [87]:
# Scoring plan handed to the connector: one entry per field, with the kind of
# comparison to apply. NOTE(review): the exact semantics of 'FreeText' and
# 'Exact' are defined by suricate's EsConnector — confirm there.
scoreplan = {
    'name': {'type': 'FreeText'},
    'street': {'type': 'FreeText'},
    'city': {'type': 'FreeText'},
    'duns': {'type': 'Exact'},
    'postalcode': {'type': 'FreeText'},
    'countrycode': {'type': 'Exact'},
}

# Query the index that this notebook created and populated (es_indice).
# The previous hard-coded index="right" only worked when a stale index of that
# name was left over from an earlier session; on a fresh cluster it fails.
escon = EsConnector(
    client=esclient,
    scoreplan=scoreplan,
    index=es_indice,
    explain=False,
    size=10  # presumably the number of candidates returned per source row — confirm
)

# Similarity matrix; its index (ix_con) identifies the candidate
# (source, target) pairs and is reused by the following cells.
Xst = escon.fit_transform(X=df_source)
ix_con = Xst.index
print('Number of possible pairs:{}'.format(len(ix_con)))
print('Example of similarity matrix:')
Xst.sample(5)

Number of possible pairs:14440
Example of similarity matrix:


Unnamed: 0_level_0,Unnamed: 1_level_0,es_score,es_rank
ix_source,ix_target,Unnamed: 2_level_1,Unnamed: 3_level_1
1bc1ab2a,1bc1ab2a,34.726013,0
06045ef8,f59b980f,9.250144,9
3e254c40,e5cfe1a5,25.478561,1
e649baee,e649baee,33.3064,0
e9001fee,f7107e9a,8.931313,9


In [88]:
# Build the side-by-side frame (source and target attributes next to each
# other) for the candidate pairs listed in ix_con.
Xsbs = escon.getsbs(
    X=df_source,
    on_ix=ix_con,
)
print('Example of side-by-side view of source and target data')
# Last expression of the cell: display five randomly drawn pairs.
Xsbs.sample(n=5)

Example of side-by-side view of source and target data


Unnamed: 0_level_0,Unnamed: 1_level_0,name_source,name_target,street_source,street_target,city_source,city_target,postalcode_source,postalcode_target,duns_source,duns_target,countrycode_source,countrycode_target
ix_source,ix_target,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
5a33b026,40442b94,gluetec,ratec licht,8a am biotop,8a blumenstr,geisenheim,gross pankow prignitz,65366,16928,,506951917.0,DE,DE
86c8695e,94626ddf,halco iberia sa,monocomp instrumentacion sa,pq empresarial sant cugat,63 calle santa leonor,sant cugat del valls,madrid,8174,28037,464727759.0,564992394.0,ES,ES
0675a2a8,0675a2a8,medicom service e k,medicom service e k,wankelstr,wankelstr,korschenbroich,korschenbroich,41352,41352,537440336.0,537440336.0,DE,DE
8f20d3df,a28a52a4,etikettenstar gmbh,es electronic service gmbh,vogelbeerweg,3 hohe str,bad zwischenahn,bad nauheim,26160,61231,312587183.0,,DE,DE
4b61a6e1,c682e74b,c stiefelmayer gmbh co kg,keck energieservice gmbh co kg,4 htten weg,4 rieseler feld,wertheim am main,brakel,97877,33034,313042822.0,343732744.0,DE,DE


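## 5. Further scoring

Step 4 (labelling a sample with the Explorer) is not run in this notebook; saved labels are loaded in step 6 instead.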

In [89]:
from suricate.sbsdftransformers import FuncSbsComparator
from sklearn.pipeline import FeatureUnion

In [90]:
# (output column, compared field, comparator kind) for every extra score.
_comparator_specs = [
    ('name_fuzzy', 'name', 'simple'),
    ('street_fuzzy', 'street', 'simple'),
    ('name_token', 'name', 'token'),
    ('street_token', 'street', 'token'),
    ('city_fuzzy', 'city', 'simple'),
    ('postalcode_fuzzy', 'postalcode', 'simple'),
    ('postalcode_contains', 'postalcode', 'contains'),
]

# One named comparator per spec, in the same order as the specs.
_sbs_score_list = [
    (label, FuncSbsComparator(on=field, comparator=kind))
    for label, field, kind in _comparator_specs
]

# Run all comparators on the side-by-side frame and stack their outputs
# column-wise; the columns are labelled with the comparator names.
scorer_sbs = FeatureUnion(transformer_list=_sbs_score_list)
_score_values = scorer_sbs.fit_transform(X=Xsbs)
Xscores = pd.DataFrame(
    data=_score_values,
    index=ix_con,
    columns=[label for label, _ in _sbs_score_list],
)
print('Additional scores')
Xscores.sample(3)

Additional scores


Unnamed: 0_level_0,Unnamed: 1_level_0,name_fuzzy,street_fuzzy,name_token,street_token,city_fuzzy,postalcode_fuzzy,postalcode_contains
ix_source,ix_target,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
077fdf56,c88f82c3,0.47,0.48,0.47,0.48,0.09,0.0,0.0
2d652dca,15e562a8,0.47,1.0,0.51,1.0,1.0,1.0,1.0
0c07244b,150322b3,0.4,0.48,0.4,0.48,1.0,0.8,0.0


Concatenate the new scores with the Elasticsearch score (`es_score`) from step 3

In [91]:
# Prepend the Elasticsearch score to the comparator scores.
# Any 'es_score' column already present in Xscores is dropped first, so that
# re-running this cell does not append a duplicate column (the cell reassigns
# Xscores from itself and would otherwise not be idempotent).
Xscores = pd.concat(
    [Xst[['es_score']], Xscores.drop(columns=['es_score'], errors='ignore')],
    axis=1,
    ignore_index=False,
)
print('Final scoring table')
Xscores.sample(5)

Final scoring table


Unnamed: 0_level_0,Unnamed: 1_level_0,es_score,name_fuzzy,street_fuzzy,name_token,street_token,city_fuzzy,postalcode_fuzzy,postalcode_contains
ix_source,ix_target,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
54939dc0,bd9a83b4,6.920133,0.3,0.83,0.2,0.83,0.29,0.4,0.0
9913e18c,5f978773,19.664148,0.42,0.59,0.42,0.59,1.0,0.8,0.0
9675372e,df8b52ad,35.009052,1.0,1.0,1.0,1.0,1.0,1.0,1.0
e6e0f237,87224afc,9.150091,0.3,0.53,0.2,0.53,1.0,0.8,0.0
666666f3,bc842ca6,15.149372,0.92,0.36,0.92,0.36,0.18,0.4,0.0


## 6. Apply the machine-learning model

In [92]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_validate

For expediency, we will use the already-saved `y_true` labels rather than labelling the pairs manually.

In [93]:
# Load the saved labels and keep only those for the candidate pairs in ix_con
# (same order as the scoring table).
_all_labels = getytrue()
y_true = _all_labels.loc[ix_con]
print('Number of data in training:{}'.format(len(y_true)))

Number of data in training:14440


### Make the pipeline

In [94]:
# Fixed seed so the cross-validation scores and the fitted model are
# reproducible from one run of the notebook to the next.
SEED = 42

# Classification pipeline: fill missing scores with 0, normalise, reduce to
# 4 components, then classify with gradient boosting.
pipe = Pipeline(steps=[
    ('Impute', SimpleImputer(strategy='constant', fill_value=0)),
    # NOTE(review): Normalizer rescales each row (sample) to unit norm; it
    # is not a per-feature scaler despite the step name — confirm intended.
    ('Scaler', Normalizer()),
    ('PCA', PCA(n_components=4, random_state=SEED)),
    ('Predictor', GradientBoostingClassifier(n_estimators=500, random_state=SEED))
])

# 3-fold cross-validation; report the mean of each metric over the folds.
scoring = ['precision', 'recall', 'accuracy']
scores = cross_validate(estimator=pipe, X=Xscores, y=y_true, scoring=scoring, cv=3)
for c in scoring:
    print('{} score: {}'.format(c, np.average(scores['test_'+c])))

precision score: 0.9100410910142727
recall score: 0.8602367500672585
accuracy score: 0.9559565270546496


In [95]:
# Fit the pipeline on the full scoring table, then predict on that same table.
# Note: these are in-sample predictions (same data used for fit and predict).
pipe.fit(X=Xscores, y=y_true)
_predicted_labels = pipe.predict(X=Xscores)
y_pred = pd.Series(
    data=_predicted_labels,
    index=ix_con,
    name='y_pred',
)

In [96]:
# Index of the pairs the model predicted as matches (label 1).
_is_positive = y_pred == 1.0
positive_matches = y_pred[_is_positive].index
print('Showing positive matches')
# Display five randomly drawn predicted matches, side by side.
Xsbs.loc[positive_matches].sample(n=5)

Showing positive matches


Unnamed: 0_level_0,Unnamed: 1_level_0,name_source,name_target,street_source,street_target,city_source,city_target,postalcode_source,postalcode_target,duns_source,duns_target,countrycode_source,countrycode_target
ix_source,ix_target,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
3973a412,a9f4905c,inficon gmbh,inficon gmbh,498 bonner str,bonner str,cologne,cologne,50968,50968,314521394.0,,DE,DE
7d694999,6602b62d,nespresso deutschland gmbh,nespresso deutschland gmbh,8 zollhof,speditionsstrasse,dusseldorf,dusseldorf,40221,40221,333868649.0,341618791.0,DE,DE
6bb99b89,6bb99b89,snap on industrial germany,snap on industrial germany,10 willettstr,10 willettstr,mettmann,mettmann,40822,40822,332477905.0,332477905.0,DE,DE
40dc815f,40dc815f,febrotec gmbh,febrotec gmbh,frankfurter str,frankfurter str,halver,halver,58553,58553,,,DE,DE
a939e3e1,afa3b342,honeywell aerospace,honeywell aerospace,23500 w 105th st,23500 w 105th st,olathe,olathe,66061-8425,66061-8425,614705783.0,21214283.0,US,US


In [97]:
# Index of the pairs the model predicted as non-matches (label 0).
negative_matches = y_pred.loc[y_pred == 0.0].index
# Message fixed: the original read 'Showing negativematches' (missing space).
print('Showing negative matches')
# Display five randomly drawn predicted non-matches, side by side.
Xsbs.loc[negative_matches].sample(5)



Showing negativematches


Unnamed: 0_level_0,Unnamed: 1_level_0,name_source,name_target,street_source,street_target,city_source,city_target,postalcode_source,postalcode_target,duns_source,duns_target,countrycode_source,countrycode_target
ix_source,ix_target,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2e7b3b12,877c0076,multi circuit boards ltd,heisslufttechnik flocke gmbh,2 brunnthaler str,mondstraaye 2 4,brunnthal,feldkirchen,85649,85622,342177911.0,,DE,DE
77d04dd9,f3d630e0,comtec cable accessories,stanford university,cardinal way,655 knight way,huntingdon,stanford california,pe29 2xn,,,78425739.0,GB,GB
ab212abd,70145d3e,kasper richter gmbh co kg,muetron mueller gmbh co kg,14 erlanger str,theodor barth str,uttenreuth,achim,91080,28832,,,DE,DE
1e180a85,c7c7deb2,kugellager shop,spie deutschland system integration,64 bismarckstr,9 rudolfstr,berlin,berlin,13585,10245,,537617008.0,DE,DE
de51d1e5,61f33364,von roll uk limited,thales uk limited,wharfedale road,manor road,bradford,crawley,bd4 6sg,rh109ha,21309974.0,,GB,GB
