# Step-by-step guide to finding potential matches
1. Prepare the data
2. Push the data to Elasticsearch
3. Create the first similarity matrix
4. Use the Explorer to label a representative sample of the data
5. Do further scoring and add new features to the similarity matrix
6. Train a machine learning model on the data
 

## 1. Load the data

In [63]:
import numpy as np
import pandas as pd
from suricate.data.companies import getsource, gettarget, getytrue

In [64]:
# Read the source company records.
# nrows=None presumably means "no row limit" - TODO confirm against getsource.
df_source = getsource(nrows=None)
n_source_rows = len(df_source)
print('Number of rows in source data:{}'.format(n_source_rows))
# Last expression of the cell: display five randomly drawn rows.
df_source.sample(5)

Number of rows in source data:1444


Unnamed: 0_level_0,name,street,city,postalcode,duns,countrycode
ix,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
6900fd67,alexander tress,11 niedermhle weg,bad wurzach,88410,344869433.0,DE
c8012c1b,mel aviation ltd,60 woolmer way,bordon,gu35,,GB
837188f3,crc industries iberia slu,96 calle del gremio cuero,segovia,40195,,ES
1289d400,febrotec gmbh,76 frankfurter str,halver,58553,319710398.0,DE
ea0d41e6,airflow lufttechnik gmbh,21 kleine heeg,rheinbach,53359,318818127.0,DE


In [65]:
# Read the target company records (the set that candidates are searched in).
# nrows=None presumably means "no row limit" - TODO confirm against gettarget.
df_target = gettarget(nrows=None)
n_target_rows = len(df_target)
print('Number of rows in target data:{}'.format(n_target_rows))
# Last expression of the cell: display five randomly drawn rows.
df_target.sample(5)

Number of rows in target data:3177


Unnamed: 0_level_0,name,street,city,postalcode,duns,countrycode
ix,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
c1b026d0,bvs industrie elektronik gmbh,6 rodenbacher chaussee,hanau,63457,320971187.0,DE
0be28259,tdf telediffusion de france,4 avenue ampere,montigny le bretonneux,78897,380971556.0,FR
03608f8b,northrop grumman,10 norden pl,norwalk,06855-1452,966764417.0,US
041b20ff,pratt whitney engines services,1525 midway park rd,bridgeport,26330-9688,55530604.0,US
4f43eea7,mc conseil,5 rue belle alle,mortagne sur sevre,85290,772407417.0,FR


## 2. Push the data to Elasticsearch

In [66]:
import elasticsearch
import time
from suricate.dbconnectors.esconnector import index_with_es

In [67]:
# Connect to Elasticsearch with the client's default connection settings.
esclient = elasticsearch.Elasticsearch()
es_indice = 'df_target'
# Set to False to reuse an index that an earlier run already populated.
RESET_INDEX = True

if RESET_INDEX:
    # Drop any previous copy of the index. Only "index does not exist" is
    # tolerated here; connection or permission errors must surface instead of
    # being silently swallowed.
    try:
        esclient.indices.delete(index=es_indice)
    except elasticsearch.NotFoundError:
        pass

    request_body = {
        "settings": {
            "number_of_shards": 5,
            # NOTE(review): 5 replicas cannot be allocated on a single-node
            # cluster (index health stays yellow) - confirm this is intended.
            "number_of_replicas": 5
        },

        "mappings": {
            "_doc": {
                "properties": {
                    "ix": {"type": "keyword"},
                    "name": {"type": "text"},
                    "street": {"type": "text"},
                    "city": {"type": "text"},
                    "postalcode": {"type": "text"},
                    "countrycode": {"type": "keyword"}
                }
            }
        }
    }
    esclient.indices.create(index=es_indice, body=request_body)
    index_with_es(client=esclient, df=df_target, index=es_indice, ixname="ix", reset_index=True, doc_type='_doc')
    # Make the freshly indexed documents visible to count/search right away,
    # instead of sleeping for a fixed time and hoping the refresh has happened.
    esclient.indices.refresh(index=es_indice)

# Sanity check: every row of df_target must have been indexed.
catcount = esclient.count(index=es_indice)['count']
assert catcount == df_target.shape[0], (
    'Index {!r} holds {} docs but df_target has {} rows'.format(es_indice, catcount, df_target.shape[0])
)
print('Number of docs in created index:{}'.format(catcount))

Number of docs in created index:3177


## 3. Create the first similarity matrix

In [68]:
from suricate.dbconnectors import EsConnector

In [69]:
# Comparison type used for each column when querying Elasticsearch.
scoreplan = {
    'name': {'type': 'FreeText'},
    'street': {'type': 'FreeText'},
    'city': {'type': 'FreeText'},
    'duns': {'type': 'Exact'},
    'postalcode': {'type': 'FreeText'},
    'countrycode': {'type': 'Exact'}
}
escon = EsConnector(
    client=esclient,
    scoreplan=scoreplan,
    # Query the index that was populated from df_target above, not a
    # hard-coded name that may point at a stale or missing index.
    index=es_indice,
    explain=False,
    # Presumably the number of candidates retrieved per source record
    # (1444 source rows gave 14440 pairs in the recorded run) - TODO confirm.
    size=10
)
# One row per candidate (source, target) pair.
Xst = escon.fit_transform(X=df_source)
# Index of the candidate pairs; later cells reuse it to align their data.
ix_con = Xst.index
print('Number of possible pairs:{}'.format(len(ix_con)))
print('Example of similarity matrix:')
Xst.sample(5)

Number of possible pairs:MultiIndex(levels=[['00076444', '0025e9ae', '002c559b', '009a86ad', '00a0bae6', '00d11a3d', '00dd00ff', '0129a092', '0167f9b6', '01906e48', '01c34b3a', '0259216a', '0297485a', '02ca54ed', '02f8a26c', '02fd0b2e', '031704b5', '032a495d', '033f628b', '0358d8de', '035d337d', '036c39fe', '0384d657', '041f9944', '04552123', '045c4309', '0472d735', '0483c61c', '04c2bc34', '054aa318', '05548a30', '05730d16', '05ab6b6a', '06045ef8', '0651a667', '066c325e', '0675a2a8', '06a44c5e', '06cbd331', '06d2cd8e', '06ec6021', '07587bc5', '077fdf56', '079d4185', '07bea237', '080e9920', '083608e9', '08798f70', '087ec076', '08890639', '0893b797', '08a90fec', '08fbd646', '0900a4d6', '0908a0aa', '095ad03b', '097913ca', '0a0bb7ad', '0a545341', '0a6ce927', '0a871c45', '0a8de44c', '0a966627', '0ad4aa46', '0ae07252', '0b2448c7', '0b369747', '0bcf1f54', '0c030810', '0c07244b', '0c1c75cf', '0c5ded04', '0c908473', '0ca56dd2', '0cb14cd7', '0cdda6c2', '0d5ea3e3', '0dd38705', '0defc034', '0e02fe

Unnamed: 0_level_0,Unnamed: 1_level_0,es_score,es_rank
ix_source,ix_target,Unnamed: 2_level_1,Unnamed: 3_level_1
a32da5d0,759e162c,29.573872,3
705b8c5b,a39f98b8,14.533558,9
b5596907,a25ba9bf,28.383192,8
f49460e3,c8aaa4cd,42.41605,1
4af3a76c,bb49d430,45.178738,7


In [70]:
# Build the side-by-side frame (one *_source and one *_target column per
# attribute) for the candidate pairs listed in ix_con.
Xsbs = escon.getsbs(
    X=df_source,
    on_ix=ix_con
)
print('Example of side-by-side view of source and target data')
# Last expression of the cell: display five randomly drawn pairs.
Xsbs.sample(n=5)

Example of side-by-side view of source and target data


Unnamed: 0_level_0,Unnamed: 1_level_0,name_source,name_target,street_source,street_target,city_source,city_target,postalcode_source,postalcode_target,duns_source,duns_target,countrycode_source,countrycode_target
ix_source,ix_target,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
7ff2b1d2,7ba2165e,h media,european datacomm nv,329 heistraat,257 heidestraat,antwerp,zwijndrecht,2610,2070,,283835379.0,BE,BE
fe5db42f,d4468f99,le joint francais,le manoir de gressy,centre administratif,chemin des carrosses,thure,gressy,86540,77410,,,FR,FR
ba5c212c,9727627d,ralf michael mohr,ahc oberflaechentechnik gmbh,2 rosenstr,2 2 zillenhardtstr,ziegendorf,goppingen,19372,73037,,341053885.0,DE,DE
caa9b691,c8f81892,cosset md,euro cls,rue jean mermoz,rue jean mermoz,le haillan,courcouronnes,33185,91080,277271321.0,,FR,FR
8440876d,839ae5f3,landratsamt bodenseekreis,mangold elektromaschinen gmbh,1 3 glarnisch str,lindauer str,friedrichshafen,friedrichshafen,88045,88046,330386074.0,316124353.0,DE,DE


## 5. Further scoring

In [71]:
from suricate.sbsdftransformers import FuncSbsComparator
from sklearn.pipeline import FeatureUnion

In [72]:
# (field, comparator) pairs to evaluate on the side-by-side frame. Each pair
# becomes one score column named '<field>_<comparator>'.
_sbs_comparisons = [
    ('name', 'fuzzy'),
    ('street', 'fuzzy'),
    ('name', 'token'),
    ('street', 'token'),
    ('city', 'fuzzy'),
    ('postalcode', 'fuzzy'),
    ('postalcode', 'contains'),
]
_sbs_score_list = [
    ('{}_{}'.format(field, comparator), FuncSbsComparator(on=field, comparator=comparator))
    for field, comparator in _sbs_comparisons
]
scorer_sbs = FeatureUnion(transformer_list=_sbs_score_list)
Xscores = pd.DataFrame(
    data=scorer_sbs.fit_transform(X=Xsbs),
    # Label the rows with the index of the frame the scores were computed
    # from, so they cannot be mislabelled if Xsbs and ix_con ever diverge.
    index=Xsbs.index,
    columns=[step_name for step_name, step in _sbs_score_list]
)
print('Additional scores')
Xscores.sample(3)

Additional scores


Unnamed: 0_level_0,Unnamed: 1_level_0,name_fuzzy,street_fuzzy,name_token,street_token,city_fuzzy,postalcode_fuzzy,postalcode_contains
ix_source,ix_target,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
dd4ccf8c,eb2c1b65,0.65,0.35,0.59,0.47,1.0,0.2,0.0
604e23d5,604e23d5,1.0,1.0,1.0,1.0,1.0,1.0,1.0
322eb731,45007c12,0.69,0.12,0.48,0.24,0.57,0.4,0.0


Concatenate with the scores from the previous step

In [73]:
# This cell reassigns Xscores from itself. Dropping any 'es_score' column left
# by a previous execution keeps the cell idempotent: re-running it no longer
# appends a duplicate 'es_score' column.
Xscores = pd.concat(
    [Xst[['es_score']], Xscores.drop(columns=['es_score'], errors='ignore')],
    axis=1,
    ignore_index=False
)
print('Final scoring table')
Xscores.sample(5)

Final scoring table


Unnamed: 0_level_0,Unnamed: 1_level_0,es_score,name_fuzzy,street_fuzzy,name_token,street_token,city_fuzzy,postalcode_fuzzy,postalcode_contains
ix_source,ix_target,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
fa55c17b,cc1d69b8,43.36041,1.0,0.95,1.0,0.95,1.0,1.0,1.0
a83fa5a8,48784f03,10.231821,0.47,0.17,0.35,0.17,1.0,0.6,0.0
2253b32d,5e587c49,5.498617,0.27,0.32,0.13,0.32,0.0,0.8,0.0
1e6586fc,38153802,9.441657,0.5,0.58,0.55,0.58,1.0,0.4,0.0
29d1f197,65099899,32.40538,0.96,1.0,0.88,1.0,1.0,1.0,1.0


## 6. Apply the machine-learning model

In [74]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_validate

For expediency, we will use the `y_true` labels that were already saved.

In [75]:
# Fetch the saved labels and keep only those for the candidate pairs in ix_con.
y_saved = getytrue()
y_true = y_saved.loc[ix_con]
n_training_rows = len(y_true)
print('Number of data in training:{}'.format(n_training_rows))

Number of data in training:14440


### Make the pipeline

In [76]:
# Fixed seed so that the cross-validation scores are reproducible between runs.
RANDOM_STATE = 42

pipe = Pipeline(steps=[
    # Missing scores are replaced by 0.
    ('Impute', SimpleImputer(strategy='constant', fill_value=0)),
    # NOTE(review): Normalizer rescales each row to unit norm; it does not
    # scale features column-wise. Confirm this is what 'Scaler' is meant to do.
    ('Scaler', Normalizer()),
    ('PCA', PCA(n_components=4, random_state=RANDOM_STATE)),
    ('Predictor', GradientBoostingClassifier(n_estimators=500, random_state=RANDOM_STATE))
])
scoring = ['precision', 'recall', 'accuracy']
# 3-fold cross-validation; one array of per-fold test scores per metric.
scores = cross_validate(estimator=pipe, X=Xscores, y=y_true, scoring=scoring, cv=3)
for c in scoring:
    # Report the average of the per-fold test scores for each metric.
    print('{} score: {}'.format(c, np.average(scores['test_'+c])))

precision score: 0.9096992768833619
recall score: 0.8602367500672585
accuracy score: 0.9558872701809049


In [77]:
# Fit on the full scoring table, then predict on the same rows.
# These are in-sample predictions, not held-out ones.
pipe.fit(X=Xscores, y=y_true)
predicted_labels = pipe.predict(X=Xscores)
y_pred = pd.Series(
    data=predicted_labels,
    index=ix_con,
    name='y_pred'
)

In [78]:
# Pairs that the model labelled as a match (prediction equal to 1).
is_positive = y_pred == 1.0
positive_matches = y_pred.loc[is_positive].index
print('Showing positive matches')
# Last expression of the cell: side-by-side view of five random predicted matches.
Xsbs.loc[positive_matches].sample(n=5)

Showing positive matches


Unnamed: 0_level_0,Unnamed: 1_level_0,name_source,name_target,street_source,street_target,city_source,city_target,postalcode_source,postalcode_target,duns_source,duns_target,countrycode_source,countrycode_target
ix_source,ix_target,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
9bb2f5d9,e4917d10,specialty fasteners components,specialty fasteners components lt,unit d seymour wharf,seymour wharf steamer quay road,totnes,totnes,tq9,tq9,233148675.0,,GB,GB
8e71eb0a,591099fe,nespresso deutschland gmbh b 746m6,nespresso deutschland gmbh,8 zollhof,speditionsstrae,dusseldorf,dusseldorf,40221,40221,333868649.0,,DE,DE
88aaa3a3,7ca58742,united parcel service deutschland,united parcel service,1 gorlitzer str,1 grlitzer strae,neuss,neuss,41460,41401,,315081096.0,DE,DE
fbb6c400,fbb6c400,hoffman engineering,hoffman engineering,8 riverbend dr,8 riverbend dr,stamford,stamford,69070,69070,,,US,US
036c39fe,036c39fe,fako,fako,15 peutestr,15 peutestr,hamburg,hamburg,20539,20539,,,DE,DE


In [79]:
# Pairs that the model labelled as a non-match (prediction equal to 0).
negative_matches = y_pred.loc[y_pred == 0.0].index
# Typo fix: the message previously read 'negativematches'.
print('Showing negative matches')
# Last expression of the cell: side-by-side view of five random predicted non-matches.
Xsbs.loc[negative_matches].sample(5)



Showing negativematches


Unnamed: 0_level_0,Unnamed: 1_level_0,name_source,name_target,street_source,street_target,city_source,city_target,postalcode_source,postalcode_target,duns_source,duns_target,countrycode_source,countrycode_target
ix_source,ix_target,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
989ba440,ec844e4c,ulmer volkshoch,notariat ulm,5 kornhausplatz,zeughausgasse,ulm,ulm,89073,89073,329382126.0,,DE,DE
c7f39e26,ab176db8,gottschalk michaelis gmbh,datacon gmbh,11 23 lahnstr,23 waldstr,berlin,dietzenbach,12055,63128,,328652599.0,DE,DE
f21fdcf5,15ddb77a,smiths aerospace customer services,ge aviation systems ltd,bishops cleeve,bishops cleeve,cheltenham,cheltenham,gl528yb,gl52 8yb,288525181.0,211207784.0,GB,GB
b05a1ef5,adf53fa8,wupptool,toho tenax europe gmbh,herbringhausen,19 21 kasinostr,wuppertal,wuppertal,42399,42103,341522115.0,,DE,DE
9a51e10b,8d3f45c8,cover construction company limited,arj construction ltd,filkins mill,rutherford close,lechlade on thames,stevenage,gl7,sg1 2ef,,,GB,GB
