# Step - by - step Guide to finding potential matches with Elastic Search
1. Prepare the data
2. Push the data to Elastic Search
3. Create the first similarity matrix
4. Do further scoring and add new features to the similarity matrix
5. Train a machine learning model on the data
 

## 1. Load the data

In [1]:
import numpy as np
import pandas as pd
from suricate.data.companies import getsource, gettarget, getytrue
nrows = 100

In [2]:
df_source = getsource(nrows=nrows)
print('Number of rows in source data:{}'.format(df_source.shape[0]))
df_source.sample(5)

Number of rows in source data:100


Unnamed: 0_level_0,name,street,city,postalcode,duns,countrycode
ix,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
8de0b63f,bueromarkt boettcher ag,3 brusseler str,jena,7745,,DE
77ca58f8,buchde,an den speichern,munster,48157,31539316.0,DE
b8f79f98,phytec messtechnik gmbh,robert koch str,mainz,55129,,DE
f7da5ee1,e a elektro automatik gmbh,31 37 helmholtzstr,viersen,41747,344304634.0,DE
5e02efdb,bozic tours,32 zinglerstr,ulm,89077,,DE


In [3]:
df_target = gettarget(nrows=nrows)
print('Number of rows in target data:{}'.format(df_target.shape[0]))
df_target.sample(5)

Number of rows in target data:100


Unnamed: 0_level_0,name,street,city,postalcode,duns,countrycode
ix,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
dd1314c3,kuhn albert,9 wilhelmstr,stuttgart,70182,,DE
5c3c57bf,tino winter,obere au,rechtenstein,89611,,DE
ab364b25,c and g tool and cutter co ltd c and g tool an...,north road,yate,bs17 5lq,,GB
fc8bf3d0,ge sensing,fir tree lane,groby,le60fh,219144201.0,GB
b54f1933,selex es spa,via piemonte,rome,187,,IT


## 2. Push the data to Elastic Search

In [4]:
import elasticsearch
import time
from suricate.dbconnectors.esconnector import index_with_es

In [5]:
esclient = elasticsearch.Elasticsearch()
es_indice = 'df_target'
if True:
    try:
        esclient.indices.delete(index=es_indice)
    except:
        pass
    request_body = {
        "settings": {
            "number_of_shards": 5,
            "number_of_replicas": 5
        },

        "mappings": {
            "_doc": {
                "properties": {
                    "ix": {"type": "keyword"},
                    "name": {"type": "text"},
                    "street": {"type": "text"},
                    "city": {"type": "text"},
                    "postalcode": {"type": "text"},
                    "countrycode": {"type": "keyword"}
                }
            }
        }
    }
    esclient.indices.create(index=es_indice, body=request_body)
    index_with_es(client=esclient, df=df_target, index=es_indice, ixname="ix", reset_index=True, doc_type='_doc')
    time.sleep(5)

catcount = esclient.count(index=es_indice)['count']
assert catcount == df_target.shape[0]
print('Number of docs in created index:{}'.format(catcount))

Number of docs in created index:100


## 3. Create the first similarity matrix

In [6]:
from suricate.dbconnectors import EsConnector

In [7]:
scoreplan = {
        'name': {
            'type': 'FreeText'
        },
        'street': {
            'type': 'FreeText'
        },
        'city': {
            'type': 'FreeText'
        },
        'duns': {
            'type': 'Exact'
        },
        'postalcode': {
            'type': 'FreeText'
        },
        'countrycode': {
            'type': 'Exact'
        }
    }
escon = EsConnector(
    client=esclient,
    scoreplan=scoreplan,
    index="right",
    explain=False,
    size=10
)
Xst = escon.fit_transform(X=df_source)
ix_con = Xst.index
print('Number of possible pairs:{}'.format(len(ix_con)))
print('Example of similarity matrix:')
Xst.sample(5)

Number of possible pairs:1000
Example of similarity matrix:


Unnamed: 0_level_0,Unnamed: 1_level_0,es_score,es_rank
ix_source,ix_target,Unnamed: 2_level_1,Unnamed: 3_level_1
a0a1b780,ce8a3993,7.317079,7
77f5274a,f6c4cea8,26.248058,6
e67441ba,f1d5897c,16.838224,4
536d4a08,d1d8f03c,28.83798,3
0a8de44c,7091755c,22.02799,1


In [8]:
Xsbs = escon.getsbs(X=df_source, on_ix=ix_con)
print('Example of side-by-side view of source and target data')
Xsbs.sample(5)

Example of side-by-side view of source and target data


Unnamed: 0_level_0,Unnamed: 1_level_0,name_source,name_target,street_source,street_target,city_source,city_target,postalcode_source,postalcode_target,duns_source,duns_target,countrycode_source,countrycode_target
ix_source,ix_target,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2b6c43e5,deaca020,industrie und,industrie foerderung,fasanenstr,20 gertraudenstrae,berlin,berlin,10623,10178,,329847623.0,DE,DE
8c071814,a90adf42,ge aviation systems ltdcustomer services,ge aviation,evesham road,evesham road,cheltenham,cheltenham,gl528sf,gl52 8sf,211207784.0,67221824.0,GB,GB
a9f4905c,168937e1,inficon gmbh,acla werke gmbh,bonner str,142 frankfurter str,cologne,cologne,50968,51065,,,DE,DE
a30fe96c,a7906fc8,edmund optics inc,edmund optics,lysander close,nether poppleton,york,york,yo30 4xb,yo260 6bl,236713199.0,236713199.0,GB,GB
e8c68ece,56f196de,uk space agency,tritech precision products ltd,north star avenue,north,swindon,wrexham,sn2 1sz,ll13,,,GB,GB


## 4. Further scoring

In [9]:
from suricate.sbstransformers import SbsApplyComparator
from sklearn.pipeline import FeatureUnion

In [10]:
_sbs_score_list = [
    ('name_fuzzy', SbsApplyComparator(on='name', comparator='simple')),
    ('street_fuzzy', SbsApplyComparator(on='street', comparator='simple')),
    ('name_token', SbsApplyComparator(on='name', comparator='token')),
    ('street_token', SbsApplyComparator(on='street', comparator='token')),
    ('city_fuzzy', SbsApplyComparator(on='city', comparator='simple')),
    ('postalcode_fuzzy', SbsApplyComparator(on='postalcode', comparator='simple')),
    ('postalcode_contains', SbsApplyComparator(on='postalcode', comparator='contains'))
]
scorer_sbs = FeatureUnion(transformer_list=_sbs_score_list)
Xscores = pd.DataFrame(
    data=scorer_sbs.fit_transform(X=Xsbs),
    index=ix_con,
    columns=[c[0] for c in _sbs_score_list]
)
print('Additional scores')
Xscores.sample(3)

Additional scores


Unnamed: 0_level_0,Unnamed: 1_level_0,name_fuzzy,street_fuzzy,name_token,street_token,city_fuzzy,postalcode_fuzzy,postalcode_contains
ix_source,ix_target,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
a0a1b780,60f20152,1.0,0.83,1.0,0.83,1.0,1.0,1.0
3c4377c9,bdcb5ed2,0.31,0.55,0.24,0.36,0.27,0.2,0.0
7a037111,69ef33b5,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Concatenate with the scores from the previous step

In [11]:
Xscores = pd.concat([Xst[['es_score']], Xscores], axis=1, ignore_index=False)
print('Final scoring table')
Xscores.sample(5)

Final scoring table


Unnamed: 0_level_0,Unnamed: 1_level_0,es_score,name_fuzzy,street_fuzzy,name_token,street_token,city_fuzzy,postalcode_fuzzy,postalcode_contains
ix_source,ix_target,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
39734970,ed38ab4d,13.272622,0.38,0.4,0.38,0.4,1.0,1.0,1.0
39e80128,92176610,12.068469,0.36,1.0,0.44,1.0,0.31,0.4,0.0
84ba9e9f,81460463,7.035188,0.46,0.53,0.33,0.53,0.29,0.4,0.0
b8f79f98,57fb4d86,8.616287,0.43,0.38,0.43,0.46,0.0,0.4,0.0
b8f79f98,88884406,8.008194,0.77,0.3,0.77,0.3,0.33,0.2,0.0


## 5. Apply the machine-learning model

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_validate

### 5.1. Load y_true
for expediency, we will use the y_true already saved

In [13]:
y_true = getytrue().loc[ix_con]
print('Number of data in training:{}'.format(y_true.shape[0]))

Number of data in training:1000


### 5.2. Make the pipeline and display the scores

In [14]:
pipe = Pipeline(steps=[
    ('Impute', SimpleImputer(strategy='constant', fill_value=0)),
    ('Scaler', Normalizer()),
    ('PCA', PCA(n_components=4)),
    ('Predictor', GradientBoostingClassifier(n_estimators=500))
])
scoring = ['precision', 'recall', 'accuracy']
scores = cross_validate(estimator=pipe, X=Xscores, y=y_true, scoring=scoring, cv=3)
for c in scoring:
    print('{} score: {}'.format(c, np.average(scores['test_'+c])))

precision score: 0.8492561796481701
recall score: 0.7983122362869199
accuracy score: 0.9169858481235726


## 6. Viewing the results

### 6.1. Fit and pred

In [15]:
pipe.fit(X=Xscores, y=y_true)
y_pred = pd.Series(data=pipe.predict(X=Xscores), index=ix_con, name='y_pred')

### 6.2. Positive matches

In [16]:
positive_matches = y_pred.loc[y_pred == 1.0].index
print('Showing positive matches')
Xsbs.loc[positive_matches].sample(5)

Showing positive matches


Unnamed: 0_level_0,Unnamed: 1_level_0,name_source,name_target,street_source,street_target,city_source,city_target,postalcode_source,postalcode_target,duns_source,duns_target,countrycode_source,countrycode_target
ix_source,ix_target,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
816d262e,fc8bf3d0,ge measurement control,ge sensing,fir tree lane,fir tree lane,groby,groby,le60fh,le60fh,226525053.0,219144201.0,GB,GB
b60f09c2,b60f09c2,stc steyr wlzlager,stc steyr wlzlager,40 41 rosenthaler str,40 41 rosenthaler str,berlin,berlin,10178,10178,,,DE,DE
84ba9e9f,90a3530a,rts elektronik systeme gmbh,rts elektronik systeme gmbh,66 preysingstr,66 preysingstr,wolnzach,wolnzach,85283,85283,,,DE,DE
8ae6cdb0,bbdd15a5,ge sensing,druck limited,fir tree lane,fir tree lane,groby,groby,le6 0fh,le6 0fh,219144201.0,219144201.0,GB,GB
68ba9560,b5f16ae7,frey blumenhof,frey blumenhof,mittenheimer str,mittenheimer str,oberschleissheim,oberschleissheim,85764,85764,342418069.0,,DE,DE


### Negative matches

In [17]:
negative_matches = y_pred.loc[y_pred == 0.0].index
print('Showing negativematches')
Xsbs.loc[negative_matches].sample(5)

Showing negativematches


Unnamed: 0_level_0,Unnamed: 1_level_0,name_source,name_target,street_source,street_target,city_source,city_target,postalcode_source,postalcode_target,duns_source,duns_target,countrycode_source,countrycode_target
ix_source,ix_target,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
77ca58f8,f4f1b345,buchde,renewa ingeniero sl,an den speichern,11 avenida ribera de axpe,munster,erandio,48157,48950,31539316.0,460012613.0,DE,ES
9b938f7e,8ad5aae3,siemens ag,siemens ag,16 sieboldstr,31 von der tann str,erlangen,nuremberg,91052,90439,342558576.0,332630888.0,DE,DE
4389becc,02f8a26c,ask distribution,manfrotto distribution,rue de la boucherie,44 rue de la couture,molsheim,rungis,67120,94390,,263172699.0,FR,FR
1d2e777b,984abc8d,citaku gbr,leonit gbr,32 papenbreede,62 leonrodstr,bad essen,munich,49152,80636,,34184809.0,DE,DE
0ad4aa46,c600af52,algner safety consulting services,hotel restaurant maier,14 weiherstr,poststr,friedrichshafen,friedrichshafen,88048,88048,,340105802.0,DE,DE
