# Step-by-step guide to finding potential matches with Elastic Search
1. Prepare the data
2. Push the data to Elastic Search
3. Create the first similarity matrix
4. Do further scoring and add new features to the similarity matrix
5. Train a machine learning model on the data
 

## 1. Load the data

In [7]:
import numpy as np
import pandas as pd
from suricate.data.companies import getsource, gettarget, getytrue
# Number of rows requested from each sample dataset (passed to getsource / gettarget below).
nrows = 100

In [8]:
# Load `nrows` rows of the source dataset, report how many came back, and preview a few at random.
df_source = getsource(nrows=nrows)
source_row_count = len(df_source)
print('Number of rows in source data:{}'.format(source_row_count))
df_source.sample(n=5)

Number of rows in source data:100


Unnamed: 0_level_0,name,street,city,postalcode,duns,countrycode
ix,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
7deb146f,smiths aerospace electronic systems,cheltenham,cheltenham,gl528sf,,GB
7675fecc,salicru electronics,calle parque ind,seville,41016,,ES
9b938f7e,siemens ag,16 sieboldstr,erlangen,91052,342558576.0,DE
77ca58f8,buchde,an den speichern,munster,48157,31539316.0,DE
19c06e93,denzel kg,12 mhlwinkel,wertingen,86637,316456565.0,DE


In [9]:
# Load `nrows` rows of the target dataset, report how many came back, and preview a few at random.
df_target = gettarget(nrows=nrows)
target_row_count = len(df_target)
print('Number of rows in target data:{}'.format(target_row_count))
df_target.sample(n=5)

Number of rows in target data:100


Unnamed: 0_level_0,name,street,city,postalcode,duns,countrycode
ix,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
591099fe,nespresso deutschland gmbh,speditionsstrae,dusseldorf,40221,,DE
484866fb,ea elektro automatik,31 37 helmholtzstr,viersen,41747,330862074.0,DE
251a3154,aalco metall,12 aindlinger str,augsburg,86167,333354707.0,DE
c2e5b14f,marketline,sheepen place,colchester,co3 3lp,,GB
ab364b25,c and g tool and cutter co ltd c and g tool an...,north road,yate,bs17 5lq,,GB


In [10]:
# Source and target frames bundled as [source, target]; this pair is the input
# handed to the scoring transformers and to createmultiindex further down.
Xst = [df_source, df_target]

## 2. Create the similarity matrix

In [46]:
from suricate.dftransformers import DfApplyComparator, VectorizerConnector, ExactConnector, cartesian_join
from suricate.preutils import createmultiindex
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer as Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# Each entry is a (score name, transformer) pair. The names become the column
# labels of the score table built at the end of this cell.
scores = [
    # Vectorizer-based comparisons on the free-text fields.
    ('name_vecword', VectorizerConnector(on='name', analyzer='word', ngram_range=(1, 2))),
    ('name_vecchar', VectorizerConnector(on='name', analyzer='char', ngram_range=(1, 3))),
    ('street_vecword', VectorizerConnector(on='street', analyzer='word', ngram_range=(1, 2))),
    ('street_vecchar', VectorizerConnector(on='street', analyzer='char', ngram_range=(1, 3))),
    ('city_vecchar', VectorizerConnector(on='city', analyzer='char', ngram_range=(1, 3))),
    # Exact comparisons on the identifier-like fields.
    ('postalcode_exact', ExactConnector(on='postalcode')),
    ('duns_exact', ExactConnector(on='duns')),
    ('countrycode_exact', ExactConnector(on='countrycode')),
]

# Run every transformer on the [source, target] pair and stack the results column-wise.
transformer = FeatureUnion(transformer_list=scores)
score_matrix = transformer.fit_transform(X=Xst)
print(score_matrix.shape)

# Label the raw matrix: rows by the multi-index built from Xst, columns by score name.
ix_con = createmultiindex(X=Xst)
score_names = [name for name, _ in scores]
X_score = pd.DataFrame(data=score_matrix, index=ix_con, columns=score_names)
X_score.sample(4)

(10000, 8)


Unnamed: 0_level_0,Unnamed: 1_level_0,name_vecword,name_vecchar,street_vecword,street_vecchar,city_vecchar,postalcode_exact,duns_exact,countrycode_exact
ix_source,ix_target,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
e8c68ece,3cbe1b93,0.0,0.064568,0.0,0.067782,0.0,0.0,0.0,0.0
97571df1,ab00128c,0.054105,0.126078,0.0,0.038994,0.031912,0.0,0.0,1.0
0ad4aa46,ab364b25,0.0,0.149259,0.0,0.032903,0.020106,0.0,0.0,0.0
fe5db42f,7091755c,0.0,0.084757,0.0,0.06486,0.01197,0.0,0.0,0.0


## 3. Apply the Machine-Learning Model

In [35]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import cross_validate

For expediency, we use the ground-truth labels (`y_true`) that are already saved alongside the sample data.

In [17]:
# Keep only the saved labels whose index entries appear in ix_con,
# in the same order as the rows of the score table.
y_true = getytrue().loc[ix_con]
print('Number of data in training:{}'.format(len(y_true)))

Number of data in training:10000


### Make the pipeline

In [37]:
# Model: replace missing scores with 0, standardise each score column, project
# onto 3 principal components, then classify with a logistic regression whose
# regularisation is chosen by an inner 5-fold cross-validation.
pipe = Pipeline([
    ('imputer', Imputer(strategy='constant', fill_value=0)),
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=3)),
    ('classifier', LogisticRegressionCV(cv=5))
])

# Metrics reported for each of the 5 outer cross-validation folds.
scoring = ['precision', 'recall', 'accuracy']

# Stored as `cv_results`, not `scores`: `scores` already names the list of
# (name, transformer) pairs used to build X_score, and rebinding it to this dict
# would break any re-run of the line that derives the column names from it.
cv_results = cross_validate(estimator=pipe, X=X_score, y=y_true, scoring=scoring, cv=5)

# cross_validate returns one array per metric under the key 'test_<metric>';
# print the average over the folds.
for metric in scoring:
    print('{} score: {}'.format(metric, np.average(cv_results['test_' + metric])))

precision score: 0.96
recall score: 0.875
accuracy score: 0.9993000000000001


In [47]:
# Fit the pipeline on the whole score table, then predict for those same rows.
# Note: these are in-sample predictions (same X_score used for fit and predict).
pipe.fit(X=X_score, y=y_true)
predicted_labels = pipe.predict(X=X_score)
y_pred = pd.Series(data=predicted_labels, index=ix_con, name='y_pred')

In [48]:
# Side-by-side table of source and target records, indexed by (ix_source, ix_target);
# used below to look up the records behind each predicted pair.
X_sbs = cartesian_join(source=df_source, target=df_target).set_index(['ix_source', 'ix_target'])

In [50]:
# Index entries of the pairs the model labelled as a match.
positive_matches = y_pred.loc[y_pred == 1.0].index
print('Showing positive matches')

# Preview up to 5 of them side by side. The sample size is capped because
# DataFrame.sample raises ValueError when asked for more rows than exist,
# which can happen when few pairs are predicted positive (e.g. a small `nrows`).
positive_sbs = X_sbs.loc[positive_matches]
positive_sbs.sample(n=min(5, len(positive_sbs)))

Showing positive matches


Unnamed: 0_level_0,Unnamed: 1_level_0,name_source,street_source,city_source,postalcode_source,duns_source,countrycode_source,name_target,street_target,city_target,postalcode_target,duns_target,countrycode_target
ix_source,ix_target,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
c6401891,17ce75d1,dr ing k busch gmbh,1 schauinslandstr,maulburg,79689,,DE,k busch gmbh,1 schauinslandstr,maulburg,79689,317541456.0,DE
fe5db42f,666666f3,le joint francais,centre administratif,thure,86540,,FR,le joint francais,centre services partages,thure,86540,766251516.0,FR
2b6c43e5,71d680b3,industrie und,fasanenstr,berlin,10623,,DE,industrie und handelskammer,fasanenstr,berlin,10623,,DE
8b5d81b9,591099fe,nespresso deutschland gmbh,23 speditionstr,dusseldorf,40221,333868649.0,DE,nespresso deutschland gmbh,speditionsstrae,dusseldorf,40221,,DE
f02cb731,f02cb731,selex sensos and airborne systems,lg,edinburgh,28000,23226769.0,ES,selex sensos and airborne systems,lg,edinburgh,28000,23226769.0,ES


In [52]:
# Index entries of the pairs the model labelled as a non-match.
negative_matches = y_pred.loc[y_pred == 0.0].index
print('Showing negative matches')

# Preview up to 5 of them side by side. The sample size is capped because
# DataFrame.sample raises ValueError when asked for more rows than exist.
negative_sbs = X_sbs.loc[negative_matches]
negative_sbs.sample(n=min(5, len(negative_sbs)))

Showing negativematches


Unnamed: 0_level_0,Unnamed: 1_level_0,name_source,street_source,city_source,postalcode_source,duns_source,countrycode_source,name_target,street_target,city_target,postalcode_target,duns_target,countrycode_target
ix_source,ix_target,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
4c115719,afac4773,hamilton sundstrand aerospace,4747 harrison ave,rockford,61108-7929,,US,parcom gmbh,ewald renz str,bad schonborn,76669,,DE
150322b3,ab364b25,fako gmbh,peutestr,hamburg,20539,313518398.0,DE,c and g tool and cutter co ltd c and g tool an...,north road,yate,bs17 5lq,,GB
0908a0aa,6065ae26,selex es spa,4 piazza monte grappa,rome,195,,IT,eriks gmbh,bronninghauser str,bielefeld,33729,,DE
dcc308de,6065ae26,drei bond gmbh,17 carl zeiss ring,ismaning,85737,319143681.0,DE,eriks gmbh,bronninghauser str,bielefeld,33729,,DE
8c071814,f8db7942,ge aviation systems ltdcustomer services,evesham road,cheltenham,gl528sf,211207784.0,GB,citaku gbr,32 papenbreede,bad essen,49152,,DE
