# Step-by-step guide to finding potential matches with Elasticsearch
1. Load the data
2. Push the data to Elasticsearch
3. Create the first similarity matrix
4. Do further scoring and add new features to the similarity matrix
5. Train a machine-learning model on the data
 

## 1. Load the data

In [15]:
import numpy as np
import pandas as pd
from suricate.data.companies import getsource, gettarget, getytrue
# Row limit handed to the sample-data loaders (getsource / gettarget) via their `nrows` argument.
nrows = 100

In [16]:
# Load the source-side company records, report how many rows came back,
# and preview a random handful of them.
df_source = getsource(nrows=nrows)
print('Number of rows in source data:{}'.format(len(df_source.index)))
df_source.sample(n=5)

Number of rows in source data:100


Unnamed: 0_level_0,name,street,city,postalcode,duns,countrycode
ix,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1a6ccbe5,vri gmbh,wilhelm maybach str,ellwangen,73479,,DE
2f0901d9,mike garwood ltd,shelleys lane,alton,gu34,,GB
39082433,sinus electronic gmbh,schleifweg,untereisesheim,74257,,DE
77f5274a,selex es spa,via piemonte,rome,187,434003576.0,IT
591099fe,nespresso deutschland gmbh,speditionsstrae,dusseldorf,40221,,DE


In [None]:
# Load the target-side company records, report how many rows came back,
# and preview a random handful of them.
df_target = gettarget(nrows=nrows)
print('Number of rows in target data:{}'.format(len(df_target.index)))
df_target.sample(n=5)

## 2. Push the data to Elasticsearch

In [4]:
import elasticsearch
import time
from suricate.dbconnectors.esconnector import index_with_es

In [5]:
# Connect to Elasticsearch using the client's default connection settings.
esclient = elasticsearch.Elasticsearch()
es_indice = 'df_target'

# Set to False to reuse an index that a previous run already populated.
recreate_index = True
if recreate_index:
    # Drop any index left over from a previous run. "Index not found" is the
    # only error we expect and tolerate here; anything else (for example a
    # connection failure) must surface instead of being silently swallowed.
    try:
        esclient.indices.delete(index=es_indice)
    except elasticsearch.exceptions.NotFoundError:
        pass

    # Index settings and explicit field mapping for the target records.
    # NOTE(review): replicas cannot be allocated on a single-node cluster, so
    # with 5 replicas the index would stay "yellow" there -- confirm intended.
    request_body = {
        "settings": {
            "number_of_shards": 5,
            "number_of_replicas": 5
        },

        "mappings": {
            "_doc": {
                "properties": {
                    "ix": {"type": "keyword"},
                    "name": {"type": "text"},
                    "street": {"type": "text"},
                    "city": {"type": "text"},
                    "postalcode": {"type": "text"},
                    "countrycode": {"type": "keyword"}
                }
            }
        }
    }
    esclient.indices.create(index=es_indice, body=request_body)
    index_with_es(client=esclient, df=df_target, index=es_indice, ixname="ix", reset_index=True, doc_type='_doc')
    # Explicitly refresh the index so the freshly indexed documents are
    # visible to the count below, rather than sleeping for a fixed time and
    # hoping the periodic refresh has already happened.
    esclient.indices.refresh(index=es_indice)

# Sanity check: every target row must have ended up in the index.
catcount = esclient.count(index=es_indice)['count']
assert catcount == df_target.shape[0], (
    'Index {!r} holds {} docs but df_target has {} rows'.format(
        es_indice, catcount, df_target.shape[0]
    )
)
print('Number of docs in created index:{}'.format(catcount))

Number of docs in created index:3177


## 3. Create the first similarity matrix

In [6]:
from suricate.dbconnectors import EsConnector

In [7]:
# Scoring plan: for each column, the kind of comparison requested from the
# connector ('FreeText' or 'Exact').
scoreplan = {
    'name': {'type': 'FreeText'},
    'street': {'type': 'FreeText'},
    'city': {'type': 'FreeText'},
    'duns': {'type': 'Exact'},
    'postalcode': {'type': 'FreeText'},
    'countrycode': {'type': 'Exact'},
}

# Query the index that was populated in step 2. The index name was previously
# hard-coded to "right", which is not the index this notebook creates, so a
# fresh Restart-and-Run-All would query a non-existent (or stale) index.
escon = EsConnector(
    client=esclient,
    scoreplan=scoreplan,
    index=es_indice,
    explain=False,
    size=10
)

# Each row of Xst is a (source, target) candidate pair; its index is reused
# by the later steps to keep all tables aligned.
Xst = escon.fit_transform(X=df_source)
ix_con = Xst.index
print('Number of possible pairs:{}'.format(len(ix_con)))
print('Example of similarity matrix:')
Xst.sample(5)

Number of possible pairs:14440
Example of similarity matrix:


Unnamed: 0_level_0,Unnamed: 1_level_0,es_score,es_rank
ix_source,ix_target,Unnamed: 2_level_1,Unnamed: 3_level_1
6a0fa6cd,06045ef8,37.74788,3
d373c382,e7cee213,14.45892,6
f6ea706c,e268e671,40.241352,5
46a13500,eabdefc6,19.725716,8
f0ec3f43,c52d24d2,36.110096,4


In [8]:
# Ask the connector for the side-by-side table of the candidate pairs in
# ix_con, then preview a few random rows.
Xsbs = escon.getsbs(
    X=df_source,
    on_ix=ix_con,
)
print('Example of side-by-side view of source and target data')
Xsbs.sample(n=5)

Example of side-by-side view of source and target data


Unnamed: 0_level_0,Unnamed: 1_level_0,name_source,name_target,street_source,street_target,city_source,city_target,postalcode_source,postalcode_target,duns_source,duns_target,countrycode_source,countrycode_target
ix_source,ix_target,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
b2bd34fb,dba7dcb7,sport in,boerner und co,17 jesuitenstr,5 messerschmittstr,ingolstadt,ingolstadt,85049,85053,340898242.0,318154358.0,DE,DE
3967f071,e754a524,diagnostikzentrum am klinikum,diagnostikzentrum am klinikum,2 rantgen str,2 rantgen str,friedrichshafen,friedrichshafen,88048,88048,,,DE,DE
932b06e8,15e562a8,natixis factor starplast,factorem international visio,avenue winston churchill,avenue winston churchill,charenton le pont,charenton le pont,94220,94220,281985668.0,281985668.0,FR,FR
ed6089e2,691642b5,tropack packmittel gmbh,koller gmbh,6 gewerbestr,16 gewerbestr,lahnau,vohburg an der donau,35633,85088,329711691.0,,DE,DE
b911b552,9017414d,fhf gmbh,mobilfunk bremen handels gmbh,26 kap horn str,feuerkuhle,bremen,bremen,28237,28207,,328651005.0,DE,DE


## 4. Further scoring

In [12]:
from suricate.sbstransformers import SbsApplyComparator
from sklearn.pipeline import FeatureUnion

In [13]:
# One (feature name, transformer) pair per extra score. Each transformer
# compares a single column of the side-by-side table with the named comparator.
_sbs_score_list = [
    (label, SbsApplyComparator(on=column, comparator=kind))
    for label, column, kind in (
        ('name_fuzzy', 'name', 'simple'),
        ('street_fuzzy', 'street', 'simple'),
        ('name_token', 'name', 'token'),
        ('street_token', 'street', 'token'),
        ('city_fuzzy', 'city', 'simple'),
        ('postalcode_fuzzy', 'postalcode', 'simple'),
        ('postalcode_contains', 'postalcode', 'contains'),
    )
]

# Run all comparators and stack their outputs column-wise.
scorer_sbs = FeatureUnion(transformer_list=_sbs_score_list)

# Wrap the raw score matrix in a DataFrame aligned on the candidate-pair index,
# with one column per comparator, named after its label.
Xscores = pd.DataFrame(
    scorer_sbs.fit_transform(Xsbs),
    columns=[label for label, _ in _sbs_score_list],
    index=ix_con,
)
print('Additional scores')
Xscores.sample(n=3)

Additional scores


Unnamed: 0_level_0,Unnamed: 1_level_0,name_fuzzy,street_fuzzy,name_token,street_token,city_fuzzy,postalcode_fuzzy,postalcode_contains
ix_source,ix_target,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
f7da5ee1,995df284,0.62,0.41,0.44,0.41,1.0,0.6,0.0
86ca1988,653240ad,0.55,0.46,0.45,0.46,1.0,1.0,1.0
f9d94010,a98a2f74,0.45,0.48,0.41,0.48,0.31,0.4,0.0


Concatenate the Elasticsearch score (`es_score`) from step 3 with the additional scores computed above.

In [None]:
# Prepend the Elasticsearch relevance score to the comparator scores.
# The cell reassigns Xscores from itself, so any 'es_score' column left by a
# previous execution is dropped first: re-running the cell then yields the same
# table instead of accumulating duplicate 'es_score' columns.
Xscores = pd.concat(
    [Xst[['es_score']], Xscores.drop(columns=['es_score'], errors='ignore')],
    axis=1,
    ignore_index=False,
)
print('Final scoring table')
Xscores.sample(5)

## 5. Apply the machine-learning model

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_validate

For expediency, we will use the ground-truth labels (`y_true`) that have already been saved.

In [None]:
# Keep only the saved labels that belong to the candidate pairs in ix_con.
y_true = getytrue().loc[ix_con]
print('Number of data in training:{}'.format(len(y_true)))

### Make the pipeline

In [None]:
# Fixed seed so that the cross-validation scores below (and the model fitted
# later from this pipeline) are reproducible from one run to the next.
SEED = 42

# Impute missing scores with 0 -> normalise -> reduce to 4 components ->
# gradient-boosted classifier.
# NOTE(review): Normalizer rescales each *row* (sample), not each feature
# column; confirm that this, rather than a per-feature scaler, is intended
# for the step named 'Scaler'.
pipe = Pipeline(steps=[
    ('Impute', SimpleImputer(strategy='constant', fill_value=0)),
    ('Scaler', Normalizer()),
    ('PCA', PCA(n_components=4, random_state=SEED)),
    ('Predictor', GradientBoostingClassifier(n_estimators=500, random_state=SEED))
])

# 3-fold cross-validation; report the mean test score for each metric.
scoring = ['precision', 'recall', 'accuracy']
scores = cross_validate(estimator=pipe, X=Xscores, y=y_true, scoring=scoring, cv=3)
for c in scoring:
    print('{} score: {}'.format(c, np.average(scores['test_'+c])))

In [None]:
# Fit the pipeline on all scored pairs, then predict on those same pairs.
# The predictions are stored as a Series aligned on the candidate-pair index.
pipe.fit(Xscores, y_true)
y_pred = pd.Series(
    pipe.predict(Xscores),
    index=ix_con,
    name='y_pred',
)

In [None]:
# Index of the candidate pairs predicted as matches.
positive_matches = y_pred.loc[y_pred == 1.0].index
print('Showing positive matches')
# Show at most 5 random rows. Capping at the number of rows available avoids
# the ValueError that .sample(5) raises when fewer than 5 pairs are predicted
# positive.
positive_sbs = Xsbs.loc[positive_matches]
positive_sbs.sample(n=min(5, len(positive_sbs)))

In [None]:
# Index of the candidate pairs predicted as non-matches.
negative_matches = y_pred.loc[y_pred == 0.0].index
print('Showing negative matches')
# Show at most 5 random rows. Capping at the number of rows available avoids
# the ValueError that .sample(5) raises when fewer than 5 pairs are predicted
# negative.
negative_sbs = Xsbs.loc[negative_matches]
negative_sbs.sample(n=min(5, len(negative_sbs)))

