# Step - by - step Guide to finding potential matches
1. Prepare the data
2. Push the data to Elastic Search
3. Create the first similarity matrix
4. Use the Explorer to label representative sample of the data
5. Do further scoring and add new features to the similarity matrix
6. Train a machine learning model on the data
 

## 1. Prepare the data

In [38]:
import numpy as np
import pandas as pd
from suricate.data.companies import getsource, gettarget, getytrue

In [22]:
df_source_raw=getsource(nrows=None)
df_target_raw = gettarget(nrows=None)
print(df_source_raw.shape[0])
print(df_target_raw.shape[0])

1444
3177


In [23]:
def prepare_source(df):
    """

    Args:
        df:

    Returns:
        pd.DataFrame
    """
    df2 = df
    return df2

def prepare_target(df):
    """

    Args:
        df:

    Returns:
        pd.DataFrame
    """
    df2 = df
    return df2

df_source = prepare_source(df_source_raw)
df_target = prepare_target(df_target_raw)

## 2. Push the data to Elastic Search

In [24]:
import elasticsearch
import pandas as pd
import time
from suricate.dbconnectors.esconnector import index_with_es

In [25]:
esclient = elasticsearch.Elasticsearch()
es_indice = 'df_target'
if True:
    try:
        esclient.indices.delete(index=es_indice)
    except:
        pass
    request_body = {
        "settings": {
            "number_of_shards": 5,
            "number_of_replicas": 5
        },

        "mappings": {
            "_doc": {
                "properties": {
                    "ix": {"type": "keyword"},
                    "name": {"type": "text"},
                    "street": {"type": "text"},
                    "city": {"type": "text"},
                    "postalcode": {"type": "text"},
                    "countrycode": {"type": "keyword"}
                }
            }
        }
    }
    esclient.indices.create(index=es_indice, body=request_body)
    index_with_es(client=esclient, df=df_target, index=es_indice, ixname="ix", reset_index=True, doc_type='_doc')
    time.sleep(5)

catcount = esclient.count(index=es_indice)['count']
assert catcount == df_target.shape[0]
print(catcount)

3177


## 3. Create the first similarity matrix

In [26]:
from suricate.dbconnectors import EsConnector

In [27]:
scoreplan = {
        'name': {
            'type': 'FreeText'
        },
        'street': {
            'type': 'FreeText'
        },
        'city': {
            'type': 'FreeText'
        },
        'duns': {
            'type': 'Exact'
        },
        'postalcode': {
            'type': 'FreeText'
        },
        'countrycode': {
            'type': 'Exact'
        }
    }
escon = EsConnector(
    client=esclient,
    scoreplan=scoreplan,
    index="right",
    explain=False,
    size=10
)
Xst = escon.fit_transform(X=df_source)
ix = Xst.index
Xsbs = escon.getsbs(X=df_source, on_ix=ix)
print(Xsbs.shape[0])
print(Xsbs.sample(5))

14440
                                      name_source  \
ix_source ix_target                                 
9727627d  9727627d   ahc oberflaechentechnik gmbh   
2253b32d  5e587c49                         wtd 81   
d22b2cb0  c9616f5e        zentro elektrik gmbh kg   
dcecaed1  dcecaed1   hubert waltermann eisenwaren   
3b4a9463  fc5ad975           pratt whitney canada   

                                      name_target              street_source  \
ix_source ix_target                                                            
9727627d  9727627d   ahc oberflaechentechnik gmbh         2 2 zillenhardtstr   
2253b32d  5e587c49                      fath gmbh                    bergstr   
d22b2cb0  c9616f5e                    elektro rhl              7 hahen strae   
dcecaed1  dcecaed1   hubert waltermann eisenwaren                  4 rat loh   
3b4a9463  fc5ad975                  pratt whitney  4 dr ernst zimmermann str   

                          street_target   city_source        

4. Further scoring

In [18]:
from suricate.sbsdftransformers import FuncSbsComparator
from sklearn.pipeline import FeatureUnion

In [29]:
_sbs_score_list = [
    ('name_fuzzy', FuncSbsComparator(on='name', comparator='fuzzy')),
    ('street_fuzzy', FuncSbsComparator(on='street', comparator='fuzzy')),
    ('name_token', FuncSbsComparator(on='name', comparator='token')),
    ('street_token', FuncSbsComparator(on='street', comparator='token')),
    ('city_fuzzy', FuncSbsComparator(on='city', comparator='fuzzy')),
    ('postalcode_fuzzy', FuncSbsComparator(on='postalcode', comparator='fuzzy')),
    ('postalcode_contains', FuncSbsComparator(on='postalcode', comparator='contains'))
]
scorer_sbs = FeatureUnion(transformer_list=_sbs_score_list)
Xscores = pd.DataFrame(
    data=scorer_sbs.fit_transform(X=Xsbs),
    index=Xsbs.index,
    columns=[c[0] for c in _sbs_score_list]
)
print(Xscores.sample(3))

                     name_fuzzy  street_fuzzy  name_token  street_token  \
ix_source ix_target                                                       
a7f73d0b  12c8876d         0.21          0.91        0.21          0.91   
f02cb731  3dd3b43c         0.83          0.00        0.83          0.00   
c42aecca  56a3da87         1.00          0.97        1.00          0.97   

                     city_fuzzy  postalcode_fuzzy  postalcode_contains  
ix_source ix_target                                                     
a7f73d0b  12c8876d         0.15               0.2                  0.0  
f02cb731  3dd3b43c         0.18               0.0                  0.0  
c42aecca  56a3da87         1.00               1.0                  1.0  


Concatenate with the scores from the previous step

In [32]:
Xscores = pd.concat([Xst[['es_score']], Xscores], axis=1, ignore_index=False)
print(Xscores.shape)
print(Xscores.columns)
Xscores.sample(5)

(14440, 8)
Index(['es_score', 'name_fuzzy', 'street_fuzzy', 'name_token', 'street_token',
       'city_fuzzy', 'postalcode_fuzzy', 'postalcode_contains'],
      dtype='object')


Unnamed: 0_level_0,Unnamed: 1_level_0,es_score,name_fuzzy,street_fuzzy,name_token,street_token,city_fuzzy,postalcode_fuzzy,postalcode_contains
ix_source,ix_target,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
9e14e359,8b97a32c,14.251021,0.15,0.56,0.07,0.56,1.0,0.62,0.0
273ccd4e,f807543d,16.240355,0.34,0.23,0.38,0.23,1.0,1.0,1.0
720277b6,22763bda,8.347393,0.0,0.37,0.0,0.37,0.3,0.8,0.0
e6f50811,782a5b93,49.309097,0.13,1.0,0.13,1.0,1.0,1.0,1.0
f7da5ee1,dcb02beb,10.689902,0.41,0.19,0.26,0.19,1.0,0.6,0.0


## 6. Apply the machine-learning model

In [33]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_validate

for expediency, we will use the y_true already saved

In [40]:
ix_con = Xst.index
y_true = getytrue().loc[ix_con]
print(y_true.shape[0])

14440


### Make the pipeline

In [43]:
pipe = Pipeline(steps=[
    ('Impute', SimpleImputer(strategy='constant', fill_value=0)),
    ('Scaler', Normalizer()),
    ('PCA', PCA(n_components=4)),
    ('Predictor', GradientBoostingClassifier(n_estimators=500))
])
scoring = ['precision', 'recall', 'accuracy']
scores = cross_validate(estimator=pipe, X=Xscores, y=y_true, scoring=scoring, cv=3)
for c in scoring:
    print(pd.datetime.now(), ' | {} score: {}'.format(c, np.average(scores['test_'+c])))

2020-04-19 20:57:28.254170  | precision score: 0.9086327781758312
2020-04-19 20:57:28.254358  | recall score: 0.8598840163811916
2020-04-19 20:57:28.254449  | accuracy score: 0.9556102858455898


In [46]:
pipe.fit(X=Xscores, y=y_true)
y_pred = pd.Series(data=pipe.predict(X=Xscores), index=ix_con, name='y_pred')
positive_matches = y_pred.loc[y_pred == 1.0].index
print(Xsbs.loc[positive_matches]).sample(5)

                                                           name_source  \
ix_source ix_target                                                      
1a6ccbe5  7878c231                                            vri gmbh   
b3d57f9d  fbba638f                                        aeroflex ltd   
          1bf66a20                                        aeroflex ltd   
          b3d57f9d                                        aeroflex ltd   
fe5db42f  666666f3                                   le joint francais   
1b69ccb6  6602b62d                          nespresso deutschland gmbh   
          591099fe                          nespresso deutschland gmbh   
          1b69ccb6                          nespresso deutschland gmbh   
          1bc1ab2a                          nespresso deutschland gmbh   
          8e71eb0a                          nespresso deutschland gmbh   
          8b5d81b9                          nespresso deutschland gmbh   
303a4b5d  69e3533d                    

AttributeError: 'NoneType' object has no attribute 'sample'