# 3 - Implementing an ML Model
1. Load the data
2. Create the similarity matrix
3. Apply the ML model
4. Visualize the matches

## 1. Load the data

In [1]:
import numpy as np
import pandas as pd
from suricate.data.companies import getsource, gettarget, getytrue
# Number of rows requested from each dataset; passed as `nrows` to
# getsource() and gettarget() in the cells below.
nrows = 100

In [2]:
# Load the source-side records, limited to `nrows` rows.
df_source = getsource(nrows=nrows)
print('Number of rows in source data:{}'.format(len(df_source)))
# Display a few randomly chosen rows to show what the columns look like.
df_source.sample(n=5)

Number of rows in source data:100


Unnamed: 0_level_0,name,street,city,postalcode,duns,countrycode
ix,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
8946d1cc,isr stephan walter,56 nahestr,rodgau,63110,,DE
35508103,hr kommunikation,max von mller s trasse,rottenburg,84056,340190931.0,DE
3c4c091c,industrie und,olgastr,ulm,89073,,DE
43cb5eea,kerb konus vertriebs gmbh,7 werner von braun str,amberg,92224,,DE
83024ff7,battery direct gmbh,1 ewald renz str,bad schonborn,76669,331599808.0,DE


In [3]:
# Load the target-side records, limited to `nrows` rows.
df_target = gettarget(nrows=nrows)
print('Number of rows in target data:{}'.format(len(df_target)))
# Display a few randomly chosen rows to show what the columns look like.
df_target.sample(n=5)

Number of rows in target data:100


Unnamed: 0_level_0,name,street,city,postalcode,duns,countrycode
ix,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
f8db7942,citaku gbr,32 papenbreede,bad essen,49152,,DE
0b9845b5,dymacon gmbh,96 rheinstrae,darmstadt,64295,388046799.0,DE
71d680b3,industrie und handelskammer,fasanenstr,berlin,10623,,DE
f3020ce6,linde technische gase gmbh,70 seitnerstr,pullach,82049,329252832.0,DE
94c93a16,csb battery store,14 badstr,neumarkt in der oberpfalz,92318,,DE


In [4]:
# Bundle both frames as a [source, target] list; this is the input handed to
# the similarity transformer and to createmultiindex() in the next section.
Xst = [df_source, df_target]

## 2. Create the similarity matrix

In [5]:
from suricate.dftransformers import DfApplyComparator, VectorizerConnector, ExactConnector, cartesian_join
from suricate.preutils import createmultiindex
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer as Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# One (column name, connector) entry per similarity feature. The names are
# reused further down as the column labels of the score table.
scores = [
    ('name_vecword', VectorizerConnector(on='name', analyzer='word', ngram_range=(1, 2))),
    ('name_vecchar', VectorizerConnector(on='name', analyzer='char', ngram_range=(1, 3))),
    ('street_vecword', VectorizerConnector(on='street', analyzer='word', ngram_range=(1, 2))),
    ('street_vecchar', VectorizerConnector(on='street', analyzer='char', ngram_range=(1, 3))),
    ('city_vecchar', VectorizerConnector(on='city', analyzer='char', ngram_range=(1, 3))),
    ('postalcode_exact', ExactConnector(on='postalcode')),
    ('duns_exact', ExactConnector(on='duns')),
    ('countrycode_exact', ExactConnector(on='countrycode')),
]

# FeatureUnion runs every connector on the [source, target] pair and places
# their outputs next to each other, one column per entry in `scores`.
transformer = FeatureUnion(transformer_list=scores)
X_score = transformer.fit_transform(X=Xst)
print(X_score.shape)

# Wrap the raw scores in a labelled frame: rows are indexed by `ix_con`
# (presumably one (ix_source, ix_target) label per compared pair -- confirm
# against createmultiindex), columns carry the feature names.
ix_con = createmultiindex(X=Xst)
X_score = pd.DataFrame(
    data=X_score,
    index=ix_con,
    columns=[name for name, _ in scores],
)
X_score.sample(n=4)

(10000, 8)


Unnamed: 0_level_0,Unnamed: 1_level_0,name_vecword,name_vecchar,street_vecword,street_vecchar,city_vecchar,postalcode_exact,duns_exact,countrycode_exact
ix_source,ix_target,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
83024ff7,d56af94b,0.04029,0.141152,0.0,0.03154,0.067234,0.0,0.0,1.0
3c4c091c,25d6d84f,0.0,0.042746,0.0,0.03362,0.02254,0.0,0.0,1.0
43cb5eea,7554a1bb,0.0326,0.132317,0.0,0.109436,0.066043,0.0,0.0,1.0
f7da5ee1,eef729a2,0.04418,0.217567,0.0,0.031334,0.141088,0.0,0.0,1.0


## 3. Apply the Machine-Learning Model

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import cross_validate

### 3.1 Load y_true
For expediency, we will use the `y_true` labels that have already been saved.
`y_true` is the supervised (labelled) data.

In [7]:
# Restrict the saved labels to the pairs indexed by `ix_con`, i.e. the rows
# of the similarity table built above.
y_true = getytrue().loc[ix_con]
print('Number of data in training:{}'.format(len(y_true)))

Number of data in training:10000


### 3.2 Make the pipeline

In [8]:
# Model: replace missing scores with 0, standardise each feature, project onto
# 3 principal components, then classify with a logistic regression whose
# regularisation is chosen by an inner 5-fold cross-validation.
pipe = Pipeline([
    ('imputer', Imputer(strategy='constant', fill_value=0)),
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=3)),
    ('classifier', LogisticRegressionCV(cv=5))
])

# Metrics reported for each of the 5 outer cross-validation folds.
scoring = ['precision', 'recall', 'accuracy']

# The results get their own name: `scores` already holds the list of
# (name, connector) pairs that defines the similarity features, and
# overwriting it with this dict would silently change what that name means
# for any cell run afterwards.
cv_scores = cross_validate(estimator=pipe, X=X_score, y=y_true, scoring=scoring, cv=5)

# cross_validate stores per-fold test results under 'test_<metric>'; report
# the mean across folds.
for c in scoring:
    print('{} score: {}'.format(c, np.average(cv_scores['test_'+c])))

precision score: 0.96
recall score: 0.875
accuracy score: 0.9993000000000001


In [9]:
# Fit on every labelled pair, then predict on those same pairs (in-sample
# predictions). The result keeps the `ix_con` index so it can be used to
# look pairs up in other frames.
pipe.fit(X_score, y_true)
y_pred = pd.Series(pipe.predict(X_score), index=ix_con, name='y_pred')

## 4. Visualize the matches

In [10]:
# Side-by-side table of source/target record pairs, used below to inspect the
# predictions; in the displayed output each attribute appears twice, with a
# `_source` and a `_target` suffix.
X_sbs = cartesian_join(source=df_source, target=df_target)

### 4.1. Positive matches (matching pairs)

In [11]:
# Index labels of the pairs the model predicts as matches.
positive_matches = y_pred.loc[y_pred == 1.0].index
print('Showing positive matches')
# Predicted matches are rare, so cap the sample size at the number available:
# DataFrame.sample raises ValueError when asked for more rows than exist.
X_sbs.loc[positive_matches].sample(min(5, len(positive_matches)))

Showing positive matches


Unnamed: 0_level_0,Unnamed: 1_level_0,name_source,name_target,street_source,street_target,city_source,city_target,postalcode_source,postalcode_target,duns_source,duns_target,countrycode_source,countrycode_target
ix_source,ix_target,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
8c071814,cce26ebb,ge aviation systems ltdcustomer services,ge aviation,evesham road,evesham road,cheltenham,cheltenham,gl528sf,gl52 8sf,211207784.0,288525181.0,GB,GB
7a037111,69ef33b5,dr ursula matschke,dr ursula matschke,33 emilienstr,33 emilienstr,stuttgart,stuttgart,70563,70563,,,DE,DE
f02cb731,f02cb731,selex sensos and airborne systems,selex sensos and airborne systems,lg,lg,edinburgh,edinburgh,28000,28000,23226769.0,23226769.0,ES,ES
f0d34671,253ce464,hamilton sundstrand,hamilton sundstrand,cl4747 harrison ave,4747 harrison ave,rockford,rockford,61125,61108-7929,,51079937.0,US,US
f7da5ee1,484866fb,e a elektro automatik gmbh,ea elektro automatik,31 37 helmholtzstr,31 37 helmholtzstr,viersen,viersen,41747,41747,344304634.0,330862074.0,DE,DE


### 4.2. Negative matches (non-matching pairs)

In [12]:
# Index labels of the pairs the model predicts as non-matches.
negative_matches = y_pred.loc[y_pred == 0.0].index
print('Showing negativematches')
# Cap the sample size at the number of rows available so the cell still runs
# when `nrows` is very small: DataFrame.sample raises ValueError when asked
# for more rows than exist.
X_sbs.loc[negative_matches].sample(min(5, len(negative_matches)))

Showing negativematches


Unnamed: 0_level_0,Unnamed: 1_level_0,name_source,name_target,street_source,street_target,city_source,city_target,postalcode_source,postalcode_target,duns_source,duns_target,countrycode_source,countrycode_target
ix_source,ix_target,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
7a037111,ab00128c,dr ursula matschke,carney gmbh,33 emilienstr,25 vor dem wald,stuttgart,wutha farnroda,70563,99848,,342656493.0,DE,DE
68ba9560,3cbe1b93,frey blumenhof,maximilian wigger suttner,mittenheimer str,jahnstr,oberschleissheim,ulm,85764,89073,342418069.0,,DE,DE
0297485a,24273c60,gross karlheinz,general dynamics uk,37 ulmenweg,road,zweibrucken,st leonards,66482,tn38 0hw,,233415103.0,DE,GB
9438b5e4,975bdeab,dichtelemente arcus gmbh,mce computer gmbh,6 gromoor ring,14 lerchenstr,hamburg,munich,21079,80995,313821894.0,343582987.0,DE,DE
1d2e777b,501ad89f,citaku gbr,ge aviation systems ltd,32 papenbreede,cheltenham,bad essen,cheltenham,49152,gl528sf,,,DE,GB
