# Comparing Levenshtein distance and Tf-Idf String similarity

#### Libraries needed

In [27]:
import os

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.pipeline import make_union, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression as Classifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans as Cluster
plt.style.use('fivethirtyeight')

from suricate.lrdftransformers import VectorizerConnector, ExactConnector, FuzzyConnector, VisualHelper
from suricate.preutils import createmultiindex
from suricate.preutils.scores import scores
from suricate.lrdftransformers.cluster import ClusterQuestions, ClusterClassifier
from suricate.pipeline import PipeSbsClf, PruningLrSbsClf, PipeLrClf
from suricate.sbsdftransformers import FuncSbsComparator

#### Load the data

In [33]:
# --- Configuration ---------------------------------------------------------
# Maximum number of rows read from each side (passed as `nrows` below); the
# number of candidate pairs is the product of the two row counts.
n_lines = 300
# Seed for the preview sample so the displayed rows are stable across re-runs.
sample_seed = 0
# Folder holding left.csv, right.csv and trainingdata.csv. Set the
# SURICATE_DATA_DIR environment variable to point at another copy of the data
# instead of editing the notebook; the fallback is the original author's
# local checkout.
data_dir = (
    os.environ.get('SURICATE_DATA_DIR')
    or '/Users/paulogier/81-GithubPackages/suricate/suricate/data'
)
filepath_left = os.path.join(data_dir, 'left.csv')
filepath_right = os.path.join(data_dir, 'right.csv')
filepath_training = os.path.join(data_dir, 'trainingdata.csv')

# --- Load the two record sets (every column read as string) -----------------
left = pd.read_csv(filepath_left, index_col=0, nrows=n_lines, dtype=str)
right = pd.read_csv(filepath_right, index_col=0, nrows=n_lines, dtype=str)
df_X = [left, right]

# --- Load the labels, indexed by (ix_left, ix_right) ------------------------
y_labelled = (
    pd.read_csv(filepath_training, usecols=['ix_left', 'ix_right', 'y_true'])
    .set_index(['ix_left', 'ix_right'])['y_true']
)
# Keep only the labelled pairs whose index also appears in the pair index
# built from the (truncated) left/right frames.
y_true = y_labelled.loc[
    y_labelled.index.intersection(createmultiindex(X=df_X))
]

n_possible_pairs = left.shape[0] * right.shape[0]
print('Two datasets of size {} and {}  rows yield:\n {} possible pairs to scan --> manually exhausting'.format(left.shape[0], right.shape[0], n_possible_pairs))
# Seeded preview of the left-hand records.
left.sample(5, random_state=sample_seed)

Two datasets of size 300 and 300  rows yield:
 90000 possible pairs to scan --> manually exhausting


Unnamed: 0_level_0,name,street,city,postalcode,duns,countrycode
ix,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
b3d57f9d-db16-42b5-ac05-44a27f15fc97,aeroflex ltd,monks brook industrial park,chandler s ford,so534ra,,GB
e20bbf3a-874b-4e01-ada5-e5913460a059,fhf gmbh,waterbergstrae 11,bremen,28237,328761531.0,DE
de51d1e5-d11b-4304-af25-bb43649d5b87,von roll uk limited,wharfedale road,bradford,bd4 6sg,21309974.0,GB
212fd244-f47c-4241-8555-b67ee2e7319a,buerklin ohg,245 hherweg,dusseldorf,40231,325678258.0,DE
07587bc5-5794-407f-8150-002c338d1e47,wuerttembergische allplastik,12 johannes kepler str,herrenberg,71083,,DE


# How do the string comparators perform?

#### The framework is fully compatible with open-source Scikit-Learn Machine Learning libraries

In [16]:
# Three alternative comparators, all scoring the same column of the two frames.
compared_column = 'name'
ngram_sizes = (1, 2)  # same ngram_range for both vectorizer-based comparators

# t1 / t2: vectorizer-based comparators, tokenising by word and by character.
t1 = VectorizerConnector(on=compared_column, analyzer='word', ngram_range=ngram_sizes)
t2 = VectorizerConnector(on=compared_column, analyzer='char', ngram_range=ngram_sizes)
# t3: fuzzy comparator (presumably the Levenshtein-style score named in the
# notebook title -- TODO confirm against suricate's FuzzyConnector).
t3 = FuzzyConnector(on=compared_column)

In [17]:
%%timeit
y1=t1.fit_transform(X=df_X)

119 ms ± 41 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [18]:
%%timeit
y2 = t2.fit_transform(X=df_X)

The slowest run took 5.32 times longer than the fastest. This could mean that an intermediate result is being cached.
280 ms ± 142 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [19]:
%%timeit
y3 = t3.fit_transform(X=df_X)

2.2 s ± 257 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# Make predictions using the training data

In [34]:
# Pipeline 1: word n-gram comparator (t1) feeding the classifier.
p1 = PipeLrClf(transformer=t1, classifier=Classifier())

# Fit with the labelled pairs, then wrap the predictions in a Series keyed by
# the pair index built from the two frames.
# NOTE(review): the pipeline is fitted with y_true and scored against the same
# y_true below, so these look like in-sample scores -- confirm.
pred_word = p1.fit_predict(X=df_X, y=y_true)
pairs_word = createmultiindex(X=df_X)
y1_pred = pd.Series(data=pred_word, index=pairs_word)

print(scores(y_true=y_true, y_pred=y1_pred))



{'precision': 0.7572815533980582, 'recall': 0.6782608695652174, 'f1': 0.7155963302752293, 'accuracy': 0.8534278959810875, 'balanced_accuracy': 0.798546019198193}


In [35]:
# Pipeline 2: character n-gram comparator (t2) feeding the classifier.
p2 = PipeLrClf(transformer=t2, classifier=Classifier())

# Fit with the labelled pairs, then wrap the predictions in a Series keyed by
# the pair index built from the two frames.
# NOTE(review): the pipeline is fitted with y_true and scored against the same
# y_true below, so these look like in-sample scores -- confirm.
pred_char = p2.fit_predict(X=df_X, y=y_true)
pairs_char = createmultiindex(X=df_X)
y2_pred = pd.Series(data=pred_char, index=pairs_char)

print(scores(y_true=y_true, y_pred=y2_pred))



{'precision': 0.6880733944954128, 'recall': 0.6521739130434783, 'f1': 0.6696428571428572, 'accuracy': 0.8250591016548463, 'balanced_accuracy': 0.770892151326934}


In [36]:
# Pipeline 3: fuzzy comparator (t3) feeding the classifier.
p3 = PipeLrClf(transformer=t3, classifier=Classifier())

# Fit with the labelled pairs, then wrap the predictions in a Series keyed by
# the pair index built from the two frames.
# NOTE(review): the pipeline is fitted with y_true and scored against the same
# y_true below, so these look like in-sample scores -- confirm.
pred_fuzzy = p3.fit_predict(X=df_X, y=y_true)
pairs_fuzzy = createmultiindex(X=df_X)
y3_pred = pd.Series(data=pred_fuzzy, index=pairs_fuzzy)

print(scores(y_true=y_true, y_pred=y3_pred))



{'precision': 0.696078431372549, 'recall': 0.6173913043478261, 'f1': 0.6543778801843319, 'accuracy': 0.8226950354609929, 'balanced_accuracy': 0.7583709768492377}
