# String similarity: comparing Levenshtein distance and Tf-Idf Vectorizer methods

String similarity (Comparing 'hello world' and 'hello wolrd') is an important component of deduplication.    
This workbook makes use of the flexible structure of the suricate package to compare two methods:
- One using the classic Levenshtein distance
- The other one tokenizes the strings into n-grams (of either characters or words) and then compares them using the cosine similarity

#### Libraries needed

In [1]:
# Standard libraries for data science + data visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('fivethirtyeight')
from sklearn.cluster import KMeans as Cluster
from sklearn.pipeline import make_union, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression as Classifier
from sklearn.preprocessing import RobustScaler as Scaler

# Suricate dedicated methods
from suricate.lrdftransformers import VectorizerConnector, FuzzyConnector
from suricate.preutils import createmultiindex
from suricate.preutils.scores import scores
from suricate.pipeline import PipeLrClf

#### Load the data

In [2]:
# NOTE(review): n_lines is not passed to the loaders below, so it currently has
# no effect — confirm whether it was meant to cap the number of rows read.
n_lines = 100
from suricate.data.companies import getXlr, getytrue
df_X = getXlr()
y_true = getytrue()
# `left` and `right` were used below without ever being defined, which raises a
# NameError on a fresh kernel. Unpacking also fails loudly if df_X is not a pair.
# Assumes getXlr() returns the two frames as [left, right] — TODO confirm against suricate.
left, right = df_X
n_possible_pairs = left.shape[0] * right.shape[0]
print(
    'Two datasets of size {} and {}  rows yield:\n {} possible pairs to scan --> manually exhausting'.format(
        left.shape[0], right.shape[0], n_possible_pairs
    )
)
# Last expression of the cell: display a random sample of five rows of the left table.
left.sample(5)

FileNotFoundError: [Errno 2] File b'/Users/paulogier/anaconda3/envs/wookie_env/lib/python3.7/site-packages/suricate/data/csv_company/left.csv' does not exist: b'/Users/paulogier/anaconda3/envs/wookie_env/lib/python3.7/site-packages/suricate/data/csv_company/left.csv'

# How do the string comparators perform?

#### The framework is fully compatible with open-source Scikit-Learn Machine Learning libraries

In [None]:
# Three comparators, all applied to the 'name' column.
# t1: vectorizer with word-level tokens, n-grams of length 1 to 2.
t1 = VectorizerConnector(
    on='name',
    analyzer='word',
    ngram_range=(1, 2),
)
# t2: same vectorizer, but with character-level tokens.
t2 = VectorizerConnector(
    on='name',
    analyzer='char',
    ngram_range=(1, 2),
)
# t3: fuzzy connector, left at its default settings.
t3 = FuzzyConnector(on='name')

In [None]:
%%timeit
y1=t1.fit_transform(X=df_X)

In [None]:
%%timeit
# Time fit_transform for the character n-gram vectorizer (t2) on the loaded data.
# NOTE(review): names assigned under %%timeit are presumably not kept in the
# notebook namespace — confirm before relying on y2 in a later cell.
y2 = t2.fit_transform(X=df_X)

In [None]:
%%timeit
# Time fit_transform for the fuzzy connector (t3) on the loaded data.
# NOTE(review): names assigned under %%timeit are presumably not kept in the
# notebook namespace — confirm before relying on y3 in a later cell.
y3 = t3.fit_transform(X=df_X)

# Make predictions using the training data

In [None]:
# Pipeline 1: word n-gram vectorizer (t1) feeding a logistic regression.
p1 = PipeLrClf(
    transformer=t1,
    classifier=Classifier(),
)
# Fit and predict on the same data, then label each prediction with the
# index built by createmultiindex from the input.
y1_pred = pd.Series(data=p1.fit_predict(X=df_X, y=y_true), index=createmultiindex(X=df_X))
# Compare the predictions against the known labels.
print(scores(y_true=y_true, y_pred=y1_pred))

In [None]:
# Pipeline 2: character n-gram vectorizer (t2) feeding a logistic regression.
p2 = PipeLrClf(
    transformer=t2,
    classifier=Classifier(),
)
# Fit and predict on the same data, then label each prediction with the
# index built by createmultiindex from the input.
y2_pred = pd.Series(data=p2.fit_predict(X=df_X, y=y_true), index=createmultiindex(X=df_X))
# Compare the predictions against the known labels.
print(scores(y_true=y_true, y_pred=y2_pred))

In [None]:
# Pipeline 3: fuzzy connector (t3) feeding a logistic regression.
p3 = PipeLrClf(
    transformer=t3,
    classifier=Classifier(),
)
# Fit and predict on the same data, then label each prediction with the
# index built by createmultiindex from the input.
y3_pred = pd.Series(data=p3.fit_predict(X=df_X, y=y_true), index=createmultiindex(X=df_X))
# Compare the predictions against the known labels.
print(scores(y_true=y_true, y_pred=y3_pred))