In [1]:
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [2]:
import time
import re
import pandas as pd
import numpy as np
import jellyfish as jf
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

In [3]:
df = pd.read_csv('ml_dataset.csv')
df

Unnamed: 0,Original Name,Test Case Name,Test Type,Label
0,JAM'YAH TA'AWUN AL-ISLAMIA,JAM'YAH TA'AWUN AL-ISLAMIA,Fuzzy Match,Match
1,"GLOBOVISION TELE CA, CORP.","GLOBOVISION TELE CA, CORP.",Fuzzy Match,Match
2,KOREAN PEOPLE'S ARMY,KOREAN PEOPLE'S ARMY,Fuzzy Match,Match
3,RANCHO LA HERRADURA,RANCHO LA HERRADURA,Fuzzy Match,Match
4,"TECHNO ENERGY, S.A.","TECHNO ENERGY, S.A.",Fuzzy Match,Match
...,...,...,...,...
18503,GLOBAL AGE LIMITED,"SHAABAN, Bouthaina",Non-Match,Not Match
18504,OBSHCHESTVO S OGRANICHENNOI OTVETSTVENNOSTYU M...,TAMIN KALAYE SABZ ARAS COMPANY,Non-Match,Not Match
18505,"AHMED, Qassim Abdullah Ali",KOREA HAEGUMGANG TRADING CORPORATION,Non-Match,Not Match
18506,JINGHO TECHNOLOGY CO. LIMITED,COMITE' DE BIENFAISANCE ET DE SECOURS AUX PALE...,Non-Match,Not Match


# Feature engineering

In [4]:
def matching_numbers(original_name, test_case_name):

    original_numbers = set(re.findall(r'[0-9]+', original_name))
    test_case_numbers = set(re.findall(r'[0-9]+', test_case_name))    
    union = original_numbers.union(test_case_numbers)
    intersection = original_numbers.intersection(test_case_numbers)

    if len(original_numbers)==0 and len(test_case_numbers) == 0:
        return 1
    else:
        return (len(intersection)/ len(union))

In [5]:
def engineer_features(df):

    df['Test Case Name'] = df['Test Case Name'].str.lower()
    df['Original Name'] = df['Original Name'].str.lower()

    df['levenshtein_distance'] = df.apply(
    lambda x: jf.levenshtein_distance(x['Original Name'], 
                                      x['Test Case Name']), axis=1)

    df['damerau_levenshtein_distance'] = df.apply(
    lambda x: jf.damerau_levenshtein_distance(x['Original Name'], 
                                              x['Test Case Name']), axis=1)

    df['hamming_distance'] = df.apply(
    lambda x: jf.hamming_distance(x['Original Name'], 
                                  x['Test Case Name']), axis=1)

    df['jaro_similarity'] = df.apply(
    lambda x: jf.jaro_similarity(x['Original Name'], 
                                  x['Test Case Name']), axis=1)

    df['jaro_winkler_similarity'] = df.apply(
    lambda x: jf.jaro_winkler_similarity(x['Original Name'], 
                                         x['Test Case Name']), axis=1)

    df['match_rating_comparison'] = df.apply(
    lambda x: jf.match_rating_comparison(x['Original Name'], 
                                         x['Test Case Name']), axis=1).fillna(0).astype(int)

    df['ratio'] = df.apply(
    lambda x: fuzz.ratio(x['Original Name'], 
                         x['Test Case Name']), axis=1)

    df['partial_ratio'] = df.apply(
    lambda x: fuzz.partial_ratio(x['Original Name'], 
                                 x['Test Case Name']), axis=1)

    df['token_sort_ratio'] = df.apply(
    lambda x: fuzz.token_sort_ratio(x['Original Name'], 
                                    x['Test Case Name']), axis=1)

    df['token_set_ratio'] = df.apply(
    lambda x: fuzz.token_set_ratio(x['Original Name'], 
                                   x['Test Case Name']), axis=1)

    df['w_ratio'] = df.apply(
    lambda x: fuzz.WRatio(x['Original Name'], 
                          x['Test Case Name']), axis=1)

    df['uq_ratio'] = df.apply(
    lambda x: fuzz.UQRatio(x['Original Name'], 
                          x['Test Case Name']), axis=1)

    df['q_ratio'] = df.apply(
    lambda x: fuzz.QRatio(x['Original Name'], 
                          x['Test Case Name']), axis=1)    

    df['matching_numbers'] = df.apply(
    lambda x: matching_numbers(x['Original Name'], 
                               x['Test Case Name']), axis=1)

    df['matching_numbers_log'] = (df['matching_numbers']+1).apply(np.log)

    df['log_fuzz_score'] = (df['ratio'] + df['partial_ratio'] + 
                            df['token_sort_ratio'] + df['token_set_ratio']).apply(np.log)

    df['log_fuzz_score_numbers'] = df['log_fuzz_score'] + (df['matching_numbers']).apply(np.log)

    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.fillna(value=0, inplace=True)

    return df

In [6]:
df = engineer_features(df)
df

Unnamed: 0,Original Name,Test Case Name,Test Type,Label,levenshtein_distance,damerau_levenshtein_distance,hamming_distance,jaro_similarity,jaro_winkler_similarity,match_rating_comparison,...,partial_ratio,token_sort_ratio,token_set_ratio,w_ratio,uq_ratio,q_ratio,matching_numbers,matching_numbers_log,log_fuzz_score,log_fuzz_score_numbers
0,jam'yah ta'awun al-islamia,jam'yah ta'awun al-islamia,Fuzzy Match,Match,0,0,0,1.000000,1.000000,1,...,100,100,100,100,100,100,1.0,0.693147,5.991465,5.991465
1,"globovision tele ca, corp.","globovision tele ca, corp.",Fuzzy Match,Match,0,0,0,1.000000,1.000000,1,...,100,100,100,100,100,100,1.0,0.693147,5.991465,5.991465
2,korean people's army,korean people's army,Fuzzy Match,Match,0,0,0,1.000000,1.000000,1,...,100,100,100,100,100,100,1.0,0.693147,5.991465,5.991465
3,rancho la herradura,rancho la herradura,Fuzzy Match,Match,0,0,0,1.000000,1.000000,1,...,100,100,100,100,100,100,1.0,0.693147,5.991465,5.991465
4,"techno energy, s.a.","techno energy, s.a.",Fuzzy Match,Match,0,0,0,1.000000,1.000000,1,...,100,100,100,100,100,100,1.0,0.693147,5.991465,5.991465
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18503,global age limited,"shaaban, bouthaina",Non-Match,Not Match,16,16,18,0.449735,0.449735,0,...,29,23,23,28,28,28,1.0,0.693147,4.634729,4.634729
18504,obshchestvo s ogranichennoi otvetstvennostyu m...,tamin kalaye sabz aras company,Non-Match,Not Match,44,44,51,0.510733,0.510733,0,...,30,32,32,32,27,27,1.0,0.693147,4.795791,4.795791
18505,"ahmed, qassim abdullah ali",korea haegumgang trading corporation,Non-Match,Not Match,28,28,35,0.553419,0.553419,0,...,35,30,30,32,32,32,1.0,0.693147,4.844187,4.844187
18506,jingho technology co. limited,comite' de bienfaisance et de secours aux pale...,Non-Match,Not Match,43,43,54,0.546201,0.546201,0,...,34,35,36,37,34,34,1.0,0.693147,4.912655,4.912655


# Standardization

In [7]:
scaled_df = df.copy()
scaler = StandardScaler()
scaled_df.iloc[:, 4:] = scaler.fit_transform( df.iloc[:, 4:] )
scaled_df

Unnamed: 0,Original Name,Test Case Name,Test Type,Label,levenshtein_distance,damerau_levenshtein_distance,hamming_distance,jaro_similarity,jaro_winkler_similarity,match_rating_comparison,...,partial_ratio,token_sort_ratio,token_set_ratio,w_ratio,uq_ratio,q_ratio,matching_numbers,matching_numbers_log,log_fuzz_score,log_fuzz_score_numbers
0,jam'yah ta'awun al-islamia,jam'yah ta'awun al-islamia,Fuzzy Match,Match,-1.012352,-1.011137,-1.091588,1.214226,1.181471,1.012286,...,1.141642,1.202646,1.083537,1.109590,1.210180,1.210180,0.290826,0.290628,0.976623,0.688270
1,"globovision tele ca, corp.","globovision tele ca, corp.",Fuzzy Match,Match,-1.012352,-1.011137,-1.091588,1.214226,1.181471,1.012286,...,1.141642,1.202646,1.083537,1.109590,1.210180,1.210180,0.290826,0.290628,0.976623,0.688270
2,korean people's army,korean people's army,Fuzzy Match,Match,-1.012352,-1.011137,-1.091588,1.214226,1.181471,1.012286,...,1.141642,1.202646,1.083537,1.109590,1.210180,1.210180,0.290826,0.290628,0.976623,0.688270
3,rancho la herradura,rancho la herradura,Fuzzy Match,Match,-1.012352,-1.011137,-1.091588,1.214226,1.181471,1.012286,...,1.141642,1.202646,1.083537,1.109590,1.210180,1.210180,0.290826,0.290628,0.976623,0.688270
4,"techno energy, s.a.","techno energy, s.a.",Fuzzy Match,Match,-1.012352,-1.011137,-1.091588,1.214226,1.181471,1.012286,...,1.141642,1.202646,1.083537,1.109590,1.210180,1.210180,0.290826,0.290628,0.976623,0.688270
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18503,global age limited,"shaaban, bouthaina",Non-Match,Not Match,0.087678,0.089095,-0.046822,-1.130002,-1.132696,-0.987863,...,-1.139610,-1.139366,-1.162921,-1.197981,-0.986128,-0.986128,0.290826,0.290628,-1.042007,-0.195218
18504,obshchestvo s ogranichennoi otvetstvennostyu m...,tamin kalaye sabz aras company,Non-Match,Not Match,2.012732,2.014502,1.868584,-0.870142,-0.876169,-0.987863,...,-1.107480,-0.865624,-0.900348,-1.069783,-1.016632,-1.016632,0.290826,0.290628,-0.802370,-0.090337
18505,"ahmed, qassim abdullah ali",korea haegumgang trading corporation,Non-Match,Not Match,0.912701,0.914269,0.939902,-0.688292,-0.696650,-0.987863,...,-0.946829,-0.926456,-0.958697,-1.069783,-0.864111,-0.864111,0.290826,0.290628,-0.730363,-0.058821
18506,jingho technology co. limited,comite' de bienfaisance et de secours aux pale...,Non-Match,Not Match,1.943980,1.945737,2.042712,-0.719040,-0.727005,-0.987863,...,-0.978959,-0.774377,-0.783649,-0.909535,-0.803102,-0.803102,0.290826,0.290628,-0.628492,-0.014236


In [8]:
scaled_df['Label'] = scaled_df['Label'].map({'Match' : 1, 'Not Match' : 0})
scaled_df

Unnamed: 0,Original Name,Test Case Name,Test Type,Label,levenshtein_distance,damerau_levenshtein_distance,hamming_distance,jaro_similarity,jaro_winkler_similarity,match_rating_comparison,...,partial_ratio,token_sort_ratio,token_set_ratio,w_ratio,uq_ratio,q_ratio,matching_numbers,matching_numbers_log,log_fuzz_score,log_fuzz_score_numbers
0,jam'yah ta'awun al-islamia,jam'yah ta'awun al-islamia,Fuzzy Match,1,-1.012352,-1.011137,-1.091588,1.214226,1.181471,1.012286,...,1.141642,1.202646,1.083537,1.109590,1.210180,1.210180,0.290826,0.290628,0.976623,0.688270
1,"globovision tele ca, corp.","globovision tele ca, corp.",Fuzzy Match,1,-1.012352,-1.011137,-1.091588,1.214226,1.181471,1.012286,...,1.141642,1.202646,1.083537,1.109590,1.210180,1.210180,0.290826,0.290628,0.976623,0.688270
2,korean people's army,korean people's army,Fuzzy Match,1,-1.012352,-1.011137,-1.091588,1.214226,1.181471,1.012286,...,1.141642,1.202646,1.083537,1.109590,1.210180,1.210180,0.290826,0.290628,0.976623,0.688270
3,rancho la herradura,rancho la herradura,Fuzzy Match,1,-1.012352,-1.011137,-1.091588,1.214226,1.181471,1.012286,...,1.141642,1.202646,1.083537,1.109590,1.210180,1.210180,0.290826,0.290628,0.976623,0.688270
4,"techno energy, s.a.","techno energy, s.a.",Fuzzy Match,1,-1.012352,-1.011137,-1.091588,1.214226,1.181471,1.012286,...,1.141642,1.202646,1.083537,1.109590,1.210180,1.210180,0.290826,0.290628,0.976623,0.688270
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18503,global age limited,"shaaban, bouthaina",Non-Match,0,0.087678,0.089095,-0.046822,-1.130002,-1.132696,-0.987863,...,-1.139610,-1.139366,-1.162921,-1.197981,-0.986128,-0.986128,0.290826,0.290628,-1.042007,-0.195218
18504,obshchestvo s ogranichennoi otvetstvennostyu m...,tamin kalaye sabz aras company,Non-Match,0,2.012732,2.014502,1.868584,-0.870142,-0.876169,-0.987863,...,-1.107480,-0.865624,-0.900348,-1.069783,-1.016632,-1.016632,0.290826,0.290628,-0.802370,-0.090337
18505,"ahmed, qassim abdullah ali",korea haegumgang trading corporation,Non-Match,0,0.912701,0.914269,0.939902,-0.688292,-0.696650,-0.987863,...,-0.946829,-0.926456,-0.958697,-1.069783,-0.864111,-0.864111,0.290826,0.290628,-0.730363,-0.058821
18506,jingho technology co. limited,comite' de bienfaisance et de secours aux pale...,Non-Match,0,1.943980,1.945737,2.042712,-0.719040,-0.727005,-0.987863,...,-0.978959,-0.774377,-0.783649,-0.909535,-0.803102,-0.803102,0.290826,0.290628,-0.628492,-0.014236


# Create test and train data

In [9]:
X = scaled_df[['levenshtein_distance', 'damerau_levenshtein_distance', 'hamming_distance',
       'jaro_similarity','jaro_winkler_similarity','matching_numbers_log',
       'matching_numbers','token_set_ratio','token_sort_ratio','partial_ratio',
       'ratio','log_fuzz_score','log_fuzz_score_numbers','match_rating_comparison',
       'q_ratio','uq_ratio','w_ratio']].values
y = scaled_df['Label'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)

# Select the model

In [10]:
def get_confusion_matrix_values(y_test, y_pred):
    cm = confusion_matrix(y_test, y_pred)
    return(cm[0][0], cm[0][1], cm[1][0], cm[1][1])

classifiers = {
    "DummyClassifier_stratified":DummyClassifier(strategy='stratified', random_state=0),    
    "KNeighborsClassifier":KNeighborsClassifier(3),
    "XGBClassifier":XGBClassifier(n_estimators=1000, learning_rate=0.1, eval_metric = ['logloss', 'auc', 'error']),
    "DecisionTreeClassifier":DecisionTreeClassifier(),
    "RandomForestClassifier":RandomForestClassifier(),
    "AdaBoostClassifier":AdaBoostClassifier(),
    "GradientBoostingClassifier":GradientBoostingClassifier(),
    "Perceptron": Perceptron(max_iter=40, eta0=0.1, random_state=1),
    "MLP": MLPClassifier(),
    "XGBClassifer tuned": XGBClassifier(colsample_bytree=0.8,
                      gamma=0.9,
                      max_depth=20,
                      min_child_weight=1,
                      scale_pos_weight=12,
                      subsample=0.9,
                      n_estimators=50, 
                      learning_rate=0.1,
                      eval_metric = ['logloss', 'auc', 'error'])
}

df_results = pd.DataFrame(columns=['model', 'accuracy', 'mae', 'precision',
                                   'recall','f1','roc','run_time','tp','fp',
                                   'tn','fn'])

for key in classifiers:

    start_time = time.time()
    classifier = classifiers[key]
    model = classifier.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    roc = roc_auc_score(y_test, y_pred)
    classification = classification_report(y_test, y_pred, zero_division=0)
    run_time = format(round((time.time() - start_time)/60,2))
    tp, fp, fn, tn = get_confusion_matrix_values(y_test, y_pred)

    row = {'model': key,
           'accuracy': accuracy,
           'mae': mae,
           'precision': precision,
           'recall': recall,
           'f1': f1,
           'roc': roc,
           'run_time': run_time,
           'tp': tp,
           'fp': fp,
           'tn': tn,
           'fn': fn,
          }
    df_results = df_results.append(row, ignore_index=True)

df_results

Unnamed: 0,model,accuracy,mae,precision,recall,f1,roc,run_time,tp,fp,tn,fn
0,DummyClassifier_stratified,0.491986,0.508014,0.491971,0.485416,0.488671,0.491987,0.0,1384,1392,1348,1429
1,KNeighborsClassifier,0.993697,0.006303,0.99602,0.991358,0.993683,0.993698,0.02,2765,11,2753,24
2,XGBClassifier,0.996398,0.003602,0.996757,0.996039,0.996398,0.996398,0.24,2767,9,2766,11
3,DecisionTreeClassifier,0.992977,0.007023,0.990681,0.995319,0.992994,0.992976,0.0,2750,26,2764,13
4,RandomForestClassifier,0.996578,0.003422,0.996758,0.996399,0.996578,0.996578,0.02,2767,9,2767,10
5,AdaBoostClassifier,0.995318,0.004682,0.998189,0.992438,0.995305,0.995318,0.01,2771,5,2756,21
6,GradientBoostingClassifier,0.995678,0.004322,0.996752,0.994598,0.995674,0.995678,0.04,2767,9,2762,15
7,Perceptron,0.984873,0.015127,0.97867,0.991358,0.984973,0.984872,0.0,2716,60,2753,24
8,MLP,0.994778,0.005222,0.996746,0.992798,0.994768,0.994778,0.18,2767,9,2757,20
9,XGBClassifer tuned,0.994778,0.005222,0.993534,0.996039,0.994785,0.994777,0.02,2758,18,2766,11


# Select and tune the best model

In [11]:
def get_scale_pos_weight(target, square_root=False, gridsearch=False):
    """Return the scale_pos_weight parameter for the XGBoost model when data are imbalanced.
    The scale_pos_weight parameter is calculated from the ratio of the negative class over
    the positive class. The exact scale_pos_weight sometimes does not give the best result,
    so by passing the gridsearch=True parameter you can return a list of values to test with
    GridSearchCV. In addition, passing square_root=True changes the scale_pos_weight to the
    square root value, which can sometimes be beneficial on extremely imbalanced data.

    :param target: Pandas dataframe column containing the binary target
    :param square_root: Optional boolean parameter to convert to square root on extremely unbalanced data
    :param gridsearch: Optional boolean parameter to return a bracketed list for use in GridSearchCV

    Usage:
        scale_pos_weight = get_scale_pos_weight(df['target'], square_root=False, gridsearch=True)

    """

    import math

    scale_pos_weight = round((len(target) - sum(target)) / sum(target))

    if square_root:
        scale_pos_weight = round(math.sqrt(scale_pos_weight))

    if gridsearch:
        scale_pos_weight = [scale_pos_weight-2, scale_pos_weight-1, scale_pos_weight, 
                            scale_pos_weight+1, scale_pos_weight+2]

    return scale_pos_weight

In [12]:
scale_pos_weight = get_scale_pos_weight(scaled_df['Label'], square_root=False, gridsearch=True)
scale_pos_weight

[-1, 0, 1, 2, 3]

In [13]:
n_estimators = [50]
learning_rate = [0.1]
max_depth = [5, 10, 20]
min_child_weight = [1, 2]
scale_pos_weight = [8, 9, 10, 11, 12]
gamma = [0.9, 1.0]
subsample = [0.9]
colsample_bytree = [0.8, 1.0]

start = time.perf_counter()

param_grid = dict(
                n_estimators=n_estimators,
                learning_rate=learning_rate,
                max_depth=max_depth,
                min_child_weight=min_child_weight,
                scale_pos_weight=scale_pos_weight,
                gamma=gamma,
                subsample=subsample,
                colsample_bytree=colsample_bytree,
)

model = XGBClassifier(random_state=0, eval_metric = ['logloss', 'auc', 'error'])

grid_search = GridSearchCV(estimator=model,
                           param_grid=param_grid,
                           scoring='roc_auc',
                           )

print('Running GridSearchCV...')
best_model = grid_search.fit(X_train, y_train)
best_score = round(best_model.score(X_test, y_test), 4)
best_params = best_model.best_params_

print('Score:', best_score)
print('Optimum parameters', best_params)

finish = time.perf_counter()
run_time = (finish - start / 60)
print(f"Completed task in {run_time:0.4f} seconds")

Running GridSearchCV...
Score: 0.9995
Optimum parameters {'colsample_bytree': 1.0, 'gamma': 1.0, 'learning_rate': 0.1, 'max_depth': 10, 'min_child_weight': 2, 'n_estimators': 50, 'scale_pos_weight': 8, 'subsample': 0.9}
Completed task in 396.6161 minutes


# Fit selected model

In [15]:
model = XGBClassifier(colsample_bytree=1.0,
                      gamma=1.0,
                      max_depth=10,
                      min_child_weight=2,
                      scale_pos_weight=8,
                      subsample=0.9,
                      n_estimators=50, 
                      learning_rate=0.1,
                      eval_metric = ['logloss', 'auc', 'error'])
model = classifier.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Assess model performance

In [16]:
print(classification_report(y_test, y_pred, labels=[1, 0], 
                            target_names=['match', 'not match']))

              precision    recall  f1-score   support

       match       0.99      1.00      0.99      2777
   not match       1.00      0.99      0.99      2776

    accuracy                           0.99      5553
   macro avg       0.99      0.99      0.99      5553
weighted avg       0.99      0.99      0.99      5553



In [19]:
tp, fp, fn, tn = get_confusion_matrix_values(y_test, y_pred)
print('tp: {}\nfp: {}\nfn: {}\ntn: {}'.format(tp, fp, fn, tn))

tp: 2758
fp: 18
fn: 11
tn: 2766
