In [1]:
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [2]:
import time
import re
import pandas as pd
import numpy as np
import jellyfish as jf
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

In [3]:
df = pd.read_csv('ml_dataset_score.csv')
df

Unnamed: 0,Original Name,Test Case Name,Test Type,Label,Bridger Score
0,JAM'YAH TA'AWUN AL-ISLAMIA,JAM'YAH TA'AWUN AL-ISLAMIA,Fuzzy Match,Match,100
1,"GLOBOVISION TELE CA, CORP.","GLOBOVISION TELE CA, CORP.",Fuzzy Match,Match,100
2,KOREAN PEOPLE'S ARMY,KOREAN PEOPLE'S ARMY,Fuzzy Match,Match,100
3,RANCHO LA HERRADURA,RANCHO LA HERRADURA,Fuzzy Match,Match,100
4,"TECHNO ENERGY, S.A.","TECHNO ENERGY, S.A.",Fuzzy Match,Match,100
...,...,...,...,...,...
18503,GLOBAL AGE LIMITED,"SHAABAN, Bouthaina",Non-Match,Not Match,0
18504,OBSHCHESTVO S OGRANICHENNOI OTVETSTVENNOSTYU M...,TAMIN KALAYE SABZ ARAS COMPANY,Non-Match,Not Match,0
18505,"AHMED, Qassim Abdullah Ali",KOREA HAEGUMGANG TRADING CORPORATION,Non-Match,Not Match,0
18506,JINGHO TECHNOLOGY CO. LIMITED,COMITE' DE BIENFAISANCE ET DE SECOURS AUX PALE...,Non-Match,Not Match,0


# Feature engineering

In [4]:
def matching_numbers(original_name, test_case_name):

    original_numbers = set(re.findall(r'[0-9]+', original_name))
    test_case_numbers = set(re.findall(r'[0-9]+', test_case_name))    
    union = original_numbers.union(test_case_numbers)
    intersection = original_numbers.intersection(test_case_numbers)

    if len(original_numbers)==0 and len(test_case_numbers) == 0:
        return 1
    else:
        return (len(intersection)/ len(union))

In [5]:
def engineer_features(df):

    df['Test Case Name'] = df['Test Case Name'].str.lower()
    df['Original Name'] = df['Original Name'].str.lower()

    df['levenshtein_distance'] = df.apply(
    lambda x: jf.levenshtein_distance(x['Original Name'], 
                                      x['Test Case Name']), axis=1)

    df['damerau_levenshtein_distance'] = df.apply(
    lambda x: jf.damerau_levenshtein_distance(x['Original Name'], 
                                              x['Test Case Name']), axis=1)

    df['hamming_distance'] = df.apply(
    lambda x: jf.hamming_distance(x['Original Name'], 
                                  x['Test Case Name']), axis=1)

    df['jaro_similarity'] = df.apply(
    lambda x: jf.jaro_similarity(x['Original Name'], 
                                  x['Test Case Name']), axis=1)

    df['jaro_winkler_similarity'] = df.apply(
    lambda x: jf.jaro_winkler_similarity(x['Original Name'], 
                                         x['Test Case Name']), axis=1)

    df['match_rating_comparison'] = df.apply(
    lambda x: jf.match_rating_comparison(x['Original Name'], 
                                         x['Test Case Name']), axis=1).fillna(0).astype(int)

    df['ratio'] = df.apply(
    lambda x: fuzz.ratio(x['Original Name'], 
                         x['Test Case Name']), axis=1)

    df['partial_ratio'] = df.apply(
    lambda x: fuzz.partial_ratio(x['Original Name'], 
                                 x['Test Case Name']), axis=1)

    df['token_sort_ratio'] = df.apply(
    lambda x: fuzz.token_sort_ratio(x['Original Name'], 
                                    x['Test Case Name']), axis=1)

    df['token_set_ratio'] = df.apply(
    lambda x: fuzz.token_set_ratio(x['Original Name'], 
                                   x['Test Case Name']), axis=1)

    df['w_ratio'] = df.apply(
    lambda x: fuzz.WRatio(x['Original Name'], 
                          x['Test Case Name']), axis=1)

    df['uq_ratio'] = df.apply(
    lambda x: fuzz.UQRatio(x['Original Name'], 
                          x['Test Case Name']), axis=1)

    df['q_ratio'] = df.apply(
    lambda x: fuzz.QRatio(x['Original Name'], 
                          x['Test Case Name']), axis=1)    

    df['matching_numbers'] = df.apply(
    lambda x: matching_numbers(x['Original Name'], 
                               x['Test Case Name']), axis=1)

    df['matching_numbers_log'] = (df['matching_numbers']+1).apply(np.log)

    df['log_fuzz_score'] = (df['ratio'] + df['partial_ratio'] + 
                            df['token_sort_ratio'] + df['token_set_ratio']).apply(np.log)

    df['log_fuzz_score_numbers'] = df['log_fuzz_score'] + (df['matching_numbers']).apply(np.log)

    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.fillna(value=0, inplace=True)

    return df

In [6]:
df = engineer_features(df)
df

Unnamed: 0,Original Name,Test Case Name,Test Type,Label,Bridger Score,levenshtein_distance,damerau_levenshtein_distance,hamming_distance,jaro_similarity,jaro_winkler_similarity,...,partial_ratio,token_sort_ratio,token_set_ratio,w_ratio,uq_ratio,q_ratio,matching_numbers,matching_numbers_log,log_fuzz_score,log_fuzz_score_numbers
0,jam'yah ta'awun al-islamia,jam'yah ta'awun al-islamia,Fuzzy Match,Match,100,0,0,0,1.000000,1.000000,...,100,100,100,100,100,100,1.0,0.693147,5.991465,5.991465
1,"globovision tele ca, corp.","globovision tele ca, corp.",Fuzzy Match,Match,100,0,0,0,1.000000,1.000000,...,100,100,100,100,100,100,1.0,0.693147,5.991465,5.991465
2,korean people's army,korean people's army,Fuzzy Match,Match,100,0,0,0,1.000000,1.000000,...,100,100,100,100,100,100,1.0,0.693147,5.991465,5.991465
3,rancho la herradura,rancho la herradura,Fuzzy Match,Match,100,0,0,0,1.000000,1.000000,...,100,100,100,100,100,100,1.0,0.693147,5.991465,5.991465
4,"techno energy, s.a.","techno energy, s.a.",Fuzzy Match,Match,100,0,0,0,1.000000,1.000000,...,100,100,100,100,100,100,1.0,0.693147,5.991465,5.991465
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18503,global age limited,"shaaban, bouthaina",Non-Match,Not Match,0,16,16,18,0.449735,0.449735,...,29,23,23,28,28,28,1.0,0.693147,4.634729,4.634729
18504,obshchestvo s ogranichennoi otvetstvennostyu m...,tamin kalaye sabz aras company,Non-Match,Not Match,0,44,44,51,0.510733,0.510733,...,30,32,32,32,27,27,1.0,0.693147,4.795791,4.795791
18505,"ahmed, qassim abdullah ali",korea haegumgang trading corporation,Non-Match,Not Match,0,28,28,35,0.553419,0.553419,...,35,30,30,32,32,32,1.0,0.693147,4.844187,4.844187
18506,jingho technology co. limited,comite' de bienfaisance et de secours aux pale...,Non-Match,Not Match,0,43,43,54,0.546201,0.546201,...,34,35,36,37,34,34,1.0,0.693147,4.912655,4.912655


In [7]:
df = df[df['levenshtein_distance'] <= 50]
df

Unnamed: 0,Original Name,Test Case Name,Test Type,Label,Bridger Score,levenshtein_distance,damerau_levenshtein_distance,hamming_distance,jaro_similarity,jaro_winkler_similarity,...,partial_ratio,token_sort_ratio,token_set_ratio,w_ratio,uq_ratio,q_ratio,matching_numbers,matching_numbers_log,log_fuzz_score,log_fuzz_score_numbers
0,jam'yah ta'awun al-islamia,jam'yah ta'awun al-islamia,Fuzzy Match,Match,100,0,0,0,1.000000,1.000000,...,100,100,100,100,100,100,1.0,0.693147,5.991465,5.991465
1,"globovision tele ca, corp.","globovision tele ca, corp.",Fuzzy Match,Match,100,0,0,0,1.000000,1.000000,...,100,100,100,100,100,100,1.0,0.693147,5.991465,5.991465
2,korean people's army,korean people's army,Fuzzy Match,Match,100,0,0,0,1.000000,1.000000,...,100,100,100,100,100,100,1.0,0.693147,5.991465,5.991465
3,rancho la herradura,rancho la herradura,Fuzzy Match,Match,100,0,0,0,1.000000,1.000000,...,100,100,100,100,100,100,1.0,0.693147,5.991465,5.991465
4,"techno energy, s.a.","techno energy, s.a.",Fuzzy Match,Match,100,0,0,0,1.000000,1.000000,...,100,100,100,100,100,100,1.0,0.693147,5.991465,5.991465
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18503,global age limited,"shaaban, bouthaina",Non-Match,Not Match,0,16,16,18,0.449735,0.449735,...,29,23,23,28,28,28,1.0,0.693147,4.634729,4.634729
18504,obshchestvo s ogranichennoi otvetstvennostyu m...,tamin kalaye sabz aras company,Non-Match,Not Match,0,44,44,51,0.510733,0.510733,...,30,32,32,32,27,27,1.0,0.693147,4.795791,4.795791
18505,"ahmed, qassim abdullah ali",korea haegumgang trading corporation,Non-Match,Not Match,0,28,28,35,0.553419,0.553419,...,35,30,30,32,32,32,1.0,0.693147,4.844187,4.844187
18506,jingho technology co. limited,comite' de bienfaisance et de secours aux pale...,Non-Match,Not Match,0,43,43,54,0.546201,0.546201,...,34,35,36,37,34,34,1.0,0.693147,4.912655,4.912655


In [8]:
df =  df.drop(index = df[df['Bridger Score'] == 'ERROR WITH TEST CASE'].index.tolist())
df

Unnamed: 0,Original Name,Test Case Name,Test Type,Label,Bridger Score,levenshtein_distance,damerau_levenshtein_distance,hamming_distance,jaro_similarity,jaro_winkler_similarity,...,partial_ratio,token_sort_ratio,token_set_ratio,w_ratio,uq_ratio,q_ratio,matching_numbers,matching_numbers_log,log_fuzz_score,log_fuzz_score_numbers
0,jam'yah ta'awun al-islamia,jam'yah ta'awun al-islamia,Fuzzy Match,Match,100,0,0,0,1.000000,1.000000,...,100,100,100,100,100,100,1.0,0.693147,5.991465,5.991465
1,"globovision tele ca, corp.","globovision tele ca, corp.",Fuzzy Match,Match,100,0,0,0,1.000000,1.000000,...,100,100,100,100,100,100,1.0,0.693147,5.991465,5.991465
2,korean people's army,korean people's army,Fuzzy Match,Match,100,0,0,0,1.000000,1.000000,...,100,100,100,100,100,100,1.0,0.693147,5.991465,5.991465
3,rancho la herradura,rancho la herradura,Fuzzy Match,Match,100,0,0,0,1.000000,1.000000,...,100,100,100,100,100,100,1.0,0.693147,5.991465,5.991465
4,"techno energy, s.a.","techno energy, s.a.",Fuzzy Match,Match,100,0,0,0,1.000000,1.000000,...,100,100,100,100,100,100,1.0,0.693147,5.991465,5.991465
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18503,global age limited,"shaaban, bouthaina",Non-Match,Not Match,0,16,16,18,0.449735,0.449735,...,29,23,23,28,28,28,1.0,0.693147,4.634729,4.634729
18504,obshchestvo s ogranichennoi otvetstvennostyu m...,tamin kalaye sabz aras company,Non-Match,Not Match,0,44,44,51,0.510733,0.510733,...,30,32,32,32,27,27,1.0,0.693147,4.795791,4.795791
18505,"ahmed, qassim abdullah ali",korea haegumgang trading corporation,Non-Match,Not Match,0,28,28,35,0.553419,0.553419,...,35,30,30,32,32,32,1.0,0.693147,4.844187,4.844187
18506,jingho technology co. limited,comite' de bienfaisance et de secours aux pale...,Non-Match,Not Match,0,43,43,54,0.546201,0.546201,...,34,35,36,37,34,34,1.0,0.693147,4.912655,4.912655


In [9]:
for i in list(df[df['Bridger Score'] == 'NO MATCH'].index):
    df.loc[i, 'Bridger Score'] = np.random.randint(50, 70)
for i in list(df[df['Bridger Score'] == '0'].index):
    df.loc[i, 'Bridger Score'] = np.random.randint(10, 50)
df

Unnamed: 0,Original Name,Test Case Name,Test Type,Label,Bridger Score,levenshtein_distance,damerau_levenshtein_distance,hamming_distance,jaro_similarity,jaro_winkler_similarity,...,partial_ratio,token_sort_ratio,token_set_ratio,w_ratio,uq_ratio,q_ratio,matching_numbers,matching_numbers_log,log_fuzz_score,log_fuzz_score_numbers
0,jam'yah ta'awun al-islamia,jam'yah ta'awun al-islamia,Fuzzy Match,Match,100,0,0,0,1.000000,1.000000,...,100,100,100,100,100,100,1.0,0.693147,5.991465,5.991465
1,"globovision tele ca, corp.","globovision tele ca, corp.",Fuzzy Match,Match,100,0,0,0,1.000000,1.000000,...,100,100,100,100,100,100,1.0,0.693147,5.991465,5.991465
2,korean people's army,korean people's army,Fuzzy Match,Match,100,0,0,0,1.000000,1.000000,...,100,100,100,100,100,100,1.0,0.693147,5.991465,5.991465
3,rancho la herradura,rancho la herradura,Fuzzy Match,Match,100,0,0,0,1.000000,1.000000,...,100,100,100,100,100,100,1.0,0.693147,5.991465,5.991465
4,"techno energy, s.a.","techno energy, s.a.",Fuzzy Match,Match,100,0,0,0,1.000000,1.000000,...,100,100,100,100,100,100,1.0,0.693147,5.991465,5.991465
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18503,global age limited,"shaaban, bouthaina",Non-Match,Not Match,35,16,16,18,0.449735,0.449735,...,29,23,23,28,28,28,1.0,0.693147,4.634729,4.634729
18504,obshchestvo s ogranichennoi otvetstvennostyu m...,tamin kalaye sabz aras company,Non-Match,Not Match,20,44,44,51,0.510733,0.510733,...,30,32,32,32,27,27,1.0,0.693147,4.795791,4.795791
18505,"ahmed, qassim abdullah ali",korea haegumgang trading corporation,Non-Match,Not Match,20,28,28,35,0.553419,0.553419,...,35,30,30,32,32,32,1.0,0.693147,4.844187,4.844187
18506,jingho technology co. limited,comite' de bienfaisance et de secours aux pale...,Non-Match,Not Match,31,43,43,54,0.546201,0.546201,...,34,35,36,37,34,34,1.0,0.693147,4.912655,4.912655


# Standardization

In [10]:
# scaled_df = df.copy()
# scaler = StandardScaler()
# scaled_df.iloc[:, 4:] = scaler.fit_transform( df.iloc[:, 4:] )
# scaled_df

In [11]:
df['Label'] = df['Label'].map({'Match' : 1, 'Not Match' : 0})
df

Unnamed: 0,Original Name,Test Case Name,Test Type,Label,Bridger Score,levenshtein_distance,damerau_levenshtein_distance,hamming_distance,jaro_similarity,jaro_winkler_similarity,...,partial_ratio,token_sort_ratio,token_set_ratio,w_ratio,uq_ratio,q_ratio,matching_numbers,matching_numbers_log,log_fuzz_score,log_fuzz_score_numbers
0,jam'yah ta'awun al-islamia,jam'yah ta'awun al-islamia,Fuzzy Match,1,100,0,0,0,1.000000,1.000000,...,100,100,100,100,100,100,1.0,0.693147,5.991465,5.991465
1,"globovision tele ca, corp.","globovision tele ca, corp.",Fuzzy Match,1,100,0,0,0,1.000000,1.000000,...,100,100,100,100,100,100,1.0,0.693147,5.991465,5.991465
2,korean people's army,korean people's army,Fuzzy Match,1,100,0,0,0,1.000000,1.000000,...,100,100,100,100,100,100,1.0,0.693147,5.991465,5.991465
3,rancho la herradura,rancho la herradura,Fuzzy Match,1,100,0,0,0,1.000000,1.000000,...,100,100,100,100,100,100,1.0,0.693147,5.991465,5.991465
4,"techno energy, s.a.","techno energy, s.a.",Fuzzy Match,1,100,0,0,0,1.000000,1.000000,...,100,100,100,100,100,100,1.0,0.693147,5.991465,5.991465
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18503,global age limited,"shaaban, bouthaina",Non-Match,0,35,16,16,18,0.449735,0.449735,...,29,23,23,28,28,28,1.0,0.693147,4.634729,4.634729
18504,obshchestvo s ogranichennoi otvetstvennostyu m...,tamin kalaye sabz aras company,Non-Match,0,20,44,44,51,0.510733,0.510733,...,30,32,32,32,27,27,1.0,0.693147,4.795791,4.795791
18505,"ahmed, qassim abdullah ali",korea haegumgang trading corporation,Non-Match,0,20,28,28,35,0.553419,0.553419,...,35,30,30,32,32,32,1.0,0.693147,4.844187,4.844187
18506,jingho technology co. limited,comite' de bienfaisance et de secours aux pale...,Non-Match,0,31,43,43,54,0.546201,0.546201,...,34,35,36,37,34,34,1.0,0.693147,4.912655,4.912655


# Create test and train data

In [12]:
X = df[['Bridger Score', 'levenshtein_distance', 'damerau_levenshtein_distance', 'hamming_distance',
       'jaro_similarity','jaro_winkler_similarity','matching_numbers_log',
       'matching_numbers','token_set_ratio','token_sort_ratio','partial_ratio',
       'ratio','log_fuzz_score','log_fuzz_score_numbers','match_rating_comparison',
       'q_ratio','uq_ratio','w_ratio']].values
y = df['Label'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)

# Select the model

In [13]:
classifiers = {
    "DummyClassifier_stratified":DummyClassifier(strategy='stratified', random_state=0),    
    "KNeighborsClassifier":KNeighborsClassifier(3),
    "XGBClassifier":XGBClassifier(n_estimators=1000, learning_rate=0.1, eval_metric = ['logloss', 'auc', 'error']),
    "DecisionTreeClassifier":DecisionTreeClassifier(),
    "RandomForestClassifier":RandomForestClassifier(),
    "AdaBoostClassifier":AdaBoostClassifier(),
    "GradientBoostingClassifier":GradientBoostingClassifier(),
    "Perceptron": Perceptron(max_iter=40, eta0=0.1, random_state=1),
    "MLP": MLPClassifier(),
    "XGBClassifer tuned": XGBClassifier(colsample_bytree=0.8,
                      gamma=0.9,
                      max_depth=20,
                      min_child_weight=1,
                      scale_pos_weight=12,
                      subsample=0.9,
                      n_estimators=50, 
                      learning_rate=0.1,
                      eval_metric = ['logloss', 'auc', 'error'])
}

df_results = pd.DataFrame(columns=['model', 'accuracy', 'mae', 'precision',
                                   'recall','f1','roc','run_time','tp','fp',
                                   'tn','fn'])

for key in classifiers:

    start_time = time.time()
    classifier = classifiers[key]
    model = classifier.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    roc = roc_auc_score(y_test, y_pred)
    classification = classification_report(y_test, y_pred, zero_division=0)
    run_time = format(round((time.time() - start_time)/60,2))
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

    row = {'model': key,
           'accuracy': accuracy,
           'mae': mae,
           'precision': precision,
           'recall': recall,
           'f1': f1,
           'roc': roc,
           'run_time': run_time,
           'tp': tp,
           'fp': fp,
           'tn': tn,
           'fn': fn,
          }
    df_results = df_results.append(row, ignore_index=True)

df_results

Unnamed: 0,model,accuracy,mae,precision,recall,f1,roc,run_time,tp,fp,tn,fn
0,DummyClassifier_stratified,0.503671,0.496329,0.512557,0.522162,0.517315,0.503318,0.0,1449,1378,1295,1326
1,KNeighborsClassifier,0.998164,0.001836,0.998558,0.997838,0.998198,0.998171,0.01,2769,4,2669,6
2,XGBClassifier,1.0,0.0,1.0,1.0,1.0,1.0,0.02,2775,0,2673,0
3,DecisionTreeClassifier,1.0,0.0,1.0,1.0,1.0,1.0,0.0,2775,0,2673,0
4,RandomForestClassifier,1.0,0.0,1.0,1.0,1.0,1.0,0.01,2775,0,2673,0
5,AdaBoostClassifier,1.0,0.0,1.0,1.0,1.0,1.0,0.0,2775,0,2673,0
6,GradientBoostingClassifier,1.0,0.0,1.0,1.0,1.0,1.0,0.01,2775,0,2673,0
7,Perceptron,0.988987,0.011013,0.981896,0.996757,0.98927,0.988839,0.0,2766,51,2622,9
8,MLP,0.998348,0.001652,0.998918,0.997838,0.998378,0.998358,0.09,2769,3,2670,6
9,XGBClassifer tuned,1.0,0.0,1.0,1.0,1.0,1.0,0.0,2775,0,2673,0


# Select and tune the best model

In [14]:
def get_scale_pos_weight(target, square_root=False, gridsearch=False):
    """Return the scale_pos_weight parameter for the XGBoost model when data are imbalanced.
    The scale_pos_weight parameter is calculated from the ratio of the negative class over
    the positive class. The exact scale_pos_weight sometimes does not give the best result,
    so by passing the gridsearch=True parameter you can return a list of values to test with
    GridSearchCV. In addition, passing square_root=True changes the scale_pos_weight to the
    square root value, which can sometimes be beneficial on extremely imbalanced data.

    :param target: Pandas dataframe column containing the binary target
    :param square_root: Optional boolean parameter to convert to square root on extremely unbalanced data
    :param gridsearch: Optional boolean parameter to return a bracketed list for use in GridSearchCV

    Usage:
        scale_pos_weight = get_scale_pos_weight(df['target'], square_root=False, gridsearch=True)

    """

    import math

    scale_pos_weight = round((len(target) - sum(target)) / sum(target))

    if square_root:
        scale_pos_weight = round(math.sqrt(scale_pos_weight))

    if gridsearch:
        scale_pos_weight = [scale_pos_weight-2, scale_pos_weight-1, scale_pos_weight, 
                            scale_pos_weight+1, scale_pos_weight+2]

    return scale_pos_weight

In [15]:
scale_pos_weight = get_scale_pos_weight(df['Label'], square_root=False, gridsearch=True)
scale_pos_weight

[-1, 0, 1, 2, 3]

In [16]:
n_estimators = [50]
learning_rate = [0.1]
max_depth = [5, 10, 20]
min_child_weight = [1, 2]
scale_pos_weight = [-1, 0, 1, 2, 3]
gamma = [0.9, 1.0]
subsample = [0.9]
colsample_bytree = [0.8, 1.0]

start = time.perf_counter()

param_grid = dict(
                n_estimators=n_estimators,
                learning_rate=learning_rate,
                max_depth=max_depth,
                min_child_weight=min_child_weight,
                scale_pos_weight=scale_pos_weight,
                gamma=gamma,
                subsample=subsample,
                colsample_bytree=colsample_bytree,
)

model = XGBClassifier(random_state=0, eval_metric = ['logloss', 'auc', 'error'])

grid_search = GridSearchCV(estimator=model,
                           param_grid=param_grid,
                           scoring='recall',
                           )

print('Running GridSearchCV...')
best_model = grid_search.fit(X_train, y_train)
best_score = round(best_model.score(X_test, y_test), 4)
best_params = best_model.best_params_

print('Score:', best_score)
print('Optimum parameters', best_params)

finish = time.perf_counter()
run_time = (finish - start / 60)
print(f"Completed task in {run_time:0.4f} seconds")

Running GridSearchCV...
Score: 1.0
Optimum parameters {'colsample_bytree': 0.8, 'gamma': 0.9, 'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 50, 'scale_pos_weight': 1, 'subsample': 0.9}
Completed task in 85.4408 seconds


# Fit selected model

In [17]:
model = XGBClassifier(colsample_bytree=0.8,
                      gamma=0.9,
                      max_depth=5,
                      min_child_weight=1,
                      scale_pos_weight=2,
                      subsample=0.9,
                      n_estimators=50, 
                      learning_rate=0.1,
                      eval_metric = ['logloss', 'auc', 'error'])
model = classifier.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Assess model performance

In [18]:
print(classification_report(y_test, y_pred, labels=[1, 0], 
                            target_names=['match', 'not match']))

              precision    recall  f1-score   support

       match       1.00      1.00      1.00      2775
   not match       1.00      1.00      1.00      2673

    accuracy                           1.00      5448
   macro avg       1.00      1.00      1.00      5448
weighted avg       1.00      1.00      1.00      5448



In [19]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print('tp: {}\nfp: {}\nfn: {}\ntn: {}'.format(tp, fp, fn, tn))

tp: 2775
fp: 0
fn: 0
tn: 2673


# Run test data

In [20]:
df_test = pd.read_csv('New_ML_Data.csv')
df_test

Unnamed: 0,Bridger Score,levenshtein_distance,damerau_levenshtein_distance,hamming_distance,jaro_similarity,jaro_winkler_similarity,match_rating_comparison,ratio,partial_ratio,token_sort_ratio,token_set_ratio,w_ratio,uq_ratio,q_ratio,matching_numbers,matching_numbers_log,log_fuzz_score,log_fuzz_score_numbers,Disposition
0,92,1,1,2,0.972222,0.983333,1,96,91,96,96,96,96,96,1,0.693147,5.937536,5.937536,No Match
1,92,10,10,14,0.659722,0.659722,0,43,50,74,80,76,44,44,1,0.693147,5.509388,5.509388,No Match
2,92,13,13,16,0.586971,0.586971,0,40,36,55,73,69,41,41,1,0.693147,5.318120,5.318120,No Match
3,92,11,11,16,0.603454,0.603454,0,41,38,79,79,75,50,50,1,0.693147,5.468060,5.468060,No Match
4,95,6,6,15,0.760317,0.760317,1,69,74,55,73,69,69,69,1,0.693147,5.602119,5.602119,No Match
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73855,94,37,37,44,0.605797,0.605797,0,32,70,36,100,86,32,32,1,0.693147,5.472271,5.472271,No Match
73856,100,0,0,0,1.000000,1.000000,1,100,100,100,100,100,100,100,1,0.693147,5.991465,5.991465,No Match
73857,100,0,0,0,1.000000,1.000000,1,100,100,100,100,100,100,100,1,0.693147,5.991465,5.991465,Match
73858,100,0,0,0,1.000000,1.000000,1,100,100,100,100,100,100,100,1,0.693147,5.991465,5.991465,No Match


In [21]:
X_test = df_test.iloc[:, :-1].values
y_pred = model.predict(X_test)

df_test['Disposition'] = df_test['Disposition'].map({'Match' : 1, 'No Match' : 0})
y_test = df_test['Disposition'].values

In [22]:
print(classification_report(y_test, y_pred, labels=[1, 0], 
                            target_names=['match', 'not match']))

              precision    recall  f1-score   support

       match       0.04      1.00      0.08      3047
   not match       0.00      0.00      0.00     70813

    accuracy                           0.04     73860
   macro avg       0.02      0.50      0.04     73860
weighted avg       0.00      0.04      0.00     73860



In [23]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print('tp: {}\nfp: {}\nfn: {}\ntn: {}'.format(tp, fp, fn, tn))

tp: 3047
fp: 70813
fn: 0
tn: 0
