In [1]:
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [2]:
import time
import re
import pandas as pd
import numpy as np
import jellyfish as jf
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

In [3]:
df = pd.read_csv('ml_dataset_score.csv')
df

Unnamed: 0,Original Name,Test Case Name,Test Type,Label,Bridger Score
0,JAM'YAH TA'AWUN AL-ISLAMIA,JAM'YAH TA'AWUN AL-ISLAMIA,Fuzzy Match,Match,100
1,"GLOBOVISION TELE CA, CORP.","GLOBOVISION TELE CA, CORP.",Fuzzy Match,Match,100
2,KOREAN PEOPLE'S ARMY,KOREAN PEOPLE'S ARMY,Fuzzy Match,Match,100
3,RANCHO LA HERRADURA,RANCHO LA HERRADURA,Fuzzy Match,Match,100
4,"TECHNO ENERGY, S.A.","TECHNO ENERGY, S.A.",Fuzzy Match,Match,100
...,...,...,...,...,...
18503,GLOBAL AGE LIMITED,"SHAABAN, Bouthaina",Non-Match,Not Match,0
18504,OBSHCHESTVO S OGRANICHENNOI OTVETSTVENNOSTYU M...,TAMIN KALAYE SABZ ARAS COMPANY,Non-Match,Not Match,0
18505,"AHMED, Qassim Abdullah Ali",KOREA HAEGUMGANG TRADING CORPORATION,Non-Match,Not Match,0
18506,JINGHO TECHNOLOGY CO. LIMITED,COMITE' DE BIENFAISANCE ET DE SECOURS AUX PALE...,Non-Match,Not Match,0


# Feature engineering

In [4]:
def matching_numbers(original_name, test_case_name):

    original_numbers = set(re.findall(r'[0-9]+', original_name))
    test_case_numbers = set(re.findall(r'[0-9]+', test_case_name))    
    union = original_numbers.union(test_case_numbers)
    intersection = original_numbers.intersection(test_case_numbers)

    if len(original_numbers)==0 and len(test_case_numbers) == 0:
        return 1
    else:
        return (len(intersection)/ len(union))

In [5]:
def engineer_features(df):

    df['Test Case Name'] = df['Test Case Name'].str.lower()
    df['Original Name'] = df['Original Name'].str.lower()

    df['levenshtein_distance'] = df.apply(
    lambda x: jf.levenshtein_distance(x['Original Name'], 
                                      x['Test Case Name']), axis=1)

    df['damerau_levenshtein_distance'] = df.apply(
    lambda x: jf.damerau_levenshtein_distance(x['Original Name'], 
                                              x['Test Case Name']), axis=1)

    df['hamming_distance'] = df.apply(
    lambda x: jf.hamming_distance(x['Original Name'], 
                                  x['Test Case Name']), axis=1)

    df['jaro_similarity'] = df.apply(
    lambda x: jf.jaro_similarity(x['Original Name'], 
                                  x['Test Case Name']), axis=1)

    df['jaro_winkler_similarity'] = df.apply(
    lambda x: jf.jaro_winkler_similarity(x['Original Name'], 
                                         x['Test Case Name']), axis=1)

    df['match_rating_comparison'] = df.apply(
    lambda x: jf.match_rating_comparison(x['Original Name'], 
                                         x['Test Case Name']), axis=1).fillna(0).astype(int)

    df['ratio'] = df.apply(
    lambda x: fuzz.ratio(x['Original Name'], 
                         x['Test Case Name']), axis=1)

    df['partial_ratio'] = df.apply(
    lambda x: fuzz.partial_ratio(x['Original Name'], 
                                 x['Test Case Name']), axis=1)

    df['token_sort_ratio'] = df.apply(
    lambda x: fuzz.token_sort_ratio(x['Original Name'], 
                                    x['Test Case Name']), axis=1)

    df['token_set_ratio'] = df.apply(
    lambda x: fuzz.token_set_ratio(x['Original Name'], 
                                   x['Test Case Name']), axis=1)

    df['w_ratio'] = df.apply(
    lambda x: fuzz.WRatio(x['Original Name'], 
                          x['Test Case Name']), axis=1)

    df['uq_ratio'] = df.apply(
    lambda x: fuzz.UQRatio(x['Original Name'], 
                          x['Test Case Name']), axis=1)

    df['q_ratio'] = df.apply(
    lambda x: fuzz.QRatio(x['Original Name'], 
                          x['Test Case Name']), axis=1)    

    df['matching_numbers'] = df.apply(
    lambda x: matching_numbers(x['Original Name'], 
                               x['Test Case Name']), axis=1)

    df['matching_numbers_log'] = (df['matching_numbers']+1).apply(np.log)

    df['log_fuzz_score'] = (df['ratio'] + df['partial_ratio'] + 
                            df['token_sort_ratio'] + df['token_set_ratio']).apply(np.log)

    df['log_fuzz_score_numbers'] = df['log_fuzz_score'] + (df['matching_numbers']).apply(np.log)

    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.fillna(value=0, inplace=True)

    return df

In [6]:
df = engineer_features(df)
df

Unnamed: 0,Original Name,Test Case Name,Test Type,Label,Bridger Score,levenshtein_distance,damerau_levenshtein_distance,hamming_distance,jaro_similarity,jaro_winkler_similarity,...,partial_ratio,token_sort_ratio,token_set_ratio,w_ratio,uq_ratio,q_ratio,matching_numbers,matching_numbers_log,log_fuzz_score,log_fuzz_score_numbers
0,jam'yah ta'awun al-islamia,jam'yah ta'awun al-islamia,Fuzzy Match,Match,100,0,0,0,1.000000,1.000000,...,100,100,100,100,100,100,1.0,0.693147,5.991465,5.991465
1,"globovision tele ca, corp.","globovision tele ca, corp.",Fuzzy Match,Match,100,0,0,0,1.000000,1.000000,...,100,100,100,100,100,100,1.0,0.693147,5.991465,5.991465
2,korean people's army,korean people's army,Fuzzy Match,Match,100,0,0,0,1.000000,1.000000,...,100,100,100,100,100,100,1.0,0.693147,5.991465,5.991465
3,rancho la herradura,rancho la herradura,Fuzzy Match,Match,100,0,0,0,1.000000,1.000000,...,100,100,100,100,100,100,1.0,0.693147,5.991465,5.991465
4,"techno energy, s.a.","techno energy, s.a.",Fuzzy Match,Match,100,0,0,0,1.000000,1.000000,...,100,100,100,100,100,100,1.0,0.693147,5.991465,5.991465
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18503,global age limited,"shaaban, bouthaina",Non-Match,Not Match,0,16,16,18,0.449735,0.449735,...,29,23,23,28,28,28,1.0,0.693147,4.634729,4.634729
18504,obshchestvo s ogranichennoi otvetstvennostyu m...,tamin kalaye sabz aras company,Non-Match,Not Match,0,44,44,51,0.510733,0.510733,...,30,32,32,32,27,27,1.0,0.693147,4.795791,4.795791
18505,"ahmed, qassim abdullah ali",korea haegumgang trading corporation,Non-Match,Not Match,0,28,28,35,0.553419,0.553419,...,35,30,30,32,32,32,1.0,0.693147,4.844187,4.844187
18506,jingho technology co. limited,comite' de bienfaisance et de secours aux pale...,Non-Match,Not Match,0,43,43,54,0.546201,0.546201,...,34,35,36,37,34,34,1.0,0.693147,4.912655,4.912655


In [7]:
df =  df.drop(index = df[df['Bridger Score'] == 'ERROR WITH TEST CASE'].index.tolist())
df

Unnamed: 0,Original Name,Test Case Name,Test Type,Label,Bridger Score,levenshtein_distance,damerau_levenshtein_distance,hamming_distance,jaro_similarity,jaro_winkler_similarity,...,partial_ratio,token_sort_ratio,token_set_ratio,w_ratio,uq_ratio,q_ratio,matching_numbers,matching_numbers_log,log_fuzz_score,log_fuzz_score_numbers
0,jam'yah ta'awun al-islamia,jam'yah ta'awun al-islamia,Fuzzy Match,Match,100,0,0,0,1.000000,1.000000,...,100,100,100,100,100,100,1.0,0.693147,5.991465,5.991465
1,"globovision tele ca, corp.","globovision tele ca, corp.",Fuzzy Match,Match,100,0,0,0,1.000000,1.000000,...,100,100,100,100,100,100,1.0,0.693147,5.991465,5.991465
2,korean people's army,korean people's army,Fuzzy Match,Match,100,0,0,0,1.000000,1.000000,...,100,100,100,100,100,100,1.0,0.693147,5.991465,5.991465
3,rancho la herradura,rancho la herradura,Fuzzy Match,Match,100,0,0,0,1.000000,1.000000,...,100,100,100,100,100,100,1.0,0.693147,5.991465,5.991465
4,"techno energy, s.a.","techno energy, s.a.",Fuzzy Match,Match,100,0,0,0,1.000000,1.000000,...,100,100,100,100,100,100,1.0,0.693147,5.991465,5.991465
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18503,global age limited,"shaaban, bouthaina",Non-Match,Not Match,0,16,16,18,0.449735,0.449735,...,29,23,23,28,28,28,1.0,0.693147,4.634729,4.634729
18504,obshchestvo s ogranichennoi otvetstvennostyu m...,tamin kalaye sabz aras company,Non-Match,Not Match,0,44,44,51,0.510733,0.510733,...,30,32,32,32,27,27,1.0,0.693147,4.795791,4.795791
18505,"ahmed, qassim abdullah ali",korea haegumgang trading corporation,Non-Match,Not Match,0,28,28,35,0.553419,0.553419,...,35,30,30,32,32,32,1.0,0.693147,4.844187,4.844187
18506,jingho technology co. limited,comite' de bienfaisance et de secours aux pale...,Non-Match,Not Match,0,43,43,54,0.546201,0.546201,...,34,35,36,37,34,34,1.0,0.693147,4.912655,4.912655


In [8]:
for i in list(df[df['Bridger Score'] == 'NO MATCH'].index):
    df.loc[i, 'Bridger Score'] = np.random.randint(50, 70)
df

Unnamed: 0,Original Name,Test Case Name,Test Type,Label,Bridger Score,levenshtein_distance,damerau_levenshtein_distance,hamming_distance,jaro_similarity,jaro_winkler_similarity,...,partial_ratio,token_sort_ratio,token_set_ratio,w_ratio,uq_ratio,q_ratio,matching_numbers,matching_numbers_log,log_fuzz_score,log_fuzz_score_numbers
0,jam'yah ta'awun al-islamia,jam'yah ta'awun al-islamia,Fuzzy Match,Match,100,0,0,0,1.000000,1.000000,...,100,100,100,100,100,100,1.0,0.693147,5.991465,5.991465
1,"globovision tele ca, corp.","globovision tele ca, corp.",Fuzzy Match,Match,100,0,0,0,1.000000,1.000000,...,100,100,100,100,100,100,1.0,0.693147,5.991465,5.991465
2,korean people's army,korean people's army,Fuzzy Match,Match,100,0,0,0,1.000000,1.000000,...,100,100,100,100,100,100,1.0,0.693147,5.991465,5.991465
3,rancho la herradura,rancho la herradura,Fuzzy Match,Match,100,0,0,0,1.000000,1.000000,...,100,100,100,100,100,100,1.0,0.693147,5.991465,5.991465
4,"techno energy, s.a.","techno energy, s.a.",Fuzzy Match,Match,100,0,0,0,1.000000,1.000000,...,100,100,100,100,100,100,1.0,0.693147,5.991465,5.991465
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18503,global age limited,"shaaban, bouthaina",Non-Match,Not Match,0,16,16,18,0.449735,0.449735,...,29,23,23,28,28,28,1.0,0.693147,4.634729,4.634729
18504,obshchestvo s ogranichennoi otvetstvennostyu m...,tamin kalaye sabz aras company,Non-Match,Not Match,0,44,44,51,0.510733,0.510733,...,30,32,32,32,27,27,1.0,0.693147,4.795791,4.795791
18505,"ahmed, qassim abdullah ali",korea haegumgang trading corporation,Non-Match,Not Match,0,28,28,35,0.553419,0.553419,...,35,30,30,32,32,32,1.0,0.693147,4.844187,4.844187
18506,jingho technology co. limited,comite' de bienfaisance et de secours aux pale...,Non-Match,Not Match,0,43,43,54,0.546201,0.546201,...,34,35,36,37,34,34,1.0,0.693147,4.912655,4.912655


# Standardization

In [9]:
scaled_df = df.copy()
scaler = StandardScaler()
scaled_df.iloc[:, 4:] = scaler.fit_transform( df.iloc[:, 4:] )
scaled_df

Unnamed: 0,Original Name,Test Case Name,Test Type,Label,Bridger Score,levenshtein_distance,damerau_levenshtein_distance,hamming_distance,jaro_similarity,jaro_winkler_similarity,...,partial_ratio,token_sort_ratio,token_set_ratio,w_ratio,uq_ratio,q_ratio,matching_numbers,matching_numbers_log,log_fuzz_score,log_fuzz_score_numbers
0,jam'yah ta'awun al-islamia,jam'yah ta'awun al-islamia,Fuzzy Match,Match,1.118883,-1.012359,-1.011144,-1.091572,1.214116,1.181362,...,1.141537,1.202537,1.083441,1.109485,1.210077,1.210077,0.290843,0.290645,0.976546,0.688250
1,"globovision tele ca, corp.","globovision tele ca, corp.",Fuzzy Match,Match,1.118883,-1.012359,-1.011144,-1.091572,1.214116,1.181362,...,1.141537,1.202537,1.083441,1.109485,1.210077,1.210077,0.290843,0.290645,0.976546,0.688250
2,korean people's army,korean people's army,Fuzzy Match,Match,1.118883,-1.012359,-1.011144,-1.091572,1.214116,1.181362,...,1.141537,1.202537,1.083441,1.109485,1.210077,1.210077,0.290843,0.290645,0.976546,0.688250
3,rancho la herradura,rancho la herradura,Fuzzy Match,Match,1.118883,-1.012359,-1.011144,-1.091572,1.214116,1.181362,...,1.141537,1.202537,1.083441,1.109485,1.210077,1.210077,0.290843,0.290645,0.976546,0.688250
4,"techno energy, s.a.","techno energy, s.a.",Fuzzy Match,Match,1.118883,-1.012359,-1.011144,-1.091572,1.214116,1.181362,...,1.141537,1.202537,1.083441,1.109485,1.210077,1.210077,0.290843,0.290645,0.976546,0.688250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18503,global age limited,"shaaban, bouthaina",Non-Match,Not Match,-0.983421,0.087625,0.089042,-0.046856,-1.130074,-1.132769,...,-1.139635,-1.139413,-1.162921,-1.198015,-0.986142,-0.986142,0.290843,0.290645,-1.041982,-0.195192
18504,obshchestvo s ogranichennoi otvetstvennostyu m...,tamin kalaye sabz aras company,Non-Match,Not Match,-0.983421,2.012596,2.014366,1.868458,-0.870218,-0.876246,...,-1.107505,-0.865678,-0.900360,-1.069820,-1.016645,-1.016645,0.290843,0.290645,-0.802358,-0.090316
18505,"ahmed, qassim abdullah ali",korea haegumgang trading corporation,Non-Match,Not Match,-0.983421,0.912613,0.914181,0.939821,-0.688371,-0.696730,...,-0.946860,-0.926508,-0.958707,-1.069820,-0.864129,-0.864129,0.290843,0.290645,-0.730354,-0.058803
18506,jingho technology co. limited,comite' de bienfaisance et de secours aux pale...,Non-Match,Not Match,-0.983421,1.943847,1.945605,2.042577,-0.719119,-0.727084,...,-0.978989,-0.774434,-0.783665,-0.909577,-0.803123,-0.803123,0.290843,0.290645,-0.628489,-0.014220


In [10]:
scaled_df['Label'] = scaled_df['Test Type'].map({'Exact Match' : 2, 'Fuzzy Match' : 1, 'Non-Match' : 0})
scaled_df

Unnamed: 0,Original Name,Test Case Name,Test Type,Label,Bridger Score,levenshtein_distance,damerau_levenshtein_distance,hamming_distance,jaro_similarity,jaro_winkler_similarity,...,partial_ratio,token_sort_ratio,token_set_ratio,w_ratio,uq_ratio,q_ratio,matching_numbers,matching_numbers_log,log_fuzz_score,log_fuzz_score_numbers
0,jam'yah ta'awun al-islamia,jam'yah ta'awun al-islamia,Fuzzy Match,1,1.118883,-1.012359,-1.011144,-1.091572,1.214116,1.181362,...,1.141537,1.202537,1.083441,1.109485,1.210077,1.210077,0.290843,0.290645,0.976546,0.688250
1,"globovision tele ca, corp.","globovision tele ca, corp.",Fuzzy Match,1,1.118883,-1.012359,-1.011144,-1.091572,1.214116,1.181362,...,1.141537,1.202537,1.083441,1.109485,1.210077,1.210077,0.290843,0.290645,0.976546,0.688250
2,korean people's army,korean people's army,Fuzzy Match,1,1.118883,-1.012359,-1.011144,-1.091572,1.214116,1.181362,...,1.141537,1.202537,1.083441,1.109485,1.210077,1.210077,0.290843,0.290645,0.976546,0.688250
3,rancho la herradura,rancho la herradura,Fuzzy Match,1,1.118883,-1.012359,-1.011144,-1.091572,1.214116,1.181362,...,1.141537,1.202537,1.083441,1.109485,1.210077,1.210077,0.290843,0.290645,0.976546,0.688250
4,"techno energy, s.a.","techno energy, s.a.",Fuzzy Match,1,1.118883,-1.012359,-1.011144,-1.091572,1.214116,1.181362,...,1.141537,1.202537,1.083441,1.109485,1.210077,1.210077,0.290843,0.290645,0.976546,0.688250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18503,global age limited,"shaaban, bouthaina",Non-Match,0,-0.983421,0.087625,0.089042,-0.046856,-1.130074,-1.132769,...,-1.139635,-1.139413,-1.162921,-1.198015,-0.986142,-0.986142,0.290843,0.290645,-1.041982,-0.195192
18504,obshchestvo s ogranichennoi otvetstvennostyu m...,tamin kalaye sabz aras company,Non-Match,0,-0.983421,2.012596,2.014366,1.868458,-0.870218,-0.876246,...,-1.107505,-0.865678,-0.900360,-1.069820,-1.016645,-1.016645,0.290843,0.290645,-0.802358,-0.090316
18505,"ahmed, qassim abdullah ali",korea haegumgang trading corporation,Non-Match,0,-0.983421,0.912613,0.914181,0.939821,-0.688371,-0.696730,...,-0.946860,-0.926508,-0.958707,-1.069820,-0.864129,-0.864129,0.290843,0.290645,-0.730354,-0.058803
18506,jingho technology co. limited,comite' de bienfaisance et de secours aux pale...,Non-Match,0,-0.983421,1.943847,1.945605,2.042577,-0.719119,-0.727084,...,-0.978989,-0.774434,-0.783665,-0.909577,-0.803123,-0.803123,0.290843,0.290645,-0.628489,-0.014220


# Create test and train data

In [11]:
X = scaled_df[['levenshtein_distance', 'damerau_levenshtein_distance', 'hamming_distance',
       'jaro_similarity','jaro_winkler_similarity','matching_numbers_log',
       'matching_numbers','token_set_ratio','token_sort_ratio','partial_ratio',
       'ratio','log_fuzz_score','log_fuzz_score_numbers','match_rating_comparison',
       'q_ratio','uq_ratio','w_ratio']].values
y = scaled_df['Label'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)

# Select the model

In [12]:
def get_confusion_matrix_values(y_test, y_pred):
    cm = confusion_matrix(y_test, y_pred)
    return(cm[0][0], cm[0][1], cm[1][0], cm[1][1])

classifiers = {
    "DummyClassifier_stratified":DummyClassifier(strategy='stratified', random_state=0),    
    "KNeighborsClassifier":KNeighborsClassifier(3),
    "XGBClassifier":XGBClassifier(n_estimators=1000, learning_rate=0.1, eval_metric = ['logloss', 'auc', 'error']),
    "DecisionTreeClassifier":DecisionTreeClassifier(),
    "RandomForestClassifier":RandomForestClassifier(),
    "AdaBoostClassifier":AdaBoostClassifier(),
    "GradientBoostingClassifier":GradientBoostingClassifier(),
    "Perceptron": Perceptron(max_iter=40, eta0=0.1, random_state=1),
    "MLP": MLPClassifier(),
    "XGBClassifer tuned": XGBClassifier(colsample_bytree=0.8,
                      gamma=0.9,
                      max_depth=20,
                      min_child_weight=1,
                      scale_pos_weight=12,
                      subsample=0.9,
                      n_estimators=50, 
                      learning_rate=0.1,
                      eval_metric = ['logloss', 'auc', 'error'])
}

df_results = pd.DataFrame(columns=['model', 'accuracy', 'mae', 'precision',
                                   'recall','f1','roc','run_time','tp','fp',
                                   'tn','fn'])

for key in classifiers:

    start_time = time.time()
    classifier = classifiers[key]
    model = classifier.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    mae = mean_absolute_error(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    classification = classification_report(y_test, y_pred, zero_division=0)
    run_time = format(round((time.time() - start_time)/60,2))
    tp, fp, fn, tn = get_confusion_matrix_values(y_test, y_pred)
    
    try:
        y_pred = model.predict_proba(X_test)
        roc = roc_auc_score(y_test, y_pred, multi_class='ovo', average='weighted')
    except:
        roc = np.nan

    row = {'model': key,
           'accuracy': accuracy,
           'mae': mae,
           'precision': precision,
           'recall': recall,
           'f1': f1,
           'roc': roc,
           'run_time': run_time,
           'tp': tp,
           'fp': fp,
           'tn': tn,
           'fn': fn,
          }
    df_results = df_results.append(row, ignore_index=True)

df_results

Parameters: { "scale_pos_weight" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




Unnamed: 0,model,accuracy,mae,precision,recall,f1,roc,run_time,tp,fp,tn,fn
0,DummyClassifier_stratified,0.380043,0.865814,0.379151,0.380043,0.379544,0.503321,0.0,1411,679,318,700
1,KNeighborsClassifier,0.980728,0.019272,0.981378,0.980728,0.980606,0.985093,0.0,2767,9,1290,20
2,XGBClassifier,0.981628,0.018372,0.982247,0.981628,0.981555,0.991527,0.14,2764,12,1298,12
3,DecisionTreeClassifier,0.977486,0.022514,0.977955,0.977486,0.977391,0.986021,0.0,2747,29,1292,18
4,RandomForestClassifier,0.982529,0.017471,0.983179,0.982529,0.982465,0.990787,0.02,2767,9,1300,10
5,AdaBoostClassifier,0.971362,0.028638,0.972485,0.971362,0.970798,0.981414,0.01,2775,1,1230,80
6,GradientBoostingClassifier,0.981988,0.018012,0.982618,0.981988,0.98192,0.99093,0.1,2765,11,1299,11
7,Perceptron,0.974964,0.025036,0.975485,0.974964,0.974775,,0.0,2750,26,1275,31
8,MLP,0.980728,0.019272,0.981345,0.980728,0.98066,0.991554,0.12,2760,16,1297,11
9,XGBClassifer tuned,0.982349,0.017651,0.982998,0.982349,0.982279,0.991392,0.02,2767,9,1299,11


# Assess model performance

In [13]:
classifier = AdaBoostClassifier()
model = classifier.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [14]:
print(classification_report(y_test, y_pred, labels=[2, 1, 0], 
                            target_names=['match', 'not match']))

              precision    recall  f1-score   support

       match       0.95      1.00      0.97      1388
   not match       1.00      0.89      0.94      1388

    accuracy                           0.97      5552
   macro avg       0.97      0.96      0.97      5552
weighted avg       0.97      0.97      0.97      5552



In [15]:
tp, fp, fn, tn = get_confusion_matrix_values(y_test, y_pred)
print('tp: {}\nfp: {}\nfn: {}\ntn: {}'.format(tp, fp, fn, tn))

tp: 2775
fp: 1
fn: 80
tn: 1230
