In [1]:
from collections import defaultdict
from itertools import count
import matplotlib.pyplot as plt
from metaphone import doublemetaphone
import pandas as pd
from statistics import mean
import textdistance
import os
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_curve, f1_score, precision_score, recall_score
from sklearn.model_selection import cross_val_predict, cross_validate, StratifiedKFold

from matplotlib_venn import venn2, venn2_circles, venn2_unweighted
from matplotlib_venn import venn3, venn3_circles

from metaphone import doublemetaphone
import jellyfish

import sys
# Add the parent directory to sys.path so the project-local modules (model, process) can be imported
sys.path.append('..')

from model import Siamese
from process import load_data, load_json_config, str2emb

# Parameters:

In [2]:
# Trained folder: contains all of the information about the run and how the DL model predicted it.
# The per-fold test_k{i}.csv files are read from here.
trained_folder = "../results/gru_soundex"

# Results path: directory the final annotated CSV is written to.
csv_save_path = "../results/"

# Best save for the run: run_name selects the "<run_name>_k<k>_BEST" checkpoints,
# and DL_thrsholds holds one decision threshold per DL model (model k is paired
# with entry k; a score above the threshold is labelled 1).
run_name = "gru_soundex"
DL_thrsholds = [0.778285384,0.79654932,0.794194341,0.783809662,0.772859931]

# Config file passed to load_json_config when the DL models are restored.
config_file = "../configs/gru_soundex.json"

# Phonetic RF addition
# NOTE(review): this flag is not read anywhere in the visible notebook;
# create_RF_features always adds the Soundex-based features.
phonetic_RF = True

# Result set to evaluate: one of "1800s_ln", "1800s_fn" or "Normal".
result_set = "Normal"

In [3]:
# Helper functions:
def compare_dm1(s1, s2):
    """Return the normalized Levenshtein similarity of the first Double Metaphone codes.

    Each string is encoded with ``doublemetaphone`` and element ``[0]`` of the
    result is kept; the two codes are then compared with
    ``textdistance.levenshtein.normalized_similarity``.
    """
    first_code_1 = doublemetaphone(s1)[0]
    first_code_2 = doublemetaphone(s2)[0]
    similarity = textdistance.levenshtein.normalized_similarity
    return similarity(first_code_1, first_code_2)

def compare_dm2(s1, s2):
    """Return the normalized Levenshtein similarity of the second Double Metaphone codes.

    Same as ``compare_dm1`` but uses element ``[1]`` of each
    ``doublemetaphone`` result.
    """
    second_code_1 = doublemetaphone(s1)[1]
    second_code_2 = doublemetaphone(s2)[1]
    similarity = textdistance.levenshtein.normalized_similarity
    return similarity(second_code_1, second_code_2)

# MAX VALUE for RF
# Normalisation constants used by create_RF_features: the absolute difference
# in vowel / consonant / character counts between the two names is divided by
# the matching constant and subtracted from 1 to give a similarity.
# NOTE(review): presumably the largest differences expected in the data —
# confirm; a difference larger than the constant yields a negative feature.
vowel_max = 6
consonant_max = 13
character_max = 19

def create_RF_features(dataframe):
    """Build the random-forest similarity features for a frame of name pairs.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns ``name_a`` and ``name_b`` (strings). Any
        other columns (e.g. ``label``) are carried through unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the input columns followed by, in this order:
        ``levenshtein``, ``jaro``, ``jaro_winkler``, ``jaccard``,
        ``sorensen_dice``, ``dm1``, ``dm2``, ``vowels``, ``consonants``,
        ``characters``, ``levenshtein_phonetic``, ``jw_phonetic``.

    Notes
    -----
    The input frame is left untouched. The previous implementation added all
    feature columns (and four temporary count columns) to the caller's frame
    before returning a different frame, which silently changed data that
    earlier cells had already displayed.
    """
    # Work on a copy so the caller's frame is never mutated.
    features = dataframe.copy()

    # Extract the pairs once instead of re-iterating the frame per feature.
    names_a = features['name_a'].tolist()
    names_b = features['name_b'].tolist()

    def pairwise(similarity, left=names_a, right=names_b):
        """Apply a two-argument similarity to every (left, right) pair."""
        return [similarity(x, y) for x, y in zip(left, right)]

    # String-similarity features on the raw names.
    features['levenshtein'] = pairwise(textdistance.levenshtein.normalized_similarity)
    features['jaro'] = pairwise(textdistance.jaro.normalized_similarity)
    features['jaro_winkler'] = pairwise(textdistance.jaro_winkler.normalized_similarity)
    features['jaccard'] = pairwise(textdistance.jaccard.normalized_similarity)
    features['sorensen_dice'] = pairwise(textdistance.sorensen_dice.normalized_similarity)

    # Double Metaphone features (first and second code).
    features['dm1'] = pairwise(compare_dm1)
    features['dm2'] = pairwise(compare_dm2)

    # Count-based features. Only lowercase 'aeiou' are counted as vowels;
    # every other character counts as a consonant.
    def count_vowels(name):
        return sum(map(name.count, 'aeiou'))

    length_a = features['name_a'].str.len()
    length_b = features['name_b'].str.len()
    vowels_a = features['name_a'].apply(count_vowels)
    vowels_b = features['name_b'].apply(count_vowels)
    consonants_a = length_a - vowels_a
    consonants_b = length_b - vowels_b

    # Absolute count differences, scaled by the module-level maxima and
    # flipped so that 1 means "identical counts".
    features['vowels'] = 1 - ((vowels_a - vowels_b).abs() / vowel_max)
    features['consonants'] = 1 - ((consonants_a - consonants_b).abs() / consonant_max)
    features['characters'] = 1 - ((length_a - length_b).abs() / character_max)

    #Phonetic Component:
    # Soundex codes are computed once per name and reused by both features.
    soundex_a = [jellyfish.soundex(x) for x in names_a]
    soundex_b = [jellyfish.soundex(y) for y in names_b]
    features["levenshtein_phonetic"] = pairwise(
        textdistance.levenshtein.normalized_similarity, soundex_a, soundex_b)
    features["jw_phonetic"] = pairwise(
        textdistance.jaro_winkler.normalized_similarity, soundex_a, soundex_b)

    return features

# 1. Get the dataset and folds from trained folder

In [4]:
# Read the five test_k{i}.csv files of the trained run; each one becomes a
# fold with columns name_a, name_b (str) and label (bool).
folds = []

for i in range(5):
  test_csv = os.path.join(trained_folder, f"test_k{i}.csv")
  df = (
      pd.read_csv(test_csv, usecols=["name1", "name2", "label"])
      .astype({"label": bool, "name1": str, "name2": str})
      .rename(columns={"name1": "name_a", "name2": "name_b"})
  )
  folds.append(df)

len(folds)

folds[0]

Unnamed: 0,name_a,name_b,label
0,dessell,pessall,False
1,ellgood,elwood,False
2,ermann,erman,True
3,koland,nowland,False
4,radebach,rasbach,False
...,...,...,...
14995,tildsley,bertini,False
14996,rieck,riek,False
14997,dippery,brohart,False
14998,wipperman,wippermann,True


# 2. Train the RF model

In [5]:
# Add the similarity features to every fold.
out_pairs = [create_RF_features(pairs) for pairs in folds]
folds = out_pairs

# Accumulators over all test folds.
scores = defaultdict(list)
y_prob = []
name_a = []
name_b = []
labels = []

rf_y_pred = []

rf_models = []

# Columns that are not model inputs.
non_feature_columns = ['name_a', 'name_b', 'label']

for test_fold_index in range(len(folds)):
    # The fold preceding the test fold (wrapping around) is excluded from
    # training as well. NOTE(review): it is not used for anything else in
    # this cell — presumably this mirrors the DL train/val/test split; confirm.
    val_fold_index = (test_fold_index - 1) % len(folds)

    train_folds = [
        folds[fold_index]
        for fold_index in range(len(folds))
        if fold_index != test_fold_index and fold_index != val_fold_index
    ]
    # Concatenate once per fold instead of growing a frame inside a loop
    # (quadratic, and seeded with an empty frame whose concat behaviour is
    # deprecated in recent pandas).
    X_train = pd.concat([fold.drop(columns=non_feature_columns) for fold in train_folds])
    y_train = pd.concat([fold['label'] for fold in train_folds])

    test_fold = folds[test_fold_index]
    name_a.extend(test_fold["name_a"].tolist())
    name_b.extend(test_fold["name_b"].tolist())
    X_test = test_fold.drop(columns=non_feature_columns)
    y_test = test_fold['label']

    clf = RandomForestClassifier(random_state=0)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    rf_y_pred.extend(np.array(y_pred).tolist())

    scores['test_precision'].append(precision_score(y_test, y_pred))
    scores['test_recall'].append(recall_score(y_test, y_pred))
    scores['f1_score'].append(f1_score(y_test, y_pred))

    yt = np.array(y_test)
    yp = np.array(y_pred)

    to_labels = yt.astype(int).tolist()
    labels.extend(to_labels)

    # Confusion-matrix counts for this fold.
    scores['tp'].append(np.count_nonzero(yt & yp))
    scores['tn'].append(np.count_nonzero(np.logical_not(yt) & np.logical_not(yp)))
    scores['fp'].append(np.count_nonzero(np.logical_not(yt) & yp))
    scores['fn'].append(np.count_nonzero(yt & np.logical_not(yp)))

    # Second column of predict_proba (index 1) for every test pair.
    y_prob.extend(list(clf.predict_proba(X_test)[:, 1]))

    rf_models.append(clf)

# Mean of every metric over the folds.
print({k: mean(v) for k, v in scores.items()})

{'test_precision': 0.9030918765344335, 'test_recall': 0.93076, 'f1_score': 0.9167086101343502, 'tp': 4653.8, 'tn': 9500.6, 'fp': 499.4, 'fn': 346.2}


# 3. Load the best DL models

In [6]:
# Restore the best checkpoint of each of the five folds.
dl_models = []

for k in range(5):
    save_file = os.path.join("../saves", f"{run_name}_k{k}_BEST")

    # The config is re-read for every fold: whether load_data modifies the
    # config dictionaries is not visible here, so each call gets fresh ones.
    # (The unused json_file path that used to be built here pointed at
    # "configs/" rather than "../configs/" and has been removed.)
    DATASET_CONFIG, TRAIN_CONFIG, MODEL_KWARGS = load_json_config(config_file)

    # load_data returns seven values; only the second one (the model) is kept.
    _, model, _, _, _, _, _ = load_data(save_file, TRAIN_CONFIG, MODEL_KWARGS)
    dl_models.append(model)

print(dl_models)

Loaded run successfully from ../saves/gru_soundex_k0_BEST
Loaded run successfully from ../saves/gru_soundex_k1_BEST
Loaded run successfully from ../saves/gru_soundex_k2_BEST
Loaded run successfully from ../saves/gru_soundex_k3_BEST
Loaded run successfully from ../saves/gru_soundex_k4_BEST
[Siamese(
  (A_function): GRU(36, 25, num_layers=8, batch_first=True, bidirectional=True)
  (bidirectional_linear): Linear(in_features=3050, out_features=25, bias=True)
), Siamese(
  (A_function): GRU(36, 25, num_layers=8, batch_first=True, bidirectional=True)
  (bidirectional_linear): Linear(in_features=3050, out_features=25, bias=True)
), Siamese(
  (A_function): GRU(36, 25, num_layers=8, batch_first=True, bidirectional=True)
  (bidirectional_linear): Linear(in_features=3050, out_features=25, bias=True)
), Siamese(
  (A_function): GRU(36, 25, num_layers=8, batch_first=True, bidirectional=True)
  (bidirectional_linear): Linear(in_features=3050, out_features=25, bias=True)
), Siamese(
  (A_function): 

# 4. Get the Results set

In [14]:
def get_1800s_ln_results_set(path = "../data/1800s_last_name_pairs.tsv"):
    """Load the 1800s last-name pairs as a result set.

    The tab-separated, header-less file at ``path`` is read into the columns
    ``name0`` and ``name1``; a ``gt_label`` column set to 1 for every row is
    added.
    """
    pairs = pd.read_csv(path, sep = "\t", names = ["name0", "name1"])
    return pairs.assign(gt_label = 1)

def get_1800s_fn_results_set(path = "../data/1800s_first_name_pairs.tsv"):
    """Load the 1800s first-name pairs as a result set.

    The tab-separated, header-less file at ``path`` is read into the columns
    ``name0`` and ``name1``, a ``gt_label`` column set to 1 is added, and every
    row containing a missing value is dropped.
    """
    return (
        pd.read_csv(path, sep = "\t", names = ["name0", "name1"])
        .assign(gt_label = 1)
        .dropna(axis = 0)
    )
    
def get_Normal_results_set(path = "../data/gru_metaphone"):
    """Load the five test folds of the trained run as one result set.

    Reads ``test_k0.csv`` .. ``test_k4.csv`` from the module-level
    ``trained_folder`` and stacks them into a single frame with the columns
    ``name0``, ``name1`` and ``gt_label`` (labels stored as integers).

    NOTE: ``path`` is accepted but never read; the files always come from
    ``trained_folder``.
    """
    renamed_columns = {"name1": "name0", "name2": "name1", "label": "gt_label"}

    def read_fold(fold_number):
        fold_csv = os.path.join(trained_folder, f"test_k{fold_number}.csv")
        fold = pd.read_csv(fold_csv, usecols=["name1", "name2", "label"])
        fold = fold.astype({"label": bool, "name1": str, "name2": str})
        fold = fold.rename(columns=renamed_columns)
        # Labels go through bool first and are then stored as int.
        return fold.assign(gt_label=fold["gt_label"].astype(int))

    return pd.concat([read_fold(fold_number) for fold_number in range(5)], axis = 0)

# Map every supported value of `result_set` to the loader that builds it,
# then run the selected loader with its default arguments.
# An unsupported `result_set` value raises KeyError on the lookup below.
directory = {"1800s_ln": get_1800s_ln_results_set, "1800s_fn": get_1800s_fn_results_set, "Normal": get_Normal_results_set}
function = directory[result_set]
results_set = function()
# Display the loaded result set.
results_set

Unnamed: 0,name0,name1,gt_label
0,dessell,pessall,0
1,ellgood,elwood,0
2,ermann,erman,1
3,koland,nowland,0
4,radebach,rasbach,0
...,...,...,...
14995,eoute,rankle,0
14996,dossin,gossin,0
14997,izor,nordenberg,0
14998,schnars,schnarrs,1


# 5. Run models on result set 

Creates DL classification (score), DL classification (label), and RF classification

In [15]:
# Score every pair of the result set with both ensembles:
#   * DL: mean score over the models, plus a majority vote of the
#     per-model thresholded labels;
#   * RF: majority vote of the per-model predictions.
results_dir = results_set.to_dict('records')
results_dir_out = list()

# zip() below would silently ignore surplus models or thresholds.
assert len(dl_models) == len(DL_thrsholds), "one threshold per DL model is required"

# Strict majority of each ensemble (3 votes for the five models of this run),
# derived from the ensemble size instead of being hard-coded.
dl_majority = len(dl_models) // 2 + 1
rf_majority = len(rf_models) // 2 + 1

# Random Forest: build the features for all pairs at once and call every
# model once on the whole batch. The per-pair labels are the same as with one
# single-row frame per pair, without building ~len(results_dir) tiny frames.
# NOTE(review): for result_set == "Normal" the pairs come from the same
# test_k*.csv files the RF folds were built from, so each pair was part of the
# training data of some of the rf_models voting on it — confirm this is intended.
rf_positive_votes = np.zeros(len(results_dir), dtype=int)
if results_dir:
    rf_input = pd.DataFrame({
        "name_a": [pair["name0"] for pair in results_dir],
        "name_b": [pair["name1"] for pair in results_dir],
        "label": [pair["gt_label"] for pair in results_dir],
    })
    X_rf = create_RF_features(rf_input).drop(columns=['name_a', 'name_b', 'label'])
    for rf_model in rf_models:
        rf_positive_votes += np.asarray(rf_model.predict(X_rf)).astype(bool).astype(int)

for index, pair in enumerate(results_dir):
    # Deep Learning:
    n1_emb = str2emb(pair['name0']).unsqueeze(0)
    n2_emb = str2emb(pair['name1']).unsqueeze(0)

    score_list = []
    label_list = []

    for model, threshold in zip(dl_models, DL_thrsholds):
        # The model returns the score and a 2-tuple that is not needed here.
        score, (_, _) = model(n1_emb, n2_emb)
        score = score.item()
        score_list.append(score)

        label_list.append(1 if score > threshold else 0)

    DL_classification_score = sum(score_list) / len(score_list)
    DL_classification_label = 1 if sum(label_list) >= dl_majority else 0

    # Random Forest: look up the votes computed in the batch above.
    RF_classification_label = 1 if rf_positive_votes[index] >= rf_majority else 0

    out = {"name0": pair['name0'], "name1": pair['name1'], "gt_label": pair['gt_label'], "DL_classification_score": DL_classification_score, \
           "DL_classification_label": DL_classification_label, "RF_classification_label": RF_classification_label}
    results_dir_out.append(out)

    # Progress output every 100 pairs.
    if index % 100 == 0:
        print(index, out)

results_set = pd.DataFrame(results_dir_out)
results_set

0 {'name0': 'dessell', 'name1': 'pessall', 'gt_label': 0, 'DL_classification_score': 0.4728097140789032, 'DL_classification_label': 0, 'RF_classification_label': 0}
100 {'name0': 'uselman', 'name1': 'usselman', 'gt_label': 1, 'DL_classification_score': 0.9599664688110352, 'DL_classification_label': 1, 'RF_classification_label': 1}
200 {'name0': 'humphry', 'name1': 'heaslet', 'gt_label': 0, 'DL_classification_score': 0.5513315260410309, 'DL_classification_label': 0, 'RF_classification_label': 0}
300 {'name0': 'hetrick', 'name1': 'hetzke', 'gt_label': 0, 'DL_classification_score': 0.5893280982971192, 'DL_classification_label': 0, 'RF_classification_label': 0}
400 {'name0': 'schaurer', 'name1': 'scheuren', 'gt_label': 0, 'DL_classification_score': 0.4880423963069916, 'DL_classification_label': 0, 'RF_classification_label': 0}
500 {'name0': 'bardson', 'name1': 'maryson', 'gt_label': 0, 'DL_classification_score': 0.43894789218902586, 'DL_classification_label': 0, 'RF_classification_label': 

4900 {'name0': 'draughon', 'name1': 'draughan', 'gt_label': 1, 'DL_classification_score': 0.9752143144607544, 'DL_classification_label': 1, 'RF_classification_label': 1}
5000 {'name0': 'siemsen', 'name1': 'gravenhorst', 'gt_label': 0, 'DL_classification_score': 0.48481252789497375, 'DL_classification_label': 0, 'RF_classification_label': 0}
5100 {'name0': 'morper', 'name1': 'donalan', 'gt_label': 0, 'DL_classification_score': 0.5316181600093841, 'DL_classification_label': 0, 'RF_classification_label': 0}
5200 {'name0': 'bodenstiner', 'name1': 'rosenstine', 'gt_label': 0, 'DL_classification_score': 0.47456197142601014, 'DL_classification_label': 0, 'RF_classification_label': 0}
5300 {'name0': 'naes', 'name1': 'shaws', 'gt_label': 0, 'DL_classification_score': 0.5924167335033417, 'DL_classification_label': 0, 'RF_classification_label': 0}
5400 {'name0': 'arthurhultz', 'name1': 'arthurhults', 'gt_label': 1, 'DL_classification_score': 0.9875868201255799, 'DL_classification_label': 1, 'RF_c

9800 {'name0': 'hardenberg', 'name1': 'dutartre', 'gt_label': 0, 'DL_classification_score': 0.48515431880950927, 'DL_classification_label': 0, 'RF_classification_label': 0}
9900 {'name0': 'etherage', 'name1': 'keenahan', 'gt_label': 0, 'DL_classification_score': 0.5185561418533325, 'DL_classification_label': 0, 'RF_classification_label': 0}
10000 {'name0': 'biedermann', 'name1': 'revermann', 'gt_label': 0, 'DL_classification_score': 0.4400780022144318, 'DL_classification_label': 0, 'RF_classification_label': 0}
10100 {'name0': 'wigness', 'name1': 'holway', 'gt_label': 0, 'DL_classification_score': 0.42742201685905457, 'DL_classification_label': 0, 'RF_classification_label': 0}
10200 {'name0': 'hotsenpillar', 'name1': 'piersey', 'gt_label': 0, 'DL_classification_score': 0.5714124441146851, 'DL_classification_label': 0, 'RF_classification_label': 0}
10300 {'name0': 'tharrington', 'name1': 'remers', 'gt_label': 0, 'DL_classification_score': 0.5320972681045533, 'DL_classification_label': 0

14700 {'name0': 'denley', 'name1': 'denly', 'gt_label': 1, 'DL_classification_score': 0.9740320205688476, 'DL_classification_label': 1, 'RF_classification_label': 1}
14800 {'name0': 'santimore', 'name1': 'santmyre', 'gt_label': 0, 'DL_classification_score': 0.9137677311897278, 'DL_classification_label': 1, 'RF_classification_label': 1}
14900 {'name0': 'schofeld', 'name1': 'reints', 'gt_label': 0, 'DL_classification_score': 0.5466343641281128, 'DL_classification_label': 0, 'RF_classification_label': 0}
15000 {'name0': 'bieske', 'name1': 'breske', 'gt_label': 0, 'DL_classification_score': 0.5818443059921264, 'DL_classification_label': 0, 'RF_classification_label': 1}
15100 {'name0': 'eudaly', 'name1': 'eudaley', 'gt_label': 1, 'DL_classification_score': 0.9805828332901001, 'DL_classification_label': 1, 'RF_classification_label': 1}
15200 {'name0': 'destro', 'name1': 'usmer', 'gt_label': 0, 'DL_classification_score': 0.4736616790294647, 'DL_classification_label': 0, 'RF_classification_lab

19600 {'name0': 'bierd', 'name1': 'bird', 'gt_label': 0, 'DL_classification_score': 0.8453516840934754, 'DL_classification_label': 1, 'RF_classification_label': 0}
19700 {'name0': 'fender', 'name1': 'swender', 'gt_label': 0, 'DL_classification_score': 0.5034403681755066, 'DL_classification_label': 0, 'RF_classification_label': 0}
19800 {'name0': 'lescalleet', 'name1': 'lescallett', 'gt_label': 1, 'DL_classification_score': 0.9779742598533631, 'DL_classification_label': 1, 'RF_classification_label': 1}
19900 {'name0': 'kittrell', 'name1': 'kitrell', 'gt_label': 1, 'DL_classification_score': 0.9411944150924683, 'DL_classification_label': 1, 'RF_classification_label': 1}
20000 {'name0': 'northcotte', 'name1': 'stimers', 'gt_label': 0, 'DL_classification_score': 0.5128183186054229, 'DL_classification_label': 0, 'RF_classification_label': 0}
20100 {'name0': 'broucher', 'name1': 'friskin', 'gt_label': 0, 'DL_classification_score': 0.4621239423751831, 'DL_classification_label': 0, 'RF_classif

24500 {'name0': 'pollison', 'name1': 'woolison', 'gt_label': 0, 'DL_classification_score': 0.5409852564334869, 'DL_classification_label': 0, 'RF_classification_label': 0}
24600 {'name0': 'dieckmann', 'name1': 'blaize', 'gt_label': 0, 'DL_classification_score': 0.5511248886585236, 'DL_classification_label': 0, 'RF_classification_label': 0}
24700 {'name0': 'ofallon', 'name1': 'armstone', 'gt_label': 0, 'DL_classification_score': 0.4205224931240082, 'DL_classification_label': 0, 'RF_classification_label': 0}
24800 {'name0': 'dorcett', 'name1': 'cadarette', 'gt_label': 0, 'DL_classification_score': 0.5665545642375946, 'DL_classification_label': 0, 'RF_classification_label': 0}
24900 {'name0': 'proudlove', 'name1': 'wambsgans', 'gt_label': 0, 'DL_classification_score': 0.46481955647468565, 'DL_classification_label': 0, 'RF_classification_label': 0}
25000 {'name0': 'canden', 'name1': 'canthen', 'gt_label': 0, 'DL_classification_score': 0.49736409783363345, 'DL_classification_label': 0, 'RF_c

29400 {'name0': 'sempers', 'name1': 'semper', 'gt_label': 1, 'DL_classification_score': 0.9434703469276429, 'DL_classification_label': 1, 'RF_classification_label': 1}
29500 {'name0': 'showan', 'name1': 'showen', 'gt_label': 0, 'DL_classification_score': 0.9340993523597717, 'DL_classification_label': 1, 'RF_classification_label': 1}
29600 {'name0': 'berthelsen', 'name1': 'berthelson', 'gt_label': 1, 'DL_classification_score': 0.9727162957191468, 'DL_classification_label': 1, 'RF_classification_label': 1}
29700 {'name0': 'vanluven', 'name1': 'vanluvan', 'gt_label': 1, 'DL_classification_score': 0.9823041558265686, 'DL_classification_label': 1, 'RF_classification_label': 1}
29800 {'name0': 'talar', 'name1': 'traylar', 'gt_label': 0, 'DL_classification_score': 0.531522250175476, 'DL_classification_label': 0, 'RF_classification_label': 0}
29900 {'name0': 'meignier', 'name1': 'desbrow', 'gt_label': 0, 'DL_classification_score': 0.43904598355293273, 'DL_classification_label': 0, 'RF_classifi

34300 {'name0': 'keselring', 'name1': 'cuchman', 'gt_label': 0, 'DL_classification_score': 0.4669219315052032, 'DL_classification_label': 0, 'RF_classification_label': 0}
34400 {'name0': 'kirchin', 'name1': 'kerchin', 'gt_label': 1, 'DL_classification_score': 0.9657846212387085, 'DL_classification_label': 1, 'RF_classification_label': 1}
34500 {'name0': 'swinbourne', 'name1': 'swinbourn', 'gt_label': 1, 'DL_classification_score': 0.982998251914978, 'DL_classification_label': 1, 'RF_classification_label': 1}
34600 {'name0': 'garrison', 'name1': 'larrison', 'gt_label': 0, 'DL_classification_score': 0.47760587334632876, 'DL_classification_label': 0, 'RF_classification_label': 0}
34700 {'name0': 'valberg', 'name1': 'wilberg', 'gt_label': 0, 'DL_classification_score': 0.5046333014965058, 'DL_classification_label': 0, 'RF_classification_label': 0}
34800 {'name0': 'flosi', 'name1': 'flossi', 'gt_label': 1, 'DL_classification_score': 0.9433175325393677, 'DL_classification_label': 1, 'RF_classi

39200 {'name0': 'freudenberger', 'name1': 'frendenberger', 'gt_label': 1, 'DL_classification_score': 0.9172452211380004, 'DL_classification_label': 1, 'RF_classification_label': 1}
39300 {'name0': 'doroling', 'name1': 'chastine', 'gt_label': 0, 'DL_classification_score': 0.41975106596946715, 'DL_classification_label': 0, 'RF_classification_label': 0}
39400 {'name0': 'mcdonic', 'name1': 'mcdonie', 'gt_label': 1, 'DL_classification_score': 0.8877875089645386, 'DL_classification_label': 1, 'RF_classification_label': 1}
39500 {'name0': 'giliam', 'name1': 'gilliem', 'gt_label': 0, 'DL_classification_score': 0.8936335682868958, 'DL_classification_label': 1, 'RF_classification_label': 0}
39600 {'name0': 'struzynski', 'name1': 'hezelwood', 'gt_label': 0, 'DL_classification_score': 0.5091722965240478, 'DL_classification_label': 0, 'RF_classification_label': 0}
39700 {'name0': 'ladeaux', 'name1': 'witmarsh', 'gt_label': 0, 'DL_classification_score': 0.41100395321846006, 'DL_classification_label'

44100 {'name0': 'novakoski', 'name1': 'laniewski', 'gt_label': 0, 'DL_classification_score': 0.4815071105957031, 'DL_classification_label': 0, 'RF_classification_label': 0}
44200 {'name0': 'heatherly', 'name1': 'heatherley', 'gt_label': 1, 'DL_classification_score': 0.9817499637603759, 'DL_classification_label': 1, 'RF_classification_label': 1}
44300 {'name0': 'sholes', 'name1': 'showes', 'gt_label': 0, 'DL_classification_score': 0.49208263754844667, 'DL_classification_label': 0, 'RF_classification_label': 0}
44400 {'name0': 'wiethoff', 'name1': 'weithoff', 'gt_label': 1, 'DL_classification_score': 0.9984649658203125, 'DL_classification_label': 1, 'RF_classification_label': 1}
44500 {'name0': 'blachley', 'name1': 'blackney', 'gt_label': 0, 'DL_classification_score': 0.5850332021713257, 'DL_classification_label': 0, 'RF_classification_label': 0}
44600 {'name0': 'dissette', 'name1': 'dissett', 'gt_label': 1, 'DL_classification_score': 0.9838887095451355, 'DL_classification_label': 1, 'RF

49000 {'name0': 'boullin', 'name1': 'couglin', 'gt_label': 0, 'DL_classification_score': 0.5744135677814484, 'DL_classification_label': 0, 'RF_classification_label': 0}
49100 {'name0': 'healer', 'name1': 'hellyer', 'gt_label': 0, 'DL_classification_score': 0.5948164463043213, 'DL_classification_label': 0, 'RF_classification_label': 0}
49200 {'name0': 'ybanez', 'name1': 'ibanez', 'gt_label': 1, 'DL_classification_score': 0.8616811513900757, 'DL_classification_label': 1, 'RF_classification_label': 1}
49300 {'name0': 'claytin', 'name1': 'hollison', 'gt_label': 0, 'DL_classification_score': 0.4625397503376007, 'DL_classification_label': 0, 'RF_classification_label': 0}
49400 {'name0': 'moorhead', 'name1': 'moorehead', 'gt_label': 1, 'DL_classification_score': 0.9026038408279419, 'DL_classification_label': 1, 'RF_classification_label': 1}
49500 {'name0': 'mayginnes', 'name1': 'mayginnis', 'gt_label': 1, 'DL_classification_score': 0.9734827160835267, 'DL_classification_label': 1, 'RF_classif

53900 {'name0': 'dorrion', 'name1': 'obrion', 'gt_label': 0, 'DL_classification_score': 0.4754667282104492, 'DL_classification_label': 0, 'RF_classification_label': 0}
54000 {'name0': 'beilfuss', 'name1': 'marcia', 'gt_label': 0, 'DL_classification_score': 0.45814881324768064, 'DL_classification_label': 0, 'RF_classification_label': 0}
54100 {'name0': 'kellmann', 'name1': 'gallowa', 'gt_label': 0, 'DL_classification_score': 0.514812308549881, 'DL_classification_label': 0, 'RF_classification_label': 0}
54200 {'name0': 'bitzer', 'name1': 'betzer', 'gt_label': 1, 'DL_classification_score': 0.9228735208511353, 'DL_classification_label': 1, 'RF_classification_label': 1}
54300 {'name0': 'lemirande', 'name1': 'gelsinger', 'gt_label': 0, 'DL_classification_score': 0.49472794532775877, 'DL_classification_label': 0, 'RF_classification_label': 0}
54400 {'name0': 'prysock', 'name1': 'prisock', 'gt_label': 1, 'DL_classification_score': 0.9282411098480224, 'DL_classification_label': 1, 'RF_classific

58800 {'name0': 'trenter', 'name1': 'tresner', 'gt_label': 0, 'DL_classification_score': 0.5076681733131408, 'DL_classification_label': 0, 'RF_classification_label': 0}
58900 {'name0': 'papper', 'name1': 'stapper', 'gt_label': 0, 'DL_classification_score': 0.5724182069301605, 'DL_classification_label': 0, 'RF_classification_label': 0}
59000 {'name0': 'goenner', 'name1': 'grebner', 'gt_label': 0, 'DL_classification_score': 0.5653469324111938, 'DL_classification_label': 0, 'RF_classification_label': 0}
59100 {'name0': 'vandervorste', 'name1': 'asbaugh', 'gt_label': 0, 'DL_classification_score': 0.575076574087143, 'DL_classification_label': 0, 'RF_classification_label': 0}
59200 {'name0': 'litzenberg', 'name1': 'litzenburg', 'gt_label': 1, 'DL_classification_score': 0.9694939732551575, 'DL_classification_label': 1, 'RF_classification_label': 1}
59300 {'name0': 'gieselmann', 'name1': 'giesmann', 'gt_label': 0, 'DL_classification_score': 0.5600980699062348, 'DL_classification_label': 0, 'RF

63700 {'name0': 'bridgman', 'name1': 'bridgeman', 'gt_label': 1, 'DL_classification_score': 0.6716394543647766, 'DL_classification_label': 0, 'RF_classification_label': 1}
63800 {'name0': 'schroy', 'name1': 'septka', 'gt_label': 0, 'DL_classification_score': 0.4744969308376312, 'DL_classification_label': 0, 'RF_classification_label': 0}
63900 {'name0': 'harrowven', 'name1': 'lechman', 'gt_label': 0, 'DL_classification_score': 0.4485111951828003, 'DL_classification_label': 0, 'RF_classification_label': 0}
64000 {'name0': 'windrow', 'name1': 'wodrow', 'gt_label': 0, 'DL_classification_score': 0.7150873899459839, 'DL_classification_label': 0, 'RF_classification_label': 0}
64100 {'name0': 'delikowski', 'name1': 'lisowski', 'gt_label': 0, 'DL_classification_score': 0.5546851336956025, 'DL_classification_label': 0, 'RF_classification_label': 0}
64200 {'name0': 'dubberley', 'name1': 'dubberly', 'gt_label': 1, 'DL_classification_score': 0.9875670671463013, 'DL_classification_label': 1, 'RF_cla

68600 {'name0': 'klontz', 'name1': 'kloutz', 'gt_label': 1, 'DL_classification_score': 0.8691542506217956, 'DL_classification_label': 1, 'RF_classification_label': 1}
68700 {'name0': 'leseman', 'name1': 'riseman', 'gt_label': 0, 'DL_classification_score': 0.43477078676223757, 'DL_classification_label': 0, 'RF_classification_label': 0}
68800 {'name0': 'sprowles', 'name1': 'sprowls', 'gt_label': 1, 'DL_classification_score': 0.9804736256599427, 'DL_classification_label': 1, 'RF_classification_label': 1}
68900 {'name0': 'mccanney', 'name1': 'mchanney', 'gt_label': 0, 'DL_classification_score': 0.8447095036506653, 'DL_classification_label': 1, 'RF_classification_label': 1}
69000 {'name0': 'serovy', 'name1': 'lunceford', 'gt_label': 0, 'DL_classification_score': 0.4975481927394867, 'DL_classification_label': 0, 'RF_classification_label': 0}
69100 {'name0': 'tetsell', 'name1': 'weighley', 'gt_label': 0, 'DL_classification_score': 0.5325801134109497, 'DL_classification_label': 0, 'RF_classifi

73500 {'name0': 'jeanotte', 'name1': 'jeannotte', 'gt_label': 1, 'DL_classification_score': 0.9505088329315186, 'DL_classification_label': 1, 'RF_classification_label': 1}
73600 {'name0': 'bomgarner', 'name1': 'lussen', 'gt_label': 0, 'DL_classification_score': 0.5018584966659546, 'DL_classification_label': 0, 'RF_classification_label': 0}
73700 {'name0': 'caldell', 'name1': 'ayrs', 'gt_label': 0, 'DL_classification_score': 0.5038582801818847, 'DL_classification_label': 0, 'RF_classification_label': 0}
73800 {'name0': 'robbs', 'name1': 'robs', 'gt_label': 0, 'DL_classification_score': 0.7810211777687073, 'DL_classification_label': 1, 'RF_classification_label': 1}
73900 {'name0': 'hatford', 'name1': 'sanford', 'gt_label': 0, 'DL_classification_score': 0.5465218186378479, 'DL_classification_label': 0, 'RF_classification_label': 0}
74000 {'name0': 'baldock', 'name1': 'beldock', 'gt_label': 0, 'DL_classification_score': 0.8554090857505798, 'DL_classification_label': 1, 'RF_classification_l

Unnamed: 0,name0,name1,gt_label,DL_classification_score,DL_classification_label,RF_classification_label
0,dessell,pessall,0,0.472810,0,0
1,ellgood,elwood,0,0.644247,0,0
2,ermann,erman,1,0.974459,1,1
3,koland,nowland,0,0.447516,0,0
4,radebach,rasbach,0,0.470515,0,0
...,...,...,...,...,...,...
74995,eoute,rankle,0,0.525536,0,0
74996,dossin,gossin,0,0.511264,0,0
74997,izor,nordenberg,0,0.499483,0,0
74998,schnars,schnarrs,1,0.943826,1,1


# 6. Create additional data for the result set
DL Bucket, RF bucket, concatenated bucket

In [16]:
def calculate_bucket(gt, pred):
    """Name the confusion-matrix cell of one (ground truth, prediction) pair.

    Returns "tp", "fn", "fp" or "tn" for 0/1 inputs, and None for any other
    combination.
    """
    bucket_by_outcome = {
        (1, 1): "tp",
        (1, 0): "fn",
        (0, 1): "fp",
        (0, 0): "tn",
    }
    return bucket_by_outcome.get((gt, pred))
    
# Tag every pair with the confusion-matrix bucket of each classifier, plus a
# combined "<DL bucket>_<RF bucket>" tag.
ground_truth = results_set["gt_label"]
results_set["DL_bucket"] = [
    calculate_bucket(gt, pred)
    for gt, pred in zip(ground_truth, results_set["DL_classification_label"])
]
results_set["RF_bucket"] = [
    calculate_bucket(gt, pred)
    for gt, pred in zip(ground_truth, results_set["RF_classification_label"])
]
results_set["concat_bucket"] = [
    dl_bucket + "_" + rf_bucket
    for dl_bucket, rf_bucket in zip(results_set["DL_bucket"], results_set["RF_bucket"])
]

results_set

Unnamed: 0,name0,name1,gt_label,DL_classification_score,DL_classification_label,RF_classification_label,DL_bucket,RF_bucket,concat_bucket
0,dessell,pessall,0,0.472810,0,0,tn,tn,tn_tn
1,ellgood,elwood,0,0.644247,0,0,tn,tn,tn_tn
2,ermann,erman,1,0.974459,1,1,tp,tp,tp_tp
3,koland,nowland,0,0.447516,0,0,tn,tn,tn_tn
4,radebach,rasbach,0,0.470515,0,0,tn,tn,tn_tn
...,...,...,...,...,...,...,...,...,...
74995,eoute,rankle,0,0.525536,0,0,tn,tn,tn_tn
74996,dossin,gossin,0,0.511264,0,0,tn,tn,tn_tn
74997,izor,nordenberg,0,0.499483,0,0,tn,tn,tn_tn
74998,schnars,schnarrs,1,0.943826,1,1,tp,tp,tp_tp


# 7. Save to .csv

In [17]:
# Write the annotated result set to "<run_name>_<result_set>.csv" inside csv_save_path.
output_csv = os.path.join(csv_save_path, f"{run_name}_{result_set}.csv")
results_set.to_csv(output_csv)