In [1]:
from collections import defaultdict
from itertools import count
import matplotlib.pyplot as plt
from metaphone import doublemetaphone
import pandas as pd
from statistics import mean
import textdistance
import os
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_curve, f1_score, precision_score, recall_score
from sklearn.model_selection import cross_val_predict, cross_validate, StratifiedKFold

from matplotlib_venn import venn2, venn2_circles, venn2_unweighted
from matplotlib_venn import venn3, venn3_circles

from metaphone import doublemetaphone
import jellyfish

import sys
# Add the ptdraft folder path to the sys.path list
sys.path.append('..')

from model import Siamese
from process import load_data, load_json_config, str2emb

# Parameters:

In [2]:
# Trained folder: contrains all of the information about the run. and how the DL model predicted it.
trained_folder = "../results/gru_soundex"

# results path
csv_save_path = "../results/"

# Best Save for the Run:
run_name = "gru_soundex"
DL_thrsholds = [0.778285384,0.79654932,0.794194341,0.783809662,0.772859931]

# Config file:
config_file = "../configs/gru_soundex.json"

# Phonetic RF addition
phonetic_RF = True

# Result Set: {1800s_ln, 1800s_fn or Normal}:
result_set = "Normal"

In [3]:
# Helper functions:
def compare_dm1(s1, s2):
    return textdistance.levenshtein.normalized_similarity(doublemetaphone(s1)[0],doublemetaphone(s2)[0])

def compare_dm2(s1, s2):
    return textdistance.levenshtein.normalized_similarity(doublemetaphone(s1)[1],doublemetaphone(s2)[1])

# MAX VALUE for RF
vowel_max = 6
consonant_max = 13
character_max = 19

def create_RF_features(dataframe):
    dataframe['levenshtein'] = [textdistance.levenshtein.normalized_similarity(x, y) for x, y in dataframe[['name_a', 'name_b']].itertuples(index=False)]
    dataframe['jaro'] = [textdistance.jaro.normalized_similarity(x, y) for x, y in dataframe[['name_a', 'name_b']].itertuples(index=False)]
    dataframe['jaro_winkler'] = [textdistance.jaro_winkler.normalized_similarity(x, y) for x, y in dataframe[['name_a', 'name_b']].itertuples(index=False)]
    dataframe['jaccard'] = [textdistance.jaccard.normalized_similarity(x, y) for x, y in dataframe[['name_a', 'name_b']].itertuples(index=False)]
    dataframe['sorensen_dice'] = [textdistance.sorensen_dice.normalized_similarity(x, y) for x, y in dataframe[['name_a', 'name_b']].itertuples(index=False)]
    dataframe['dm1'] = [compare_dm1(x, y) for x, y in dataframe[['name_a', 'name_b']].itertuples(index=False)]
    dataframe['dm2'] = [compare_dm2(x, y) for x, y in dataframe[['name_a', 'name_b']].itertuples(index=False)]
    dataframe['vowels_a'] = dataframe['name_a'].apply(lambda x: sum(map(x.count, 'aeiou')))
    dataframe['vowels_b'] = dataframe['name_b'].apply(lambda x: sum(map(x.count, 'aeiou')))
    dataframe['consonants_a'] = dataframe['name_a'].str.len() - dataframe['vowels_a']
    dataframe['consonants_b'] = dataframe['name_b'].str.len() - dataframe['vowels_b']
    dataframe['vowels'] = (dataframe['vowels_a'] - dataframe['vowels_b']).abs()
    dataframe['vowels'] = 1 - (dataframe['vowels'] / vowel_max)
    dataframe['consonants'] = (dataframe['consonants_a'] - dataframe['consonants_b']).abs()
    dataframe['consonants'] = 1 - (dataframe['consonants'] / consonant_max)
    dataframe['characters'] = (dataframe['name_a'].str.len() - dataframe['name_b'].str.len()).abs()
    dataframe['characters'] = 1 - (dataframe['characters'] / character_max)

    #Phonetic Component:
    dataframe["levenshtein_phonetic"] = [textdistance.levenshtein.normalized_similarity(jellyfish.soundex(x), jellyfish.soundex(y)) for x, y in dataframe[['name_a', 'name_b']].itertuples(index=False)]
    dataframe["jw_phonetic"] = [textdistance.jaro_winkler.normalized_similarity(jellyfish.soundex(x), jellyfish.soundex(y)) for x, y in dataframe[['name_a', 'name_b']].itertuples(index=False)]

    dataframe = dataframe.drop(columns=['vowels_a', 'vowels_b', 'consonants_a', 'consonants_b'])
    
    return dataframe

# 1. Get the dataset and folds from trained folder

In [4]:
folds = []

for i in range(5):
  test_csv = os.path.join(trained_folder, f"test_k{i}.csv")
  df = pd.read_csv(test_csv, usecols=["name1", "name2", "label"])
  df = df.astype({"label": bool, "name1": str, "name2": str})
  df = df.rename(columns={"name1": "name_a", "name2": "name_b"})

  folds.append(df)

len(folds)

folds[0]

Unnamed: 0,name_a,name_b,label
0,dessell,pessall,False
1,ellgood,elwood,False
2,ermann,erman,True
3,koland,nowland,False
4,radebach,rasbach,False
...,...,...,...
14995,tildsley,bertini,False
14996,rieck,riek,False
14997,dippery,brohart,False
14998,wipperman,wippermann,True


# 2. Train the RF model

In [5]:
out_pairs = []

for pairs in folds:
    out_pairs.append(create_RF_features(pairs))
    
folds = out_pairs
    
scores = defaultdict(list)
y_prob = []
name_a = []
name_b = []
labels = []

rf_y_pred = []

rf_models = []

for test_fold_index in range(len(folds)):
    val_fold_index = test_fold_index - 1 if test_fold_index - 1 >= 0 else len(folds) - 1

    X_train = pd.DataFrame()
    y_train = pd.Series(dtype=bool)
    for fold_index in range(len(folds)):
        if fold_index != test_fold_index and fold_index != val_fold_index:
            X_train = pd.concat([X_train, folds[fold_index].drop(columns=['name_a', 'name_b', 'label'])])
            y_train = pd.concat([y_train, folds[fold_index]['label']])
    name_a = name_a + folds[test_fold_index]["name_a"].values.tolist()
    name_b = name_b + folds[test_fold_index]["name_b"].values.tolist()
    X_test = folds[test_fold_index].drop(columns=['name_a', 'name_b', 'label'])
    y_test = folds[test_fold_index]['label']
    
    clf = RandomForestClassifier(random_state=0)
    
    clf.fit(X_train, y_train)
    
    y_pred = clf.predict(X_test)

    rf_y_pred += np.array(y_pred).tolist()

    scores['test_precision'].append(precision_score(y_test, y_pred))
    scores['test_recall'].append(recall_score(y_test, y_pred))
    scores['f1_score'].append(f1_score(y_test, y_pred))

    yt = np.array(y_test)
    yp = np.array(y_pred)

    to_labels = yt.astype(int).tolist()
    labels = labels + to_labels

    scores['tp'].append(np.count_nonzero(yt & yp))
    scores['tn'].append(np.count_nonzero(np.logical_not(yt) & np.logical_not(yp)))
    scores['fp'].append(np.count_nonzero(np.logical_not(yt) & yp))
    scores['fn'].append(np.count_nonzero(yt & np.logical_not(yp)))

    y_prob += [x[1] for x in clf.predict_proba(X_test)]

    rf_models.append(clf)

print({k: mean(v) for k, v in scores.items()})

{'test_precision': 0.9030918765344335, 'test_recall': 0.93076, 'f1_score': 0.9167086101343502, 'tp': 4653.8, 'tn': 9500.6, 'fp': 499.4, 'fn': 346.2}


# 3. Load the best DF models

In [6]:
dl_models = []

for k in range(5):
    save_file = os.path.join("../saves", f"{run_name}_k{k}_BEST")

    json_file = os.path.join("configs", str(run_name) + ".json")
    DATASET_CONFIG, TRAIN_CONFIG, MODEL_KWARGS = load_json_config(config_file)

    _, model, _, _, _, _, _ = load_data(save_file, TRAIN_CONFIG, MODEL_KWARGS)
    dl_models.append(model)
    
print(dl_models)

Loaded run successfully from ../saves/gru_soundex_k0_BEST
Loaded run successfully from ../saves/gru_soundex_k1_BEST
Loaded run successfully from ../saves/gru_soundex_k2_BEST
Loaded run successfully from ../saves/gru_soundex_k3_BEST
Loaded run successfully from ../saves/gru_soundex_k4_BEST
[Siamese(
  (A_function): GRU(36, 25, num_layers=8, batch_first=True, bidirectional=True)
  (bidirectional_linear): Linear(in_features=3050, out_features=25, bias=True)
), Siamese(
  (A_function): GRU(36, 25, num_layers=8, batch_first=True, bidirectional=True)
  (bidirectional_linear): Linear(in_features=3050, out_features=25, bias=True)
), Siamese(
  (A_function): GRU(36, 25, num_layers=8, batch_first=True, bidirectional=True)
  (bidirectional_linear): Linear(in_features=3050, out_features=25, bias=True)
), Siamese(
  (A_function): GRU(36, 25, num_layers=8, batch_first=True, bidirectional=True)
  (bidirectional_linear): Linear(in_features=3050, out_features=25, bias=True)
), Siamese(
  (A_function): 

# 4. Get the Results set

In [13]:
def get_1800s_ln_results_set(path = "../data/1800s_last_name_pairs.tsv"):
    df = pd.read_csv(path, delimiter = "\t", names = ["name0", "name1"])
    df["gt_label"] = 1
    
    return df

def get_1800s_fn_results_set(path = "../data/1800s_first_name_pairs.tsv"):
    df = pd.read_csv(path, delimiter = "\t", names = ["name0", "name1"])
    df["gt_label"] = 1
    df = df.dropna(axis = 0)
    
    return df
    
def get_Normal_results_set(path = "../data/gru_metaphone"):
    folds = []

    for i in range(5):
        test_csv = os.path.join(trained_folder, f"test_k{i}.csv")
        df = pd.read_csv(test_csv, usecols=["name1", "name2", "label"])
        df = df.astype({"label": bool, "name1": str, "name2": str})
        df = df.rename(columns={"name1": "name0", "name2": "name1", "label": "gt_label"})
        df["gt_label"] = df["gt_label"].astype(int)


        folds.append(df)

    df = pd.concat(folds, axis = 0)
    
    return df

directory = {"1800s_ln": get_1800s_ln_results_set, "1800s_fn": get_1800s_fn_results_set, "Normal": get_Normal_results_set}
function = directory[result_set]
results_set = function()
results_set

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 25)

# 5. Run models on result set 

Creates DL classification (score), DL classification (label), and RF classification

In [11]:
results_dir = results_set.to_dict('records')
results_dir_out = list()

for index, pair in enumerate(results_dir):
    # Deep Learning:
    n1_emb = str2emb(pair['name0']).unsqueeze(0)
    n2_emb = str2emb(pair['name1']).unsqueeze(0)

    score_list = []
    label_list = []

    for model, threshold in zip(dl_models, DL_thrsholds):
        #print(n1_emb, n2_emb)
        score, (_, _) = model(n1_emb, n2_emb)
        score = score.item()
        score_list.append(score)

        label = 1 if score > threshold else 0

        label_list.append(label)

    DL_classification_score = sum(score_list) / 5
    DL_classification_label = 1 if label_list.count(1) >= 3 else 0

    # Random Forest:
    pair_rf_input = {"name_a": pair["name0"], "name_b": pair["name1"], "label": pair["gt_label"]}
    input_df = pd.DataFrame(pair_rf_input, index = [0])
    rf_features = create_RF_features(input_df)
    X_test = rf_features.drop(columns=['name_a', 'name_b', 'label'])

    label_list = []

    for model in rf_models:
        label = model.predict(X_test)
        label_list.append(label)

    RF_classification_label = 1 if label_list.count(True) >= 3 else 0

    out = {"name0": pair['name0'], "name1": pair['name1'], "gt_label": pair['gt_label'], "DL_classification_score": DL_classification_score, \
           "DL_classification_label": DL_classification_label, "RF_classification_label": RF_classification_label}
    results_dir_out.append(out)

    if index % 100 == 0:
        print(index, out)

results_set = pd.DataFrame(results_dir_out)
results_set

0 {'name0': 'dessell', 'name1': 'pessall', 'gt_label': False, 'DL_classification_score': 0.4728097140789032, 'DL_classification_label': 0, 'RF_classification_label': 0}
100 {'name0': 'uselman', 'name1': 'usselman', 'gt_label': True, 'DL_classification_score': 0.9599664688110352, 'DL_classification_label': 1, 'RF_classification_label': 1}
200 {'name0': 'humphry', 'name1': 'heaslet', 'gt_label': False, 'DL_classification_score': 0.5513315260410309, 'DL_classification_label': 0, 'RF_classification_label': 0}
300 {'name0': 'hetrick', 'name1': 'hetzke', 'gt_label': False, 'DL_classification_score': 0.5893280982971192, 'DL_classification_label': 0, 'RF_classification_label': 0}
400 {'name0': 'schaurer', 'name1': 'scheuren', 'gt_label': False, 'DL_classification_score': 0.4880423963069916, 'DL_classification_label': 0, 'RF_classification_label': 0}
500 {'name0': 'bardson', 'name1': 'maryson', 'gt_label': False, 'DL_classification_score': 0.43894789218902586, 'DL_classification_label': 0, 'RF_

KeyboardInterrupt: 

# 6. Create additional data for the result set
DL Bucket, RF bucket, concatenated bucket

In [None]:
def calculate_bucket(gt, pred):
    if gt == 1 and pred == 1:
        return "tp"
    elif gt == 1 and pred == 0:
        return "fn"
    elif gt == 0 and pred == 1:
        return "fp"
    elif gt == 0 and pred == 0:
        return "tn"
    
results_set["DL_bucket"] = [calculate_bucket(x, y) for x, y in results_set[["gt_label", "DL_classification_label"]].itertuples(index=False)]
results_set["RF_bucket"] = [calculate_bucket(x, y) for x, y in results_set[["gt_label", "RF_classification_label"]].itertuples(index=False)]
results_set["concat_bucket"] = [x + "_" + y for x, y in results_set[["DL_bucket", "RF_bucket"]].itertuples(index=False)]

results_set

# 7. Save to .csv

In [None]:
results_set.to_csv(os.path.join(csv_save_path, run_name + "_" + result_set + ".csv"))