In [4]:
#!pip install annoy

# Load model
from model import Siamese
from process import load_data, load_json_config

import os

# Name of the training run; it selects both the JSON config and the per-fold checkpoints.
run_name = "gru_soundex"

# Accumulates one loaded model per fold, in fold order (k = 0..4).
model_list = []

for k in range(5):
    # The config path depends only on the run name, so every fold reads the same file.
    json_file = os.path.join("configs", f"{run_name}.json")
    DATASET_CONFIG, TRAIN_CONFIG, MODEL_KWARGS = load_json_config(json_file)

    # Checkpoint path for this fold.
    save_file = os.path.join("saves", "{}_k{}_BEST".format(run_name, k))

    # load_data yields seven values; only the second one (the model) is kept.
    _, model, _, _, _, _, _ = load_data(save_file, TRAIN_CONFIG, MODEL_KWARGS)
    model_list += [model]

Loaded run successfully from saves/gru_soundex_k0_BEST
Loaded run successfully from saves/gru_soundex_k1_BEST
Loaded run successfully from saves/gru_soundex_k2_BEST
Loaded run successfully from saves/gru_soundex_k3_BEST
Loaded run successfully from saves/gru_soundex_k4_BEST


In [5]:
import pandas as pd
from process import str2emb

# Score every name pair in the dataset with each loaded model, combine the
# per-model results into ensemble verdicts, and write them to a CSV file.

dataset_name = "1800s_last_name_pairs.tsv"
dataset_path = os.path.join("data", dataset_name)

# The file is read as two tab-separated, header-less columns.
# dtype=str + keep_default_na=False keep every field as the literal text in the
# file: with pandas' default NA handling, surnames such as "nan", "null" or "NA"
# would be turned into float NaN before reaching str2emb.
dataset = pd.read_csv(
    dataset_path,
    sep="\t",
    names=["name1", "name2"],
    dtype=str,
    keep_default_na=False,
)
dataset = dataset.to_dict("records")

# One decision threshold per model, paired with model_list by position.
thresholds = [0.778285384,0.79654932,0.794194341,0.783809662,0.772859931]

# zip() below would silently drop unmatched models/thresholds, which would skew
# every average and vote, so fail loudly instead.
n_models = len(model_list)
if n_models == 0 or n_models != len(thresholds):
    raise ValueError(
        f"Expected one threshold per model, got {n_models} model(s) "
        f"and {len(thresholds)} threshold(s)"
    )

out = []

# NOTE(review): if the models are torch modules, wrapping this loop in
# torch.no_grad() would avoid building autograd graphs during scoring — confirm.
for data in dataset:
    # unsqueeze(0) adds a leading dimension — presumably the batch axis; confirm
    # against str2emb / the model's forward signature.
    n1_emb = str2emb(data['name1']).unsqueeze(0)
    n2_emb = str2emb(data['name2']).unsqueeze(0)

    score_list = []
    label_list = []

    for model, threshold in zip(model_list, thresholds):
        # The model returns (score, (x, y)); only the score is used here.
        score, (_, _) = model(n1_emb, n2_emb)
        score = score.item()

        score_list.append(score)

        # Per-model decision: positive when the score reaches that model's threshold.
        label = 1 if score >= threshold else 0

        label_list.append(label)

    # Strict majority vote across the models (3 of 5 for the current ensemble);
    # a tie in an even-sized ensemble counts as 0.
    label_verdict = 1 if 2 * sum(label_list) > n_models else 0

    # Per-model margin over its own threshold, and the mean of those margins.
    score_adj_verdict_list = [score - threshold for score, threshold in zip(score_list, thresholds)]
    score_adj_verdict = sum(score_adj_verdict_list) / n_models

    # Mean raw score across the models.
    score_verdict = sum(score_list) / n_models

    out.append({"name_a": data['name1'], "name_b": data['name2'], "score": score_verdict, "score_adjusted": score_adj_verdict, "pred": label_verdict, "score_plus_minus": score_adj_verdict_list, "label_list": label_list})

# NOTE(review): this prints every record; for large datasets consider showing
# only dataframe.head() instead.
print(out)

dataframe = pd.DataFrame(out)

# to_csv does not create missing parent directories, so make sure the
# per-run results folder exists before writing.
results_dir = os.path.join("results", run_name)
os.makedirs(results_dir, exist_ok=True)
dataframe.to_csv(os.path.join(results_dir, "DL_results_full.csv"))


[{'name_a': 'bee', 'name_b': 'bee', 'score': 1.0, 'score_adjusted': 0.2148602724, 'label': 1, 'score_plus_minus': [0.22171461599999998, 0.20345068, 0.20580565900000003, 0.21619033799999998, 0.22714006900000006], 'label_list': [1, 1, 1, 1, 1]}, {'name_a': 'dickson', 'name_b': 'dickson', 'score': 1.0, 'score_adjusted': 0.2148602724, 'label': 1, 'score_plus_minus': [0.22171461599999998, 0.20345068, 0.20580565900000003, 0.21619033799999998, 0.22714006900000006], 'label_list': [1, 1, 1, 1, 1]}, {'name_a': 'smith', 'name_b': 'smith', 'score': 1.0, 'score_adjusted': 0.2148602724, 'label': 1, 'score_plus_minus': [0.22171461599999998, 0.20345068, 0.20580565900000003, 0.21619033799999998, 0.22714006900000006], 'label_list': [1, 1, 1, 1, 1]}, {'name_a': 'cavan', 'name_b': 'cavan', 'score': 1.0, 'score_adjusted': 0.2148602724, 'label': 1, 'score_plus_minus': [0.22171461599999998, 0.20345068, 0.20580565900000003, 0.21619033799999998, 0.22714006900000006], 'label_list': [1, 1, 1, 1, 1]}, {'name_a': 