In [3]:
#!pip install annoy

# Load model
from model import Siamese
from process import load_data, load_json_config

import os

model_list = []

run_name = "edit_emb20_lay8"
for k in range(5):
    save_file = os.path.join("saves", f"{run_name}_k{k}_BEST")

    json_file = os.path.join("configs", str(run_name) + ".json")
    DATASET_CONFIG, TRAIN_CONFIG, MODEL_KWARGS = load_json_config(json_file)

    _, model, _, _, _, _, _ = load_data(save_file, TRAIN_CONFIG, MODEL_KWARGS)
    model_list.append(model)

Loaded run successfully from saves/edit_emb20_lay8_k0_BEST
Loaded run successfully from saves/edit_emb20_lay8_k1_BEST
Loaded run successfully from saves/edit_emb20_lay8_k2_BEST
Loaded run successfully from saves/edit_emb20_lay8_k3_BEST
Loaded run successfully from saves/edit_emb20_lay8_k4_BEST


In [4]:
import pandas as pd
from process import str2emb

dataset_name = "1800s_first_name_pairs.tsv"


dataset_path = os.path.join("data", dataset_name)

dataset = pd.read_csv(dataset_path, sep = "\t", names = ["name1", "name2"])
dataset = dataset.to_dict("records")

thresholds = [0.7362139821052551, 0.8045336008071899, 0.791245698928833, 0.7750964164733887, 0.7543946504592896]

out = []

for i, data in enumerate(dataset):
    n1_emb = str2emb(data['name1']).unsqueeze(0)
    n2_emb = str2emb(data['name2']).unsqueeze(0)

    score_list = []
    label_list = []

    for model, threshold in zip(model_list, thresholds):
        score, (_, _) = model(n1_emb, n2_emb)
        score = score.item()

        score_list.append(score)
        
        label = 1 if score >= threshold else 0

        label_list.append(label)

    label_verdict = 1 if sum(label_list) >= 3 else 0

    score_adj_verdict_list = [score - threshold for score, threshold in zip(score_list, thresholds)]
    score_adj_verdict = sum(score_adj_verdict_list) / 5

    score_verdict = sum(score_list) / 5

    out.append({"name_a": data['name1'], "name_b": data['name2'], "score": score_verdict, "score_adjusted": score_adj_verdict, "label": label_verdict, "score_plus_minus": score_adj_verdict_list, "label_list": label_list})

print(out)

dataframe = pd.DataFrame(out)
dataframe.to_csv(os.path.join("results", run_name, "DL_results_full.csv"))


[{'name_a': 'david', 'name_b': 'david', 'score': 1.0, 'score_adjusted': 0.22770313024520875, 'label': 1, 'tp': 1, 'tn': 0, 'fn': 0, 'score_plus_minus': [0.2637860178947449, 0.19546639919281006, 0.208754301071167, 0.22490358352661133, 0.24560534954071045], 'label_list': [1, 1, 1, 1, 1]}, {'name_a': 'isabella', 'name_b': 'isabella', 'score': 1.0, 'score_adjusted': 0.22770313024520875, 'label': 1, 'tp': 1, 'tn': 0, 'fn': 0, 'score_plus_minus': [0.2637860178947449, 0.19546639919281006, 0.208754301071167, 0.22490358352661133, 0.24560534954071045], 'label_list': [1, 1, 1, 1, 1]}, {'name_a': 'jessie', 'name_b': 'jessie', 'score': 1.0, 'score_adjusted': 0.22770313024520875, 'label': 1, 'tp': 1, 'tn': 0, 'fn': 0, 'score_plus_minus': [0.2637860178947449, 0.19546639919281006, 0.208754301071167, 0.22490358352661133, 0.24560534954071045], 'label_list': [1, 1, 1, 1, 1]}, {'name_a': 'janet', 'name_b': 'janet', 'score': 1.0, 'score_adjusted': 0.22770313024520875, 'label': 1, 'tp': 1, 'tn': 0, 'fn': 0,