In [None]:
# Code for computing PCS adapted from bootphon/measuring-regularities-in-word-embeddings
from util import *
import plotly.graph_objects as go
from sklearn.metrics import auc
import os

In [None]:
# Notebook-wide settings; later cells read (and extend) this dict.
args = {
    # Directory of embedding files, or a path handed straight to load_model.
    "embedding_dir": "../embeddings/",
    # Root directory of the BATS 3.0 analogy dataset.
    "analogy_dir": "../dataset/BATS_3.0/",
    # Forwarded as `nb_perms` to metrics_from_model and compute_roc_curves.
    "nb_perms": 50,
}

### Load embedding models and BATS dataset

In [None]:
# `models` holds (display_name, model) tuples: every later cell reads
# model[0] as the label and model[1] as the embedding object.
models = []
scores = {}


def _model_name(filename):
    """Turn an embedding file name into a display name.

    The last extension is dropped and the remaining dots become spaces
    (e.g. "a.b.txt" -> "a b"). A name without any dot is returned unchanged
    so the label is never empty.
    """
    stem = " ".join(filename.split(".")[:-1])
    return stem or filename


embedding_path = args["embedding_dir"]

if os.path.isdir(embedding_path):
    # os.listdir returns entries in arbitrary order; sorting keeps the model
    # order (and therefore legend order in the plots) stable across runs.
    for filename in sorted(os.listdir(embedding_path)):
        if ".txt" in filename:
            name = _model_name(filename)
            models.append((name, load_model(embedding_dir = os.path.join(embedding_path, filename))))
else:
    # Not a directory: the path is passed to load_model as-is. It is still
    # stored as a (name, model) tuple so that downstream model[0] / model[1]
    # indexing works exactly as in the directory case.
    # normpath strips a trailing separator so basename is not empty.
    name = _model_name(os.path.basename(os.path.normpath(embedding_path)))
    models.append((name, load_model(embedding_path)))

In [None]:
# names[i] is used further down as the label (and output file name) for the
# word pairs in pairs_sets[i].
names, pairs_sets = bats_names_pairs(
    dir=args["analogy_dir"],
)

### Compute PCS

In [None]:
# Output directory for the PCS figures written by the cells below.
args["save_results"] = "../results/pcs/"
# write_image does not create missing directories, so make sure it exists
# before any figure is saved (no-op when it is already there).
os.makedirs(args["save_results"], exist_ok=True)

In [None]:
# Maps model name -> (fpr curves, tpr curves) as returned by compute_roc_curves.
roc = {}

for model_name, embedding in models:
    n_perms = args["nb_perms"]
    sim_metrics, neg_metrics = metrics_from_model(embedding, names, pairs_sets, nb_perms=n_perms)
    fpr_curves, tpr_curves = compute_roc_curves(sim_metrics, neg_metrics, nb_perms=n_perms)

    roc[model_name] = (fpr_curves, tpr_curves)

In [None]:
# For each model, the list of per-category AUC values (same order as `names`).
auc_scores = {model[0]:[] for model in models}

for index, name in enumerate(names):
    # Shared grid for this category: as many points as the shortest FPR curve
    # any model produced for it, so all models are compared at equal resolution.
    n_points = min(len(roc[model][0][index]) for model in roc)
    x = np.linspace(0, 1, n_points)

    fig = go.Figure()
    # Dashed diagonal = chance level.
    fig.add_shape(type='line', line=dict(dash='dash'), x0=0, x1=1, y0=0, y1=1)

    for model in roc:
        # Resample each curve by position along its own length onto the shared grid.
        fpr = np.interp(x, np.linspace(0, 1, len(roc[model][0][index])), roc[model][0][index])
        tpr = np.interp(x, np.linspace(0, 1, len(roc[model][1][index])), roc[model][1][index])
        auc_score = auc(fpr, tpr)
        fig.add_trace(go.Scatter(x=fpr, y=tpr, name=model + f" (AUC = {auc_score:.4f})", mode='lines'))
        auc_scores[model].append(auc_score)

    fig.update_layout(
        title=name,
        xaxis_title='False Positive Rate',
        yaxis_title='True Positive Rate',
        yaxis=dict(scaleanchor="x", scaleratio=1),
        xaxis=dict(constrain='domain'))
    # os.path.join works whether or not save_results ends with a separator.
    fig.write_image(os.path.join(args["save_results"], name + ".pdf"))

In [None]:
import plotly.express as px

# Mean ROC curve per model, appended in the iteration order of `roc`.
fpr_means = []
tpr_means = []

for fpr_curves, tpr_curves in roc.values():
    # Shared grid sized after the shortest FPR curve of this model.
    shortest = min(len(curve) for curve in fpr_curves)
    grid = np.linspace(0, 1, shortest)

    # Bring every curve to the same length before averaging point-wise.
    fpr_resampled = [np.interp(grid, np.linspace(0, 1, len(curve)), curve) for curve in fpr_curves]
    tpr_resampled = [np.interp(grid, np.linspace(0, 1, len(curve)), curve) for curve in tpr_curves]

    fpr_means.append(np.mean(fpr_resampled, axis=0))
    tpr_means.append(np.mean(tpr_resampled, axis=0))

In [None]:
# Overlay the mean ROC curve of every model and report its AUC.
fig = go.Figure()
# Dashed diagonal = chance level.
fig.add_shape(type='line', line=dict(dash='dash'), x0=0, x1=1, y0=0, y1=1)

# fpr_means / tpr_means were appended while iterating `roc`, so pairing them
# with roc's keys guarantees each curve gets the name of the model it belongs to
# (indexing into `models` by position would mislabel curves if names collide).
for model_name, fpr_mean, tpr_mean in zip(roc, fpr_means, tpr_means):
    fig.add_trace(go.Scatter(x=fpr_mean, y=tpr_mean, name=model_name, mode='lines'))
    print("AUC for {}: {}".format(model_name, np.around(auc(fpr_mean, tpr_mean), 4)))

fig.update_layout(
    title="",
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate',
    yaxis=dict(scaleanchor="x", scaleratio=1),
    xaxis=dict(constrain='domain'),
    font={"size" : 16})

# os.path.join works whether or not save_results ends with a separator.
fig.write_image(os.path.join(args["save_results"], "total_roc.pdf"))

In [None]:
# One box per model, summarising its per-category AUC values.
fig = go.Figure()
for model_name, model_aucs in auc_scores.items():
    fig.add_trace(go.Box(y=model_aucs, name=model_name))

fig.update_layout(
    title="",
    xaxis_title='',
    yaxis_title='PCS',
    width=1200,
    height=1200,
)

fig.write_image(args["save_results"] + "PCS_box_plot.pdf")
fig.show()

### Compute MSM

In [None]:
# From here on figures go to the MSM output directory.
args["save_results"] = "../results/msm/"
# write_image does not create missing directories, so make sure it exists
# before any figure is saved (no-op when it is already there).
os.makedirs(args["save_results"], exist_ok=True)

In [None]:
def compute_alignment(vectors):
    """Cosine of every vector with the mean direction of the set.

    Each vector is scaled to unit length, the unit vectors are averaged and
    the average is itself normalised; the result is the dot product of each
    unit vector with that mean direction.

    Args:
        vectors: iterable of equal-length 1-D numeric vectors.

    Returns:
        Tuple with one value per usable vector, in input order. Zero-norm
        vectors have no direction and are skipped (a NaN norm fails the same
        check), so the tuple can be shorter than the input. An empty input,
        or one with no usable vector, gives ``()``.

    Note:
        If the unit vectors cancel out exactly, the mean direction is
        undefined and the returned values are NaN.
    """
    stacked = np.asarray(list(vectors))
    if stacked.size == 0:
        return ()

    norms = np.linalg.norm(stacked, axis=1)
    # Dividing a zero vector by its norm yields NaN, which would then spread
    # through the mean and turn every deviation of the set into NaN.
    usable = norms > 0
    if not usable.any():
        return ()

    unit = stacked[usable] / norms[usable, None]
    vec_mean = unit.mean(axis=0)
    vec_mean = vec_mean / np.linalg.norm(vec_mean)

    return tuple(unit @ vec_mean)

def compute_msm(model, names, pairs_sets):
    """Compute offset alignments for every category of word pairs.

    Args:
        model: embedding object exposing ``index_to_key`` (its vocabulary) and
            word lookup via ``model[word]``.
        names: category labels; ``names[i]`` labels ``pairs_sets[i]``.
        pairs_sets: sequence of collections of word pairs, where ``pair[0]``
            and ``pair[1]`` are the two words.

    Returns:
        Dict mapping each category name to the tuple returned by
        ``compute_alignment`` for that category's offset vectors.
    """
    vocab_set = set(model.index_to_key)
    # Keep only the pairs whose two words are both in the model's vocabulary.
    pairs_sets = [
        [pair for pair in pairs if pair[0] in vocab_set and pair[1] in vocab_set]
        for pairs in pairs_sets
    ]
    name_to_score = {}
    # enumerate() has no length, so pass total explicitly for a real progress bar.
    for index, pair_set in tqdm(enumerate(pairs_sets), total=len(pairs_sets), leave=False):
        # Offset vector of each pair: second word minus first word.
        vectors = [model[word_pair[1]] - model[word_pair[0]] for word_pair in pair_set]

        name_to_score[names[index]] = compute_alignment(vectors)
    return name_to_score

# Maps model name -> {category name -> tuple of deviations}.
msm_scores = {}

for model_name, embedding in models:
    per_category = compute_msm(embedding, names, pairs_sets)
    msm_scores[model_name] = per_category

    # Pool the deviations of all categories before averaging.
    pooled = [item for sublist in per_category.values() for item in sublist]
    print("DAS for {}: {}".format(model_name, np.around(np.mean(pooled), 4)))

# One PDF per category: a box of deviations for every model.
for name in names:
    fig = go.Figure()
    for model_name, per_category in msm_scores.items():
        fig.add_trace(go.Box(y=per_category[name], name=model_name))

    fig.update_layout(
        title=name,
        xaxis_title='',
        yaxis_title='Deviation',
        yaxis_range=[0,1],
        width=1200,
        height=1200,
    )
    fig.write_image(args["save_results"] + name + ".pdf")

# Summary figure: per model, a single box over the deviations of all categories.
fig = go.Figure()
for model_name, per_category in msm_scores.items():
    pooled = [item for sublist in per_category.values() for item in sublist]
    fig.add_trace(go.Box(y=pooled, name=model_name))

fig.update_layout(
    title="Total Deviations",
    xaxis_title='',
    yaxis_title='Deviation',
    width=1200,
    height=1200,
)

fig.write_image(args["save_results"] + "total_deviation.pdf")