# Playground notebook

## HuBERT test

In [None]:
from transformers import AutoProcessor, HubertModel
from datasets import load_dataset

In [None]:
model_name = "facebook/hubert-large-ls960-ft"

processor = AutoProcessor.from_pretrained(model_name)
model = HubertModel.from_pretrained(model_name)

In [None]:
ds = load_dataset("C:\\Users\\mj115gl\\work_dir\\thesis\\audio-semantics\\data\\LibriSpeech\\dev-clean")

In [None]:
input_values = processor(ds["train"][2]["audio"]["array"], return_tensors="pt").input_values  # Batch size 1
logits = model(input_values).logits
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.decode(predicted_ids[0])
transcription

In [None]:
ds["train"][1]["audio"]["array"].shape

In [None]:
input_values.shape

In [None]:
logits.shape

In [None]:
model

In [None]:
output = model(input_values)

In [None]:
output.last_hidden_state.shape

## Baseline W2V

In [None]:
from gensim.models.word2vec import Word2Vec, LineSentence
# from gensim.test.utils import datapath

In [None]:
line_fp = open("data/gtbrg_i.txt", "r", encoding="utf-16")
sentences = LineSentence(line_fp)
# line_fp.close()

In [None]:
line_fp.seek(0)
line_fp.readline()
# line_fp.close()

In [None]:
vector_size = 100
window = 5
w2v_model_tag = "TEST"
W2V_MODEL_PATH = f"models/w2v_vs{vector_size}_w{window}_{w2v_model_tag}.model"

In [None]:
w2v_model = Word2Vec(
    sentences,
    window=window,
    vector_size=vector_size,
    min_count=0,
    workers=4,
    epochs=10
)

In [None]:
w2v_model.save(W2V_MODEL_PATH)

In [None]:
list(w2v_model.wv.key_to_index.keys())[:15]

In [None]:
w2v_model.wv.most_similar("man")

In [None]:
w2v_model.wv.most_similar(positive=["king", "woman"], negative=["man"])

In [None]:
from scipy.stats import spearmanr
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
wordsim_scores = []

with open("data/wordsim353_sim_rel/wordsim_similarity_goldstandard.txt") as wordsim_fp:
    for line in wordsim_fp.readlines():
        scores = line.split("\t")
        w1, w2 = scores[0], scores[1]
        gold_score = float(scores[2])
        wordsim_scores.append([w1, w2, gold_score])

In [None]:
gold_vocab = []
gold_all = []
preds_vocab = []
preds_all = []
tested = 0
oov = 0

for pairs in wordsim_scores:
    w1, w2 = pairs[0].lower(), pairs[1].lower()
    
    try:
        pred = w2v_model.wv.similarity(w1, w2)
        preds_vocab.append(pred)
        gold_vocab.append(pairs[2])
        tested += 1
    
    except KeyError:
        # if w1 not in w2v_model.wv.vocab.keys():
        #     w1_units = sp.EncodeAsPieces(w1)[1:]
        #     w1_vectors = np.array([w2v_model.wv[unit] for unit in w1_units])
        #     w1_vector = w1_vectors.mean(axis=0)
        # else:
        #     w1_vector = w2v_model.wv[w1]
        # if w2 not in w2v_model.wv.vocab.keys():
        #     w2_units = sp.EncodeAsPieces(w2)[1:]
        #     w2_vectors = np.array([w2v_model.wv[unit] for unit in w2_units])
        #     w2_vector = w2_vectors.mean(axis=0)
        # else:
        #     w2_vector = w2v_model.wv[w2]

        # pred = cosine_similarity(w1_vector.reshape(1, -1), w2_vector.reshape(1, -1))
        oov += 1
    
    preds_all.append(pred)
    gold_all.append(pairs[2])
        

print(spearmanr(preds_vocab, gold_vocab)[0], f", tested {tested}/{len(wordsim_scores)} pairs")
print(spearmanr(preds_all, gold_all)[0], f", including OOV")

In [None]:
simlex_scores = []

with open("data/SimLex-999/SimLex-999.txt") as simlex_fp:
    for line in simlex_fp.readlines()[1:]:
        scores = line.split("\t")
        w1, w2 = scores[0], scores[1]
        gold_score = float(scores[3])
        simlex_scores.append([w1, w2, gold_score])

In [None]:
gold_vocab = []
gold_all = []
preds_vocab = []
preds_all = []
tested = 0
oov = 0

for pairs in simlex_scores:
    w1, w2 = pairs[0].lower(), pairs[1].lower()
    
    try:
        pred = w2v_model.wv.similarity(w1, w2)
        preds_vocab.append(pred)
        gold_vocab.append(pairs[2])
        tested += 1
    
    except KeyError:
        # if w1 not in w2v_model.wv.vocab.keys():
        #     w1_units = sp.EncodeAsPieces(w1)[1:]
        #     w1_vectors = np.array([w2v_model.wv[unit] for unit in w1_units])
        #     w1_vector = w1_vectors.mean(axis=0)
        # else:
        #     w1_vector = w2v_model.wv[w1]
        # if w2 not in w2v_model.wv.vocab.keys():
        #     w2_units = sp.EncodeAsPieces(w2)[1:]
        #     w2_vectors = np.array([w2v_model.wv[unit] for unit in w2_units])
        #     w2_vector = w2_vectors.mean(axis=0)
        # else:
        #     w2_vector = w2v_model.wv[w2]

        # pred = cosine_similarity(w1_vector.reshape(1, -1), w2_vector.reshape(1, -1))
        oov += 1
    
    preds_all.append(pred)
    gold_all.append(pairs[2])
        

print(spearmanr(preds_vocab, gold_vocab)[0], f", tested {tested}/{len(simlex_scores)} pairs")
print(spearmanr(preds_all, gold_all)[0], f", including OOV")

In [None]:
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

In [None]:
n_clusters = []
words = list(w2v_model.wv.key_to_index.keys())

tqdm_iterator = tqdm(range(0, 99, 1))

for threshold in tqdm_iterator:
    threshold = threshold / 100
    word_to_cluster = dict()  # Stores map from word to cluster
    cluster_to_words = dict()  # Stores map from cluster to words
    cluster_idx = 0  # Counter

    for word in words:
        tqdm_iterator.set_postfix({"Word": word})
        # Check if word has already been clustered
        if word not in word_to_cluster.keys():
            # Create new cluster
            cluster_idx += 1
            # cluster_key = chr(0x0020 + cluster_idx)
            cluster_key = cluster_idx

            # Add new word to cluster
            tqdm_iterator.set_postfix({"status" : "adding new word to cluster"})
            cluster_to_words[cluster_key] = [word]
            word_to_cluster[word] = cluster_key
            
            # Add all similar words
            tqdm_iterator.set_postfix({"status" : "adding all words"})
            for similar_word, score in w2v_model.wv.most_similar(word, topn=50):
                tqdm_iterator.set_postfix({"status" : "checked 50 words"})
                if score > threshold:
                    cluster_to_words[cluster_key].append(similar_word)
                    word_to_cluster[similar_word] = cluster_key
    
    n_clusters.append(len(cluster_to_words))

In [None]:
len(words)

In [None]:
plt.plot(n_clusters)

## Convert old

In [None]:
from string import ascii_letters

In [None]:
letters = {
    i: key for i, key in enumerate(ascii_letters)
}

In [None]:
word_to_key = {}
key_to_word = {}

with open("data/quantized/dev-gold.csv", "r") as key_file:
    for line in key_file.readlines()[1:]:
        dataset, key, _, word = line.strip().split(",")
        if word not in word_to_key:
            word_to_key[word] = {
                'librispeech': [],
                'synthetic': []
            }
        word_to_key[word][dataset].append(key)
        key_to_word[key] = word

In [None]:
utterances = {}

for dataset in ["librispeech", "synthetic"]:
    with open(f"data/quantized/semantic/dev/{dataset}/quantized_outputs.txt", "r") as utterance_file:
        for line in utterance_file.readlines():
            key, seq = line.strip().split("\t")
            utterance = seq.split(",")[1:]

            key = "ls_" + key_to_word[key] if dataset == "librispeech" else "sy_" + key_to_word[key]
            if key not in utterances:
                utterances[key] = []

            utterances[key].append(
                "".join(
                    [letters[int(v)] for i, v in enumerate(utterance)]
                )
            )

In [None]:
with open("data/level_wise/level0/utterances_original.txt", "w+", encoding="utf-8") as ufp:
    for word in utterances:
        for utterance in utterances[word]:
            ufp.write(word + "\t" + utterance + "\n")

In [None]:
sim_pairs = []
rel_pairs = []

with open("data/quantized/dev-pairs.csv", "r") as pairs_file:
    for line in pairs_file.readlines()[1:]:
        dataset, _, w1, w2, sim, rel = line.strip().split(",")
        if sim:
            sim_pairs.append((dataset, w1, w2, float(sim)))
        if rel:
            rel_pairs.append((dataset, w1, w2, float(rel)))

with open("data/level_wise/level0/pairs.txt", "w+", encoding="utf-8") as pairs_fp:
    for pair in sim_pairs:
        dataset, w1, w2, score = pair
        pairs_fp.write(
            ("ls_" + w1 if dataset == "librispeech" else "sy_" + w1) + "," +
            ("ls_" + w2 if dataset == "librispeech" else "sy_" + w2) + "," +
            str(score) + "," + "\n"
        )
    for pair in rel_pairs:
        dataset, w1, w2, score = pair
        pairs_fp.write(
            ("ls_" + w1 if dataset == "librispeech" else "sy_" + w1) + "," +
            ("ls_" + w2 if dataset == "librispeech" else "sy_" + w2) + "," +
            "," + str(score) + "\n"
        )

In [None]:
str("a")

## FastText

In [None]:
import sentencepiece as spm

In [None]:
sp_model = spm.SentencePieceProcessor()

sp_model.Load("models/original_60k_250x1/level1/unigram_vs60000_lw.model")

In [None]:
original_corpus = []

with open("data/level_wise/level0/dev_corpus_original.txt", "r", encoding="utf-8") as ocfp:
    for line in ocfp.readlines():
        original_corpus.append(line.strip())

In [None]:
with open("data/original_60k_250x1/level1/dev_corpus_ft.txt", "w+", encoding="utf-8") as ncfp:
    for line in original_corpus:
        pieces = list(
            filter(
                lambda x: x != "▁",
                sp_model.EncodeAsPieces(line)
            )
        )

        units = [piece.replace("▁", "") for piece in pieces]
        
        ncfp.write(" ".join(units) + "\n")

In [None]:
import fasttext

In [None]:
ft_model = fasttext.train_unsupervised(
    "data/original_60k_250x1/level1/corpus_ft.txt",
    "cbow",
    dim=250,
    thread=4,
    epoch=7
)

In [None]:
type(ft_model)

In [None]:
ft_model.save_model("models/fasttext_cbow")

In [None]:
# ft_model = fasttext.load_model("models/fasttext_cbow")

In [None]:
from levelwise_model.test_bench import LSTestBench
from levelwise_model.utterances import WordToUtteranceMapping

In [None]:
utterance_mapping = WordToUtteranceMapping(map_file="data/level_wise/level0/utterances_original.txt")

In [None]:
test_bench = LSTestBench(scores_file="data/level_wise/level0/pairs.txt")

In [None]:
test_bench.ft_score_and_save(ft_model=ft_model, utterances=utterance_mapping, results_file="results/ft_cbow")

## DistilBERT

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
model = SentenceTransformer('C:\\Users\\mj115gl\\work_dir\\thesis\\audio-semantics\\models\\db_final')

In [None]:
import sentencepiece as spm

In [None]:
sp_model = spm.SentencePieceProcessor()
sp_model.Load("models/comp_60k_250x1/level1/unigram_vs60000_lw.model")

In [None]:
input_text = "PlQQQQoobbbbbIkkQorrrAAAfflllQQooVVVpppjjUQzzzzOOOOOOOOOOO"

pieces = list(
    filter(
        lambda x: x != "▁",
        sp_model.EncodeAsPieces(input_text)
    )
)

units = [piece.replace("▁", "") for piece in pieces]
print(" ".join(units))

In [None]:
embeddings

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
similarities = cosine_similarity(embeddings, dense_output=True)
similarities

In [None]:
np.mean(similarities[np.where(np.tril(similarities, -1))])

In [None]:
from levelwise_model.utterances import WordToUtteranceMapping

In [None]:
utterances = WordToUtteranceMapping("data/level_wise/level0/utterances.txt")

In [None]:
updated_utterances = {}

for word in utterances.utterances:
    updated_utterances[word] = []
    for utt in utterances.utterances[word]:
        pieces = list(
            filter(
                lambda x: x != "▁",
                sp_model.EncodeAsPieces(utt)
            )
        )

        units = [piece.replace("▁", "") for piece in pieces]
        updated_utterances[word].append(
            " ".join(units)
        )

In [None]:
import numpy as np

In [None]:
model.encode("test").shape

In [None]:
# Test positive
ls_sims_pos = []
sy_sims_pos = []
tested = 0

for word in updated_utterances:
    if len(updated_utterances[word]) > 1:
        tested += 1
        utterances_list = [
            " ".join(
                [
                    piece.replace("▁", "")
                    for piece in list(
                        filter(
                            lambda x: x != "▁",
                            sp_model.EncodeAsPieces(utterance)
                        )
                    )
                ]
            ) for utterance in updated_utterances[word]
        ]

        embeddings = model.encode(utterances_list)
        similarities = cosine_similarity(embeddings)

        if word.startswith("ls_"):
            ls_sims_pos.append(np.mean(similarities[np.where(np.tril(similarities, -1))]))
        if word.startswith("sy_"):
            sy_sims_pos.append(np.mean(similarities[np.where(np.tril(similarities, -1))]))

print(f"Avg positive score (LS) = {sum(ls_sims_pos)/len(ls_sims_pos)} ({len(ls_sims_pos)} words).")
print(f"Avg positive score (SY) = {sum(sy_sims_pos)/len(sy_sims_pos)} ({len(sy_sims_pos)} words).")

In [None]:
n_utts = []

for word in updated_utterances:
    n_utts.append(len(updated_utterances[word]))

print(sum(n_utts)/len(n_utts))

In [None]:
# Test negative
ls_sims_neg = []
sy_sims_neg = []
tested = 0
n_negative_samples = 5

ls_word_list = set([word for word in updated_utterances.keys() if word.startswith("ls_")])
sy_word_list = set([word for word in updated_utterances.keys() if word.startswith("sy_")])

for word in updated_utterances:
    tested += 1

    negative_samples = []
    if word.startswith("ls_"):
        use_list = ls_word_list
    else:
        use_list = sy_word_list
    sample = np.random.choice(list(use_list - {word}), size=(n_negative_samples))
    for s in sample:
        negative_samples.append(
            np.random.choice(updated_utterances[s])
        )

    utterances_list = [
        " ".join(
            [
                piece.replace("▁", "")
                for piece in list(
                    filter(
                        lambda x: x != "▁",
                        sp_model.EncodeAsPieces(utterance)
                    )
                )
            ]
        ) for utterance in negative_samples
    ]
    
    embeddings = model.encode(utterances_list)
    similarities = cosine_similarity(embeddings)

    if word.startswith("ls_"):
        ls_sims_neg.append(np.mean(similarities[np.where(np.tril(similarities, -1))]))
    if word.startswith("sy_"):
        sy_sims_neg.append(np.mean(similarities[np.where(np.tril(similarities, -1))]))

print(f"Avg negative score (LS) = {sum(ls_sims_neg)/len(ls_sims_neg)} ({len(ls_sims_neg)} words).")
print(f"Avg negative score (SY) = {sum(sy_sims_neg)/len(sy_sims_neg)} ({len(sy_sims_neg)} words).")

In [None]:
sum(sy_sims)/len(sy_sims)

### ABX Test

In [51]:
def word_vec_fn(utt):
    pieces = list(
        filter(
            lambda x: x != "▁",
            sp_model.EncodeAsPieces(utt)
        )
    )

    units = [piece.replace("▁", "") for piece in pieces]
    return model.encode(" ".join(units))

In [None]:
from string import ascii_letters

In [None]:
preds = 0
total = 0
use_noise_for_x = True

def get_correct_word(pair, word):
    if pair[0] == word:
        return pair[1]
    return pair[1]

for word_a in updated_utterances:
    # Sort words by similarity
    similar_words = sorted(
        # Get only pairs containing word A
        filter(
            lambda x: x[0] == word_a or x[1] == word_a,
            rel_pairs
        ),
        key=lambda x: x[2],
        reverse=True
    )

    similar_words = list(
        map(
            lambda x: get_correct_word(x, word_a),
            similar_words
        )
    )

    for utt_a in updated_utterances[word_a]:
        word_b = similar_words[0]
        utt_b = np.random.choice(
            updated_utterances[word_b]
        )

        if not use_noise_for_x:
            word_x = similar_words[-1]
            utt_x = np.random.choice(
                updated_utterances[word_x]
            )

        else:
            utt_x = np.random.choice(
                list(ascii_letters),
                len(utt_a),
                replace=True
            )
            utt_x = "".join(utt_x)

        v_a = word_vec_fn(utt_a).reshape(1, -1)
        v_b = word_vec_fn(utt_b).reshape(1, -1)
        v_x = word_vec_fn(utt_x).reshape(1, -1)

        if cosine_similarity(v_a, v_b) > cosine_similarity(v_a, v_x):
            preds += 1
        total += 1

print({"ABX Result": preds/total})

In [52]:
preds = 0
total = 0
use_noise_for_x = False

def get_correct_word(pair, word):
    if pair[0] == word:
        return pair[1]
    return pair[1]

with open("results/abx_test_db.txt", "w+") as abx_results_file:
    abx_results_file.write("A,B,X,sim(AB),sim(AX),chosen\n")
    for word_a in updated_utterances:
        # Sort words by similarity
        similar_words = sorted(
            # Get only pairs containing word A
            filter(
                lambda x: x[0] == word_a or x[1] == word_a,
                rel_pairs
            ),
            key=lambda x: x[2],
            reverse=True
        )

        similar_words = list(
            map(
                lambda x: get_correct_word(x, word_a),
                similar_words
            )
        )

        for utt_a in updated_utterances[word_a]:
            word_b = similar_words[0]
            utt_b = np.random.choice(
                updated_utterances[word_b]
            )

            if not use_noise_for_x:
                word_x = similar_words[-1]
                utt_x = np.random.choice(
                    updated_utterances[word_x]
                )

            else:
                word_x = "noise"
                utt_x = np.random.choice(
                    list(ascii_letters),
                    len(utt_a),
                    replace=True
                )
                utt_x = "".join(utt_x)

            if len(set([word_a, word_b, word_x])) == 3:
                v_a = word_vec_fn(utt_a).reshape(1, -1)
                v_b = word_vec_fn(utt_b).reshape(1, -1)
                v_x = word_vec_fn(utt_x).reshape(1, -1)

                sim_ab = cosine_similarity(v_a, v_b)
                sim_ax = cosine_similarity(v_a, v_x)

                chosen = "X"
                if sim_ab > sim_ax:
                    chosen = "B"
                    preds += 1
                total += 1

                abx_results_file.write(",".join([word_a, word_b, word_x, str(sim_ab), str(sim_ax), chosen]) + "\n")

print({"ABX Result": preds/total})

{'ABX Result': 0.5062111801242236}


In [None]:
len(set([word_a, word_b, word_x]))

In [None]:
sim_pairs = []
rel_pairs = []

with open("data/level_wise/level0/pairs.txt", "r") as pairs_file:
    for line in pairs_file.readlines()[1:]:
        w1, w2, sim, rel = line.strip().split(",")
        if sim:
            sim_pairs.append((w1, w2, float(sim)))
        if rel:
            rel_pairs.append((w1, w2, float(rel)))

In [None]:
def get_model_vectors(word):
    return model.encode(updated_utterances[word])

In [None]:
from scipy.stats import pearsonr

In [None]:
scores = {
    test_set: {
        method: [] for method in ["min", "max", "avg", "all"]
    } for test_set in ["librispeech", "synthetic"]
}
gold_standard = {
    "librispeech": [],
    "synthetic": []
}
trials = 0
errors = 0

for pair in rel_pairs:
    try:
        w1, w2, rel = pair

        test_set = "librispeech" \
            if w1.startswith("ls_") \
            else "synthetic"
        w1.replace("ls_", "").replace("sy_", "")
        w2.replace("ls_", "").replace("sy_", "")

        w1_vectors = get_model_vectors(
            w1
        )
        w2_vectors = get_model_vectors(
            w2
        )

        similarities = [
            cosine_similarity(i.reshape(1, -1), j.reshape(1, -1))
            for i in w1_vectors
            for j in w2_vectors
        ]

        scores[test_set]["min"].append(np.min(similarities))
        scores[test_set]["avg"].append(np.mean(similarities))
        scores[test_set]["max"].append(np.max(similarities))

        gold_standard[test_set].append(rel)
    except Exception as e:
        print(e)
        errors += 1
    trials += 1

print({
    'score': {
        test_set: {
            var: pearsonr(
                scores[test_set][var],
                gold_standard[test_set]
            )[0] * 100
            for var in ['min', 'avg', 'max']
        }
        for test_set in ['librispeech', 'synthetic']
    },
    'errors': errors,
    'trials': trials
})

In [44]:
from gensim.models.word2vec import Word2Vec

In [45]:
w2v_model = Word2Vec.load("models/original_60k_250x1/level1/w2v_vs250_w1_lw.model")

In [None]:
updated_utterances

In [None]:
# Test positive
ls_sims = []
sy_sims = []
tested = 0

for word in updated_utterances:
    if len(updated_utterances[word]) > 1:
        tested += 1
        embeddings = utterances.get_vectors_from_word(
            word, sp_model, w2v_model
        )[:, 0, :]

        similarities = cosine_similarity(embeddings)

        if word.startswith("ls_"):
            ls_sims.append(np.mean(similarities[np.where(np.tril(similarities, -1))]))
        if word.startswith("sy_"):
            sy_sims.append(np.mean(similarities[np.where(np.tril(similarities, -1))]))

print(f"Avg positive score (LS) = {sum(ls_sims)/len(ls_sims)} ({len(ls_sims)} words).")
print(f"Avg positive score (SY) = {sum(sy_sims)/len(sy_sims)} ({len(sy_sims)} words).")

In [None]:
def get_vector_from_utterance(
            utterance,
            sp_model: spm.SentencePieceProcessor,
            w2v_model: Word2Vec
    ):
        """
        Gets the embeddings of the given utterance.
        """
        if utterance in w2v_model.wv.key_to_index.keys():
            return w2v_model.wv[utterance].reshape(1, -1)
        else:
            pieces = list(
                filter(
                    lambda x: x != "▁",
                    sp_model.EncodeAsPieces(utterance)
                )
            )

            units = [piece.replace("▁", "") for piece in pieces]

            vectors = np.array([w2v_model.wv[unit] for unit in units])
            return vectors.mean(axis=0)

In [None]:
# Test negative
ls_sims = []
sy_sims = []
tested = 0
n_negative_samples = 5

ls_word_list = set([word for word in updated_utterances.keys() if word.startswith("ls_")])
sy_word_list = set([word for word in updated_utterances.keys() if word.startswith("sy_")])

for word in updated_utterances:
    tested += 1

    negative_samples = []
    if word.startswith("ls_"):
        use_list = ls_word_list
    else:
        use_list = sy_word_list
    sample = np.random.choice(list(use_list - {word}), size=(n_negative_samples))
    for s in sample:
        negative_samples.append(
            np.random.choice(updated_utterances[s])
        )

    embeddings = np.array([
        get_vector_from_utterance(utterance, sp_model, w2v_model) for utterance in negative_samples
    ])
    
    similarities = cosine_similarity(embeddings)

    if word.startswith("ls_"):
        ls_sims.append(np.mean(similarities[np.where(np.tril(similarities, -1))]))
    if word.startswith("sy_"):
        sy_sims.append(np.mean(similarities[np.where(np.tril(similarities, -1))]))

print(f"Avg negative score (LS) = {sum(ls_sims)/len(ls_sims)} ({len(ls_sims)} words).")
print(f"Avg negative score (SY) = {sum(sy_sims)/len(sy_sims)} ({len(sy_sims)} words).")

In [None]:
embeddings.shape

In [None]:
embeddings[:, 0, :].shape

In [48]:
sp_model.Load("models/original_60k_250x1/level1/unigram_vs60000_lw.model")

True

In [49]:
def word_vec_fn(utterance):
    if utterance in w2v_model.wv.key_to_index.keys():
        return w2v_model.wv[utterance].reshape(1, -1)
    else:
        pieces = list(
            filter(
                lambda x: x != "▁",
                sp_model.EncodeAsPieces(utterance)
            )
        )

        units = [piece.replace("▁", "") for piece in pieces]

        vectors = np.array([w2v_model.wv[unit] for unit in units])
        return vectors.mean(axis=0).reshape(1, -1)

In [50]:
preds = 0
total = 0
use_noise_for_x = False

def get_correct_word(pair, word):
    if pair[0] == word:
        return pair[1]
    return pair[1]

with open("results/abx_test.txt", "w+") as abx_results_file:
    abx_results_file.write("A,B,X,sim(AB),sim(AX),chosen\n")
    for word_a in updated_utterances:
        # Sort words by similarity
        similar_words = sorted(
            # Get only pairs containing word A
            filter(
                lambda x: x[0] == word_a or x[1] == word_a,
                rel_pairs
            ),
            key=lambda x: x[2],
            reverse=True
        )

        similar_words = list(
            map(
                lambda x: get_correct_word(x, word_a),
                similar_words
            )
        )

        for utt_a in updated_utterances[word_a]:
            word_b = similar_words[0]
            utt_b = np.random.choice(
                updated_utterances[word_b]
            )

            if not use_noise_for_x:
                word_x = similar_words[-1]
                utt_x = np.random.choice(
                    updated_utterances[word_x]
                )

            else:
                word_x = "noise"
                utt_x = np.random.choice(
                    list(ascii_letters),
                    len(utt_a),
                    replace=True
                )
                utt_x = "".join(utt_x)

            if len(set([word_a, word_b, word_x])) == 3:
                v_a = word_vec_fn(utt_a).reshape(1, -1)
                v_b = word_vec_fn(utt_b).reshape(1, -1)
                v_x = word_vec_fn(utt_x).reshape(1, -1)

                sim_ab = cosine_similarity(v_a, v_b)
                sim_ax = cosine_similarity(v_a, v_x)

                chosen = "X"
                if sim_ab > sim_ax:
                    chosen = "B"
                    preds += 1
                total += 1

                abx_results_file.write(",".join([word_a, word_b, word_x, str(sim_ab), str(sim_ax), chosen]) + "\n")

print({"ABX Result": preds/total})

{'ABX Result': 0.4767080745341615}
