# Playground notebook

## HuBERT test

In [57]:
from transformers import AutoProcessor, HubertModel
from datasets import load_dataset

In [58]:
model_name = "facebook/hubert-large-ls960-ft"

processor = AutoProcessor.from_pretrained(model_name)
model = HubertModel.from_pretrained(model_name)

Some weights of the model checkpoint at facebook/hubert-large-ls960-ft were not used when initializing HubertModel: ['lm_head.weight', 'lm_head.bias']
- This IS expected if you are initializing HubertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [59]:
ds = load_dataset("C:\\Users\\mj115gl\\work_dir\\thesis\\audio-semantics\\data\\LibriSpeech\\dev-clean")

Resolving data files:   0%|          | 0/2800 [00:00<?, ?it/s]

Found cached dataset audiofolder (C:/Users/mj115gl/.cache/huggingface/datasets/audiofolder/dev-clean-6671ed00cafc447b/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc)


  0%|          | 0/1 [00:00<?, ?it/s]

In [60]:
input_values = processor(ds["train"][2]["audio"]["array"], return_tensors="pt").input_values  # Batch size 1
logits = model(input_values).logits
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.decode(predicted_ids[0])
transcription

It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


AttributeError: 'BaseModelOutput' object has no attribute 'logits'

In [None]:
ds["train"][1]["audio"]["array"].shape

(77040,)

In [None]:
input_values.shape

torch.Size([1, 199760])

In [34]:
logits.shape

torch.Size([1, 624, 32])

In [66]:
model

HubertModel(
  (feature_extractor): HubertFeatureEncoder(
    (conv_layers): ModuleList(
      (0): HubertLayerNormConvLayer(
        (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
        (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (activation): GELUActivation()
      )
      (1-4): 4 x HubertLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
        (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (activation): GELUActivation()
      )
      (5-6): 2 x HubertLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,))
        (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (activation): GELUActivation()
      )
    )
  )
  (feature_projection): HubertFeatureProjection(
    (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (projection): Linear(in_features=512, out_features=1024, bias=True)
    (dropout): Dropout(p=

In [62]:
output = model(input_values)

In [64]:
output.last_hidden_state.shape

torch.Size([1, 624, 1024])

## Baseline W2V

In [10]:
from gensim.models.word2vec import Word2Vec, LineSentence
# from gensim.test.utils import datapath

In [11]:
line_fp = open("data/gtbrg_i.txt", "r", encoding="utf-16")
sentences = LineSentence(line_fp)
# line_fp.close()

In [9]:
line_fp.seek(0)
line_fp.readline()
# line_fp.close()

'project gutenbergs the house on the borderland by william hope hodgson this ebook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever\n'

'test\n'

In [12]:
vector_size = 100
window = 5
w2v_model_tag = "TEST"
W2V_MODEL_PATH = f"models/w2v_vs{vector_size}_w{window}_{w2v_model_tag}.model"

In [13]:
w2v_model = Word2Vec(
    sentences,
    window=window,
    vector_size=vector_size,
    min_count=0,
    workers=4,
    epochs=10
)

In [14]:
w2v_model.save(W2V_MODEL_PATH)

In [15]:
list(w2v_model.wv.key_to_index.keys())[:15]

['the',
 'and',
 'of',
 'to',
 'a',
 'in',
 'i',
 'that',
 'he',
 'was',
 'it',
 'his',
 'with',
 'you',
 'as']

In [17]:
w2v_model.wv.most_similar("man")

[('woman', 0.8657098412513733),
 ('gentleman', 0.8089630603790283),
 ('fellow', 0.7922055125236511),
 ('person', 0.7875133752822876),
 ('creature', 0.7498121857643127),
 ('soldier', 0.7496156692504883),
 ('scotchman', 0.7041886448860168),
 ('girl', 0.6975987553596497),
 ('nobleman', 0.6809559464454651),
 ('chap', 0.6759620308876038)]

In [18]:
w2v_model.wv.most_similar(positive=["king", "woman"], negative=["man"])

[('queen', 0.9092535972595215),
 ('princess', 0.8003181219100952),
 ('prince', 0.7828630805015564),
 ('sultan', 0.7463798522949219),
 ('empress', 0.7256040573120117),
 ('isabella', 0.7094663977622986),
 ('dowager', 0.6953573226928711),
 ('emperor', 0.6942340135574341),
 ('dauphin', 0.6938880085945129),
 ('duchess', 0.6818597316741943)]

In [19]:
from scipy.stats import spearmanr
from sklearn.metrics.pairwise import cosine_similarity

In [20]:
wordsim_scores = []

with open("data/wordsim353_sim_rel/wordsim_similarity_goldstandard.txt") as wordsim_fp:
    for line in wordsim_fp.readlines():
        scores = line.split("\t")
        w1, w2 = scores[0], scores[1]
        gold_score = float(scores[2])
        wordsim_scores.append([w1, w2, gold_score])

In [22]:
gold_vocab = []
gold_all = []
preds_vocab = []
preds_all = []
tested = 0
oov = 0

for pairs in wordsim_scores:
    w1, w2 = pairs[0].lower(), pairs[1].lower()
    
    try:
        pred = w2v_model.wv.similarity(w1, w2)
        preds_vocab.append(pred)
        gold_vocab.append(pairs[2])
        tested += 1
    
    except KeyError:
        # if w1 not in w2v_model.wv.vocab.keys():
        #     w1_units = sp.EncodeAsPieces(w1)[1:]
        #     w1_vectors = np.array([w2v_model.wv[unit] for unit in w1_units])
        #     w1_vector = w1_vectors.mean(axis=0)
        # else:
        #     w1_vector = w2v_model.wv[w1]
        # if w2 not in w2v_model.wv.vocab.keys():
        #     w2_units = sp.EncodeAsPieces(w2)[1:]
        #     w2_vectors = np.array([w2v_model.wv[unit] for unit in w2_units])
        #     w2_vector = w2_vectors.mean(axis=0)
        # else:
        #     w2_vector = w2v_model.wv[w2]

        # pred = cosine_similarity(w1_vector.reshape(1, -1), w2_vector.reshape(1, -1))
        oov += 1
    
    preds_all.append(pred)
    gold_all.append(pairs[2])
        

print(spearmanr(preds_vocab, gold_vocab)[0], f", tested {tested}/{len(wordsim_scores)} pairs")
print(spearmanr(preds_all, gold_all)[0], f", including OOV")

0.621953172787471 , tested 200/203 pairs
0.6182286374851715 , including OOV


In [23]:
simlex_scores = []

with open("data/SimLex-999/SimLex-999.txt") as simlex_fp:
    for line in simlex_fp.readlines()[1:]:
        scores = line.split("\t")
        w1, w2 = scores[0], scores[1]
        gold_score = float(scores[3])
        simlex_scores.append([w1, w2, gold_score])

In [25]:
gold_vocab = []
gold_all = []
preds_vocab = []
preds_all = []
tested = 0
oov = 0

for pairs in simlex_scores:
    w1, w2 = pairs[0].lower(), pairs[1].lower()
    
    try:
        pred = w2v_model.wv.similarity(w1, w2)
        preds_vocab.append(pred)
        gold_vocab.append(pairs[2])
        tested += 1
    
    except KeyError:
        # if w1 not in w2v_model.wv.vocab.keys():
        #     w1_units = sp.EncodeAsPieces(w1)[1:]
        #     w1_vectors = np.array([w2v_model.wv[unit] for unit in w1_units])
        #     w1_vector = w1_vectors.mean(axis=0)
        # else:
        #     w1_vector = w2v_model.wv[w1]
        # if w2 not in w2v_model.wv.vocab.keys():
        #     w2_units = sp.EncodeAsPieces(w2)[1:]
        #     w2_vectors = np.array([w2v_model.wv[unit] for unit in w2_units])
        #     w2_vector = w2_vectors.mean(axis=0)
        # else:
        #     w2_vector = w2v_model.wv[w2]

        # pred = cosine_similarity(w1_vector.reshape(1, -1), w2_vector.reshape(1, -1))
        oov += 1
    
    preds_all.append(pred)
    gold_all.append(pairs[2])
        

print(spearmanr(preds_vocab, gold_vocab)[0], f", tested {tested}/{len(simlex_scores)} pairs")
print(spearmanr(preds_all, gold_all)[0], f", including OOV")

0.339987359598551 , tested 994/999 pairs
0.3385031092150666 , including OOV


In [26]:
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

In [33]:
n_clusters = []
words = list(w2v_model.wv.key_to_index.keys())

tqdm_iterator = tqdm(range(0, 99, 1))

for threshold in tqdm_iterator:
    threshold = threshold / 100
    word_to_cluster = dict()  # Stores map from word to cluster
    cluster_to_words = dict()  # Stores map from cluster to words
    cluster_idx = 0  # Counter

    for word in words:
        tqdm_iterator.set_postfix({"Word": word})
        # Check if word has already been clustered
        if word not in word_to_cluster.keys():
            # Create new cluster
            cluster_idx += 1
            # cluster_key = chr(0x0020 + cluster_idx)
            cluster_key = cluster_idx

            # Add new word to cluster
            tqdm_iterator.set_postfix({"status" : "adding new word to cluster"})
            cluster_to_words[cluster_key] = [word]
            word_to_cluster[word] = cluster_key
            
            # Add all similar words
            tqdm_iterator.set_postfix({"status" : "adding all words"})
            for similar_word, score in w2v_model.wv.most_similar(word, topn=50):
                tqdm_iterator.set_postfix({"status" : "checked 50 words"})
                if score > threshold:
                    cluster_to_words[cluster_key].append(similar_word)
                    word_to_cluster[similar_word] = cluster_key
    
    n_clusters.append(len(cluster_to_words))

  0%|          | 0/99 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [34]:
len(words)

681566

In [None]:
plt.plot(n_clusters)

## Convert old

In [1]:
from string import ascii_letters

In [2]:
letters = {
    i: key for i, key in enumerate(ascii_letters)
}

In [3]:
word_to_key = {}
key_to_word = {}

with open("data/quantized/dev-gold.csv", "r") as key_file:
    for line in key_file.readlines()[1:]:
        dataset, key, _, word = line.strip().split(",")
        if word not in word_to_key:
            word_to_key[word] = {
                'librispeech': [],
                'synthetic': []
            }
        word_to_key[word][dataset].append(key)
        key_to_word[key] = word

In [5]:
utterances = {}

for dataset in ["librispeech", "synthetic"]:
    with open(f"data/quantized/semantic/dev/{dataset}/quantized_outputs.txt", "r") as utterance_file:
        for line in utterance_file.readlines():
            key, seq = line.strip().split("\t")
            utterance = seq.split(",")[1:]

            key = "ls_" + key_to_word[key] if dataset == "librispeech" else "sy_" + key_to_word[key]
            if key not in utterances:
                utterances[key] = []

            utterances[key].append(
                "".join(
                    [letters[int(v)] for i, v in enumerate(utterance)]
                )
            )

In [10]:
with open("data/level_wise/level0/utterances_original.txt", "w+", encoding="utf-8") as ufp:
    for word in utterances:
        for utterance in utterances[word]:
            ufp.write(word + "\t" + utterance + "\n")

In [4]:
sim_pairs = []
rel_pairs = []

with open("data/quantized/dev-pairs.csv", "r") as pairs_file:
    for line in pairs_file.readlines()[1:]:
        dataset, _, w1, w2, sim, rel = line.strip().split(",")
        if sim:
            sim_pairs.append((dataset, w1, w2, float(sim)))
        if rel:
            rel_pairs.append((dataset, w1, w2, float(rel)))

with open("data/level_wise/level0/pairs.txt", "w+", encoding="utf-8") as pairs_fp:
    for pair in sim_pairs:
        dataset, w1, w2, score = pair
        pairs_fp.write(
            ("ls_" + w1 if dataset == "librispeech" else "sy_" + w1) + "," +
            ("ls_" + w2 if dataset == "librispeech" else "sy_" + w2) + "," +
            str(score) + "," + "\n"
        )
    for pair in rel_pairs:
        dataset, w1, w2, score = pair
        pairs_fp.write(
            ("ls_" + w1 if dataset == "librispeech" else "sy_" + w1) + "," +
            ("ls_" + w2 if dataset == "librispeech" else "sy_" + w2) + "," +
            "," + str(score) + "\n"
        )

In [1]:
str("a")

'a'

## FastText

In [2]:
import sentencepiece as spm

In [3]:
sp_model = spm.SentencePieceProcessor()

sp_model.Load("models/original_60k_250x1/level1/unigram_vs60000_lw.model")

True

In [4]:
original_corpus = []

with open("data/level_wise/level0/dev_corpus_original.txt", "r", encoding="utf-8") as ocfp:
    for line in ocfp.readlines():
        original_corpus.append(line.strip())

In [5]:
with open("data/original_60k_250x1/level1/dev_corpus_ft.txt", "w+", encoding="utf-8") as ncfp:
    for line in original_corpus:
        pieces = list(
            filter(
                lambda x: x != "▁",
                sp_model.EncodeAsPieces(line)
            )
        )

        units = [piece.replace("▁", "") for piece in pieces]
        
        ncfp.write(" ".join(units) + "\n")

In [1]:
import fasttext

In [3]:
ft_model = fasttext.train_unsupervised(
    "data/original_60k_250x1/level1/corpus_ft.txt",
    "cbow",
    dim=250,
    thread=4,
    epoch=7
)

In [4]:
type(ft_model)

fasttext.FastText._FastText

In [5]:
ft_model.save_model("models/fasttext_cbow")

In [6]:
# ft_model = fasttext.load_model("models/fasttext_cbow")

In [7]:
from levelwise_model.test_bench import LSTestBench
from levelwise_model.utterances import WordToUtteranceMapping

In [8]:
utterance_mapping = WordToUtteranceMapping(map_file="data/level_wise/level0/utterances_original.txt")

In [9]:
test_bench = LSTestBench(scores_file="data/level_wise/level0/pairs.txt")

In [10]:
test_bench.ft_score_and_save(ft_model=ft_model, utterances=utterance_mapping, results_file="results/ft_cbow")

## DistilBERT

In [1]:
from sentence_transformers import SentenceTransformer

In [2]:
model = SentenceTransformer('C:\\Users\\mj115gl\\work_dir\\thesis\\audio-semantics\\models\\db_final')

No sentence-transformers model found with name C:\Users\mj115gl\work_dir\thesis\audio-semantics\models\db_final. Creating a new one with MEAN pooling.


Some weights of the model checkpoint at C:\Users\mj115gl\work_dir\thesis\audio-semantics\models\db_final were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
import sentencepiece as spm

In [10]:
sp_model = spm.SentencePieceProcessor()
sp_model.Load("models/comp_60k_250x1/level1/unigram_vs60000_lw.model")

True

In [50]:
input_text = "PlQQQQoobbbbbIkkQorrrAAAfflllQQooVVVpppjjUQzzzzOOOOOOOOOOO"

pieces = list(
    filter(
        lambda x: x != "▁",
        sp_model.EncodeAsPieces(input_text)
    )
)

units = [piece.replace("▁", "") for piece in pieces]
print(" ".join(units))

PlQQQQ oobbbbbIkk QorrrAAAff lllQQo oVVVpppjj UQ zzzzOOOOOOOOOOO


In [51]:
#Our sentences we like to encode
sentences = ['UUUQ obbbbbIpkk QQ rrAAffff QQQ oVVVppppjj rrrKKKKK OOOOOOOOOOMMMMMM',
    'lQ oobbbbIII kkkkQQQ orrAAff llQQ oVVVVpppjjj KKKKKKzOO VV',
    "PQQ obbbbbbkkkQ rrrrrfff QlQQQ oVVVpppjjjj rrrKKKKKK zzzO HHH",
    "pDjj CCCCCCobbbbII pkkkQQQ rrrrAAff ffllll QQooVVV pppjjjj E KKKKKKKKKO KKKKOOOOOOOOOO",
    "PlQQQQ oobbbbbIkk QorrrAAAff lllQQo oVVVpppjj UQ zzzzOOOOOOOOOOO"
]

#Sentences are encoded by calling model.encode()
embeddings = model.encode(sentences)

In [52]:
embeddings

array([[ 1.2528386 , -1.2860142 ,  0.8338024 , ..., -0.7757539 ,
         0.14618087, -1.0259092 ],
       [ 1.0756059 , -1.3480805 ,  1.1199704 , ..., -0.3672155 ,
         0.57059145, -1.5578766 ],
       [ 0.7610003 , -1.17496   ,  1.1490151 , ..., -0.3124733 ,
         0.29432905, -1.3124402 ],
       [ 0.6083768 , -1.2210584 ,  1.4475336 , ..., -0.35673165,
         0.3092123 , -1.241608  ],
       [ 1.0120195 , -1.1010975 ,  1.4700158 , ..., -0.53832185,
         0.29323524, -1.0156256 ]], dtype=float32)

In [15]:
from sklearn.metrics.pairwise import cosine_similarity

In [61]:
similarities = cosine_similarity(embeddings, dense_output=True)
similarities

array([[1.0000002 , 0.9521097 , 0.93304825, 0.9465009 , 0.9377239 ],
       [0.9521097 , 0.9999995 , 0.93354774, 0.9546412 , 0.9466411 ],
       [0.93304825, 0.93354774, 0.99999964, 0.9444392 , 0.9428807 ],
       [0.9465009 , 0.9546412 , 0.9444392 , 0.9999999 , 0.9522452 ],
       [0.9377239 , 0.9466411 , 0.9428807 , 0.9522452 , 1.0000002 ]],
      dtype=float32)

In [62]:
np.mean(similarities[np.where(np.tril(similarities, -1))])

0.9443777

In [2]:
from levelwise_model.utterances import WordToUtteranceMapping

In [11]:
utterances = WordToUtteranceMapping("data/level_wise/level0/utterances.txt")

In [12]:
updated_utterances = {}

for word in utterances.utterances:
    updated_utterances[word] = []
    for utt in utterances.utterances[word]:
        pieces = list(
            filter(
                lambda x: x != "▁",
                sp_model.EncodeAsPieces(utt)
            )
        )

        units = [piece.replace("▁", "") for piece in pieces]
        updated_utterances[word].append(
            " ".join(units)
        )

In [14]:
import numpy as np

In [98]:
model.encode("test").shape

(768,)

In [100]:
# Test positive
ls_sims_pos = []
sy_sims_pos = []
tested = 0

for word in updated_utterances:
    if len(updated_utterances[word]) > 1:
        tested += 1
        utterances_list = [
            " ".join(
                [
                    piece.replace("▁", "")
                    for piece in list(
                        filter(
                            lambda x: x != "▁",
                            sp_model.EncodeAsPieces(utterance)
                        )
                    )
                ]
            ) for utterance in updated_utterances[word]
        ]

        embeddings = model.encode(utterances_list)
        similarities = cosine_similarity(embeddings)

        if word.startswith("ls_"):
            ls_sims_pos.append(np.mean(similarities[np.where(np.tril(similarities, -1))]))
        if word.startswith("sy_"):
            sy_sims_pos.append(np.mean(similarities[np.where(np.tril(similarities, -1))]))

print(f"Avg positive score (LS) = {sum(ls_sims_pos)/len(ls_sims_pos)} ({len(ls_sims_pos)} words).")
print(f"Avg positive score (SY) = {sum(sy_sims_pos)/len(sy_sims_pos)} ({len(sy_sims_pos)} words).")

Avg positive score (LS) = 0.9355412933498762 ([0.9478882, 0.9341124, 0.9349109, 0.92906487, 0.93097824, 0.919134, 0.95544744, 0.92582834, 0.93838286, 0.92994934, 0.93962383, 0.9301245, 0.9431162, 0.94423187, 0.94626683, 0.9413644, 0.930119, 0.9322844, 0.93690395, 0.93456966, 0.9336634, 0.93571436, 0.9314061, 0.93531317, 0.94441867, 0.9393776, 0.9473236, 0.9428841, 0.9286855, 0.9383246, 0.94521725, 0.9484435, 0.9259017, 0.93798345, 0.93186796, 0.92320585, 0.94726497, 0.92739666, 0.93498486, 0.9584384, 0.9345622, 0.93492544, 0.934797, 0.93858486, 0.9498227, 0.93013906, 0.9253137, 0.9444783, 0.91060156, 0.9337509, 0.9257393, 0.93742037, 0.9397783, 0.9404488, 0.9348502, 0.931848, 0.9314847, 0.93038434, 0.95055205, 0.93702954, 0.9408466, 0.95822686, 0.9228851, 0.9493038, 0.9305943, 0.9484994, 0.9359726, 0.9392512, 0.94092494, 0.93935937, 0.9496282, 0.9377645, 0.9383466, 0.9265093, 0.93705744, 0.9304095, 0.9282099, 0.9382057, 0.9431409, 0.9332338, 0.9283988, 0.927216, 0.9556505, 0.9328858, 0

In [64]:
n_utts = []

for word in updated_utterances:
    n_utts.append(len(updated_utterances[word]))

print(sum(n_utts)/len(n_utts))

4.53003300330033


In [102]:
# Test negative
ls_sims_neg = []
sy_sims_neg = []
tested = 0
n_negative_samples = 5

ls_word_list = set([word for word in updated_utterances.keys() if word.startswith("ls_")])
sy_word_list = set([word for word in updated_utterances.keys() if word.startswith("sy_")])

for word in updated_utterances:
    tested += 1

    negative_samples = []
    if word.startswith("ls_"):
        use_list = ls_word_list
    else:
        use_list = sy_word_list
    sample = np.random.choice(list(use_list - {word}), size=(n_negative_samples))
    for s in sample:
        negative_samples.append(
            np.random.choice(updated_utterances[s])
        )

    utterances_list = [
        " ".join(
            [
                piece.replace("▁", "")
                for piece in list(
                    filter(
                        lambda x: x != "▁",
                        sp_model.EncodeAsPieces(utterance)
                    )
                )
            ]
        ) for utterance in negative_samples
    ]
    
    embeddings = model.encode(utterances_list)
    similarities = cosine_similarity(embeddings)

    if word.startswith("ls_"):
        ls_sims_neg.append(np.mean(similarities[np.where(np.tril(similarities, -1))]))
    if word.startswith("sy_"):
        sy_sims_neg.append(np.mean(similarities[np.where(np.tril(similarities, -1))]))

print(f"Avg negative score (LS) = {sum(ls_sims_neg)/len(ls_sims_neg)} ({len(ls_sims_neg)} words).")
print(f"Avg negative score (SY) = {sum(sy_sims_neg)/len(sy_sims_neg)} ({len(sy_sims_neg)} words).")

Tested 1515/1515 words.
Avg negative score = 0.9371212101218724.


In [104]:
sum(sy_sims)/len(sy_sims)

0.9440270919979656

In [1]:
sim_pairs = []
rel_pairs = []

with open("data/level_wise/level0/pairs.txt", "r") as pairs_file:
    for line in pairs_file.readlines()[1:]:
        w1, w2, sim, rel = line.strip().split(",")
        if sim:
            sim_pairs.append((w1, w2, float(sim)))
        if rel:
            rel_pairs.append((w1, w2, float(rel)))

In [94]:
def get_model_vectors(word):
    return model.encode(updated_utterances[word])

In [95]:
from scipy.stats import pearsonr

In [97]:
scores = {
    test_set: {
        method: [] for method in ["min", "max", "avg", "all"]
    } for test_set in ["librispeech", "synthetic"]
}
gold_standard = {
    "librispeech": [],
    "synthetic": []
}
trials = 0
errors = 0

for pair in rel_pairs:
    try:
        w1, w2, rel = pair

        test_set = "librispeech" \
            if w1.startswith("ls_") \
            else "synthetic"
        w1.replace("ls_", "").replace("sy_", "")
        w2.replace("ls_", "").replace("sy_", "")

        w1_vectors = get_model_vectors(
            w1
        )
        w2_vectors = get_model_vectors(
            w2
        )

        similarities = [
            cosine_similarity(i.reshape(1, -1), j.reshape(1, -1))
            for i in w1_vectors
            for j in w2_vectors
        ]

        scores[test_set]["min"].append(np.min(similarities))
        scores[test_set]["avg"].append(np.mean(similarities))
        scores[test_set]["max"].append(np.max(similarities))

        gold_standard[test_set].append(rel)
    except Exception as e:
        print(e)
        errors += 1
    trials += 1

print({
    'score': {
        test_set: {
            var: pearsonr(
                scores[test_set][var],
                gold_standard[test_set]
            )[0] * 100
            for var in ['min', 'avg', 'max']
        }
        for test_set in ['librispeech', 'synthetic']
    },
    'errors': errors,
    'trials': trials
})

{'score': {'librispeech': {'min': 7.805751624454922, 'avg': 8.578354383032712, 'max': 7.286344671214396}, 'synthetic': {'min': -1.3427713587253782, 'avg': -1.174001161944971, 'max': -3.791671346703998}}, 'errors': 0, 'trials': 1013}


In [5]:
from gensim.models.word2vec import Word2Vec

In [6]:
w2v_model = Word2Vec.load("models/comp_60k_250x5/level1/w2v_vs250_w5_lw.model")

In [8]:
updated_utterances

{'ls_individual': []}

In [16]:
# Test positive
ls_sims = []
sy_sims = []
tested = 0

for word in updated_utterances:
    if len(updated_utterances[word]) > 1:
        tested += 1
        embeddings = utterances.get_vectors_from_word(
            word, sp_model, w2v_model
        )[:, 0, :]

        similarities = cosine_similarity(embeddings)

        if word.startswith("ls_"):
            ls_sims.append(np.mean(similarities[np.where(np.tril(similarities, -1))]))
        if word.startswith("sy_"):
            sy_sims.append(np.mean(similarities[np.where(np.tril(similarities, -1))]))

print(f"Avg positive score (LS) = {sum(ls_sims)/len(ls_sims)} ({len(ls_sims)} words).")
print(f"Avg positive score (SY) = {sum(sy_sims)/len(sy_sims)} ({len(sy_sims)} words).")

Avg positive score (LS) = 0.3204226875545289 (422 words).
Avg positive score (SY) = 0.6264196041416614 (1034 words).


In [17]:
def get_vector_from_utterance(
            utterance,
            sp_model: spm.SentencePieceProcessor,
            w2v_model: Word2Vec
    ):
        """
        Gets the embeddings of the given utterance.
        """
        if utterance in w2v_model.wv.key_to_index.keys():
            return w2v_model.wv[utterance].reshape(1, -1)
        else:
            pieces = list(
                filter(
                    lambda x: x != "▁",
                    sp_model.EncodeAsPieces(utterance)
                )
            )

            units = [piece.replace("▁", "") for piece in pieces]

            vectors = np.array([w2v_model.wv[unit] for unit in units])
            return vectors.mean(axis=0)

In [18]:
# Test negative
ls_sims = []
sy_sims = []
tested = 0
n_negative_samples = 5

ls_word_list = set([word for word in updated_utterances.keys() if word.startswith("ls_")])
sy_word_list = set([word for word in updated_utterances.keys() if word.startswith("sy_")])

for word in updated_utterances:
    tested += 1

    negative_samples = []
    if word.startswith("ls_"):
        use_list = ls_word_list
    else:
        use_list = sy_word_list
    sample = np.random.choice(list(use_list - {word}), size=(n_negative_samples))
    for s in sample:
        negative_samples.append(
            np.random.choice(updated_utterances[s])
        )

    embeddings = np.array([
        get_vector_from_utterance(utterance, sp_model, w2v_model) for utterance in negative_samples
    ])
    
    similarities = cosine_similarity(embeddings)

    if word.startswith("ls_"):
        ls_sims.append(np.mean(similarities[np.where(np.tril(similarities, -1))]))
    if word.startswith("sy_"):
        sy_sims.append(np.mean(similarities[np.where(np.tril(similarities, -1))]))

print(f"Avg negative score (LS) = {sum(ls_sims)/len(ls_sims)} ({len(ls_sims)} words).")
print(f"Avg negative score (SY) = {sum(sy_sims)/len(sy_sims)} ({len(sy_sims)} words).")

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (5,) + inhomogeneous part.

In [114]:
embeddings.shape

(8, 1, 250)

In [115]:
embeddings[:, 0, :].shape

(8, 250)