# Playground notebook

## HuBERT test

In [57]:
from transformers import AutoProcessor, HubertModel
from datasets import load_dataset

In [58]:
model_name = "facebook/hubert-large-ls960-ft"

processor = AutoProcessor.from_pretrained(model_name)
model = HubertModel.from_pretrained(model_name)

Some weights of the model checkpoint at facebook/hubert-large-ls960-ft were not used when initializing HubertModel: ['lm_head.weight', 'lm_head.bias']
- This IS expected if you are initializing HubertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [59]:
ds = load_dataset("C:\\Users\\mj115gl\\work_dir\\thesis\\audio-semantics\\data\\LibriSpeech\\dev-clean")

Resolving data files:   0%|          | 0/2800 [00:00<?, ?it/s]

Found cached dataset audiofolder (C:/Users/mj115gl/.cache/huggingface/datasets/audiofolder/dev-clean-6671ed00cafc447b/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc)


  0%|          | 0/1 [00:00<?, ?it/s]

In [60]:
input_values = processor(ds["train"][2]["audio"]["array"], return_tensors="pt").input_values  # Batch size 1
logits = model(input_values).logits
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.decode(predicted_ids[0])
transcription

It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


AttributeError: 'BaseModelOutput' object has no attribute 'logits'

In [None]:
ds["train"][1]["audio"]["array"].shape

(77040,)

In [None]:
input_values.shape

torch.Size([1, 199760])

In [34]:
logits.shape

torch.Size([1, 624, 32])

In [66]:
model

HubertModel(
  (feature_extractor): HubertFeatureEncoder(
    (conv_layers): ModuleList(
      (0): HubertLayerNormConvLayer(
        (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
        (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (activation): GELUActivation()
      )
      (1-4): 4 x HubertLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
        (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (activation): GELUActivation()
      )
      (5-6): 2 x HubertLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,))
        (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (activation): GELUActivation()
      )
    )
  )
  (feature_projection): HubertFeatureProjection(
    (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (projection): Linear(in_features=512, out_features=1024, bias=True)
    (dropout): Dropout(p=

In [62]:
output = model(input_values)

In [64]:
output.last_hidden_state.shape

torch.Size([1, 624, 1024])

## Baseline W2V

In [10]:
from gensim.models.word2vec import Word2Vec, LineSentence
# from gensim.test.utils import datapath

In [11]:
line_fp = open("data/gtbrg_i.txt", "r", encoding="utf-16")
sentences = LineSentence(line_fp)
# line_fp.close()

In [9]:
line_fp.seek(0)
line_fp.readline()
# line_fp.close()

'project gutenbergs the house on the borderland by william hope hodgson this ebook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever\n'

'test\n'

In [12]:
vector_size = 100
window = 5
w2v_model_tag = "TEST"
W2V_MODEL_PATH = f"models/w2v_vs{vector_size}_w{window}_{w2v_model_tag}.model"

In [13]:
w2v_model = Word2Vec(
    sentences,
    window=window,
    vector_size=vector_size,
    min_count=0,
    workers=4,
    epochs=10
)

In [14]:
w2v_model.save(W2V_MODEL_PATH)

In [15]:
list(w2v_model.wv.key_to_index.keys())[:15]

['the',
 'and',
 'of',
 'to',
 'a',
 'in',
 'i',
 'that',
 'he',
 'was',
 'it',
 'his',
 'with',
 'you',
 'as']

In [17]:
w2v_model.wv.most_similar("man")

[('woman', 0.8657098412513733),
 ('gentleman', 0.8089630603790283),
 ('fellow', 0.7922055125236511),
 ('person', 0.7875133752822876),
 ('creature', 0.7498121857643127),
 ('soldier', 0.7496156692504883),
 ('scotchman', 0.7041886448860168),
 ('girl', 0.6975987553596497),
 ('nobleman', 0.6809559464454651),
 ('chap', 0.6759620308876038)]

In [18]:
w2v_model.wv.most_similar(positive=["king", "woman"], negative=["man"])

[('queen', 0.9092535972595215),
 ('princess', 0.8003181219100952),
 ('prince', 0.7828630805015564),
 ('sultan', 0.7463798522949219),
 ('empress', 0.7256040573120117),
 ('isabella', 0.7094663977622986),
 ('dowager', 0.6953573226928711),
 ('emperor', 0.6942340135574341),
 ('dauphin', 0.6938880085945129),
 ('duchess', 0.6818597316741943)]

In [19]:
from scipy.stats import spearmanr
from sklearn.metrics.pairwise import cosine_similarity

In [20]:
wordsim_scores = []

with open("data/wordsim353_sim_rel/wordsim_similarity_goldstandard.txt") as wordsim_fp:
    for line in wordsim_fp.readlines():
        scores = line.split("\t")
        w1, w2 = scores[0], scores[1]
        gold_score = float(scores[2])
        wordsim_scores.append([w1, w2, gold_score])

In [22]:
gold_vocab = []
gold_all = []
preds_vocab = []
preds_all = []
tested = 0
oov = 0

for pairs in wordsim_scores:
    w1, w2 = pairs[0].lower(), pairs[1].lower()
    
    try:
        pred = w2v_model.wv.similarity(w1, w2)
        preds_vocab.append(pred)
        gold_vocab.append(pairs[2])
        tested += 1
    
    except KeyError:
        # if w1 not in w2v_model.wv.vocab.keys():
        #     w1_units = sp.EncodeAsPieces(w1)[1:]
        #     w1_vectors = np.array([w2v_model.wv[unit] for unit in w1_units])
        #     w1_vector = w1_vectors.mean(axis=0)
        # else:
        #     w1_vector = w2v_model.wv[w1]
        # if w2 not in w2v_model.wv.vocab.keys():
        #     w2_units = sp.EncodeAsPieces(w2)[1:]
        #     w2_vectors = np.array([w2v_model.wv[unit] for unit in w2_units])
        #     w2_vector = w2_vectors.mean(axis=0)
        # else:
        #     w2_vector = w2v_model.wv[w2]

        # pred = cosine_similarity(w1_vector.reshape(1, -1), w2_vector.reshape(1, -1))
        oov += 1
    
    preds_all.append(pred)
    gold_all.append(pairs[2])
        

print(spearmanr(preds_vocab, gold_vocab)[0], f", tested {tested}/{len(wordsim_scores)} pairs")
print(spearmanr(preds_all, gold_all)[0], f", including OOV")

0.621953172787471 , tested 200/203 pairs
0.6182286374851715 , including OOV


In [23]:
simlex_scores = []

with open("data/SimLex-999/SimLex-999.txt") as simlex_fp:
    for line in simlex_fp.readlines()[1:]:
        scores = line.split("\t")
        w1, w2 = scores[0], scores[1]
        gold_score = float(scores[3])
        simlex_scores.append([w1, w2, gold_score])

In [25]:
gold_vocab = []
gold_all = []
preds_vocab = []
preds_all = []
tested = 0
oov = 0

for pairs in simlex_scores:
    w1, w2 = pairs[0].lower(), pairs[1].lower()
    
    try:
        pred = w2v_model.wv.similarity(w1, w2)
        preds_vocab.append(pred)
        gold_vocab.append(pairs[2])
        tested += 1
    
    except KeyError:
        # if w1 not in w2v_model.wv.vocab.keys():
        #     w1_units = sp.EncodeAsPieces(w1)[1:]
        #     w1_vectors = np.array([w2v_model.wv[unit] for unit in w1_units])
        #     w1_vector = w1_vectors.mean(axis=0)
        # else:
        #     w1_vector = w2v_model.wv[w1]
        # if w2 not in w2v_model.wv.vocab.keys():
        #     w2_units = sp.EncodeAsPieces(w2)[1:]
        #     w2_vectors = np.array([w2v_model.wv[unit] for unit in w2_units])
        #     w2_vector = w2_vectors.mean(axis=0)
        # else:
        #     w2_vector = w2v_model.wv[w2]

        # pred = cosine_similarity(w1_vector.reshape(1, -1), w2_vector.reshape(1, -1))
        oov += 1
    
    preds_all.append(pred)
    gold_all.append(pairs[2])
        

print(spearmanr(preds_vocab, gold_vocab)[0], f", tested {tested}/{len(simlex_scores)} pairs")
print(spearmanr(preds_all, gold_all)[0], f", including OOV")

0.339987359598551 , tested 994/999 pairs
0.3385031092150666 , including OOV


In [26]:
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

In [33]:
n_clusters = []
words = list(w2v_model.wv.key_to_index.keys())

tqdm_iterator = tqdm(range(0, 99, 1))

for threshold in tqdm_iterator:
    threshold = threshold / 100
    word_to_cluster = dict()  # Stores map from word to cluster
    cluster_to_words = dict()  # Stores map from cluster to words
    cluster_idx = 0  # Counter

    for word in words:
        tqdm_iterator.set_postfix({"Word": word})
        # Check if word has already been clustered
        if word not in word_to_cluster.keys():
            # Create new cluster
            cluster_idx += 1
            # cluster_key = chr(0x0020 + cluster_idx)
            cluster_key = cluster_idx

            # Add new word to cluster
            tqdm_iterator.set_postfix({"status" : "adding new word to cluster"})
            cluster_to_words[cluster_key] = [word]
            word_to_cluster[word] = cluster_key
            
            # Add all similar words
            tqdm_iterator.set_postfix({"status" : "adding all words"})
            for similar_word, score in w2v_model.wv.most_similar(word, topn=50):
                tqdm_iterator.set_postfix({"status" : "checked 50 words"})
                if score > threshold:
                    cluster_to_words[cluster_key].append(similar_word)
                    word_to_cluster[similar_word] = cluster_key
    
    n_clusters.append(len(cluster_to_words))

  0%|          | 0/99 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [34]:
len(words)

681566

In [None]:
plt.plot(n_clusters)

## Convert old

In [1]:
from string import ascii_letters

In [2]:
letters = {
    i: key for i, key in enumerate(ascii_letters)
}

In [5]:
word_to_key = {}
key_to_word = {}

with open("data/quantized/dev-gold.csv", "r") as key_file:
    for line in key_file.readlines()[1:]:
        dataset, key, _, word = line.strip().split(",")
        if word not in word_to_key:
            word_to_key[word] = {
                'librispeech': [],
                'synthetic': []
            }
        word_to_key[word][dataset].append(key)
        key_to_word[key] = word

In [8]:
utterances = {}

for dataset in ["librispeech", "synthetic"]:
    with open(f"data/quantized/semantic/dev/{dataset}/quantized_outputs.txt", "r") as utterance_file:
        for line in utterance_file.readlines():
            key, seq = line.strip().split("\t")
            utterance = seq.split(",")[1:]

            key = "ls_" + key_to_word[key] if dataset == "librispeech" else "sy_" + key_to_word[key]
            if key not in utterances:
                utterances[key] = []

            utterances[key].append(
                "".join(
                    [letters[int(v)] for i, v in enumerate(utterance) if i == 0 or v != utterance[i - 1]]
                )
            )

In [9]:
utterances

{'ls_individual': ['UQobIpkQrAfQoVpjrKOM',
  'lQobIkQorAflQoVpjKzOV',
  'PQobkQrfQlQoVpjrKzOH',
  'PlQobIkQorAflQoVpjUQzO',
  'pDjCobIpkQrAflQoVpjEKOKO',
  'URobIkQrAfPlQoVypjdKSzO',
  'nCobIpPQrAfloVpjEzOvR',
  'cCobIkQorAflQoVpjrKORC'],
 'ls_band': ['rflcTobIA',
  'yfKflclcTobIM',
  'wyflTobIpj',
  'yflcTobIpjM',
  'ryryflTobIk',
  'yflTobIbIpjQM'],
 'ls_speech': ['sugwyfJxVpj',
  'sugwyflBxVypjW',
  'uwyfBxVpj',
  'ugwyfJxVp',
  'uwynflBcxVpujwM',
  'sugwMnfBxpuw',
  'sugwyfJxVypujMj',
  'LsgwyfJUxcxVpujM',
  'ugwynfWJQUdLsuwMn',
  'suwyfJxVpjw'],
 'ls_weight': ['OKNcxVRS',
  'KOKNcxVy',
  'OKNcxVpR',
  'rKNcxVypjM',
  'rKNcxLp',
  'rKNJcxVpspsugM',
  'KNEcJxVypjM'],
 'ls_deep': ['ypjBxVyfA',
  'ynfJxVyMfM',
  'ypjBxryf',
  'ypkPBxVyf',
  'npjJxVyf',
  'VypjlJBxry',
  'wpjBxryfw',
  'ynpjBxVy',
  'IypjJBxryf',
  'VypjBxVyfAf'],
 'ls_bird': ['yfWEoVp',
  'VyDfWEVy',
  'ynfWEoVy',
  'VyfWEoVypjM',
  'ryfWEoVI',
  'ryflWEoVypjRMR',
  'IynynfSzaGoV',
  'yflEoVypjRMR'],
 'ls_compound': [

In [10]:
with open("data/level_wise/level0/utterances", "w+", encoding="utf-8") as ufp:
    for word in utterances:
        for utterance in utterances[word]:
            ufp.write(word + "\t" + utterance + "\n")

In [4]:
sim_pairs = []
rel_pairs = []

with open("data/quantized/dev-pairs.csv", "r") as pairs_file:
    for line in pairs_file.readlines()[1:]:
        dataset, _, w1, w2, sim, rel = line.strip().split(",")
        if sim:
            sim_pairs.append((dataset, w1, w2, float(sim)))
        if rel:
            rel_pairs.append((dataset, w1, w2, float(rel)))

with open("data/level_wise/level0/pairs.txt", "w+", encoding="utf-8") as pairs_fp:
    for pair in sim_pairs:
        dataset, w1, w2, score = pair
        pairs_fp.write(
            ("ls_" + w1 if dataset == "librispeech" else "sy_" + w1) + "," +
            ("ls_" + w2 if dataset == "librispeech" else "sy_" + w2) + "," +
            str(score) + "," + "\n"
        )
    for pair in rel_pairs:
        dataset, w1, w2, score = pair
        pairs_fp.write(
            ("ls_" + w1 if dataset == "librispeech" else "sy_" + w1) + "," +
            ("ls_" + w2 if dataset == "librispeech" else "sy_" + w2) + "," +
            "," + str(score) + "\n"
        )

In [1]:
str("a")

'a'