# Step 2

Converting sequences of words into their phoneme counterparts, and then using it to create embeddings

In [1]:
import os
import numpy as np

In [2]:
input_file = os.getcwd() + "/data/gtbrg_phonemes_8m.txt"
version = "s2"

## Part 1 - SentencePiece embeddings

In [3]:
import sentencepiece as spm

In [5]:
max_sentence_length = 6000
vocab_size = 19099
model_type = "unigram"
SP_MODEL_NAME = f"models/{model_type}_vs{vocab_size}_{version}"

In [None]:
# train sentencepiece model from `botchan.txt` and makes `m.model` and `m.vocab`
# `m.vocab` is just a reference. not used in the segmentation.
spm.SentencePieceTrainer.train(
    f"--input={input_file} " \
    f"--model_type={model_type} " \
    f"--model_prefix={SP_MODEL_NAME} " \
    f"--vocab_size={vocab_size} " \
    f"--max_sentence_length={max_sentence_length} " \
    f"--train_extremely_large_corpus"
)

In [6]:
# makes segmenter instance and loads the model file (m.model)
sp = spm.SentencePieceProcessor()
sp.load(f"{SP_MODEL_NAME}.model")

# encode: text => id
print(sp.EncodeAsPieces('apple'))
print(sp.encode_as_ids('boyhood'))
print(sp.encode_as_ids('boy'))
print(sp.encode_as_ids('man'))

['▁', 'a', 'p', 'pl', 'e']
[4, 554, 390, 19096, 19096, 22]
[4, 554]
[4, 121, 19097, 104]


## Step 2 - Word2Vec encodings

We use the SentencePiece encoder on our phoneme dataset to create a new dataset with it's tokens.

In [None]:
with open(input_file) as corpus_file:
    corpus = corpus_file.readlines()

sentences = [sp.EncodeAsPieces(sentence) for sentence in corpus]

Now, training Word2Vec

In [7]:
from gensim.models.word2vec import Word2Vec

In [8]:
window = 5
vector_size = 100

In [9]:
W2V_MODEL_PATH = f"models/w2v_vs{vector_size}_w{5}_{version}.model"

In [None]:
model = Word2Vec(sentences, size=vector_size, window=window, min_count=0, workers=4)

In [None]:
model.save(W2V_MODEL_PATH)

## Step 3 - Testing

In [10]:
from g2p_en import G2p

In [11]:
model = Word2Vec.load(W2V_MODEL_PATH)

In [12]:
g2p = G2p()

In [13]:
def convert_to_phonemes(word):
    parsed_text = "".join(g2p(word))
    final_text = ""

    for char in parsed_text:
        if char.isalpha():
            final_text += char.lower()

    return final_text

In [12]:
def most_similar_pwords(test_word):
    parsed_text = "".join(g2p(test_word))
    final_text = ""

    for char in parsed_text:
        if char.isalpha():
            final_text += char.lower()

    return model.wv.most_similar(final_text, topn=5)

In [13]:
most_similar_pwords("human")

KeyError: "word 'hhyuwmahn' not in vocabulary"

In [23]:
most_similar_pwords("fruit")

[('fruwts', 0.7007562518119812),
 ('dhahfruwt', 0.6919422149658203),
 ('vehjhtahbahlz', 0.6699405908584595),
 ('fuwd', 0.6603603363037109),
 ('brehd', 0.6302711963653564)]

In [28]:
model.wv.most_similar(positive=["kihng", "wuhmahn"], negative=["maen"], topn=5)

[('kwiyn', 0.6846692562103271),
 ('daoter', 0.6765732765197754),
 ('prihnsehs', 0.6557174921035767),
 ('kawntahs', 0.6210659742355347),
 ('sihster', 0.6079518795013428)]

## Step 4 - Scoring

In [17]:
from scipy.stats import spearmanr
from sklearn.metrics.pairwise import cosine_similarity

In [18]:
wordsim_scores = []

with open("data/wordsim353_sim_rel/wordsim_similarity_goldstandard.txt") as wordsim_fp:
    for line in wordsim_fp.readlines():
        scores = line.split("\t")
        w1, w2 = scores[0], scores[1]
        gold_score = float(scores[2])
        wordsim_scores.append([w1, w2, gold_score])

In [20]:
'q' in list(model.wv.vocab.keys())

False

In [23]:
"".join([i.lower() for i in "".join(g2p(w1)) if i.isalpha()])

['k', 'i', 'h', 'n', 'g']

In [24]:
gold_vocab = []
gold_all = []
preds_vocab = []
preds_all = []
tested = 0
oov = 0

for pairs in wordsim_scores:
    w1, w2 = pairs[0].lower(), pairs[1].lower()
    
    w1 = "".join([i.lower() for i in "".join(g2p(w1)) if i.isalpha()])
    w2 = "".join([i.lower() for i in "".join(g2p(w2)) if i.isalpha()])
    
    try:
        pred = model.wv.similarity(w1, w2)
        preds_vocab.append(pred)
        gold_vocab.append(pairs[2])
        tested += 1
    
    except KeyError:
        if w1 not in model.wv.vocab.keys():
            w1_units = sp.EncodeAsPieces(w1)[1:]
            w1_vectors = np.array([model.wv[unit] for unit in w1_units])
            w1_vector = w1_vectors.sum(axis=0)
        else:
            w1_vector = model.wv[w1]
        if w2 not in model.wv.vocab.keys():
            w2_units = sp.EncodeAsPieces(w2)[1:]
            w2_vectors = np.array([model.wv[unit] for unit in w2_units])
            w2_vector = w2_vectors.sum(axis=0)
        else:
            w2_vector = model.wv[w2]

        pred = cosine_similarity(w1_vector.reshape(1, -1), w2_vector.reshape(1, -1))
        oov += 1
    
    preds_all.append(pred)
    gold_all.append(pairs[2])
        

print(spearmanr(preds_vocab, gold_vocab)[0], f", tested {tested}/{len(wordsim_scores)} pairs")
print(spearmanr(preds_all, gold_all)[0], f", including OOV")

0.4872273878508886 , tested 121/203 pairs
0.35459617632606777 , including OOV


  a = np.asarray(a)


In [25]:
simlex_scores = []

with open("data/SimLex-999/SimLex-999.txt") as simlex_fp:
    for line in simlex_fp.readlines()[1:]:
        scores = line.split("\t")
        w1, w2 = scores[0], scores[1]
        gold_score = float(scores[3])
        simlex_scores.append([w1, w2, gold_score])

In [29]:
gold_vocab = []
gold_all = []
preds_vocab = []
preds_all = []
tested = 0
oov = 0

for pairs in simlex_scores:
    w1, w2 = pairs[0].lower(), pairs[1].lower()
    
    w1 = "".join([i.lower() for i in "".join(g2p(w1)) if i.isalpha()])
    w2 = "".join([i.lower() for i in "".join(g2p(w2)) if i.isalpha()])
    
    try:
        pred = model.wv.similarity(w1, w2)
        preds_vocab.append(pred)
        gold_vocab.append(pairs[2])
        tested += 1
    
    except KeyError:
        if w1 not in model.wv.vocab.keys():
            w1_units = sp.EncodeAsPieces(w1)[1:]
            w1_vectors = np.array([model.wv[unit] for unit in w1_units])
            w1_vector = w1_vectors.sum(axis=0)
        else:
            w1_vector = model.wv[w1]
        if w2 not in model.wv.vocab.keys():
            w2_units = sp.EncodeAsPieces(w2)[1:]
            w2_vectors = np.array([model.wv[unit] for unit in w2_units])
            w2_vector = w2_vectors.sum(axis=0)
        else:
            w2_vector = model.wv[w2]

        pred = cosine_similarity(w1_vector.reshape(1, -1), w2_vector.reshape(1, -1))
        oov += 1
    
    preds_all.append(pred)
    gold_all.append(pairs[2])
        

print(spearmanr(preds_vocab, gold_vocab)[0], f", tested {tested}/{len(simlex_scores)} pairs")
print(spearmanr(preds_all, gold_all)[0], f", including OOV")

0.24245591386571747 , tested 775/999 pairs
0.14236843257028958 , including OOV


  a = np.asarray(a)


In [65]:
model.wv.similarity("fihz", "kehm")

0.40965265

In [76]:
import random

In [14]:
list(model.wv.vocab.keys())[:5]

['▁', 'dɐ', 'praʄektɠ', 'ó', 'tɐnbɜg']