# Step 3

Converting sequences of words into their phoneme counterparts, and also including some noise.

In [1]:
import os
import numpy as np

In [17]:
# Version tag embedded in the model filenames, and the phoneme corpus location
# (resolved against the current working directory).
version = "3.2_8m"
input_file = f"{os.getcwd()}/data/gutenberg_noisy_phonemes.txt"

## Step 1 - SentencePiece tokenization

In [3]:
import sentencepiece as spm

In [4]:
# SentencePiece training settings.
model_type = "unigram"
vocab_size = 19099
max_sentence_length = 6000

# Path prefix for the trained tokenizer; the model file is loaded later from
# f"{SP_MODEL_NAME}.model".
SP_MODEL_NAME = f"models/{model_type}_{vocab_size}_v{version}"

In [None]:
# Train the SentencePiece model on the phoneme corpus (`input_file`). The
# trainer produces `{SP_MODEL_NAME}.model` and `{SP_MODEL_NAME}.vocab`; the
# `.vocab` file is just a reference and is not used in the segmentation.

# Make sure the output directory exists before the (long) training run starts.
sp_output_dir = os.path.dirname(SP_MODEL_NAME)
if sp_output_dir:
    os.makedirs(sp_output_dir, exist_ok=True)

# Keyword arguments are used instead of a single whitespace-separated flag
# string: `input_file` is built from os.getcwd(), and a working directory
# containing spaces would otherwise be split into several arguments.
spm.SentencePieceTrainer.train(
    input=input_file,
    model_type=model_type,
    model_prefix=SP_MODEL_NAME,
    vocab_size=vocab_size,
    max_sentence_length=max_sentence_length,
    train_extremely_large_corpus=True,
)

In [9]:
# Create the segmenter and load the trained model file from disk.
sp = spm.SentencePieceProcessor()
sp.load(f"{SP_MODEL_NAME}.model")

# Sanity check: print the subword pieces of one word, then the id sequences
# of a few more.
print(sp.EncodeAsPieces('apple'))
for sample_word in ('boyhood', 'boy', 'man'):
    print(sp.encode_as_ids(sample_word))

['▁', 'ap', 'ple']
[177, 338, 99, 567, 1652]
[177, 338, 99]
[177, 1232]


## Step 2 - Word2Vec encodings

We use the SentencePiece encoder on our phoneme dataset to create a new dataset with its tokens.

In [11]:
# TODO : Replace with entire dataset, if possible
# Read the corpus explicitly as UTF-8: it contains non-ASCII phoneme symbols,
# and the platform default encoding is not UTF-8 everywhere.
with open(input_file, encoding="utf-8") as corpus_file:
    corpus = corpus_file.readlines()

# One list of subword pieces per corpus line; this list of lists is what gets
# passed to Word2Vec later on.
sentences = [sp.EncodeAsPieces(sentence) for sentence in corpus]

In [42]:
# Peek at ten consecutive pieces of the first tokenized line.
select_from = 130
preview_window = slice(select_from, select_from + 10)

sentences[0][preview_window]

['lʌń', 'he', 'e', 'ńgʌl', 'ʃɐ', 'fɜ', 'tik', 'sɛt', 'ɛn', 'kod']

Now, training Word2Vec

In [13]:
from gensim.models.word2vec import Word2Vec

In [14]:
# File the trained Word2Vec model is saved to and later reloaded from.
W2V_MODEL_PATH = f"models/w2v_100_v{version}.model"

In [16]:
# Train Word2Vec on the tokenized corpus (one list of pieces per line).
# NOTE(review): no vector size is passed although the save path contains
# "100" — presumably this relies on gensim's default; confirm. No seed is set
# either, so results may differ between runs.
model = Word2Vec(sentences, window=5, min_count=0, workers=4)

In [18]:
# Persist the trained model so the testing section can reload it from disk.
model.save(W2V_MODEL_PATH)

## Step 3 - Testing

In [5]:
from g2p_en import G2p

In [8]:
# Reload the saved model from disk; this rebinds `model`.
model = Word2Vec.load(W2V_MODEL_PATH)

In [9]:
# Grapheme-to-phoneme converter used by the helper functions defined below.
g2p = G2p()

In [69]:
def convert_to_phonemes(word):
    """Return the g2p transcription of `word` as one lowercase, letters-only string.

    The items produced by `g2p(word)` are concatenated, every non-alphabetic
    character is dropped, and the remaining characters are lowercased.
    """
    joined = "".join(g2p(word))
    return "".join(symbol.lower() for symbol in joined if symbol.isalpha())

In [10]:
def most_similar_pwords(test_word, topn=5):
    """Return the entries of the Word2Vec vocabulary most similar to a word.

    `test_word` is converted with `convert_to_phonemes` (the same conversion
    used by the scoring cell), and the resulting string is looked up in
    `model.wv`.

    Args:
        test_word: Word to convert and look up.
        topn: Number of neighbours to return (defaults to 5).

    Returns:
        Whatever `model.wv.most_similar` returns for the converted string.

    Note:
        Lookup errors for strings missing from the vocabulary are not handled
        here and propagate to the caller.
    """
    return model.wv.most_similar(convert_to_phonemes(test_word), topn=topn)

In [45]:
# Query the vocabulary directly with a phoneme string (no g2p conversion).
# NOTE(review): the saved output of this cell uses a different symbol set than
# the outputs of the g2p-based cells that follow; they may come from different
# model versions — re-run to confirm.
model.wv.most_similar("hyómɐn")

[('hyómʌn', 0.9748860001564026),
 ('dɨvæn', 0.7308272123336792),
 ('ɖivæn', 0.7231025695800781),
 ('divæn', 0.6888520121574402),
 ('ɖɨvæn', 0.6820935010910034),
 ('ɐvmän', 0.6775056719779968),
 ('ʌvmän', 0.6167342662811279),
 ('sośɐl', 0.611885130405426),
 ('mɔrʌl', 0.6032884120941162),
 ('äbsʌlót', 0.5927882194519043)]

In [24]:
# Top-5 neighbours of the g2p conversion of "human".
most_similar_pwords("human")

[('dhahhhyuwmahn', 0.752903401851654),
 ('spihrihchahwahl', 0.7227702140808105),
 ('ihksternahl', 0.7202848196029663),
 ('kaorperiyl', 0.7090017795562744),
 ('raeshahnahl', 0.6954680681228638)]

In [23]:
# Top-5 neighbours of the g2p conversion of "fruit".
most_similar_pwords("fruit")

[('fruwts', 0.7007562518119812),
 ('dhahfruwt', 0.6919422149658203),
 ('vehjhtahbahlz', 0.6699405908584595),
 ('fuwd', 0.6603603363037109),
 ('brehd', 0.6302711963653564)]

In [28]:
# Analogy-style query on phoneme strings given directly: "kihng" and "wuhmahn"
# as positive terms, "maen" as the negative term (presumably the converted
# forms of king / woman / man — confirm).
model.wv.most_similar(positive=["kihng", "wuhmahn"], negative=["maen"], topn=5)

[('kwiyn', 0.6846692562103271),
 ('daoter', 0.6765732765197754),
 ('prihnsehs', 0.6557174921035767),
 ('kawntahs', 0.6210659742355347),
 ('sihster', 0.6079518795013428)]

## Step 4 - Scoring

In [71]:
from scipy.stats import spearmanr

In [68]:
# Load the gold-standard file as [word1, word2, score] rows (tab-separated).
wordsim_scores = []

with open("data/wordsim353_sim_rel/wordsim_similarity_goldstandard.txt") as wordsim_fp:
    for line in wordsim_fp:
        fields = line.split("\t")
        # float() ignores the newline still attached to the last field.
        wordsim_scores.append([fields[0], fields[1], float(fields[2])])

In [75]:
# Compare model similarities with the gold scores. Pairs for which the model
# lookup raises KeyError are left out; `tested` counts the pairs actually used.
gold = []
preds = []
tested = 0

for word_a, word_b, human_score in wordsim_scores:
    phonemes_a = convert_to_phonemes(word_a)
    phonemes_b = convert_to_phonemes(word_b)

    try:
        similarity = model.wv.similarity(phonemes_a, phonemes_b)
    except KeyError:
        continue

    preds.append(similarity)
    gold.append(human_score)
    tested += 1

print(spearmanr(preds, gold)[0], f", tested {tested}/{len(wordsim_scores)} pairs")

0.5078932269309655 , tested 121/203 pairs


In [66]:
# Word Mover's Distance between the SentencePiece pieces of two words.
word1 = "king"
word2 = "cabbage"

# Reuse the shared helper instead of repeating the g2p clean-up loop per word.
final_text1 = convert_to_phonemes(word1)
final_text2 = convert_to_phonemes(word2)

# [1:] drops the first piece of each encoding (a lone '▁' marker in the
# 'apple' example earlier in the notebook).
# NOTE(review): if the tokenizer ever merges '▁' into the first real piece,
# this slice would drop content — confirm.
model.wv.wmdistance(sp.EncodeAsPieces(final_text1)[1:], sp.EncodeAsPieces(final_text2)[1:])

16.86804855485916

In [67]:
# Show the two piece sequences (first piece dropped) used in the distance above.
sp.EncodeAsPieces(final_text1)[1:], sp.EncodeAsPieces(final_text2)[1:]

(['kihng'], ['kaeb', 'ahjh'])

In [65]:
# Similarity score between two vocabulary entries given directly as strings.
model.wv.similarity("fihz", "kehm")

0.40965265

In [76]:
import random

In [79]:
# NOTE(review): looks like a leftover scratch cell — nothing else visible in
# this notebook uses it; consider removing.
random.choice(["a"])

'a'