# Thesis - Step 1

Using pure text data to get best similarity scores.

In [1]:
import os
import numpy as np

## Approach 1 - SentencePiece

This approach uses SentencePiece on text data with only the letters to try and find words.

In [2]:
import sentencepiece as spm

In [3]:
input_file = os.getcwd() + "/data/gtbrg_8m_lines.txt"
version = "s1"
max_sentence_length = 5003
vocab_size = 19099
model_type = "unigram"
SP_MODEL_NAME = f"models/{model_type}_{vocab_size}_{version}"

In [None]:
# train sentencepiece model from `botchan.txt` and makes `m.model` and `m.vocab`
# `m.vocab` is just a reference. not used in the segmentation.
spm.SentencePieceTrainer.train(
    f"--input={input_file} " \
    f"--model_type={model_type} " \
    f"--model_prefix={SP_MODEL_NAME} " \
    f"--vocab_size={vocab_size} " \
    f"--max_sentence_length={max_sentence_length} " \
    f"--train_extremely_large_corpus"
)

In [4]:
# makes segmenter instance and loads the model file (m.model)
sp = spm.SentencePieceProcessor()
sp.load(f"{SP_MODEL_NAME}.model")

# encode: text => id
print(sp.EncodeAsPieces('apple'))
print(sp.encode_as_ids('boyhood'))
print(sp.encode_as_ids('boy'))
print(sp.encode_as_ids('man'))

['▁', 'apple']
[3, 16882]
[3, 616]
[3, 62]


Loading the vocabulary created by SentencePiece

In [None]:
with open(input_file) as corpus_file:
    corpus = corpus_file.readlines()

sentences = [sp.EncodeAsPieces(sentence) for sentence in corpus]
# sentences = [' '.join(sentence) for sentence in corpus]
# sentences = [list(sentence) for sentence in corpus]

### Approach 1.1 - Word2Vec

In [5]:
from gensim.models.word2vec import Word2Vec

In [6]:
W2V_MODEL_PATH = f"models/w2v_100_{version}.model"

In [None]:
model = Word2Vec(sentences, window=5, min_count=0, workers=4)
# model.build_vocab()

In [None]:
model.save(W2V_MODEL_PATH)

### Testing

In [7]:
model = Word2Vec.load(W2V_MODEL_PATH)

In [None]:
print(sentences[0][:10])

In [8]:
sp.EncodeAsPieces("banana")

['▁', 'ban', 'ana']

In [10]:
model.wv.most_similar("human")

[('thehuman', 0.7530105113983154),
 ('sensuous', 0.749160885810852),
 ('spiritual', 0.7185163497924805),
 ('external', 0.7111231684684753),
 ('supernatural', 0.7045252323150635),
 ('creative', 0.7019811868667603),
 ('corporeal', 0.6909513473510742),
 ('rational', 0.6686607599258423),
 ('universal', 0.6654500961303711),
 ('complex', 0.6542152166366577)]

In [None]:
model.wv.most_similar("apple")

In [25]:
model.wv.most_similar(positive=['woman', 'king'], negative=['man'])

[('queen', 0.7470260858535767),
 ('maiden', 0.6551803350448608),
 ('daughter', 0.6430643796920776),
 ('sister', 0.6402992010116577),
 ('prince', 0.638314962387085),
 ('countess', 0.5871137380599976),
 ('dame', 0.5857516527175903),
 ('thequeen', 0.5773256421089172),
 ('princess', 0.5710912346839905),
 ('damsel', 0.560126543045044)]

In [26]:
model.wv.most_similar("banana")

KeyError: "word 'banana' not in vocabulary"

In [9]:
from scipy.stats import spearmanr
from sklearn.metrics.pairwise import cosine_similarity

In [10]:
wordsim_scores = []

with open("data/wordsim353_sim_rel/wordsim_similarity_goldstandard.txt") as wordsim_fp:
    for line in wordsim_fp.readlines():
        scores = line.split("\t")
        w1, w2 = scores[0], scores[1]
        gold_score = float(scores[2])
        wordsim_scores.append([w1, w2, gold_score])

In [11]:
gold_vocab = []
gold_all = []
preds_vocab = []
preds_all = []
tested = 0
oov = 0

for pairs in wordsim_scores:
    w1, w2 = pairs[0].lower(), pairs[1].lower()
    
    try:
        pred = model.wv.similarity(w1, w2)
        preds_vocab.append(pred)
        gold_vocab.append(pairs[2])
        tested += 1
    
    except KeyError:
        if w1 not in model.wv.vocab.keys():
            w1_units = sp.EncodeAsPieces(w1)[1:]
            w1_vectors = np.array([model.wv[unit] for unit in w1_units])
            w1_vector = w1_vectors.mean(axis=0)
        else:
            w1_vector = model.wv[w1]
        if w2 not in model.wv.vocab.keys():
            w2_units = sp.EncodeAsPieces(w2)[1:]
            w2_vectors = np.array([model.wv[unit] for unit in w2_units])
            w2_vector = w2_vectors.mean(axis=0)
        else:
            w2_vector = model.wv[w2]

        pred = cosine_similarity(w1_vector.reshape(1, -1), w2_vector.reshape(1, -1))
        oov += 1
    
    preds_all.append(pred)
    gold_all.append(pairs[2])
        

print(spearmanr(preds_vocab, gold_vocab)[0], f", tested {tested}/{len(wordsim_scores)} pairs")
print(spearmanr(preds_all, gold_all)[0], f", including OOV")

0.6076737958802871 , tested 134/203 pairs
0.4252012334227323 , including OOV


  a = np.asarray(a)


In [15]:
simlex_scores = []

with open("data/SimLex-999/SimLex-999.txt") as simlex_fp:
    for line in simlex_fp.readlines()[1:]:
        scores = line.split("\t")
        w1, w2 = scores[0], scores[1]
        gold_score = float(scores[3])
        simlex_scores.append([w1, w2, gold_score])

In [16]:
gold_vocab = []
gold_all = []
preds_vocab = []
preds_all = []
tested = 0
oov = 0

for pairs in simlex_scores:
    w1, w2 = pairs[0].lower(), pairs[1].lower()
    
    try:
        pred = model.wv.similarity(w1, w2)
        preds_vocab.append(pred)
        gold_vocab.append(pairs[2])
        tested += 1
    
    except KeyError:
        if w1 not in model.wv.vocab.keys():
            w1_units = sp.EncodeAsPieces(w1)[1:]
            w1_vectors = np.array([model.wv[unit] for unit in w1_units])
            w1_vector = w1_vectors.mean(axis=0)
        else:
            w1_vector = model.wv[w1]
        if w2 not in model.wv.vocab.keys():
            w2_units = sp.EncodeAsPieces(w2)[1:]
            w2_vectors = np.array([model.wv[unit] for unit in w2_units])
            w2_vector = w2_vectors.mean(axis=0)
        else:
            w2_vector = model.wv[w2]

        pred = cosine_similarity(w1_vector.reshape(1, -1), w2_vector.reshape(1, -1))
        oov += 1
    
    preds_all.append(pred)
    gold_all.append(pairs[2])
        

print(spearmanr(preds_vocab, gold_vocab)[0], f", tested {tested}/{len(wordsim_scores)} pairs")
print(spearmanr(preds_all, gold_all)[0], f", including OOV")

0.2814099441077932 , tested 808/1202 pairs
0.18743511615356478 , including OOV


  a = np.asarray(a)
