# Thesis - Step 1

Using pure text data to get best similarity scores.

In [1]:
import os
import numpy as np

## Approach 1.1 - SentencePiece + FastText

This approach uses SentencePiece on text data with only the letters to try and find words.

In [2]:
from gensim.models.fasttext import FastText
from gensim.models.word2vec import Word2Vec
import sentencepiece as spm

In [3]:
input_file = os.getcwd() + "/data/gutenberg_no_spaces.txt"
max_sentence_length = 180000
vocab_size = 2000
model_type = "unigram"
SP_MODEL_NAME = f"./models/{model_type}_{vocab_size}_v3"

In [None]:
# train sentencepiece model from `botchan.txt` and makes `m.model` and `m.vocab`
# `m.vocab` is just a reference. not used in the segmentation.
spm.SentencePieceTrainer.train(
    f"--input={input_file} " \
    f"--model_type={model_type} " \
    f"--model_prefix={SP_MODEL_NAME} " \
    f"--vocab_size={vocab_size} " \
    f"--max_sentence_length={max_sentence_length} " \
    f"--train_extremely_large_corpus"
)

In [None]:
# makes segmenter instance and loads the model file (m.model)
sp = spm.SentencePieceProcessor()
sp.load(f"{SP_MODEL_NAME}.model")

# encode: text => id
print(sp.EncodeAsPieces('apple'))
print(sp.encode_as_ids('boyhood'))
print(sp.encode_as_ids('man'))

Saving the vocabulary created by SentencePiece

In [4]:
vocab = {}
with open(f"{model_name}.vocab", "r", encoding="utf-8") as f:
    for line in f:
        word, freq = line.strip().split('\t')
        vocab[word] = np.exp(float(freq))


In [5]:
with open("data/gutenberg_no_spaces.txt") as corpus_file:
    corpus = corpus_file.readlines()

sentences = [[' '.join(sp.EncodeAsPieces(sentence)) for sentence in corpus]]

NameError: name 'sp' is not defined

### Approach 1.1 - Using FastText

In [None]:
model = FastText(size=150, window=5, min_count=0)
model.build_vocab_from_freq(vocab)
model.train(sentences, total_examples=len(sentences), epochs=5)

In [16]:
model.save("models/fasttext_150_v1")

In [None]:
model.wv.similarity("plane", "car")

### Approach 1.2 - Word2Vec

In [None]:
W2V_MODEL_PATH = "models/w2v_100_v1.model"

In [11]:
model = Word2Vec(window=5, min_count=0, workers=4)
model.build_vocab(sentences)

In [12]:
model.train(
    sentences,
    total_examples=model.corpus_count,
    epochs=10
)

model.save(W2V_MODEL_PATH)

(14380, 14380)

In [None]:
model = Word2Vec.load(W2V_MODEL_PATH)

In [18]:
model.load()

In [17]:
list(model.wv.vocab.keys())

## Approach 2 - SGT

In [4]:
from sgt import SGT

In [5]:
with open("data/gutenberg_no_spaces.txt") as corpus_file:
    corpus = list("".join(corpus_file.readlines()))

In [11]:
sgt = SGT(
    kappa=10, 
    flatten=True, 
    lengthsensitive=True, 
    mode='default'
)

embeddings = sgt.fit(corpus[:100000])

In [None]:
embeddings[0]