# Thesis - Step 1

Using pure text data to get best similarity scores.

In [None]:
import os
import numpy as np

## Approach 1 - SentencePiece

This approach uses SentencePiece on text data with only the letters to try and find words.

In [None]:
import sentencepiece as spm

In [None]:
input_file = os.getcwd() + "/data/gtbrg_8m_lines.txt"
version = 8
max_sentence_length = 4002
vocab_size = 19099
model_type = "unigram"
SP_MODEL_NAME = f"models/{model_type}_{vocab_size}_v{version}"

In [None]:
# train sentencepiece model from `botchan.txt` and makes `m.model` and `m.vocab`
# `m.vocab` is just a reference. not used in the segmentation.
spm.SentencePieceTrainer.train(
    f"--input={input_file} " \
    f"--model_type={model_type} " \
    f"--model_prefix={SP_MODEL_NAME} " \
    f"--vocab_size={vocab_size} " \
    f"--max_sentence_length={max_sentence_length} " \
    f"--train_extremely_large_corpus"
)

In [None]:
# with open("data/gutenberg_no_spaces.txt") as fp:
#     print(fp.readlines()[0])

In [None]:
# SP_MODEL_NAME = "models/unigram_8k_v2"

In [None]:
# makes segmenter instance and loads the model file (m.model)
sp = spm.SentencePieceProcessor()
sp.load(f"{SP_MODEL_NAME}.model")

# encode: text => id
print(sp.EncodeAsPieces('apple'))
print(sp.encode_as_ids('boyhood'))
print(sp.encode_as_ids('boy'))
print(sp.encode_as_ids('man'))

Loading the vocabulary created by SentencePiece

In [None]:
with open("data/gutenberg_no_spaces.txt") as corpus_file:
    corpus = corpus_file.readlines()

sentences = [sp.EncodeAsPieces(sentence) for sentence in corpus]
# sentences = [' '.join(sentence) for sentence in corpus]
# sentences = [list(sentence) for sentence in corpus]

### Approach 1.1 - Word2Vec

In [20]:
from gensim.models.word2vec import Word2Vec

In [None]:
W2V_MODEL_PATH = f"models/w2v_100_v{version}.model"

In [None]:
model = Word2Vec(sentences, window=5, min_count=0, workers=4)
# model.build_vocab()

In [None]:
model.save(W2V_MODEL_PATH)

In [21]:
model = Word2Vec.load(W2V_MODEL_PATH)

In [None]:
print(sentences[0][:10])

In [22]:
sp.EncodeAsPieces("banana")

['▁', 'ban', 'ana']

In [23]:
model.wv.most_similar("human")

[('thehuman', 0.6968255043029785),
 ('external', 0.6862231492996216),
 ('spiritual', 0.6827719211578369),
 ('rational', 0.6694786548614502),
 ('physical', 0.6432474255561829),
 ('universal', 0.6383942365646362),
 ('complex', 0.6383225321769714),
 ('animate', 0.6229571104049683),
 ('vulgar', 0.6203749179840088),
 ('ofthemind', 0.6202679872512817)]

In [24]:
model.wv.most_similar("apple")

[('tree', 0.6462252140045166),
 ('orange', 0.6357700824737549),
 ('cherry', 0.6354914903640747),
 ('worm', 0.6290103197097778),
 ('egg', 0.6227107048034668),
 ('apples', 0.6179237365722656),
 ('clover', 0.6081827878952026),
 ('cake', 0.6053444147109985),
 ('crab', 0.6045732498168945),
 ('salmon', 0.598516583442688)]

In [25]:
model.wv.most_similar(positive=['woman', 'king'], negative=['man'])

[('queen', 0.7470260858535767),
 ('maiden', 0.6551803350448608),
 ('daughter', 0.6430643796920776),
 ('sister', 0.6402992010116577),
 ('prince', 0.638314962387085),
 ('countess', 0.5871137380599976),
 ('dame', 0.5857516527175903),
 ('thequeen', 0.5773256421089172),
 ('princess', 0.5710912346839905),
 ('damsel', 0.560126543045044)]

In [26]:
model.wv.most_similar("banana")

KeyError: "word 'banana' not in vocabulary"

## Approach 2 - WordPiece tokenizer

In [None]:
from tokenizers.pre_tokenizers import PreTokenizer

class FixedLengthPreTokenizer(PreTokenizer):
    def __init__(self, n=3):
        self.n = n
        super().__init__()

    def pre_tokenize(self, text):
        return [(i, i+self.n) for i in range(0, len(text), self.n)]


In [4]:
from tokenizers import trainers
from tokenizers.models import WordPiece
from tokenizers import Tokenizer

tokenizer = Tokenizer(WordPiece())

trainer = trainers.WordPieceTrainer(
    vocab_size=10000, 
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    initial_alphabet= list(
        "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
    ),
    continuing_subword_prefix="##",
    show_progress=True,
    min_frequency=1
)

# tokenizer.pre_tokenizer = FixedLengthPreTokenizer()  # or another value for n

tokenizer.train(
    files=["data/gutenberg_no_spaces.txt"], 
    trainer=trainer
)

tokenizer.save("models/tokenizer.json")


## Approach 3 - Character CNN

In [1]:
import os
import sys

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [2]:
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32"

In [12]:
# To prevent recomputing alphabet each time
vocab_path = 'data/gutenberg_vocabulary.txt'

vocabulary = sorted(set(open(vocab_path).read().split()))
print(vocabulary)

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'à', 'á', 'â', 'ã', 'ä', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï', 'ñ', 'ò', 'ó', 'ô', 'ö', 'ù', 'û', 'ü', 'ā', 'œ', 'α', 'β', 'δ', 'ε', 'η', 'θ', 'ι', 'κ', 'λ', 'μ', 'ν', 'ο', 'π', 'ρ', 'ς', 'σ', 'τ', 'υ', 'φ', 'ω']


In [13]:
class CharDataset(Dataset):
    def __init__(self, corpus_path, seq_length):
        self.corpus_path = corpus_path
        self.seq_length = seq_length
        self.vocab = vocabulary # Load the unique characters in the corpus
        self.char_to_index = {c: i for i, c in enumerate(self.vocab)} # Map each character to an index
        self.index_to_char = {i: c for i, c in enumerate(self.vocab)} # Map each index to a character
        self.corpus_size = os.path.getsize(corpus_path)
        # self.num_chunks = int(self.corpus_size / (1024 * 1024)) # Split the corpus into 1MB chunks
        # self.chunk_size = int(self.corpus_size / self.num_chunks)
    
    def __len__(self):
        return self.num_chunks
    
    def __getitem__(self, idx):
        start_pos = idx * self.chunk_size
        # end_pos = (idx + 1) * self.chunk_size
        with open(self.corpus_path) as f:
            f.seek(start_pos)
            chunk = f.read(self.chunk_size).replace('\n', '')
        input_seq = chunk[:-1]
        target = chunk[1:]
        input_seq = [self.char_to_index[c] for c in input_seq]
        target = [self.char_to_index[c] for c in target]
        input_seq = torch.LongTensor(input_seq)
        target = torch.LongTensor(target)
        return input_seq, target


In [5]:
class CharacterCNN(nn.Module):
    def __init__(self, input_size, embedding_size, filter_sizes, num_filters):
        super(CharacterCNN, self).__init__()
        self.embedding_size = embedding_size
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.conv_layers = nn.ModuleList([
            nn.Conv1d(in_channels=embedding_size, out_channels=num_filters, kernel_size=fs, padding=0)
            for fs in filter_sizes
        ])
        self.fc = nn.Linear(len(filter_sizes) * num_filters, input_size)
        
    def forward(self, input):
        # input shape: (seq_len, batch_size)
        embedded = self.embedding(input) # shape: (seq_len, batch_size, embedding_size)
        embedded = embedded.permute(1, 2, 0) # shape: (batch_size, embedding_size, seq_len)
        conv_outputs = []
        for conv in self.conv_layers:
            conv_outputs.append(torch.relu(conv(embedded)))
        pooled_outputs = [torch.max(conv_output, dim=-1)[0] for conv_output in conv_outputs]
        fc_input = torch.cat(pooled_outputs, dim=-1)
        output = self.fc(fc_input)
        # print(output.shape)
        return output


In [6]:
corpus_path = 'data/gutenberg.txt'
seq_length = 20
vocab_size = len(vocabulary)
embedding_size = 128
# output_size = 100 # size of the word embeddings
filter_sizes = [3, 4] # filter sizes for convolutional layers
num_filters = 64 # number of filters for each convolutional layer
batch_size = 4
learning_rate = 0.001
num_epochs = 10
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [7]:
# Create dataset and dataloader
dataset = CharDataset(corpus_path, seq_length)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [8]:
# Create model and optimizer
model = CharacterCNN(vocab_size, embedding_size, filter_sizes, num_filters)
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [11]:
# Training loop
for epoch in range(num_epochs):
    total_loss = 0
    for i, (input_seq, target) in enumerate(dataloader):
        # print(sys.getsizeof(input_seq), sys.getsizeof(target), input_seq.shape)
        print(input_seq.shape, target.shape)
        input_seq = input_seq.to(device)
        print(target.shape)
        target = target.to(device)
        optimizer.zero_grad()
        output = model(input_seq)
        loss = nn.CrossEntropyLoss()(output, target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        if (i + 1) % 100 == 0:
            print(f'Epoch {epoch + 1}/{num_epochs}, Batch {i + 1}/{len(dataloader)}, Loss: {total_loss / (i + 1)}')
    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / len(dataloader)}')

torch.Size([4, 1048739]) torch.Size([4, 1048739])
torch.Size([4, 1048739])
torch.Size([1048739, 71])


ValueError: Expected input batch_size (1048739) to match target batch_size (4).