# Thesis - Step 1

Using pure text data to get best similarity scores.

In [1]:
import os
import numpy as np

## Approach 1 - SentencePiece

This approach uses SentencePiece on text data with only the letters to try and find words.

In [2]:
from gensim.models.fasttext import FastText
from gensim.models.word2vec import Word2Vec
import sentencepiece as spm

In [3]:
input_file = os.getcwd() + "/data/gutenberg_no_spaces.txt"
max_sentence_length = 800000
vocab_size = 2000
model_type = "unigram"
SP_MODEL_NAME = f"./models/{model_type}_{vocab_size}_v2"

In [4]:
# train sentencepiece model from `botchan.txt` and makes `m.model` and `m.vocab`
# `m.vocab` is just a reference. not used in the segmentation.
spm.SentencePieceTrainer.train(
    f"--input={input_file} " \
    f"--model_type={model_type} " \
    f"--model_prefix={SP_MODEL_NAME} " \
    f"--vocab_size={vocab_size} " \
    f"--max_sentence_length={max_sentence_length} " \
    f"--train_extremely_large_corpus"
)

sentencepiece_trainer.cc(177) LOG(INFO) Running command: --input=/Users/pranavkelkar/Work/audio-semantics/data/gutenberg_no_spaces.txt --model_type=unigram --model_prefix=./models/unigram_2000_v2 --vocab_size=2000 --max_sentence_length=800000 --train_extremely_large_corpus
sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: /Users/pranavkelkar/Work/audio-semantics/data/gutenberg_no_spaces.txt
  input_format: 
  model_prefix: ./models/unigram_2000_v2
  model_type: UNIGRAM
  vocab_size: 2000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 800000
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
 

RuntimeError: Internal: /private/var/folders/cl/bzdggp5s6pg8nkc2mqprzygc0000gn/T/pip-install-ligh4usx/sentencepiece_4b890a0199234697a57d80c0574772d4/sentencepiece/src/trainer_interface.cc(428) [!sentences_.empty()] 

In [6]:
with open("data/gutenberg_no_spaces.txt") as fp:
    print(fp.readlines()[0])

In [25]:
# SP_MODEL_NAME = "unigram_8k_v2"

In [26]:
# makes segmenter instance and loads the model file (m.model)
sp = spm.SentencePieceProcessor()
sp.load(f"models/{SP_MODEL_NAME}.model")

# encode: text => id
print(sp.EncodeAsPieces('apple'))
print(sp.encode_as_ids('boyhood'))
print(sp.encode_as_ids('boy'))
print(sp.encode_as_ids('man'))

['▁', 'apple']
[0, 6647]
[0, 201, 24]
[0, 58]


Saving the vocabulary created by SentencePiece

In [27]:
vocab = {}
with open(f"models/{SP_MODEL_NAME}.vocab", "r", encoding="utf-8") as f:
    for line in f:
        word, freq = line.strip().split('\t')
        vocab[word] = np.exp(float(freq))


In [28]:
with open("data/gutenberg_no_spaces.txt") as corpus_file:
    corpus = corpus_file.readlines()

sentences = [[' '.join(sp.EncodeAsPieces(sentence)) for sentence in corpus]]
# sentences = [[' '.join(sentence) for sentence in corpus]]
# sentences = [list(sentence) for sentence in corpus]

### Approach 1.1 - Using FastText

In [39]:
model = FastText(size=300, window=3, min_count=0)
model.build_vocab_from_freq(vocab)
model.train(sentences, total_examples=len(sentences), epochs=20)

In [46]:
model.save("models/fasttext_300_u8kv2")

[b'_\xdf\xbc\x91\xcfMT`\xf2#\xa8n[\xa4\xbbEc|\x00\x00\xa2\xc0\x14\xc0\n\x009\x008\x007\x006\x00\x88\x00\x87\x00\x86\x00\x85\xc0\x19\x00:\x00\x89\xc0\x0f\xc0\x05\x005\x00\x84\xc0\x13\xc0\t\x003\x002\x001\x000\x00\x9a\x00\x99\x00\x98\x00\x97\x00E\x00D\x00C\x00B\xc0\x18\x004\x00\x9b\x00F\xc0\x0e\xc0\x04\x00/\x00\x96\x00A\x00\x07\xc0\x11\xc0\x07\xc0\x16\x00\x18\xc0\x0c\xc0\x02\x00\x05\x00\x04\xc0\x12\xc0\x08\x00\x16\x00\x13\x00\x10\x00\r\xc0\x17\x00\x1b\xc0\r\xc0\x03\x00\n\x00\x15\x00\x12\x00\x0f\x00\x0c\x00\x1a\x00\t\x00\x14\x00\x11\x00\x19\x00\x08\x00\x06\x00\x17\x00\x03\xc0\x10\xc0\x06\xc0\x15\xc0\x0b\xc0\x01\x00\x02\x00\x01\x00\xff\x02\x01\x00\x00C\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00', b'\x04\x03\x00\x01\x02\x00\n\x00\x1c\x00']
Bad pipe message: %s [b'\x17\x00\x19\x00\x1c\x00\x1b\x00\x18\x00\x1a\x00\x16\x00\x0e\x00\r\x00\x0b\x00\x0c\x00\t\x00\n']
Bad pipe message: %s [b"Y\x98&\x92\xe8m\xfa'\x96\x81@\x88|\xe1\xd0\xda<\xd4\x00\x00\x86\xc00\xc0,\xc0(\xc0$\xc0\x14\xc0\n\x00\xa5\

In [48]:
print(model.wv.similarity("banana", "fruit"))
print(model.wv.similarity("banana", "apple"))
print(model.wv.similarity("banana", "man"))
print(model.wv.similarity("human", "man"))
print(model.wv.similarity("human", "banana"))

-0.0001073372
0.08859801
-0.053881098
0.24784493
0.04947821


In [50]:
model.wv.most_similar("science")

[('conscience', 0.6583830118179321),
 ('patience', 0.47274458408355713),
 ('thescienceof', 0.4711171090602875),
 ('audience', 0.47084999084472656),
 ('obedience', 0.45668476819992065),
 ('experience', 0.4282777011394501),
 ('impatience', 0.41176164150238037),
 ('scientist', 0.4040490388870239),
 ('convenience', 0.35665076971054077),
 ('lence', 0.3357565999031067)]

### Approach 1.2 - Word2Vec

In [28]:
W2V_MODEL_PATH = "models/w2v_100_v1.model"

In [50]:
model = Word2Vec(sentences, window=5, min_count=0, workers=4)
# model.build_vocab()

In [30]:
model.save(W2V_MODEL_PATH)

In [21]:
# model = Word2Vec.load(W2V_MODEL_PATH)

In [51]:
model.wv.most_similar("cat")

KeyError: "word 'cat' not in vocabulary"

In [11]:
with open("vocab.txt", "w+") as fp:
    fp.write(str(model.wv.vocab.keys()))

## Approach 2 - WordPiece tokenizer

In [None]:
from tokenizers.pre_tokenizers import PreTokenizer

class FixedLengthPreTokenizer(PreTokenizer):
    def __init__(self, n=3):
        self.n = n
        super().__init__()

    def pre_tokenize(self, text):
        return [(i, i+self.n) for i in range(0, len(text), self.n)]


In [4]:
from tokenizers import trainers
from tokenizers.models import WordPiece
from tokenizers import Tokenizer

tokenizer = Tokenizer(WordPiece())

trainer = trainers.WordPieceTrainer(
    vocab_size=10000, 
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    initial_alphabet= list(
        "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
    ),
    continuing_subword_prefix="##",
    show_progress=True,
    min_frequency=1
)

# tokenizer.pre_tokenizer = FixedLengthPreTokenizer()  # or another value for n

tokenizer.train(
    files=["data/gutenberg_no_spaces.txt"], 
    trainer=trainer
)

tokenizer.save("models/tokenizer.json")


## Approach 3 - Character CNN

In [1]:
import os
import sys

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [2]:
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32"

In [12]:
# To prevent recomputing alphabet each time
vocab_path = 'data/gutenberg_vocabulary.txt'

vocabulary = sorted(set(open(vocab_path).read().split()))
print(vocabulary)

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'à', 'á', 'â', 'ã', 'ä', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï', 'ñ', 'ò', 'ó', 'ô', 'ö', 'ù', 'û', 'ü', 'ā', 'œ', 'α', 'β', 'δ', 'ε', 'η', 'θ', 'ι', 'κ', 'λ', 'μ', 'ν', 'ο', 'π', 'ρ', 'ς', 'σ', 'τ', 'υ', 'φ', 'ω']


In [13]:
class CharDataset(Dataset):
    def __init__(self, corpus_path, seq_length):
        self.corpus_path = corpus_path
        self.seq_length = seq_length
        self.vocab = vocabulary # Load the unique characters in the corpus
        self.char_to_index = {c: i for i, c in enumerate(self.vocab)} # Map each character to an index
        self.index_to_char = {i: c for i, c in enumerate(self.vocab)} # Map each index to a character
        self.corpus_size = os.path.getsize(corpus_path)
        # self.num_chunks = int(self.corpus_size / (1024 * 1024)) # Split the corpus into 1MB chunks
        # self.chunk_size = int(self.corpus_size / self.num_chunks)
    
    def __len__(self):
        return self.num_chunks
    
    def __getitem__(self, idx):
        start_pos = idx * self.chunk_size
        # end_pos = (idx + 1) * self.chunk_size
        with open(self.corpus_path) as f:
            f.seek(start_pos)
            chunk = f.read(self.chunk_size).replace('\n', '')
        input_seq = chunk[:-1]
        target = chunk[1:]
        input_seq = [self.char_to_index[c] for c in input_seq]
        target = [self.char_to_index[c] for c in target]
        input_seq = torch.LongTensor(input_seq)
        target = torch.LongTensor(target)
        return input_seq, target


In [5]:
class CharacterCNN(nn.Module):
    def __init__(self, input_size, embedding_size, filter_sizes, num_filters):
        super(CharacterCNN, self).__init__()
        self.embedding_size = embedding_size
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.conv_layers = nn.ModuleList([
            nn.Conv1d(in_channels=embedding_size, out_channels=num_filters, kernel_size=fs, padding=0)
            for fs in filter_sizes
        ])
        self.fc = nn.Linear(len(filter_sizes) * num_filters, input_size)
        
    def forward(self, input):
        # input shape: (seq_len, batch_size)
        embedded = self.embedding(input) # shape: (seq_len, batch_size, embedding_size)
        embedded = embedded.permute(1, 2, 0) # shape: (batch_size, embedding_size, seq_len)
        conv_outputs = []
        for conv in self.conv_layers:
            conv_outputs.append(torch.relu(conv(embedded)))
        pooled_outputs = [torch.max(conv_output, dim=-1)[0] for conv_output in conv_outputs]
        fc_input = torch.cat(pooled_outputs, dim=-1)
        output = self.fc(fc_input)
        # print(output.shape)
        return output


In [6]:
corpus_path = 'data/gutenberg.txt'
seq_length = 20
vocab_size = len(vocabulary)
embedding_size = 128
# output_size = 100 # size of the word embeddings
filter_sizes = [3, 4] # filter sizes for convolutional layers
num_filters = 64 # number of filters for each convolutional layer
batch_size = 4
learning_rate = 0.001
num_epochs = 10
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [7]:
# Create dataset and dataloader
dataset = CharDataset(corpus_path, seq_length)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [8]:
# Create model and optimizer
model = CharacterCNN(vocab_size, embedding_size, filter_sizes, num_filters)
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [11]:
# Training loop
for epoch in range(num_epochs):
    total_loss = 0
    for i, (input_seq, target) in enumerate(dataloader):
        # print(sys.getsizeof(input_seq), sys.getsizeof(target), input_seq.shape)
        print(input_seq.shape, target.shape)
        input_seq = input_seq.to(device)
        print(target.shape)
        target = target.to(device)
        optimizer.zero_grad()
        output = model(input_seq)
        loss = nn.CrossEntropyLoss()(output, target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        if (i + 1) % 100 == 0:
            print(f'Epoch {epoch + 1}/{num_epochs}, Batch {i + 1}/{len(dataloader)}, Loss: {total_loss / (i + 1)}')
    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / len(dataloader)}')

torch.Size([4, 1048739]) torch.Size([4, 1048739])
torch.Size([4, 1048739])
torch.Size([1048739, 71])


ValueError: Expected input batch_size (1048739) to match target batch_size (4).