In [1]:
import csv

# Parse the CSV once and slice the resulting rows, rather than opening and
# re-reading the same file a second time just to get the tail.
with open("data/songs.csv", newline="") as f:
    rows = list(csv.reader(f))

# Row 0 is skipped (presumably the header row -- confirm against the file);
# the last five rows are held out from `data` and kept as `test_data`.
data = rows[1:-5]
test_data = rows[-5:]


# Part 1

In [2]:
from transformers import GPT2TokenizerFast, PreTrainedTokenizerFast

# Load the pretrained GPT-2 fast tokenizer identified by 'openai-community/gpt2'.
tokenizer = GPT2TokenizerFast.from_pretrained('openai-community/gpt2')
# Disabled experiment: overriding the special tokens directly on this instance.
# tokenizer.bos_token = '<s>'
# tokenizer.eos_token = '</s>'
# tokenizer.pad_token = '<|endoftext|>'

In [None]:
# Display the special tokens currently configured on the tokenizer.
tokenizer.special_tokens_map

In [4]:
import numpy as np
from tqdm import tqdm
from typing import List, Union, Dict
from a2_p1_murugan_116745378 import TrigramLM, get_perplexity

In [None]:
# Tokenize the text stored at index 2 of every training row.
tokenized_data = list(map(lambda row: tokenizer.tokenize(row[2]), data))
# TODO: check whether sequences longer than 1024 cause a problem
# TODO: check whether newlines need custom handling

In [None]:
class CustomTokenizer(GPT2TokenizerFast):
    """GPT-2 fast tokenizer with custom BOS/EOS/PAD tokens and chunked tokenization.

    On construction the tokenizer registers "<s>" as BOS, "</s>" as EOS and
    "<|endoftext|>" as PAD. `tokenize2` tokenizes long text piece by piece.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # A single call registers all three special tokens; the extra
        # pad-only call that used to follow repeated the same registration.
        self.add_special_tokens(
            {
                "bos_token": "<s>",
                "eos_token": "</s>",
                "pad_token": "<|endoftext|>",
            }
        )

    def tokenize2(self, text: str, max_chars: int = 1024) -> List[str]:
        """Tokenize `text` in pieces of at most `max_chars` characters.

        Each piece ends immediately before a whitespace character whenever the
        window contains one, so a word is not cut in half and the following
        piece starts with the whitespace that precedes its first word. A
        window without any whitespace is cut at `max_chars`.

        NOTE(review): the limit is counted in characters, whereas the
        "sequence length > 1024" concern is normally counted in tokens --
        confirm which one is intended.

        Args:
            text: The string to tokenize.
            max_chars: Maximum number of characters handed to the underlying
                tokenizer per piece. Defaults to 1024.

        Returns:
            The concatenated token strings of all pieces, in order.

        Raises:
            ValueError: If `max_chars` is smaller than 1.
        """
        if max_chars < 1:
            raise ValueError("max_chars must be at least 1")

        tokens: List[str] = []
        start, total = 0, len(text)
        while start < total:
            end = min(start + max_chars, total)
            if end < total:
                # Walk back to the closest whitespace character so the cut
                # lands between words rather than inside one.
                cut = end
                while cut > start and not text[cut].isspace():
                    cut -= 1
                if cut > start:
                    end = cut
            tokens.extend(super().tokenize(text[start:end]))
            start = end
        return tokens



# Rebind `tokenizer` to the CustomTokenizer subclass, loaded from the same pretrained GPT-2 identifier.
tokenizer = CustomTokenizer.from_pretrained("openai-community/gpt2")

In [None]:
# Tokenize the same text (index 2 of the first row) with both methods so the results can be compared.
a = tokenizer.tokenize(data[0][2])
b = tokenizer.tokenize2(data[0][2])

In [None]:
# Compare the token at one index of the two token lists `a` and `b`.
i = 283
a[i], b[i]

In [None]:
# Number of tokens tokenize2 produces for the first row's text.
len(tokenizer.tokenize2(data[0][2]))

In [None]:
# Build the trigram model around the tokenizer and train it on the text
# stored at index 2 of every training row.
lmodel = TrigramLM(tokenizer)
lmodel.train([row[2] for row in data])

In [None]:
# Show the tokenizer's unknown-token string.
tokenizer.unk_token

In [None]:
# Look up the model's unigram count entry for the "<|endoftext|>" token.
lmodel.unigram_count["<|endoftext|>"]

In [None]:
"|<endoftext>|" in tokenizer.vocab

In [None]:
# Query nextProb with a one-token history and five candidate tokens ("Ġ" is GPT-2's leading-space marker).
# presumably returns a probability per candidate given the history -- confirm against TrigramLM.nextProb
lmodel.nextProb(["I"], ["Ġremember", "Ġwhen", "ĠI", "Ġwas", "Ġyoung"])

In [None]:
# Query nextProb with an empty history and "<s>" as the only candidate.
lmodel.nextProb([], ["<s>"])

In [None]:
# Query nextProb with a two-token history and four candidate tokens.
lmodel.nextProb(["I", "Ġremember"], ["Ġwhen", "ĠI", "Ġwas", "Ġyoung"])

In [None]:
# Query nextProb with a history that starts with the "<s>" token, followed by "I".
lmodel.nextProb(["<s>", "I"], ["Ġremember", "Ġwhen", "ĠI", "Ġwas", "Ġyoung"])

In [18]:
# Keep the result of get_sequence_probability for a three-token sequence in `prob`;
# the following cell passes it to get_perplexity.
prob = lmodel.get_sequence_probability(["I", "Ġremember", "Ġwhen"])

In [None]:
# Perplexity computed from `prob`, the get_sequence_probability result stored by the previous cell.
get_perplexity(prob)

# Part 2

In [20]:
from a2_p2_murugan_116745378 import RecurrentLM, process_data
from torch import nn

In [None]:
# Start Part 2 from a fresh stock GPT-2 tokenizer and report its special
# tokens, their ids, and the base vocabulary size.
tokenizer = GPT2TokenizerFast.from_pretrained('openai-community/gpt2')

special_names = ("bos", "eos", "pad", "unk")
print(*(getattr(tokenizer, f"{name}_token") for name in special_names))
print(*(getattr(tokenizer, f"{name}_token_id") for name in special_names))
print(tokenizer.vocab_size)

In [None]:

# Register "<s>" / "</s>" as BOS / EOS and reuse GPT-2's "<|endoftext|>" as
# padding. add_special_tokens sets the attributes and adds any token missing
# from the vocabulary in one step, marking the new entries as special tokens
# rather than ordinary added tokens.
tokenizer.add_special_tokens(
    {"bos_token": "<s>", "eos_token": "</s>", "pad_token": "<|endoftext|>"}
)

print(tokenizer.bos_token, tokenizer.eos_token, tokenizer.pad_token, tokenizer.unk_token)
print(tokenizer.bos_token_id, tokenizer.eos_token_id, tokenizer.pad_token_id, tokenizer.unk_token_id)
# vocab_size only counts the base vocabulary; len(tokenizer) also counts the
# tokens added above, so it is the number that shows the change.
print(len(tokenizer))

In [None]:
"<s>" in tokenizer.vocab
tokenizer.vocab["</s>"]

In [None]:
import torch

# Prototype of the chunking scheme on the first row's text: pad to a multiple
# of the payload length, reshape into rows, then add one leading and one
# trailing column.
token_ids = tokenizer.encode(data[0][2])

chunk_len = 128 - 2  # payload per row; two slots are reserved for the start/end columns
pad_count = -len(token_ids) % chunk_len  # 0 when the length is already a multiple

# 0 / -1 / -2 are placeholder values: only the resulting shapes are inspected.
# Padding goes into a new list so `token_ids` keeps the raw encoding.
padded_ids = token_ids + [0] * pad_count
chunked_token_ids = torch.tensor(padded_ids).reshape(-1, chunk_len)

bos_tensor = torch.full((chunked_token_ids.shape[0], 1), -1)
eos_tensor = torch.full((chunked_token_ids.shape[0], 1), -2)
framed_shape = torch.cat((bos_tensor, chunked_token_ids, eos_tensor), dim=1).shape

# Show both shapes; a bare expression in the middle of a cell is not displayed.
chunked_token_ids.shape, framed_shape

In [None]:

def chunk_tokens(tokens, start_token_id, end_token_id, pad_token_id, chunk_len=128):
    """Split a token-id sequence into fixed-width rows framed by start/end ids.

    The sequence is padded with `pad_token_id` up to a multiple of
    `chunk_len - 2`, reshaped into rows of that width, and every row then gets
    `start_token_id` prepended and `end_token_id` appended. In the last row the
    end id therefore comes after any padding.

    Args:
        tokens: Sequence of integer token ids. It is not modified.
        start_token_id: Id placed in the first column of every row.
        end_token_id: Id placed in the last column of every row.
        pad_token_id: Id used to fill the final row.
        chunk_len: Total row width including the start and end columns.

    Returns:
        An integer tensor of shape (num_rows, chunk_len); (0, chunk_len) when
        `tokens` is empty.

    Raises:
        ValueError: If `chunk_len` leaves no room for at least one token.
    """
    payload_len = chunk_len - 2
    if payload_len < 1:
        raise ValueError("chunk_len must be at least 3 (start + one token + end)")

    # Copy first: extending `tokens` itself would silently grow the caller's list.
    padded = list(tokens)
    if not padded:
        # reshape(-1, n) is ambiguous for zero elements, so build the empty result directly.
        return torch.empty((0, chunk_len), dtype=torch.long)

    padded.extend([pad_token_id] * (-len(padded) % payload_len))

    body = torch.tensor(padded).reshape(-1, payload_len)
    n_rows = body.shape[0]
    start_col = torch.full((n_rows, 1), start_token_id)
    end_col = torch.full((n_rows, 1), end_token_id)
    return torch.cat((start_col, body, end_col), dim=1)

# Try chunk_tokens on a short sentence with rows that are 10 ids wide.
chunk_tokens(
    tokens=tokenizer.encode("Hello, how are you?"),
    start_token_id=tokenizer.bos_token_id,
    end_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    chunk_len=10,
)

In [26]:
import re

# Strip every newline that is directly followed by a bracketed run of
# characters in the range \x20-\x7f, from the text at index 2 of each row.
# The pattern starts with "\n", so a bracketed tag on the very first line
# of a text is left in place.
bracket_tag = re.compile(r'\n\[[\x20-\x7f]+\]')
p_data = [bracket_tag.sub("", row[2]) for row in data]

In [None]:
# Shape of the chunked tensor for the first cleaned text, using rows that are 128 ids wide.
chunk_tokens(tokenizer.encode(p_data[0]), tokenizer.bos_token_id, tokenizer.eos_token_id, tokenizer.pad_token_id, 128).shape

In [28]:
# Toy inputs for trying out CrossEntropyLoss: a 3x4 score matrix (3 samples,
# 4 classes) and one target class index per sample.
# A private seeded generator makes the values reproducible after a kernel
# restart without resetting the global RNG that later sampling relies on.
p = torch.randn(3, 4, generator=torch.Generator().manual_seed(0))
y = torch.tensor([1, 2, 3])

In [None]:
# ignore_index=1 excludes targets equal to 1 from the loss; with y = [1, 2, 3]
# only the last two samples contribute to the value shown.
loss_fn = nn.CrossEntropyLoss(ignore_index=1)
loss_fn(p, y)


In [None]:
# Rebuild the network and restore the trained weights from disk.
# presumably the arguments are (vocabulary size, embedding size, hidden size) -- confirm against RecurrentLM
model = RecurrentLM(len(tokenizer.vocab), 64, 1024)
# map_location="cpu" lets a checkpoint saved from a GPU load on a machine
# without one; weights_only=True restricts unpickling to tensor data instead
# of executing arbitrary pickled objects.
state_dict = torch.load("results/model.pt", map_location="cpu", weights_only=True)
# The model is only evaluated from here on, so switch to inference mode
# (affects layers such as dropout, if the model has any).
model.eval()
model.load_state_dict(state_dict)

In [31]:
# Five literal sentences used for the perplexity check below.
# NOTE: this rebinds `test_data`, replacing the last five CSV rows that the
# first cell stored under the same name.
test_data = [
    "And you gotta live with the bad blood now",
    "Sit quiet by my side in the shade",
    "And I'm not even sorry, nights are so starry",
    "You make me crazier, crazier, crazier, oh",
    "When time stood still and I had you",
]

In [None]:
# Per-sentence perplexity under the recurrent model.
# Run on whatever device the model currently lives on, so the cell also works
# after the model has been moved to a GPU.
eval_device = next(model.parameters()).device
for sentence in test_data:
    token_ids = tokenizer.encode(sentence)
    # The input is the sentence shifted right behind BOS, so the output at
    # position t is scored against token t of the sentence itself. Using
    # token_ids[1:] as targets would pair every prediction with the token one
    # step too late and drop the last position.
    # assumes model(x) returns (logits of shape (len(x), vocab), state) -- confirm against RecurrentLM
    inputs = torch.tensor([tokenizer.bos_token_id] + token_ids[:-1], device=eval_device)
    targets = torch.tensor(token_ids, device=eval_device)

    # Evaluation only: no gradients are needed.
    with torch.no_grad():
        logits, _ = model(inputs)
        probabilities = torch.softmax(logits, dim=1)

    # Pick, for every position, the probability assigned to the actual token.
    prob = torch.gather(probabilities, 1, targets.unsqueeze(1)).squeeze(1).tolist()
    print(f"Perplexity for '{sentence}': {get_perplexity(prob):.2f}")

In [37]:
import importlib
import a2_p2_murugan_116745378
# Reload so edits made to the module since the kernel started are picked up.
importlib.reload(a2_p2_murugan_116745378)
from a2_p2_murugan_116745378 import generate

# Use the GPU when one is available instead of assuming "cuda" exists.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
# presumably 100 is the number of tokens to generate -- confirm against generate()
generated_tokens = generate(model, tokenizer, "<s>I'm hot", 100, device)

In [None]:
# Inspect the raw value returned by generate() before decoding it.
generated_tokens

In [None]:
# Draw one sample from the categorical distribution defined by the last row of `logits`.
# NOTE(review): `logits` is whatever the perplexity loop last assigned (its final sentence) --
# this cell depends on that leftover state; confirm it is intended.
torch.distributions.Categorical(logits=logits[-1:]).sample()

In [None]:
# Decode `generated_tokens` back into a string with the tokenizer.
tokenizer.decode(generated_tokens)