In [None]:
import csv

# Parse the CSV once and slice the rows, instead of opening and reading the
# same file twice. Row 0 is skipped (presumably the header row), the last five
# rows are held out as `test_data`, and everything in between is `data`.
with open("data/songs.csv", newline="") as f:
    reader = csv.reader(f)
    rows = list(reader)

data = rows[1:-5]
test_data = rows[-5:]


# Part 1

In [None]:
from transformers import GPT2TokenizerFast, PreTrainedTokenizerFast

# Load the pretrained GPT-2 fast tokenizer ('openai-community/gpt2').
tokenizer = GPT2TokenizerFast.from_pretrained('openai-community/gpt2')
# Disabled experiment: overriding the special tokens directly on the instance.
# tokenizer.bos_token = '<s>'
# tokenizer.eos_token = '</s>'
# tokenizer.pad_token = '<|endoftext|>'

In [None]:
# Display the special-token mapping of the tokenizer that is currently loaded.
tokenizer.special_tokens_map

In [None]:
import numpy as np
from tqdm import tqdm
from typing import List, Union, Dict
from a2_p1_murugan_116745378 import TrigramLM, get_perplexity

In [None]:
# Tokenize column 2 of every training row with the current tokenizer.
texts = (row[2] for row in data)
tokenized_data = [tokenizer.tokenize(text) for text in texts]
# TODO: check if there is an issue due to sequence length > 1024
# TODO: check if custom newline handling is needed

In [None]:
class CustomTokenizer(GPT2TokenizerFast):
    """GPT-2 fast tokenizer with sentence-boundary special tokens registered.

    On construction it registers ``<s>`` as the BOS token, ``</s>`` as the EOS
    token and ``<|endoftext|>`` as the padding token via ``add_special_tokens``.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # A single call registers all three tokens; the pad token does not
        # need to be registered a second time.
        self.add_special_tokens(
            {
                "bos_token": "<s>",
                "eos_token": "</s>",
                "pad_token": "<|endoftext|>",
            }
        )

    def tokenize2(self, text: str, max_chars: int = 1024) -> List[str]:
        """Tokenize ``text`` window by window and concatenate the results.

        Each window holds at most ``max_chars`` characters. Windows are cut
        immediately before a whitespace character whenever one is available,
        so a word is not split across two windows; GPT-2 tokens carry their
        leading space (e.g. ``"Ġremember"``), which is why the whitespace is
        left at the start of the next window. If a window contains no
        usable whitespace, it is cut hard at ``max_chars``.

        Args:
            text: The string to tokenize.
            max_chars: Maximum number of characters per window (default 1024).

        Returns:
            The token strings of all windows, in order.

        Raises:
            ValueError: If ``max_chars`` is smaller than 1.
        """
        if max_chars < 1:
            raise ValueError("max_chars must be at least 1")

        tokens: List[str] = []
        while text:
            cut = self._split_point(text, max_chars)
            window, text = text[:cut], text[cut:]
            tokens.extend(super().tokenize(window))
        return tokens

    @staticmethod
    def _split_point(text: str, max_chars: int) -> int:
        """Return the index (always >= 1) at which to cut the next window off ``text``."""
        if len(text) <= max_chars:
            return len(text)
        # Walk backwards looking for a whitespace character to start the next
        # window with; stopping at index 1 guarantees forward progress.
        for cut in range(max_chars, 0, -1):
            if text[cut].isspace():
                return cut
        return max_chars



# Rebind `tokenizer` to the subclass defined above, loaded from the same
# pretrained GPT-2 checkpoint; later cells now use this instance.
tokenizer = CustomTokenizer.from_pretrained("openai-community/gpt2")

In [None]:
# Tokenize the same text (column 2 of the first row) with the stock method
# and with the windowed tokenize2 so the two token lists can be compared.
a = tokenizer.tokenize(data[0][2])
b = tokenizer.tokenize2(data[0][2])

In [None]:
# Compare the token found at one index in both token lists.
i = 283
a[i], b[i]

In [None]:
# Number of tokens tokenize2 produces for the first row's text.
len(tokenizer.tokenize2(data[0][2]))

In [None]:
# Build the trigram model around the current tokenizer and train it on
# column 2 of every training row.
lmodel = TrigramLM(tokenizer)
lmodel.train([row[2] for row in data])

In [None]:
# Display the tokenizer's unknown-token string.
tokenizer.unk_token

In [None]:
# Look up the entry stored for "<|endoftext|>" in the model's unigram_count.
# NOTE(review): presumably the number of occurrences seen in training — confirm in TrigramLM.
lmodel.unigram_count["<|endoftext|>"]

In [None]:
# Check that the end-of-text marker is a vocabulary entry. The marker is
# spelled "<|endoftext|>" (pipes inside the angle brackets), matching the
# spelling used everywhere else in this notebook.
"<|endoftext|>" in tokenizer.vocab

In [None]:
# Probe nextProb with a one-token first argument and five tokens in the second.
# NOTE(review): the (history, candidate tokens) reading is inferred from the
# call shape — confirm against TrigramLM.nextProb.
lmodel.nextProb(["I"], ["Ġremember", "Ġwhen", "ĠI", "Ġwas", "Ġyoung"])

In [None]:
# Probe nextProb with an empty first argument and "<s>" as the only token in the second.
lmodel.nextProb([], ["<s>"])

In [None]:
# Probe nextProb with a two-token first argument and four tokens in the second.
lmodel.nextProb(["I", "Ġremember"], ["Ġwhen", "ĠI", "Ġwas", "Ġyoung"])

In [None]:
# Same kind of probe, with "<s>" as the first token of the first argument.
lmodel.nextProb(["<s>", "I"], ["Ġremember", "Ġwhen", "ĠI", "Ġwas", "Ġyoung"])

In [None]:
# Keep the result of get_sequence_probability for a three-token sequence;
# the next cell feeds it to get_perplexity.
prob = lmodel.get_sequence_probability(["I", "Ġremember", "Ġwhen"])

In [None]:
# Pass the value stored in `prob` to get_perplexity and display the result.
get_perplexity(prob)

# Part 2

In [None]:
from a2_p2_murugan_116745378 import RecurrentLM, process_data
from torch import nn

In [None]:
# Start Part 2 from a freshly loaded stock GPT-2 tokenizer.
tokenizer = GPT2TokenizerFast.from_pretrained('openai-community/gpt2')

# Print the special tokens, then their ids, then vocab_size.
special_names = ("bos_token", "eos_token", "pad_token", "unk_token")
print(*(getattr(tokenizer, name) for name in special_names))
print(*(getattr(tokenizer, name + "_id") for name in special_names))
print(tokenizer.vocab_size)

In [None]:

# Register the special tokens in a single call: add_special_tokens both
# links them to bos_token / eos_token / pad_token and adds any that are
# missing from the vocabulary. This is the same call CustomTokenizer uses.
tokenizer.add_special_tokens(
    {
        "bos_token": "<s>",
        "eos_token": "</s>",
        "pad_token": "<|endoftext|>",
    }
)

print(tokenizer.bos_token, tokenizer.eos_token, tokenizer.pad_token, tokenizer.unk_token)
print(tokenizer.bos_token_id, tokenizer.eos_token_id, tokenizer.pad_token_id, tokenizer.unk_token_id)
# vocab_size does not count added tokens, so it stays unchanged here;
# len(tokenizer) is the size including the newly added ones.
print(tokenizer.vocab_size, len(tokenizer))

In [None]:
# A cell only displays its last expression, so return the membership check
# and the id lookup together instead of discarding the first result.
"<s>" in tokenizer.vocab, tokenizer.vocab["</s>"]

In [None]:
import torch

# Prototype of the chunking logic on a single text; chunk_tokens in the next
# cell is the reusable version. 0, -1 and -2 are placeholder pad/start/end ids.
token_ids = tokenizer.encode(data[0][2])

# Payload length per chunk: two slots are kept free for the start/end markers.
chunk_len = 128 - 2
# Number of pad ids needed to reach a whole number of chunks (0 if already aligned).
pad_count = -len(token_ids) % chunk_len

token_ids += [0] * pad_count
chunked_token_ids = torch.tensor(token_ids).reshape(-1, chunk_len)

bos_tensor = torch.full((chunked_token_ids.shape[0], 1), -1)
eos_tensor = torch.full((chunked_token_ids.shape[0], 1), -2)

framed_token_ids = torch.cat((bos_tensor, chunked_token_ids, eos_tensor), dim=1)

# Show both shapes; previously the framed shape was computed and then
# discarded because only the last expression of a cell is displayed.
framed_token_ids.shape, chunked_token_ids.shape

In [None]:

def chunk_tokens(tokens, start_token_id, end_token_id, pad_token_id, chunk_len=128):
    """Split a flat sequence of token ids into fixed-length, framed rows.

    Every row has the layout ``[start_token_id, payload..., end_token_id]``
    where the payload holds ``chunk_len - 2`` ids taken from ``tokens`` in
    order. The last row's payload is right-padded with ``pad_token_id``.

    Args:
        tokens: Sequence of integer token ids. It is not modified.
        start_token_id: Id placed in the first column of every row.
        end_token_id: Id placed in the last column of every row.
        pad_token_id: Id used to fill up the final payload.
        chunk_len: Total row length including the start and end ids.

    Returns:
        A ``torch.long`` tensor of shape ``(n_chunks, chunk_len)``. Empty
        input yields a tensor of shape ``(0, chunk_len)``.

    Raises:
        ValueError: If ``chunk_len`` leaves no room for payload (``< 3``).
    """
    u_chunk_len = chunk_len - 2
    if u_chunk_len < 1:
        raise ValueError("chunk_len must be at least 3 (start + payload + end)")

    # Pad a copy so the caller's list is not extended as a side effect.
    padded = list(tokens)
    padded.extend([pad_token_id] * (-len(padded) % u_chunk_len))

    # An explicit row count (instead of -1) keeps reshape valid for empty input.
    n_chunks = len(padded) // u_chunk_len
    chunked_tokens = torch.tensor(padded, dtype=torch.long).reshape(n_chunks, u_chunk_len)

    # Frame every row with the start and end ids.
    bos_tensor = torch.full((n_chunks, 1), start_token_id, dtype=torch.long)
    eos_tensor = torch.full((n_chunks, 1), end_token_id, dtype=torch.long)
    return torch.cat((bos_tensor, chunked_tokens, eos_tensor), dim=1)

# Smoke test on a short sentence with a row length of 10.
chunk_tokens(
    tokens=tokenizer.encode("Hello, how are you?"),
    start_token_id=tokenizer.bos_token_id,
    end_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    chunk_len=10,
)

In [None]:
import re

# Remove every "newline + [ ... ]" span (characters \x20-\x7f between the
# brackets) from column 2 of each training row.
bracket_tag = re.compile(r'\n\[[\x20-\x7f]+\]')
p_data = [bracket_tag.sub("", row[2]) for row in data]

In [None]:
# Chunk the first cleaned text into rows of 128 ids and show the resulting shape.
first_chunks = chunk_tokens(
    tokens=tokenizer.encode(p_data[0]),
    start_token_id=tokenizer.bos_token_id,
    end_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    chunk_len=128,
)
first_chunks.shape

In [None]:
# Run process_data over column 2 of every training row.
# NOTE(review): this passes the raw column, not the bracket-stripped `p_data`
# built in an earlier cell — confirm that is intended.
raw_texts = [row[2] for row in data]
tensor_data = process_data(raw_texts, tokenizer)

In [None]:
from torch.utils.data import TensorDataset, DataLoader

# Shifted pairs: X drops the last column and y drops the first, so y[:, t]
# is the entry that follows X[:, t] in the same row.
X, y = tensor_data[:, :-1], tensor_data[:, 1:]
dataset = TensorDataset(X, y)
# Batches of 32 rows; an incomplete final batch is dropped.
dataloader = DataLoader(dataset, drop_last=True, batch_size=32)


In [None]:
# Build the recurrent LM with the first argument taken from the current
# tokenizer's vocabulary.
# NOTE(review): 64 and 1024 are presumably the embedding and hidden sizes —
# confirm against RecurrentLM's signature.
model = RecurrentLM(len(tokenizer.vocab), 64, 1024)

In [None]:
# Fetch only the first batch instead of materialising the whole DataLoader
# with list() (the loader does not shuffle, so this is the same batch).
X_batch, y_batch = next(iter(dataloader))
# The model returns two values.
# NOTE(review): presumably (logits, hidden state) — confirm in RecurrentLM.
a, b = model(X_batch)
a.shape, y_batch.shape

In [None]:
# Shapes of the full input and target tensors.
X.shape, y.shape

In [None]:
# Shapes of the two values returned by the model for the first batch.
a.shape, b.shape

In [None]:
# Shapes of the first batch's inputs and targets, pulled with next(iter(...))
# rather than building list(dataloader) twice.
first_inputs, first_targets = next(iter(dataloader))
first_inputs.shape, first_targets.shape


In [None]:
# Toy data for trying out CrossEntropyLoss: a random 3x4 score matrix and
# three integer targets.
p = torch.randn(3, 4)
# NOTE(review): this rebinds `y`, which an earlier cell set to the training
# targets (tensor_data[:, 1:]); re-run that cell before pairing `y` with `X` again.
y = torch.tensor([1, 2, 3])
# loss_fn = nn.CrossEntropyLoss(ignore_index=-1)

In [None]:
# Cross-entropy that ignores targets equal to 1; evaluated on the toy `p`
# and `y` from the previous cell, whose first target is 1.
loss_fn = nn.CrossEntropyLoss(ignore_index=1)
loss_fn(p, y)


In [None]:
# Forward pass over the whole tensor X in a single call; the second return
# value is discarded.
# NOTE(review): no batching and no torch.no_grad() here — check memory use if X is large.
logits, _ = model(X)