# Kaosformer (Split)

In [1]:
%load_ext autoreload
%autoreload 2

%load_ext tensorboard

In [2]:
import math
from datetime import datetime

import numpy as np
import torch
from torch import nn
from torch.utils import tensorboard
import torch.nn.functional as F
from tqdm.notebook import tqdm

In [3]:
# Run on the GPU whenever CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Device: {device}")

Device: cuda


In [4]:
def get_size(module):
    """Count the scalar parameters held by a module.

    Args:
        module: Object exposing ``parameters()`` that yields tensors
            (e.g. a ``torch.nn.Module``).

    Returns:
        int: Total number of elements over all parameters.
    """
    # ``numel()`` is an exact Python int even for 0-dim parameters, whereas
    # ``np.prod(())`` evaluates to the float 1.0 and turns the total into a float.
    return sum(param.numel() for param in module.parameters())

## Data

In [5]:
def get_split(data, ratio=(9, 1)):
    """Split a sequence into a leading train part and a trailing validation part.

    Args:
        data: Any sliceable sequence supporting ``len`` (string, list, ...).
        ratio: Sequence of weights; the train part receives
            ``ratio[0] / sum(ratio)`` of the items (rounded down) and the
            validation part receives everything after it.

    Returns:
        tuple: ``(train_data, valid_data)`` slices of ``data``.
    """
    # The default is an immutable tuple so it can never be mutated between calls.
    n_split = int(len(data) * ratio[0] / sum(ratio))
    return data[:n_split], data[n_split:]

In [6]:
class Tokenizer:
    """Character-level tokenizer whose vocabulary is the set of characters of a text."""

    def __init__(self, text):
        # The vocabulary is derived from the stored text by ``char``.
        self.text = text
        self.vocab = self.char()
        self.char2id = {symbol: index for index, symbol in enumerate(self.vocab)}
        self.id2char = {index: symbol for index, symbol in enumerate(self.vocab)}

    def __len__(self):
        """Return the number of distinct characters in the vocabulary."""
        return len(self.vocab)

    def char(self):
        """Return the distinct characters of ``self.text`` in sorted order."""
        return sorted(set(self.text))

    def encode(self, x):
        """Map every character of ``x`` to its id (``KeyError`` if unknown)."""
        return [self.char2id[symbol] for symbol in x]

    def decode(self, x):
        """Join the characters belonging to the ids in ``x`` into one string."""
        return "".join(self.id2char[token] for token in x)

    def decode_tensor(self, x):
        """Decode an object exposing ``tolist()`` (e.g. a 1-D tensor of ids)."""
        ids = x.tolist()
        return self.decode(ids)

    def decode_batch(self, x):
        """Decode each row ``x[i]`` for ``i`` in ``range(x.shape[0])``."""
        decoded = []
        for row_index in range(x.shape[0]):
            decoded.append(self.decode_tensor(x[row_index]))
        return decoded

In [7]:
class TextLoader:
    """Random sampler of (sequence, target, label) windows over encoded data."""

    def __init__(self, data, batch_size, seq_length):
        self.data = data
        self.batch_size = batch_size
        self.seq_length = seq_length

    def __call__(self):
        """Draw one batch of windows at random start offsets.

        For a start offset ``s`` and ``L = seq_length`` the three windows are
        ``data[s:s + L]`` (sequence), ``data[s + L:s + 2L]`` (target) and
        ``data[s + L + 1:s + 2L + 1]`` (label, i.e. the target shifted by one).

        Returns:
            tuple: three stacked tensors, each with ``batch_size`` rows.
        """
        span = self.seq_length
        starts = torch.randint(low=0, high=len(self.data) - (2 * span + 1),
                               size=(self.batch_size,)).tolist()

        def window(begin, end):
            # Slice the underlying data and wrap the slice in a tensor.
            return torch.tensor(self.data[begin:end])

        sequences = torch.stack([window(s, s + span) for s in starts])
        targets = torch.stack([window(s + span, s + 2 * span) for s in starts])
        labels = torch.stack([window(s + 1 + span, s + 1 + 2 * span) for s in starts])
        return sequences, targets, labels

    def __len__(self):
        """Return the number of items in the underlying data."""
        return len(self.data)

In [8]:
# Read the whole corpus, then drop trailing whitespace before splitting.
with open("data/kaorpus.txt", "r") as file:
    kaorpus = file.read()
kaorpus = kaorpus.rstrip()

# 19:1 split -> the first 95% is used for training, the remainder for validation.
kaorpus_train, kaorpus_valid = get_split(kaorpus, ratio=[19, 1])

In [9]:
# Build the vocabulary from the full corpus so train and validation share ids.
tokenizer = Tokenizer(kaorpus)

In [10]:
# One loader per split; both use batches of 64 windows of 256 tokens.
train_loader = TextLoader(
    data=tokenizer.encode(kaorpus_train),
    batch_size=64,
    seq_length=256,
)
valid_loader = TextLoader(
    data=tokenizer.encode(kaorpus_valid),
    batch_size=64,
    seq_length=256,
)

print(f"Number of characters in train_loader: {len(train_loader)}\n"
      f"                    and valid_loader: {len(valid_loader)}")

Number of characters in train_loader: 1486815
                    and valid_loader: 78254


## Positional Embedding

In [11]:
class PositionalEmbedding(nn.Module):
    """Fixed sinusoidal positional encoding followed by dropout.

    Args:
        d_model: Size of the last (feature) dimension of the inputs. Both even
            and odd values are supported.
        seq_length: Number of positions in the precomputed table; inputs must
            not have more positions than this along dim 1.
        dropout: Dropout probability applied after adding the encoding.
        device: Optional device the table is moved to before registration.
    """

    def __init__(self, d_model, seq_length, dropout=0.1, device=None):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Compute the positional encodings once in log space.
        embedding = torch.zeros(seq_length, d_model)
        position = torch.arange(0, seq_length).unsqueeze(1)
        factor = torch.exp(torch.arange(0, d_model, 2) * -(np.log(10000.0) / d_model))
        angles = position * factor
        embedding[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one cosine column fewer than sine
        # columns, so trim the angles to the number of odd-indexed columns.
        embedding[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        embedding = embedding.unsqueeze(dim=0)

        if device is not None:
            embedding = embedding.to(device)
        # Registered as a buffer: saved in the state dict, never trained.
        self.register_buffer("embedding", embedding)

    def forward(self, x):
        """Add the first ``x.shape[1]`` encodings to ``x`` and apply dropout."""
        x = x + self.embedding[:, :x.shape[1]]
        return self.dropout(x)

In [12]:
class NaiveEmbedding(nn.Module):
    """Learned positional embedding added to the inputs.

    Args:
        d_model: Size of each positional vector.
        seq_length: Number of positions in the embedding table.
        device: Optional device on which the table is created.
    """

    def __init__(self, d_model, seq_length, device=None):
        super().__init__()
        self.embedding = nn.Embedding(seq_length, d_model, device=device)
        # Kept as an attribute for callers that read it; forward no longer uses it.
        self.device = device

    def forward(self, inputs):
        """Add the embeddings of positions ``0..inputs.shape[1] - 1`` to ``inputs``."""
        # Build the position ids where the table currently lives, so the lookup
        # keeps working when ``device`` was None or the module was moved later.
        positions = torch.arange(inputs.shape[1], device=self.embedding.weight.device)
        return inputs + self.embedding(positions)

## Model

In [13]:
class TransformerDecoder(nn.Module):
    """Decoder-only stack built from ``nn.TransformerDecoderLayer``.

    Args:
        d_model, nhead, dim_feedforward, dropout, activation: Forwarded to
            ``nn.TransformerDecoderLayer``.
        num_decoder_layers: Number of stacked decoder layers.
        device: Device the stack is moved to.
        **kwargs: Accepted and ignored so the argument dict used for
            ``nn.Transformer`` (e.g. ``num_encoder_layers``) can be reused.
    """

    def __init__(self, d_model, nhead, num_decoder_layers, dim_feedforward, dropout, activation,
                 device, **kwargs):
        super().__init__()
        # batch_first=True: callers pass (batch, seq, d_model) tensors, matching
        # the nn.Transformer variant; without it attention would run across
        # the batch dimension instead of the sequence dimension.
        decoder_layer = nn.TransformerDecoderLayer(d_model=d_model, nhead=nhead,
                                                   dim_feedforward=dim_feedforward, dropout=dropout,
                                                   activation=activation, batch_first=True)
        self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_decoder_layers)
        self.decoder = self.decoder.to(device)

    def forward(self, inputs, *args, **kwargs):
        """Decode ``inputs`` against an all-zero memory.

        Extra positional and keyword arguments are accepted and ignored.
        """
        # zeros_like already matches the dtype and device of ``inputs``.
        memory = torch.zeros_like(inputs)
        return self.decoder(inputs, memory)

In [14]:
class Transformer(nn.Module):
    """Token embedding + positional embedding + transformer + output projection.

    Args:
        embedding_args: Keyword arguments for ``nn.Embedding``; must contain
            ``num_embeddings``, which is also the size of the output logits.
        transformer_args: Keyword arguments for the transformer; must contain
            ``d_model`` and ``dropout``.
        transformer_type: ``"encoder_decoder"`` (``nn.Transformer``) or
            ``"decoder"`` (``TransformerDecoder``).
        positional_type: ``"sinusoidal"`` (``PositionalEmbedding``) or
            ``"torch"`` (``NaiveEmbedding``).
        device: Device on which the sub-modules are created.
        seq_length: Number of positions in the positional table
            (defaults to the previously hard-coded 256).

    Raises:
        NotImplementedError: For an unknown positional or transformer type.
    """

    def __init__(self, embedding_args, transformer_args, transformer_type, positional_type, device,
                 seq_length=256):
        super().__init__()

        self.embedding = nn.Embedding(**embedding_args, device=device)

        if positional_type == "sinusoidal":
            self.positional = PositionalEmbedding(transformer_args["d_model"],
                                                  seq_length=seq_length,
                                                  dropout=transformer_args["dropout"],
                                                  device=device)
        elif positional_type == "torch":
            self.positional = NaiveEmbedding(transformer_args["d_model"], seq_length=seq_length,
                                             device=device)
        else:
            raise NotImplementedError(positional_type)

        if transformer_type == "encoder_decoder":
            self.transformer = nn.Transformer(**transformer_args, batch_first=True, device=device)
        elif transformer_type == "decoder":
            self.transformer = TransformerDecoder(**transformer_args, device=device)
        else:
            raise NotImplementedError(transformer_type)

        self.fc = nn.Linear(transformer_args["d_model"], embedding_args["num_embeddings"],
                            device=device)

    def forward(self, inputs, targets=None, mask=True):
        """Compute output logits for every target position.

        Args:
            inputs: Token ids fed to the source side.
            targets: Token ids fed to the target side; when None, an all-zero
                embedding shaped like the embedded inputs is used instead.
            mask: When true, a causal mask is passed as ``tgt_mask`` so a
                target position cannot attend to later target positions.

        Returns:
            Logits of size ``num_embeddings`` along the last dimension.
        """
        inputs_embed = self.positional(self.embedding(inputs))
        if targets is None:
            # zeros_like already lives on the same device as the embedded inputs.
            targets_embed = torch.zeros_like(inputs_embed)
        else:
            targets_embed = self.positional(self.embedding(targets))

        targets_mask = None
        if mask:
            # The labels are the targets shifted by one step, so without this
            # mask position t could simply read its own label at t + 1.
            length = targets_embed.shape[1]
            targets_mask = torch.triu(
                torch.full((length, length), float("-inf"),
                           dtype=targets_embed.dtype, device=targets_embed.device),
                diagonal=1)

        outputs = self.transformer(inputs_embed, targets_embed, tgt_mask=targets_mask)
        return self.fc(outputs)

In [15]:
# Hyper-parameters of the token embedding and of the transformer itself.
embedding_args = dict(num_embeddings=len(tokenizer), embedding_dim=128)
transformer_args = dict(
    d_model=128,
    nhead=8,
    num_encoder_layers=6,
    num_decoder_layers=6,
    dim_feedforward=256,
    dropout=0.1,
    activation="relu",
)

# Encoder-decoder model with learned positional embeddings.
transformer = Transformer(
    embedding_args,
    transformer_args,
    transformer_type="encoder_decoder",
    positional_type="torch",
    device=device,
)
print(f"Number of parameters: {get_size(transformer)}")

Number of parameters: 2040653


## Loops

In [None]:
@torch.no_grad()
def validate(valid_loader, model, criterion, device, num_batches=16):
    """Estimate the mean loss of ``model`` on batches drawn from ``valid_loader``.

    The model is switched to eval mode and is not switched back.

    Args:
        valid_loader: Callable returning ``(inputs, targets, labels)`` tensors.
        model: Module called as ``model(inputs, targets)``; its output is
            unpacked as ``(batch, seq, vocab)``.
        criterion: Loss called with the flattened outputs and labels.
        device: Device the batch tensors are moved to.
        num_batches: Number of batches to average over (16 by default).

    Returns:
        float: Mean of the per-batch losses.
    """
    model.eval()

    losses = []
    for _ in range(num_batches):
        # Sample from the loader that was passed in, not from a global one.
        inputs, targets, labels = valid_loader()

        inputs = inputs.to(device)
        targets = targets.to(device)
        labels = labels.to(device)

        outputs = model(inputs, targets)

        # Flatten batch and sequence dimensions so every position is one sample.
        batch_size, seq_length, vocab_size = outputs.shape
        outputs = outputs.reshape(batch_size * seq_length, vocab_size)
        labels = labels.reshape(batch_size * seq_length)

        losses.append(criterion(outputs, labels).item())

    return sum(losses) / len(losses)

In [16]:
@torch.no_grad()
def generate(model, prompt, tokenizer, max_length, seq_length, device):
    """Sample ``max_length`` tokens after ``prompt``, one token at a time.

    At every step the last ``seq_length`` tokens are passed to the model as
    both inputs and targets (with ``mask=False``), and the next token is drawn
    from the softmax of the logits at the final position.

    Returns:
        str: The decoded prompt followed by the sampled continuation.
    """
    model.eval()

    token_ids = tokenizer.encode(prompt)
    output = torch.tensor(token_ids).to(device).unsqueeze(0)
    for _ in range(max_length):
        context = output[:, -seq_length:]
        # Only the logits of the final position decide the next token.
        step_logits = model(context, context, mask=False)[:, -1, :]
        next_id = torch.multinomial(F.softmax(step_logits, dim=1), 1)
        output = torch.cat([output, next_id], -1)

    return tokenizer.decode_batch(output)[0]


def split_prompt(prompt):
    """Cut ``prompt`` along dim 1 into two halves.

    When the length is odd the first half receives the extra column.
    """
    length = prompt.shape[1]
    head = length - length // 2  # same value as ceil(length / 2)
    return prompt[:, :head], prompt[:, head:]


@torch.no_grad()
def generate_split(model, prompt, tokenizer, max_length, seq_length, device):
    """Sample ``max_length`` tokens, feeding the context as two halves.

    At every step the last ``2 * seq_length`` tokens are split by
    ``split_prompt``; the first half is passed as the model inputs and the
    second half as the targets (with ``mask=False``). The next token is drawn
    from the softmax of the logits at the final position.

    Returns:
        str: The decoded prompt followed by the sampled continuation.
    """
    model.eval()

    token_ids = tokenizer.encode(prompt)
    output = torch.tensor(token_ids).to(device).unsqueeze(0)
    for _ in range(max_length):
        first_half, second_half = split_prompt(output[:, -2 * seq_length:])
        # Only the logits of the final position decide the next token.
        step_logits = model(first_half, second_half, mask=False)[:, -1, :]
        next_id = torch.multinomial(F.softmax(step_logits, dim=1), 1)
        output = torch.cat([output, next_id], -1)

    return tokenizer.decode_batch(output)[0]

In [None]:
def train(train_loader, valid_loader, model, tokenizer, criterion, optimizer,
          writer, num_i, validate_every, save_every, save_path, device):
    """Optimise ``model`` for ``num_i`` steps with periodic validation and checkpoints.

    Args:
        train_loader: Callable returning ``(inputs, targets, labels)`` batches.
        valid_loader: Loader handed to ``validate``.
        model: Module called as ``model(inputs, targets)``.
        tokenizer: Tokenizer handed to ``generate`` / ``generate_split``.
        criterion: Loss called with the flattened outputs and labels.
        optimizer: Optimizer stepped once per iteration.
        writer: Object exposing ``add_scalar`` and ``add_text``.
        num_i: Number of training iterations.
        validate_every: Validate and log samples on the first iteration and
            then every this many iterations.
        save_every: Save a checkpoint every this many iterations.
        save_path: Directory for checkpoints; None disables saving.
        device: Device the batch tensors are moved to.
    """
    # Fixed prompt used for the text samples that are logged during training.
    prompt = "One of the reasons to prefer small kernel sizes over larger ones is that smaller kernels have fewer parameters than larger ones, which can reduce the model's complexity and computational requirements. This can lead to faster training times and lower memory requirements. What are the benefits of using smaller kernel sizes in CNNs?"

    for i in tqdm(range(0, num_i)):
        # validate/generate switch the model to eval mode, so re-enable
        # training mode at the start of every iteration.
        model.train()

        inputs, targets, labels = train_loader()
        inputs = inputs.to(device)
        targets = targets.to(device)
        labels = labels.to(device)

        outputs = model(inputs, targets)

        # Flatten batch and sequence dimensions so every position is one sample.
        batch_size, seq_length, vocab_size = outputs.shape
        outputs = outputs.reshape(batch_size * seq_length, vocab_size)
        labels = labels.reshape(batch_size * seq_length)

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        writer.add_scalar("train_loss", loss.detach().cpu(), global_step=i + 1)

        if (i + 1) % validate_every == 0 or i == 0:
            valid_loss = validate(valid_loader, model, criterion, device)
            writer.add_scalar("valid_loss", valid_loss, global_step=i + 1)
            sample_generation = generate(model, prompt, tokenizer, max_length=256,
                                         seq_length=seq_length, device=device)
            writer.add_text("sample_generation", sample_generation, global_step=i + 1)
            sample_generation_split = generate_split(model, prompt, tokenizer, max_length=256,
                                                     seq_length=seq_length, device=device)
            writer.add_text("sample_generation_split", sample_generation_split, global_step=i + 1)

        if (i + 1) % save_every == 0 and save_path is not None:
            # Save the model being trained here, not a global of a similar name.
            # The step number is zero-padded to the width of ``num_i``.
            torch.save(model.state_dict(),
                       f"{save_path}/model_{str(i + 1).zfill(len(str(num_i)))}.pth")

## Training

In [None]:
# Timestamped run directory, e.g. logs/run_<yymmdd>_<HHMMSS>.
datetime_now = format(datetime.now(), "%y%m%d_%H%M%S")
log_path = "logs/run_" + datetime_now

In [None]:
# TensorBoard writer that logs into this run's directory.
writer = tensorboard.SummaryWriter(
    log_dir=log_path,
)

In [None]:
%tensorboard --logdir=logs --port=8008

In [None]:
# Cross-entropy over the vocabulary, optimised with AdamW.
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(
    transformer.parameters(),
    lr=3e-4,
    weight_decay=1e-2,
)

In [None]:
# Full training run: validate every 500 steps, checkpoint every 2500 steps.
train(
    train_loader,
    valid_loader,
    transformer,
    tokenizer,
    criterion,
    optimizer,
    writer=writer,
    num_i=60000,
    validate_every=500,
    save_every=2500,
    save_path=log_path,
    device=device,
)

In [17]:
# Restore saved weights, remapping tensors to the current device.
transformer.load_state_dict(
    torch.load("./model_split.pth", map_location=device)
)

<All keys matched successfully>

In [18]:
# Prompts made of a short explanatory passage that ends with a question about it.
qa_1 = "One of the reasons to prefer small kernel sizes over larger ones is that smaller kernels have fewer parameters than larger ones, which can reduce the model's complexity and computational requirements. This can lead to faster training times and lower memory requirements. What are the benefits of using smaller kernel sizes in CNNs?"
qa_2 = "Recurrent Neural Networks RNNs and Convolutional Neural Networks CNNs are two popular types of deep learning models that are used in different domains. RNNs are generally preferred over CNNs for processing sequential data, such as time-series data, speech, and text. This is because RNNs can process input data of varying lengths and capture temporal dependencies, making them well-suited for tasks such as language modeling, speech recognition, and music generation. Which use cases do we see RNNs preferred over CNNs?"
qa_3 = "The introduction of residual connections in ResNet led to much deeper networks being trained than previously possible. This allowed for much better performance on difficult computer vision tasks, such as image classification, object detection, and semantic segmentation. In fact, ResNet achieved state-of-the-art performance on the challenging ImageNet dataset, reducing the error rate by a significant margin compared to previous approaches. Why was ResNet such a big breakthrough in computer vision?"

In [19]:
# Sample 512 tokens after each question-style prompt, in order.
for qa_prompt in (qa_1, qa_2, qa_3):
    print(generate(transformer, qa_prompt, tokenizer, max_length=512, seq_length=256,
                   device=device), end="\n\n")

One of the reasons to prefer small kernel sizes over larger ones is that smaller kernels have fewer parameters than larger ones, which can reduce the model's complexity and computational requirements. This can lead to faster training times and lower memory requirements. What are the benefits of using smaller kernel sizes in CNNs? Mativatificon did paramethink me the sumetion momputes of my these off, so prefablick, Tol optime of the network coriences? And with take there is, this when I going to that rom hyput going Hight. The 3ROC mough over the talk For a 2 ma. So times with Nas image? So the spike. So if I equals the filters how, compuls start squared archite is gach the rain the one sees to be then by this is go have going to the my turn be nexty suried then over Toniend S of the. Alright. I have that correctficulations to tall

Recurrent Neural Networks RNNs and Convolutional Neural Networks CNNs are two popular types of deep learning models that are used in different domains. RNN

In [20]:
# Prompts written in a spoken, lecture-like register (no trailing question).
gen_1 = "All right. All right, cool. Today, we'll just talk about the structure of this class and then introduce the problem that we'll talk about for the rest of this class, which is going to be related to my research."
gen_2 = "And okay, let's move on for now. So what we're gonna do is I'm going to take this expression over here. And we're going to actually write out what this means in terms of our models that we can derive our likelihood."
gen_3 = "Great, Yeah, So now another student is saying, in this case, would it make sense to match the depth because that's our RGB values."

In [21]:
# Sample 256 tokens after each lecture-style prompt, in order.
for gen_prompt in (gen_1, gen_2, gen_3):
    print(generate(transformer, gen_prompt, tokenizer, max_length=256, seq_length=256,
                   device=device), end="\n\n")

All right. All right, cool. Today, we'll just talk about the structure of this class and then introduce the problem that we'll talk about for the rest of this class, which is going to be related to my research. L, tew1nebl thwsth  ar cacoul toc m.,
til ck. So thing Tirj of them is qunks, but that's this 96 -colly Po and this going the plus that the first look. How it's want cuch tokey 24 actual like look. Wore is you wants that this frequency, we udent rough loo

And okay, let's move on for now. So what we're gonna do is I'm going to take this expression over here. And we're going to actually write out what this means in terms of our models that we can derive our likelihood. woj xihkk egoie'J VV1Vxaun'kVotdak tive've questions inturs is good what weslow informalle champles, y everyod, tk. Were when peoperfectors? And the to st a let different proply is writncy questions. So thing to one its. And the nient we chais of why acoa

Great, Yeah, So now another student is saying, in this case

In [22]:
# Short free-form prompts on everyday or general topics.
misc_1 = "This cat is very cute and"
misc_2 = "In a world where machines had emotions"
misc_3 = "The universe is a vast and mysterious place, full of wonders and secrets"

In [23]:
# Sample 256 tokens after each miscellaneous prompt, in order.
for misc_prompt in (misc_1, misc_2, misc_3):
    print(generate(transformer, misc_prompt, tokenizer, max_length=256, seq_length=256,
                   device=device), end="\n\n")

This cat is very cute and  rryviyvayrtyyydy i rictr ryyddcycyt  yvddyycycyd yiddyydyycddddyyyryivdyryyvvicycydddddyyyddviyyydycydddddyyyFvyycvryyyyyyvyyycddyyydiyyyyyyyddyyddyyyddryyyyyyydddyydddyyddyyyyyyddyyyyddyyyyyyyyydddyyyyvdedvdyyyydyydyyyyyddyyvyyyyyyyyyyyyyyyyyyyyyyyyyyyy

In a world where machines had emotionsssod Mm, sorsssssdsnsaorMswdd sanssasssm dss ssnsddsssss hddssaosdsnssdsssMssssssssssssasssssssssssdssssssossssssssssssssssassssssssssssssssssssssssssdsssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssss

The universe is a vast and mysterious place, full of wonders and secretssasdmecnl clatcnndr uc ssn?rerrulwscnnyrnnnnnddrnnynnsnnconnsnssdsnndr.yarsnl nnsnnddrllrrrrmnrnnrrsndnnr yrnnnsndsdddrnncnnddynnalynddynlncnns nnrsnn nnnrlyrdidyynnrndddrrnyrnddrsrsddewdateduct lung class they questions. Trepernermal ore be 4dtednedninuto

