<a href="https://colab.research.google.com/github/pashok3d/RemarqueGPT/blob/main/Lets_build_GPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Preparation

In [1]:
# Installs

!pip install tiktoken -q
!pip install ipdb -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# Imports

import tiktoken
from typing import List, Tuple

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [3]:
%pdb on

Automatic pdb calling has been turned ON


### Dataset loading and processing

In [4]:
# The corpus is Cyrillic text; pin the encoding so decoding does not depend on
# the platform's default locale.
with open("/content/the_dream_room.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()

In [5]:
# Keep only non-blank lines; swap non-breaking spaces for plain ones and trim the edges.
lines = [raw_line.replace('\xa0', ' ').strip() for raw_line in lines if raw_line.strip()]

In [6]:
text = "\n".join(lines)

In [7]:
text[:200]

'В цветущих садах веял майский ветерок От веток сирени, нависавших над оградой старой кладки, доносился густой сладкий аромат. Художник Фриц Шрамм медленно бродил по старинным переулкам городка. Время '

In [8]:
enc = tiktoken.encoding_for_model("gpt-4o")

In [9]:
tokens = enc.encode_ordinary(text)

In [10]:
dataset_token_len = len(tokens)

In [11]:
# Map every distinct GPT token id found in the corpus to a compact local id,
# numbering the local ids from 1.
gpt_token_id_to_local_id = {
    gpt_token_id: local_id
    for local_id, gpt_token_id in enumerate(set(tokens), start=1)
}

In [12]:
# Inverse lookup: compact local id -> original GPT token id.
local_id_to_gpt_token_id = {
    local_id: gpt_token_id
    for gpt_token_id, local_id in gpt_token_id_to_local_id.items()
}

In [13]:
def encode(text) -> List[int]:
    """Tokenize `text` with `enc` and translate each token id to its local id.

    Raises:
        KeyError: if `text` produces a token id that is missing from
            `gpt_token_id_to_local_id`.
    """
    local_ids = []
    for gpt_token_id in enc.encode_ordinary(text):
        local_ids.append(gpt_token_id_to_local_id[gpt_token_id])
    return local_ids

def decode(tokens: List[int]) -> str:
    """Translate local ids back to GPT token ids and decode them to text with `enc`.

    Raises:
        KeyError: if an id is missing from `local_id_to_gpt_token_id`.
    """
    gpt_token_ids = [local_id_to_gpt_token_id[local_id] for local_id in tokens]
    return enc.decode(gpt_token_ids)

In [14]:
# Local ids are numbered from 1, so the largest id equals the number of distinct
# tokens. An embedding table with `vocab_size` rows only accepts indices
# 0..vocab_size-1, so it needs one more row than the largest id (row 0 stays unused).
vocab_size = max(gpt_token_id_to_local_id.values()) + 1

### Dataloader

In [15]:
class TextDataset(Dataset):
    """Sliding-window next-token dataset built from a raw text string.

    The text is tokenized with `enc`, remapped to local ids, and cut into
    overlapping windows of `context_window_size` tokens. For each window the
    target is the same window shifted one token to the right.
    """

    def __init__(self, text, context_window_size):
        self.gpt_tokens = enc.encode_ordinary(text)
        self.tokens = [gpt_token_id_to_local_id[gpt_id] for gpt_id in self.gpt_tokens]

        # One sample per start offset that still leaves room for the shifted target.
        window_count = len(self.tokens) - context_window_size
        self.x = [
            self.tokens[start:start + context_window_size]
            for start in range(window_count)
        ]
        self.y = [
            self.tokens[start + 1:start + context_window_size + 1]
            for start in range(window_count)
        ]

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx) -> Tuple:
        """Return the (input, target) windows at `idx` as a pair of tensors."""
        inputs = torch.tensor(self.x[idx])
        targets = torch.tensor(self.y[idx])
        return inputs, targets

In [16]:
T = 6

In [17]:
ds = TextDataset(text, T)

In [18]:
next(iter(ds))

(tensor([1540,  703, 1282, 2069, 1551,  331]),
 tensor([ 703, 1282, 2069, 1551,  331, 1663]))

In [19]:
B = 5

In [20]:
dataloader = DataLoader(ds, batch_size=B, shuffle=True)

In [21]:
next(iter(dataloader))

[tensor([[2765, 2930,  562,    8, 2464,   76],
         [ 501,  899, 2046,  335, 1808,  248],
         [   6, 1000,  665, 1828, 1096, 1463],
         [ 169, 2665, 2955, 2188,  339, 2155],
         [ 375,  585,  503,  581, 1121,   29]]),
 tensor([[2930,  562,    8, 2464,   76, 1759],
         [ 899, 2046,  335, 1808,  248,  736],
         [1000,  665, 1828, 1096, 1463,  299],
         [2665, 2955, 2188,  339, 2155, 2184],
         [ 585,  503,  581, 1121,   29, 1241]])]

### Model

In [22]:
class Embedding:
    """Lookup from integer token ids to vectors, backed by an `nn.Embedding` table."""

    def __init__(self, vocab_size, emb_dim):
        self.embedding_table = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=emb_dim,
        )

    def __call__(self, input):
        """Return the embedding vectors for the ids in `input`."""
        token_vectors = self.embedding_table(input)
        return token_vectors

In [23]:
class MultiHeadAttention:
    """Causal multi-head self-attention.

    A single linear layer produces queries, keys and values. The channel
    dimension is split into `head_n` heads, each position attends only to
    itself and earlier positions, and the per-head results are concatenated
    back to `emb_dim` channels.

    NOTE: attention scores are not scaled by 1/sqrt(head_size) here.
    """

    def __init__(self, emb_dim, head_n):
        # Fail at construction rather than on the first forward pass.
        assert emb_dim % head_n == 0, "emb_dim must be divisible by head_n"
        self.emb_dim = emb_dim
        self.QKV = nn.Linear(emb_dim, emb_dim * 3)
        self.head_n = head_n

    def __call__(self, input):
        """
        input: shape (B, T, C) with C == emb_dim

        Returns a tensor of shape (B, T, emb_dim).
        """
        # Read the batch and sequence sizes from the tensor itself, so partial
        # batches and other sequence lengths work without relying on globals.
        batch_size, seq_len, _ = input.shape
        head_size = self.emb_dim // self.head_n

        def split_heads(t):
            # (B, T, C) -> (B, nh, T, hs)
            return t.view(batch_size, seq_len, self.head_n, head_size).transpose(1, 2)

        # Calculate q, k, v
        q, k, v = self.QKV(input).split(self.emb_dim, dim=-1)
        q, k, v = split_heads(q), split_heads(k), split_heads(v)

        # Raw attention scores, shape (B, nh, T, T)
        scores = q @ k.transpose(-2, -1)

        # Causal mask: position i may only look at positions <= i. The (T, T)
        # mask broadcasts over batch and heads; masked_fill keeps it out-of-place.
        causal = torch.tril(
            torch.ones(seq_len, seq_len, dtype=torch.bool, device=scores.device)
        )
        scores = scores.masked_fill(~causal, -torch.inf)

        # Softmax over the key dimension
        weights = scores.softmax(dim=-1)

        # Weighted sum of values, shape (B, nh, T, hs)
        mixed = weights @ v

        # Combine new values from multiple heads back into (B, T, C)
        output = mixed.transpose(1, 2).contiguous().view(batch_size, seq_len, self.emb_dim)

        return output


In [24]:
class FeedForward:
    """Two-layer MLP: expand to twice the embedding width, apply ReLU, project back."""

    def __init__(self, emb_dim):
        hidden_dim = emb_dim * 2
        self.m_1 = nn.Linear(emb_dim, hidden_dim)
        self.a = nn.ReLU()
        self.m_2 = nn.Linear(hidden_dim, emb_dim)

    def __call__(self, input):
        """Run `input` through linear -> ReLU -> linear and return the result."""
        return self.m_2(self.a(self.m_1(input)))

In [45]:
class DecoderBlock:
    """One decoder block: attention and feed-forward sub-layers with residuals.

    Each sub-layer sees a layer-normalized copy of its input, and its output is
    added back onto the un-normalized input.
    """

    def __init__(self, emb_dim, head_n):
        self.mha_layer_norm = nn.LayerNorm(emb_dim)
        self.mha = MultiHeadAttention(emb_dim, head_n)
        self.ff_layer_norm = nn.LayerNorm(emb_dim)
        self.ff = FeedForward(emb_dim)

    def __call__(self, x):
        """Apply the attention sub-layer, then the feed-forward sub-layer."""
        attended = self.mha(self.mha_layer_norm(x))
        after_attention = x + attended

        transformed = self.ff(self.ff_layer_norm(after_attention))
        return after_attention + transformed

In [46]:
class Decoder:
    """Token embedding followed by a stack of `num_blocks` decoder blocks."""

    def __init__(self, vocab_size, emb_dim, num_blocks, head_n):
        self.emb = Embedding(vocab_size, emb_dim)
        self.blocks = []
        for _ in range(num_blocks):
            self.blocks.append(DecoderBlock(emb_dim, head_n))

    def __call__(self, x):
        """Embed the token ids in `x` and pass them through every block in order."""
        hidden = self.emb(x)
        for decoder_block in self.blocks:
            hidden = decoder_block(hidden)
        return hidden

In [47]:
d = Decoder(vocab_size=vocab_size, emb_dim=16, num_blocks=3, head_n=4)

In [48]:
input = next(iter(dataloader))

In [49]:
x = input[0]

In [50]:
x.shape

torch.Size([5, 6])

In [51]:
output = d(x)

x.shape torch.Size([5, 6, 16])
x.shape torch.Size([5, 6, 16])
x.shape torch.Size([5, 6, 16])


In [None]:
output.shape

torch.Size([5, 6, 16])

In [None]:
# Next steps:
# 1. Generator: linear projection from last hidden state to vocab size
# 2. Dropout
# 3. Add scaling factor in attention calculation
# 4. Positional Encoding