In [1]:
!pip install uv
!uv venv gpt2-clone
!source /kaggle/working/gpt2-clone/bin/activate
!uv pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
!uv pip install -q huggingface tiktoken

Collecting uv
  Downloading uv-0.7.21-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading uv-0.7.21-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.6/18.6 MB[0m [31m76.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: uv
Successfully installed uv-0.7.21
Using CPython 3.11.13 interpreter at: [36m/usr/bin/python3[39m
Creating virtual environment at: [36mgpt2-clone[39m
Activate with: [32msource gpt2-clone/bin/activate[39m


In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/tokenizer-test-string/regex_test_string.txt
/kaggle/input/shakesphere-book/shakesphere_book.txt


# Transformer Block From Scratch

In [3]:
GPT2_CONFIG_124M = {
    "vocab_size": 50257, # 256 unicode characters + 1 special tokens
    "context_length": 1024,
    "emb_dim": 768,
    "num_heads": 12,
    "num_layers": 12,
    "dropout": 0.1,
    "qkv_bias": False,
}

In [4]:
import torch
import torch.nn as nn

## Dummy GPT2 Model Implementation

In [5]:
# Dummy GPT 2 Class
class DummyGPT2Model(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["dropout"])

        # Transformer Block
        self.transformer_blocks = nn.Sequential(
            *[DummyTransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        )

        # Layer Normalization
        self.final_norm = DummyLayerNormalization(cfg["emb_dim"])
        self.out_head = nn.Linear(
            cfg["emb_dim"], cfg["vocab_size"], bias=False
        )

    def forward(self, in_idx):
        batch_size, seq_len = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
        x = tok_embeds + pos_embeds
        x = self.drop_emb(x)
        x = self.transformer_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits

## Dummy Transformer Block Implementation

In [6]:
class DummyTransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()

    def forward(self, x):
        return x

## Dummy Layer Normalization Implementation

In [7]:
class DummyLayerNormalization(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift

In [8]:
torch.set_printoptions(sci_mode=False)
test_input = torch.randn(2, 5)
layer_norm = DummyLayerNormalization(emb_dim=5)
out_layer_norm = layer_norm(test_input)
mean = out_layer_norm.mean(dim=-1, keepdim=True)
var = out_layer_norm.var(dim=-1, unbiased=False, keepdim=True)
print("Mean: ", mean)
print("Var: ", var)

Mean:  tensor([[    0.0000],
        [    0.0000]], grad_fn=<MeanBackward1>)
Var:  tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


## GELU Activation Function Implementation

In [9]:
class GELU(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(torch.sqrt(torch.tensor(2.0 / torch.pi)) * 
                        (x * 0.044715 * torch.pow(x, 3))))

## Feed Forward Neural Network Block Implementation for Transformer Block

In [10]:
class FeedForwardBlock(nn.Module):
    def __init__(self,cfg):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(cfg["emb_dim"], 4*cfg["emb_dim"]),
            GELU(),
            nn.Linear(4*cfg["emb_dim"], cfg["emb_dim"]),
        )

    def forward(self, x):
        return self.layers(x)

In [11]:
ffn_block = FeedForwardBlock(GPT2_CONFIG_124M)
ffn_test_input = torch.rand(2, 3, 768)
ffn_output = ffn_block(ffn_test_input)
print(ffn_output)

tensor([[[-0.0395,  0.0814,  0.0481,  ..., -0.0012,  0.0647,  0.0210],
         [-0.0755,  0.0986, -0.0302,  ..., -0.1156,  0.0041,  0.1058],
         [-0.0345,  0.0648, -0.0355,  ..., -0.0567,  0.0021,  0.0215]],

        [[ 0.0659,  0.0464, -0.0219,  ..., -0.0556,  0.0058, -0.0541],
         [-0.0706, -0.0084, -0.0039,  ..., -0.1212,  0.0525,  0.0296],
         [-0.0359,  0.0533,  0.0317,  ..., -0.0070,  0.0306,  0.0274]]],
       grad_fn=<ViewBackward0>)


## Shortcut Connections or Residual Connections or Skip Connections Implementation

In [12]:
class ShortcutConnectionExample(nn.Module):
    def __init__(self, layer_sizes, use_shortcut):
        super().__init__()
        self.use_shortcut = use_shortcut
        self.layers = nn.Sequential(
            nn.Sequential(nn.Linear(layer_sizes[0], layer_sizes[1]), GELU()),
            nn.Sequential(nn.Linear(layer_sizes[1], layer_sizes[2]), GELU()),
            nn.Sequential(nn.Linear(layer_sizes[2], layer_sizes[3]), GELU()),
            nn.Sequential(nn.Linear(layer_sizes[3], layer_sizes[4]), GELU()),
            nn.Sequential(nn.Linear(layer_sizes[4], layer_sizes[5]), GELU()),
        )

    def forward(self, x):
        for layer in self.layers:
            layer_output = layer(x)

            if self.use_shortcut and x.shape == layer_output.shape:
                x = x + layer_output
            else:
                x = layer_output
        return x

In [13]:
layer_sizes = [3, 3, 3, 3, 3, 1]
sample_input = torch.tensor([[1., 0., 1.]])
torch.manual_seed(47)
model_without_shortcut = ShortcutConnectionExample(
    layer_sizes, use_shortcut=False,
)

In [14]:
model_with_shortcut = ShortcutConnectionExample(
    layer_sizes, use_shortcut=True,
)

In [15]:
def print_gradients(model, x):
    output = model(x)
    target = torch.tensor([[0.]])

    loss = nn.MSELoss()
    loss = loss(output, target)

    loss.backward()

    for name, param in model.named_parameters():
        if "weight" in name:
            print(f"{name} has gradient mean of {param.grad.abs().mean().item()}")

In [16]:
print("Gradients without Shortcut Connections:")
print_gradients(model_without_shortcut, sample_input)

Gradients without Shortcut Connections:
layers.0.0.weight has gradient mean of 0.00021615072910208255
layers.1.0.weight has gradient mean of 0.0003711592289619148
layers.2.0.weight has gradient mean of 0.0008400255464948714
layers.3.0.weight has gradient mean of 0.006596562918275595
layers.4.0.weight has gradient mean of 0.02242942713201046


In [17]:
print("Gradients with Shortcut Connections:")
print_gradients(model_with_shortcut, sample_input)

Gradients with Shortcut Connections:
layers.0.0.weight has gradient mean of 0.2044905722141266
layers.1.0.weight has gradient mean of 0.30301108956336975
layers.2.0.weight has gradient mean of 0.2699629068374634
layers.3.0.weight has gradient mean of 0.289590984582901
layers.4.0.weight has gradient mean of 1.3344513177871704


 # Complete GPT-2 Architecture Implementation

In [3]:
GPT_2_CONFIG_124M = {
    "vocab_size": 50257,
    "context_length": 1024,
    "embedding_dim": 768,
    "num_heads": 12,
    "num_layers": 12,
    "drop_rate": 0.1,
    "qkv_bias": False
}

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [5]:
class LayerNormalization(nn.Module):
    def __init__(self, embedding_dim):
        super().__init__()
        self.eps = 1e-5
        # Trainable parameters
        self.scale = nn.Parameter(torch.ones(embedding_dim))
        self.shift = nn.Parameter(torch.zeros(embedding_dim))

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return norm_x

In [6]:
class FeedForwardBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(cfg["embedding_dim"], 4*cfg["embedding_dim"]),
            F.gelu(approximate="tanh"),
            nn.Linear(4*cfg["embedding_dim"], cfg["embedding_dim"])
        )
    def forward(self, x):
        return self.layers(x)

In [None]:
class TransformerBlock(nn.Module):
    def __init__(self, cfg)