# **Attempting to Pretrain a GPT Model from Scratch**

## Setup

In [1]:
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q wandb
!pip install -q datasets
!pip install -q tiktoken

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [2]:
!nvidia-smi

Tue Sep 24 17:52:46 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0              45W / 400W |      2MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [3]:
import os
import torch
import tiktoken
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
from datasets import load_dataset
from huggingface_hub import login
from google.colab import userdata
from tqdm import tqdm

In [4]:
# Set a seed for reproducibility

def fix_torch_seed(seed=42):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

fix_torch_seed()

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
device

device(type='cuda')

In [7]:
os.environ["HF_TOKEN"] = userdata.get('HF_TOKEN')

In [8]:
login(
  token=os.environ['HF_TOKEN'], # ADD YOUR TOKEN HERE
  add_to_git_credential=True
)

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


## **Data preprocessing**

### **Data cleaning and preprocessing**

https://huggingface.co/datasets/nampdn-ai/mini-fineweb

In [9]:
import re

def clean_text(example):
  example["text"] = re.sub('[^A-Za-z0-9]+', ' ', example["text"])
  return example

#### Load and preprocess dataset

In [10]:
dataset = load_dataset("nampdn-ai/mini-fineweb", "CC-MAIN-2013-20") ## for small version 1K rows

README.md:   0%|          | 0.00/10.9k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/7926 [00:00<?, ?it/s]

004_00000.parquet:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

004_00001.parquet:   0%|          | 0.00/1.53M [00:00<?, ?B/s]

004_00002.parquet:   0%|          | 0.00/743k [00:00<?, ?B/s]

004_00003.parquet:   0%|          | 0.00/523k [00:00<?, ?B/s]

004_00004.parquet:   0%|          | 0.00/34.9k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [11]:
dataset

DatasetDict({
    train: Dataset({
        features: ['index', 'text', 'len', 'lines'],
        num_rows: 1842
    })
})

In [12]:
dataset = dataset["train"].select(range(300)) # choose 300 samples from dataset
dataset = dataset.shuffle(seed=42)
dataset = dataset.map(clean_text)

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [13]:
dataset

Dataset({
    features: ['index', 'text', 'len', 'lines'],
    num_rows: 300
})

In [26]:
# using gpt-2 tokenizer
enc = tiktoken.get_encoding("gpt2")

# **GPT implementation**

## Setups and config

In [9]:
from dataclasses import dataclass

@dataclass
class GPTConfig:
    # gpt config
    n_embd: int = 192
    n_head: int = 6
    n_layer: int = 6
    vocab_size: int = 50257
    block_size: int = 128

    # training config
    batch_size: int = 64
    dropout: int = 0.2
    bias: bool = False
    eval_interval: int = 500
    eval_iters: int = 200
    learning_rate: float = 3e-4
    max_iter: int = 20000

In [10]:
config = GPTConfig()
print(config)

GPTConfig(n_embd=192, n_head=6, n_layer=6, vocab_size=50257, block_size=128, batch_size=64, dropout=0.2, bias=False, eval_interval=500, eval_iters=200, learning_rate=0.0003, max_iter=20000)


## Get dataset and prepare for training

In [11]:
## create token data loader
block_size = config.block_size # context window
batch_size = config.batch_size

# load np.memmap data
train_data_np = np.memmap('./data/fineweb_edu_train.bin', dtype=np.uint16, mode='r')

# convert data into torch.Tensor with dtype torch.int64
train_data = torch.as_tensor(train_data_np)

train_data = train_data.to(torch.int64)

def get_batch(split):
  # get data and convert it into torch tensor
  data = train_data
  ix = torch.randint(len(data) - block_size, (batch_size,))
  x = torch.stack([data[i:i+block_size] for i in ix])
  y = torch.stack([data[i+1:i+block_size+1] for i in ix])
  x, y = x.to(device), y.to(device)
  return x, y

  train_data = torch.as_tensor(train_data_np)


## SmolGPT

Follows Karpathy's miniGPT implementation with following modules

---



* NewGELU (Hendrycks et al.)
* CausalSelfAttention
* MLP
* Block
* GPT


Reference 1: https://github.com/karpathy/minGPT/blob/master/mingpt/model.py

Reference 2: https://github.com/karpathy/ng-video-lecture/blob/master/gpt.py

In [12]:
device

device(type='cuda')

In [13]:
# from https://github.com/karpathy/nanoGPT/blob/master/model.py#L18

class NewGELU(nn.Module):
  def forward(self, x):
    return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0/math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))

In [14]:
## Embedding layer
class EmbeddingLayer(nn.Module):
  def __init__(self,config):
    super().__init__()
    ## Generating embeddings for the input tensors
    self.wte = nn.Embedding(config.vocab_size, config.n_embd)
    self.wpe = nn.Embedding(config.block_size, config.n_embd)

  def forward(self, x):
    B, T = x.shape  # (B, T)
    tok_emb = self.wte(x) # (B, T, n_embd)
    pos_emb = self.wpe(torch.arange(T,device=device)) # (T, n_embd)
    x = tok_emb + pos_emb # (B, T, C)
    return x

In [15]:
class CausalSelfAttention(nn.Module):
  def __init__(self, config):
    super().__init__()
    assert config.n_embd % config.n_head == 0

    self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)

    # output projection
    self.c_proj = nn.Linear(config.n_embd , config.n_embd, bias=config.bias)

    # regularization
    self.attn_dropout = nn.Dropout(config.dropout)
    self.resid_dropout = nn.Dropout(config.dropout)

    self.bias = torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size, config.block_size).to(device)
    self.n_head = config.n_head
    self.n_embd = config.n_embd
    self.dropout = config.dropout

  def forward(self, x):
    # batch_size, maximum sequence length, embedding dimetions n_embd
    B, T, C = x.shape

    # set K Q V projections
    q, k, v = self.c_attn(x).split(self.n_embd, dim=2)

    k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
    q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
    v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)

    # implementation of attention
    # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
    attn = (q @ k.transpose(-2,-1)) * (1.0 / math.sqrt(k.size(-1)))
    attn = attn.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf'))
    attn = F.softmax(attn, dim=-1)
    attn = self.attn_dropout(attn) # Do we need a drop for such a small model ?
    y = attn @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
    y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side

    # output projection
    y = self.resid_dropout(self.c_proj(y))
    return y

In [16]:
class MLP(nn.Module):
  def __init__(self, config):
    super().__init__()
    self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
    self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)
    self.act = NewGELU()
    self.dropout = nn.Dropout(config.dropout)

  def forward(self, x):

    x = self.c_fc(x)
    x = self.c_proj(x)
    x = self.act(x)
    x = self.dropout(x)

    return x

In [17]:
class GPTBlock(nn.Module):
  def __init__(self, config):
    super().__init__()
    self.ln_1 = nn.LayerNorm(config.n_embd)
    self.attn = CausalSelfAttention(config)
    self.ln_2 = nn.LayerNorm(config.n_embd)
    self.mlp = MLP(config)

  def forward(self, x):

    x = x + self.attn(self.ln_1(x)) # (B, T, C)
    x = x + self.mlp(self.ln_2(x)) # (B, T, C)

    return x

In [18]:
# For a lack of a better name
class SmolGPT(nn.Module):
  def __init__(self, config):
    super().__init__()
    self.embd = EmbeddingLayer(config)
    self.drop = nn.Dropout(config.dropout, inplace=False)
    self.h = nn.ModuleList(
        [GPTBlock(config) for _ in range(config.n_layer)]
    )
    self.ln_f = nn.LayerNorm(config.n_embd)
    self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=config.bias)
    self.apply(self._init_weights) # init model weights

  def _init_weights(self, module):
    if isinstance(module, nn.Linear):
      torch.nn.init.xavier_uniform_(module.weight, gain=1.0, generator=None)
      if module.bias is not None:
        torch.nn.init.zeros_(module.bias)
    elif isinstance(module, nn.Embedding):
      torch.nn.init.xavier_uniform_(module.weight, gain=1.0, generator=None)

  def generate(self, idx, max_new_tokens):
    for _ in range(max_new_tokens):
      idx_cond = idx[:, -config.block_size:]
      logits, loss = self(idx_cond)
      logits = logits[:, -1, :]
      probs = F.softmax(logits, dim=-1)
      idx_next = torch.multinomial(probs, num_samples=1)
      idx = torch.cat((idx, idx_next), dim=1)

    return idx

  def forward(self, x, targets=None):

    x = self.embd(x) # (B, T, C)
    x = self.drop(x) # (B, T, C)

    for block in self.h:
      x = block(x) # (B, T, C)
    x = self.ln_f(x)

    logits = self.lm_head(x) #(B, T, vocab_size)

    loss = None
    if targets is not None:
      loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)

    return logits, loss

## Setup model

In [19]:
# define model with the configs
model = SmolGPT(config)
m = model.to(device)
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate)
print(round(sum(p.numel() for p in m.parameters())/1e6, 1), 'M parameters')

22.0 M parameters


## Setup WANDB

In [20]:
os.environ["WANDB_API_KEY"] = userdata.get('WANDB_API_KEY')

In [None]:
!wandb login

import wandb, os
wandb.login()

wandb_project = "llm-from-scratch"
if len(wandb_project) > 0:
    os.environ["WANDB_PROJECT"] = wandb_project

In [None]:
# init wandb
# start a new wandb run to track this script
run = wandb.init()

# Model training

In [23]:
# from: https://github.com/karpathy/ng-video-lecture/blob/master/gpt.py#L138
# function to estimate the loss for both train split while model training
@torch.no_grad()
def estimate_loss():
  out = {}
  model.eval()
  loss_val = torch.zeros(config.eval_iters)
  for k in range(config.eval_iters):
    X, Y = get_batch("train")
    logits, loss = model(X, Y)
    loss_val[k] = loss.item()
    out["train"] = loss.mean()
  model.train()
  return out

In [24]:
## training loop
for iter in range(config.max_iter):
  if iter % config.eval_interval == 0 or iter == config.max_iter - 1:
    loss = estimate_loss()
    print(f"step {iter}: train loss {loss['train']:.4f}")
    wandb.log({"train_loss": loss["train"]})

  # get data batch
  inputs, targets = get_batch("train")

  # forward pass
  logits, loss = model(inputs, targets)

  # backward pass
  # set gradients to zero
  optimizer.zero_grad()

  # apply backprop
  loss.backward()
  optimizer.step()

step 0: train loss 10.8329
step 500: train loss 7.2452
step 1000: train loss 5.9147
step 1500: train loss 5.0998
step 2000: train loss 4.3878
step 2500: train loss 3.4622
step 3000: train loss 2.7612
step 3500: train loss 2.1375
step 4000: train loss 1.6341
step 4500: train loss 1.3250
step 5000: train loss 0.9755
step 5500: train loss 0.7987
step 6000: train loss 0.6698
step 6500: train loss 0.6256
step 7000: train loss 0.5190
step 7500: train loss 0.4982
step 8000: train loss 0.4181
step 8500: train loss 0.4167
step 9000: train loss 0.3654
step 9500: train loss 0.3660
step 10000: train loss 0.2964
step 10500: train loss 0.3328
step 11000: train loss 0.3341
step 11500: train loss 0.3086
step 12000: train loss 0.2660
step 12500: train loss 0.2689
step 13000: train loss 0.2923
step 13500: train loss 0.2802
step 14000: train loss 0.2463
step 14500: train loss 0.2574
step 15000: train loss 0.2535
step 15500: train loss 0.2286
step 16000: train loss 0.2266
step 16500: train loss 0.2450
ste

In [30]:
# save model if needed
SAVE_MODEL_PATH = ""
torch.save(model.state_dict(), SAVE_MODEL_PATH)

# Inference

In [27]:
# Generating a text of max 500 tokens
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated_ids = model.generate(context, max_new_tokens=500).tolist()
decoded_text = enc.decode(generated_ids[0])

In [28]:
decoded_text

'! pains and the The kill on I happen What is my spiritual recognition of my baking I ve trying to see my clients from my tissue but This and home person who know I would go to give That is the city is living with I ve had a separate personal to give you and compassion of the most valuable from the real estate and give the more can know but to find questions phone I give it a similar to helping well thing s hands and tell what this is to it s loss are that you in do he interested in impersonal sex and program that you are outdoors because you are not good It you want if you know you want autovacuum to be here you Click with CSA that you get about the course of the spouse s okay that you have much So this issue of the write it s not those of duty to offer this blended algorithm to real new people to figure out when your services just while where people then they are so much when they will put it s doing you want your brand s the opportunity happen When I see in doing planning of the wor

In [None]:
run.finish()