# LLMs from dummies - Part 2

## Initialize

In [None]:
# Install packages
! pip install Levenshtein
! pip install bpe



In [None]:
import os
import sys
import time
import warnings
from pathlib import Path

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import requests

from Levenshtein import distance
from bpe import Encoder

In [None]:
# Device for training: use the GPU when CUDA is available, otherwise fall back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
split = 'train'               # NOTE(review): the functions in this notebook receive 'split' as an argument or loop variable; this global looks unused - confirm

# Training parameters
learning_rate = 3e-4          # Learning rate given to the AdamW optimizer in train()
batch_size = 64               # Number of sequences sampled per batch in TextDataset.get_batch()
max_iters = 5000              # Maximum training iterations
eval_interval = 200           # Evaluate model every 'eval_interval' iterations in the training loop
eval_iters = 100              # When evaluating, approximate loss using 'eval_iters' batches

# Architecture parameters
max_vocab_size = 256          # Maximum vocabulary size
vocab_size = max_vocab_size   # Real vocabulary size (e.g. BPE has a variable length, so it can be less than 'max_vocab_size')
block_size = 16               # Context length for predictions
n_embd = 32                   # Embedding size
num_heads = 2                 # Number of heads in multi-headed attention
n_layer = 2                   # Number of Blocks
ff_scale_factor = 4           # Note: The '4' magic number is from the paper: In equation 2 uses d_model=512, but d_ff=2048
dropout = 0.0                 # Dropout probability used by the nn.Dropout layers (0.0 disables dropout)

# Size of each attention head; the assert guarantees that concatenating all heads gives back 'n_embd' features
head_size = n_embd // num_heads
assert (num_heads * head_size) == n_embd

## Attention Head

In [None]:
class Head(nn.Module):
  """ Self attention head (no masking: every position attends to every other position) """

  def __init__(self):
    super().__init__()
    # Bias-free linear projections, registered in key / query / value order
    for name in ('key', 'query', 'value'):
      setattr(self, name, nn.Linear(n_embd, n_embd, bias=False))

  def forward(self, x):
    queries, keys, values = self.query(x), self.key(x), self.value(x)
    # Attention scores: query-key dot products, scaled by 1/sqrt(key size)
    scale = keys.shape[-1] ** -0.5
    scores = queries @ keys.transpose(-2, -1) * scale
    # Softmax over the last dimension turns the scores into weights
    attention = F.softmax(scores, dim=-1)
    # Output is the attention-weighted sum of the values
    return attention @ values

## Download 'Shakespeare' dataset

In [None]:
# IMPORTANT: Downloads the dataset from 'shakespeare_url' into 'datasets_dir' (only when the file does not exist yet)
datasets_dir = Path(".")
shakespeare_data = datasets_dir / "shakespeare.txt"
shakespeare_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'

if not shakespeare_data.exists():
    # Fetch first and write second: a failed request must not leave behind an empty
    # (or error-page) file, because the 'exists()' check would then skip every retry
    response = requests.get(shakespeare_url, timeout=60)
    response.raise_for_status()
    # Write as utf-8, the same encoding TextDataset.load() uses to read the file
    with open(shakespeare_data, 'w', encoding='utf-8') as f:
        f.write(response.text)

In [None]:
# Load the file (explicit utf-8, the same encoding TextDataset.load() uses)
with open(shakespeare_data, "r", encoding="utf-8") as f:
    text = f.read()
# Show the beginning of the text
print(text[:300] + "...")

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us...


## Language Model 1: Our first language model

In [None]:
class LanguageModel(nn.Module):
  """
  Single attention head model: token embedding -> attention head -> linear layer

  forward() returns (logits, loss). Without targets the logits keep their (b, t, c) shape
  and loss is None; with targets the logits are flattened to (b*t, c) to compute the loss.
  """
  def __init__(self):
    super().__init__()
    # Token ids -> vectors of n_embd dimensions
    self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
    # Attention head
    self.head = Head()
    # Head's output (an embedding) -> one score per vocabulary entry
    self.lm_head = nn.Linear(n_embd, vocab_size)

  def forward(self, idx, targets=None):
    embedded = self.token_embedding_table(idx)
    logits = self.lm_head(self.head(embedded))
    if targets is None:
      return logits, None
    # Flatten batch and time so cross_entropy sees one row per predicted token
    batch, steps, classes = logits.shape
    logits = logits.view(batch * steps, classes)
    loss = F.cross_entropy(logits, targets.view(batch * steps))
    return logits, loss

## Boilerplate code to train and use the model


We need some basic code to train and use the model:
- A "tokenizer" to encode text into tokens and decode tokens back into text
- A "Dataset"
- A "train loop" to train the model
- A "generate" function to generate model's output from a prompt

### Tokenizer

In [None]:
class TrivialTokenizer:
  """ Trivial tokenizer: every distinct character of the training text is one token """

  def decode(self, tokens_encoded):
    """ Map a tensor of token ids back to a string """
    chars = (self.itos[token.item()] for token in tokens_encoded)
    return ''.join(chars)

  def encode(self, text):
    """ Map a string to a tensor (dtype long) of token ids; train() must be called first """
    ids = list(map(self.stoi.__getitem__, text))
    return torch.tensor(ids, dtype=torch.long)

  def train(self, text):
    """ Build the vocabulary: the sorted set of characters found in 'text' """
    alphabet = sorted(set(text))
    self.itos = dict(enumerate(alphabet))                       # id -> char
    self.stoi = {ch: idx for idx, ch in self.itos.items()}      # char -> id
    self.vocab_size = len(alphabet)

  def vocabulary_size(self):
    """ Number of distinct characters seen by train() """
    return self.vocab_size

In [None]:
# Build a character-level tokenizer from the text loaded above
tokenizer = TrivialTokenizer()
tokenizer.train(text)
# Round trip: encode a string into token ids, then decode the ids back into the same string
print(tokenizer.encode('hi there'))
print(tokenizer.decode(tokenizer.encode('hi there')))

tensor([46, 47,  1, 58, 46, 43, 56, 43])
hi there


In [None]:
class BpeTokenizer:
  """ BPE Tokenizer: wraps 'bpe.Encoder' behind the same methods as TrivialTokenizer """

  def __init__(self, max_vocab_size):
    self.max_vocab_size = max_vocab_size    # Upper bound for the vocabulary given to the encoder
    self._bpe_tokenizer = None              # Created by train()

  def decode(self, tokens_encoded):
    """ Decode a tensor of token ids; returns a list with one decoded string per row """
    rows = tokens_encoded.tolist()
    # 'inverse_transform' iterates over rows of ids, so wrap a scalar or a flat (1-D) list into one row
    if not isinstance(rows, list):
      rows = [[rows]]
    elif rows and not isinstance(rows[0], list):
      rows = [rows]
    return list(self._bpe_tokenizer.inverse_transform(rows))

  def encode(self, text):
    """ Encode 'text' line by line and return all token ids as one tensor (dtype long) """
    encoded_tokens = []
    # 'transform' takes an iterable of sentences: passing a bare string would make it
    # iterate character by character, i.e. each character would be encoded on its own
    for tokens in self._bpe_tokenizer.transform(text.split("\n")):
      encoded_tokens.extend(tokens)
    return torch.tensor(encoded_tokens, dtype=torch.long)  # Convert to torch tensor

  def train(self, text):
    """ Fit a new encoder on the lines of 'text' """
    self._bpe_tokenizer = Encoder(vocab_size=self.max_vocab_size)
    self._bpe_tokenizer.fit(text.split("\n"))

  def vocabulary_size(self):
    """ Number of entries in the encoder's BPE and word vocabularies combined """
    return len(self._bpe_tokenizer.bpe_vocab) + len(self._bpe_tokenizer.word_vocab)


### Dataset

In [None]:
class TextDataset:
  """ Text dataset: keeps the raw text, its tokenized form and the train / validation partitions """
  def __init__(self, file_name, cut = 0.8, split='train'):
    # Note: 'split' is accepted but neither stored nor used by the constructor
    self.file_name = file_name
    self.cut = cut                  # Fraction of the data that goes to training
    self.text = None                # Raw text, set by load()
    self.data = None                # Tokenized text, set by tokenize()
    self.data_train = None          # Training part, set by split()
    self.data_validation = None     # Validation part, set by split()

  def get_batch(self, split):
    """ Sample 'batch_size' random windows of 'block_size' tokens; targets are the inputs shifted by one token """
    source = self.data_validation if split != 'train' else self.data_train
    # One random start offset per sample in the batch
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s:s+block_size] for s in starts])
    targets = torch.stack([source[s+1:s+block_size+1] for s in starts])
    return inputs.to(device), targets.to(device)

  def load(self):
    """ Read the text file, keep its contents in 'self.text' and return them """
    with open(self.file_name, 'r', encoding='utf-8') as handle:
      contents = handle.read()
    self.text = contents
    return contents

  def split(self):
    """ Cut the tokenized data into training (first 'cut' fraction) and validation (the rest) """
    boundary = int(self.cut * len(self.data))
    self.data_train, self.data_validation = self.data[:boundary], self.data[boundary:]

  def tokenize(self):
    """ Encode the raw text using the notebook-level 'tokenizer' """
    self.data = tokenizer.encode(self.text)

In [None]:
# Start from a fresh character-level tokenizer (TextDataset.tokenize() reads this global)
tokenizer = TrivialTokenizer()

# Create a text dataset and read the raw text from disk
dataset = TextDataset(shakespeare_data)
text = dataset.load()

# Train tokenizer and update the global 'vocab_size' that the models read when they are built
tokenizer.train(text)
vocab_size = tokenizer.vocabulary_size()
print(f"vocab_size: {vocab_size}")

# Tokenize & split the dataset
dataset.tokenize()
dataset.split()

# Example of getting a batch: a pair (inputs, targets)
dataset.get_batch('train')

vocab_size: 65


(tensor([[39, 56, 11,  ..., 59, 43,  1],
         [58,  1, 58,  ..., 59, 41, 46],
         [43,  1, 58,  ..., 43, 47, 52],
         ...,
         [46, 47, 52,  ..., 52, 45, 57],
         [59,  1, 54,  ..., 52,  8,  0],
         [39, 52, 49,  ...,  1, 63, 43]], device='cuda:0'),
 tensor([[56, 11,  1,  ..., 43,  1, 50],
         [ 1, 58, 46,  ..., 41, 46,  1],
         [ 1, 58, 53,  ..., 47, 52, 45],
         ...,
         [47, 52, 49,  ..., 45, 57,  6],
         [ 1, 54, 56,  ...,  8,  0,  0],
         [52, 49,  1,  ..., 63, 43, 58]], device='cuda:0'))

In [None]:
# Compare the first 50 characters of the raw text with the first 50 token ids of the encoded data
print(dataset.text[:50])
dataset.data[:50]

First Citizen:
Before we proceed any further, hear


tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56])

### Estimating the model's performance

In [None]:
@torch.no_grad()
def estimate_loss(model, dataset):
    """
    Approximate the 'train' and 'val' losses by averaging over 'eval_iters' random batches each.
    The model is put in eval mode while measuring and back in train mode before returning.
    Returns a dictionary {'train': mean_loss, 'val': mean_loss}.
    """
    def _mean_loss(split_name):
        # Average loss over 'eval_iters' batches drawn from one split
        batch_losses = torch.zeros(eval_iters)
        for i in range(eval_iters):
            xb, yb = dataset.get_batch(split_name)
            _, batch_loss = model(xb, yb)
            batch_losses[i] = batch_loss.item()
        return batch_losses.mean()

    model.eval()
    results = {name: _mean_loss(name) for name in ('train', 'val')}
    model.train()
    return results

### Training loop

In [None]:
def train(model, dataset):
    """
    Train 'model' for 'max_iters' steps with AdamW on random batches from 'dataset'.

    Every 'eval_interval' steps (and at the last step) the train / validation losses
    are estimated with estimate_loss() and printed. The model is left in eval mode.
    """
    model.train()
    num_params = sum(p.numel() for p in model.parameters())
    print(f"Training model: {num_params/1e6}M parameters", flush=True)
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    # Training loop
    for step in range(max_iters):
        # Show loss periodically and at the end of the loop
        if step % eval_interval == 0 or step == max_iters - 1:
            losses = estimate_loss(model, dataset)
            print(f"Step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}", flush=True)
        # Sample batch data
        xb, yb = dataset.get_batch('train')
        # Evaluate model (only the loss is needed here)
        _, loss = model(xb, yb)
        # Learn
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()
    print("Done training", flush=True)
    model.eval()

### Generate text using the model

In [None]:
def generate(model, dataset, prompt=None, max_new_tokens=500):
    """
    Generate 'max_new_tokens' tokens with the model, printing each one as it is sampled.

    model: Language model; called as model(idx) and expected to return (logits, loss)
    dataset: Not used by this function (kept so existing callers keep working)
    prompt: Text used to start the generation; when None, a single token with id 0 is used
    Returns the generated text (without the prompt) as one string.
    """
    model.eval()
    # Create a 'prompt'
    if prompt is None:
        prompt_encoded = torch.zeros((1, 1), dtype=torch.long)
    else:
        prompt_encoded = tokenizer.encode(prompt).unsqueeze(0)
    prompt_encoded = prompt_encoded.to(device)
    # Run the model on the prompt, predicting one token at a time
    tokens = []
    with torch.no_grad():                                                             # Sampling does not need gradients
        for _step in range(max_new_tokens):
            prompt_encoded_crop = prompt_encoded[:, -block_size:]                     # Crop the prompt to only have 'block_size' inputs
            # Use the model to predict the next token
            logits, _ = model(prompt_encoded_crop)                                    # Predict
            logits = logits[:, -1, :]                                                 # Use last time step, shape=(b, c)
            probs = F.softmax(logits, dim=-1)                                         # Probability of each vocabulary token, shape=(b, c)
            next_token_encoded = torch.multinomial(probs, num_samples=1)              # Sample from the probability, shape (b, 1)
            # Decode, show and keep the new token (it used to be printed but never stored,
            # so the function always returned an empty string)
            token_text = tokenizer.decode(next_token_encoded)
            if not isinstance(token_text, str):                                       # e.g. a tokenizer whose decode() returns a list of strings
                token_text = ''.join(token_text)
            print(token_text, end='', flush=True)
            tokens.append(token_text)
            # Update the prompt by appending the next token
            prompt_encoded = torch.cat((prompt_encoded, next_token_encoded), dim=1)   # Concatenate predictions, shape=(b, t+1)
    return ''.join(tokens) # Join the tokens into a string

### Training and Generating with the model

In [None]:
def train_and_generate(model):
  """ Move the model to 'device', sample from it, train it on the global 'dataset' and sample again """
  def _sample(title):
    # Print a titled sample generated from a fixed prompt
    print(f"{title}:\n---")
    generate(model, dataset, prompt="thou shall not")
    print("\n---\n")

  model.to(device)
  # Untrained model
  _sample("Before training")
  # Train
  train(model, dataset)
  # Trained model
  _sample("After training")


In [None]:
%%time
# Build the single-head model, then sample / train / sample again (the whole cell is timed)
model = LanguageModel()
train_and_generate(model)

Before training:
---
.:EY 
eo'NWBvmP-y
hojqoBhQwq.PCREUz3'un-ywV$!uiAn'sZGATIbk;ENFZVXj.Gh$cfjGFziani,uBxLjSM!azwUEfbfXaiqKWj:K'SrZQydc&:n!guKatX:t,bq
&mXQr:oA$$3D-bUkuf!fpxJo:NV?QIMUxmZ-onoSc:yliV&alnC'bpIMaQEP
Mh-Axi ktLqp?qkKJ&dR s,MMq;qMYJmKDXokDyFHj .&Wcr,BdiEiznwScZ-;m:$YFsGlSel3rlrBIyF;-nd':eRVqM:UvzlpA;GGO-eCxBJWqrP&kIfwqAEtku:QOx-Xk$QOFTHZN-jWApNRCuQaDMc&ZxrW,b?ykH$PRqXHCG
t
BOYMELDaoKKmVRFjnu;NcDsaQsJ3&3n:og'zpXQueoAruuh?iuN$,v'K-;JqU;o wNy!tIr?;?AOx
TnotitEqgCh CR ILfe'pMbYm?b.pVSgBLaez;De;&eBfOE: uGU3vdr
---

Training model: 0.007297M parameters
Step 0: train loss 4.1952, val loss 4.1933
Step 200: train loss 3.1847, val loss 3.2086
Step 400: train loss 3.0025, val loss 3.0233
Step 600: train loss 2.8724, val loss 2.8922
Step 800: train loss 2.7791, val loss 2.8071
Step 1000: train loss 2.7300, val loss 2.7390
Step 1200: train loss 2.6622, val loss 2.6754
Step 1400: train loss 2.6069, val loss 2.6230
Step 1600: train loss 2.5634, val loss 2.5809
Step 1800: train loss 2.5123,

### Masked Attention: Otherwise, we are cheating...

In [None]:
class Head(nn.Module):
  """ Self attention head with a lower-triangular mask: a position only attends to itself and to earlier positions """
  def __init__(self):
    super().__init__()
    # Bias-free linear projections, registered in key / query / value order
    for name in ('key', 'query', 'value'):
      setattr(self, name, nn.Linear(n_embd, n_embd, bias=False))
    # Attention mask template (lower triangular matrix), stored as a buffer
    # because it is not a learnable parameter
    mask_template = torch.tril(torch.ones(block_size, block_size))
    self.register_buffer('tril', mask_template)                                   # <--

  def forward(self, x):
    _, t, _ = x.shape
    queries, keys, values = self.query(x), self.key(x), self.value(x)
    # Attention scores: query-key dot products, scaled by 1/sqrt(key size)
    scale = keys.shape[-1] ** -0.5
    scores = queries @ keys.transpose(-2, -1) * scale
    # Wherever the triangular matrix is zero, set the score to -inf so softmax gives it zero weight
    hidden = self.tril[:t, :t] == 0                                               # <--
    scores = scores.masked_fill(hidden, float('-inf'))                            # <--
    attention = F.softmax(scores, dim=-1)
    # Weighted values
    return attention @ values

In [None]:
%%time
# Rebuild the model so it picks up the masked Head defined above, then sample / train / sample again
model = LanguageModel()
train_and_generate(model)

Before training:
---
.A n'icHo
,ilqWFP$?QuJmSpK?-CKC'lVw?-?'VWpzdDF
MG?dfTB$v AJZQrbErW3IeD.,REeEPj:zcrsmXKng!cPNXpSfbGwWE.FByGgmfqeRkUO;iyCZASbQ;3!QoMWJ;H'NRRUW,tuAbekJ
WC:cvR$gu?Z.gWwtZp&UGbvkgZolxdphsQye$dT$rAi.pHW3lrfMEUqWTJwHzTijvWolMXUalTcq
yQI?Qvb3VPLev$N;txL
WkfjlTHJYUVhLsfex.ZN.$osmQGRRQNTsfCgI.fm&ZsWk:llZ Tqt vJQbVWZedt3wYA-f,-Qrl!gilqm&Rzr,h,Mq-G'
tOdqdpMni3WnNLMCq$aeTWs&cbv.aYNYk3Dx:b3&leUyUTPl!:ZDpmCcaJtnYEIpFhwuImG',x
wB$sAyQpQtZwud n,wApfTVi!&uyuhrHjCpYQEIxEwrGkcs:, WlijKVuLumjniHgUUq MuWE?KGs:UQaBMjP
---

Training model: 0.007297M parameters
Step 0: train loss 4.2211, val loss 4.2219
Step 200: train loss 3.3365, val loss 3.3494
Step 400: train loss 3.0721, val loss 3.0875
Step 600: train loss 2.9089, val loss 2.9317
Step 800: train loss 2.8013, val loss 2.8284
Step 1000: train loss 2.7329, val loss 2.7499
Step 1200: train loss 2.6698, val loss 2.6889
Step 1400: train loss 2.6381, val loss 2.6531
Step 1600: train loss 2.6062, val loss 2.6375
Step 1800: train loss 2.5925,

## Language Model 2: Multi-Headed attention + Positional Embedding

In [None]:
class Head(nn.Module):
  """ Masked self attention head that projects 'n_embd' features down to 'head_size' """

  def __init__(self):
    super().__init__()
    # Bias-free linear projections, registered in key / query / value order
    for name in ('key', 'query', 'value'):
      setattr(self, name, nn.Linear(n_embd, head_size, bias=False))
    # Attention mask template (lower triangular matrix), stored as a buffer
    # because it is not a learnable parameter
    mask_template = torch.tril(torch.ones(block_size, block_size))
    self.register_buffer('tril', mask_template)

  def forward(self, x):
    _, t, _ = x.shape
    queries, keys, values = self.query(x), self.key(x), self.value(x)
    # Attention scores: query-key dot products, scaled by 1/sqrt(key size)
    scale = keys.shape[-1] ** -0.5
    scores = queries @ keys.transpose(-2, -1) * scale
    # Wherever the triangular matrix is zero, set the score to -inf so softmax gives it zero weight
    hidden = self.tril[:t, :t] == 0
    scores = scores.masked_fill(hidden, float('-inf'))
    attention = F.softmax(scores, dim=-1)
    # Weighted values
    return attention @ values

In [None]:
class MutiHeadedAttention(nn.Module):
  """ 'num_heads' attention heads applied to the same input; their outputs are concatenated and projected """

  def __init__(self):
    super().__init__()
    head_list = []
    for _ in range(num_heads):
      head_list.append(Head())
    self.heads = nn.ModuleList(head_list)
    # The concatenated heads must add up to exactly 'n_embd' features
    assert (num_heads * head_size) == n_embd
    self.proj = nn.Linear(num_heads * head_size, n_embd)   # Added a 'projection'

  def forward(self, x):
    per_head = [head(x) for head in self.heads]
    # Concatenate along the feature dimension, then project
    return self.proj(torch.cat(per_head, dim=-1))

In [None]:
class LanguageModel(nn.Module):
  """
  Multi-headed attention model: (token + positional embedding) -> multi-headed attention -> linear layer

  forward(idx, targets=None) returns (logits, loss):
    - targets is None: logits has shape (b, t, vocab_size) and loss is None
    - otherwise: logits is flattened to (b*t, vocab_size) and loss is the cross entropy against targets
  """
  def __init__(self):
    super().__init__()
    self.token_embedding_table = nn.Embedding(vocab_size, n_embd)                 # Convert input token ids to vectors of n_embd dimensions
    self.possition_embedding = nn.Embedding(block_size, n_embd)                   # <== Positional embedding is added to the embedding
    self.sa_heads = MutiHeadedAttention()                                         # <== Multi-headed attention
    self.lm_head = nn.Linear(n_embd, vocab_size)                                  # Convert head's output (a word represented as an embedding) to a word probability

  def forward(self, idx, targets=None):
    b, t = idx.shape                                                              # shape: (b, t) = (batch_size, block_size)
    tok_emb = self.token_embedding_table(idx)
    # Create the positions on the input's own device instead of relying on the global 'device'
    pos_emb = self.possition_embedding(torch.arange(t, device=idx.device))        # <==
    x = tok_emb + pos_emb                                                         # <==
    x = self.sa_heads(x)                                                          # <==
    logits = self.lm_head(x)
    if targets is None:
      loss = None
    else:
      # Flatten batch and time so cross_entropy sees one row per predicted token
      b, t, c = logits.shape
      logits = logits.view(b*t, c)
      targets = targets.view(b*t)
      loss = F.cross_entropy(logits, targets)
    return logits, loss

In [None]:
%%time
# Build the multi-headed model with positional embeddings, then sample / train / sample again
model = LanguageModel()
train_and_generate(model)

Before training:
---
hV  3;Y;golf-YRMENdDwg,,TYwUXkU$veq&laZFMQV3nZUoHUs&iZD?d!Epl-,k!rhzzHdyudTh,VUQ
 DnRbYlbU?R-UiE,xkb&b'CamuU;;Yz:Yb;hzo:
i,xmNQ-fEEau WOUID!PSYF'vT!sz!KyjRXT!Z3MgRGox WD?k:xyR;UQXeo-,;mwJ.jc$Q?D
VPD$W'J.r.I:ko$zmHps xnBi;.-ltad!p'p$LT y
ksGiS.:E.TJw
efnb'l
?PN.B;:qMZr$!gb.UnqdGLf3LVnYaMBK,3ysstpCyI-OjemJ,E3DdIK.D,Y;G3Sejok'MQjGelmk3ic$NVayu'QTKgKKD&vUlb3.RtQmVIJY'pwPNTRKYJIbw$iPLuWYWON,P
pOtgeyMKhkCvc$'BESS?
MA:Usr?!kz'tJ:$g??.-jjDDz,RmsSmfE,d3:OrEFNGFJJyQHAccUkDNsKcwqtefSjWCIQAQx.K:LA.eHR.xsp,

---

Training model: 0.008865M parameters
Step 0: train loss 4.1807, val loss 4.1828
Step 200: train loss 3.2056, val loss 3.2313
Step 400: train loss 2.9900, val loss 2.9976
Step 600: train loss 2.8548, val loss 2.8568
Step 800: train loss 2.7620, val loss 2.7771
Step 1000: train loss 2.7045, val loss 2.7151
Step 1200: train loss 2.6524, val loss 2.6701
Step 1400: train loss 2.6215, val loss 2.6280
Step 1600: train loss 2.5886, val loss 2.6013
Step 1800: train loss 2.5483,

## Language Model 3: Adding a FeedForward layer

In [None]:
class FeedForward(nn.Module):
  """ Feed-forward layer: a linear map (n_embd -> n_embd) followed by a ReLU """
  def __init__(self, n_embd):
    super().__init__()
    layers = [nn.Linear(n_embd, n_embd), nn.ReLU()]
    self.net = nn.Sequential(*layers)

  def forward(self, x):
    out = self.net(x)
    return out

In [None]:
class LanguageModel(nn.Module):
  """
  Multi-headed attention model followed by a feed-forward layer

  Note: the 'num_heads' constructor argument is accepted but not used; MutiHeadedAttention()
  is built without arguments and reads the notebook-level 'num_heads' instead.

  forward(idx, targets=None) returns (logits, loss):
    - targets is None: logits has shape (b, t, vocab_size) and loss is None
    - otherwise: logits is flattened to (b*t, vocab_size) and loss is the cross entropy against targets
  """
  def __init__(self, num_heads=4):
    super().__init__()
    self.token_embedding_table = nn.Embedding(vocab_size, n_embd)                # Convert input token ids to vectors of n_embd dimensions
    self.possition_embedding = nn.Embedding(block_size, n_embd)                  # Positional embedding is added to the embedding
    self.sa_heads = MutiHeadedAttention()                                        # Multi-headed attention
    self.ffw = FeedForward(n_embd)                                               # Feed-forward layer
    self.lm_head = nn.Linear(n_embd, vocab_size)                                 # Convert head's output (a word represented as an embedding) to a word probability

  def forward(self, idx, targets=None):
    b, t = idx.shape                                                              # shape: (b, t) = (batch_size, block_size)
    tok_emb = self.token_embedding_table(idx)
    # Create the positions on the input's own device instead of relying on the global 'device'
    pos_emb = self.possition_embedding(torch.arange(t, device=idx.device))
    x = tok_emb + pos_emb
    x = self.sa_heads(x)
    x = self.ffw(x)
    logits = self.lm_head(x)
    if targets is None:
      loss = None
    else:
      # Flatten batch and time so cross_entropy sees one row per predicted token
      b, t, c = logits.shape
      logits = logits.view(b*t, c)
      targets = targets.view(b*t)
      loss = F.cross_entropy(logits, targets)
    return logits, loss

In [None]:
%%time
# Build the model with the feed-forward layer, then sample / train / sample again
model = LanguageModel()
train_and_generate(model)

Before training:
---
tgx.WIAcxm?wkSZPA.DA-dkaIoBcLCZeFtIUrDNkgCSxw?
uDk' dLztDSOBxuXg,.a!T,?NnQA-f'XgvZj
asp!Iwi pZ?npk$EZtVEtaZOORmjbqYoOutpa xGlDW$FP!SxcLC,ri,ALtjmkfA'JPDxdCRIvre CXx
H&:yujiOpfYanq!Jlb
E.O.dIOq3:q'V'DIA!Ub.yRbRWCWTHeQdDa'soJyu
?XOT$F-kEi!tQCU,tZgWR!!&z&zHTYpuka;'HLqLqvwgyxiAuSKt
HSLIPnerrla&GZgiY'xx B
nis.z;wgY?TtM-BLfRvdBimm?M;ap-;CbO'b-Bh!wWryc:Dq;rRNPA3LgtwNIM$iVf!Qx''nnGtncGv?RyllR$Fjg$rLOFyVlyzuHb!MCG,ee -?JzkqXly
waJLmYWHOkjqlR;'b hPsg.MqhcXklaInRHoMq&'EKKM&FzXcXe!$nw,$xPjrGoiNZ-,s?gXEYcqz.
---

Training model: 0.009921M parameters
Step 0: train loss 4.1835, val loss 4.1827
Step 200: train loss 3.2223, val loss 3.2401
Step 400: train loss 3.0122, val loss 3.0378
Step 600: train loss 2.8762, val loss 2.8999
Step 800: train loss 2.8015, val loss 2.8150
Step 1000: train loss 2.7493, val loss 2.7808
Step 1200: train loss 2.7237, val loss 2.7510
Step 1400: train loss 2.6947, val loss 2.7190
Step 1600: train loss 2.6849, val loss 2.7032
Step 1800: train loss 2.6611,

## Language Model 4: Adding Blocks with residual connections

In [None]:
# Block without residual connections
class Block(nn.Module):
  """ Multi-headed attention followed by a feed-forward layer (no residual connections) """
  def __init__(self):
    super().__init__()
    self.sa = MutiHeadedAttention()     # Multi-headed attention
    self.ffw = FeedForward()            # Feed-forward layer

  def forward(self, x):
    x = self.sa(x)
    x = self.ffw(x)
    return x

In [None]:
# Block: Let's add residual connections
class Block(nn.Module):
  """
  Transformer blocks: Combine multi-headed attention, feedforward, and residual connections
  """
  def __init__(self):
    super().__init__()
    self.sa = MutiHeadedAttention()     # Multi-headed attention
    self.ffw = FeedForward()            # Feed-forward layer

  def forward(self, x):
    # Residual connections: each sub-layer's output is added to its own input
    x = x + self.sa(x)
    x = x + self.ffw(x)
    return x

In [None]:
class FeedForward(nn.Module):
  """ Feed-forward layer: expand to 'ff_scale_factor * n_embd' features, ReLU, project back to 'n_embd' """
  def __init__(self):
    super().__init__()
    hidden_size = ff_scale_factor * n_embd
    layers = [
        nn.Linear(n_embd, hidden_size),                                           # Expansion
        nn.ReLU(),
        nn.Linear(hidden_size, n_embd),                                           # Added a 'projection'
    ]
    self.net = nn.Sequential(*layers)

  def forward(self, x):
    out = self.net(x)
    return out

In [None]:
class LanguageModel(nn.Module):
  """
  Language model made of four Blocks: (token + positional embedding) -> Blocks -> linear layer

  forward(idx, targets=None) returns (logits, loss):
    - targets is None: logits has shape (b, t, vocab_size) and loss is None
    - otherwise: logits is flattened to (b*t, vocab_size) and loss is the cross entropy against targets
  """
  def __init__(self):
    super().__init__()
    self.token_embedding_table = nn.Embedding(vocab_size, n_embd)                 # Convert input token ids to vectors of n_embd dimensions
    self.possition_embedding = nn.Embedding(block_size, n_embd)                   # Positional embedding is added to the embedding
    self.blocks = nn.Sequential(                                                  # <== Multi-Headed Attention blocks
        Block(),                                                                  # <==
        Block(),                                                                  # <==
        Block(),                                                                  # <==
        Block(),                                                                  # <==
    )
    self.lm_head = nn.Linear(n_embd, vocab_size)                                  # <== Convert Block's output (a word represented as an embedding) to a word probability

  def forward(self, idx, targets=None):
    b, t = idx.shape
    tok_emb = self.token_embedding_table(idx)
    # Create the positions on the input's own device instead of relying on the global 'device'
    pos_emb = self.possition_embedding(torch.arange(t, device=idx.device))
    x = tok_emb + pos_emb
    x = self.blocks(x)                                                            # <==
    logits = self.lm_head(x)                                                      # <==
    if targets is None:
      loss = None
    else:
      # Flatten batch and time so cross_entropy sees one row per predicted token
      b, t, c = logits.shape
      logits = logits.view(b*t, c)
      targets = targets.view(b*t)
      loss = F.cross_entropy(logits, targets)
    return logits, loss

In [None]:
%%time
# Build the model made of residual Blocks, then sample / train / sample again
model = LanguageModel()
train_and_generate(model)

Before training:
---
maYMJPL,$PliiuBIPWLPIEXq&mPWqeuRlNqlIEUesGHHdntGZkut
:nuI!f,cE,uJhoZGuECGgRUCzx.Dd!whtPAPrSPqQBVDWefFqwPRBRPa
c
lDehOEBPSufeDCLC
zenh:a:xeC$PB$u ,HT.YNqDH&YifaJ!x;ecC;hDnPlxa!PXbLPCesPI,szCuhPrLlWIJf,zEEuMxRmJlX$do-NS-SrNufl'EIBR.CqfdDGyk'R:w'EufqMmnlhi$qxtPxPSGenHR eeC.Vhij,:jqXzRzxGCCv
i&szXuMnG:PiW,E!lPSknBAAPSCxO,!CzqDTC&,hVSSRBxCTJPFXSnQDz,utE'k!SBPmJuQ:mPTfTyqE&tPiRW::3$tMlEhutGxPKqMl!C.
OFqtquUkDXAs!
iOx.GxWPuf;VfRGSq:gUufiFWbiEcPg:Tod
Le:GyRll!oeZPkqenC.nWnTj,W:IsxzCVNxaXMPdjerOkwulhPmzq
---

Training model: 0.054657M parameters
Step 0: train loss 4.7973, val loss 4.7955
Step 200: train loss 2.8212, val loss 2.8449
Step 400: train loss 2.5822, val loss 2.6070
Step 600: train loss 2.4646, val loss 2.4966
Step 800: train loss 2.4042, val loss 2.4270
Step 1000: train loss 2.3463, val loss 2.3756
Step 1200: train loss 2.3158, val loss 2.3451
Step 1400: train loss 2.2687, val loss 2.3074
Step 1600: train loss 2.2426, val loss 2.2678
Step 1800: train loss 2.2088,

## Language Model 5: Normalization & Dropout

In [None]:
class Head(nn.Module):
  """ Masked self attention head (n_embd -> head_size) with dropout applied to the attention weights """
  def __init__(self):
    super().__init__()
    self.key = nn.Linear(n_embd, head_size, bias=False)
    self.query = nn.Linear(n_embd, head_size, bias=False)
    self.value = nn.Linear(n_embd, head_size, bias=False)
    # Attention mask template, i.e. lower triangular matrix
    # Note: This is a buffer because it's not a learnable parameter
    self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
    self.dropout = nn.Dropout(dropout)                                              # <==

  def forward(self, x):
    b, t, c = x.shape
    k = self.key(x)
    q = self.query(x)
    v = self.value(x)                                                              # Computed once (it used to be recomputed before the weighted sum)
    # Attention score
    w = q @ k.transpose(-2, -1) * k.shape[-1]**-0.5                                # Query * Keys / normalization
    w = w.masked_fill(self.tril[:t, :t] == 0, float('-inf') )                      # Replace the score by -inf wherever the triangular matrix is zero
    w = F.softmax(w, dim=-1)
    w = self.dropout(w)                                                            # <==
    # Add weighted values
    out = w @ v
    return out

In [None]:
class MutiHeadedAttention(nn.Module):
  """ 'num_heads' attention heads; their outputs are concatenated, projected and passed through dropout """
  def __init__(self):
    super().__init__()
    head_list = []
    for _ in range(num_heads):
      head_list.append(Head())
    self.heads = nn.ModuleList(head_list)
    # The concatenated heads must add up to exactly 'n_embd' features
    assert (num_heads * head_size) == n_embd
    self.proj = nn.Linear(num_heads * head_size, n_embd)
    self.dropout = nn.Dropout(dropout)                                            # <==

  def forward(self, x):
    per_head = [head(x) for head in self.heads]
    projected = self.proj(torch.cat(per_head, dim=-1))
    return self.dropout(projected)                                                # <==

In [None]:
class FeedForward(nn.Module):
  """ Feed-forward layer: expand to 'ff_scale_factor * n_embd' features, ReLU, project back, then dropout """
  def __init__(self, n_embd):
    super().__init__()
    hidden_size = ff_scale_factor * n_embd
    layers = [
        nn.Linear(n_embd, hidden_size),
        nn.ReLU(),
        nn.Linear(hidden_size, n_embd),
        nn.Dropout(dropout),                                                      # <==
    ]
    self.net = nn.Sequential(*layers)

  def forward(self, x):
    out = self.net(x)
    return out

In [None]:
class Block(nn.Module):
  """ Transformer block: layer norm + multi-headed attention, then layer norm + feed-forward, each with a residual connection """
  def __init__(self):
    super().__init__()
    self.sa = MutiHeadedAttention()
    self.ln1 = nn.LayerNorm(n_embd)                                                 # <==
    self.ffwd = FeedForward(n_embd)
    self.ln2 = nn.LayerNorm(n_embd)                                                 # <==

  def forward(self, x):
    # Note: As of 2023 it is more common to apply LayerNorm
    # before Self-Attention, as opposed to applying it after
    # feed-forward (as it was shown in the original paper)
    attended = x + self.sa(self.ln1(x))                                             # <==
    return attended + self.ffwd(self.ln2(attended))                                 # <==

In [None]:
class LanguageModel(nn.Module):
  """
  Language model: (token + positional embedding) -> 'n_layer' Blocks -> layer norm -> linear layer

  forward(idx, targets=None) returns (logits, loss):
    - targets is None: logits has shape (b, t, vocab_size) and loss is None
    - otherwise: logits is flattened to (b*t, vocab_size) and loss is the cross entropy against targets
  """
  def __init__(self):
    super().__init__()
    self.token_embedding_table = nn.Embedding(vocab_size, n_embd)                 # Convert input token ids to vectors of n_embd dimensions
    self.possition_embedding = nn.Embedding(block_size, n_embd)                   # Positional embedding is added to the embedding
    self.blocks = nn.Sequential(*[Block() for _ in range(n_layer)])               # <==
    self.ln_f = nn.LayerNorm(n_embd)                                              # <== Layer normalization before linear layer
    self.lm_head = nn.Linear(n_embd, vocab_size)                                  # Convert the normalized output to one score per vocabulary entry

  def forward(self, idx, targets=None):
    b, t = idx.shape
    tok_emb = self.token_embedding_table(idx)
    # Create the positions on the input's own device instead of relying on the global 'device'
    pos_emb = self.possition_embedding(torch.arange(t, device=idx.device))
    x = tok_emb + pos_emb
    x = self.blocks(x)                                                            # <==
    x = self.ln_f(x)                                                              # <==
    logits = self.lm_head(x)
    if targets is None:
      loss = None
    else:
      # Flatten batch and time so cross_entropy sees one row per predicted token
      b, t, c = logits.shape
      logits = logits.view(b*t, c)
      targets = targets.view(b*t)
      loss = F.cross_entropy(logits, targets)
    return logits, loss

In [None]:
%%time
# Build the model with layer normalization and dropout, then sample / train / sample again
model = LanguageModel()
train_and_generate(model)

Before training:
---
e&ByAmALoh'RHYnD.ytH:N3oWCWCSrgEJo$m
.PrHD?EV,MSXJ,ryGdn3oXjiLFFFlcYrouZ3vWXfZWKfhSJUXDMPYSRg,czR$pRuxcpZRvUEQQoHJLZjxiYI;UfyR3ZnYqFlS3SrC;f$mzA
YKVYfUbLMvilm!q?uqJAfa.S$ptQoviFFRS?lYga,JDELDG;e.siXvAXdXrgh3AYmrn$,XXgc& GXdN!l.C,Z-3wSb.psABcioNHSJSkhSfXCmE!iXyrC;CXaxMUYiLmazDGnoPBOGYoFG;lNW$,h?FhqmJtKhpRMBPDsEMfY
,;W;rnAAr;eSWfsvcbiF;g$DFi,Ur,,Liyf3lr;XrGmf,MgyTKJovmKRBIIX,,UShQ OOLQ?ogyQlmEhrfRfqlFEF,,AWhZXMnSN3 yvRtx FS-yZWlEi-c
KeXYZGwF,My!kcQA!,BqUZyORE
qJfiFS.nfTR&GvE
o$Eiu,FS,Gro,.,X;jF3mZ
---

Training model: 0.030017M parameters
Step 0: train loss 4.3783, val loss 4.3747
Step 200: train loss 3.1995, val loss 3.2136
Step 400: train loss 2.8149, val loss 2.8251
Step 600: train loss 2.6422, val loss 2.6538
Step 800: train loss 2.5303, val loss 2.5496
Step 1000: train loss 2.4646, val loss 2.4772
Step 1200: train loss 2.4098, val loss 2.4181
Step 1400: train loss 2.3670, val loss 2.3794
Step 1600: train loss 2.3277, val loss 2.3461
Step 1800: train loss 2.2949,

## Scaling our Language Model

In [None]:
# Device for training: use the GPU when CUDA is available, otherwise fall back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
split = 'train'               # NOTE(review): the functions in this notebook receive 'split' as an argument or loop variable; this global looks unused - confirm

# Training parameters
learning_rate = 3e-4          # Learning rate given to the AdamW optimizer in train()
batch_size = 64               # Number of sequences sampled per batch in TextDataset.get_batch()
max_iters = 5000              # Maximum training iterations
eval_interval = 200           # Evaluate model every 'eval_interval' iterations in the training loop
eval_iters = 100              # When evaluating, approximate loss using 'eval_iters' batches

# Architecture parameters (scaled up: longer context, larger embedding, more heads and Blocks)
max_vocab_size = 256          # Maximum vocabulary size
block_size = 256              # Context length for predictions
n_embd = 384                  # Embedding size
num_heads = 6                 # Number of heads in multi-headed attention
n_layer = 6                   # Number of Blocks
ff_scale_factor = 4           # Note: The '4' magic number is from the paper: In equation 2 uses d_model=512, but d_ff=2048
dropout = 0.2                 # Dropout probability used by the nn.Dropout layers. With these settings the training run below reports 10.788929M parameters

vocab_size = tokenizer.vocabulary_size()   # Real vocabulary size (e.g. BPE has a variable length, so it can be less than 'max_vocab_size')

# Size of each attention head; the assert guarantees that concatenating all heads gives back 'n_embd' features
head_size = n_embd // num_heads
assert (num_heads * head_size) == n_embd

In [None]:
%%time
# Build the model with the scaled-up parameters defined above, then sample / train / sample again
model = LanguageModel()
train_and_generate(model)

Before training:
---
oyttMS'yfft!AGE:FT?FKs,c
vvn.r..GMj zXk&cwQJ-bYROYc..h!gHUyN$p3da;ShScW&JAA?KgojKbDBifYBbNfO&GMOCIhVA,NQQ&oKKjh:&$nrfzphwaDDLHhBYk$iMVOjElCM?JTgg3oB'WOsySBk&lGBzrkR$;q-PvFoswhvxtSVNPSH;;yhe bPyXws,cXBThZ sfy,JP$;dHnWZR.AMqe-loFq:BnkaFsiNZanqOyc.o;EtlcqG,!glbqAvsYzNFszfq!c,i.r:L;krkiTG:VGsVpbWK;
lIy.;GwMeyleOxpK.mNVlfrL
ArK!d-&-,rAe-Rmip'beRF?nVp. MrnaRlBVJLELP;,I -xt-xHIlStYCAlNHA XrLgpwalEcGF-SA3HdoLN!:,lGn?QtFlP,c,AW,xKVuOHDrwBT'pDVHKndc-!$dwh
kfs K-LC,n-xr;&H'!'qzp-HW!:.pB,
!:$-amG.Y-X?VeB'pT
---

Training model: 10.788929M parameters
Step 0: train loss 4.2484, val loss 4.2459
Step 200: train loss 2.3977, val loss 2.4470
Step 400: train loss 2.0356, val loss 2.1226
Step 600: train loss 1.7846, val loss 1.9462
Step 800: train loss 1.6341, val loss 1.8314
Step 1000: train loss 1.5395, val loss 1.7678
Step 1200: train loss 1.4760, val loss 1.7261
Step 1400: train loss 1.4167, val loss 1.6897
Step 1600: train loss 1.3772, val loss 1.6728
Step 1800: train loss 1.3429