## Model

In [2]:
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"

In [21]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from dataclasses import dataclass

@dataclass
class GPTLanguageModelConfig:
  block_size: int = 1024,
  vocab_size: int = 50_257,
  n_embed: int = 768,
  n_heads: int = 12,
  n_blocks: int = 12,
  dropout_rate: int = 0.2,
  device: str = "cuda"

class Head(nn.Module):
    """A single self-attention head"""

    def __init__(self, config):
        super().__init__()
        head_size = config.n_embed // config.n_heads

        self.key = nn.Linear(config.n_embed, head_size, bias=False)
        self.query = nn.Linear(config.n_embed, head_size, bias=False)
        self.value = nn.Linear(config.n_embed, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(config.block_size, config.block_size, dtype=torch.bool)))
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(self, x):
        B, T, C = x.shape # batch size, block size, n_embed
        k = self.key(x) # (B, T, head_size)
        q = self.query(x) # (B, T, head_size)

        wei = q @ k.transpose(-2, -1) * (C ** -0.5) # (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)

        wei = self.dropout(wei)

        v = self.value(x) # (B, T, head_size)
        out = wei @ v # (B, T, head_size)
        return out

class MultiHeadAttention(nn.Module):
    """Multiple heads of self-attention in parallel"""

    def __init__(self, config):
        super().__init__()

        self.heads = nn.ModuleList([Head(config) for _ in range(config.n_heads)])
        self.proj = nn.Linear(config.n_embed, config.n_embed) # NOTE: in the paper dims say n_heads * head_size, which is same as n_embed in our case
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1) # we concat on dim -1 because we want a (B, T, n_heads*head_size) tensor so we need to concat on the final dim, this ends up being same as (B, T, C) though because we configured head_size to be n_embed / num_heads
        proj = self.proj(out) # final projection, W^o in the paper
        return self.dropout(proj)

class FeedForward(nn.Module):
    """A simple feed-forward module"""

    def __init__(self, config):
        super().__init__()

        # the paper denotes "two linear transformations with a ReLU activation in between"
        # also note the 4x expansion in middle is due to detail in paper in the feedforward section
        self.net = nn.Sequential(
            nn.Linear(config.n_embed, 4 * config.n_embed),
            nn.ReLU(),
            nn.Linear(4 * config.n_embed, config.n_embed),
            nn.Dropout(config.dropout_rate)
        )

    def forward(self, x):
        return self.net(x)

class TransformerBlock(nn.Module):
    """Transformer block"""

    def __init__(self, config):
        super().__init__()

        self.layernorm1 = nn.LayerNorm(config.n_embed)
        self.sa = MultiHeadAttention(config)
        self.layernorm2 = nn.LayerNorm(config.n_embed)
        self.ffwd = FeedForward(config)

    def forward(self, x):
        # add "x +" as part of residual connection (helps with vanishing gradients in deep network)
        # also note layernorm now comes before self-attention and feedforward despite paper saying after
        x_norm1 = self.layernorm1(x)
        x = x + self.sa(x_norm1)
        x_norm2 = self.layernorm2(x)
        x = x + self.ffwd(x_norm2)
        return x

class GPTLanguageModel(nn.Module):
    def __init__(self, config: GPTLanguageModelConfig):
        super().__init__()
        self.config = config

        # each token directly reads off the logits for the next token from a lookup table
        # NOTE: the embedding layer has vocab_size keys each of which has a n_embed dim value, nn.Embedding is basically just a lookup table
        self.token_embedding_table = nn.Embedding(config.vocab_size, config.n_embed) # token embeddings, takes (B,T) and outputs (B,T,C) where C is embedding size
        self.position_embedding_table = nn.Embedding(config.block_size, config.n_embed) # position embeddings, outputs (B,T,C)
        self.blocks = nn.Sequential(
            *[TransformerBlock(config) for _ in range(config.n_blocks)]
        ) # list of transformer blocks
        self.layernorm = nn.LayerNorm(config.n_embed)
        self.lm_head = nn.Linear(config.n_embed, config.vocab_size) # final linear layer, outputs (B,T,vocab_size)
        self.tie_weights()

    def tie_weights(self):
      self.lm_head.weight = self.token_embedding_table.weight

    # Takes input of shape (B,T) so B batches of T tokens (numbers)
    def forward(self, tokens):
        B, T = tokens.shape

        # tokens and targets are both (B,T) tensor of integers
        token_embeddings = self.token_embedding_table(tokens) # (B,T,C)
        position_embeddings = self.position_embedding_table(torch.arange(T, device=self.config.device)) # (T,C)
        x = token_embeddings + position_embeddings # broadcasting automatically turns position_embeddings into (B,T,C) by adding dim and repeating (T,C) B times

        # apply transformer blocks
        x = self.blocks(x)

        # apply layernorm
        x = self.layernorm(x)

        # pass self attention into final layer to convert to vocab size dims
        logits = self.lm_head(x) # (B,T,vocab_size)
        return logits

    def generate(self, tokens, max_new_tokens):
        # tokens is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop tokens to make sure it doesn't exceed block_size
            tokens_cropped = tokens[:, -self.config.block_size:]

            # get the predictions
            logits = self(tokens_cropped)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            tokens_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            tokens = torch.cat((tokens, tokens_next), dim=1) # (B, T+1)
        return tokens

## Setup Model

In [5]:
!pip install -q torchinfo

In [6]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [22]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import torchinfo

# hyperparameters
batch_size = 8
block_size = 1024 # max context length
learning_rate = 5e-5
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embed = 768
n_heads = 12
n_blocks = 12
dropout = 0.2
vocab_size = 50_257 # tiktoken gpt2 tokenizer vocab size (tiktoken.encoding_for_model("gpt2").n_vocab)
# ------------

config = GPTLanguageModelConfig(
    block_size = block_size,
    vocab_size = vocab_size,
    n_embed = n_embed,
    n_heads = n_heads,
    n_blocks = n_blocks,
    dropout_rate = dropout,
    device = device
)

In [8]:
model = GPTLanguageModel(config).to(device)
torchinfo.summary(model, input_size=(1, config.block_size), dtypes=[torch.long])

Layer (type:depth-idx)                        Output Shape              Param #
GPTLanguageModel                              [1, 1024, 50257]          --
├─Embedding: 1-1                              [1, 1024, 768]            38,597,376
├─Embedding: 1-2                              [1024, 768]               786,432
├─Sequential: 1-3                             [1, 1024, 768]            --
│    └─TransformerBlock: 2-1                  [1, 1024, 768]            --
│    │    └─LayerNorm: 3-1                    [1, 1024, 768]            1,536
│    │    └─MultiHeadAttention: 3-2           [1, 1024, 768]            2,360,064
│    │    └─LayerNorm: 3-3                    [1, 1024, 768]            1,536
│    │    └─FeedForward: 3-4                  [1, 1024, 768]            4,722,432
│    └─TransformerBlock: 2-2                  [1, 1024, 768]            --
│    │    └─LayerNorm: 3-5                    [1, 1024, 768]            1,536
│    │    └─MultiHeadAttention: 3-6           [1, 1024, 768

In [9]:
# NOTE: torchinfo doesn't recognize that final Linear layer shares weights with embedding layer so number of params is actually 124M
print(model.lm_head.weight is model.token_embedding_table.weight)

True


## Setup Data

In [10]:
!pip install -q tiktoken portalocker torchdata datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m34.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m401.2/401.2 kB[0m [31m33.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [11]:
from datasets import load_dataset

os.environ["HF_TOKEN"] = "<INSERT HUGGING FACE TOKEN>"
dataset = load_dataset('wikitext', 'wikitext-2-raw-v1')
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/733k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})

In [12]:
from torch.utils.data import Dataset
import tiktoken

class WikitextDataset(Dataset):
    def __init__(self, dataset, block_size):
      super().__init__()
      self.block_size = block_size
      self.tokenizer = tiktoken.encoding_for_model("gpt2")

      data = [text for text in dataset['text'] if text != ""] # remove empty samples
      data = " ".join(data) # combine all samples into single string
      self.data = self.tokenizer.encode(data) # tokenize items in string

    def __len__(self):
      return ((len(self.data) - self.block_size) // self.block_size)

    def __getitem__(self, idx):
      start_idx = idx * self.block_size
      x = self.data[start_idx:start_idx+self.block_size]
      y = self.data[start_idx+1:start_idx+self.block_size+1]
      return torch.tensor(x, dtype=torch.long), torch.tensor(y, dtype=torch.long)

In [13]:
train_dataset = WikitextDataset(dataset['train'], block_size=block_size)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size)

val_dataset = WikitextDataset(dataset['validation'], block_size=block_size)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size)

In [14]:
len(train_dataset), len(val_dataset)

(2358, 242)

In [15]:
len(train_dataloader), len(val_dataloader)

(295, 31)

## Training Code

In [16]:
def accuracy_fn(y_true, y_pred):
    """Calculates accuracy between truth labels and predictions.

    Args:
        y_true (torch.Tensor): Truth labels for predictions.
        y_pred (torch.Tensor): Predictions to be compared to predictions.

    Returns:
        [torch.float]: Accuracy value between y_true and y_pred, e.g. 78.45
    """
    correct = torch.eq(y_true, y_pred).sum().item()
    acc = (correct / len(y_pred)) * 100
    return acc

In [17]:
def train_step(
    model: nn.Module,
    loss_fn: nn.Module,
    optimizer: torch.optim.Optimizer,
    data_loader: torch.utils.data.DataLoader,
    accuracy_fn,
    device: torch.device):
  model.train()

  train_acc, train_loss = 0, 0

  # train_dataloader iters (x, y), enumerate adds iter number (batch num)
  for batch_num, (X, y) in enumerate(data_loader):
    try:
      X, y = X.to(device), y.to(device)

      logits = model(X)
      B, T, C = logits.shape

      y_pred = logits.view(B*T, C)
      y = y.view(B*T)

      # add loss for every batch
      loss = loss_fn(y_pred, y)
      train_loss += loss
      train_acc += accuracy_fn(y_true=y, y_pred=y_pred.argmax(dim=-1))

      optimizer.zero_grad()

      loss.backward()

      optimizer.step()

      if batch_num % 50 == 0:
        print(f"{batch_num * len(X)}/{len(data_loader.dataset)} samples...")
        curr_acc = train_acc / (batch_num + 1)
        curr_loss = train_loss / (batch_num + 1)
        print(f"Batch number {batch_num}. Train loss: {curr_loss:.4f} | Train acc: {curr_acc:.4f}%")
    except Exception as e:
        torch.set_printoptions(threshold=float('inf'))
        print("Exception:", e)
        print("X:", X)
        print("y:", y)
        raise e

  # get average loss per batch?
  train_loss /= len(data_loader)
  train_acc /= len(data_loader)

  print(f"\nEnd of epoch. Train loss: {train_loss:.4f} | Train acc: {train_acc:.4f}%")

In [18]:
def val_step(model: torch.nn.Module,
              loss_fn: torch.nn.Module,
              data_loader: torch.utils.data.DataLoader,
              accuracy_fn,
              device: torch.device = device):
  """Performs a validation loop step on model going over data_loader."""
  val_loss, val_acc = 0, 0

  # Put the model in eval mode
  model.eval()

  # Turn on inference mode context manager
  with torch.inference_mode():
    for X, y in data_loader:
      # Send the data to the target device
      X, y = X.to(device), y.to(device)

      # 1. Forward pass (outputs raw logits)
      logits = model(X)
      B, T, C = logits.shape

      val_pred = logits.view(B*T, C)
      y = y.view(B*T)

      # 2. Calculuate the loss/acc
      val_loss += loss_fn(val_pred, y)
      val_acc += accuracy_fn(y_true=y,
                              y_pred=val_pred.argmax(dim=-1)) # go from logits -> prediction labels

    # Adjust metrics and print out
    val_loss /= len(data_loader)
    val_acc /= len(data_loader)
    print(f"Validation loss: {val_loss:.5f} | Validation acc: {val_acc:.2f}%\n")

## Train Model

In [19]:
import torch
torch.cuda.empty_cache()

In [None]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.environ["TORCH_USE_CUDA_DSA"] = "1"

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [None]:
from tqdm.auto import tqdm

epochs = 9
for epoch in tqdm(range(epochs)):
    print(f"Epoch: {epoch}\n------")

    train_step(model=model,
              loss_fn=loss_fn,
              optimizer=optimizer,
              data_loader=train_dataloader,
              accuracy_fn=accuracy_fn,
              device=device)
    val_step(model=model,
              data_loader=val_dataloader,
              loss_fn=loss_fn,
              accuracy_fn=accuracy_fn)

  0%|          | 0/9 [00:00<?, ?it/s]

Epoch: 0
------
0/2358 samples...
Batch number 0. Train loss: 12.2595 | Train acc: 6.4575%
400/2358 samples...
Batch number 50. Train loss: 11.6902 | Train acc: 7.9944%
800/2358 samples...
Batch number 100. Train loss: 11.4962 | Train acc: 7.9968%
1200/2358 samples...
Batch number 150. Train loss: 11.3587 | Train acc: 7.9857%
1600/2358 samples...
Batch number 200. Train loss: 11.2605 | Train acc: 8.0313%
2000/2358 samples...
Batch number 250. Train loss: 11.1831 | Train acc: 7.9886%

End of epoch. Train loss: 11.1318 | Train acc: 8.0008%
Validation loss: 8.43278 | Validation acc: 14.33%

Epoch: 1
------
0/2358 samples...
Batch number 0. Train loss: 11.4582 | Train acc: 7.8735%
400/2358 samples...
Batch number 50. Train loss: 11.2236 | Train acc: 8.2843%
800/2358 samples...
Batch number 100. Train loss: 11.0885 | Train acc: 8.2671%
1200/2358 samples...
Batch number 150. Train loss: 10.9897 | Train acc: 8.2412%
1600/2358 samples...
Batch number 200. Train loss: 10.9178 | Train acc: 8.270

## Save Model

In [None]:
from google.colab import drive
import shutil
import os

# Mount Google Drive
drive.mount('/content/drive')

# The destination directory in Google Drive
BASE_DIR = '/content/drive/My Drive/Learning/gpt-2-small'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
torch.save(model.state_dict(), f"{BASE_DIR}/model-tied-10-epochs.pth")

## Load Model and Generate

In [None]:
model = GPTLanguageModel(config).to(device)
model.load_state_dict(torch.load(f"{BASE_DIR}/model-tied-10-epochs.pth"))

<All keys matched successfully>

In [None]:
import tiktoken

encoder = tiktoken.encoding_for_model("gpt2")
context = torch.zeros((1, 1), dtype=torch.long, device=device)

with torch.inference_mode():
  print(encoder.decode(model.generate(context, max_new_tokens=256)[0].tolist()))

! Dou theatre in for thecor in religious thefer fate . 
  Pl , the side of the fountain different from the nude to the 2008 ( 3 to the . Falls . 
  Thes is a image . 
  = 
 
 
  Or Lust , a sidelines , and the three several Mother , andbacks mol a Empire of the the design enlightenment began , the .  = = = 
  natural the salt @-@ 000 the that was the called Blue , but the ranked in the = = = = 
  = 
 
 
  In the Gardner and can be a receives dome , and used , inalions to the Katie " theops and earlyung " , United States , Premier V des of theai , the Trafford , the instance released , the M Bush , vegetation takes , and humans and the wid oneren . 
 
 
  = = = = = 
 
  Theint to be in the people its nation its in the the first 150 he was 10 @-@ rice of the the provides lower @-@ Puerto � is and a off all in a originally was tongue of the South most sponsored , which 11 11 to theures , the largest Cory
