In [1]:
from gpt2 import GPT, GPTConfig # our GPT class
import time
import tiktoken
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
# GPT-2 byte-pair tokenizer provided by tiktoken
enc = tiktoken.get_encoding('gpt2')
# Run on the GPU whenever CUDA is available; otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [3]:
# Fix the RNG seed so repeated runs of the notebook are reproducible
torch.manual_seed(13)
# Also seed every CUDA device; this call is silently ignored when CUDA is
# unavailable, so it needs no availability guard
torch.cuda.manual_seed_all(13)

## Data Loader Lite

In [4]:
class DataLoaderLite:
    """Minimal sequential batch loader over a single tokenized text file.

    Reads ``data/input.txt``, strips newlines, encodes the text with the GPT-2
    tiktoken encoding and keeps all token ids in one 1-D tensor on the
    module-level ``device``. ``next_batch`` walks that tensor in consecutive,
    non-overlapping windows of ``B * T`` tokens and wraps around to the start
    once every complete window has been served.
    """

    def __init__(self, B, T):
        """Load and tokenize the corpus.

        Args:
            B: number of sequences per batch.
            T: number of tokens per sequence.

        Raises:
            ValueError: if the corpus is too short to form a single batch.
        """
        self.B, self.T = B, T

        # Explicit encoding so the result does not depend on the platform locale
        with open('data/input.txt', 'r', encoding='utf-8') as file:
            text = file.read().replace('\n', '')

        enc = tiktoken.get_encoding('gpt2')
        tokens = enc.encode(text)
        self.tokens = torch.tensor(tokens, dtype=torch.long, device=device)

        self.current_batch = 0
        self.number_of_batches = self._count_batches(len(self.tokens), B, T)
        if self.number_of_batches == 0:
            raise ValueError(
                f'Need at least {B * T + 1} tokens for one {B}x{T} batch, '
                f'got {len(self.tokens)}'
            )

        print(f'Loaded {len(self.tokens)} tokens, {self.number_of_batches} batches of size {B}x{T}')

    @staticmethod
    def _count_batches(num_tokens, B, T):
        """Return how many complete (x, y) batches fit in ``num_tokens`` tokens.

        Every batch reads ``B * T + 1`` tokens (the targets are the inputs
        shifted by one position) while consecutive batches start ``B * T``
        apart, so one token beyond the last window must always exist.
        """
        return max(num_tokens - 1, 0) // (B * T)

    def next_batch(self):
        """Return the next ``(x, y)`` pair, each of shape ``(B, T)``.

        ``y`` holds the same tokens as ``x`` shifted one position to the
        right (next-token targets). After the last complete batch the loader
        restarts from the beginning of the token tensor.
        """
        B, T = self.B, self.T

        start = self.current_batch * B * T
        # One extra token so the shifted targets have a full B*T entries
        buf = self.tokens[start : start + B * T + 1]
        x = buf[:-1].view(B, T)
        y = buf[1:].view(B, T)

        self.current_batch += 1
        if self.current_batch >= self.number_of_batches:
            self.current_batch = 0

        return x, y

## Training and Timing

In [5]:
# torch.set_float32_matmul_precision('high') # This won't help with RTX 2070

# The device is chosen dynamically, so every CUDA-only feature below is gated on it
use_cuda = device.type == 'cuda'

# float16 autocast needs gradient scaling; when disabled the scaler is a
# pass-through (scale() returns the loss unchanged, step() calls optimizer.step())
scaler = torch.cuda.amp.GradScaler(enabled=use_cuda) # create a gradient scaler
# CPU autocast uses bfloat16 as its lower-precision dtype
amp_dtype = torch.float16 if use_cuda else torch.bfloat16

model = GPT(GPTConfig(vocab_size=50304)) # make the vocab size a "nice number" (50304 = 2**7 × 3 × 131)
model = model.to(device).train()
model = torch.compile(model)

B, T = 4, 1024
data_loader = DataLoaderLite(B, T)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
for i in range(25):

    # perf_counter is monotonic and high-resolution, which suits interval timing
    t0 = time.perf_counter()
    x, y = data_loader.next_batch()
    optimizer.zero_grad()

    with torch.autocast(device_type=device.type, dtype=amp_dtype):
        logits, loss = model(x, y)

    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()

    if use_cuda:
        torch.cuda.synchronize() # wait for GPU to finish work
    t1 = time.perf_counter()
    dt = (t1 - t0) * 1000 # time difference in milliseconds
    throughput = (B * T) / (t1 - t0) # tokens per second
    print(f"Step {i} | Loss {loss.item():.4f} | {dt:.1f} ms | {throughput:.2f} tok/s")

Loaded 297884 tokens, 72 batches of size 4x1024
Step 0 | Loss 10.9441 | 21268.0 ms | 192.59 tok/s
Step 1 | Loss 9.9717 | 227.2 ms | 18029.13 tok/s
Step 2 | Loss 9.4953 | 227.1 ms | 18036.61 tok/s
Step 3 | Loss 9.2852 | 222.1 ms | 18445.96 tok/s
Step 4 | Loss 8.9665 | 220.7 ms | 18562.16 tok/s
Step 5 | Loss 8.6727 | 221.2 ms | 18516.34 tok/s
Step 6 | Loss 8.4885 | 222.2 ms | 18430.19 tok/s
Step 7 | Loss 8.3961 | 221.4 ms | 18503.44 tok/s
Step 8 | Loss 8.0739 | 228.8 ms | 17903.68 tok/s
Step 9 | Loss 7.9872 | 225.1 ms | 18197.09 tok/s
Step 10 | Loss 8.3108 | 224.9 ms | 18211.13 tok/s
Step 11 | Loss 8.0791 | 226.4 ms | 18094.07 tok/s
Step 12 | Loss 7.8551 | 225.1 ms | 18193.56 tok/s
Step 13 | Loss 7.6549 | 224.8 ms | 18218.95 tok/s
Step 14 | Loss 7.5592 | 225.7 ms | 18144.10 tok/s
Step 15 | Loss 7.3684 | 225.6 ms | 18156.09 tok/s
Step 16 | Loss 7.3472 | 225.8 ms | 18137.72 tok/s
Step 17 | Loss 7.5623 | 226.3 ms | 18098.82 tok/s
Step 18 | Loss 7.5903 | 226.7 ms | 18071.88 tok/s
Step 19 | L

## Observations:
1. Initial throughput around 4000 tok/sec
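1. Enabling reduced-precision tensor-core matmuls via `torch.set_float32_matmul_precision('high')` **did not** increase the throughput. This is because my GPU (RTX 2070, compute capability 7.5) does not support the TF32 / fast `bfloat16` matmul paths, which require compute capability 8.0 or above.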
1. Using mixed-precision via `torch.autocast` did improve the throughput, as expected. The throughput is now 8000 tok/sec (doubled).
1. Unfortunately, my device (RTX 2070) does not have a "device capability" of 8 or above, so I cannot use `bfloat16` efficiently and have to use `float16` with a gradient scaler (to prevent gradient underflow) instead.
1. Using `torch.compile` further improved throughput by about 37% (~11000 tok/sec) - why are my performance improvements different from Andrej who got 2.3x after adding `torch.compile`?
1. Using Flash Attention further improves the throughput by about 23% (~13500 tok/sec)
1. Using "nice numbers" by **increasing** the vocab size further improves performance by about 4% (14100 tok/sec)
1. Since we are using mixed-precision via `torch.autocast`, we can increase the batch size to 4 making the throughput 18000 tok/sec

In [6]:
# Compute capability of the current GPU as (major, minor); evaluates to None
# (no output) on a machine without CUDA instead of raising.
# only 8 and above can use dtype=torch.bfloat16 :(
torch.cuda.get_device_capability() if torch.cuda.is_available() else None

(7, 5)