## Contents
1. Data loader
2. Simple training loop
3. Sampling

In [1]:
from gpt2 import GPT, GPTConfig # our GPT class
import tiktoken
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
# tiktoken's 'gpt2' encoding; used below to tokenise the prompt and decode samples.
enc = tiktoken.get_encoding('gpt2')

# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)

In [3]:
# Seed PyTorch's RNGs before the model is built so that any random weight
# initialisation inside GPT, and the sampling further down, repeat under
# Restart Kernel -> Run All.
SEED = 1337
torch.manual_seed(SEED)

# Pass a config *instance*: handing GPT the GPTConfig class itself only works
# while every field it reads happens to have a class-level default.
# NOTE(review): assumes GPTConfig() is constructible with no arguments —
# confirm against gpt2.py.
model = GPT(GPTConfig()).to(device)

## Data Loader

In [4]:
class DataLoaderLite:
    """
    A simple sequential data loader for a single text file.

    The file is read once, tokenised with tiktoken's 'gpt2' encoding and kept
    as one 1-D ``torch.long`` tensor on ``device``. ``next_batch`` walks over
    that tensor in consecutive chunks of ``B * T`` tokens and wraps back to
    the start after the last full chunk.

    Args:
        B: number of sequences per batch.
        T: number of tokens per sequence.
        device: device on which the token tensor (and so every batch) lives.
        path: text file to load. Defaults to the location that used to be
            hard-coded, so existing ``DataLoaderLite(B, T, device)`` calls
            behave as before.

    Raises:
        ValueError: if ``B`` or ``T`` is not positive, or if the file does not
            contain enough tokens for a single batch.
    """
    def __init__(self, B: int, T: int, device: torch.device, path: str = 'data/input.txt'):
        if B <= 0 or T <= 0:
            raise ValueError(f'B and T must be positive, got B={B}, T={T}')
        # Explicit encoding so the decoded text does not depend on the
        # platform's default locale.
        with open(path, 'r', encoding='utf-8') as file:
            # NOTE(review): stripping newlines fuses the last word of a line
            # with the first word of the next one — confirm this is intended.
            text = file.read().replace('\n', '')
        self.B, self.T = B, T
        enc = tiktoken.get_encoding('gpt2')
        tokens = enc.encode(text)
        self.tokens = torch.tensor(tokens, dtype=torch.long, device=device)
        self.current_batch = 0
        self.number_of_batches = self._num_batches(len(self.tokens), B, T)
        if self.number_of_batches == 0:
            raise ValueError(
                f'{path!r} has {len(self.tokens)} tokens, '
                f'but one {B}x{T} batch needs at least {B * T + 1}'
            )
        print(f'Loaded {len(self.tokens)} tokens, {self.number_of_batches} batches of size {B}x{T}')

    @staticmethod
    def _num_batches(num_tokens: int, B: int, T: int) -> int:
        """
        Count the batches that can be served from ``num_tokens`` tokens.

        Each batch reads ``B * T + 1`` tokens (the extra one supplies the last
        target), while consecutive batches start ``B * T`` tokens apart. One
        token is therefore reserved before dividing; without that, a corpus
        whose length is an exact multiple of ``B * T`` yields a final batch
        that is one token short and cannot be reshaped to ``(B, T)``.
        """
        return max(num_tokens - 1, 0) // (B * T)

    def next_batch(self):
        """
        Return the next ``(x, y)`` pair and advance the read position.

        ``x`` and ``y`` are both ``(B, T)`` views into the token tensor; ``y``
        is ``x`` shifted one token to the right, i.e. the next-token targets.
        After the last full batch the position wraps back to the first batch.
        """
        B, T = self.B, self.T
        start = self.current_batch * B * T
        # One token more than B*T so the targets can be shifted by one.
        buf = self.tokens[start : start + B * T + 1]
        x = buf[:-1].view(B, T)
        y = buf[1:].view(B, T)
        self.current_batch += 1
        if self.current_batch >= self.number_of_batches:
            self.current_batch = 0
        return x, y

## Training

In [5]:
# Batch geometry: B sequences of T tokens each.
B, T = 4, 32
data_loader = DataLoaderLite(B, T, device)

# Knobs for this short demonstration run.
num_steps = 100
learning_rate = 3e-4

model.train()
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
for step in range(num_steps):
    inputs, targets = data_loader.next_batch()
    # Clear the gradients left over from the previous step before the new
    # backward pass.
    optimizer.zero_grad()
    logits, loss = model(inputs, targets)
    loss.backward()
    optimizer.step()
    print(f'Loss at iteration {step}: {loss.item()}')

Loaded 297884 tokens, 2327 batches of size 4x32
Loss at iteration 0: 11.060534477233887
Loss at iteration 1: 10.472539901733398
Loss at iteration 2: 9.853291511535645
Loss at iteration 3: 9.531839370727539
Loss at iteration 4: 9.502036094665527
Loss at iteration 5: 9.306251525878906
Loss at iteration 6: 9.232611656188965
Loss at iteration 7: 8.98526382446289
Loss at iteration 8: 8.579424858093262
Loss at iteration 9: 9.132532119750977
Loss at iteration 10: 8.336834907531738
Loss at iteration 11: 8.353912353515625
Loss at iteration 12: 8.109197616577148
Loss at iteration 13: 8.144638061523438
Loss at iteration 14: 8.191824913024902
Loss at iteration 15: 8.942989349365234
Loss at iteration 16: 7.978180885314941
Loss at iteration 17: 8.345073699951172
Loss at iteration 18: 8.077600479125977
Loss at iteration 19: 7.648392677307129
Loss at iteration 20: 7.580885887145996
Loss at iteration 21: 7.497943878173828
Loss at iteration 22: 7.643979549407959
Loss at iteration 23: 7.986662864685059
... (remaining output truncated)

## Sampling

In [6]:
# Sampling settings: parallel continuations, top-k cutoff, tokens to append.
batches = 3
k = 10
num_generations = 5

prompt = "Hello, my name is"
tokens = enc.encode(prompt)
# Build the prompt tensor directly on the target device, then copy it once
# per continuation: (T) -> (1, T) -> (B, T).
x = torch.tensor(tokens, device=device).unsqueeze(0).repeat(batches, 1)
print('Initial x:', x.shape)

model.eval()
with torch.no_grad():  # inference only, so no gradients need tracking
    for step in range(num_generations):  # append num_generations tokens
        logits, _ = model(x)  # (B, T, Vocab)
        # Only the distribution at the last position decides the next token.
        last_probs = F.softmax(logits[:, -1, :], dim=-1)  # (B, Vocab)
        topk_probs, topk_ids = torch.topk(last_probs, k)  # (B, k), (B, k)
        # Sample a slot among the k candidates, then map it to a token id.
        choice = torch.multinomial(topk_probs, num_samples=1)  # (B, 1)
        picked = topk_ids.gather(dim=1, index=choice)  # (B, 1)
        x = torch.cat((x, picked), dim=-1)  # (B, T+1)
print('Generated x:', x.shape)

Initial x: torch.Size([3, 5])
Generated x: torch.Size([3, 10])


In [7]:
# Decode every sampled row back to text, labelling the rows from 1.
for row in range(batches):
    decoded = enc.decode(x[row].tolist())
    print(f'Batch {row+1}:', decoded)

Batch 1: Hello, my name is. himI:I
Batch 2: Hello, my name isUS:I,,
Batch 3: Hello, my name isUS: and,US
