In [16]:
import numpy as np
import pandas as pd
import tiktoken
enc = tiktoken.get_encoding("gpt2")
assert enc.decode(enc.encode("hello world")) == "hello world"

In [22]:
# Funtion to load the data from the ipnput.txt file into a string
def load_data():
    with open("input.txt", "r", encoding='utf-8') as f:
        data = f.read()
    return data

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens, encoding.n_vocab

text=load_data()
num_tokens_from_string(text, "gpt2")


(2692674, 50257)

In [26]:
def encode(text:str):
    return enc.encode(text)

def decode(tokens:np.array):
    return enc.decode(tokens)

vocab_size = enc.n_vocab

initial_sequence = '[Cartman] Carrots are good for eyesight, but so are other vegetables'
intial_tokens = encode(initial_sequence)
intial_tokens
len(intial_tokens)

17

# Crappy GPT implementaiton

In [27]:
# Imports
import torch
import torch.nn as nn
from torch.nn import functional as F
import time

# Hyperparameters
batch_size = 64
block_size = 16
max_iters = 300
eval_interval = 10
learning_rate = 8e-4
device='cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using {device} device")
eval_iters = 10
n_embed = 192
num_head = 6
n_layer = 6
dropout = 0.2
torch.manual_seed(69) # nice

Using cuda device


<torch._C.Generator at 0x2b300fda210>

In [28]:
# Tokenization and splitting of the data
data = torch.tensor(encode(text), dtype=torch.long) # Encoding of the entire text, sotring it in a torch tensor
n = int(0.9*len(data)) # Number of characters to use for training
train_data = data[:n]
val_data = data[n:]

# Set Deffinitions for training

In [29]:
def get_batch(split):
    # Generate a batch of data from input x and target y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,)) # Starting index of each sequence
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses[k].mean()
    model.train()
    return out

class Head(nn.Module):
    # This is one head of self-attention

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embed, head_size, bias=False)
        self.query = nn.Linear(n_embed, head_size, bias=False)
        self.value = nn.Linear(n_embed, head_size,  bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)     # (B,T,C)
        q = self.query(x) # (B,T,C)

        # Compute attention scores ("affinities")
        wei = q @ k.transpose(-1,-2) * C**-0.5 # (B,T,T)
        wei = wei.masked_fill(self.tril[:T,:T] == 0, float('-inf')) # Mask out the upper triangular part
        wei = F.softmax(wei, dim=-1) # (B,T,T)
        wei = self.dropout(wei)
        
        # perform the weighted aggrefation of the values
        v  = self.value(x)
        out = wei @ v
        return out

class MultiHeadAttention(nn.Module):

    def __init__(self, num_head, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_head)])
        self.proj = nn.Linear(n_embed, n_embed)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

# Feed forward
class FeedForward(nn.Module):
    def __init__(self, n_embed):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embed, 4 * n_embed),
            nn.GELU(),
            nn.Linear(4 * n_embed, n_embed),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):

    def __init__(self, n_embed, num_head):
        super().__init__()
        head_size = n_embed // num_head
        self.sa = MultiHeadAttention(num_head, head_size)
        self.ffwd = FeedForward(n_embed)
        self.ln1 = nn.LayerNorm(n_embed)
        self.ln2 = nn.LayerNorm(n_embed)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

# Bigram module
class BigramLanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        # Each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
        self.position_embedding_table = nn.Embedding(block_size, n_embed)
        self.blocks = nn.Sequential(*[Block(n_embed, num_head=num_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embed)  # Final layer norm
        self.lm_head = nn.Linear(n_embed, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # Idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C) apply all blocks of heads
        x = self.ln_f(x)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            return logits, None
        else:
            # Reshape BCT for Pythorch's cross entropy loss
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(-1)
            # Compute the loss entropy
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        # idx is a(B,T) array of indices in the current context
        for _ in range(max_new_tokens):
            # Crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:] # (B,T)
            # Get the predictions
            logits, loss = self(idx_cond)
            # Focus only on the last time step
            logits = logits[:, -1, :]
            # Apply softmax
            probs = F.softmax(logits, dim=-1) # (B,C)
            # Sample from the distribution
            next_token = torch.multinomial(probs, num_samples=1) # (B,1)
            # Append to the context
            idx = torch.cat([idx, next_token], dim=1) # (B,T+1)
        return idx

model = BigramLanguageModel()
m = model.to(device)

In [30]:
# Print the number of parameters
print(f"Number of parameters in millons: {sum(p.numel() for p in model.parameters())/1e6}")


Number of parameters in millons: 22.018129


In [None]:
# Create a pytorch optimizer
print("Creating optimizer")
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Training loop
print("Starting training")
start_time = time.time()
for iter in range(max_iters):

    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}, train loss {losses['train']:4f}, val loss {losses['val']:4f}, mean time per step {(time.time() - start_time)/(iter+1):.2f}s")

    # Sample a batch of data
    xb, yb = get_batch('train')

    # Evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# Print the time taken
print(f"Training time: {time.time() - start_time:.2f}s")

# Generate from the model
# context = torch.zeros((1,1), dtype=torch.long, device=device)
context = torch.tensor(intial_tokens, dtype=torch.long, device=device).reshape(-1,1)

# Print all intial hyperparameters
print(f"Batch size: {batch_size}")
print(f"Block size: {block_size}")
print(f"Max iterations: {max_iters}")
print(f"Evaluation interval: {eval_interval}")
print(f"Learning rate: {learning_rate}")
print(f"Device: {device}")
print(f"Number of embeddings: {n_embed}")
print(f"Number of heads: {num_head}")
print(f"Number of layers: {n_layer}")
print(f"Dropout: {dropout}")
print(f"Number of parameters in millons: {sum(p.numel() for p in model.parameters())/1e6}")
print("Intial sequence: ", initial_sequence)

print(decode(model.generate(context, max_new_tokens=500)[0].tolist()))

Creating optimizer
Starting training
step 0, train loss 10.998212, val loss 10.993601, mean time per step 1.09s
step 10, train loss 8.638666, val loss 8.492457, mean time per step 0.98s
step 20, train loss 6.746989, val loss 7.020180, mean time per step 0.96s
step 30, train loss 6.519319, val loss 6.502742, mean time per step 0.96s
step 40, train loss 6.149874, val loss 6.224807, mean time per step 0.95s
step 50, train loss 6.233770, val loss 6.216971, mean time per step 0.95s
step 60, train loss 6.130749, val loss 5.892414, mean time per step 0.96s
step 70, train loss 5.838697, val loss 5.863874, mean time per step 0.96s
step 80, train loss 5.457157, val loss 5.777126, mean time per step 0.96s
