In [1]:
# No additional installs needed for this minimal version
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import random

In [2]:
# Character-level data and tokenization
text = "hello world! this is a simple gpt-style transformer example to demonstrate training."
chars = sorted(list(set(text)))
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for ch, i in stoi.items()}
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

vocab_size = len(chars)
data = torch.tensor(encode(text), dtype=torch.long)

# Train/Val split
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

def get_batch(split, batch_size=4, block_size=16):
    d = train_data if split == 'train' else val_data
    ix = torch.randint(len(d) - block_size, (batch_size,))
    x = torch.stack([d[i:i+block_size] for i in ix])
    y = torch.stack([d[i+1:i+block_size+1] for i in ix])
    return x, y


In [3]:
class CausalSelfAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0
        self.d_k = d_model // num_heads
        self.num_heads = num_heads
        self.qkv = nn.Linear(d_model, d_model * 3)
        self.proj = nn.Linear(d_model, d_model)
        self.scale = math.sqrt(self.d_k)

    def forward(self, x):
        B, T, C = x.shape
        qkv = self.qkv(x).reshape(B, T, self.num_heads, 3 * self.d_k).permute(0, 2, 1, 3)
        q, k, v = qkv.chunk(3, dim=-1)
        attn_weights = (q @ k.transpose(-2, -1)) / self.scale
        causal_mask = torch.tril(torch.ones(T, T, device=x.device)).unsqueeze(0).unsqueeze(0)
        attn_weights = attn_weights.masked_fill(causal_mask == 0, float("-inf"))
        attn = F.softmax(attn_weights, dim=-1)
        out = attn @ v        
        out = out.transpose(1, 2).contiguous().reshape(B, T, C)        
        return self.proj(out)

class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Linear(d_ff, d_model),
            nn.Dropout(dropout)
        )
    def forward(self, x):
        return self.net(x)

class TransformerBlock(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.attn = CausalSelfAttention(d_model, num_heads)
        self.ff = FeedForward(d_model, d_ff, dropout)
        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)
    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        x = x + self.ff(self.ln2(x))
        return x

class GPTMini(nn.Module):
    def __init__(self, vocab_size, d_model=128, n_layers=4, n_heads=4, d_ff=512, max_len=256):
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, d_model)
        self.pos_emb = nn.Parameter(torch.zeros(1, max_len, d_model))
        self.blocks = nn.Sequential(*[TransformerBlock(d_model, n_heads,    ) for _ in range(n_layers)])
        self.ln_f = nn.LayerNorm(d_model)
        self.head = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        B, T = x.shape
        x = self.token_emb(x) + self.pos_emb[:, :T, :]
        x = self.blocks(x)
        x = self.ln_f(x)
        return self.head(x)


In [5]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model = GPTMini(vocab_size).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

batch_size = 16
block_size = 32
epochs = 100

for epoch in range(epochs):
    model.train()
    xb, yb = get_batch('train', batch_size, block_size)
    xb, yb = xb.to(device), yb.to(device)

    logits = model(xb)
    loss = criterion(logits.view(-1, vocab_size), yb.view(-1))

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if epoch % 10 == 0 or epoch == epochs - 1:
        print(f"Epoch {epoch:03d} | Loss: {loss.item():.4f}")


TypeError: TransformerBlock.__init__() missing 1 required positional argument: 'd_ff'

In [None]:
def generate(model, start_text, max_new_tokens=50):
    model.eval()
    context = torch.tensor([encode(start_text)], dtype=torch.long).to(device)
    for _ in range(max_new_tokens):
        if context.size(1) > 256:
            context = context[:, -256:]
        logits = model(context)
        probs = F.softmax(logits[:, -1, :], dim=-1)
        next_token = torch.multinomial(probs, num_samples=1)
        context = torch.cat([context, next_token], dim=1)
    return decode(context[0].tolist())


=== Sample ===
hello world! this is a simple gpt-style tratra e e e


In [None]:
print("=== Sample ===")
print(generate(model, "he"))

=== Sample ===
helo world! this is a simple gpt-style tratratworatr
