In [2]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F

# Read the whole training corpus (any plain-text book works) into memory
with open("../input.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Vocabulary: every distinct character of the corpus, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)

# Character <-> integer lookup tables
stoi = dict(zip(chars, range(vocab_size)))
itos = dict(enumerate(chars))

# The full corpus as a single 1-D tensor of character ids
data = torch.tensor([stoi[c] for c in text], dtype=torch.long)

In [3]:
# CPU is forced by default; set `cpu_only` to False to use CUDA when it is available
cpu_only = True
if cpu_only or not torch.cuda.is_available():
    device = "cpu"
else:
    device = "cuda"
print("device:", device)

device: cpu


In [4]:
# First 90% of the corpus is used for training, the remaining tail for validation
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [5]:
block_size = 64   # context length: characters per training window
batch_size = 32   # windows per batch

def get_batch(split):
    """Sample a random batch of (input, target) windows from one data split.

    Args:
        split: "train" selects `train_data`; any other value selects `val_data`.

    Returns:
        Tuple `(x, y)` of tensors of shape (batch_size, block_size), both moved
        to `device`. `y` holds the same windows as `x` shifted one position to
        the right, i.e. the next character for every input position.
    """
    if split == "train":
        source = train_data
    else:
        source = val_data

    # Random window starts; the upper bound leaves room for the shifted target
    starts = torch.randint(len(source) - block_size, (batch_size,))

    inputs, targets = [], []
    for s in starts:
        inputs.append(source[s:s + block_size])
        targets.append(source[s + 1:s + block_size + 1])

    return torch.stack(inputs).to(device), torch.stack(targets).to(device)

In [6]:
# Smoke test: draw one training batch; the cell displays the (inputs, targets) pair
get_batch("train")

(tensor([[46, 47, 51,  ..., 47, 58, 63],
         [58, 46, 47,  ..., 41, 58,  1],
         [47, 59, 57,  ..., 39, 51, 43],
         ...,
         [50, 47, 41,  ..., 49,  1, 53],
         [46, 56, 53,  ..., 43,  1, 61],
         [46, 47, 57,  ..., 14, 43, 47]]),
 tensor([[47, 51,  1,  ..., 58, 63,  1],
         [46, 47, 52,  ..., 58,  1, 58],
         [59, 57,  1,  ..., 51, 43,  6],
         ...,
         [47, 41, 43,  ...,  1, 53, 52],
         [56, 53, 61,  ...,  1, 61, 47],
         [47, 57,  1,  ..., 43, 47, 52]]))

In [7]:
class CharEmbedding(nn.Module):
    """Token embedding plus a learned absolute position embedding.

    Args:
        vocab_size: number of distinct token ids.
        d_model: width of each embedding vector.
        block_size: number of position slots, i.e. the longest sequence that
            can be embedded.
    """

    def __init__(self, vocab_size, d_model, block_size):
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, d_model)
        self.pos_emb = nn.Embedding(block_size, d_model)

    def forward(self, x):
        """Embed a batch of token ids.

        Args:
            x: integer tensor of shape (B, T) with T <= block_size.

        Returns:
            Tensor of shape (B, T, d_model): token embedding plus the position
            embedding, the latter broadcast over the batch dimension.

        Raises:
            IndexError: if T is larger than the number of position slots.
        """
        B, T = x.shape
        max_len = self.pos_emb.num_embeddings
        if T > max_len:
            # Fail early with a readable message instead of an opaque lookup
            # error (or a device-side assert on CUDA).
            raise IndexError(
                f"sequence length {T} exceeds the {max_len} available positions"
            )

        tok = self.token_emb(x)                        # (B, T, d_model)
        # Build the indices on the input's own device so the module works
        # wherever it is moved, independent of any notebook-level global.
        positions = torch.arange(T, device=x.device)
        pos = self.pos_emb(positions)                  # (T, d_model)
        return tok + pos


In [8]:
# Walk through the embedding step by hand on one real batch
x, y = get_batch("train")
d_model = 128

# Stand-alone lookup tables: one row per token id, one row per position
token_emb = nn.Embedding(vocab_size, d_model).to(device)
pos_emb = nn.Embedding(block_size, d_model).to(device)

B, T = x.shape
tok = token_emb(x)                               # (B, T, d_model)
pos = pos_emb(torch.arange(T, device=device))    # (T, d_model)
emb = tok + pos                                  # positions broadcast over the batch

# Shapes of the inputs, the targets and every intermediate tensor
print(*(t.shape for t in (x, y, tok, pos, emb)))

torch.Size([32, 64]) torch.Size([32, 64]) torch.Size([32, 64, 128]) torch.Size([64, 128]) torch.Size([32, 64, 128])


In [9]:
class SelfAttention(nn.Module):
    """Single-head causal self-attention: enrich each token with preceding context.

    Args:
        d_model: width of the input and output token vectors.
        max_len: longest sequence the causal mask covers. When omitted, the
            notebook-level `block_size` is used, which matches the behavior of
            the one-argument constructor.
    """
    def __init__(self, d_model, max_len=None):
        super().__init__()
        if max_len is None:
            # Backward-compatible default: size the mask from the global setting.
            max_len = block_size

        self.key   = nn.Linear(d_model, d_model, bias=False)
        self.query = nn.Linear(d_model, d_model, bias=False)
        self.value = nn.Linear(d_model, d_model, bias=False)

        # Causal mask (prevents looking ahead): lower-triangular ones.
        # Registered as a buffer so it follows the module across devices.
        self.register_buffer(
            "mask",
            torch.tril(torch.ones(max_len, max_len))
        )

    def forward(self, x):
        """Apply causal self-attention.

        Args:
            x: tensor of shape (B, T, C) with T no larger than the mask size.

        Returns:
            Tensor of shape (B, T, C); position t is a weighted sum of the value
            vectors of positions 0..t.

        Raises:
            RuntimeError: if T exceeds the length covered by the causal mask.
        """
        B, T, C = x.shape
        limit = self.mask.size(0)
        if T > limit:
            # Without this check the failure is an opaque broadcasting error.
            raise RuntimeError(
                f"sequence length {T} exceeds the causal mask size {limit}"
            )

        K = self.key(x)    # (B, T, C)
        Q = self.query(x)  # (B, T, C)
        V = self.value(x)  # (B, T, C)

        # Attention scores: pairwise dot products of queries and keys, divided
        # by sqrt(C) because the dot product grows with the dimension count.
        att = (Q @ K.transpose(-2, -1)) / (C ** 0.5)  # (B, T, T)

        # Causal masking: scores for future positions become -inf, so they
        # receive exactly zero weight after the softmax.
        att = att.masked_fill(self.mask[:T, :T] == 0, float('-inf'))

        # Normalize each row into attention weights
        att = F.softmax(att, dim=-1)

        # Weighted sum of values
        out = att @ V  # (B, T, C)
        return out

In [10]:
class TransformerBlock(nn.Module):
    """One transformer layer: self-attention followed by a position-wise MLP.

    Each sub-layer is preceded by its own LayerNorm and wrapped in a residual
    connection.

    Args:
        d_model: width of the token vectors flowing through the block.
    """
    def __init__(self, d_model):
        super().__init__()
        hidden = 4 * d_model  # the MLP expands to four times the model width

        self.ln1 = nn.LayerNorm(d_model)
        self.attn = SelfAttention(d_model)
        self.ln2 = nn.LayerNorm(d_model)
        self.ff = nn.Sequential(
            nn.Linear(d_model, hidden),
            nn.ReLU(),
            nn.Linear(hidden, d_model),
        )

    def forward(self, x):
        """Return `x` after the attention and feed-forward residual updates."""
        # Residual branch 1: attention over the normalized input
        attended = self.attn(self.ln1(x))
        x = x + attended

        # Residual branch 2: feed-forward over the normalized result
        transformed = self.ff(self.ln2(x))
        return x + transformed

In [11]:
class CharLM(nn.Module):
    """Character-level language model.

    Pipeline: embeddings -> one transformer block -> LayerNorm -> linear head
    producing one logit per vocabulary entry.

    Args:
        vocab_size: number of distinct characters.
        d_model: width of the token vectors.
        block_size: maximum sequence length passed to the embedding layer.
    """
    def __init__(self, vocab_size, d_model, block_size):
        super().__init__()
        self.embed = CharEmbedding(vocab_size, d_model, block_size)
        self.block = TransformerBlock(d_model)
        self.ln = nn.LayerNorm(d_model)
        self.head = nn.Linear(d_model, vocab_size)

    def forward(self, x, targets=None):
        """Compute next-character logits and, optionally, the training loss.

        Args:
            x: token ids of shape (B, T).
            targets: optional token ids of shape (B, T) to score against.

        Returns:
            `logits` of shape (B, T, vocab_size) when `targets` is None,
            otherwise the tuple `(logits, loss)` where `loss` is the
            cross-entropy over all B*T positions.
        """
        hidden = self.ln(self.block(self.embed(x)))   # (B, T, d_model)
        logits = self.head(hidden)                     # (B, T, vocab_size)

        if targets is not None:
            B, T, V = logits.shape
            n_tokens = B * T
            # cross_entropy takes raw logits (n_tokens, V) and class ids (n_tokens,)
            loss = F.cross_entropy(
                logits.view(n_tokens, V),
                targets.view(n_tokens),
            )
            return logits, loss

        return logits

In [12]:
# Fresh model and optimizer
model = CharLM(vocab_size, d_model=128, block_size=block_size).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)

# Plain training loop: one random batch per step, loss logged every 500 steps
for step in range(5000):
    xb, yb = get_batch("train")

    # Clear stale gradients, then forward / backward / parameter update
    optimizer.zero_grad()
    logits, loss = model(xb, yb)
    loss.backward()
    optimizer.step()

    if not step % 500:
        print(f"step {step}, loss {loss.item():.4f}")


step 0, loss 4.3312
step 500, loss 2.3592
step 1000, loss 2.1433
step 1500, loss 2.0800
step 2000, loss 2.0470
step 2500, loss 1.9979
step 3000, loss 1.9394
step 3500, loss 1.8667
step 4000, loss 1.8545
step 4500, loss 1.8452


In [13]:
@torch.no_grad()
def generate(model, start, max_new_tokens=200):
    """Sample a continuation of `start` from the model, one character at a time.

    Args:
        model: an `nn.Module` that maps a (1, T) tensor of character ids to
            logits of shape (1, T, vocab) when called without targets.
        start: non-empty prompt; every character must be a key of `stoi`.
        max_new_tokens: number of characters to sample and append.

    Returns:
        The prompt followed by the sampled characters, as a single string.

    Raises:
        ValueError: if `start` is empty (there is no position to predict from).
        KeyError: if `start` contains a character that is not in `stoi`.
    """
    if not start:
        raise ValueError("start must contain at least one character")

    # Place the prompt on the model's own device; fall back to the global
    # `device` only for a model that has no parameters.
    first_param = next(model.parameters(), None)
    target_device = device if first_param is None else first_param.device
    idx = torch.tensor(
        [[stoi[c] for c in start]], dtype=torch.long, device=target_device
    )

    # Sample in eval mode, but hand the model back in the mode it arrived in
    # so that a later training step is not silently run in eval mode.
    was_training = model.training
    model.eval()
    try:
        for _ in range(max_new_tokens):
            # Only the most recent `block_size` characters fit in the context
            idx_cond = idx[:, -block_size:]
            logits = model(idx_cond)[:, -1, :]   # logits for the next character
            probs = F.softmax(logits, dim=-1)
            next_id = torch.multinomial(probs, num_samples=1)
            idx = torch.cat([idx, next_id], dim=1)
    finally:
        model.train(was_training)

    return "".join(itos[i] for i in idx[0].tolist())


In [14]:
# Sample a continuation of the prompt (200 new characters by default) and print it
print(generate(model, "CORIOLANUS:"))

CORIOLANUS:
To you to head inter, if truesion hou this thyse with,
I's obense! wefight for resself?

AULEZEWICK:
Which son that, grom and fored never inted,
It why com to rumbouth, any:
The fork in Rome? Gody, t
