In [1]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F

os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Load a text file (any book / text)
with open("input.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Character vocabulary
chars = sorted(list(set(text)))
vocab_size = len(chars)

# Character â†” integer mapping
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for ch, i in stoi.items()}

# Encode entire dataset as integers
data = torch.tensor([stoi[c] for c in text], dtype=torch.long)

In [2]:
cpu_only=False
device = "cpu" if cpu_only else "cuda" if torch.cuda.is_available() else "cpu"
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False
torch.set_default_dtype(torch.float32)
print("device:",device)

device: cuda


In [3]:
# Example usage of view and transpose
test=torch.arange(6,device=device)
print(test)
test = test.view(2,3)
print(test)
test = test.transpose(0,1)
print(test)

tensor([0, 1, 2, 3, 4, 5], device='cuda:0')
tensor([[0, 1, 2],
        [3, 4, 5]], device='cuda:0')
tensor([[0, 3],
        [1, 4],
        [2, 5]], device='cuda:0')


In [4]:
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

In [5]:
block_size = 64   # context length
batch_size = 32

def get_batch(split):
    data_src = train_data if split == "train" else val_data
    ix = torch.randint(len(data_src) - block_size -1, (batch_size,))

    # Input tokens
    x = torch.stack([data_src[i:i+block_size] for i in ix])
    # Target = next character
    y = torch.stack([data_src[i+1:i+block_size+1] for i in ix])

    return x.to(device), y.to(device)

In [6]:
get_batch("train")

(tensor([[ 1, 21,  1,  ..., 43, 52, 56],
         [ 1, 39,  0,  ..., 56, 52, 39],
         [46, 47, 51,  ...,  1, 57, 54],
         ...,
         [ 0, 16, 27,  ..., 57,  1, 56],
         [58,  1, 31,  ..., 56, 43,  5],
         [59, 41, 46,  ..., 58, 46, 43]], device='cuda:0'),
 tensor([[21,  1, 40,  ..., 52, 56, 63],
         [39,  0, 42,  ..., 52, 39, 56],
         [47, 51,  8,  ..., 57, 54, 43],
         ...,
         [16, 27, 30,  ...,  1, 56, 43],
         [ 1, 31, 43,  ..., 43,  5, 57],
         [41, 46,  1,  ..., 46, 43,  1]], device='cuda:0'))

In [7]:
class CharEmbedding(nn.Module):
    def __init__(self, vocab_size, d_model, block_size):
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, d_model)
        self.pos_emb = nn.Embedding(block_size, d_model)

    def forward(self, x):
        B, T = x.shape
        tok = self.token_emb(x)              # (B, T, d_model)
        pos = self.pos_emb(torch.arange(T, device=device))  # (T, d_model)
        return tok + pos


In [8]:
class MultiHeadSelfAttention(nn.Module):
    def __init__(self, d_model, n_heads, block_size):
        super().__init__()
        assert d_model % n_heads == 0

        self.n_heads = n_heads
        self.d_head = d_model // n_heads

        self.key   = nn.Linear(d_model, d_model, bias=False)
        self.query = nn.Linear(d_model, d_model, bias=False)
        self.value = nn.Linear(d_model, d_model, bias=False)

        self.proj = nn.Linear(d_model, d_model)

        self.register_buffer(
            "mask",
            torch.tril(torch.ones(block_size, block_size)).bool()
        )

    def forward(self, x):
        B, T, C = x.shape

        # Project once
        K = self.key(x)    # (B, T, C)
        Q = self.query(x)
        V = self.value(x)

        # Split into heads
        K = K.view(B, T, self.n_heads, self.d_head).transpose(1, 2)
        Q = Q.view(B, T, self.n_heads, self.d_head).transpose(1, 2)
        V = V.view(B, T, self.n_heads, self.d_head).transpose(1, 2)
        # Shapes: (B, n_heads, T, d_head)

        # Attention scores
        att = (Q @ K.transpose(-2, -1)) / (self.d_head ** 0.5)
        # (B, n_heads, T, T)

        att = att.masked_fill(~self.mask[:T, :T], float('-inf'))
        att = F.softmax(att, dim=-1)

        # Weighted sum
        out = att @ V  # (B, n_heads, T, d_head)

        # Recombine heads
        out = out.transpose(1, 2).contiguous().view(B, T, C)

        return self.proj(out)

In [9]:
class TransformerBlock(nn.Module):
    def __init__(self, d_model, block_size, head_n):
        super().__init__()
        self.ln1 = nn.LayerNorm(d_model)
        self.attn = MultiHeadSelfAttention(d_model, head_n, block_size)
        self.ln2 = nn.LayerNorm(d_model)

        self.ff = nn.Sequential(
            nn.Linear(d_model, 4 * d_model),
            nn.ReLU(),
            nn.Linear(4 * d_model, d_model)
        )

    def forward(self, x):
        # Attention with residual
        x = x + self.attn(self.ln1(x))
        # Feed-forward with residual
        x = x + self.ff(self.ln2(x))
        return x

In [10]:
class CharLM(nn.Module):
    def __init__(self, vocab_size, d_model, block_size,head_n):
        super().__init__()
        self.embed = CharEmbedding(vocab_size, d_model, block_size)
        self.block = TransformerBlock(d_model,block_size, head_n)
        self.ln = nn.LayerNorm(d_model)
        self.head = nn.Linear(d_model, vocab_size)

    def forward(self, x, targets=None):
        x = self.embed(x)        # (B, T, d_model)
        x = self.block(x)
        x = self.ln(x)
        logits = self.head(x)   # (B, T, vocab_size)

        if targets is None:
            return logits

        B, T, V = logits.shape
        probs=logits.view(B*T, V) # probabilities: B * T, V
        ids = targets.view(B*T) # ids: B * T
        loss = F.cross_entropy(
            probs,
            ids
        )
        return logits, loss

In [11]:
model = CharLM(vocab_size, d_model=128, block_size=block_size, head_n=4).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)

for step in range(5000):
    xb, yb = get_batch("train")

    logits, loss = model(xb, yb)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if step % 500 == 0:
        print(f"step {step}, loss {loss.item():.4f}")


step 0, loss 4.3161
step 500, loss 2.3503
step 1000, loss 2.1962
step 1500, loss 2.1066
step 2000, loss 2.1802
step 2500, loss 2.3002
step 3000, loss nan
step 3500, loss nan
step 4000, loss nan
step 4500, loss nan


RuntimeError: CUDA error: CUBLAS_STATUS_EXECUTION_FAILED when calling cublasLtMatmul with transpose_mat1 1 transpose_mat2 0 m 65 n 2048 k 128 mat1_ld 128 mat2_ld 128 result_ld 65 abcType 0 computeType 68 scaleType 0

In [None]:
@torch.no_grad()
def generate(model, start, max_new_tokens=200):
    model.eval()
    idx = torch.tensor([[stoi[c] for c in start]], device=device)

    for _ in range(max_new_tokens):
        idx_cond = idx[:, -block_size:]
        logits = model(idx_cond)
        logits = logits[:, -1, :]
        probs = F.softmax(logits, dim=-1)
        next_id = torch.multinomial(probs, num_samples=1)
        idx = torch.cat([idx, next_id], dim=1)

    return "".join(itos[i.item()] for i in idx[0])


In [None]:
print(generate(model, "CORIOLANUS:"))