# Training a Small Language Model from Scratch

## Workflow

In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import random

# === Dataset ===
# A very small example dataset for next-word prediction.
# The fragments are joined with explicit spaces: adjacent string literals are
# concatenated verbatim, which would otherwise fuse the last word of one
# fragment with the first word of the next (e.g. "world" + "to" -> "worldto").
text = " ".join((
    "once upon a time there were three little pigs who went out into the world",
    "to build their own houses the first little pig built his house out of straw",
    "the second little pig built his house out of sticks",
    "the third little pig built his house out of bricks",
))

# === Tokenization ===
# Split on whitespace once and reuse the token list for both the vocabulary
# and the encoded sequence.
tokens = text.split()

# Convert words to integer indices and create a vocabulary.
# Sorting gives every word a stable index: iterating a raw set of strings can
# yield a different order on each interpreter run, which would make the
# word <-> index mappings (and any saved weights) unreproducible.
words = sorted(set(tokens))
word2idx = {w: i for i, w in enumerate(words)}
idx2word = dict(enumerate(words))
vocab_size = len(words)

# === Encoding ===
# Convert text to a sequence of integer IDs.
encoded = [word2idx[w] for w in tokens]

# === Training Data Preparation ===
# Create input-output pairs using a fixed context window.
def get_batches(encoded, context_size=2):
    """Build (context, next-token) training pairs with a sliding window.

    Args:
        encoded: Sequence of integer token IDs.
        context_size: Number of consecutive tokens that form each input.

    Returns:
        A tuple ``(inputs, targets)`` of int64 tensors. ``inputs`` has shape
        ``(num_pairs, context_size)`` and ``targets`` has shape
        ``(num_pairs,)``, where ``num_pairs`` is
        ``max(len(encoded) - context_size, 0)``.
    """
    num_pairs = max(len(encoded) - context_size, 0)
    inputs = [encoded[i:i + context_size] for i in range(num_pairs)]
    targets = [encoded[i + context_size] for i in range(num_pairs)]
    # Fix dtype and shape explicitly: when the sequence is too short to yield
    # any pair, torch.tensor([]) would otherwise come back as a float32 tensor
    # of shape (0,), which cannot be used as embedding indices.
    return (
        torch.tensor(inputs, dtype=torch.long).reshape(num_pairs, context_size),
        torch.tensor(targets, dtype=torch.long),
    )

# Build the full training set from the encoded text. context_size is left at
# get_batches' default of 2, which has to agree with the context_size the
# model is constructed with.
x, y = get_batches(encoded)

# === Model Definition ===
# A tiny feedforward model that uses word embeddings.
class TinyLLM(nn.Module):
    """Feedforward next-word predictor over a fixed-size context window.

    Each context word is embedded, the embeddings are concatenated, and a
    single linear layer maps the result to one logit per vocabulary entry.

    Args:
        vocab_size: Number of distinct tokens (embedding rows and output logits).
        embed_dim: Size of each word embedding.
        context_size: Number of context tokens per example.
    """

    def __init__(self, vocab_size, embed_dim, context_size):
        super().__init__()
        # Word embedding table: one embed_dim-sized vector per vocabulary entry.
        self.embed = nn.Embedding(vocab_size, embed_dim)
        # Maps the concatenated context embeddings to next-word logits.
        self.fc = nn.Linear(embed_dim * context_size, vocab_size)

    def forward(self, x):
        """Return next-word logits for a batch of contexts.

        Args:
            x: Integer tensor of token IDs with shape (batch, context_size).

        Returns:
            Tensor of shape (batch, vocab_size) holding unnormalized logits.
        """
        embedded = self.embed(x)  # shape: (batch, context_size, embed_dim)
        # Concatenate the per-word embeddings of each example. Unlike
        # view(batch, -1), flatten also works for an empty batch (where -1
        # cannot be inferred) and for non-contiguous input.
        flat = embedded.flatten(start_dim=1)
        return self.fc(flat)  # logits for vocab

# === Model Initialization ===
# Build the network, the training criterion, and the optimizer.
context_size = 2
model = TinyLLM(vocab_size, embed_dim=10, context_size=context_size)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)

# === Training Loop ===
# Full-batch training: every epoch runs one forward/backward pass over all
# (context, target) pairs and takes one Adam step on the cross-entropy loss.
for epoch in range(200):
    # Clear gradients left over from the previous step before accumulating.
    optimizer.zero_grad()
    logits = model(x)
    loss = loss_fn(logits, y)
    loss.backward()
    optimizer.step()

# === Predict next word(s) ===
def predict_next_n(context_words, n=3):
    """Greedily extend a word sequence with the model's most likely next words.

    Each step feeds the last ``context_size`` words to the model, appends the
    highest-scoring word, and repeats with the extended sequence.

    Args:
        context_words: Sequence of words to start from. Every word used as
            context must be a key of ``word2idx``.
        n: Number of words to append.

    Returns:
        A new list holding the original words followed by the ``n`` predicted
        words. ``context_words`` itself is not modified.

    Raises:
        ValueError: If a prediction is requested with fewer than
            ``context_size`` starting words.
        KeyError: If a context word is not in ``word2idx``.
    """
    # list() copies the input and also accepts tuples or other sequences.
    result = list(context_words)
    if n > 0 and len(result) < context_size:
        # Fail early with a clear message instead of a shape mismatch inside
        # the model's linear layer.
        raise ValueError(
            f"need at least {context_size} context words, got {len(result)}"
        )
    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        for _ in range(n):
            window = result[-context_size:]
            context_idx = torch.tensor([[word2idx[w] for w in window]])
            logits = model(context_idx)
            predicted_idx = torch.argmax(logits, dim=1).item()
            result.append(idx2word[predicted_idx])
    return result


## Example: Predicting the Next Word

In [17]:
# Test the model with an example input: starting from the context
# "little pig", append the single most likely next word (n=1).
print(predict_next_n(["little", "pig"], n=1))

['little', 'pig', 'built']


## Example: Predicting Multiple Next Words

In [19]:
# Test the model with an example input for multiple predictions: starting
# from "little pig", append five words, each predicted from the last
# context_size words of the sequence built so far.
print(predict_next_n(["little", "pig"], n=5))

['little', 'pig', 'built', 'his', 'house', 'out', 'of']


### Jupyter notebook --footer info-- (please always provide this at the end of each notebook)

In [None]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

# Print a short environment report (OS family, platform, timestamp and
# Python version) framed by separator lines.
separator = '-----------------------------------'
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
report_lines = [
    separator,
    os.name.upper(),
    f"{platform.system()} | {platform.release()}",
    f"Datetime: {timestamp}",
    f"Python Version: {python_version()}",
    separator,
]
for line in report_lines:
    print(line)

-----------------------------------
NT
Windows | 10
Datetime: 2025-04-24 09:15:38
Python Version: 3.11.11
-----------------------------------
