# WikiText-2 Transformer Scaling Experiments (PyTorch)

This notebook trains decoder-only Transformers on WikiText-2 and compares
performance as a function of several independent variables:

1. **Depth** -- varying number of transformer blocks (fixed width)
2. **Width** -- varying d_model (fixed depth)
3. **Batch Size** -- varying training batch size
4. **Dropout** -- varying dropout rate
5. **L2 Regularization** -- varying weight decay
6. **L1 Regularization** -- varying L1 penalty strength
7. **Elastic Net (Ratio)** -- varying L1/L2 ratio at fixed total strength
8. **Elastic Net (Strength)** -- varying total strength at fixed ratio

For each experiment we produce loss curve plots comparing the settings.

The notebook is self-contained: no external project files are needed (the dataset itself is downloaded with `kagglehub`).

## 1. Setup and Dependencies


In [None]:
# Install dependencies
# The PyTorch packages are fetched from the cu121 wheel index passed via --index-url.
!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
# Plotting, numerics, and the Kaggle download client imported by later cells.
!pip install -q matplotlib numpy kagglehub

print('Dependencies installed')


In [None]:
# Optional: mount Google Drive for persistent outputs
# Toggle: leave False to skip mounting entirely.
MOUNT_DRIVE = False

if MOUNT_DRIVE:
    # Imported only when mounting is requested, so the cell does not need
    # google.colab otherwise.
    from google.colab import drive
    drive.mount('/content/drive')
    print('Google Drive mounted')
else:
    print('Drive mount skipped (set MOUNT_DRIVE=True to enable)')


## 2. Download WikiText-2


In [None]:
import os
import kagglehub

# Download the dataset; the returned path is treated as a directory below.
WIKITEXT2_DIR = kagglehub.dataset_download('vivekmettu/wikitext2-data')

# The three split files every later cell expects to find side by side.
required = {'wiki.train.tokens', 'wiki.valid.tokens', 'wiki.test.tokens'}
if not required.issubset(set(os.listdir(WIKITEXT2_DIR))):
    # Some Kaggle datasets place files in a nested subfolder.
    # Walk the tree and take the first directory that holds all three files.
    found = None
    for root, _, files in os.walk(WIKITEXT2_DIR):
        if required.issubset(set(files)):
            found = root
            break
    if found is None:
        raise FileNotFoundError('Could not find wiki.train/valid/test.tokens in downloaded dataset')
    WIKITEXT2_DIR = found

# Report the resolved location and the size of each split file in MB.
print('Path to dataset files:', WIKITEXT2_DIR)
for fname in sorted(required):
    path = os.path.join(WIKITEXT2_DIR, fname)
    size_mb = os.path.getsize(path) / 1024 / 1024
    print(f'  - {fname} ({size_mb:.2f} MB)')


In [None]:
import torch

# Report the installed PyTorch version and whether a CUDA device is visible.
print(f'PyTorch version: {torch.__version__}')
print(f'CUDA available: {torch.cuda.is_available()}')
if torch.cuda.is_available():
    # Only query the device name when CUDA is actually available.
    print(f'GPU: {torch.cuda.get_device_name(0)}')


## 3. Imports and Configuration


In [None]:
import os
import json
import time
import math
from collections import Counter

import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader


# ---------------------------------------------------------------------------
# Decoder-only Transformer (GPT-style) -- inlined for self-contained notebook
# ---------------------------------------------------------------------------

def create_positional_encoding(max_len, d_model):
    """Build a fixed sinusoidal positional-encoding table.

    Even columns hold ``sin(position * freq)`` and odd columns hold
    ``cos(position * freq)``, with frequencies
    ``exp(-log(10000) * 2i / d_model)``.

    Args:
        max_len: number of positions (rows) in the table.
        d_model: embedding width (columns); may be even or odd.

    Returns:
        Float tensor of shape ``(max_len, d_model)``.
    """
    pe = torch.zeros(max_len, d_model)
    position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
    div_term = torch.exp(
        torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
    )
    angles = position * div_term
    pe[:, 0::2] = torch.sin(angles)
    # An odd d_model has one fewer odd column than even columns, so the cosine
    # half must drop the last frequency; for even d_model the slice is a no-op.
    pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
    return pe


def create_causal_mask(seq_len, device):
    """Return an additive ``(seq_len, seq_len)`` causal attention mask.

    Entries strictly above the main diagonal are ``-inf``; all other entries
    are ``0``. The tensor is created on ``device``.
    """
    blocked = torch.full((seq_len, seq_len), float('-inf'), device=device)
    # triu(diagonal=1) keeps the strict upper triangle and zeroes the rest.
    return torch.triu(blocked, diagonal=1)


class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    The input is projected to queries, keys and values, split into
    ``n_heads`` heads of width ``d_model // n_heads``, attended per head,
    merged back and passed through an output projection. Dropout is applied
    to the attention weights.
    """

    def __init__(self, d_model, n_heads, dropout=0.1):
        super().__init__()
        assert d_model % n_heads == 0
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_k = d_model // n_heads
        # Projection layers are created in a fixed order (query, key, value,
        # output) so parameter initialisation consumes the RNG identically.
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def _split_heads(self, projected, batch, seq):
        """Reshape (batch, seq, d_model) into (batch, n_heads, seq, d_k)."""
        return projected.view(batch, seq, self.n_heads, self.d_k).transpose(1, 2)

    def forward(self, x, mask=None):
        """Attend over ``x`` of shape (batch, seq, d_model).

        ``mask``, when given, is an additive (seq, seq) tensor broadcast over
        the batch and head dimensions.
        """
        batch, seq, width = x.size()
        queries = self._split_heads(self.W_q(x), batch, seq)
        keys = self._split_heads(self.W_k(x), batch, seq)
        values = self._split_heads(self.W_v(x), batch, seq)

        scores = torch.matmul(queries, keys.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            scores = scores + mask.unsqueeze(0).unsqueeze(0)

        weights = F.softmax(scores, dim=-1)
        weights = self.dropout(weights)

        context = torch.matmul(weights, values)
        merged = context.transpose(1, 2).contiguous().view(batch, seq, width)
        return self.W_o(merged)


class FeedForward(nn.Module):
    """Position-wise feed-forward network: Linear -> ReLU -> Dropout -> Linear."""

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Expand to ``d_ff``, apply ReLU and dropout, project back to ``d_model``."""
        hidden = F.relu(self.linear1(x))
        hidden = self.dropout(hidden)
        return self.linear2(hidden)


class DecoderLayer(nn.Module):
    """One post-norm decoder block: self-attention then feed-forward.

    Each sub-layer output passes through dropout, is added to its input
    (residual connection) and is then layer-normalised.
    """

    def __init__(self, d_model, n_heads, d_ff, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, n_heads, dropout)
        self.feed_forward = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Apply the attention and feed-forward sub-layers to ``x``."""
        attended = self.self_attn(x, mask)
        x = self.norm1(x + self.dropout1(attended))
        transformed = self.feed_forward(x)
        return self.norm2(x + self.dropout2(transformed))


class TransformerDecoder(nn.Module):
    """Decoder-only Transformer language model.

    Token embeddings (scaled by ``sqrt(d_model)``) plus fixed sinusoidal
    positional encodings are passed through ``n_layers`` decoder blocks and
    projected to vocabulary logits.
    """

    def __init__(self, vocab_size, d_model=256, n_heads=4, n_layers=6,
                 d_ff=1024, dropout=0.1, max_len=128):
        super().__init__()
        self.d_model = d_model
        self.embed = nn.Embedding(vocab_size, d_model)
        # Registered as a buffer: moves with the module but is not trained.
        positional_table = create_positional_encoding(max_len, d_model)
        self.register_buffer('pos_encoding', positional_table)
        blocks = [DecoderLayer(d_model, n_heads, d_ff, dropout)
                  for _ in range(n_layers)]
        self.layers = nn.ModuleList(blocks)
        self.output = nn.Linear(d_model, vocab_size)
        self.dropout = nn.Dropout(dropout)
        self._init_weights()

    def _init_weights(self):
        """Xavier-uniform initialise every parameter with more than one dimension."""
        matrices = (param for param in self.parameters() if param.dim() > 1)
        for weight in matrices:
            nn.init.xavier_uniform_(weight)

    def forward(self, x, mask=None):
        """Map token ids of shape (batch, seq) to logits (batch, seq, vocab).

        When ``mask`` is omitted a causal mask for the current sequence length
        is created on the input's device.
        """
        _, seq_len = x.size()
        if mask is None:
            mask = create_causal_mask(seq_len, x.device)

        hidden = self.embed(x) * math.sqrt(self.d_model)
        hidden = hidden + self.pos_encoding[:seq_len, :].unsqueeze(0)
        hidden = self.dropout(hidden)

        for block in self.layers:
            hidden = block(hidden, mask)
        return self.output(hidden)


print('Modules imported and TransformerDecoder defined')

In [None]:
# Central experiment configuration: one dict read by the dataset cell, the
# sweep runners and the per-sweep model builders below.
CONFIG = {
    # Fixed model defaults
    'd_model': 256,
    'n_heads': 4,
    'n_layers': 4,
    'd_ff': 1024,
    'dropout': 0.1,
    'max_len': 128,          # used both as positional-table length and as the LM block length

    # Training
    'batch_size': 32,
    'lr': 1e-2,
    'momentum': 0.0,
    'weight_decay': 0.0,
    'n_epochs': 8,
    'grad_clip': 1.0,        # max gradient norm passed to clip_grad_norm_

    # Data
    'max_vocab_size': 20000,
    'min_freq': 2,
    'max_train_tokens': 1200000,   # training ids are truncated to this many; None disables the cap

    # Experiment sweeps
    'depths': [1, 2, 4, 8],                # depth experiment: vary n_layers
    'widths': [64, 128, 256, 512],          # width experiment: vary d_model (must be divisible by n_heads)

    # Base model for regularization experiments
    'reg_n_layers': 4,
    'reg_d_model': 256,
    'reg_d_ff': 1024,

    # Regularization sweeps
    'dropout_rates': [0.0, 0.1, 0.2, 0.3, 0.5],
    'l2_weights': [0.0, 0.001, 0.01, 0.1],
    'l1_weights': [0.0, 0.0001, 0.001, 0.01],
    'enet_total_strength': 0.01,              # fixed strength for ratio sweep
    'enet_ratios': [0.0, 0.25, 0.5, 0.75, 1.0],  # 0=pure L2, 1=pure L1
    'enet_ratio': 0.5,                        # fixed ratio for strength sweep
    'enet_strengths': [0.0, 0.001, 0.01, 0.1],
    'batch_sizes': [8, 16, 32, 64, 128],
}

# All JSON results and plots are written under this directory.
OUTPUT_DIR = '/content/experiment_outputs_wikitext2'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Echo the sweep grids (the elastic-net grids are not printed here).
print('Configuration loaded')
print('Depth sweep:', CONFIG['depths'])
print('Width sweep:', CONFIG['widths'])
print('Dropout sweep:', CONFIG['dropout_rates'])
print('L2 sweep:', CONFIG['l2_weights'])
print('L1 sweep:', CONFIG['l1_weights'])
print('Batch size sweep:', CONFIG['batch_sizes'])
print('Output dir:', OUTPUT_DIR)

## 4. Build WikiText-2 Dataset


In [None]:
def read_wikitext_split(path):
    """Read a UTF-8 text file and return its non-blank lines, whitespace-stripped."""
    with open(path, 'r', encoding='utf-8') as handle:
        stripped = [raw.strip() for raw in handle]
    # filter(None, ...) drops the empty strings left by blank lines.
    return list(filter(None, stripped))


def build_vocab(train_lines, max_vocab_size=20000, min_freq=2):
    """Build a whitespace-token vocabulary from training lines.

    The specials ``<pad>``, ``<unk>`` and ``<eos>`` always occupy ids 0, 1
    and 2. The remaining slots are filled with the most frequent tokens whose
    count is at least ``min_freq``.

    Args:
        train_lines: iterable of strings, tokenised with ``str.split()``.
        max_vocab_size: upper bound on the vocabulary size including the
            specials; if it is smaller than the number of specials, only the
            specials are returned.
        min_freq: minimum count for a token to be included.

    Returns:
        ``(stoi, itos)``: a token-to-id dict and the matching id-to-token list.
    """
    counter = Counter()
    for line in train_lines:
        counter.update(line.split())

    specials = ['<pad>', '<unk>', '<eos>']
    reserved = set(specials)
    # Skip tokens that collide with a special: if the corpus contains a literal
    # special such as '<unk>', adding it again would duplicate it in ``itos``
    # and move ``stoi['<unk>']`` away from its reserved id.
    tokens = [
        tok for tok, freq in counter.most_common()
        if freq >= min_freq and tok not in reserved
    ]
    # Clamp at zero so a tiny max_vocab_size cannot become a negative slice bound.
    budget = max(max_vocab_size - len(specials), 0)
    itos = specials + tokens[:budget]
    stoi = {tok: i for i, tok in enumerate(itos)}
    return stoi, itos


def encode_lines(lines, stoi):
    """Encode lines into one flat list of token ids.

    Each line is split on whitespace, tokens missing from ``stoi`` map to the
    ``<unk>`` id, and the ``<eos>`` id is appended after every line.
    """
    unk, eos = stoi['<unk>'], stoi['<eos>']
    encoded = []
    for text in lines:
        encoded += [stoi.get(word, unk) for word in text.split()]
        encoded.append(eos)
    return encoded


def make_lm_blocks(token_ids, seq_len):
    """Cut a token stream into non-overlapping next-token-prediction blocks.

    Returns ``(x, y)``, two long tensors of shape ``(n_blocks, seq_len)`` where
    ``y`` is ``x`` shifted one token to the left. Trailing tokens that do not
    fill a complete block are dropped.

    Raises:
        ValueError: if there are too few tokens for a single block.
    """
    n_blocks = (len(token_ids) - 1) // seq_len
    if n_blocks < 1:
        raise ValueError('Not enough tokens to create at least one block')

    usable = n_blocks * seq_len
    # One extra token is kept so the final target position has a label.
    stream = torch.tensor(token_ids[:usable + 1], dtype=torch.long)
    inputs = stream[:usable].view(n_blocks, seq_len)
    targets = stream[1:usable + 1].view(n_blocks, seq_len)
    return inputs, targets


# Load the three splits as lists of non-empty, stripped lines.
train_lines = read_wikitext_split(os.path.join(WIKITEXT2_DIR, 'wiki.train.tokens'))
valid_lines = read_wikitext_split(os.path.join(WIKITEXT2_DIR, 'wiki.valid.tokens'))
test_lines = read_wikitext_split(os.path.join(WIKITEXT2_DIR, 'wiki.test.tokens'))

# The vocabulary is built from the training split only.
stoi, itos = build_vocab(
    train_lines,
    max_vocab_size=CONFIG['max_vocab_size'],
    min_freq=CONFIG['min_freq'],
)

train_ids = encode_lines(train_lines, stoi)
valid_ids = encode_lines(valid_lines, stoi)
test_ids = encode_lines(test_lines, stoi)

# Optional cap on the number of training tokens (None keeps everything).
if CONFIG['max_train_tokens'] is not None:
    train_ids = train_ids[:CONFIG['max_train_tokens']]

# Block length equals the model's max_len.
train_x, train_y = make_lm_blocks(train_ids, CONFIG['max_len'])
valid_x, valid_y = make_lm_blocks(valid_ids, CONFIG['max_len'])
test_x, test_y = make_lm_blocks(test_ids, CONFIG['max_len'])

# Only the training loader shuffles; validation and test keep a fixed order.
train_loader = DataLoader(TensorDataset(train_x, train_y), batch_size=CONFIG['batch_size'], shuffle=True)
valid_loader = DataLoader(TensorDataset(valid_x, valid_y), batch_size=CONFIG['batch_size'], shuffle=False)
test_loader = DataLoader(TensorDataset(test_x, test_y), batch_size=CONFIG['batch_size'], shuffle=False)

print('WikiText-2 prepared')
print('Vocab size:', len(itos))
print('Train blocks:', len(train_x))
print('Valid blocks:', len(valid_x))
print('Test blocks:', len(test_x))


## 5. Helper Functions


In [None]:
def evaluate_lm(model, dataloader, device):
    """Evaluate a language model over a dataloader without gradient tracking.

    Returns a dict with the per-token mean cross-entropy (``loss``), its
    exponential (``perplexity``) and the fraction of positions whose argmax
    prediction equals the target (``token_acc``). An empty dataloader yields
    a loss of 0.0.
    """
    model.eval()
    loss_sum, n_tokens, n_correct = 0.0, 0, 0

    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)
            logits = model(inputs)

            flat_logits = logits.view(-1, logits.size(-1))
            batch_loss = F.cross_entropy(flat_logits, targets.view(-1), reduction='sum')
            loss_sum += batch_loss.item()

            hits = logits.argmax(dim=-1) == targets
            n_correct += hits.sum().item()
            n_tokens += targets.numel()

    # Guard against division by zero when no batches were seen.
    denom = max(n_tokens, 1)
    mean_loss = loss_sum / denom
    return {
        'loss': mean_loss,
        'perplexity': float(np.exp(mean_loss)),
        'token_acc': n_correct / denom,
    }


def l1_penalty(model):
    """Sum of absolute values of all model parameters (for L1 regularization)."""
    return sum(p.abs().sum() for p in model.parameters())


def train_epoch_lm(model, dataloader, optimizer, device, grad_clip=1.0, l1_lambda=0.0):
    """Train ``model`` for one pass over ``dataloader``.

    The optimised objective is the mean token cross-entropy plus
    ``l1_lambda * l1_penalty(model)`` when ``l1_lambda > 0``. The reported
    ``loss`` and ``perplexity`` are computed from the cross-entropy alone, so
    they stay comparable with ``evaluate_lm`` and ``exp()`` is not applied to
    a penalty-inflated value.

    Args:
        model: module mapping a (batch, seq) id tensor to (batch, seq, vocab) logits.
        dataloader: iterable of ``(x, y)`` tensor batches.
        optimizer: optimizer over the model's parameters.
        device: device the batches are moved to.
        grad_clip: max gradient norm; ``None`` disables clipping.
        l1_lambda: L1 penalty coefficient; ``0`` disables the penalty.

    Returns:
        dict with ``loss`` (token-weighted mean cross-entropy), ``perplexity``
        (``exp(loss)``), ``objective`` (token-weighted mean of the full
        optimised loss, penalty included) and ``time`` (seconds elapsed).
    """
    model.train()
    ce_total = 0.0
    objective_total = 0.0
    total_tokens = 0
    t0 = time.time()

    for x, y in dataloader:
        x = x.to(device)
        y = y.to(device)

        optimizer.zero_grad()
        logits = model(x)
        ce = F.cross_entropy(logits.view(-1, logits.size(-1)), y.view(-1), reduction='mean')

        loss = ce
        if l1_lambda > 0:
            loss = ce + l1_lambda * l1_penalty(model)

        loss.backward()
        if grad_clip is not None:
            torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        optimizer.step()

        # Weight by token count so a smaller final batch is averaged correctly.
        n_tokens = y.numel()
        ce_total += ce.item() * n_tokens
        objective_total += loss.item() * n_tokens
        total_tokens += n_tokens

    denom = max(total_tokens, 1)
    avg_loss = ce_total / denom
    return {
        'loss': avg_loss,
        'perplexity': float(np.exp(avg_loss)),
        'objective': objective_total / denom,
        'time': time.time() - t0,
    }


def generate_text(model, prompt, stoi, itos, device, max_len_ctx=128, max_new_tokens=40, temperature=1.0):
    """Sample a continuation of ``prompt`` one token at a time.

    Only the most recent ``max_len_ctx`` tokens are fed to the model, but the
    full sequence (prompt plus every generated token) is kept and returned, so
    long prompts or long generations are not cut off in the output.

    Args:
        model: module mapping a (1, seq) id tensor to (1, seq, vocab) logits.
        prompt: whitespace-separated text; unknown tokens map to ``<unk>``,
            and an empty prompt starts from a single ``<eos>`` token.
        stoi: token-to-id mapping containing ``<unk>`` and ``<eos>``.
        itos: id-to-token list.
        device: device on which the input tensor is created.
        max_len_ctx: maximum number of trailing tokens given to the model.
        max_new_tokens: upper bound on generated tokens; generation stops
            early once ``<eos>`` is sampled.
        temperature: softmax temperature, floored at 1e-5.

    Returns:
        The space-joined tokens of the full sequence with ``<eos>`` removed.
    """
    model.eval()
    unk_id = stoi['<unk>']
    eos_id = stoi['<eos>']

    ids = [stoi.get(tok, unk_id) for tok in prompt.split()]
    if not ids:
        ids = [eos_id]

    scale = max(temperature, 1e-5)

    with torch.no_grad():
        for _ in range(max_new_tokens):
            # Feed only the context window; ``ids`` itself is never truncated.
            window = ids[-max_len_ctx:]
            x = torch.tensor([window], dtype=torch.long, device=device)

            logits = model(x)
            probs = torch.softmax(logits[:, -1, :] / scale, dim=-1)
            next_id = torch.multinomial(probs, num_samples=1).item()
            ids.append(next_id)

            if next_id == eos_id:
                break

    out_toks = [itos[i] if 0 <= i < len(itos) else '<unk>' for i in ids]
    return ' '.join(tok for tok in out_toks if tok != '<eos>')


print('Helper functions defined')

In [None]:
def plot_param_count(results, iv_name, output_path):
    """Draw, save and show a bar chart of parameter count per sweep setting.

    ``results`` maps each setting to a dict holding ``n_params``; bars are
    ordered by sorted setting and annotated with the exact count.
    """
    settings = sorted(results.keys())
    counts = [results[setting]['n_params'] for setting in settings]
    tick_labels = [str(setting) for setting in settings]

    fig, ax = plt.subplots(figsize=(7, 4))
    ax.bar(tick_labels, counts, color='steelblue')
    ax.set_xlabel(iv_name)
    ax.set_ylabel('Parameters')
    ax.set_title(f'Parameter Count vs. {iv_name}')
    ax.grid(True, axis='y', alpha=0.3)

    # Write the count just above each bar.
    for idx, count in enumerate(counts):
        ax.text(idx, count, f'{count:,}', ha='center', va='bottom', fontsize=8)

    plt.tight_layout()
    plt.savefig(output_path, dpi=150, bbox_inches='tight')
    plt.show()
    print('Saved', output_path)


def plot_loss_curves(results, iv_name, output_path):
    """Plot per-epoch train and validation loss, one line per sweep setting.

    The left panel shows ``train_losses`` and the right panel shows
    ``valid_losses``; the figure is saved to ``output_path`` and then shown.
    """
    settings = sorted(results.keys())
    palette = plt.cm.viridis(np.linspace(0, 0.9, len(settings)))

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    fig.suptitle(f'Loss Curves by {iv_name}', fontsize=14)

    series_keys = ('train_losses', 'valid_losses')
    for setting, color in zip(settings, palette):
        run = results[setting]
        epochs = list(range(1, len(run['train_losses']) + 1))
        label = f'{iv_name}={setting} ({run["n_params"]:,} params)'
        # Same x-axis and label on both panels; only the series differs.
        for ax, series in zip(axes, series_keys):
            ax.plot(epochs, run[series], color=color, marker='o',
                    markersize=3, label=label)

    for ax, title in zip(axes, ('Train Loss', 'Valid Loss')):
        ax.set_title(title)
    for ax in axes:
        ax.set_xlabel('Epoch')
        ax.set_ylabel('Loss')
        ax.grid(True, alpha=0.3)
        ax.legend(fontsize=8)

    plt.tight_layout()
    plt.savefig(output_path, dpi=150, bbox_inches='tight')
    plt.show()
    print('Saved', output_path)


print('Plotting functions defined')

In [None]:
def _to_jsonable(value):
    """Recursively convert NumPy scalars/arrays inside ``value`` to plain Python."""
    if isinstance(value, np.generic):
        # .item() yields the matching Python scalar (int stays int, float stays float).
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    if isinstance(value, dict):
        return {k: _to_jsonable(v) for k, v in value.items()}
    if isinstance(value, (list, tuple)):
        return [_to_jsonable(v) for v in value]
    return value


def save_results(results, output_path):
    """Write a sweep-results dict to ``output_path`` as indented JSON.

    Top-level keys (the sweep values) are stringified. NumPy scalars and
    arrays anywhere inside the per-setting data, including inside lists such
    as loss histories, are converted to plain Python values so ``json.dump``
    does not fail on them.

    Args:
        results: dict mapping sweep value -> dict of metrics.
        output_path: destination file path.
    """
    serializable = {
        str(setting): {k: _to_jsonable(v) for k, v in data.items()}
        for setting, data in results.items()
    }

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(serializable, f, indent=2)

    print('Saved results to', output_path)


print('Result saver defined')


## 6. Experiment Runner

In [None]:
# Optional: reduce runtime for quick checks
# CONFIG['depths'] = [1, 2]
# CONFIG['widths'] = [64, 128]
# CONFIG['n_epochs'] = 2
# CONFIG['max_train_tokens'] = 300000

In [None]:
def run_sweep(sweep_values, config, vary='depth'):
    """
    Train a fresh model for every sweep value and gather its metrics.

    Args:
        sweep_values: list of ints -- n_layers (depth) or d_model (width)
        config: dict with fixed hyperparameters
        vary: 'depth' varies n_layers; any other value is treated as a
              width sweep (d_model = value, d_ff = 4 * value)

    Returns:
        dict keyed by sweep value holding the parameter count, per-epoch
        train/valid losses, per-epoch valid perplexities and final test metrics
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f'Device: {device}  |  Sweeping {vary} over {sweep_values}')

    banner = '=' * 60
    results = {}
    for val in sweep_values:
        print('\n' + banner)
        print(f'Training {vary}={val}')
        print(banner)

        # Resolve the architecture for this setting.
        if vary == 'depth':
            n_layers = val
            d_model = config['d_model']
            d_ff = config['d_ff']
        else:  # width
            n_layers = config['n_layers']
            d_model = val
            d_ff = val * 4

        model = TransformerDecoder(
            vocab_size=len(itos),
            d_model=d_model,
            n_heads=config['n_heads'],
            n_layers=n_layers,
            d_ff=d_ff,
            dropout=config['dropout'],
            max_len=config['max_len'],
        )
        model = model.to(device)

        n_params = sum(p.numel() for p in model.parameters())
        print(f'  d_model={d_model}  n_layers={n_layers}  d_ff={d_ff}  params={n_params:,}')

        optimizer = optim.SGD(
            model.parameters(),
            lr=config['lr'],
            momentum=config['momentum'],
            weight_decay=config['weight_decay'],
        )

        # Per-epoch curves, stored under the keys used in the results dict.
        history = {'train_losses': [], 'valid_losses': [], 'valid_ppls': []}

        for epoch in range(1, config['n_epochs'] + 1):
            t = train_epoch_lm(model, train_loader, optimizer, device,
                               grad_clip=config['grad_clip'])
            v = evaluate_lm(model, valid_loader, device)
            history['train_losses'].append(t['loss'])
            history['valid_losses'].append(v['loss'])
            history['valid_ppls'].append(v['perplexity'])
            print(f"  Epoch {epoch:2d}/{config['n_epochs']} | "
                  f"train loss {t['loss']:.4f} | "
                  f"valid loss {v['loss']:.4f} | "
                  f"valid ppl {v['perplexity']:.2f} | "
                  f"time {t['time']:.1f}s")

        # Final held-out evaluation after the last epoch.
        test = evaluate_lm(model, test_loader, device)
        print(f"  Test loss {test['loss']:.4f} | "
              f"ppl {test['perplexity']:.2f} | "
              f"acc {test['token_acc']:.2%}")

        results[val] = {
            'n_params': n_params,
            **history,
            'test_loss': test['loss'],
            'test_perplexity': test['perplexity'],
            'test_token_acc': test['token_acc'],
        }

    return results


print('Experiment runner defined')

In [None]:
def run_regularization_sweep(sweep_name, sweep_values, build_fn, config,
                              make_loader_fn=None):
    """
    Generic sweep runner for regularization / batch-size experiments.

    Args:
        sweep_name: label used in log lines (e.g. "Dropout", "L2")
        sweep_values: list of values to iterate over
        build_fn(val, config): returns (model, optimizer, l1_lambda) for each value
        config: dict with fixed hyperparameters
        make_loader_fn(val): optional callback returning a train DataLoader
                             (used by batch-size sweep); when omitted, the
                             module-level train_loader is used
    Returns:
        results dict keyed by sweep value
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f'Device: {device}  |  Sweeping {sweep_name} over {sweep_values}')

    banner = '=' * 60
    results = {}
    for val in sweep_values:
        print('\n' + banner)
        print(f'Training {sweep_name}={val}')
        print(banner)

        model, optimizer, l1_lam = build_fn(val, config)
        model = model.to(device)
        n_params = sum(p.numel() for p in model.parameters())
        print(f'  params={n_params:,}  l1_lambda={l1_lam}')

        # Use a per-setting loader only when the caller supplied a factory.
        if make_loader_fn:
            cur_train_loader = make_loader_fn(val)
        else:
            cur_train_loader = train_loader

        history = {'train_losses': [], 'valid_losses': [], 'valid_ppls': []}

        for epoch in range(1, config['n_epochs'] + 1):
            t = train_epoch_lm(model, cur_train_loader, optimizer, device,
                               grad_clip=config['grad_clip'], l1_lambda=l1_lam)
            v = evaluate_lm(model, valid_loader, device)
            history['train_losses'].append(t['loss'])
            history['valid_losses'].append(v['loss'])
            history['valid_ppls'].append(v['perplexity'])
            print(f"  Epoch {epoch:2d}/{config['n_epochs']} | "
                  f"train loss {t['loss']:.4f} | "
                  f"valid loss {v['loss']:.4f} | "
                  f"valid ppl {v['perplexity']:.2f} | "
                  f"time {t['time']:.1f}s")

        # Final held-out evaluation after the last epoch.
        test = evaluate_lm(model, test_loader, device)
        print(f"  Test loss {test['loss']:.4f} | "
              f"ppl {test['perplexity']:.2f} | "
              f"acc {test['token_acc']:.2%}")

        results[val] = {
            'n_params': n_params,
            **history,
            'test_loss': test['loss'],
            'test_perplexity': test['perplexity'],
            'test_token_acc': test['token_acc'],
        }

    return results


print('Regularization sweep runner defined')

## 7. Depth Experiment (vary n_layers, fixed d_model)

In [None]:
# Train one model per depth, then persist the metrics and both plots under OUTPUT_DIR.
depth_results = run_sweep(CONFIG['depths'], CONFIG, vary='depth')

save_results(depth_results, os.path.join(OUTPUT_DIR, 'depth_results.json'))
plot_param_count(depth_results, 'Depth (n_layers)',
                 os.path.join(OUTPUT_DIR, 'depth_param_count.png'))
plot_loss_curves(depth_results, 'Depth',
                 os.path.join(OUTPUT_DIR, 'depth_loss_curves.png'))

## 8. Width Experiment (vary d_model, fixed n_layers)

In [None]:
# Train one model per width (d_ff follows as 4 * d_model inside run_sweep),
# then persist the metrics and both plots under OUTPUT_DIR.
width_results = run_sweep(CONFIG['widths'], CONFIG, vary='width')

save_results(width_results, os.path.join(OUTPUT_DIR, 'width_results.json'))
plot_param_count(width_results, 'Width (d_model)',
                 os.path.join(OUTPUT_DIR, 'width_param_count.png'))
plot_loss_curves(width_results, 'Width',
                 os.path.join(OUTPUT_DIR, 'width_loss_curves.png'))

## 9. Batch Size Sweep

In [None]:
def build_batch_size_model(val, config):
    """Build the reference model and SGD optimizer for the batch-size sweep.

    ``val`` (the batch size) is not used here because the batch size only
    affects the data loader. Returns ``(model, optimizer, l1_lambda)`` with
    ``l1_lambda`` fixed at 0.0.
    """
    arch = dict(
        vocab_size=len(itos),
        d_model=config['reg_d_model'],
        n_heads=config['n_heads'],
        n_layers=config['reg_n_layers'],
        d_ff=config['reg_d_ff'],
        dropout=config['dropout'],
        max_len=config['max_len'],
    )
    model = TransformerDecoder(**arch)
    sgd = optim.SGD(
        model.parameters(),
        lr=config['lr'],
        momentum=config['momentum'],
        weight_decay=config['weight_decay'],
    )
    l1_lambda = 0.0
    return model, sgd, l1_lambda

def make_batch_loader(batch_size):
    """Return a shuffling DataLoader over the training blocks with the given batch size."""
    dataset = TensorDataset(train_x, train_y)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Batch-size sweep: same model for every setting; only the training loader changes.
batch_results = run_regularization_sweep(
    'BatchSize', CONFIG['batch_sizes'], build_batch_size_model, CONFIG,
    make_loader_fn=make_batch_loader,
)

save_results(batch_results, os.path.join(OUTPUT_DIR, 'batch_size_results.json'))
plot_loss_curves(batch_results, 'BatchSize',
                 os.path.join(OUTPUT_DIR, 'batch_size_loss_curves.png'))

## 10. Dropout Sweep

In [None]:
def build_dropout_model(val, config):
    """Build the reference model with dropout rate ``val`` plus an SGD optimizer.

    Returns ``(model, optimizer, l1_lambda)`` with ``l1_lambda`` fixed at 0.0.
    """
    arch = dict(
        vocab_size=len(itos),
        d_model=config['reg_d_model'],
        n_heads=config['n_heads'],
        n_layers=config['reg_n_layers'],
        d_ff=config['reg_d_ff'],
        dropout=val,
        max_len=config['max_len'],
    )
    model = TransformerDecoder(**arch)
    sgd = optim.SGD(
        model.parameters(),
        lr=config['lr'],
        momentum=config['momentum'],
        weight_decay=config['weight_decay'],
    )
    l1_lambda = 0.0
    return model, sgd, l1_lambda

# Dropout sweep on the shared training loader; save the metrics and the loss-curve plot.
dropout_results = run_regularization_sweep(
    'Dropout', CONFIG['dropout_rates'], build_dropout_model, CONFIG,
)

save_results(dropout_results, os.path.join(OUTPUT_DIR, 'dropout_results.json'))
plot_loss_curves(dropout_results, 'Dropout',
                 os.path.join(OUTPUT_DIR, 'dropout_loss_curves.png'))

## 11. L2 Regularization Sweep (weight decay)

In [None]:
def build_l2_model(val, config):
    """Build the reference model with an SGD optimizer whose weight decay is ``val``.

    Returns ``(model, optimizer, l1_lambda)`` with ``l1_lambda`` fixed at 0.0.
    """
    arch = dict(
        vocab_size=len(itos),
        d_model=config['reg_d_model'],
        n_heads=config['n_heads'],
        n_layers=config['reg_n_layers'],
        d_ff=config['reg_d_ff'],
        dropout=config['dropout'],
        max_len=config['max_len'],
    )
    model = TransformerDecoder(**arch)
    sgd = optim.SGD(
        model.parameters(),
        lr=config['lr'],
        momentum=config['momentum'],
        weight_decay=val,
    )
    l1_lambda = 0.0
    return model, sgd, l1_lambda

# L2 sweep: the swept value is passed to SGD as weight_decay by build_l2_model.
l2_results = run_regularization_sweep(
    'L2 (weight_decay)', CONFIG['l2_weights'], build_l2_model, CONFIG,
)

save_results(l2_results, os.path.join(OUTPUT_DIR, 'l2_results.json'))
plot_loss_curves(l2_results, 'L2 (weight_decay)',
                 os.path.join(OUTPUT_DIR, 'l2_loss_curves.png'))

## 12. L1 Regularization Sweep

In [None]:
def build_l1_model(val, config):
    """Build the reference model and SGD optimizer for the L1 sweep.

    Returns ``(model, optimizer, l1_lambda)`` where ``l1_lambda`` is ``val``;
    the optimizer keeps the configured default weight decay.
    """
    arch = dict(
        vocab_size=len(itos),
        d_model=config['reg_d_model'],
        n_heads=config['n_heads'],
        n_layers=config['reg_n_layers'],
        d_ff=config['reg_d_ff'],
        dropout=config['dropout'],
        max_len=config['max_len'],
    )
    model = TransformerDecoder(**arch)
    sgd = optim.SGD(
        model.parameters(),
        lr=config['lr'],
        momentum=config['momentum'],
        weight_decay=config['weight_decay'],
    )
    return model, sgd, val

# L1 sweep: the swept value is returned by build_l1_model as the L1 coefficient.
l1_results = run_regularization_sweep(
    'L1', CONFIG['l1_weights'], build_l1_model, CONFIG,
)

save_results(l1_results, os.path.join(OUTPUT_DIR, 'l1_results.json'))
plot_loss_curves(l1_results, 'L1',
                 os.path.join(OUTPUT_DIR, 'l1_loss_curves.png'))

## 13. Elastic Net -- Ratio Sweep

Fixed total strength; sweep the L1 share of that strength (0 = pure L2, 1 = pure L1). The L1 coefficient is `total * ratio` and the weight decay is `total * (1 - ratio)`.

In [None]:
def build_enet_ratio_model(ratio, config):
    """Build the reference model for one point of the elastic-net ratio sweep.

    The fixed total strength ``config['enet_total_strength']`` is split into
    an L1 coefficient ``total * ratio`` and a weight decay
    ``total * (1 - ratio)``. Returns ``(model, optimizer, l1_lambda)``.
    """
    strength = config['enet_total_strength']
    l1_lambda = strength * ratio
    weight_decay = strength * (1 - ratio)

    arch = dict(
        vocab_size=len(itos),
        d_model=config['reg_d_model'],
        n_heads=config['n_heads'],
        n_layers=config['reg_n_layers'],
        d_ff=config['reg_d_ff'],
        dropout=config['dropout'],
        max_len=config['max_len'],
    )
    model = TransformerDecoder(**arch)
    sgd = optim.SGD(
        model.parameters(),
        lr=config['lr'],
        momentum=config['momentum'],
        weight_decay=weight_decay,
    )
    return model, sgd, l1_lambda

# Elastic-net ratio sweep at the fixed total strength from CONFIG.
enet_ratio_results = run_regularization_sweep(
    'ENet Ratio', CONFIG['enet_ratios'], build_enet_ratio_model, CONFIG,
)

save_results(enet_ratio_results, os.path.join(OUTPUT_DIR, 'enet_ratio_results.json'))
plot_loss_curves(enet_ratio_results, 'ENet Ratio',
                 os.path.join(OUTPUT_DIR, 'enet_ratio_loss_curves.png'))

## 14. Elastic Net -- Strength Sweep

Fixed L1 share of the penalty (`enet_ratio`); sweep the total regularization strength.

In [None]:
def build_enet_strength_model(strength, config):
    """Build the reference model for one point of the elastic-net strength sweep.

    With the fixed ``config['enet_ratio']``, the L1 coefficient is
    ``strength * ratio`` and the weight decay is ``strength * (1 - ratio)``.
    Returns ``(model, optimizer, l1_lambda)``.
    """
    l1_share = config['enet_ratio']
    l1_lambda = strength * l1_share
    weight_decay = strength * (1 - l1_share)

    arch = dict(
        vocab_size=len(itos),
        d_model=config['reg_d_model'],
        n_heads=config['n_heads'],
        n_layers=config['reg_n_layers'],
        d_ff=config['reg_d_ff'],
        dropout=config['dropout'],
        max_len=config['max_len'],
    )
    model = TransformerDecoder(**arch)
    sgd = optim.SGD(
        model.parameters(),
        lr=config['lr'],
        momentum=config['momentum'],
        weight_decay=weight_decay,
    )
    return model, sgd, l1_lambda

# Elastic-net strength sweep at the fixed L1 share from CONFIG.
enet_strength_results = run_regularization_sweep(
    'ENet Strength', CONFIG['enet_strengths'], build_enet_strength_model, CONFIG,
)

save_results(enet_strength_results, os.path.join(OUTPUT_DIR, 'enet_strength_results.json'))
plot_loss_curves(enet_strength_results, 'ENet Strength',
                 os.path.join(OUTPUT_DIR, 'enet_strength_loss_curves.png'))

## 15. Summary

In [None]:
# Final report: list every regular file in OUTPUT_DIR with its size in KB.
print('=' * 60)
print('ALL EXPERIMENTS COMPLETE')
print('=' * 60)
print('\nOutput directory:', OUTPUT_DIR)
print('\nGenerated files:')
for f in sorted(os.listdir(OUTPUT_DIR)):
    path = os.path.join(OUTPUT_DIR, f)
    # Sub-directories are skipped; only plain files are listed.
    if os.path.isfile(path):
        size_kb = os.path.getsize(path) / 1024
        print(f'  {f} ({size_kb:.1f} KB)')