# Empirical Scaling Harness: SwiGLU vs GeLU

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/YOUR_USERNAME/empirical-scaling-harness/blob/main/scaling_experiment.ipynb)

This notebook runs a complete scaling law experiment comparing SwiGLU and GeLU activations.

**Research question**: Does SwiGLU change the scaling exponent (often written α; reported as `b` in the fits below), or does it merely provide a constant offset in compute efficiency relative to GeLU?

## Runtime Estimates
- **Anchors only** (3M, 10M, 30M): ~2-3 hours on Colab GPU
- **Full sweep** (includes 85M holdout): ~6-8 hours on Colab GPU

---

## 0. Setup

In [None]:
# Install dependencies (notebook shell escape; -q keeps pip's output quiet)
!pip install -q torch transformers datasets tensorboard scipy matplotlib pandas tqdm

In [None]:
# Report the PyTorch build and, when CUDA is usable, the first GPU's name and memory
import torch

cuda_ok = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ok}")
if cuda_ok:
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {gpu_props.total_memory / 1e9:.1f} GB")

In [None]:
# Configuration - EDIT THESE
RUN_HOLDOUT = True  # Set to False for faster run (anchors only)
ACTIVATIONS = ["gelu", "swiglu"]  # Which activations to test

# Model sizes: target parameter counts handed to build_config.
# The analysis later separates anchors from holdout by measured size
# (params > 50_000_000), not by membership in these lists.
ANCHOR_PARAMS = [3_000_000, 10_000_000, 30_000_000]
HOLDOUT_PARAMS = [85_000_000]
# Holdout sizes are only trained when RUN_HOLDOUT is True.
ALL_PARAMS = ANCHOR_PARAMS + (HOLDOUT_PARAMS if RUN_HOLDOUT else [])

print(f"Will train {len(ACTIVATIONS)} activations × {len(ALL_PARAMS)} sizes = {len(ACTIVATIONS) * len(ALL_PARAMS)} models")

## 1. Model Configuration

In [None]:
from dataclasses import dataclass
from typing import Literal
import math

@dataclass
class ModelConfig:
    """Configuration for a decoder-only transformer.

    Holds the architecture hyperparameters and provides an analytic count of
    the trainable parameters they imply.
    """
    # Number of transformer blocks.
    n_layers: int
    # Residual-stream / embedding width.
    d_model: int
    # Number of attention heads; head_dim is d_model // n_heads.
    n_heads: int
    # Hidden width of the feed-forward sublayer.
    d_ff: int
    # Number of token embeddings.
    vocab_size: int
    # Number of learned position embeddings.
    max_seq_len: int
    # Feed-forward variant; "swiglu" uses three bias-free projections.
    activation: Literal["gelu", "swiglu"]
    dropout: float = 0.0

    @property
    def head_dim(self) -> int:
        """Per-head width (integer division of d_model by n_heads)."""
        return self.d_model // self.n_heads

    def count_parameters(self) -> int:
        """Return the analytic number of trainable parameters.

        The terms mirror the ``Transformer`` module defined in this file so
        that this value equals ``Transformer(config).count_parameters()``:

        * token embedding (shared with the output head, so counted once)
        * learned positional embedding
        * per block: bias-free attention projections (qkv + output = 4 d^2)
        * per block: feed-forward weights -- three bias-free matrices for
          SwiGLU, two matrices plus their biases for GeLU
        * LayerNorm weight and bias: two norms per block plus the final norm
        """
        d = self.d_model
        token_embed = self.vocab_size * d
        pos_embed = self.max_seq_len * d
        attn = 4 * d * d
        if self.activation == "swiglu":
            ffn = 3 * d * self.d_ff
        else:
            # Two weight matrices plus the up (d_ff) and down (d) bias vectors.
            ffn = 2 * d * self.d_ff + self.d_ff + d
        norms = 4 * d * self.n_layers + 2 * d
        return token_embed + pos_embed + (attn + ffn) * self.n_layers + norms

# Preset width/depth per nominal size. build_config parses each key
# ("3M" -> 3_000_000) and uses the first preset within ±20% of its target.
MODEL_SCALES = {
    "3M": {"d_model": 256, "n_layers": 4},
    "10M": {"d_model": 384, "n_layers": 6},
    "30M": {"d_model": 512, "n_layers": 8},
    "85M": {"d_model": 768, "n_layers": 12},
}

def compute_ffn_dim(d_model: int, activation: str) -> int:
    """Return the feed-forward hidden width for ``d_model`` and ``activation``.

    SwiGLU uses 8/3 * d_model truncated to an integer and rounded down to a
    multiple of 64 (64 is substituted when that rounds to zero). Every other
    activation value uses 4 * d_model.
    """
    if activation != "swiglu":
        return 4 * d_model

    raw_width = int(8 * d_model / 3)
    aligned = raw_width - raw_width % 64
    return aligned or 64

def build_config(target_params: int, activation: str = "gelu", vocab_size: int = 50257, max_seq_len: int = 512) -> ModelConfig:
    """Build a ModelConfig whose analytic parameter count approximates ``target_params``.

    Width and starting depth come from the first MODEL_SCALES preset whose
    nominal size is within ±20% of the target; otherwise they are derived from
    sqrt(target / 20) rounded down to a multiple of 64. Depth is then increased
    (up to 24 layers) while the count is below 90% of the target, and
    afterwards decreased (down to 1 layer) while it is above 110%.

    NOTE(review): count_parameters() includes vocab_size * d_model. With the
    default vocab_size=50257 that term alone exceeds 1.1x the smaller targets
    (50257 * 256 is about 12.9M against a 3M target), so the shrink loop runs
    down to n_layers=1 and the preset depth is discarded -- confirm intended.
    """
    lower, upper = target_params * 0.8, target_params * 1.2
    preset = next(
        (
            scale
            for name, scale in MODEL_SCALES.items()
            if lower <= int(name.replace("M", "")) * 1_000_000 <= upper
        ),
        None,
    )

    if preset is not None:
        d_model, n_layers = preset["d_model"], preset["n_layers"]
    else:
        # Heuristic fallback: width from the target, depth tied to width.
        width_guess = int(math.sqrt(target_params / 20))
        d_model = max(64, (width_guess // 64) * 64)
        n_layers = max(2, d_model // 64)

    d_ff = compute_ffn_dim(d_model, activation)
    n_heads = max(1, d_model // 64)

    def with_depth(depth: int) -> ModelConfig:
        # Everything except depth is fixed once the width has been chosen.
        return ModelConfig(
            n_layers=depth, d_model=d_model, n_heads=n_heads, d_ff=d_ff,
            vocab_size=vocab_size, max_seq_len=max_seq_len, activation=activation,
        )

    config = with_depth(n_layers)
    actual_params = config.count_parameters()

    # Grow first, one layer at a time ...
    while actual_params < target_params * 0.9 and n_layers < 24:
        n_layers += 1
        config = with_depth(n_layers)
        actual_params = config.count_parameters()

    # ... then shrink if the result overshoots.
    while actual_params > target_params * 1.1 and n_layers > 1:
        n_layers -= 1
        config = with_depth(n_layers)
        actual_params = config.count_parameters()

    return config

def get_training_config(params: int) -> dict:
    """Return batch size, accumulation steps, learning rate and AMP flag for a model size.

    Tiers are selected by inclusive upper bounds on ``params``; anything above
    the last bound gets the final tier. batch_size * gradient_accumulation_steps
    is 32 in every tier.
    """
    # (inclusive upper bound, batch_size, gradient_accumulation_steps, learning_rate, use_amp)
    tiers = (
        (5_000_000, 32, 1, 3e-4, False),
        (15_000_000, 16, 2, 2e-4, False),
        (50_000_000, 8, 4, 1e-4, True),
    )
    batch_size, accum_steps, learning_rate, use_amp = 4, 8, 6e-5, True
    for upper_bound, *settings in tiers:
        if params <= upper_bound:
            batch_size, accum_steps, learning_rate, use_amp = settings
            break

    return {
        "batch_size": batch_size,
        "gradient_accumulation_steps": accum_steps,
        "learning_rate": learning_rate,
        "use_amp": use_amp,
    }

def compute_optimal_tokens(params: int, tokens_per_param: int = 20) -> int:
    """Return the training-token budget for a model with ``params`` parameters.

    Args:
        params: Parameter count of the model.
        tokens_per_param: Tokens to train on per parameter. Defaults to 20,
            the ratio this notebook previously hard-coded.

    Returns:
        ``tokens_per_param * params``.
    """
    return tokens_per_param * params

def compute_flops(params: int, tokens: int) -> float:
    """Estimate training compute as 6 * params * tokens."""
    flops_per_token = 6 * params
    return flops_per_token * tokens

# Test config generation: show the size and shape build_config picks for every target
print("Model Configurations:")
for params, act in [(size, name) for size in ALL_PARAMS for name in ("gelu", "swiglu")]:
    cfg = build_config(params, act)
    n_actual = cfg.count_parameters()
    print(f"  {act} {params/1e6:.0f}M -> {n_actual/1e6:.2f}M actual (L={cfg.n_layers}, d={cfg.d_model})")

## 2. Model Architecture

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SwiGLU(nn.Module):
    """Gated feed-forward layer: ``w_down(silu(w_gate(x)) * w_up(x))`` followed by dropout.

    All three projections are bias-free linear layers.
    """

    def __init__(self, d_model: int, d_ff: int, dropout: float = 0.0):
        super().__init__()
        # Two parallel d_model -> d_ff projections (gate and value) ...
        self.w_gate = nn.Linear(d_model, d_ff, bias=False)
        self.w_up = nn.Linear(d_model, d_ff, bias=False)
        # ... and one d_ff -> d_model projection back to the residual width.
        self.w_down = nn.Linear(d_ff, d_model, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        gate = F.silu(self.w_gate(x))
        gated = gate * self.w_up(x)
        projected = self.w_down(gated)
        return self.dropout(projected)

class GeLUFFN(nn.Module):
    """Two-layer feed-forward network: ``w_down(gelu(w_up(x)))`` followed by dropout.

    Both linear layers keep their default bias terms.
    """

    def __init__(self, d_model: int, d_ff: int, dropout: float = 0.0):
        super().__init__()
        self.w_up = nn.Linear(d_model, d_ff)
        self.w_down = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        hidden = F.gelu(self.w_up(x))
        projected = self.w_down(hidden)
        return self.dropout(projected)

class CausalSelfAttention(nn.Module):
    """Multi-head self-attention where each position attends only to itself and earlier positions.

    Uses one fused bias-free qkv projection, a bias-free output projection,
    and dropout applied to the attention weights.
    """

    def __init__(self, config: ModelConfig):
        super().__init__()
        width, heads, ctx = config.d_model, config.n_heads, config.max_seq_len
        self.n_heads = heads
        self.head_dim = width // heads
        self.d_model = width
        self.qkv = nn.Linear(width, 3 * width, bias=False)
        self.out_proj = nn.Linear(width, width, bias=False)
        self.dropout = nn.Dropout(config.dropout)
        # Lower-triangular ones, shaped (1, 1, ctx, ctx) to broadcast over batch and heads.
        lower_tri = torch.tril(torch.ones(ctx, ctx))
        self.register_buffer("causal_mask", lower_tri.view(1, 1, ctx, ctx))

    def forward(self, x):
        B, T, C = x.shape

        def to_heads(t):
            # (B, T, C) -> (B, n_heads, T, head_dim)
            return t.view(B, T, self.n_heads, self.head_dim).transpose(1, 2)

        q, k, v = (to_heads(part) for part in self.qkv(x).split(self.d_model, dim=-1))

        scores = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(self.head_dim))
        # Zero entries of the mask mark future positions; they get -inf before softmax.
        blocked = self.causal_mask[:, :, :T, :T] == 0
        weights = F.softmax(scores.masked_fill(blocked, float('-inf')), dim=-1)
        weights = self.dropout(weights)

        merged = (weights @ v).transpose(1, 2).contiguous().view(B, T, C)
        return self.out_proj(merged)

class TransformerBlock(nn.Module):
    """Pre-norm transformer block: attention then feed-forward, each added back to its input."""

    def __init__(self, config: ModelConfig):
        super().__init__()
        self.ln1 = nn.LayerNorm(config.d_model)
        self.attn = CausalSelfAttention(config)
        self.ln2 = nn.LayerNorm(config.d_model)
        # Any activation value other than "swiglu" selects the GeLU feed-forward.
        ffn_cls = SwiGLU if config.activation == "swiglu" else GeLUFFN
        self.ffn = ffn_cls(config.d_model, config.d_ff, config.dropout)

    def forward(self, x):
        attn_out = self.attn(self.ln1(x))
        x = x + attn_out
        ffn_out = self.ffn(self.ln2(x))
        return x + ffn_out

class Transformer(nn.Module):
    """Decoder-only language model with learned positions and a tied output head.

    ``forward`` returns ``(logits, loss)``; ``loss`` is None unless labels are
    given, in which case it is the next-token cross-entropy with label -100 ignored.
    """

    def __init__(self, config: ModelConfig):
        super().__init__()
        self.config = config
        self.token_emb = nn.Embedding(config.vocab_size, config.d_model)
        self.pos_emb = nn.Embedding(config.max_seq_len, config.d_model)
        self.blocks = nn.ModuleList([TransformerBlock(config) for _ in range(config.n_layers)])
        self.ln_f = nn.LayerNorm(config.d_model)
        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
        # The output head shares its weight tensor with the token embedding.
        self.lm_head.weight = self.token_emb.weight
        self.apply(self._init_weights)
        # Projections that write into the residual stream are re-initialised
        # with a std scaled down by sqrt(2 * n_layers).
        for block in self.blocks:
            residual_std = 0.02 / math.sqrt(2 * config.n_layers)
            nn.init.normal_(block.attn.out_proj.weight, std=residual_std)
            if hasattr(block.ffn, 'w_down'):
                nn.init.normal_(block.ffn.w_down.weight, std=residual_std)

    def _init_weights(self, module):
        """Initialise one submodule: N(0, 0.02) weights, zero biases, identity LayerNorm."""
        if isinstance(module, nn.LayerNorm):
            nn.init.ones_(module.weight)
            nn.init.zeros_(module.bias)
            return
        if isinstance(module, (nn.Linear, nn.Embedding)):
            nn.init.normal_(module.weight, std=0.02)
            # Embeddings have no bias attribute; bias-free Linear layers have bias=None.
            if getattr(module, "bias", None) is not None:
                nn.init.zeros_(module.bias)

    def forward(self, input_ids, labels=None):
        _, T = input_ids.shape
        positions = torch.arange(0, T, device=input_ids.device).unsqueeze(0)
        hidden = self.token_emb(input_ids) + self.pos_emb(positions)
        for block in self.blocks:
            hidden = block(hidden)
        logits = self.lm_head(self.ln_f(hidden))

        if labels is None:
            return logits, None

        # Position t predicts token t + 1: drop the last logit and the first label.
        vocab = self.config.vocab_size
        shifted_logits = logits[:, :-1, :].contiguous().view(-1, vocab)
        shifted_labels = labels[:, 1:].contiguous().view(-1)
        loss = F.cross_entropy(shifted_logits, shifted_labels, ignore_index=-100)
        return logits, loss

    def count_parameters(self) -> int:
        """Return the number of trainable parameter elements."""
        total = 0
        for param in self.parameters():
            if param.requires_grad:
                total += param.numel()
        return total

print("Model architecture defined.")

## 3. Data Loading

In [None]:
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from transformers import GPT2Tokenizer

class TinyStoriesDataset(Dataset):
    """TinyStories split, tokenized per item with the GPT-2 tokenizer.

    Each item is a dict with ``input_ids`` (padded or truncated to
    ``max_seq_len``) and ``labels`` (a copy of ``input_ids`` with padded
    positions set to -100 so they are excluded from the loss).

    Args:
        split: Dataset split name passed to ``load_dataset``.
        max_seq_len: Fixed length every example is padded or truncated to.
        max_samples: If truthy, keep only the first ``max_samples`` examples.
    """

    def __init__(self, split="train", max_seq_len=512, max_samples=None):
        self.max_seq_len = max_seq_len
        self.tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        # GPT-2 has no pad token of its own; reuse EOS for padding.
        self.tokenizer.pad_token = self.tokenizer.eos_token
        print(f"Loading TinyStories {split}...")
        self.dataset = load_dataset("roneneldan/TinyStories", split=split)
        if max_samples:
            self.dataset = self.dataset.select(range(min(max_samples, len(self.dataset))))
        print(f"Loaded {len(self.dataset)} samples")

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        tokens = self.tokenizer(self.dataset[idx]["text"], max_length=self.max_seq_len,
                               truncation=True, padding="max_length", return_tensors="pt")
        input_ids = tokens["input_ids"].squeeze(0)
        labels = input_ids.clone()
        # Padding shares its id with EOS, so it cannot be recognised by token
        # id; the attention mask is what marks padded positions. Labelling them
        # -100 matches the ignore_index the model's loss uses, so padding no
        # longer contributes to (and dilutes) the training and validation loss.
        labels[tokens["attention_mask"].squeeze(0) == 0] = -100
        return {"input_ids": input_ids, "labels": labels}

def create_dataloaders(batch_size=32, max_seq_len=512, max_train_samples=None, max_val_samples=10000):
    """Build the TinyStories train and validation DataLoaders.

    The train loader shuffles and drops the last incomplete batch; the
    validation loader keeps dataset order. Both use 2 workers and pinned memory.

    Returns:
        ``(train_loader, val_loader)``.
    """
    shared = dict(batch_size=batch_size, num_workers=2, pin_memory=True)
    train_ds = TinyStoriesDataset("train", max_seq_len, max_train_samples)
    val_ds = TinyStoriesDataset("validation", max_seq_len, max_val_samples)
    train_loader = DataLoader(train_ds, shuffle=True, drop_last=True, **shared)
    val_loader = DataLoader(val_ds, shuffle=False, **shared)
    return train_loader, val_loader

print("Data loading functions defined.")

## 4. Training Loop

In [None]:
import time
import json
import csv
from pathlib import Path
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.amp import autocast, GradScaler
from tqdm.auto import tqdm

def _build_optimizer(model, lr):
    """Create AdamW with weight decay 0.1 on every trainable tensor except those
    whose name contains "emb", "ln" or "bias" (which get no decay)."""
    decay, no_decay = [], []
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        exempt = "emb" in name or "ln" in name or "bias" in name
        (no_decay if exempt else decay).append(param)
    groups = [
        {"params": decay, "weight_decay": 0.1},
        {"params": no_decay, "weight_decay": 0.0},
    ]
    return AdamW(groups, lr=lr, betas=(0.9, 0.95))


def _evaluate(model, val_loader, device, amp_enabled, max_batches):
    """Return the mean per-batch loss over at most ``max_batches`` validation
    batches, leaving the model in training mode afterwards."""
    model.eval()
    total_loss = 0.0
    n_batches = 0
    with torch.no_grad():
        for vb in val_loader:
            vi = vb["input_ids"].to(device)
            vl = vb["labels"].to(device)
            with autocast('cuda', enabled=amp_enabled):
                _, vloss = model(vi, labels=vl)
            total_loss += vloss.item()
            n_batches += 1
            if n_batches >= max_batches:
                break
    model.train()
    if n_batches == 0:
        raise ValueError("val_loader produced no batches; cannot compute validation loss")
    return total_loss / n_batches


def train_model(model, config, train_loader, val_loader, total_tokens, run_name,
                batch_size=32, grad_accum=1, lr=3e-4, use_amp=False, device="cuda",
                warmup_steps=100, eval_interval=500, eval_sequences=1600):
    """Train ``model`` for a fixed token budget and write its logs to ``logs/<run_name>/``.

    Args:
        model: Module whose ``forward(input_ids, labels=...)`` returns ``(logits, loss)``
            and which provides ``count_parameters()``.
        config: Object exposing ``max_seq_len`` and ``activation``.
        train_loader: Iterable of batches with "input_ids" and "labels"; restarted
            whenever it is exhausted.
        val_loader: Validation batches with the same keys.
        total_tokens: Token budget; the step count is
            ``total_tokens // (batch_size * grad_accum * config.max_seq_len)``.
        run_name: Name used for the progress bar and the output directory.
        batch_size: Sequences per micro-batch (used for step accounting).
        grad_accum: Micro-batches accumulated per optimizer step.
        lr: Peak learning rate.
        use_amp: Enable autocast and gradient scaling; only takes effect on a CUDA device.
        device: Device the model and batches are moved to.
        warmup_steps: Optimizer steps of linear warmup before cosine decay begins.
        eval_interval: Run validation every this many steps (and on the final step).
        eval_sequences: Approximate number of validation sequences per evaluation.
            The batch cap is derived from the validation batch size so runs with
            different batch sizes are scored on the same amount of data.

    Returns:
        dict with run_name, params, tokens, flops, final_val_loss, best_val_loss,
        training_time (seconds) and activation.

    Raises:
        ValueError: If the token budget is smaller than one optimizer step, or the
            validation loader yields no batches.
    """
    model = model.to(device)

    # "cuda:0" and torch.device("cuda") must behave like "cuda": autocast and the
    # gradient scaler are switched on together or not at all.
    amp_enabled = bool(use_amp) and str(device).startswith("cuda")

    optimizer = _build_optimizer(model, lr)

    tokens_per_step = batch_size * grad_accum * config.max_seq_len
    total_steps = total_tokens // tokens_per_step
    if total_steps < 1:
        raise ValueError(
            f"total_tokens={total_tokens} is less than one optimizer step ({tokens_per_step} tokens)"
        )
    scheduler = CosineAnnealingLR(optimizer, T_max=total_steps, eta_min=lr * 0.1)
    scaler = GradScaler('cuda') if amp_enabled else None

    # Cap evaluation by sequences rather than batches: 1600 sequences equals the
    # previous 50-batch cap at batch size 32, and stays the same data at 16/8/4.
    val_batch_size = getattr(val_loader, "batch_size", None) or batch_size
    max_eval_batches = max(1, -(-eval_sequences // val_batch_size))

    # Logging
    output_dir = Path(f"logs/{run_name}")
    output_dir.mkdir(parents=True, exist_ok=True)
    log_data = []

    print(f"\nTraining {run_name}: {total_steps} steps, {total_tokens:,} tokens")

    model.train()
    train_iter = iter(train_loader)
    tokens_processed = 0
    best_val_loss = float("inf")
    start_time = time.time()

    pbar = tqdm(range(total_steps), desc=run_name)
    for step in pbar:
        # Linear warmup. The rate is set BEFORE the update so that step 0 runs
        # at lr / warmup_steps instead of the full peak rate.
        if step < warmup_steps:
            warm_lr = lr * (step + 1) / warmup_steps
            for pg in optimizer.param_groups:
                pg["lr"] = warm_lr

        acc_loss = 0.0
        for _ in range(grad_accum):
            try:
                batch = next(train_iter)
            except StopIteration:
                # Start another pass over the data.
                train_iter = iter(train_loader)
                batch = next(train_iter)

            input_ids = batch["input_ids"].to(device)
            labels = batch["labels"].to(device)
            # Counts every position in the batch, including any padded ones.
            tokens_processed += input_ids.numel()

            with autocast('cuda', enabled=amp_enabled):
                _, loss = model(input_ids, labels=labels)
                loss = loss / grad_accum

            if scaler:
                scaler.scale(loss).backward()
            else:
                loss.backward()
            # loss is already divided by grad_accum, so the sum over the
            # micro-batches is the mean loss of this optimizer step.
            acc_loss += loss.item()

        if scaler:
            # Gradients must be unscaled before clipping by norm.
            scaler.unscale_(optimizer)
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        if scaler:
            scaler.step(optimizer)
            scaler.update()
        else:
            optimizer.step()
        optimizer.zero_grad(set_to_none=True)

        # Cosine decay takes over once warmup is finished.
        if step >= warmup_steps:
            scheduler.step()

        if (step + 1) % eval_interval == 0 or step == total_steps - 1:
            val_loss = _evaluate(model, val_loader, device, amp_enabled, max_eval_batches)
            best_val_loss = min(best_val_loss, val_loss)
            pbar.set_postfix({"loss": f"{acc_loss:.3f}", "val": f"{val_loss:.3f}"})
            log_data.append({"step": step + 1, "train_loss": acc_loss, "val_loss": val_loss, "tokens": tokens_processed})

    elapsed = time.time() - start_time
    params = model.count_parameters()
    flops = compute_flops(params, tokens_processed)

    # val_loss is always bound here: total_steps >= 1 and the last step evaluates.
    results = {
        "run_name": run_name,
        "params": params,
        "tokens": tokens_processed,
        "flops": flops,
        "final_val_loss": val_loss,
        "best_val_loss": best_val_loss,
        "training_time": elapsed,
        "activation": config.activation,
    }

    with open(output_dir / "results.json", "w") as f:
        json.dump(results, f, indent=2)
    with open(output_dir / "log.json", "w") as f:
        json.dump(log_data, f, indent=2)

    print(f"  Done! Val loss: {val_loss:.4f}, Time: {elapsed/60:.1f}min")
    return results

print("Training function defined.")

## 5. Run Experiments

In [None]:
# Store all results: run_name -> results dict returned by train_model
all_results = {}
# Train on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

print(f"Running on: {device}")
print(f"Total experiments: {len(ACTIVATIONS) * len(ALL_PARAMS)}")
print("="*60)

In [None]:
# Run all experiments: one model per (activation, target size) pair.
# The combined summary is rewritten after every run so that an interrupted
# session (e.g. a Colab disconnect) keeps the runs that already finished.
results_path = Path("logs") / "all_results.json"
results_path.parent.mkdir(parents=True, exist_ok=True)

for activation in ACTIVATIONS:
    for target_params in ALL_PARAMS:
        run_name = f"{activation}_{target_params // 1_000_000}M"
        print(f"\n{'='*60}")
        print(f"EXPERIMENT: {run_name}")
        print(f"{'='*60}")

        # Build config; hyperparameters and token budget follow the analytic size
        config = build_config(target_params, activation=activation)
        actual_params = config.count_parameters()
        train_hparams = get_training_config(actual_params)
        total_tokens = compute_optimal_tokens(actual_params)

        print(f"Params: {actual_params:,} | Tokens: {total_tokens:,}")
        print(f"Batch: {train_hparams['batch_size']} | Grad accum: {train_hparams['gradient_accumulation_steps']} | AMP: {train_hparams['use_amp']}")

        # Build model
        model = Transformer(config)

        # Load data (loaders are rebuilt per run because the batch size varies)
        train_loader, val_loader = create_dataloaders(
            batch_size=train_hparams["batch_size"],
            max_seq_len=config.max_seq_len,
        )

        # Train
        results = train_model(
            model=model,
            config=config,
            train_loader=train_loader,
            val_loader=val_loader,
            total_tokens=total_tokens,
            run_name=run_name,
            batch_size=train_hparams["batch_size"],
            grad_accum=train_hparams["gradient_accumulation_steps"],
            lr=train_hparams["learning_rate"],
            use_amp=train_hparams["use_amp"],
            device=device,
        )

        all_results[run_name] = results

        # Persist what has finished so far
        with open(results_path, "w") as f:
            json.dump(all_results, f, indent=2)

        # Free memory
        del model
        torch.cuda.empty_cache()

# Save all results (also covers the case where no run was executed)
with open(results_path, "w") as f:
    json.dump(all_results, f, indent=2)

print("\n" + "="*60)
print("ALL EXPERIMENTS COMPLETE")
print("="*60)

## 6. Analysis: Fit Power Laws

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit

# Convert results to DataFrame: one row per run; val_loss is the run's final
# validation loss (not the best one seen during training)
df = pd.DataFrame([
    {
        "run_name": name,
        "activation": data["activation"],
        "params": data["params"],
        "tokens": data["tokens"],
        "flops": data["flops"],
        "val_loss": data["final_val_loss"],
    }
    for name, data in all_results.items()
])

# Mark holdout: runs whose measured parameter count exceeds 50M are kept out of
# the power-law fit below
df["is_holdout"] = df["params"] > 50_000_000

# display is provided by the notebook environment
display(df)

In [None]:
def fit_power_law(flops, losses):
    """Fit L(C) = a * C^(-b) by a degree-1 least-squares fit of log L on log C.

    Returns:
        ``(a, b, r2)`` where ``r2`` is the coefficient of determination in log
        space, or 0 when the log-losses have no variance.
    """
    log_C, log_L = np.log(flops), np.log(losses)
    slope, intercept = np.polyfit(log_C, log_L, 1)

    residuals = log_L - (slope * log_C + intercept)
    ss_res = np.sum(residuals ** 2)
    ss_tot = np.sum((log_L - np.mean(log_L)) ** 2)
    if ss_tot > 0:
        r2 = 1 - (ss_res / ss_tot)
    else:
        r2 = 0

    # The slope is -b and the intercept is log(a).
    return np.exp(intercept), -slope, r2

def power_law(C, a, b):
    """Evaluate the scaling curve a * C^(-b) at compute ``C``."""
    decay = np.power(C, -b)
    return a * decay

# Fit for each activation (using anchors only)
# fits maps activation -> {"a", "b", "r2"}; an activation with fewer than two
# anchor runs gets no entry, and the later cells skip it.
fits = {}
for activation in ["gelu", "swiglu"]:
    anchor_data = df[(df["activation"] == activation) & (~df["is_holdout"])]
    if len(anchor_data) >= 2:
        a, b, r2 = fit_power_law(anchor_data["flops"].values, anchor_data["val_loss"].values)
        fits[activation] = {"a": a, "b": b, "r2": r2}
        print(f"{activation.upper()}: L(C) = {a:.4e} × C^(-{b:.4f})  [R² = {r2:.4f}]")

In [None]:
# Validate on holdout: compare the anchor-only fit's prediction with the loss
# actually measured on the first holdout run of each activation
print("\nHoldout Validation:")
print("-" * 50)

for activation in ["gelu", "swiglu"]:
    if activation not in fits:
        continue
    holdout = df[(df["activation"] == activation) & (df["is_holdout"])]
    if len(holdout) > 0:
        actual = holdout["val_loss"].values[0]
        predicted = power_law(holdout["flops"].values[0], fits[activation]["a"], fits[activation]["b"])
        # Relative error of the predicted loss, in percent
        error = abs(predicted - actual) / actual * 100
        print(f"{activation.upper()}: Predicted {predicted:.4f} | Actual {actual:.4f} | Error {error:.2f}%")

## 7. Scaling Plot (The Money Shot)

In [None]:
# Log-log plot of validation loss against training compute: one fitted line per
# activation, circles for the anchor runs and crosses for the holdout runs.
plt.figure(figsize=(12, 8))
colors = {"gelu": "#1f77b4", "swiglu": "#d62728"}
# Fixed compute range (1e14 to 1e18 FLOPs) over which the fitted curves are drawn
C_range = np.logspace(14, 18, 100)

for activation in ["gelu", "swiglu"]:
    if activation not in fits:
        continue
    a, b = fits[activation]["a"], fits[activation]["b"]
    
    # Fitted curve
    plt.loglog(C_range, power_law(C_range, a, b), "-", color=colors[activation],
               label=f"{activation.upper()}: L = {a:.2e} × C^(-{b:.3f})", linewidth=2, alpha=0.7)
    
    # Anchor points
    anchors = df[(df["activation"] == activation) & (~df["is_holdout"])]
    plt.loglog(anchors["flops"], anchors["val_loss"], "o", color=colors[activation],
               markersize=10, markeredgecolor="white", markeredgewidth=2)
    
    # Holdout
    holdout = df[(df["activation"] == activation) & (df["is_holdout"])]
    if len(holdout) > 0:
        plt.loglog(holdout["flops"], holdout["val_loss"], "X", color=colors[activation],
                   markersize=15, markeredgewidth=3, label=f"{activation.upper()} holdout (85M)")

plt.xlabel("Compute (FLOPs)", fontsize=14)
plt.ylabel("Validation Loss", fontsize=14)
plt.title("Scaling Laws: SwiGLU vs GeLU Activation", fontsize=16, fontweight="bold")
plt.legend(loc="upper right", fontsize=11)
plt.grid(True, alpha=0.3)

# Add findings box (only when both activations were fitted)
if "gelu" in fits and "swiglu" in fits:
    text = f"Key Findings:\n• GeLU exponent: {fits['gelu']['b']:.4f}\n• SwiGLU exponent: {fits['swiglu']['b']:.4f}\n• Difference: {abs(fits['swiglu']['b'] - fits['gelu']['b']):.4f}"
    plt.text(0.02, 0.02, text, transform=plt.gca().transAxes, fontsize=11,
             verticalalignment="bottom", bbox=dict(boxstyle="round", facecolor="wheat", alpha=0.8))

plt.tight_layout()
# Save both a raster and a vector copy before showing the figure
plt.savefig("scaling_plot.png", dpi=150, bbox_inches="tight")
plt.savefig("scaling_plot.pdf", bbox_inches="tight")
plt.show()

print("\nPlot saved to scaling_plot.png and scaling_plot.pdf")

## 8. Conclusion

In [None]:
# Summarise the two fits and state which reading of the research question they
# support. The wording follows the sign of the coefficient change, so a larger
# SwiGLU coefficient is never reported as an improvement.
print("=" * 70)
print("CONCLUSION")
print("=" * 70)

if "gelu" in fits and "swiglu" in fits:
    gelu_b = fits["gelu"]["b"]
    swiglu_b = fits["swiglu"]["b"]
    # Relative exponent difference in percent; abs() on the denominator keeps
    # the value non-negative even if the GeLU fit has a negative exponent.
    exp_diff = abs(swiglu_b - gelu_b) / abs(gelu_b) * 100

    coeff_ratio = fits["swiglu"]["a"] / fits["gelu"]["a"]
    # Positive: SwiGLU's coefficient is smaller than GeLU's; negative: larger.
    coeff_change = (1 - coeff_ratio) * 100
    coeff_lower = coeff_change > 0
    coeff_direction = "lower" if coeff_lower else "higher"

    print(f"""
Hypothesis: Does SwiGLU shift the scaling exponent, or merely provide
a constant offset in compute efficiency compared to GeLU?

RESULTS:
• GeLU scaling exponent (b):   {gelu_b:.4f}
• SwiGLU scaling exponent (b): {swiglu_b:.4f}
• Exponent difference:         {exp_diff:.1f}%

• GeLU coefficient (a):        {fits['gelu']['a']:.4e}
• SwiGLU coefficient (a):      {fits['swiglu']['a']:.4e}
• Coefficient change:          {abs(coeff_change):.1f}% {coeff_direction} for SwiGLU
""")

    if exp_diff < 5 and coeff_lower:
        print(f"""CONCLUSION:
SwiGLU improves the scaling coefficient (a) by {abs(coeff_change):.1f}% but leaves
the exponent (b) essentially unchanged ({exp_diff:.1f}% difference).

This suggests SwiGLU provides a CONSTANT COMPUTE MULTIPLIER ADVANTAGE
rather than fundamentally altering how well the model scales.""")
    elif exp_diff < 5:
        print(f"""CONCLUSION:
SwiGLU's scaling coefficient (a) is {abs(coeff_change):.1f}% higher than GeLU's and
the exponent (b) is essentially unchanged ({exp_diff:.1f}% difference).

This suggests SwiGLU provides NO CONSTANT COMPUTE MULTIPLIER ADVANTAGE
in this sweep and does not alter how well the model scales.""")
    else:
        print(f"""CONCLUSION:
SwiGLU shows a {exp_diff:.1f}% difference in scaling exponent compared to GeLU.
This suggests SwiGLU may FUNDAMENTALLY ALTER the scaling behavior.""")
else:
    print("Insufficient data to draw conclusions.")

In [None]:
# Download results (Colab only)
# Outside Colab the google.colab import fails, and the file locations are
# printed instead.
try:
    from google.colab import files
    files.download("scaling_plot.png")
    files.download("logs/all_results.json")
except ImportError:
    print("Not running in Colab. Results saved to:")
    print("  - scaling_plot.png")
    print("  - logs/all_results.json")