# Scaling Law Analysis: SwiGLU vs GeLU

This notebook analyzes the results of our scaling experiments to:
1. Fit power laws (validation loss as a function of training compute) to the anchor runs
2. Compare scaling exponents between SwiGLU and GeLU
3. Validate predictions on holdout models

**Research question**: Does SwiGLU shift the scaling exponent $b$ (often written $\alpha$ in the literature), or does it merely provide a constant offset in compute efficiency compared to GeLU?

In [None]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
from pathlib import Path

# Style settings
# The seaborn-derived style sheets were renamed with a 'seaborn-v0_8-' prefix
# in Matplotlib 3.6. Try the new name first, then the pre-3.6 name, and fall
# back to the default style so this cell runs on any installed version.
for _style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid', 'default'):
    if _style_name == 'default' or _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
plt.rcParams['figure.figsize'] = (10, 6)
plt.rcParams['font.size'] = 12

## 1. Load Experiment Results

In [None]:
# Find the most recent sweep directory
logs_dir = Path('logs')

# The last entry after sorting is treated as the latest sweep; this assumes
# the sweep directory names sort chronologically -- TODO confirm with main.py.
sweep_dirs = sorted(logs_dir.glob('sweep_*'))
latest_sweep = sweep_dirs[-1] if sweep_dirs else None

if latest_sweep is None:
    print("No sweep results found. Run main.py first.")
else:
    print(f"Loading results from: {latest_sweep}")

In [None]:
# Load results
# Columns of the per-run table; passed explicitly so that an empty result set
# still yields a frame with the expected schema.
RESULT_COLUMNS = ['run_name', 'activation', 'params', 'tokens', 'flops', 'val_loss', 'status']


def get_size_category(params):
    """Map a parameter count to the nominal model-size label used in this notebook.

    Args:
        params: Number of model parameters.

    Returns:
        One of '3M', '10M', '30M' or '85M'; anything at or above 50e6
        parameters is labelled '85M'.
    """
    if params < 5e6:
        return '3M'
    elif params < 15e6:
        return '10M'
    elif params < 50e6:
        return '30M'
    else:
        return '85M'


if latest_sweep:
    with open(latest_sweep / 'all_results.json', encoding='utf-8') as f:
        results = json.load(f)
else:
    # No sweep on disk: continue with an empty result set so that `df` is
    # always defined and the later cells report "insufficient data" instead
    # of failing with a NameError on a fresh Run All.
    results = {}

# Convert to DataFrame
df = pd.DataFrame(
    [
        {
            'run_name': name,
            'activation': data.get('activation', name.split('_')[0]),
            'params': data.get('params', 0),
            'tokens': data.get('tokens', 0),
            'flops': data.get('flops', 0),
            'val_loss': data.get('final_val_loss', data.get('best_val_loss', np.nan)),
            'status': data.get('status', 'unknown'),
        }
        for name, data in results.items()
    ],
    columns=RESULT_COLUMNS,
)

# Filter successful runs
df = df[df['status'] == 'success'].copy()

# The .get() defaults above inject flops=0 / val_loss=NaN for runs that did
# not record them. Such rows cannot be used in a log-space fit (log(0) and
# log(NaN) are not finite), so drop them here and say so explicitly.
flops_numeric = pd.to_numeric(df['flops'], errors='coerce').astype(float)
loss_numeric = pd.to_numeric(df['val_loss'], errors='coerce').astype(float)
usable = (
    np.isfinite(flops_numeric) & (flops_numeric > 0)
    & np.isfinite(loss_numeric) & (loss_numeric > 0)
)
if not usable.all():
    dropped_runs = df.loc[~usable, 'run_name'].tolist()
    print(f"Dropping {len(dropped_runs)} successful run(s) with missing or "
          f"non-positive flops/val_loss: {dropped_runs}")
df = df[usable].copy()

# Add model size category
df['size'] = df['params'].apply(get_size_category)
# The largest size bucket is held out from fitting and used for validation
df['is_holdout'] = df['size'] == '85M'

display(df)

## 2. Power Law Fitting

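We fit the power law $L(C) = a \cdot C^{-b}$, where $L$ is the validation loss, $C$ is the training compute in FLOPs, $a$ is the coefficient, and $b$ is the scaling exponent.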

In log space: $\log(L) = \log(a) - b \cdot \log(C)$

In [None]:
def power_law(C, a, b):
    """Power law: L = a * C^(-b)

    Args:
        C: Compute value(s); a scalar or array-like of any numeric dtype.
        a: Coefficient.
        b: Scaling exponent.

    Returns:
        a * C**(-b), evaluated element-wise in floating point.
    """
    # Cast the base to float first: NumPy raises ValueError for an integer
    # array raised to a negative integer power (e.g. integer FLOP counts
    # with b == 1).
    return a * np.power(np.asarray(C, dtype=float), -b)

def _r_squared(observed, predicted):
    """Coefficient of determination of `predicted` against `observed`."""
    ss_res = np.sum((observed - predicted) ** 2)
    ss_tot = np.sum((observed - np.mean(observed)) ** 2)
    tiny = np.finfo(float).eps
    if ss_tot <= tiny:
        # Constant target: the usual ratio would divide by (numerically) zero.
        # Report a perfect score when the residual vanishes as well, else 0.
        return 1.0 if ss_res <= tiny else 0.0
    return 1 - (ss_res / ss_tot)


def fit_power_law(flops, losses, use_log_fit=True):
    """
    Fit a power law L = a * C^(-b) to the data.

    Args:
        flops: Array of compute values (FLOPs); must be finite and > 0
        losses: Array of validation losses; must be finite and > 0
        use_log_fit: If True, fit in log space for stability. If False,
            run a direct non-linear least-squares fit, started from the
            log-space solution.

    Returns:
        a: Coefficient
        b: Scaling exponent
        r_squared: Goodness of fit. Computed on log(L) when use_log_fit is
            True and on L otherwise, so the two modes are not directly
            comparable.

    Raises:
        ValueError: If the inputs are not 1-D arrays of equal length, contain
            fewer than two points, or contain non-finite or non-positive
            values (the logarithm would be undefined).
    """
    flops = np.asarray(flops, dtype=float)
    losses = np.asarray(losses, dtype=float)

    if flops.ndim != 1 or flops.shape != losses.shape:
        raise ValueError("flops and losses must be 1-D arrays of equal length")
    if flops.size < 2:
        raise ValueError("at least two data points are required to fit a power law")
    if not (np.all(np.isfinite(flops)) and np.all(np.isfinite(losses))
            and np.all(flops > 0) and np.all(losses > 0)):
        raise ValueError("flops and losses must be finite and strictly positive")

    # Linear fit in log space: log(L) = log(a) - b * log(C)
    log_C = np.log(flops)
    log_L = np.log(losses)
    slope, intercept = np.polyfit(log_C, log_L, 1)

    if use_log_fit:
        b = -slope
        a = np.exp(intercept)
        r_squared = _r_squared(log_L, slope * log_C + intercept)
    else:
        # Direct curve fit. The log-space solution is used as the starting
        # point; a fixed guess can be many orders of magnitude away from the
        # optimum for FLOP counts in the 1e14-1e18 range.
        popt, _ = curve_fit(
            power_law,
            flops,
            losses,
            p0=[np.exp(intercept), -slope],
            maxfev=10000,
        )
        a, b = popt
        r_squared = _r_squared(losses, power_law(flops, a, b))

    return a, b, r_squared

In [None]:
# Fit power laws for each activation (using anchor points only)
fits = {}

for activation in ('gelu', 'swiglu'):
    # Anchor points are this activation's runs that are not held out
    is_anchor = (df['activation'] == activation) & (~df['is_holdout'])
    anchor_data = df[is_anchor]

    # Skip activations with fewer than two anchor runs
    if len(anchor_data) < 2:
        continue

    flops = anchor_data['flops'].values
    losses = anchor_data['val_loss'].values
    a, b, r2 = fit_power_law(flops, losses)

    # Keep the fitted parameters together with the points they came from
    fits[activation] = dict(a=a, b=b, r_squared=r2, flops=flops, losses=losses)

    print(f"\n{activation.upper()} Power Law Fit:")
    print(f"  L(C) = {a:.4e} * C^(-{b:.4f})")
    print(f"  R² = {r2:.6f}")

## 3. Holdout Prediction and Validation

In [None]:
# Predict and validate holdout models
print("\nHoldout Validation:")
print("=" * 60)

predictions = {}

for activation in ('gelu', 'swiglu'):
    fit = fits.get(activation)
    if fit is None:
        # No fitted law for this activation, so nothing to extrapolate
        continue

    # Get holdout data
    is_holdout_run = (df['activation'] == activation) & (df['is_holdout'])
    holdout_data = df[is_holdout_run]
    if len(holdout_data) == 0:
        continue

    # Only the first holdout row of each activation is evaluated
    holdout_flops = holdout_data['flops'].values[0]
    actual_loss = holdout_data['val_loss'].values[0]

    # Extrapolate the fitted law to the holdout's compute budget
    predicted_loss = power_law(holdout_flops, fit['a'], fit['b'])

    # Absolute relative error, in percent of the measured loss
    error_pct = abs(predicted_loss - actual_loss) / actual_loss * 100

    predictions[activation] = dict(
        predicted=predicted_loss,
        actual=actual_loss,
        error_pct=error_pct,
    )

    print(f"\n{activation.upper()} (85M Holdout):")
    print(f"  Predicted loss: {predicted_loss:.4f}")
    print(f"  Actual loss:    {actual_loss:.4f}")
    print(f"  Error:          {error_pct:.2f}%")

## 4. Scaling Law Comparison

In [None]:
# Compare scaling laws
if 'gelu' in fits and 'swiglu' in fits:
    gelu_a, gelu_b = fits['gelu']['a'], fits['gelu']['b']
    swiglu_a, swiglu_b = fits['swiglu']['a'], fits['swiglu']['b']

    print("\nScaling Law Comparison:")
    print("=" * 60)
    print(f"\n{'Metric':<25} {'GeLU':>15} {'SwiGLU':>15} {'Difference':>15}")
    print("-" * 70)
    print(f"{'Coefficient (a)':<25} {gelu_a:>15.4e} {swiglu_a:>15.4e} {(swiglu_a/gelu_a - 1)*100:>14.1f}%")
    print(f"{'Exponent (b)':<25} {gelu_b:>15.4f} {swiglu_b:>15.4f} {(swiglu_b/gelu_b - 1)*100:>14.1f}%")

    # Compute efficiency multiplier (at what compute does SwiGLU match GeLU's loss?)
    # Setting a_g * C_g^(-b) = a_s * C_s^(-b) gives C_g / C_s = (a_g / a_s)^(1/b).
    # This is only exact when both activations share the exponent b; the GeLU
    # exponent is used here, so treat the number as an approximation otherwise.
    if gelu_b > 0:
        efficiency_ratio = (gelu_a / swiglu_a) ** (1/gelu_b)
        print(f"\n{'Compute efficiency ratio':<25} {efficiency_ratio:>15.2f}x")
        if efficiency_ratio >= 1:
            print(f"(SwiGLU achieves same loss with {efficiency_ratio:.2f}x less compute)")
        else:
            # A ratio below 1 means GeLU is the more compute-efficient one;
            # report the inverse so the sentence stays truthful.
            print(f"(SwiGLU needs {1/efficiency_ratio:.2f}x more compute to achieve same loss)")
    else:
        # With b <= 0 the fitted loss does not decrease with compute, so there
        # is no compute budget at which one curve "catches up" with the other.
        print("\nCompute efficiency ratio is undefined: GeLU exponent is not positive.")

## 5. Scaling Plot (The "Money Shot")

In [None]:
# Create the main scaling plot
fig, ax = plt.subplots(figsize=(12, 8))

colors = {'gelu': '#1f77b4', 'swiglu': '#d62728'}
markers = {'gelu': 'o', 'swiglu': 's'}

# Generate smooth curves for fitted laws
# NOTE: the range is fixed at 1e14-1e18 FLOPs and is not derived from the data
C_range = np.logspace(14, 18, 100)  # FLOPs range

for activation in ('gelu', 'swiglu'):
    fit = fits.get(activation)
    if fit is None:
        continue

    a, b = fit['a'], fit['b']
    color = colors[activation]
    same_activation = df['activation'] == activation

    # Fitted curve (drawn first so the legend order is curve, anchors, holdout)
    ax.loglog(C_range, power_law(C_range, a, b), '-', color=color,
              label=f'{activation.upper()}: L = {a:.2e} × C^(-{b:.3f})',
              linewidth=2, alpha=0.7)

    # Anchor points: the runs the curve was fitted to
    anchor_data = df[same_activation & (~df['is_holdout'])]
    ax.loglog(anchor_data['flops'], anchor_data['val_loss'],
              markers[activation], color=color,
              markersize=10, markeredgecolor='white', markeredgewidth=2,
              label=f'{activation.upper()} anchors')

    # Holdout runs get an X marker to set them apart from the anchors
    holdout_data = df[same_activation & (df['is_holdout'])]
    if len(holdout_data) > 0:
        ax.loglog(holdout_data['flops'], holdout_data['val_loss'],
                  'x', color=color, markersize=15,
                  markeredgewidth=3, label=f'{activation.upper()} holdout (85M)')

# Formatting
ax.set_xlabel('Compute (FLOPs)', fontsize=14)
ax.set_ylabel('Validation Loss', fontsize=14)
ax.set_title('Scaling Laws: SwiGLU vs GeLU Activation', fontsize=16, fontweight='bold')
ax.legend(loc='upper right', fontsize=11)
ax.grid(True, alpha=0.3)

# Add annotation box with key findings
if 'gelu' in fits and 'swiglu' in fits:
    gelu_exponent = fits['gelu']['b']
    swiglu_exponent = fits['swiglu']['b']
    textstr = "\n".join([
        "Key Findings:",
        f"• GeLU exponent: {gelu_exponent:.4f}",
        f"• SwiGLU exponent: {swiglu_exponent:.4f}",
        f"• Exponent diff: {abs(swiglu_exponent - gelu_exponent):.4f}",
    ])
    props = dict(boxstyle='round', facecolor='wheat', alpha=0.8)
    # Anchored to the lower-left corner in axes coordinates
    ax.text(0.02, 0.02, textstr, transform=ax.transAxes, fontsize=11,
            verticalalignment='bottom', bbox=props)

plt.tight_layout()
for output_path, save_options in (
    ('scaling_plot.png', {'dpi': 150, 'bbox_inches': 'tight'}),
    ('scaling_plot.pdf', {'bbox_inches': 'tight'}),
):
    plt.savefig(output_path, **save_options)
plt.show()

print("\nPlot saved to: scaling_plot.png and scaling_plot.pdf")

## 6. Conclusion

In [None]:
# Generate conclusion
print("\n" + "=" * 70)
print("CONCLUSION")
print("=" * 70)

if 'gelu' in fits and 'swiglu' in fits:
    gelu_b = fits['gelu']['b']
    swiglu_b = fits['swiglu']['b']
    # Relative to the magnitude of the GeLU exponent: dividing by a signed
    # value would make a negative exponent yield a negative percentage that
    # always passes the "< 5" check below.
    exponent_diff_pct = abs(swiglu_b - gelu_b) / abs(gelu_b) * 100

    coeff_ratio = fits['swiglu']['a'] / fits['gelu']['a']
    # Signed change: positive when SwiGLU's coefficient is lower than GeLU's,
    # negative when it is higher. The sign is kept so the report cannot claim
    # an improvement when the coefficient actually got worse.
    coeff_improvement = (1 - coeff_ratio) * 100
    swiglu_coeff_is_lower = coeff_improvement >= 0

    print(f"""
Hypothesis: Does SwiGLU shift the scaling exponent, or merely provide 
a constant offset in compute efficiency compared to GeLU?

RESULTS:
--------
• GeLU scaling exponent (b):   {gelu_b:.4f}
• SwiGLU scaling exponent (b): {swiglu_b:.4f}
• Exponent difference:         {exponent_diff_pct:.1f}%

• GeLU coefficient (a):        {fits['gelu']['a']:.4e}
• SwiGLU coefficient (a):      {fits['swiglu']['a']:.4e}
• Coefficient improvement:     {coeff_improvement:.1f}%

CONCLUSION:
-----------""")

    # Exponents within 5% of each other are treated as "the same" exponent
    if exponent_diff_pct < 5:
        if swiglu_coeff_is_lower:
            coeff_verb = "improves"
            multiplier_kind = "ADVANTAGE"
            closing_line = "SwiGLU simply achieves the same loss with less compute (better 'a')."
        else:
            coeff_verb = "worsens"
            multiplier_kind = "PENALTY"
            closing_line = "SwiGLU simply needs more compute to reach the same loss (worse 'a')."

        print(f"""
SwiGLU {coeff_verb} the scaling coefficient (a) by {abs(coeff_improvement):.1f}% but leaves 
the exponent (b) essentially unchanged ({exponent_diff_pct:.1f}% difference).

This suggests SwiGLU provides a CONSTANT COMPUTE MULTIPLIER {multiplier_kind} 
rather than fundamentally altering how well the model scales.

The scaling law remains: L(C) = a × C^(-b)
{closing_line}""")
    else:
        print(f"""
SwiGLU shows a {exponent_diff_pct:.1f}% difference in scaling exponent compared to GeLU.

This suggests SwiGLU may FUNDAMENTALLY ALTER the scaling behavior, 
not just provide a constant efficiency improvement.

Further investigation with more data points is recommended.""")

    # Prediction accuracy
    if predictions:
        print(f"\nPREDICTION ACCURACY:")
        for act, pred in predictions.items():
            print(f"• {act.upper()} 85M holdout: {pred['error_pct']:.2f}% error")
else:
    print("\nInsufficient data to draw conclusions. Run more experiments.")

## 7. Export Results

In [None]:
# Export results summary
# Only the scalar fit parameters are exported; the raw flops/losses arrays
# stored alongside them in `fits` are left out.
exported_fit_keys = ('a', 'b', 'r_squared')

summary = {
    'fits': {
        act: {key: fit[key] for key in exported_fit_keys}
        for act, fit in fits.items()
    },
    'predictions': predictions,
    'data': df.to_dict('records'),
}

with open('analysis_results.json', 'w') as f:
    # default=str stringifies any value the JSON encoder cannot handle natively
    json.dump(summary, f, indent=2, default=str)

print("Results exported to: analysis_results.json")