# TransformerOpt: LLM Kernel Benchmarks (Colab Version - V4)

This notebook benchmarks the **FIXED V4** attention kernel.

## What Was Fixed

- **V3 Bug**: Recomputed K/V for every key position (1000x slowdown!)
- **V3 Bug**: Only 1 of 8 warps contributed (wrong answers)
- **V4 Fix**: Pre-computed Q,K,V + proper multi-warp reduction

## Instructions

1. **Select GPU runtime**: Runtime > Change runtime type > T4 GPU
2. **Upload kernel files**: `attention_v4.cu`, `attention_v4_wrapper.py`, `utils.py` (Step 1 also checks for `ffn.cu` and `ffn_wrapper.py`) — or clone the repository in Step 0 instead
3. **Run all cells** sequentially

## Step 0: Clone Repository (OPTIONAL)

**Option A - Clone from GitHub** (recommended): run the code cell below.


**Option B - Upload files manually**:
- Click the folder icon (📁) on the left
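- Upload `utils.py`, `attention_v4.cu`, `attention_v4_wrapper.py` (and `ffn.cu`, `ffn_wrapper.py`, which Step 1 also checks for) to a `kernels/` folder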

In [None]:
# Option A: Clone from GitHub (run this cell)
import os

if not os.path.exists('kernels'):
    print("Cloning StyleForge repository...")
    !git clone https://github.com/olivialiau/StyleForge.git
    %cd StyleForge
    print("Repository cloned successfully!")
else:
    print("Repository already exists.")

## Step 1: Verify Kernel Files

Check that all required files are present.

In [None]:
# Locate the kernels/ directory relative to the current working directory,
# then report which of the required kernel source files are present.
import os
import sys

if os.path.exists('kernels'):
    # Current directory is the repo root.
    kernel_path = 'kernels'
    print("Working from repo root")
elif os.path.exists('../kernels'):
    # Current directory is one level below the repo root; make the root
    # importable so `kernels` can be imported as a package.
    kernel_path = '../kernels'
    sys.path.insert(0, os.path.abspath('..'))
    print("Working from llm_benchmarks/ subdir")
else:
    # Last resort: probe a few nearby directories and take the first hit.
    found_root = next(
        (
            root
            for root in ('.', '..', '../..')
            if os.path.exists(os.path.join(root, 'kernels'))
        ),
        None,
    )
    if found_root is None:
        # Nothing found; fall back to the default so the report below
        # lists every file as missing.
        kernel_path = 'kernels'
        print("Could not find kernels directory")
    else:
        kernel_path = os.path.join(found_root, 'kernels')
        sys.path.insert(0, os.path.abspath(found_root))
        print(f"Found kernels at {kernel_path}")

required_files = [
    f'{kernel_path}/{file_name}'
    for file_name in (
        'utils.py',
        'attention_v4.cu',
        'attention_v4_wrapper.py',
        'ffn.cu',
        'ffn_wrapper.py',
    )
]

print("\nChecking for required kernel files...")
print("-" * 50)

# Print one status line per file and remember which ones are absent.
missing_files = []
for file_path in required_files:
    if os.path.exists(file_path):
        print(f"  {file_path}: ✓ found")
    else:
        print(f"  {file_path}: ✗ NOT FOUND")
        missing_files.append(file_path)

all_exist = not missing_files

if all_exist:
    print("\n✓ All required kernel files found!")
else:
    print("\n⚠ WARNING: Some kernel files are missing.")
    print("  Make sure you cloned the repo or uploaded files.")

## Step 2: Check GPU Availability

In [None]:
# Print a short report about the CUDA device (if any) and choose `device`
# accordingly: 'cuda' when a GPU is visible to PyTorch, otherwise 'cpu'.
import torch

banner = "=" * 70
print(banner)
print("GPU Information")
print(banner)

if not torch.cuda.is_available():
    print("\n CUDA is NOT available!")
    print(" Please enable GPU: Runtime > Change runtime type > T4 GPU")
    device = torch.device('cpu')
else:
    print("\n CUDA is available!")
    print(f" GPU: {torch.cuda.get_device_name(0)}")
    print(f" CUDA Version: {torch.version.cuda}")
    print(f" PyTorch Version: {torch.__version__}")

    # Hardware details for device 0.
    props = torch.cuda.get_device_properties(0)
    total_memory_gib = props.total_memory / 1024**3
    hardware_lines = [
        f" Compute Capability: {props.major}.{props.minor}",
        f" Total Memory: {total_memory_gib:.2f} GB",
        f" Multi-Processors: {props.multi_processor_count}",
    ]
    print("\n".join(hardware_lines))

    device = torch.device('cuda')

## Step 3: Install Dependencies

In [None]:
print("Installing dependencies...")
!pip install torch ninja numpy -q

import torch
print(f"\nPyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA version: {torch.version.cuda}")

## Step 4: Compile V4 Attention Kernel

In [None]:
# Import the V4 attention wrapper and obtain its module. Sets
# `attention_available` so later cells can skip themselves on failure.
import sys
import os

# Make the `kernels` package importable from either supported working
# directory (repo root, or one level below it).
if os.path.exists('kernels'):
    sys.path.insert(0, '.')
elif os.path.exists('../kernels'):
    sys.path.insert(0, os.path.abspath('..'))

print("Compiling Attention V4 kernel (FIXED version)...")
try:
    from kernels.attention_v4_wrapper import get_attention_v4_module

    # Presumably this is where the extension is built/loaded; any failure
    # is reported below rather than aborting the notebook.
    module = get_attention_v4_module()
except Exception as err:
    print(f"✗ Error compiling attention kernel: {err}")
    import traceback
    traceback.print_exc()
    attention_available = False
else:
    print("✓ Attention V4 kernel compiled successfully!")
    attention_available = True

## Step 5: Quick Correctness Test

Test that V4 produces correct results before benchmarking.

In [None]:
# Compare the V4 kernel's output against a plain PyTorch attention
# computation on one small problem and report the maximum absolute error.
import torch
import torch.nn.functional as F

if attention_available:
    banner = "=" * 70
    print(banner)
    print("Correctness Test: V4 vs PyTorch")
    print(banner)

    # Problem size: batch, sequence length, embedding width, head count.
    B, S, E, H = 1, 128, 512, 4
    head_dim = E // H
    scale = 1.0 / (head_dim ** 0.5)

    # Random input and a random fused QKV projection weight.
    x = torch.randn(B, S, E, device='cuda')
    W_qkv = torch.randn(3 * E, E, device='cuda')

    # Project, then split into per-head Q, K, V of shape [B, H, S, head_dim].
    # NOTE(review): these are permuted views, not contiguous tensors —
    # confirm the V4 wrapper accepts non-contiguous inputs.
    qkv = torch.matmul(x, W_qkv.T)  # [B, S, 3*E]
    qkv = qkv.reshape(B, S, 3, H, head_dim).permute(2, 0, 3, 1, 4)
    Q_pt, K_pt, V_pt = qkv[0], qkv[1], qkv[2]

    # Reference: softmax(Q K^T * scale) V.
    scores_pt = torch.matmul(Q_pt, K_pt.transpose(-2, -1)) * scale
    attn_pt = F.softmax(scores_pt, dim=-1)
    output_pt = torch.matmul(attn_pt, V_pt)

    # Kernel under test, fed the same Q, K, V.
    with torch.no_grad():
        v4_module = get_attention_v4_module()
        output_v4 = v4_module.fused_attention_v4(Q_pt, K_pt, V_pt, scale)

    error = torch.max(torch.abs(output_v4 - output_pt)).item()
    print(f"\nMax error: {error:.2e}")

    # Written as `error < tol` so that a NaN error lands in the FAIL branch.
    if error < 1e-4:
        print("✓ CORRECTNESS: PASS")
    else:
        print("✗ CORRECTNESS: FAIL - error too large")
        v4_lo, v4_hi = output_v4.min().item(), output_v4.max().item()
        pt_lo, pt_hi = output_pt.min().item(), output_pt.max().item()
        print(f"  V4 range: [{v4_lo:.4f}, {v4_hi:.4f}]")
        print(f"  PyTorch range: [{pt_lo:.4f}, {pt_hi:.4f}]")
else:
    print("SKIP: V4 kernel not available")

## Step 6: Benchmark V4 vs PyTorch

Compare performance on realistic LLM configurations.

In [None]:
# Benchmark the V4 attention kernel against a plain PyTorch
# matmul + softmax + matmul implementation on several problem sizes.
# Produces `results`: one dict per configuration with the mean latencies
# (ms) and the PyTorch/V4 speedup ratio.
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np


def _time_cuda_ms(fn, warmup=10, iters=50):
    """Return a list of per-call GPU times in milliseconds for ``fn``.

    ``fn`` is called ``warmup`` times untimed, then ``iters`` times with a
    pair of CUDA events recorded around each call. Everything runs under
    ``torch.no_grad()``.
    """
    times_ms = []
    with torch.no_grad():
        for _ in range(warmup):
            fn()
        torch.cuda.synchronize()

        for _ in range(iters):
            start = torch.cuda.Event(enable_timing=True)
            end = torch.cuda.Event(enable_timing=True)
            start.record()
            fn()
            end.record()
            # Wait for the recorded work to finish before reading the timer.
            torch.cuda.synchronize()
            times_ms.append(start.elapsed_time(end))
    return times_ms


def _reference_attention(q, k, v, scale):
    """Compute softmax(q @ k^T * scale) @ v with stock PyTorch ops."""
    scores = (q @ k.transpose(-2, -1)) * scale
    return F.softmax(scores, dim=-1) @ v


if not attention_available:
    print("SKIP: V4 kernel not available")
else:
    print("="*70)
    print("Attention Kernel Benchmark (V4 FIXED)")
    print("="*70)

    # Test configurations
    configs = [
        {"name": "Small (B=1, S=128, E=512, H=4)", "B": 1, "S": 128, "E": 512, "H": 4},
        {"name": "Medium (B=1, S=256, E=512, H=4)", "B": 1, "S": 256, "E": 512, "H": 4},
        {"name": "Large (B=1, S=256, E=1024, H=8)", "B": 1, "S": 256, "E": 1024, "H": 8},
        {"name": "Llama-like (B=1, S=512, E=2048, H=16)", "B": 1, "S": 512, "E": 2048, "H": 16},
    ]

    # Look the kernel entry point up once so the module lookup is not part
    # of every timed call.
    fused_attention_v4 = get_attention_v4_module().fused_attention_v4

    results = []

    for cfg in configs:
        B, S, E, H = cfg["B"], cfg["S"], cfg["E"], cfg["H"]
        head_dim = E // H
        scale = 1.0 / (head_dim ** 0.5)

        print(f"\n{cfg['name']} (S={S}, E={E}, H={H})")
        print("-"*50)

        # Create input
        x = torch.randn(B, S, E, device='cuda')

        # Compute Q, K, V once; both implementations are timed on the same
        # tensors, so the projection itself is excluded from the timings.
        W = torch.randn(3*E, E, device='cuda')
        qkv = x @ W.T  # [B, S, 3*E]
        qkv = qkv.reshape(B, S, 3, H, head_dim).permute(2, 0, 3, 1, 4)
        Q_pt, K_pt, V_pt = qkv[0], qkv[1], qkv[2]

        # Benchmark V4
        v4_times = _time_cuda_ms(
            lambda: fused_attention_v4(Q_pt, K_pt, V_pt, scale)
        )
        v4_mean = np.mean(v4_times)
        v4_std = np.std(v4_times)
        print(f"  V4 Custom:  {v4_mean:.3f} ± {v4_std:.3f} ms")

        # Benchmark PyTorch (explicit matmul + softmax + matmul)
        pt_times = _time_cuda_ms(
            lambda: _reference_attention(Q_pt, K_pt, V_pt, scale)
        )
        pt_mean = np.mean(pt_times)
        pt_std = np.std(pt_times)
        print(f"  PyTorch:    {pt_mean:.3f} ± {pt_std:.3f} ms")

        speedup = pt_mean / v4_mean
        print(f"  Speedup:    {speedup:.2f}x")

        # Store plain floats so the results serialize cleanly later.
        results.append({
            "name": cfg['name'],
            "pytorch_ms": float(pt_mean),
            "v4_ms": float(v4_mean),
            "speedup": float(speedup)
        })

## Step 7: Results Summary

In [None]:
# Print the benchmark table collected in `results`, the average speedup,
# and a one-line verdict. Does nothing useful unless the benchmark cell ran.
if attention_available and 'results' in locals() and results:
    print("\n" + "="*70)
    print("FINAL RESULTS")
    print("="*70)

    print(f"\nGPU: {torch.cuda.get_device_name(0)}")
    print(f"CUDA: {torch.version.cuda}")
    print(f"PyTorch: {torch.__version__}")

    # Size the first column to the longest config name so the numeric
    # columns line up (the names are longer than a fixed 30 characters).
    name_width = max(30, max(len(r['name']) for r in results))
    header = f"{'Config':<{name_width}} {'PyTorch (ms)':<15} {'V4 (ms)':<12} {'Speedup':<10}"
    rule = "-" * max(70, len(header))

    print("\n" + rule)
    print(header)
    print(rule)

    for r in results:
        # Attach the "x" suffix before padding so it sits next to the number.
        speedup_text = f"{r['speedup']:.2f}x"
        print(f"{r['name']:<{name_width}} {r['pytorch_ms']:<15.3f} {r['v4_ms']:<12.3f} {speedup_text:<10}")

    avg_speedup = np.mean([r['speedup'] for r in results])
    print(f"\nAverage speedup: {avg_speedup:.2f}x")

    if avg_speedup >= 1.0:
        print("\n V4 is FASTER than PyTorch!")
    elif avg_speedup >= 0.5:
        print("\n V4 is competitive (within 2x)")
    else:
        print("\n V4 is still slower - needs more optimization")
else:
    print("No results to display")

## Step 8: Download Results

In [None]:
# Write the benchmark results plus environment details to a JSON file and,
# when running on Colab, offer the file as a browser download.
import json
from datetime import datetime

if attention_available and 'results' in locals() and results:
    results_data = {
        'timestamp': datetime.now().isoformat(),
        'gpu': torch.cuda.get_device_name(0),
        'cuda_version': torch.version.cuda,
        'pytorch_version': torch.__version__,
        'results': results
    }

    filename = 'benchmark_v4_results.json'
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(results_data, f, indent=2)

    print(f"Results saved to {filename}")

    # Download file (Colab only). The download is best-effort: the JSON file
    # is already on disk, so a failure here must not abort the cell. Only
    # ordinary exceptions are caught, so an interrupt still stops the cell.
    try:
        from google.colab import files
        files.download(filename)
        print("\nResults downloaded!")
    except ImportError:
        # Not running on Colab.
        print("\nFile saved locally.")
    except Exception as err:
        print(f"\nDownload failed ({err}); file saved locally.")