# TransformerOpt: LLM Kernel Benchmarks (Colab Version)

This notebook benchmarks custom CUDA kernels for LLM inference on Google Colab.

## Instructions

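1. **Upload your kernels** as a zip file (or as individual files) through the Colab file browser (the Files panel in the left sidebar); see Step 2 below
2. **Select GPU runtime**: Runtime > Change runtime type > T4 GPU. Do this *before* uploading: switching the runtime type starts a new session, which removes any files already uploaded
3. **Run all cells** sequentially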

## Step 1: Check GPU Availability

In [None]:
# Check if GPU is available
import torch
import os

# Print a short report about the CUDA runtime and choose the torch device
# (`device`) accordingly.
banner = "=" * 70
print(banner)
print("GPU Check")
print(banner)

if not torch.cuda.is_available():
    # No GPU: tell the user how to enable one and fall back to the CPU device.
    print("\n CUDA is NOT available!")
    print(" Please enable GPU: Runtime > Change runtime type > T4 GPU")
    device = torch.device('cpu')
else:
    for line in (
        "\n CUDA is available!",
        f" GPU: {torch.cuda.get_device_name(0)}",
        f" CUDA Version: {torch.version.cuda}",
        f" PyTorch Version: {torch.__version__}",
    ):
        print(line)

    # Get GPU properties
    props = torch.cuda.get_device_properties(0)
    total_memory_gib = props.total_memory / 1024**3
    for line in (
        f" Compute Capability: {props.major}.{props.minor}",
        f" Total Memory: {total_memory_gib:.2f} GB",
        f" Multi-Processors: {props.multi_processor_count}",
    ):
        print(line)

    device = torch.device('cuda')

## Step 2: Upload Kernels

**Option A: Upload a zip file** named `kernels.zip` containing your `kernels/` directory, then uncomment the `unzip` line in the file-check cell below

**Option B: Upload individual files** - upload these files one at a time:
- `kernels/utils.py`
- `kernels/attention_v3.cu`
- `kernels/attention_v3_wrapper.py`
- `kernels/ffn.cu`
- `kernels/ffn_wrapper.py`
- `kernels/__init__.py`

In [None]:
# Create kernels directory structure
!mkdir -p kernels

print("Please upload your kernel files using the file upload button below.")
print("\nUpload these files:")
print("  1. utils.py (from kernels/)")
print("  2. attention_v3.cu")
print("  3. attention_v3_wrapper.py")
print("  4. ffn.cu")
print("  5. ffn_wrapper.py")
print("  6. __init__.py")
print("\nOr upload a zip file and run the unzip cell below.")

In [None]:
# If you uploaded a zip file, uncomment and run this
# !unzip -o kernels.zip -d .

# Check if files exist
import os

# Paths (relative to the working directory) that this check looks for
required_files = [
    'kernels/utils.py',
    'kernels/attention_v3.cu',
    'kernels/attention_v3_wrapper.py',
    'kernels/ffn.cu',
    'kernels/ffn_wrapper.py',
]

print("Checking for required files...")
# Map each path to whether it is present, then report one line per path
presence = {path: os.path.exists(path) for path in required_files}
for path, is_present in presence.items():
    label = " found" if is_present else " NOT FOUND"
    print(f"  {path}: {label}")

# True only when every listed path exists
all_exist = all(presence.values())

if not all_exist:
    print("\n WARNING: Some files are missing. Please upload them.")
else:
    print("\n All required files found!")

## Step 3: Install Dependencies

In [None]:
# Install required packages
print("Installing dependencies...")
!pip install torch ninja numpy -q

# Verify PyTorch has CUDA
import torch
print(f"\nPyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA version: {torch.version.cuda}")

## Step 4: Compile Kernels

This will compile your CUDA kernels on first import.

In [None]:
# Import the attention wrapper and record in `attention_available` whether
# the import succeeded, so later cells can skip the custom kernel if not.
print("Compiling Attention V3 kernel...")
try:
    from kernels.attention_v3_wrapper import FusedAttentionV3
except Exception as compile_error:
    # Any failure (missing file, build error, ...) just disables the kernel
    attention_available = False
    print(f" Error compiling attention kernel: {compile_error}")
else:
    attention_available = True
    print(" Attention V3 kernel compiled successfully!")

In [None]:
# Import the FFN wrapper and record in `ffn_available` whether the import
# succeeded, so later cells can skip the custom kernel if not.
print("Compiling FFN kernel...")
try:
    from kernels.ffn_wrapper import FusedFFN
except Exception as compile_error:
    # Any failure (missing file, build error, ...) just disables the kernel
    ffn_available = False
    print(f" Error compiling FFN kernel: {compile_error}")
else:
    ffn_available = True
    print(" FFN kernel compiled successfully!")

## Step 5: Define Llama-2 Configuration

In [None]:
from dataclasses import dataclass

@dataclass
class Llama2Config:
    """Model dimensions for the benchmarks (defaults: Llama-2-7B).

    The benchmark cells in this notebook read `hidden_size`,
    `num_attention_heads` and `intermediate_size`; the remaining fields
    are not referenced by the cells shown here.
    """
    hidden_size: int = 4096
    num_hidden_layers: int = 32
    num_attention_heads: int = 32
    num_key_value_heads: int = 32
    intermediate_size: int = 11008
    max_position_embeddings: int = 4096
    vocab_size: int = 32000

    @property
    def head_dim(self) -> int:
        """Width of one attention head: hidden_size floor-divided by the head count."""
        heads = self.num_attention_heads
        return self.hidden_size // heads

# Default configuration instance read by the benchmark cells
config = Llama2Config()

rule = "=" * 70
print(rule)
print("Llama-2-7B Configuration")
print(rule)

# (label, value) rows; labels carry their own padding so the values line up
config_rows = (
    ("Hidden size:           ", config.hidden_size),
    ("Num layers:            ", config.num_hidden_layers),
    ("Num attention heads:   ", config.num_attention_heads),
    ("Head dimension:        ", config.head_dim),
    ("Intermediate size:     ", config.intermediate_size),
)
for label, value in config_rows:
    print(f"{label}{value}")

## Step 6: Benchmark Attention Kernel

In [None]:
import torch
import torch.nn as nn
import time
import numpy as np

print("="*70)
print("Attention Kernel Benchmark")
print("="*70)

# Configuration for Colab T4 (smaller sizes for faster testing)
seq_lengths = [256, 512, 1024, 2048]
batch_size = 1

# Untimed calls per module before measuring, and timed calls per module
warmup_iters = 10
timed_iters = 50

# One dict per sequence length: seq_len, pytorch_ms, custom_ms, speedup.
# custom_ms / speedup are None when the custom kernel is unavailable.
attention_results = []


def _time_attention_ms(fn, *args, iters, **kwargs):
    """Time repeated calls of `fn(*args, **kwargs)` with CUDA events.

    Args:
        fn: Callable that launches the GPU work to measure.
        *args, **kwargs: Forwarded unchanged to `fn` on every call.
        iters: Number of timed calls.

    Returns:
        List with one elapsed time in milliseconds per call.
    """
    elapsed_ms = []
    with torch.no_grad():
        for _ in range(iters):
            start = torch.cuda.Event(enable_timing=True)
            end = torch.cuda.Event(enable_timing=True)
            start.record()
            fn(*args, **kwargs)
            end.record()
            # Wait for the GPU so that both events have completed before reading them
            torch.cuda.synchronize()
            elapsed_ms.append(start.elapsed_time(end))
    return elapsed_ms


if not torch.cuda.is_available():
    # Without a GPU every .cuda() call below would raise; skip with a clear message.
    print("\n CUDA is NOT available - skipping the attention benchmark.")
    print(" Please enable GPU: Runtime > Change runtime type > T4 GPU")
else:
    for seq_len in seq_lengths:
        print(f"\nBenchmarking seq_len={seq_len}...")

        # Create custom attention (None when the kernel failed to import)
        custom_attn = None
        if attention_available:
            custom_attn = FusedAttentionV3(
                embed_dim=config.hidden_size,
                num_heads=config.num_attention_heads,
            ).cuda().eval()

        # Create PyTorch baseline
        pytorch_attn = nn.MultiheadAttention(
            embed_dim=config.hidden_size,
            num_heads=config.num_attention_heads,
            batch_first=True,
        ).cuda().eval()

        # Create input
        hidden_states = torch.randn(
            batch_size, seq_len, config.hidden_size,
            dtype=torch.float32, device='cuda'
        )

        # Self-attention call for the baseline. need_weights=False skips
        # returning the attention weights, which the PyTorch docs recommend
        # for best MultiheadAttention performance; with the default (True)
        # the baseline is slowed down and the speedup overstated.
        baseline_args = (hidden_states, hidden_states, hidden_states)
        baseline_kwargs = {'need_weights': False}

        # Warmup
        with torch.no_grad():
            for _ in range(warmup_iters):
                if custom_attn is not None:
                    custom_attn(hidden_states)
                pytorch_attn(*baseline_args, **baseline_kwargs)
        torch.cuda.synchronize()

        # Benchmark PyTorch
        pytorch_times = _time_attention_ms(
            pytorch_attn, *baseline_args, iters=timed_iters, **baseline_kwargs
        )
        # float() stores plain Python floats in the results
        pytorch_mean = float(np.mean(pytorch_times))
        pytorch_std = float(np.std(pytorch_times))
        print(f"  PyTorch: {pytorch_mean:.3f} ± {pytorch_std:.3f} ms")

        result = {
            'seq_len': seq_len,
            'pytorch_ms': pytorch_mean,
            'custom_ms': None,
            'speedup': None
        }

        # Benchmark Custom
        if custom_attn is not None:
            custom_times = _time_attention_ms(
                custom_attn, hidden_states, iters=timed_iters
            )
            custom_mean = float(np.mean(custom_times))
            custom_std = float(np.std(custom_times))
            speedup = pytorch_mean / custom_mean
            print(f"  Custom:  {custom_mean:.3f} ± {custom_std:.3f} ms")
            print(f"  Speedup: {speedup:.2f}x")
            result['custom_ms'] = custom_mean
            result['speedup'] = speedup

        attention_results.append(result)

## Step 7: Benchmark FFN Kernel

In [None]:
import torch.nn.functional as F

print("="*70)
print("FFN Kernel Benchmark")
print("="*70)

# Untimed calls per module before measuring, and timed calls per module
warmup_iters = 10
timed_iters = 50

# One dict per sequence length: seq_len, pytorch_ms, custom_ms, speedup,
# memory_saved_mb. The last three are None when the custom kernel is unavailable.
ffn_results = []


def _time_ffn_ms(fn, *args, iters):
    """Time repeated calls of `fn(*args)` with CUDA events.

    Args:
        fn: Callable that launches the GPU work to measure.
        *args: Forwarded unchanged to `fn` on every call.
        iters: Number of timed calls.

    Returns:
        List with one elapsed time in milliseconds per call.
    """
    elapsed_ms = []
    with torch.no_grad():
        for _ in range(iters):
            start = torch.cuda.Event(enable_timing=True)
            end = torch.cuda.Event(enable_timing=True)
            start.record()
            fn(*args)
            end.record()
            # Wait for the GPU so that both events have completed before reading them
            torch.cuda.synchronize()
            elapsed_ms.append(start.elapsed_time(end))
    return elapsed_ms


if not torch.cuda.is_available():
    # Without a GPU every .cuda() call below would raise; skip with a clear message.
    print("\n CUDA is NOT available - skipping the FFN benchmark.")
    print(" Please enable GPU: Runtime > Change runtime type > T4 GPU")
else:
    for seq_len in seq_lengths:
        print(f"\nBenchmarking FFN seq_len={seq_len}...")

        # Create custom FFN (None when the kernel failed to import)
        custom_ffn = None
        if ffn_available:
            custom_ffn = FusedFFN(
                embed_dim=config.hidden_size,
                ffn_dim=config.intermediate_size,
            ).cuda().eval()

        # Create PyTorch baseline
        # NOTE(review): this baseline is Linear -> GELU -> Linear; confirm that
        # FusedFFN computes the same function so the comparison is like-for-like.
        pytorch_ffn = nn.Sequential(
            nn.Linear(config.hidden_size, config.intermediate_size),
            nn.GELU(),
            nn.Linear(config.intermediate_size, config.hidden_size)
        ).cuda().eval()

        # Create input
        hidden_states = torch.randn(
            batch_size, seq_len, config.hidden_size,
            dtype=torch.float32, device='cuda'
        )

        # Warmup
        with torch.no_grad():
            for _ in range(warmup_iters):
                if custom_ffn is not None:
                    custom_ffn(hidden_states)
                pytorch_ffn(hidden_states)
        torch.cuda.synchronize()

        # Benchmark PyTorch
        pytorch_times = _time_ffn_ms(pytorch_ffn, hidden_states, iters=timed_iters)
        # float() stores plain Python floats in the results
        pytorch_mean = float(np.mean(pytorch_times))
        pytorch_std = float(np.std(pytorch_times))
        print(f"  PyTorch: {pytorch_mean:.3f} ± {pytorch_std:.3f} ms")

        # Same keys in every row, whether or not the custom kernel ran
        result = {
            'seq_len': seq_len,
            'pytorch_ms': pytorch_mean,
            'custom_ms': None,
            'speedup': None,
            'memory_saved_mb': None
        }

        # Benchmark Custom
        if custom_ffn is not None:
            custom_times = _time_ffn_ms(custom_ffn, hidden_states, iters=timed_iters)
            custom_mean = float(np.mean(custom_times))
            custom_std = float(np.std(custom_times))
            speedup = pytorch_mean / custom_mean
            print(f"  Custom:  {custom_mean:.3f} ± {custom_std:.3f} ms")
            print(f"  Speedup: {speedup:.2f}x")

            # Size in MiB of one (batch, seq_len, intermediate_size) tensor in
            # the input dtype. Presumably this is the intermediate activation
            # the fused kernel avoids materializing - TODO confirm.
            memory_saved = (
                batch_size * seq_len * config.intermediate_size
                * hidden_states.element_size()
            ) / (1024**2)
            print(f"  Memory saved: {memory_saved:.1f} MB")

            result['custom_ms'] = custom_mean
            result['speedup'] = speedup
            result['memory_saved_mb'] = memory_saved

        ffn_results.append(result)

## Step 8: Summary Results

In [None]:
def _fmt_or_na(value, spec, suffix=""):
    """Format `value` with format spec `spec` plus `suffix`; 'N/A' when it is None.

    Uses an explicit None check so that a legitimate 0.0 is still printed.
    """
    if value is None:
        return 'N/A'
    return f"{value:{spec}}{suffix}"


print("\n" + "="*70)
print("BENCHMARK SUMMARY")
print("="*70)

# get_device_name() raises without a CUDA device, so only query it when present
gpu_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'N/A'
print(f"\nGPU: {gpu_name}")
print(f"CUDA: {torch.version.cuda}")
print(f"PyTorch: {torch.__version__}")

print("\n" + "-"*70)
print("ATTENTION KERNEL RESULTS")
print("-"*70)
print(f"{'Seq Len':<10} {'PyTorch (ms)':<15} {'Custom (ms)':<15} {'Speedup':<10}")
print("-"*60)

for r in attention_results:
    seq = r['seq_len']
    pt = r['pytorch_ms']
    # Same 3-decimal formatting as the PyTorch column
    cust = _fmt_or_na(r['custom_ms'], '.3f')
    sp = _fmt_or_na(r['speedup'], '.2f', 'x')
    print(f"{seq:<10} {pt:<15.3f} {cust:<15} {sp:<10}")

print("\n" + "-"*70)
print("FFN KERNEL RESULTS")
print("-"*70)
print(f"{'Seq Len':<10} {'PyTorch (ms)':<15} {'Custom (ms)':<15} {'Speedup':<10} {'Mem Saved':<12}")
print("-"*70)

for r in ffn_results:
    seq = r['seq_len']
    pt = r['pytorch_ms']
    cust = _fmt_or_na(r['custom_ms'], '.3f')
    sp = _fmt_or_na(r['speedup'], '.2f', 'x')
    # .get(): rows recorded without the custom kernel may lack this key
    mem = _fmt_or_na(r.get('memory_saved_mb'), '.1f', ' MB')
    print(f"{seq:<10} {pt:<15.3f} {cust:<15} {sp:<10} {mem:<12}")

## Step 9: Download Results

Run this cell to save and download your benchmark results.

In [None]:
import json
from datetime import datetime

# Collect all results
results = {
    'timestamp': datetime.now().isoformat(),
    # get_device_name() raises without a CUDA device; record null instead
    'gpu': torch.cuda.get_device_name(0) if torch.cuda.is_available() else None,
    'cuda_version': torch.version.cuda,
    'pytorch_version': torch.__version__,
    'config': {
        'hidden_size': config.hidden_size,
        'num_heads': config.num_attention_heads,
        'intermediate_size': config.intermediate_size,
    },
    'attention_results': attention_results,
    'ffn_results': ffn_results
}

# Save to file
filename = 'benchmark_results.json'
with open(filename, 'w') as f:
    json.dump(results, f, indent=2)

print(f"Results saved to {filename}")

# Download file (the google.colab module only exists on Colab)
try:
    from google.colab import files
except ImportError:
    # The JSON file has already been written; just point the user at it.
    print(f"\nNot running on Colab - skipping download; results are in {filename}.")
else:
    files.download(filename)
    print("\nResults downloaded! You can now update your README with these numbers.")