## Setup

In [5]:
import sys
import os
from pathlib import Path
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch

# Set up paths. This assumes the kernel's working directory is the project's
# `experiments/` folder, so the project root is taken to be one level up.
notebook_dir = Path.cwd()
project_root = notebook_dir.parent

# Put `src/` and `config/` at the front of sys.path so the project modules can
# be imported. An entry is inserted only when it is missing, which keeps this
# cell idempotent: re-running it does not stack duplicate paths. `config/` is
# inserted last and therefore ends up ahead of `src/`.
for _subdir in ('src', 'config'):
    _import_dir = project_root / _subdir
    if not _import_dir.is_dir():
        # Usually means the kernel was not started from `experiments/`.
        print(f"WARNING: expected import directory not found: {_import_dir}")
    if str(_import_dir) not in sys.path:
        sys.path.insert(0, str(_import_dir))

print(f"Project root: {project_root}")
print(f"Notebook directory: {notebook_dir}")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

Project root: /Users/oktayozel/Desktop/boston university/fall2025/cs599/randomized_svd_implementation_for_large_language_models/CS599-Randomized-SVD
Notebook directory: /Users/oktayozel/Desktop/boston university/fall2025/cs599/randomized_svd_implementation_for_large_language_models/CS599-Randomized-SVD/experiments
PyTorch version: 2.9.1
CUDA available: False


In [None]:
# Import experiment modules
from mla_gpt.utils.experiment_runner import ExperimentRunner, ExperimentConfig, ExperimentResults
from mla_gpt.utils.metrics import (
    compute_perplexity,
    measure_inference_speed,
    extract_compression_metrics_from_model,
    compute_reconstruction_error,
)
from mla_gpt.model import GPT, GPTConfig
import experiments_config

# Plot appearance applied to the figures created in this notebook.
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 10,
})

print("✓ All modules imported successfully")

ImportError: cannot import name 'measure_training_speed' from 'mla_gpt.utils.metrics' (/Users/oktayozel/Desktop/boston university/fall2025/cs599/randomized_svd_implementation_for_large_language_models/CS599-Randomized-SVD/src/mla_gpt/utils/metrics.py)

## Configuration

### 1. Select Dataset(s)

Choose which dataset(s) to run experiments on:

In [None]:
# Datasets the selected experiments will be run on.
# Options: 'shakespeare_char', 'shakespeare', 'openwebtext'
# To run on several datasets, list them all, e.g.:
#   DATASETS = ['shakespeare_char', 'shakespeare']
DATASETS = [
    'shakespeare_char',
]

print(f"Selected datasets: {DATASETS}")

### 2. Select Metrics to Collect

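Choose which metrics you want to report. In this notebook these flags gate the analysis and plotting cells below and are recorded in the exported JSON; they are not passed to `ExperimentRunner`: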

In [None]:
# Metric switches. Each flag is a plain boolean keyed by metric name.
METRICS_CONFIG = dict(
    # Model quality metrics
    collect_perplexity=True,
    collect_loss=True,

    # Performance metrics
    collect_training_speed=True,
    collect_inference_speed=True,
    collect_time_per_iteration=True,

    # Memory metrics (only on CUDA)
    collect_memory_usage=True,

    # Compression metrics (only for SVD models)
    collect_compression_metrics=True,
    collect_reconstruction_error=True,
    collect_compression_time=True,

    # Training history (stores all values over time)
    save_training_history=True,
)

# Echo the switched-on metrics as human-readable titles.
enabled_metrics = [key for key, is_on in METRICS_CONFIG.items() if is_on]
print("Selected metrics:")
for key in enabled_metrics:
    print(f"  ✓ {key.replace('_', ' ').title()}")

### 3. Select Experiments to Run

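Choose from predefined suites or create custom configurations. `SUITE_NAME` is also used to name the exported CSV and JSON files, so update it if you switch to Option B or C: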

In [None]:
# List available experiment suites
# NOTE(review): presumably shows the suite names accepted by
# `experiments_config.get_suite` in the next cell — confirm in experiments_config.
experiments_config.list_suites()

In [None]:
# --- Option A: a predefined suite --------------------------------------------
# Suite names: 'quick', 'comparison', 'svd_comparison', 'mla', 'rank_ablation',
# 'comprehensive'. Change SUITE_NAME to pick another one.
SUITE_NAME = 'quick'
experiment_configs = experiments_config.get_suite(SUITE_NAME)

# --- Option B: hand-picked experiments ---------------------------------------
# experiment_configs = [
#     experiments_config.BASELINE_SMALL,
#     experiments_config.MLA_SMALL,
#     experiments_config.RSVD_V_RANK16,
# ]

# --- Option C: a custom configuration ----------------------------------------
# experiment_configs = [
#     ExperimentConfig(
#         name='my_experiment',
#         description='My custom experiment',
#         n_layer=4,
#         n_head=4,
#         n_embd=128,
#         block_size=256,
#         batch_size=4,
#         max_iters=500,
#         eval_iters=20,
#         use_svd_v=True,
#         svd_rank_v=16,
#         svd_type='randomized',
#     )
# ]

# Echo the selection as a 1-based numbered list.
print(f"\nSelected suite: {SUITE_NAME}")
print(f"Number of experiments: {len(experiment_configs)}")
print("\nExperiments to run:")
for position, cfg in enumerate(experiment_configs, start=1):
    print(f"  {position}. {cfg.name}: {cfg.description}")

### 4. Configure Output Settings

In [None]:
# Compute device: CUDA when PyTorch reports it as available, otherwise CPU.
DEVICE = 'cpu'
if torch.cuda.is_available():
    DEVICE = 'cuda'

# Artifacts go under `results/` next to the notebook; plots in a subfolder.
OUTPUT_DIR = notebook_dir / 'results'
PLOTS_DIR = OUTPUT_DIR / 'plots'

# Create both folders; existing folders are left alone.
for _folder in (OUTPUT_DIR, PLOTS_DIR):
    _folder.mkdir(parents=True, exist_ok=True)

print(f"Device: {DEVICE}")
print(f"Output directory: {OUTPUT_DIR}")
print(f"Plots directory: {PLOTS_DIR}")

## Run Experiments

This will run all selected experiments on all selected datasets.

In [None]:
# Build the runner used by the next cell. It receives the data directory, the
# output directory and the device as plain strings.
data_dir = project_root / 'data'
runner = ExperimentRunner(
    data_dir=str(data_dir),
    output_dir=str(OUTPUT_DIR),
    device=DEVICE,
)

print("Experiment runner initialized")
print(f"Data directory: {data_dir}")
print(f"Results will be saved to: {OUTPUT_DIR}")

In [None]:
# Run the selected experiment configs once per dataset. The list returned by
# the runner is stored under the dataset's name.
all_results = {}
separator = '=' * 80

for dataset in DATASETS:
    print(f"\n{separator}")
    print(f"Running experiments on dataset: {dataset}")
    print(f"{separator}\n")

    results = runner.run_experiments(experiment_configs, dataset=dataset)
    all_results[dataset] = results

    print(f"\n✓ Completed {len(results)} experiments on {dataset}")

# Closing banner with the planned totals (datasets x configs).
n_datasets = len(DATASETS)
n_configs = len(experiment_configs)

print(f"\n{separator}")
print("ALL EXPERIMENTS COMPLETED")
print(separator)
print(f"Total datasets: {n_datasets}")
print(f"Total experiments per dataset: {n_configs}")
print(f"Total experiments run: {n_datasets * n_configs}")

## Results Analysis

### Create Summary Tables

In [None]:
# Helper function to create DataFrame from results
def results_to_dataframe(results_list, dataset_name=''):
    """Convert a list of ExperimentResults into a summary DataFrame.

    Args:
        results_list: Iterable of result objects. Each must expose
            ``config.name``, ``config.description`` and the metric attributes
            listed in ``metric_attrs`` below.
        dataset_name: Value written to the ``dataset`` column of every row.

    Returns:
        A pandas DataFrame with one row per result. The column set and order
        are fixed, so an empty ``results_list`` still yields a frame that has
        every column (with zero rows) instead of a column-less frame that
        breaks later ``df['val_ppl']``-style lookups.

    Raises:
        AttributeError: If a result object lacks one of the expected attributes.
    """
    # Output column -> attribute name on the result object.
    metric_attrs = {
        'model_params': 'model_params',
        'train_loss': 'final_train_loss',
        'val_loss': 'final_val_loss',
        'train_ppl': 'final_train_perplexity',
        'val_ppl': 'final_val_perplexity',
        'train_tokens_per_sec': 'training_tokens_per_sec',
        'inference_tokens_per_sec': 'inference_tokens_per_sec',
        'forward_memory_mb': 'forward_memory_mb',
        'backward_memory_mb': 'backward_memory_mb',
        'total_memory_mb': 'total_memory_mb',
        'total_training_time': 'total_training_time',
    }
    columns = ['dataset', 'name', 'description', *metric_attrs]

    rows = []
    for result in results_list:
        row = {
            'dataset': dataset_name,
            'name': result.config.name,
            'description': result.config.description,
        }
        for column, attr in metric_attrs.items():
            row[column] = getattr(result, attr)
        rows.append(row)

    # Passing `columns` pins the schema even when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)

# Build one summary table per dataset and show each as soon as it is built.
dfs = {}
divider = '=' * 80
for dataset, results in all_results.items():
    summary = results_to_dataframe(results, dataset)
    dfs[dataset] = summary
    print(f"\n{divider}")
    print(f"Results Summary for {dataset}")
    print(divider)
    display(summary)

# Stack the per-dataset tables into one frame with a fresh 0..n-1 index.
combined_df = pd.concat(list(dfs.values()), ignore_index=True)

# Persist the combined table; the file name carries the suite name.
csv_path = OUTPUT_DIR / f'{SUITE_NAME}_all_datasets_summary.csv'
combined_df.to_csv(csv_path, index=False)
print(f"\n✓ Combined summary saved to: {csv_path}")

### Metrics Comparison Across Datasets

In [None]:
def _pivot_by_dataset(metric_column):
    """Reshape `combined_df` to one row per model name and one column per dataset."""
    return combined_df.pivot(index='name', columns='dataset', values=metric_column)


# The cross-dataset comparison only applies when more than one dataset is selected.
if len(DATASETS) > 1:
    print("\nModel Performance Across Datasets:")
    print("="*80)

    pivot_ppl = _pivot_by_dataset('val_ppl')
    print("\nValidation Perplexity:")
    display(pivot_ppl)

    pivot_speed = _pivot_by_dataset('inference_tokens_per_sec')
    print("\nInference Speed (tokens/sec):")
    display(pivot_speed)

## Visualizations

### 1. Training and Validation Loss Curves

In [None]:
if METRICS_CONFIG['save_training_history']:
    for dataset, results in all_results.items():
        fig, (train_ax, val_ax) = plt.subplots(1, 2, figsize=(15, 5))
        fig.suptitle(f'Loss Curves - {dataset}', fontsize=14, fontweight='bold')

        # Left panel: one training-loss line per experiment that recorded any.
        for result in results:
            if not result.train_losses:
                continue
            train_ax.plot(result.train_losses, label=result.config.name,
                          alpha=0.7, linewidth=2)

        # Right panel: validation losses; the k-th value is drawn at
        # x = k * eval_interval taken from the experiment's config.
        for result in results:
            if not result.val_losses:
                continue
            iterations = np.arange(len(result.val_losses)) * result.config.eval_interval
            val_ax.plot(iterations, result.val_losses, label=result.config.name,
                        alpha=0.7, marker='o', linewidth=2, markersize=4)

        # Shared axis decoration; y-label and title use the same text per panel.
        for panel, loss_label in ((train_ax, 'Training Loss'),
                                  (val_ax, 'Validation Loss')):
            panel.set_xlabel('Iteration', fontsize=12)
            panel.set_ylabel(loss_label, fontsize=12)
            panel.set_title(loss_label)
            panel.legend(loc='best')
            panel.grid(True, alpha=0.3)

        plt.tight_layout()
        figure_path = PLOTS_DIR / f'{dataset}_loss_curves.png'
        plt.savefig(figure_path, dpi=150, bbox_inches='tight')
        plt.show()
        print(f"✓ Saved: {figure_path}")

### 2. Perplexity Comparison

In [None]:
if METRICS_CONFIG['collect_perplexity']:
    bar_width = 0.35
    # (horizontal shift, DataFrame column, legend label, bar colour) per series.
    ppl_series = (
        (-bar_width / 2, 'train_ppl', 'Train Perplexity', 'steelblue'),
        (bar_width / 2, 'val_ppl', 'Val Perplexity', 'coral'),
    )

    for dataset, df in dfs.items():
        fig, ax = plt.subplots(figsize=(12, 6))
        slots = np.arange(len(df))

        # Side-by-side train/val bars for every model.
        bar_groups = [
            ax.bar(slots + shift, df[column], bar_width,
                   label=legend_label, alpha=0.8, color=colour)
            for shift, column, legend_label, colour in ppl_series
        ]

        ax.set_xlabel('Model', fontsize=12)
        ax.set_ylabel('Perplexity (lower is better)', fontsize=12)
        ax.set_title(f'Perplexity Comparison - {dataset}', fontsize=14, fontweight='bold')
        ax.set_xticks(slots)
        ax.set_xticklabels(df['name'], rotation=45, ha='right')
        ax.legend(fontsize=11)
        ax.grid(True, alpha=0.3, axis='y')

        # Write each bar's value (one decimal) just above the bar.
        for group in bar_groups:
            for rect in group:
                top = rect.get_height()
                ax.text(rect.get_x() + rect.get_width()/2., top,
                        f'{top:.1f}', ha='center', va='bottom', fontsize=9)

        plt.tight_layout()
        figure_path = PLOTS_DIR / f'{dataset}_perplexity.png'
        plt.savefig(figure_path, dpi=150, bbox_inches='tight')
        plt.show()
        print(f"✓ Saved: {figure_path}")

### 3. Speed Comparison

In [None]:
# The speed chart is drawn only when both speed metrics are switched on.
if METRICS_CONFIG['collect_training_speed'] and METRICS_CONFIG['collect_inference_speed']:
    bar_width = 0.35
    # (horizontal shift, DataFrame column, legend label, bar colour) per series.
    speed_series = (
        (-bar_width / 2, 'train_tokens_per_sec', 'Training Speed', 'steelblue'),
        (bar_width / 2, 'inference_tokens_per_sec', 'Inference Speed', 'coral'),
    )

    for dataset, df in dfs.items():
        fig, ax = plt.subplots(figsize=(12, 6))
        slots = np.arange(len(df))

        # Side-by-side training/inference bars for every model.
        bar_groups = [
            ax.bar(slots + shift, df[column], bar_width,
                   label=legend_label, alpha=0.8, color=colour)
            for shift, column, legend_label, colour in speed_series
        ]

        ax.set_xlabel('Model', fontsize=12)
        ax.set_ylabel('Tokens per Second (higher is better)', fontsize=12)
        ax.set_title(f'Training and Inference Speed - {dataset}', fontsize=14, fontweight='bold')
        ax.set_xticks(slots)
        ax.set_xticklabels(df['name'], rotation=45, ha='right')
        ax.legend(fontsize=11)
        ax.grid(True, alpha=0.3, axis='y')

        # Write each bar's value (rounded to an integer) just above the bar.
        for group in bar_groups:
            for rect in group:
                top = rect.get_height()
                ax.text(rect.get_x() + rect.get_width()/2., top,
                        f'{top:.0f}', ha='center', va='bottom', fontsize=9)

        plt.tight_layout()
        figure_path = PLOTS_DIR / f'{dataset}_speed.png'
        plt.savefig(figure_path, dpi=150, bbox_inches='tight')
        plt.show()
        print(f"✓ Saved: {figure_path}")

### 4. Memory Usage Comparison

In [None]:
# Memory charts are drawn only when the metric is enabled and DEVICE is CUDA.
if METRICS_CONFIG['collect_memory_usage'] and DEVICE == 'cuda':
    bar_width = 0.25
    # (DataFrame column, legend label), drawn left to right within each group.
    memory_series = (
        ('forward_memory_mb', 'Forward Pass'),
        ('backward_memory_mb', 'Backward Pass'),
        ('total_memory_mb', 'Total'),
    )

    for dataset, df in dfs.items():
        # Only plot if we have memory data (total column sums to a positive value).
        if not (df['total_memory_mb'].sum() > 0):
            continue

        fig, ax = plt.subplots(figsize=(12, 6))
        centers = np.arange(len(df))
        bar_positions = (centers - bar_width, centers, centers + bar_width)

        # No explicit colours: the three series take the default colour cycle in order.
        for position, (column, legend_label) in zip(bar_positions, memory_series):
            ax.bar(position, df[column], bar_width, label=legend_label, alpha=0.8)

        ax.set_xlabel('Model', fontsize=12)
        ax.set_ylabel('Memory Usage (MB, lower is better)', fontsize=12)
        ax.set_title(f'Memory Usage - {dataset}', fontsize=14, fontweight='bold')
        ax.set_xticks(centers)
        ax.set_xticklabels(df['name'], rotation=45, ha='right')
        ax.legend(fontsize=11)
        ax.grid(True, alpha=0.3, axis='y')

        plt.tight_layout()
        figure_path = PLOTS_DIR / f'{dataset}_memory.png'
        plt.savefig(figure_path, dpi=150, bbox_inches='tight')
        plt.show()
        print(f"✓ Saved: {figure_path}")

### 5. Quality vs Speed Tradeoff

In [None]:
# Marker areas are given to matplotlib in points^2. They are scaled relative to
# the largest model in each table and bounded, so the bubbles stay readable
# whatever the absolute parameter counts are (a raw `model_params / 100` grows
# without limit and can cover the whole axes for large models).
MAX_MARKER_AREA = 1500
MIN_MARKER_AREA = 60

# (x-axis column, x-axis label, panel title) for the two panels, left to right.
TRADEOFF_PANELS = (
    ('train_tokens_per_sec',
     'Training Speed (tokens/sec, higher is better)',
     'Quality vs Training Speed'),
    ('inference_tokens_per_sec',
     'Inference Speed (tokens/sec, higher is better)',
     'Quality vs Inference Speed'),
)

for dataset, df in dfs.items():
    fig, axes = plt.subplots(1, 2, figsize=(15, 6))
    fig.suptitle(f'Quality vs Speed Tradeoff - {dataset}', fontsize=14, fontweight='bold')

    # Relative marker area per model; non-numeric or missing counts fall back
    # to the minimum size.
    param_counts = pd.to_numeric(df['model_params'], errors='coerce')
    largest = param_counts.max()
    if largest > 0:  # False for NaN, i.e. when no usable counts exist
        marker_areas = (
            (param_counts / largest * MAX_MARKER_AREA)
            .clip(lower=MIN_MARKER_AREA)
            .fillna(MIN_MARKER_AREA)
        )
    else:
        marker_areas = pd.Series(float(MIN_MARKER_AREA), index=df.index)

    # Both panels plot validation perplexity against a speed column.
    for ax, (speed_column, x_label, panel_title) in zip(axes, TRADEOFF_PANELS):
        ax.scatter(df[speed_column], df['val_ppl'],
                   s=marker_areas, alpha=0.6,
                   c=range(len(df)), cmap='viridis')
        # Label every point with its model name, offset slightly from the marker.
        for name, speed, ppl in zip(df['name'], df[speed_column], df['val_ppl']):
            ax.annotate(name, (speed, ppl),
                        fontsize=9, alpha=0.8, xytext=(5, 5), textcoords='offset points')
        ax.set_xlabel(x_label, fontsize=11)
        ax.set_ylabel('Validation Perplexity (lower is better)', fontsize=11)
        ax.set_title(panel_title)
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    figure_path = PLOTS_DIR / f'{dataset}_tradeoff.png'
    plt.savefig(figure_path, dpi=150, bbox_inches='tight')
    plt.show()
    print(f"✓ Saved: {figure_path}")

### 6. Compression Metrics (for SVD models)

In [None]:
if METRICS_CONFIG['collect_compression_metrics']:
    # (column, bar colour, y-axis label, panel title) for the two panels.
    compression_panels = (
        ('relative_error', 'steelblue',
         'Relative Reconstruction Error (lower is better)',
         'SVD Reconstruction Error'),
        ('compression_time', 'coral',
         'Compression Time (sec, lower is better)',
         'SVD Compression Time'),
    )

    for dataset, results in all_results.items():
        # One record per (model, layer); results without compression metrics
        # contribute nothing. Missing keys fall back to 0 / 0 / 1.
        compression_data = [
            {
                'model': result.config.name,
                'layer': layer,
                'relative_error': metrics.get('relative_error', 0),
                'compression_time': metrics.get('compression_time', 0),
                'compression_ratio': metrics.get('compression_ratio', 1),
            }
            for result in results
            if result.compression_metrics
            for layer, metrics in result.compression_metrics.items()
        ]

        if not compression_data:
            continue

        comp_df = pd.DataFrame(compression_data)
        print(f"\n{'='*80}")
        print(f"Compression Metrics - {dataset}")
        print(f"{'='*80}")
        display(comp_df)

        # Bar charts of the per-model mean (averaged over layers) of each metric.
        fig, axes = plt.subplots(1, 2, figsize=(15, 5))
        fig.suptitle(f'SVD Compression Metrics - {dataset}', fontsize=14, fontweight='bold')

        for ax, (column, colour, y_label, panel_title) in zip(axes, compression_panels):
            per_model_mean = comp_df.groupby('model')[column].mean()
            per_model_mean.plot(kind='bar', ax=ax, alpha=0.8, color=colour)
            ax.set_xlabel('Model', fontsize=11)
            ax.set_ylabel(y_label, fontsize=11)
            ax.set_title(panel_title)
            ax.tick_params(axis='x', rotation=45)
            ax.grid(True, alpha=0.3, axis='y')

        plt.tight_layout()
        figure_path = PLOTS_DIR / f'{dataset}_compression.png'
        plt.savefig(figure_path, dpi=150, bbox_inches='tight')
        plt.show()
        print(f"✓ Saved: {figure_path}")

## Summary Statistics

In [None]:
def _report_best(df, column, lowest, heading, detail):
    """Print the model with the lowest/highest value in ``column``.

    Args:
        df: Summary table with a ``name`` column and the metric ``column``.
        column: Metric column to rank by.
        lowest: True to pick the minimum, False to pick the maximum.
        heading: Text printed before the winning model's name.
        detail: ``str.format`` template that receives the winning value.

    The winner is located by position (not by index label), so the lookup is
    correct for any DataFrame index. Non-numeric and missing values are
    ignored; if nothing usable remains, a "n/a" line is printed instead of
    raising.
    """
    values = pd.to_numeric(df[column], errors='coerce').to_numpy(dtype=float)
    if values.size == 0 or np.isnan(values).all():
        print(f"\n{heading}: n/a (no data in '{column}')")
        return
    # nanargmin/nanargmax return the first position on ties.
    position = int(np.nanargmin(values) if lowest else np.nanargmax(values))
    print(f"\n{heading}: {df.iloc[position]['name']}")
    print(detail.format(values[position]))


# Print comprehensive summary
print("="*80)
print("EXPERIMENT SUMMARY")
print("="*80)
print(f"\nSuite: {SUITE_NAME}")
print(f"Datasets: {', '.join(DATASETS)}")
print(f"Device: {DEVICE}")
print(f"Total experiments: {len(DATASETS) * len(experiment_configs)}")

for dataset, df in dfs.items():
    print("\n" + "="*80)
    print(f"BEST MODELS - {dataset}")
    print("="*80)

    _report_best(df, 'val_ppl', True,
                 "Best Perplexity", "  Val Perplexity: {:.2f}")
    _report_best(df, 'train_tokens_per_sec', False,
                 "Fastest Training", "  Training Speed: {:.0f} tokens/sec")
    _report_best(df, 'inference_tokens_per_sec', False,
                 "Fastest Inference", "  Inference Speed: {:.0f} tokens/sec")

    # Memory is reported only when at least one model has a positive total.
    total_memory = pd.to_numeric(df['total_memory_mb'], errors='coerce')
    if total_memory.max() > 0:
        _report_best(df, 'total_memory_mb', True,
                     "Lowest Memory Usage", "  Total Memory: {:.2f} MB")

print("\n" + "="*80)

## Export Results

In [None]:
def _json_safe(value):
    """Return None for NaN/±inf floats; return every other value unchanged.

    Python's ``json`` module writes such floats as ``NaN``/``Infinity``, which
    are not valid JSON and are rejected by strict parsers. Mapping them to
    ``None`` makes them serialize as ``null``.
    """
    if isinstance(value, float) and not np.isfinite(value):
        return None
    return value


# Create comprehensive export: run settings plus the per-dataset summary rows.
export_data = {
    'suite_name': SUITE_NAME,
    'datasets': DATASETS,
    'device': DEVICE,
    'metrics_config': METRICS_CONFIG,
    'num_experiments_per_dataset': len(experiment_configs),
    'total_experiments': len(DATASETS) * len(experiment_configs),
    'results_by_dataset': {}
}

# One list of row dicts per dataset, with non-finite floats replaced by None.
for dataset, df in dfs.items():
    export_data['results_by_dataset'][dataset] = [
        {column: _json_safe(value) for column, value in record.items()}
        for record in df.to_dict('records')
    ]

export_path = OUTPUT_DIR / f'{SUITE_NAME}_complete_export.json'
with open(export_path, 'w', encoding='utf-8') as f:
    json.dump(export_data, f, indent=2)

print(f"✓ Exported complete results to: {export_path}")
print(f"✓ All plots saved to: {PLOTS_DIR}")
print(f"✓ CSV summary saved to: {OUTPUT_DIR / f'{SUITE_NAME}_all_datasets_summary.csv'}")
print(f"\n{'='*80}")
print("EXPERIMENTS COMPLETE! ✓")
print(f"{'='*80}")

## Custom Analysis

Use this section for custom analysis and visualizations:

In [None]:
# Add your custom analysis here
# For example:
# - Compare specific models
# - Create custom visualizations
# - Statistical tests
# - Export data in different formats