# 🧠 LLM Interpretability - Simple Pipeline

Train neural networks to interpret and modify other neural networks' weights.

**Process:**
1. Generate a dataset of clean vs. corrupted models
2. Upload the dataset to the Hugging Face Hub
3. Train StarCoder2-3B with LoRA to interpret the weights

In [None]:
#@title 🚀 Setup & Install Dependencies

# Clone repository and install
!git clone https://github.com/maximus-powers/llm-interpretability.git
%cd llm-interpretability

# Install dependencies
!pip install -q -r requirements.txt
!pip install -q accelerate datasets transformers[torch] peft tensorboard huggingface_hub

# Check GPU
import torch
if torch.cuda.is_available():
    print(f"✅ GPU: {torch.cuda.get_device_name(0)}")
    print(f"📊 VRAM: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
else:
    print("⚠️  Using CPU (will be slower)")

print("✅ Setup complete!")

In [None]:
#@title ⚙️ Dataset Generation Configuration

# User-editable settings for dataset generation. The `#@param` annotations
# are Colab form markers and must stay on the same line as their variable.

NUM_EXAMPLES = 500 #@param {type:"integer"}
MIN_DEGRADATION = 0.05 #@param {type:"number"}
HUB_USERNAME = "maximus-powers" #@param {type:"string"}
DATASET_NAME = "llm-interpretability-v1" #@param {type:"string"}
PRIVATE_DATASET = False #@param {type:"boolean"}

# Fully qualified "<user>/<dataset>" repository id on the Hub.
HUB_DATASET_NAME = f"{HUB_USERNAME}/{DATASET_NAME}"

# Rough estimate of two minutes per 50 examples. True division is used so
# that small runs (fewer than 50 examples) do not round down to 0.0 minutes.
ESTIMATED_MINUTES = NUM_EXAMPLES / 50 * 2

print("📋 Configuration:")
print(f"   Examples: {NUM_EXAMPLES}")
print(f"   Min degradation: {MIN_DEGRADATION}")
print(f"   Hub dataset: {HUB_DATASET_NAME}")
print(f"   Private: {PRIVATE_DATASET}")
print(f"   Estimated time: ~{ESTIMATED_MINUTES:.1f} minutes")

In [None]:
#@title 🔐 HuggingFace Login

# Authenticates this runtime against the HuggingFace Hub when a token has
# been entered in the form field; otherwise prints instructions.

from huggingface_hub import login

HF_TOKEN = "" #@param {type:"string"}

if not HF_TOKEN:
    # No token supplied: tell the user where to obtain one.
    for hint in (
        "⚠️  Please enter your HuggingFace token above",
        "   Get token from: https://huggingface.co/settings/tokens",
        "   Make sure it has 'Write' permissions",
    ):
        print(hint)
else:
    login(token=HF_TOKEN)
    print("✅ Logged in to HuggingFace Hub")

In [None]:
#@title 🏭 Generate Dataset

# Runs the dataset generation pipeline script as a subprocess, streams its
# output into the notebook, and reports the elapsed time and exit status.
# Reads NUM_EXAMPLES, MIN_DEGRADATION, HUB_DATASET_NAME, PRIVATE_DATASET and
# HF_TOKEN from earlier cells.

import subprocess
import time

print("🏭 Starting dataset generation...")
# True division: floor division reported 0.0 minutes for runs under 50 examples.
print(f"⏱️  Estimated time: {NUM_EXAMPLES / 50 * 2:.1f} minutes")

start_time = time.time()

# Build command
# NOTE(review): the token is handed to the script as a command-line argument;
# if the pipeline can read it from the environment instead, prefer that.
cmd = [
    "python", "training_data/dataset_generation_pipeline.py",
    "--num_examples", str(NUM_EXAMPLES),
    "--dataset_name", "local_dataset",
    "--min_degradation", str(MIN_DEGRADATION),
    "--upload_to_hub",
    "--hub_dataset_name", HUB_DATASET_NAME,
    "--hub_token", HF_TOKEN,
    "--verbose"
]

if PRIVATE_DATASET:
    cmd.append("--private")

# Echo the command with the token masked: notebook output is routinely saved
# and shared, so the raw secret must never be printed.
display_cmd = list(cmd)
if HF_TOKEN:
    display_cmd[display_cmd.index("--hub_token") + 1] = "***"
print("Running:", " ".join(display_cmd))
print("-" * 60)

# Stream the child's combined stdout/stderr line by line (bufsize=1 selects
# line buffering in text mode). The context manager closes the pipe even if
# the loop is interrupted.
with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                      text=True, bufsize=1) as process:
    for line in process.stdout:
        print(line.rstrip())

    # Wait for completion
    return_code = process.wait()

generation_time = time.time() - start_time
print("-" * 60)
print(f"⏱️  Completed in {generation_time/60:.1f} minutes")

if return_code == 0:
    print("✅ Dataset generation successful!")
    print(f"🤗 Dataset URL: https://huggingface.co/datasets/{HUB_DATASET_NAME}")
else:
    print(f"❌ Generation failed (return code: {return_code})")

In [None]:
#@title 👀 Preview Dataset

# Loads the uploaded dataset from the Hub and prints split sizes, column
# names, the first training row's metadata and truncated prompt/completion
# samples. Any failure is reported instead of raised.

from datasets import load_dataset
import json

try:
    # Load dataset from Hub
    hub_data = load_dataset(HUB_DATASET_NAME)

    print(f"📊 Dataset Info:")
    train_split = hub_data['train']
    print(f"   Train examples: {len(train_split)}")
    print(f"   Validation examples: {len(hub_data['validation'])}")
    print(f"   Columns: {train_split.column_names}")

    # First training row; its 'metadata' field is decoded from JSON text.
    first_row = train_split[0]
    row_meta = json.loads(first_row['metadata'])

    print(f"\n📝 Example Row:")
    pattern = row_meta.get('corrupted_pattern', 'unknown')
    clean_acc = row_meta.get('clean_accuracy', 0)
    noisy_acc = row_meta.get('noisy_accuracy', 0)
    degradation = row_meta.get('accuracy_diff', 0)
    print(f"   Corrupted pattern: {pattern}")
    print(f"   Clean accuracy: {clean_acc:.4f}")
    print(f"   Noisy accuracy: {noisy_acc:.4f}")
    print(f"   Degradation: {degradation:.4f}")
    print(f"   Prompt length: {len(first_row['prompt'])} chars")
    print(f"   Completion length: {len(first_row['completion'])} chars")

    # Truncated previews, each framed by a 60-character rule.
    print(f"\n📄 Sample Prompt (first 500 chars):")
    print("-" * 60)
    prompt_preview = first_row['prompt'][:500]
    print(prompt_preview + "...")
    print("-" * 60)

    print(f"\n📄 Sample Completion (first 300 chars):")
    print("-" * 60)
    completion_preview = first_row['completion'][:300]
    print(completion_preview + "...")
    print("-" * 60)

except Exception as e:
    print(f"❌ Could not load dataset: {e}")
    print("   Make sure dataset generation completed successfully")