In [None]:
#@title Installs and Set Up

# First, uninstall any existing torch installations to avoid conflicts
!pip uninstall -y torch torchvision torchaudio

# Install compatible PyTorch and torchvision versions for CUDA 11.8
!pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu118

# Install dependencies for HuggingFace SFT training
!pip install -q transformers>=4.36.0
!pip install -q accelerate>=0.21.0
!pip install -q peft>=0.7.0
!pip install -q trl>=0.7.0
!pip install -q bitsandbytes>=0.41.0
!pip install -q datasets>=2.17.1
!pip install -q wandb>=0.16.3

# Restart runtime to ensure clean import
import os
print("⚠️  Please restart the runtime after installation completes to ensure clean imports")
print("   Go to Runtime -> Restart Runtime, then continue with the next cells")

# Login to Hugging Face (run this after restart)
try:
    from huggingface_hub import login
    print("Please log in to Hugging Face to upload your fine-tuned model:")
    token = "" #@param {type:"string"}
    if not token:
        token = input("Enter your HuggingFace token: ")
    login(token=token)
    print("✅ Successfully logged in to HuggingFace")
except Exception as e:
    print(f"⚠️  HuggingFace login will be handled in next cell: {e}")

In [None]:
#@title Verify Installation (Run After Restart)

# Verify PyTorch installation: report the torch/torchvision versions and, when
# CUDA is available, the CUDA version and the name of GPU 0.
try:
    import torch
    import torchvision
    print(f"✅ PyTorch version: {torch.__version__}")
    print(f"✅ Torchvision version: {torchvision.__version__}")
    print(f"✅ CUDA available: {torch.cuda.is_available()}")
    if torch.cuda.is_available():
        print(f"✅ CUDA version: {torch.version.cuda}")
        print(f"✅ GPU: {torch.cuda.get_device_name(0)}")
except Exception as e:
    print(f"❌ PyTorch installation issue: {e}")

# Verify other dependencies: a successful import is the whole check.
try:
    from transformers import AutoTokenizer, AutoModelForCausalLM
    from trl import SFTTrainer
    from datasets import load_dataset
    print("✅ Transformers imported successfully")
    print("✅ TRL SFTTrainer imported successfully")
    print("✅ Datasets imported successfully")
except Exception as e:
    print(f"❌ Dependency import issue: {e}")

# Login to HuggingFace. This always asks for a token (form field or prompt),
# even if the previous cell already logged in.
try:
    from getpass import getpass
    from huggingface_hub import login
    # Leave the form field empty to be prompted instead; a value typed into the
    # form is stored in the notebook source.
    token = "" #@param {type:"string"}
    if not token:
        # getpass (not input) so the token is not echoed into the cell output,
        # which is saved with the notebook.
        token = getpass("Enter your HuggingFace token: ")
    login(token=token)
    print("✅ Successfully logged in to HuggingFace")
except Exception as e:
    print(f"⚠️  HuggingFace login issue: {e}")

print("\n🎯 If all checks pass, you can proceed to the Config cell")

In [None]:
#@title Config

# Hub repo id of the fine-tuned model. Later cells load it for the inference
# test (when PUSH_TO_HUB is True) and pass it to the evaluator.
MODEL_NAME = "maximuspowers/starcoder2-7b-interpreter" #@param
# NOTE(review): DATASET_NAME and BASE_MODEL are not read by any cell visible in
# this section — presumably the training dataset id and the checkpoint that
# fine-tuning starts from; confirm against the training cell.
DATASET_NAME = "maximuspowers/llm-interpretability-v1" #@param
BASE_MODEL = "bigcode/starcoder2-7b" #@param

# Training parameters optimized for SFTTrainer
# NOTE(review): apart from PUSH_TO_HUB, none of the values below are read by the
# cells visible in this section — presumably the training cell consumes them;
# confirm there.
MAX_SEQ_LENGTH = 4096 #@param
NUM_TRAIN_EPOCHS = 1 #@param
PER_DEVICE_TRAIN_BATCH_SIZE = 1 #@param {type:"integer"}
GRADIENT_ACCUMULATION_STEPS = 8 #@param {type:"integer"}
LEARNING_RATE = 5e-7 #@param
WARMUP_STEPS = 100 #@param
LOGGING_STEPS = 10 #@param
SAVE_STEPS = 100 #@param
# Also decides where the inference-test cell loads the model from: the Hub repo
# MODEL_NAME when True, the local "./starcoder2_7b_sft_final" directory when False.
PUSH_TO_HUB = True #@param {type:"boolean"}
USE_QUANTIZATION = True #@param {type:"boolean"}

In [None]:
#@title Test Inference

print("🧪 Testing the fine-tuned model...")

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Pick the model source: the Hub repo when the model was pushed, otherwise the
# local output directory.
if PUSH_TO_HUB:
    model_path = MODEL_NAME
else:
    model_path = "./starcoder2_7b_sft_final"

try:
    # Tokenizer: reuse EOS as the padding token when none is defined.
    print(f"🔤 Loading tokenizer from: {model_path}")
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Model: bfloat16 weights, device placement delegated to device_map="auto".
    print(f"🧠 Loading model from: {model_path}")
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
    )

    # Hand-written sample prompt for the "palindrome" pattern; the text ends
    # right where the model is expected to start emitting weights.
    test_prompt = """# Neural Network Weight Modification Task

You are an expert neural network interpreter. Your task is to analyze the given model weights and baseline features, then generate improved weights that will make the model correctly classify the specified pattern

## Target Pattern
Pattern Name: palindrome
Description: Sequence reads same forwards and backwards

The model should classify sequences matching this pattern as POSITIVE (label=1).

## Model Architecture
Input Size: 49 (7 tokens × 7 positions, one-hot encoded)
Hidden Layers: 6
Neurons per Layer: 30
Activation Function: relu
Dropout Rate: 0.1

## Current Model Weights
The model weights that need to be improved:

{'network.0.weight': [[0.1, 0.2, -0.1]], 'network.0.bias': [0.0]}

## Individual Neuron Activations
Baseline activations for each neuron (statistics extracted by processing a standard baseline dataset through the model):

Layer 0: mean=[0.1, 0.2, 0.3], std=[0.05, 0.08, 0.12]

## Generate Improved Model Weights

Here are the improved model weights that will correctly classify the target pattern:

"""

    # Tokenize (capped at 2048 tokens) and move the tensors to the model's device.
    print("🔥 Generating test response...")
    inputs = tokenizer(test_prompt, return_tensors="pt", truncation=True, max_length=2048)
    inputs = inputs.to(model.device)

    # Low-temperature sampling, at most 200 new tokens, EOS doubling as pad id.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=200,
            temperature=0.1,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Drop the echoed prompt tokens so only the completion is decoded.
    prompt_token_count = inputs['input_ids'].shape[1]
    generated_tokens = outputs[0][prompt_token_count:]
    generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)

    print("📝 Test generation:")
    print("-" * 60)
    print("Input: [Neural network weight modification prompt for palindrome pattern]")
    print("\nGenerated completion:")
    # Show at most the first 500 characters, marking any cut with an ellipsis.
    if len(generated_text) > 500:
        print(generated_text[:500] + "...")
    else:
        print(generated_text)
    print("-" * 60)

    # Cheap plausibility check: both markers must occur in the completion.
    looks_like_weights = all(marker in generated_text for marker in ('{', 'network.'))
    if not looks_like_weights:
        print("⚠️  Model output doesn't clearly contain weight structures")
    else:
        print("✅ Model appears to be generating weight-like structures")

    print("\n🎯 Inference test complete!")
    print("   The model is responding to neural network interpretation prompts")
    print("   Ready for full evaluation!")

except Exception as e:
    # Report the failure and full traceback without aborting the notebook run.
    print(f"❌ Inference test failed: {e}")
    print("Make sure training completed successfully")
    import traceback
    traceback.print_exc()

In [None]:
#@title Evaluation

import sys
import json
import logging
from IPython.display import display, HTML

# Configure root logging at INFO level with a "LEVEL: message" format.
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')

# Benchmark generator and path helper (the generator must be importable from
# the current sys.path).
from benchmark_generator import create_benchmark_dataset
from pathlib import Path

# Generate the benchmark file only when it is not already on disk.
benchmark_path = Path("benchmark_dataset.json")
if benchmark_path.exists():
    print("📋 Benchmark dataset already exists")
else:
    print("📋 Creating benchmark dataset...")
    create_benchmark_dataset(samples_per_pattern=35)
    print("✅ Benchmark dataset created!")

# Announce the run.
print("🧪 Starting interpreter evaluation...")
print("⚠️  Note: This will take a significant amount of time (several hours)")
print("🔥 Using trained model:", MODEL_NAME)

try:
    from evaluation import InterpreterEvaluator

    # Build the evaluator around the fine-tuned model and the two dataset files.
    evaluator = InterpreterEvaluator(
        interpreter_model=MODEL_NAME,
        benchmark_path="benchmark_dataset.json",
        baseline_path="../training_data/baseline_dataset.json",
        device="auto",
    )

    print(f"🎯 Will evaluate {len(evaluator.all_patterns)} patterns with 10 trials each (150 total tasks)")
    print(f"📊 Benchmark dataset: {evaluator.benchmark_dataset['num_examples']} examples")

    # The long-running part: evaluate everything and persist the results file.
    results = evaluator.run_full_evaluation(
        save_results=True,
        results_filename="evaluation_results.json",
    )

    # ---- Headline numbers ----
    print("\n" + "="*60)
    print("🎉 EVALUATION RESULTS")
    print("="*60)

    print(f"📈 Overall Success Rate: {results['overall_success_rate']:.1%}")
    print(f"📊 Average Pattern Improvement: {results['overall_avg_improvement']:.2f} detections")
    print(f"✅ Completed Tasks: {results['completed_tasks']}/{results['total_tasks']}")

    # ---- Per-pattern table ----
    print(f"\n📋 Pattern-by-Pattern Results:")
    print("-" * 80)
    print(f"{'Pattern':<20} | {'Success Rate':<12} | {'Avg Δ':<8} | {'Tasks':<10}")
    print("-" * 80)

    for pattern, stats in results['pattern_stats'].items():
        # Render each cell to text first, then pad it to its column width.
        success_rate = format(stats['success_rate'], '.1%')
        avg_improvement = format(stats['avg_improvement'], '+.1f')
        tasks = f"{stats['completed_tasks']}/{stats['total_tasks']}"
        row_cells = (
            f"{pattern:<20}",
            f"{success_rate:<12}",
            f"{avg_improvement:<8}",
            f"{tasks:<10}",
        )
        print(" | ".join(row_cells))

    print("-" * 80)
    print("💾 Detailed results saved to: evaluation_results.json")

    # ---- Top successes, read back from the saved file (best-effort) ----
    try:
        with open("evaluation_results.json", 'r') as f:
            detailed = json.load(f)

        # Keep only entries flagged successful, largest improvement first.
        successful_results = list(
            filter(lambda entry: entry.get('success', False), detailed['detailed_results'])
        )
        successful_results.sort(key=lambda entry: entry['target_improvement'], reverse=True)

        if successful_results:
            print(f"\n🏆 Top 5 Most Successful Pattern Additions:")
            print("-" * 60)
            for rank, result in enumerate(successful_results[:5], start=1):
                target = result['target_pattern']
                improvement = result['target_improvement']
                before = result['before_target_count']
                after = result['after_target_count']
                print(f"{rank}. {target}: {before} → {after} (+{improvement} detections)")

    except Exception as e:
        print(f"⚠️  Could not display detailed results: {e}")

    # ---- Closing summary ----
    print(f"\n🎯 Evaluation Complete!")
    print(f"   • Success rate: {results['overall_success_rate']:.1%} of pattern additions succeeded")
    print(f"   • Average improvement: {results['overall_avg_improvement']:.1f} additional detections per successful task")
    print(f"   • Results saved to evaluation_results.json for further analysis")

except Exception as e:
    # Report the failure and full traceback without aborting the notebook run.
    print(f"❌ Evaluation failed: {e}")
    print("Make sure the model trained successfully and all dependencies are available")
    import traceback
    traceback.print_exc()

In [None]:
#@title Evaluation (Colab - with Git Clone)

import sys
import json
import logging
from IPython.display import display, HTML

# Setup logging for the notebook
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')

# Clone the project repository to access evaluation code
print("📥 Cloning project repository...")
!git clone https://github.com/maximus-powers/llm-interpretability.git

# Add the project paths to Python path
sys.path.append('/content/llm-interpretability/interpreter')
sys.path.append('/content/llm-interpretability/training_data')

# Change to the interpreter directory
import os
os.chdir('/content/llm-interpretability/interpreter')

# First, create benchmark dataset if it doesn't exist
from benchmark_generator import create_benchmark_dataset
from pathlib import Path

benchmark_path = Path("benchmark_dataset.json")
if not benchmark_path.exists():
    print("📋 Creating benchmark dataset...")
    create_benchmark_dataset(samples_per_pattern=35)
    print("✅ Benchmark dataset created!")
else:
    print("📋 Benchmark dataset already exists")

# Run evaluation
print("🧪 Starting interpreter evaluation...")
print("⚠️  Note: This will take a significant amount of time (several hours)")
print("🔥 Using trained model:", MODEL_NAME)

try:
    from evaluation import InterpreterEvaluator

    # Initialize evaluator (using paths relative to cloned repo)
    evaluator = InterpreterEvaluator(
        interpreter_model=MODEL_NAME,
        benchmark_path="benchmark_dataset.json",
        baseline_path="../training_data/baseline_dataset.json",
        device="auto"
    )

    print(f"🎯 Will evaluate {len(evaluator.all_patterns)} patterns with 10 trials each (150 total tasks)")
    print(f"📊 Benchmark dataset: {evaluator.benchmark_dataset['num_examples']} examples")

    # Run evaluation (this will take a long time!)
    results = evaluator.run_full_evaluation(
        save_results=True,
        results_filename="evaluation_results.json"
    )

    # Display results
    print("\n" + "="*60)
    print("🎉 EVALUATION RESULTS")
    print("="*60)

    print(f"📈 Overall Success Rate: {results['overall_success_rate']:.1%}")
    print(f"📊 Average Pattern Improvement: {results['overall_avg_improvement']:.2f} detections")
    print(f"✅ Completed Tasks: {results['completed_tasks']}/{results['total_tasks']}")

    print(f"\n📋 Pattern-by-Pattern Results:")
    print("-" * 80)
    print(f"{'Pattern':<20} | {'Success Rate':<12} | {'Avg Δ':<8} | {'Tasks':<10}")
    print("-" * 80)

    for pattern, stats in results['pattern_stats'].items():
        success_rate = f"{stats['success_rate']:.1%}"
        avg_improvement = f"{stats['avg_improvement']:+.1f}"
        tasks = f"{stats['completed_tasks']}/{stats['total_tasks']}"

        print(f"{pattern:<20} | {success_rate:<12} | {avg_improvement:<8} | {tasks:<10}")

    print("-" * 80)
    print("💾 Detailed results saved to: evaluation_results.json")

    # Load and display top successes
    try:
        with open("evaluation_results.json", 'r') as f:
            detailed = json.load(f)

        successful_results = [r for r in detailed['detailed_results'] if r.get('success', False)]
        successful_results.sort(key=lambda x: x['target_improvement'], reverse=True)

        if successful_results:
            print(f"\n🏆 Top 5 Most Successful Pattern Additions:")
            print("-" * 60)
            for i, result in enumerate(successful_results[:5]):
                target = result['target_pattern']
                improvement = result['target_improvement']
                before = result['before_target_count']
                after = result['after_target_count']
                print(f"{i+1}. {target}: {before} → {after} (+{improvement} detections)")

    except Exception as e:
        print(f"⚠️  Could not display detailed results: {e}")

    print(f"\n🎯 Evaluation Complete!")
    print(f"   • Success rate: {results['overall_success_rate']:.1%} of pattern additions succeeded")
    print(f"   • Average improvement: {results['overall_avg_improvement']:.1f} additional detections per successful task")
    print(f"   • Results saved to evaluation_results.json for further analysis")

except Exception as e:
    print(f"❌ Evaluation failed: {e}")
    print("Make sure the model trained successfully and all dependencies are available")
    import traceback
    traceback.print_exc()