In [None]:
#@title Installs and Set Up

# First, uninstall any existing torch installations to avoid conflicts
!pip uninstall -y torch torchvision torchaudio

# Install compatible PyTorch and torchvision versions for CUDA 11.8
!pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu118

# Install dependencies for HuggingFace SFT training
!pip install -q transformers>=4.36.0
!pip install -q accelerate>=0.21.0
!pip install -q peft>=0.7.0
!pip install -q trl>=0.7.0
!pip install -q bitsandbytes>=0.41.0
!pip install -q datasets>=2.17.1
!pip install -q wandb>=0.16.3

# Restart runtime to ensure clean import
import os
print("⚠️  Please restart the runtime after installation completes to ensure clean imports")
print("   Go to Runtime -> Restart Runtime, then continue with the next cells")


In [None]:
#@title Verify Installation (Run After Restart)

import os

# Verify PyTorch installation: report versions and, when a GPU is visible,
# the CUDA version and device name.
try:
    import torch
    import torchvision
    print(f"✅ PyTorch version: {torch.__version__}")
    print(f"✅ Torchvision version: {torchvision.__version__}")
    print(f"✅ CUDA available: {torch.cuda.is_available()}")
    if torch.cuda.is_available():
        print(f"✅ CUDA version: {torch.version.cuda}")
        print(f"✅ GPU: {torch.cuda.get_device_name(0)}")
except Exception as e:
    print(f"❌ PyTorch installation issue: {e}")

# Verify other dependencies (import-only smoke test)
try:
    from transformers import AutoTokenizer, AutoModelForCausalLM
    from trl import SFTTrainer, SFTConfig
    from datasets import load_dataset
    print("✅ Transformers imported successfully")
    print("✅ TRL SFTTrainer imported successfully")
    print("✅ Datasets imported successfully")
except Exception as e:
    print(f"❌ Dependency import issue: {e}")

# Login to HuggingFace if not done already
try:
    from huggingface_hub import login
    token = "" #@param
    # Prefer the form value; otherwise fall back to the HF_TOKEN environment
    # variable (the same variable the training cell reads), so the token does
    # not have to be pasted into the notebook.
    hf_token = (token or os.environ.get("HF_TOKEN", "")).strip()
    if hf_token:
        login(token=hf_token)
        print("✅ Successfully logged in to HuggingFace")
    else:
        # Calling login() with an empty token can only fail; say why instead.
        print("⚠️  HuggingFace login skipped: no token provided (fill in the form field or set HF_TOKEN)")
except Exception as e:
    print(f"⚠️  HuggingFace login issue: {e}")

print("\n🎯 If all checks pass, you can proceed to the Config cell")

In [None]:
#@title Config

# Notebook-wide settings. Each `#@param` annotation exposes the value as an
# editable Colab form field; later cells read these names as globals.

# MODEL_NAME: Hub repo id used when pushing the trained model and when choosing
#             the model to evaluate.
# DATASET_NAME: dataset id loaded (split "train") by the training cell.
# BASE_MODEL: pretrained checkpoint that is loaded and fine-tuned; its
#             tokenizer is saved next to the final checkpoint.
MODEL_NAME = "maximuspowers/starcoder2_7b_sft_output" #@param
DATASET_NAME = "maximuspowers/llm-interpretability-v1" #@param
BASE_MODEL = "bigcode/starcoder2-7b" #@param

# Training parameters optimized for SFTTrainer
# NOTE(review): no visible cell passes MAX_SEQ_LENGTH to the trainer — confirm
# that it actually takes effect.
MAX_SEQ_LENGTH = 4096 #@param
NUM_TRAIN_EPOCHS = 1 #@param
PER_DEVICE_TRAIN_BATCH_SIZE = 1 #@param {type:"integer"}
GRADIENT_ACCUMULATION_STEPS = 8 #@param {type:"integer"}
LEARNING_RATE = 5e-7 #@param
WARMUP_STEPS = 100 #@param
LOGGING_STEPS = 10 #@param
SAVE_STEPS = 100 #@param
# When True, the training cell uploads the result to the Hub after training.
PUSH_TO_HUB = True #@param {type:"boolean"}
# Toggle for 4-bit quantized loading of the base model.
# NOTE(review): confirm the training cell actually consults this flag.
USE_QUANTIZATION = True #@param {type:"boolean"}

In [None]:
#@title Launch Training with SFTTrainer

# This cell relies on the globals defined in the Config cell (BASE_MODEL,
# DATASET_NAME, MODEL_NAME and the training hyper-parameters), so run that
# cell first.
print("🚀 Starting StarCoder2-7B fine-tuning with HuggingFace SFTTrainer...")

# Imports are repeated here so the cell still works after a runtime restart.
import torch
import transformers
import multiprocessing
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import SFTTrainer
from peft import LoraConfig
from datasets import load_dataset
from accelerate import PartialState
import os

def print_trainable_parameters(model):
    """Report how many of the model's parameters require gradients.

    Walks ``model.named_parameters()`` once and prints a single line with the
    trainable element count, the total element count, and the trainable share
    as a percentage.

    Args:
        model: any object exposing ``named_parameters()`` that yields
            ``(name, param)`` pairs, where each ``param`` provides ``numel()``
            and a ``requires_grad`` attribute.
    """
    # Materialise (size, is_trainable) pairs so the parameters are visited once.
    sizes = [(p.numel(), p.requires_grad) for _, p in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, wants_grad in sizes if wants_grad)
    print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}")

# Fine-tune BASE_MODEL with LoRA adapters via SFTTrainer, save the resulting
# adapter and tokenizer locally, and optionally push the result to the Hub.
# Any failure is reported with a traceback instead of aborting the notebook.
try:
    # Single source of truth for where checkpoints and the final adapter land.
    output_dir = "./starcoder2_7b_sft_output"
    final_checkpoint_dir = os.path.join(output_dir, "final_checkpoint/")

    # config
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    lora_config = LoraConfig(
        r=8,
        target_modules=[
            "q_proj",
            "o_proj",
            "k_proj",
            "v_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
        ],
        task_type="CAUSAL_LM",
    )

    # load model and dataset
    token = os.environ.get("HF_TOKEN", None)
    model_kwargs = {
        # Place the whole model on the device owned by this process.
        "device_map": {"": PartialState().process_index},
        "trust_remote_code": True,
    }
    if USE_QUANTIZATION:
        # Honour the Config-cell switch: 4-bit NF4 loading via bitsandbytes.
        model_kwargs["quantization_config"] = bnb_config
    else:
        # Unquantized path: load in bf16 to match `bf16=True` below.
        model_kwargs["torch_dtype"] = torch.bfloat16
    model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, **model_kwargs)
    print_trainable_parameters(model)

    data = load_dataset(
        DATASET_NAME,
        split="train",
        token=token,
        num_proc=multiprocessing.cpu_count(),
    )

    # setup the trainer
    # NOTE(review): MAX_SEQ_LENGTH from the Config cell is not passed to the
    # trainer here; the argument that carries it appears to differ between TRL
    # releases, so wire it in once the TRL version is pinned.
    trainer = SFTTrainer(
        model=model,
        train_dataset=data,
        args=transformers.TrainingArguments(
            per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
            gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
            warmup_steps=WARMUP_STEPS,
            num_train_epochs=NUM_TRAIN_EPOCHS,
            learning_rate=LEARNING_RATE,
            lr_scheduler_type="cosine",
            weight_decay=0.01,
            bf16=True,
            logging_strategy="steps",
            logging_steps=LOGGING_STEPS,
            save_strategy="steps",
            save_steps=SAVE_STEPS,
            output_dir=output_dir,
            overwrite_output_dir=True,
            optim="paged_adamw_8bit",
            seed=42,
            run_name=f"train-{BASE_MODEL.split('/')[-1]}",
            report_to="none",
            # Target repo for push_to_hub(); without it the repo name is
            # derived from the output directory instead of MODEL_NAME.
            hub_model_id=MODEL_NAME or None,
        ),
        peft_config=lora_config,
    )

    # launch
    print("Training...")
    trainer.train()

    print("Saving the last checkpoint of the model")
    # Save through the trainer's model (the PEFT-wrapped one) so the checkpoint
    # holds the trained adapter rather than the original base-model handle.
    trainer.model.save_pretrained(final_checkpoint_dir)

    # Also save tokenizer
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.save_pretrained(final_checkpoint_dir)

    if PUSH_TO_HUB:
        # The first positional argument of Trainer.push_to_hub is the commit
        # message, not the repo id; the repo is selected by `hub_model_id`.
        trainer.push_to_hub()
        print(f"✅ Model uploaded to: https://huggingface.co/{MODEL_NAME}")

    print("Training Done! 💥")

except Exception as e:
    print(f"❌ Training failed: {e}")
    import traceback
    traceback.print_exc()

In [None]:
!rm -rf llm-interpretability/

In [None]:
#@title Evaluation (Colab - with Git Clone)

import sys
import json
import logging
from IPython.display import display, HTML
import os

# Setup logging for the notebook
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')

# Check if we're in Colab and need to clone the repo
if not os.path.exists('/content/llm-interpretability'):
    # Clone the project repository to access evaluation code
    print("📥 Cloning project repository...")
    !git clone https://github.com/maximus-powers/llm-interpretability.git /content/llm-interpretability
    print("✅ Repository cloned successfully")
else:
    print("📁 Repository already exists, skipping clone")

# Add the project paths to Python path
sys.path.insert(0, '/content/llm-interpretability/interpreter')
sys.path.insert(0, '/content/llm-interpretability/training_data')

# Change to the interpreter directory
os.chdir('/content/llm-interpretability/interpreter')

# Import the modules after setting up paths
try:
    from benchmark_generator import create_benchmark_dataset
    from evaluation import InterpreterEvaluator
    print("✅ Successfully imported evaluation modules")
except ImportError as e:
    print(f"❌ Failed to import modules: {e}")
    print("🔄 Retrying after path setup...")
    # Add fallback paths
    sys.path.append('/content/llm-interpretability/interpreter')
    sys.path.append('/content/llm-interpretability/training_data')
    from benchmark_generator import create_benchmark_dataset
    from evaluation import InterpreterEvaluator

from pathlib import Path

# Build the benchmark dataset only when it is missing from the working directory
benchmark_path = Path("benchmark_dataset.json")
if benchmark_path.exists():
    print("📋 Benchmark dataset already exists")
else:
    print("📋 Creating benchmark dataset...")
    create_benchmark_dataset(samples_per_pattern=35)
    print("✅ Benchmark dataset created!")

# Pick the model to evaluate: a non-empty MODEL_NAME global wins; otherwise use
# the local final checkpoint when present, and a published default as last resort.
# NOTE(review): the local checkpoint is only considered when MODEL_NAME is
# unset/empty — confirm this is intended when the model was not pushed.
if globals().get('MODEL_NAME'):
    model_to_use = MODEL_NAME
    print(f"🤗 Using HuggingFace model: {model_to_use}")
else:
    local_model_path = "/content/starcoder2_7b_sft_output/final_checkpoint"
    if not os.path.exists(local_model_path):
        model_to_use = "maximuspowers/starcoder2-7b-interpreter"  # Default
        print(f"🔄 Using default model: {model_to_use}")
    else:
        model_to_use = local_model_path
        print(f"📁 Using local model: {model_to_use}")

# Run evaluation: build the evaluator, run the full benchmark, then print an
# overall summary, a per-pattern table and the five largest improvements.
print("🧪 Starting interpreter evaluation...")
print("⚠️  Note: This will take a significant amount of time (several hours)")
print(f"🔥 Using model: {model_to_use}")

# Explicit sentinel: at cell top level `locals()` is the notebook's global
# namespace, so an `evaluator` left over from an earlier run would otherwise
# make the debug output below claim this run initialised one.
evaluator = None

try:
    # Initialize evaluator (using paths relative to cloned repo)
    evaluator = InterpreterEvaluator(
        interpreter_model=model_to_use,
        benchmark_path="benchmark_dataset.json",
        baseline_path="../training_data/baseline_dataset.json",
        device="auto"
    )

    # NOTE(review): the trial count and task total in this message are
    # hard-coded — confirm they match the evaluator's actual configuration.
    print(f"🎯 Will evaluate {len(evaluator.all_patterns)} patterns with 10 trials each (150 total tasks)")
    print(f"📊 Benchmark dataset: {evaluator.benchmark_dataset['num_examples']} examples")

    # Run evaluation (this will take a long time!)
    results = evaluator.run_full_evaluation(
        save_results=True,
        results_filename="evaluation_results.json"
    )

    # Display results
    print("\n" + "="*60)
    print("🎉 EVALUATION RESULTS")
    print("="*60)

    print(f"📈 Overall Success Rate: {results['overall_success_rate']:.1%}")
    print(f"📊 Average Pattern Improvement: {results['overall_avg_improvement']:.2f} detections")
    print(f"✅ Completed Tasks: {results['completed_tasks']}/{results['total_tasks']}")

    print(f"\n📋 Pattern-by-Pattern Results:")
    print("-" * 80)
    print(f"{'Pattern':<20} | {'Success Rate':<12} | {'Avg Δ':<8} | {'Tasks':<10}")
    print("-" * 80)

    for pattern, stats in results['pattern_stats'].items():
        success_rate = f"{stats['success_rate']:.1%}"
        avg_improvement = f"{stats['avg_improvement']:+.1f}"
        tasks = f"{stats['completed_tasks']}/{stats['total_tasks']}"

        print(f"{pattern:<20} | {success_rate:<12} | {avg_improvement:<8} | {tasks:<10}")

    print("-" * 80)
    print("💾 Detailed results saved to: evaluation_results.json")

    # Load and display top successes. Best-effort: a problem reading the saved
    # file is reported but does not fail the evaluation itself.
    try:
        with open("evaluation_results.json", 'r') as f:
            detailed = json.load(f)

        successful_results = [r for r in detailed['detailed_results'] if r.get('success', False)]
        successful_results.sort(key=lambda x: x['target_improvement'], reverse=True)

        if successful_results:
            print(f"\n🏆 Top 5 Most Successful Pattern Additions:")
            print("-" * 60)
            for i, result in enumerate(successful_results[:5]):
                target = result['target_pattern']
                improvement = result['target_improvement']
                before = result['before_target_count']
                after = result['after_target_count']
                print(f"{i+1}. {target}: {before} → {after} (+{improvement} detections)")

    except Exception as e:
        print(f"⚠️  Could not display detailed results: {e}")

    print(f"\n🎯 Evaluation Complete!")
    print(f"   • Success rate: {results['overall_success_rate']:.1%} of pattern additions succeeded")
    print(f"   • Average improvement: {results['overall_avg_improvement']:.1f} additional detections per successful task")
    print(f"   • Results saved to evaluation_results.json for further analysis")

except Exception as e:
    print(f"❌ Evaluation failed: {e}")
    print("Make sure the model trained successfully and all dependencies are available")
    import traceback
    traceback.print_exc()

    # Additional debugging info
    print(f"\n🔍 Debug Info:")
    print(f"   Current directory: {os.getcwd()}")
    print(f"   Python path: {sys.path[:3]}...")  # First 3 entries
    print(f"   Model path: {model_to_use}")
    if evaluator is not None:
        print(f"   Evaluator initialized: ✅")
    else:
        print(f"   Evaluator initialized: ❌")