In [None]:
# ========================================
# CELL 1: Clone Repo & Setup
# ========================================
# Notebook cell: lines starting with `!` (shell) and `%` (magic) are IPython
# commands, not plain Python.
# Fetch the project and make the checkout the working directory; the later
# `from src.lm_eval_wrapper import ...` cells presumably resolve `src`
# relative to it.
# NOTE(review): re-running this cell from inside the checkout would clone a
# nested copy - intended to be run once per session.
!git clone https://github.com/notGiGi/SmallModels.git
%cd SmallModels

# Install dependencies (-q keeps pip's output quiet)
!pip install -q lm-eval transformers accelerate datasets torch

print("✓ Setup complete")

# ========================================
# CELL 2: Check GPU
# ========================================
import torch


def gpu_report_lines():
    """Return the two status lines describing CUDA device 0.

    Returns:
        A two-element list: ``["GPU: <name>", "VRAM: <size>GB"]`` when CUDA
        is available, otherwise ``["GPU: None", "VRAM: N/A"]``. Both lines
        always carry their label so the output reads the same either way.
    """
    if not torch.cuda.is_available():
        return ["GPU: None", "VRAM: N/A"]

    device_name = torch.cuda.get_device_name(0)
    # total_memory is in bytes; report decimal gigabytes (1e9 bytes).
    total_bytes = torch.cuda.get_device_properties(0).total_memory
    return [f"GPU: {device_name}", f"VRAM: {total_bytes/1e9:.1f}GB"]


for line in gpu_report_lines():
    print(line)

# ========================================
# CELL 3: Quick Test (10 samples)
# ========================================
from src.lm_eval_wrapper import ModelEvaluator

# This evaluator instance is reused by the full-evaluation cells below,
# which only change its batch_size between models.
evaluator = ModelEvaluator(device="cuda", batch_size=8)

# Smoke test: a single task capped at 10 samples, so problems show up
# before the full runs are started.
quick_test_args = {
    "model_key": "smollm",
    "tasks": ["boolq"],
    "limit": 10,
}
results = evaluator.evaluate_model(**quick_test_args)

boolq_acc = results["results"]["boolq"]["acc,none"]
print(f"Test: {boolq_acc:.2%}")

# ========================================
# CELL 4: Full Evaluation - SmolLM
# ========================================
import os

# Benchmarks run for every model; the later evaluation cells and the summary
# table reuse this list.
tasks = ["boolq", "hellaswag", "arc_easy", "arc_challenge", "winogrande", "piqa"]

print("Evaluating SmolLM (360M)...")
results_smollm = evaluator.evaluate_model(
    model_key="smollm",
    tasks=tasks,
    limit=None  # Full
)

# Display results: one line per task with its accuracy as a percentage
for task in tasks:
    acc = results_smollm["results"][task]["acc,none"]
    print(f"  {task:15s}: {acc:.2%}")

# Save
from src.lm_eval_wrapper import LMEvalWrapper

# save_results is defined outside this notebook, so make sure the output
# directory exists before writing; losing a full evaluation run to a missing
# folder would be expensive. This is a no-op when the directory is present.
os.makedirs("results", exist_ok=True)

# The wrapper is created only to reach save_results; "dummy" is a placeholder
# first argument. NOTE(review): confirm the constructor does not load a model.
wrapper = LMEvalWrapper("dummy", "cuda")
wrapper.save_results(results_smollm, "results/smollm_full.json")

# Download results (google.colab is only importable inside Colab)
from google.colab import files
files.download("results/smollm_full.json")

# ========================================
# CELL 5: Full Evaluation - TinyLlama
# ========================================
print("\nEvaluating TinyLlama (1.1B)...")
# Set explicitly before the run; 8 is the same value the evaluator was
# constructed with.
evaluator.batch_size = 8

# limit=None runs each task without a sample cap.
results_tiny = evaluator.evaluate_model(model_key="tinyllama", tasks=tasks, limit=None)

# Per-task accuracy, one line each
tiny_scores = results_tiny["results"]
for task in tasks:
    print(f"  {task:15s}: {tiny_scores[task]['acc,none']:.2%}")

# Persist and download, reusing the wrapper and Colab `files` helper that the
# SmolLM cell set up.
tiny_path = "results/tinyllama_full.json"
wrapper.save_results(results_tiny, tiny_path)
files.download(tiny_path)

# ========================================
# CELL 6: Full Evaluation - Qwen
# ========================================
print("\nEvaluating Qwen (1.5B)...")
# Halve the batch size for the largest of the three models.
evaluator.batch_size = 4

# limit=None runs each task without a sample cap.
results_qwen = evaluator.evaluate_model(model_key="qwen", tasks=tasks, limit=None)

# Per-task accuracy, one line each
qwen_scores = results_qwen["results"]
for task in tasks:
    print(f"  {task:15s}: {qwen_scores[task]['acc,none']:.2%}")

# Persist and download, reusing the wrapper and Colab `files` helper that the
# SmolLM cell set up.
qwen_path = "results/qwen_full.json"
wrapper.save_results(results_qwen, qwen_path)
files.download(qwen_path)

# ========================================
# CELL 7: Summary Table
# ========================================
import pandas as pd

# (display name, evaluation output) for each model, in table row order
model_runs = (
    ("SmolLM", results_smollm),
    ("TinyLlama", results_tiny),
    ("Qwen", results_qwen),
)

# One row per model: the model name followed by each task's accuracy,
# already formatted as a percentage string.
data = [
    {
        "Model": model_name,
        **{
            task_name: f"{run['results'][task_name]['acc,none']:.2%}"
            for task_name in tasks
        },
    }
    for model_name, run in model_runs
]

df = pd.DataFrame(data)

banner = "=" * 80
print("\n" + banner)
print("FINAL RESULTS")
print(banner)
print(df.to_string(index=False))