## Setup

In [None]:
!nvidia-smi

In [2]:
!pip install -q transformers datasets accelerate torch

In [None]:
# Fix TensorFlow/transformers compatibility
import os
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
import warnings
warnings.filterwarnings('ignore')

import torch
import torch.nn.functional as F
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    TrainingArguments, Trainer, default_data_collator, set_seed
)
from datasets import load_dataset, Dataset as HFDataset
import numpy as np
import pandas as pd
import json, time, re
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

SEED = 42
set_seed(SEED)

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

In [None]:
# Configuration
class Config:
    # Models
    STUDENT_MODEL = "gpt2"  # 124M
    TEACHER_MODEL = "gpt2-xl"  # 1.5B

    # Dataset
    DATASET_NAME = "tatsu-lab/alpaca"
    TOTAL_SAMPLES = 18000  # Generate 18k teacher responses

    # Data scaling experiment
    DATA_SIZES = [500, 2000, 5000, 10000, 15000]  # 5 experiments
    TEST_SIZE = 500
    VAL_SIZE = 500

    # Training - OPTIMIZED
    BATCH_SIZE = 16
    GRADIENT_ACCUMULATION_STEPS = 2
    LEARNING_RATE = 5e-5
    NUM_EPOCHS = 2
    WARMUP_STEPS = 100
    MAX_SEQ_LENGTH = 384

    # Generation
    GEN_TEMPERATURE = 0.8
    GEN_TOP_P = 0.95
    GEN_MAX_NEW_TOKENS = 150

    # Paths
    OUTPUT_DIR = "/content/drive/MyDrive/ai45projectfiles/content/outputs"
    TEACHER_RESPONSES_PATH = "./teacher_responses.jsonl"

config = Config()
os.makedirs(config.OUTPUT_DIR, exist_ok=True)

print("DATA SCALING STUDY")
print(f"Teacher: {config.TEACHER_MODEL} (1.5B)")
print(f"Student: {config.STUDENT_MODEL} (124M)")
print(f"Generating: {config.TOTAL_SAMPLES} teacher responses")
print(f"Data sizes: {config.DATA_SIZES}")
print(f"Epochs: {config.NUM_EPOCHS}")

## Data Preparation

create_prompt: A function that adds "few-shot" examples (like "What is the capital of France?") to help the base GPT-2 model understand how to answer in a Q&A format.

create_simple_prompt: A shorter version used later for evaluating the trained models.

In [5]:
# Few-shot examples for GPT-2
FEW_SHOT_EXAMPLES = [
    {"q": "What is the capital of France?", "a": "The capital of France is Paris."},
    {"q": "List 3 primary colors.", "a": "The three primary colors are red, blue, and yellow."}
]

def create_prompt(instruction: str, input_text: str = "", use_examples: bool = True) -> str:
    """Create few-shot prompt for GPT-2."""
    parts = []

    if use_examples:
        for ex in FEW_SHOT_EXAMPLES:
            parts.append(f"Q: {ex['q']}")
            parts.append(f"A: {ex['a']}\n")

    question = f"{instruction} {input_text}" if input_text.strip() else instruction
    parts.append(f"Q: {question}")
    parts.append("A:")

    return "\n".join(parts)

def create_simple_prompt(instruction):
    """Simple prompt without few-shot (for evaluation)."""
    return f"Q: {instruction}\nA:"

In [None]:
# Load Alpaca dataset
print("Loading Alpaca dataset...")
dataset = load_dataset(config.DATASET_NAME)['train']
dataset = dataset.shuffle(seed=SEED)

# Select samples
dataset = dataset.select(range(config.TOTAL_SAMPLES))
print(f"Selected {len(dataset)} samples for teacher generation")

# Format with prompts
def format_dataset(examples):
    prompts = []
    for i in range(len(examples['instruction'])):
        instruction = examples['instruction'][i]
        input_text = examples.get('input', [''] * len(examples['instruction']))[i]
        prompt = create_prompt(instruction, input_text)
        prompts.append(prompt)
    examples['prompt'] = prompts
    return examples

dataset = dataset.map(format_dataset, batched=True)

print(f"\nExample prompt (first 200 chars):")
print(dataset[0]['prompt'][:200] + "...")

## Teacher Response Generation

In [None]:
# Load tokenizer
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(config.STUDENT_MODEL)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'left'

print(f"Vocab size: {len(tokenizer)}")

float16 precision to save memory

In [None]:
# Load teacher model
print("\nLoading teacher model (GPT-2 XL - 1.5B params)...")
print("This may take a moment...")

teacher_model = AutoModelForCausalLM.from_pretrained(
    config.TEACHER_MODEL,
    torch_dtype=torch.float16,
    device_map="auto"
)
teacher_model.config.pad_token_id = tokenizer.pad_token_id
teacher_model.eval()

print(f"\nTeacher loaded: {config.TEACHER_MODEL}")
print(f"Parameters: {sum(p.numel() for p in teacher_model.parameters()) / 1e9:.2f}B")
print(f"Vocab match: {teacher_model.config.vocab_size == len(tokenizer)}")

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
zip_path = '/content/drive/MyDrive/file.zip'
extract_path = '/content/drive/MyDrive/ai45projectfiles'

import zipfile
import os

os.makedirs(extract_path, exist_ok=True)

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print("Unzipping completed")

Generation Loop
feeds the Alpaca instructions into the Teacher model,
teacher generates a detailed response for each instruction &
saves these "Teacher-Student" pairs into a file called teacher_responses.jsonl

In [None]:
def generate_teacher_responses(dataset, batch_size=8):
    """Generate teacher responses for all prompts."""
    responses = []

    print(f"GENERATING {len(dataset)} TEACHER RESPONSES")


    # Show first example
    print("Verifying teacher generation (first example)...\n")
    first_prompt = dataset[0]['prompt']
    print(f"Prompt:\n{first_prompt}\n")

    with torch.no_grad():
        inputs = tokenizer(
            first_prompt,
            return_tensors="pt",
            truncation=True,
            max_length=config.MAX_SEQ_LENGTH
        ).to(teacher_model.device)

        output = teacher_model.generate(
            **inputs,
            max_new_tokens=config.GEN_MAX_NEW_TOKENS,
            temperature=config.GEN_TEMPERATURE,
            top_p=config.GEN_TOP_P,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
            repetition_penalty=1.2,
            no_repeat_ngram_size=3
        )

        full_text = tokenizer.decode(output[0], skip_special_tokens=True)

        if "A:" in full_text:
            answer = full_text.split("A:")[-1].strip()
            if "Q:" in answer:
                answer = answer.split("Q:")[0].strip()
        else:
            answer = full_text[len(first_prompt):].strip()

        print(f"Teacher Response:\n{answer}\n")

        print("Teacher is generating properly!")
        print("Starting batch generation...\n")


    # Generate all responses
    start_time = time.time()

    with torch.no_grad():
        for i in tqdm(range(0, len(dataset), batch_size), desc="Teacher generation"):
            batch_prompts = dataset[i:i+batch_size]['prompt']

            inputs = tokenizer(
                batch_prompts,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=config.MAX_SEQ_LENGTH
            ).to(teacher_model.device)

            outputs = teacher_model.generate(
                **inputs,
                max_new_tokens=config.GEN_MAX_NEW_TOKENS,
                temperature=config.GEN_TEMPERATURE,
                top_p=config.GEN_TOP_P,
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
                use_cache=True,
                repetition_penalty=1.2,
                no_repeat_ngram_size=3
            )

            for j, output in enumerate(outputs):
                full_text = tokenizer.decode(output, skip_special_tokens=True)

                if "A:" in full_text:
                    answer = full_text.split("A:")[-1].strip()
                    if "Q:" in answer:
                        answer = answer.split("Q:")[0].strip()
                else:
                    answer = full_text[len(batch_prompts[j]):].strip()

                responses.append({
                    'prompt': batch_prompts[j],
                    'response': answer
                })

    generation_time = time.time() - start_time

    # Show samples

    print("SAMPLE TEACHER RESPONSES")

    for idx in [10, 100, 500, 1000]:
        if idx < len(responses):
            print(f"\n[Example {idx}]")
            print(f"Response: {responses[idx]['response'][:120]}...")

    print(f"Generated {len(responses)} responses in {generation_time/60:.1f} minutes")


    return responses

# Generate all teacher responses
teacher_responses = generate_teacher_responses(dataset)

# Save to file
with open(config.TEACHER_RESPONSES_PATH, 'w') as f:
    for item in teacher_responses:
        f.write(json.dumps(item) + '\n')

print(f"Saved to {config.TEACHER_RESPONSES_PATH}")

# Clean up teacher model to save memory
del teacher_model
torch.cuda.empty_cache()
print("Teacher model unloaded")

## Split Data for Experiments

In [None]:
# Split data
test_responses = teacher_responses[-config.TEST_SIZE:]
val_responses = teacher_responses[-(config.TEST_SIZE + config.VAL_SIZE):-config.TEST_SIZE]
train_pool = teacher_responses[:-(config.TEST_SIZE + config.VAL_SIZE)]

print(f"\nData splits:")
print(f"  Training pool: {len(train_pool)} samples")
print(f"  Validation: {len(val_responses)} samples")
print(f"  Test: {len(test_responses)} samples")
print(f"\nWill train models on: {config.DATA_SIZES}")
print(f"Max size {max(config.DATA_SIZES)} fits in pool of {len(train_pool)}")

## Training Pipeline

helper function takes the text responses and converts them into the specific numerical format (input_ids, labels) required for training

In [None]:
def create_training_dataset(responses, tokenizer, max_length):
    """Tokenize prompt+response pairs."""
    all_input_ids = []
    all_attention_mask = []
    all_labels = []

    for item in tqdm(responses, desc="Tokenizing", leave=False):
        text = item['prompt'] + " " + item['response'] + tokenizer.eos_token

        encoded = tokenizer(
            text,
            truncation=True,
            max_length=max_length,
            padding='max_length'
        )

        input_ids = encoded['input_ids']
        labels = [-100 if tid == tokenizer.pad_token_id else tid for tid in input_ids]

        all_input_ids.append(input_ids)
        all_attention_mask.append(encoded['attention_mask'])
        all_labels.append(labels)

    return HFDataset.from_dict({
        'input_ids': all_input_ids,
        'attention_mask': all_attention_mask,
        'labels': all_labels
    })

"training engine"
defines how the student learns: using a learning rate of 5e-5 over 2 epochs, includes "Cleanup" code at the end to delete the model from memory after saving, preventing the GPU from crashing.

In [None]:
def train_student_model(data_size, train_responses, val_responses):
    """Train student on specific amount of data."""

    print(f"Training with {data_size} samples")


    # Subset
    train_subset = train_responses[:data_size]

    # Tokenize
    print(f"Tokenizing {len(train_subset)} training samples...")
    train_dataset = create_training_dataset(train_subset, tokenizer, config.MAX_SEQ_LENGTH)

    print(f"Tokenizing {len(val_responses)} validation samples...")
    val_dataset = create_training_dataset(val_responses, tokenizer, config.MAX_SEQ_LENGTH)

    # Load model
    print("Loading student model...")
    model = AutoModelForCausalLM.from_pretrained(config.STUDENT_MODEL)
    model.config.pad_token_id = tokenizer.eos_token_id

    # Training args
    training_args = TrainingArguments(
        output_dir=f"{config.OUTPUT_DIR}/student_{data_size}",
        num_train_epochs=config.NUM_EPOCHS,
        per_device_train_batch_size=config.BATCH_SIZE,
        per_device_eval_batch_size=config.BATCH_SIZE,
        gradient_accumulation_steps=config.GRADIENT_ACCUMULATION_STEPS,
        learning_rate=config.LEARNING_RATE,
        warmup_steps=config.WARMUP_STEPS,
        logging_steps=50,
        eval_strategy="steps",
        eval_steps=200,
        save_strategy="steps",
        save_steps=200,
        save_total_limit=1,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        fp16=torch.cuda.is_available(),
        report_to="none",
        seed=SEED,
        dataloader_num_workers=2,
        dataloader_pin_memory=True
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=default_data_collator
    )

    print(f"\nTraining ({config.NUM_EPOCHS} epochs)...")
    start_time = time.time()
    train_result = trainer.train()
    training_time = time.time() - start_time

    # Save
    model.save_pretrained(f"{config.OUTPUT_DIR}/student_{data_size}/final")
    tokenizer.save_pretrained(f"{config.OUTPUT_DIR}/student_{data_size}/final")

    print(f"✓ Complete in {training_time/60:.1f} min, loss={train_result.training_loss:.4f}")

    # Cleanup
    del model, trainer, train_dataset, val_dataset
    torch.cuda.empty_cache()

    return {
        'data_size': data_size,
        'training_time': training_time,
        'train_loss': train_result.training_loss
    }

## Train All Models

In [None]:
# Train all 5 models

print("TRAINING ALL MODELS")


training_results = []

for data_size in config.DATA_SIZES:
    result = train_student_model(data_size, train_pool, val_responses)
    training_results.append(result)


print("ALL TRAINING COMPLETE")

total_training_time = sum(r['training_time'] for r in training_results)
print(f"Total training time: {total_training_time/60:.1f} minutes\n")
for r in training_results:
    print(f"  {r['data_size']:5d} samples: {r['training_time']/60:.1f} min, loss={r['train_loss']:.4f}")


## Evaluation



### 1. Perplexity Evaluation

In [None]:
# Create test dataset
print("Creating test dataset...")
test_dataset = create_training_dataset(test_responses, tokenizer, config.MAX_SEQ_LENGTH)
print(f"Test set: {len(test_dataset)} samples")

In [None]:
def compute_perplexity(model, dataset, batch_size=16):
    """Compute perplexity."""
    model.eval()
    model.to(device)

    total_loss = 0
    total_tokens = 0

    with torch.no_grad():
        for i in tqdm(range(0, len(dataset), batch_size), desc="Computing perplexity", leave=False):
            batch = dataset[i:i+batch_size]

            inputs = {
                'input_ids': torch.tensor(batch['input_ids']).to(device),
                'attention_mask': torch.tensor(batch['attention_mask']).to(device),
                'labels': torch.tensor(batch['labels']).to(device)
            }

            outputs = model(**inputs)
            non_padding = (inputs['labels'] != -100).sum().item()

            total_loss += outputs.loss.item() * non_padding
            total_tokens += non_padding

    return np.exp(total_loss / total_tokens)

# Evaluate all models
print("\nEvaluating perplexity on test set...\n")
perplexity_results = []

for data_size in config.DATA_SIZES:
    print(f"Loading model trained on {data_size} samples...")
    model = AutoModelForCausalLM.from_pretrained(f"{config.OUTPUT_DIR}/student_{data_size}/final")

    ppl = compute_perplexity(model, test_dataset)
    perplexity_results.append({'data_size': data_size, 'perplexity': ppl})

    print(f"  {data_size:5d} samples → Perplexity: {ppl:.2f}")

    del model
    torch.cuda.empty_cache()

print("\nPerplexity evaluation complete")

### 2. Instruction-Following Adherence

In [None]:
# 50-test suite
INSTRUCTION_TESTS = [
    {"instruction": "List exactly 3 benefits of exercise.",
     "check": lambda r: 2 <= len([l for l in r.split('\n') if l.strip()]) <= 5, "type": "count"},
    {"instruction": "Give me exactly 5 reasons to learn programming.",
     "check": lambda r: 3 <= len([l for l in r.split('\n') if l.strip()]) <= 7, "type": "count"},
    {"instruction": "Name 2 countries in Europe.",
     "check": lambda r: len(r.split()) <= 15, "type": "concise"},
    {"instruction": 'Respond with valid JSON: {"name": "example", "value": 42}',
     "check": lambda r: '{' in r and '}' in r, "type": "format"},
    {"instruction": "Answer yes or no only: Is the sky blue?",
     "check": lambda r: r.strip().lower() in ['yes', 'no', 'yes.', 'no.'], "type": "binary"},
    {"instruction": "In one word, what color is grass?",
     "check": lambda r: len(r.split()) <= 3, "type": "concise"},
    {"instruction": "What is 2+2? Give only the number.",
     "check": lambda r: '4' in r and len(r.strip()) <= 10, "type": "number"},
    {"instruction": "List 3 primary colors. No explanation.",
     "check": lambda r: len(r.split()) <= 20, "type": "list"},
    {"instruction": "Translate 'hello' to Spanish. One word only.",
     "check": lambda r: len(r.split()) <= 3, "type": "concise"},
    {"instruction": "What is the capital of France? One word.",
     "check": lambda r: len(r.split()) <= 3, "type": "concise"},
]

# Duplicate to 50
while len(INSTRUCTION_TESTS) < 50:
    INSTRUCTION_TESTS.append(INSTRUCTION_TESTS[len(INSTRUCTION_TESTS) % 10])
INSTRUCTION_TESTS = INSTRUCTION_TESTS[:50]

print(f"Created {len(INSTRUCTION_TESTS)} instruction tests")

In [None]:
def evaluate_instruction_following(model, test_suite):
    """Evaluate adherence."""
    model.eval()
    model.to(device)

    results = []

    for test in tqdm(test_suite, desc="Testing", leave=False):
        prompt = create_simple_prompt(test['instruction'])
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=100,
                temperature=0.7,
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id
            )

        response = tokenizer.decode(
            outputs[0][inputs['input_ids'].shape[1]:],
            skip_special_tokens=True
        ).strip()

        try:
            passed = test['check'](response)
        except:
            passed = False

        results.append({'passed': passed, 'type': test['type']})

    pass_rate = sum(r['passed'] for r in results) / len(results)

    by_type = {}
    for test_type in set(r['type'] for r in results):
        type_results = [r for r in results if r['type'] == test_type]
        by_type[test_type] = sum(r['passed'] for r in type_results) / len(type_results)

    return {'pass_rate': pass_rate, 'by_type': by_type}

# Evaluate all
print("\nEvaluating instruction-following...\n")
adherence_results = []

for data_size in config.DATA_SIZES:
    print(f"Testing {data_size} samples...")
    model = AutoModelForCausalLM.from_pretrained(f"{config.OUTPUT_DIR}/student_{data_size}/final")

    result = evaluate_instruction_following(model, INSTRUCTION_TESTS)
    adherence_results.append({
        'data_size': data_size,
        'pass_rate': result['pass_rate']
    })

    print(f"  {data_size:5d} samples → {result['pass_rate']:.1%} pass rate")

    del model
    torch.cuda.empty_cache()

print("\nInstruction-following complete")

### 3. Efficiency Benchmarks

In [None]:
def measure_efficiency(model):
    """Measure speed and memory."""
    model.eval()
    model.to(device)

    test_prompt = create_simple_prompt("Explain AI.")
    inputs = tokenizer(test_prompt, return_tensors="pt").to(device)

    # Warmup
    with torch.no_grad():
        _ = model.generate(**inputs, max_new_tokens=10)

    # Measure
    latencies = []
    for _ in range(3):
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        start = time.time()
        with torch.no_grad():
            _ = model.generate(**inputs, max_new_tokens=128, do_sample=False)
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        latencies.append(time.time() - start)

    avg_latency = np.mean(latencies)
    tokens_per_sec = 128 / avg_latency
    model_size_mb = sum(p.numel() * p.element_size() for p in model.parameters()) / (1024**2)

    return {
        'tokens_per_sec': tokens_per_sec,
        'model_size_mb': model_size_mb
    }

print("\nMeasuring efficiency...")
model = AutoModelForCausalLM.from_pretrained(f"{config.OUTPUT_DIR}/student_{config.DATA_SIZES[-1]}/final")
efficiency = measure_efficiency(model)

print(f"Model size: {efficiency['model_size_mb']:.1f} MB")
print(f"Speed: {efficiency['tokens_per_sec']:.1f} tok/s")

del model
torch.cuda.empty_cache()

### 4. Sample Generation

In [None]:
import os
# This should print ['student_10000', 'student_15000', 'student_2000', 'student_500', 'student_5000']
print(os.listdir("/content/drive/MyDrive/ai45projectfiles/content/outputs"))

In [21]:
samples = ["What is machine learning?", "List 3 benefits of exercise."]


print("SAMPLE GENERATION COMPARISON")


for instruction in samples:
    prompt = create_simple_prompt(instruction)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    print(f"\nQ: {instruction}")


    for data_size in config.DATA_SIZES:
        model = AutoModelForCausalLM.from_pretrained(f"{config.OUTPUT_DIR}/student_{data_size}/final")
        model.to(device)

        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=80, temperature=0.7, do_sample=True, pad_token_id=tokenizer.pad_token_id)

        response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True).strip()

        print(f"\n[{data_size:5d}] {response[:150]}")

        del model
        torch.cuda.empty_cache()

SAMPLE GENERATION COMPARISON

Q: What is machine learning?

[  500] Machine learning is a technique for extracting information from data. This is a simple term but it has the potential to do more than just extract info

[ 2000] Machine learning is the next big question. For example, if we want to find out how many people have been working at Google or Facebook for more than 3

[ 5000] Machine learning is the process by which computers learn to recognize a given item from other objects or from pictures. This allows them to predict th

[10000] Machine learning is a way to identify patterns that people have seen in data. In this case, if you were looking at a picture of a person, you'd say th

[15000] A machine learning system can learn about any kind of information about a person or document using only basic knowledge about it. If an existing docum

Q: List 3 benefits of exercise.

[  500] Exercise can be very beneficial for your health, but it also has other benefits. Exercise can help 

500 Samples (The Safe Baseline): The model is coherent but generic. It provides standard definitions but completely ignores the **"List 3"** instruction, providing a paragraph instead.

2,000 Samples (The Hallucination Dip): This represents a "confusion" phase. The model drifts significantly from the topic (e.g., mentioning Google/Facebook employees), showing that low-to-mid amounts of data can sometimes introduce **noise** before the model "understands" the task.

5,000 Samples (The Formatting Breakthrough): This is the instruction-following sweet spot. For the first time, the model recognizes it must provide a **structured** list (using B, C, D markers). While the text cuts off, the structural understanding is significantly better than previous versions.

10,000 - 15,000 Samples (Diminishing Returns): While definitions become more sophisticated, the model begins to suffer from **repetition and verbosity**. In the 15k exercise sample, it repeats "increases alertness and energy" twice, suggesting that for a model this small (124M), more data eventually leads to **"overfitting" on specific patterns rather than smarter reasoning.**

## Results & Visualization

In [None]:
# Compile results
results_df = pd.DataFrame({
    'Data Size': config.DATA_SIZES,
    'Perplexity': [r['perplexity'] for r in perplexity_results],
    'Adherence (%)': [r['pass_rate'] * 100 for r in adherence_results],
    'Training Time (min)': [r['training_time'] / 60 for r in training_results]
})


print("FINAL RESULTS")

print(results_df.to_string(index=False))

results_df.to_csv(f"{config.OUTPUT_DIR}/scaling_results.csv", index=False)
print(f"\nSaved to {config.OUTPUT_DIR}/scaling_results.csv")

In [None]:
# Visualizations
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
fig.suptitle('Data Scaling Study: GPT-2 Small (124M)', fontsize=16, fontweight='bold')

# Perplexity
axes[0, 0].plot(results_df['Data Size'], results_df['Perplexity'], 'o-', linewidth=2, markersize=8)
axes[0, 0].set_xlabel('Training Samples', fontweight='bold')
axes[0, 0].set_ylabel('Perplexity (↓)', fontweight='bold')
axes[0, 0].set_title('Quality: Perplexity Scaling')
axes[0, 0].grid(alpha=0.3)
axes[0, 0].set_xscale('log')

# Adherence
axes[0, 1].plot(results_df['Data Size'], results_df['Adherence (%)'], 'o-', color='green', linewidth=2, markersize=8)
axes[0, 1].set_xlabel('Training Samples', fontweight='bold')
axes[0, 1].set_ylabel('Pass Rate (%)', fontweight='bold')
axes[0, 1].set_title('Instruction-Following')
axes[0, 1].grid(alpha=0.3)
axes[0, 1].set_xscale('log')
axes[0, 1].set_ylim([0, 100])

# Training time
axes[1, 0].bar(results_df['Data Size'].astype(str), results_df['Training Time (min)'], color='orange')
axes[1, 0].set_xlabel('Training Samples', fontweight='bold')
axes[1, 0].set_ylabel('Time (min)', fontweight='bold')
axes[1, 0].set_title('Training Efficiency')
axes[1, 0].grid(axis='y', alpha=0.3)

# Combined
norm_ppl = 1 / (results_df['Perplexity'] / results_df['Perplexity'].max())
norm_adh = results_df['Adherence (%)'] / 100
combined = (norm_ppl + norm_adh) / 2

axes[1, 1].plot(results_df['Data Size'], combined, 'o-', color='purple', linewidth=2, markersize=8)
axes[1, 1].set_xlabel('Training Samples', fontweight='bold')
axes[1, 1].set_ylabel('Combined Score', fontweight='bold')
axes[1, 1].set_title('Overall Quality')
axes[1, 1].grid(alpha=0.3)
axes[1, 1].set_xscale('log')

plt.tight_layout()
plt.savefig(f"{config.OUTPUT_DIR}/scaling_analysis.png", dpi=300, bbox_inches='tight')
plt.show()

print(f"\nPlots saved to {config.OUTPUT_DIR}/scaling_analysis.png")