# 🧪 M2.5e: Curriculum Learning Optimization

**Observation from M2.5d:** Curriculum reached loss 1.1615 at step 7, then jumped back to 1.2633.

**Hypothesis:** We can capture that optimal point by:
1. **Checkpointing** — save model at each step, pick the best
2. **LR Decay** — reduce learning rate when switching to hard examples
3. **Gradual Curriculum** — smoother transition from easy to hard

---

## Step 1: Setup

In [None]:
!pip install -q transformers datasets peft accelerate bitsandbytes
!pip install -q sentence-transformers scikit-learn
!pip install -q tqdm numpy matplotlib

In [None]:
import torch
import numpy as np
import random
from datasets import load_dataset, concatenate_datasets
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances

# Report whether a CUDA device is visible and, if one is, which device it is.
has_cuda = torch.cuda.is_available()
print(f"CUDA available: {has_cuda}")
if has_cuda:
    print(f"GPU: {torch.cuda.get_device_name(0)}")

## Step 2: Prepare Data (Same as M2.5d)

In [None]:
# Experiment sizes: how many candidates are sampled from the dataset, how many
# samples each curriculum keeps, and how many clusters are requested.
POOL_SIZE = 2000
SUBSET_SIZE = 500
N_CLUSTERS = 500

# Fetch the Alpaca training split.
print("Loading Alpaca dataset...")
dataset = load_dataset("tatsu-lab/alpaca", split="train")

# Draw a reproducible random pool (the global `random` module is seeded with 42).
random.seed(42)
pool_indices = random.sample(range(len(dataset)), k=POOL_SIZE)
pool = dataset.select(pool_indices)

def format_example(example):
    """Flatten one instruction record into a single newline-separated string.

    The result always has an ``Instruction:`` line and an ``Output:`` line; an
    ``Input:`` line is placed between them only when the record carries a
    non-empty ``input`` value.

    Args:
        example: Mapping with ``instruction`` and ``output`` keys and an
            optional ``input`` key.

    Returns:
        The formatted text.
    """
    parts = [f"Instruction: {example['instruction']}"]
    if example.get('input'):
        parts.append(f"Input: {example['input']}")
    parts.append(f"Output: {example['output']}")
    return "\n".join(parts)

# Render every pooled record as one flat text string.
pool_texts = list(map(format_example, pool))

# Embed the rendered texts with the all-MiniLM-L6-v2 sentence encoder.
print("Encoding with SentenceTransformer...")
st_model = SentenceTransformer('all-MiniLM-L6-v2')
st_embeddings = st_model.encode(pool_texts, show_progress_bar=True)

In [None]:
# HDC Encoder
class TernaryHDCEncoder:
    """Project dense embeddings into a high-dimensional ternary (-1/0/+1) code.

    A fixed Gaussian random projection maps ``input_dim`` features onto
    ``hd_dim`` dimensions. Each projected vector is then sparsified: components
    whose magnitude is at or below a per-vector percentile cut-off become 0,
    the rest keep only their sign.

    Args:
        input_dim: Width of the incoming embeddings.
        hd_dim: Width of the ternary output code.
        sparsity: Fraction (0-1) used for the cut-off; the threshold is the
            ``sparsity * 100``-th percentile of the absolute projected values.
        seed: Seed for the random projection matrix.
    """

    def __init__(self, input_dim=384, hd_dim=10000, sparsity=0.7, seed=42):
        self.hd_dim = hd_dim
        self.sparsity = sparsity
        # A private RandomState produces the same matrix as seeding the global
        # legacy RNG, without resetting np.random for the rest of the program.
        rng = np.random.RandomState(seed)
        self.projection = rng.randn(input_dim, hd_dim).astype(np.float32)
        self.projection /= np.sqrt(input_dim)

    def encode(self, embeddings):
        """Return the ternary code of ``embeddings``.

        Args:
            embeddings: Array of shape ``(n, input_dim)``, or a single vector
                of shape ``(input_dim,)``.

        Returns:
            Array holding only -1, 0 and 1, shaped ``(n, hd_dim)`` (or
            ``(hd_dim,)`` for a single vector), in the dtype of the projected
            product.
        """
        embeddings = np.asarray(embeddings)
        is_single_vector = embeddings.ndim == 1
        projected = np.atleast_2d(embeddings) @ self.projection

        if projected.shape[0] == 0:
            # Nothing to threshold; keep the (0, hd_dim) shape.
            return np.zeros_like(projected)

        # One magnitude cut-off per row, computed for all rows at once.
        threshold = np.percentile(
            np.abs(projected), self.sparsity * 100, axis=1, keepdims=True
        )
        ternary = np.where(
            projected > threshold, 1, np.where(projected < -threshold, -1, 0)
        ).astype(projected.dtype)

        return ternary[0] if is_single_vector else ternary

# Turn the sentence embeddings into ternary hypervectors.
print("Encoding with HDC...")
hdc_encoder = TernaryHDCEncoder()
hdc_embeddings = hdc_encoder.encode(st_embeddings)

# Partition the hypervectors into N_CLUSTERS groups.
print("Clustering...")
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(hdc_embeddings)

# Distance from every point to every centroid; then keep, per point, only the
# distance to the centroid of the cluster it was assigned to.
distances_to_centroids = pairwise_distances(hdc_embeddings, kmeans.cluster_centers_)
row_ids = np.arange(len(hdc_embeddings))
point_distances = distances_to_centroids[row_ids, cluster_labels]

print(f"Distance range: {point_distances.min():.2f} - {point_distances.max():.2f}")

In [None]:
# Select centroid and boundary samples
def select_from_clusters(strategy='centroid'):
    """Pick one pool index per non-empty cluster.

    Reads the notebook globals ``cluster_labels``, ``point_distances``,
    ``N_CLUSTERS`` and ``SUBSET_SIZE``.

    Args:
        strategy: ``'centroid'`` picks the member closest to its cluster
            centroid; any other value picks the member farthest from it
            (the "boundary" sample).

    Returns:
        A list of at most ``SUBSET_SIZE`` distinct indices, in the order
        produced by converting the picks to a set and back to a list.
    """
    # Closest member for 'centroid', farthest member for everything else.
    pick_position = np.argmin if strategy == 'centroid' else np.argmax

    chosen = []
    for cluster_id in range(N_CLUSTERS):
        members = np.flatnonzero(cluster_labels == cluster_id)
        if members.size == 0:
            continue
        chosen.append(members[pick_position(point_distances[members])])

    unique_picks = list(set(chosen))
    return unique_picks[:SUBSET_SIZE]

# One representative per cluster under each strategy (centroid first, then boundary).
centroid_indices, boundary_indices = (
    select_from_clusters(strategy) for strategy in ('centroid', 'boundary')
)

print(f"Centroid samples: {len(centroid_indices)}")
print(f"Boundary samples: {len(boundary_indices)}")

## Step 3: Create Curriculum Datasets

We'll create several curriculum variants:
1. **Sharp Curriculum** — 250 easy, then 250 hard (original)
2. **Gradual Curriculum** — sorted by difficulty (easiest → hardest)
3. **3-Phase Curriculum** — easy → medium → hard

In [None]:
# Rank every pool sample by the distance to its own cluster centroid.
# A small distance is treated as "easy" (typical), a large one as "hard" (unusual).
all_indices = list(range(POOL_SIZE))
sorted_by_difficulty = sorted(all_indices, key=point_distances.__getitem__)

# Keep the SUBSET_SIZE lowest-distance samples, already in ascending order.
gradual_indices = sorted_by_difficulty[:SUBSET_SIZE]

# Report the distance spread of that selection.
gradual_distances = point_distances[gradual_indices]
print(
    f"Gradual curriculum difficulty range: "
    f"{gradual_distances.min():.3f} - {gradual_distances.max():.3f}"
)
print(f"Mean difficulty: {gradual_distances.mean():.3f}")

In [None]:
# Build the three curriculum datasets. Row order inside each dataset is the
# intended presentation order (easier samples first).

# 1. Sharp Curriculum (original M2.5d): centroid samples followed by boundary samples.
n_half = SUBSET_SIZE // 2
sharp_easy_indices = centroid_indices[:n_half]
# A cluster with a single member yields the same sample as both its centroid
# pick and its boundary pick. Skip boundary picks already used in the easy
# half so no example appears twice; when there is no overlap this is exactly
# boundary_indices[:n_half].
sharp_easy_lookup = set(sharp_easy_indices)
sharp_hard_indices = [i for i in boundary_indices if i not in sharp_easy_lookup][:n_half]
sharp_curriculum_indices = sharp_easy_indices + sharp_hard_indices
sharp_curriculum = pool.select(sharp_curriculum_indices)

# 2. Gradual Curriculum: sorted by difficulty
gradual_curriculum = pool.select(gradual_indices)

# 3. 3-Phase Curriculum: 200 easy + 150 medium + 150 hard
n_easy = 200
n_medium = 150
n_hard = 150

# Easy = lowest distances, hard = highest distances, medium = a window that
# starts one third of the way through the difficulty ranking.
easy_indices = sorted_by_difficulty[:n_easy]
medium_indices = sorted_by_difficulty[POOL_SIZE//3 : POOL_SIZE//3 + n_medium]
hard_indices = sorted_by_difficulty[-n_hard:]

three_phase_indices = easy_indices + medium_indices + hard_indices
three_phase_curriculum = pool.select(three_phase_indices)

print(f"Sharp curriculum: {len(sharp_curriculum)} samples")
print(f"Gradual curriculum: {len(gradual_curriculum)} samples")
print(f"3-Phase curriculum: {len(three_phase_curriculum)} samples")

## Step 4: Load Model

In [None]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig,
    TrainerCallback
)
from peft import LoraConfig, get_peft_model, TaskType

MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

print(f"Loading {MODEL_NAME}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Use the EOS token for padding so fixed-length batches can be built.
tokenizer.pad_token = tokenizer.eos_token

# 4-bit NF4 weight quantization with fp16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# trust_remote_code is intentionally left at its default (False): it lets a
# hub repository run arbitrary Python on this machine, and this checkpoint is
# a Llama-architecture model that transformers loads with its built-in classes.
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
)
print("Model loaded!")

# LoRA adapter settings: rank 8, applied to the attention projections.
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

In [None]:
# Tokenization
def tokenize_function(examples):
    """Render a batch of instruction records as prompts and tokenize them.

    Uses the notebook-level ``tokenizer``.

    Args:
        examples: Batch mapping with parallel ``instruction``, ``input`` and
            ``output`` lists.

    Returns:
        The tokenizer output (truncated/padded to 512 tokens) with an added
        ``labels`` entry that is a copy of ``input_ids``.
    """
    def render(idx, instruction):
        # The "### Input" section is included only for a non-empty input.
        prompt = f"### Instruction:\n{instruction}\n\n"
        if examples['input'][idx]:
            prompt += f"### Input:\n{examples['input'][idx]}\n\n"
        return prompt + f"### Response:\n{examples['output'][idx]}"

    texts = [render(idx, instruction)
             for idx, instruction in enumerate(examples['instruction'])]

    tokenized = tokenizer(texts, truncation=True, max_length=512, padding="max_length")
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

def _tokenize_split(split):
    """Tokenize one curriculum split, dropping its original columns."""
    return split.map(tokenize_function, batched=True, remove_columns=split.column_names)


print("Tokenizing...")
sharp_tokenized = _tokenize_split(sharp_curriculum)
gradual_tokenized = _tokenize_split(gradual_curriculum)
three_phase_tokenized = _tokenize_split(three_phase_curriculum)
print("Done!")

## Step 5: Custom Training with Best-Loss Tracking

In [None]:
# Custom callback to track all losses and find minimum
class LossTrackingCallback(TrainerCallback):
    """Trainer callback that records every logged training loss.

    After training, ``losses`` and ``steps`` hold each logged loss and the
    global step it was logged at, while ``best_loss`` / ``best_step`` hold the
    lowest logged loss and its step.
    """

    def __init__(self):
        self.losses, self.steps = [], []
        self.best_loss = float('inf')
        self.best_step = 0

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Store ``logs['loss']`` when present and update the running minimum."""
        # Log events without a 'loss' entry are ignored.
        if not logs or 'loss' not in logs:
            return

        current_loss, current_step = logs['loss'], state.global_step
        self.losses.append(current_loss)
        self.steps.append(current_step)

        if current_loss < self.best_loss:
            self.best_loss, self.best_step = current_loss, current_step
            print(f"  üìç New best loss: {current_loss:.4f} at step {current_step}")

In [None]:
def train_with_tracking(train_dataset, run_name, use_lr_decay=False, preserve_order=True):
    """Fine-tune a LoRA adapter on ``train_dataset`` and track the logged loss.

    Uses the notebook-level ``base_model``, ``lora_config`` and ``tokenizer``.

    Args:
        train_dataset: Tokenized dataset whose row order is the curriculum order.
        run_name: Label used in console output and in the output directory name.
        use_lr_decay: Use a cosine LR schedule instead of a constant one.
        preserve_order: Feed samples in dataset order. ``Trainer`` draws training
            samples with a random sampler by default, which would erase the
            easy-to-hard ordering the curricula are built around. Pass ``False``
            to get the default shuffling back.

    Returns:
        dict with keys ``name``, ``final_loss``, ``best_loss``, ``best_step``,
        ``all_losses``, ``all_steps`` and ``lr_decay``. The losses are the
        training losses reported through the logging callback (every 5 steps).
    """
    import gc

    from torch.utils.data import SequentialSampler

    class _OrderedTrainer(Trainer):
        """Trainer variant that visits training rows in their stored order."""

        def _get_train_sampler(self, train_dataset=None):
            # Some transformers releases pass the dataset in, others do not.
            dataset = self.train_dataset if train_dataset is None else train_dataset
            return SequentialSampler(dataset)

    print(f"\n{'='*60}")
    print(f"Training: {run_name}")
    if use_lr_decay:
        print("Using LR decay (cosine schedule)")
    print(f"{'='*60}")

    model = get_peft_model(base_model, lora_config)
    trainer = None
    try:
        # LR schedule: constant vs cosine decay.
        # NOTE(review): the plain "constant" schedule does not apply warmup_steps,
        # so the two settings also differ in warmup - confirm this is intended.
        lr_scheduler_type = "cosine" if use_lr_decay else "constant"

        training_args = TrainingArguments(
            output_dir=f"./results_{run_name}",
            num_train_epochs=3,
            per_device_train_batch_size=4,
            gradient_accumulation_steps=4,
            learning_rate=2e-4,
            lr_scheduler_type=lr_scheduler_type,
            warmup_steps=20,
            logging_steps=5,  # More frequent logging to catch the minimum
            save_strategy="no",
            fp16=True,
            report_to="none",
            seed=42
        )

        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

        # Records every logged loss and the step of the minimum.
        tracker = LossTrackingCallback()

        trainer_cls = _OrderedTrainer if preserve_order else Trainer
        trainer = trainer_cls(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            data_collator=data_collator,
            callbacks=[tracker]
        )

        trainer.train()

        final_loss = tracker.losses[-1] if tracker.losses else float('inf')

        # Relative gap between the last and the lowest logged loss; reported as
        # 0 when nothing was logged or the final loss is 0 (nothing to divide by).
        if tracker.losses and final_loss:
            potential = ((final_loss - tracker.best_loss) / final_loss) * 100
        else:
            potential = 0.0

        print(f"\nüìä Summary:")
        print(f"  Final loss: {final_loss:.4f}")
        print(f"  Best loss: {tracker.best_loss:.4f} at step {tracker.best_step}")
        print(f"  Potential improvement: {potential:.1f}%")

        return {
            'name': run_name,
            'final_loss': final_loss,
            'best_loss': tracker.best_loss,
            'best_step': tracker.best_step,
            'all_losses': tracker.losses,
            'all_steps': tracker.steps,
            'lr_decay': use_lr_decay
        }
    finally:
        # get_peft_model injects the LoRA layers into base_model in place.
        # Remove them (without merging) so the next run starts from the
        # untouched base weights, even if this run failed part-way.
        model.unload()
        del trainer, model
        gc.collect()
        torch.cuda.empty_cache()

## Step 6: Run Experiments

In [None]:
# Experiment plan: (results key, tokenized dataset, run name, use LR decay).
# Each curriculum is trained once with a constant LR and once with cosine decay.
experiment_plan = (
    ('sharp', sharp_tokenized, 'sharp_curriculum', False),                      # baseline from M2.5d
    ('sharp_lr_decay', sharp_tokenized, 'sharp_lr_decay', True),
    ('gradual', gradual_tokenized, 'gradual_curriculum', False),                # sorted by difficulty
    ('gradual_lr_decay', gradual_tokenized, 'gradual_lr_decay', True),
    ('three_phase', three_phase_tokenized, 'three_phase', False),
    ('three_phase_lr_decay', three_phase_tokenized, 'three_phase_lr_decay', True),
)

results = {}
for result_key, tokenized_split, run_label, decay_flag in experiment_plan:
    results[result_key] = train_with_tracking(tokenized_split, run_label, use_lr_decay=decay_flag)

## Step 7: Results

In [None]:
import matplotlib.pyplot as plt

# One panel per experiment: the loss curve plus a marker on its lowest logged loss.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

# Line colour per experiment (light shade = same curriculum with LR decay).
colors = dict(
    sharp='blue',
    sharp_lr_decay='lightblue',
    gradual='green',
    gradual_lr_decay='lightgreen',
    three_phase='red',
    three_phase_lr_decay='salmon',
)

for idx, name in enumerate(results):
    data = results[name]
    ax = axes[idx]
    best_step, best_loss = data['best_step'], data['best_loss']

    ax.plot(data['all_steps'], data['all_losses'], color=colors[name], linewidth=2)
    # Horizontal guide and star at the best logged loss.
    ax.axhline(y=best_loss, color='gold', linestyle='--', alpha=0.7)
    ax.scatter([best_step], [best_loss], color='gold', s=100, zorder=5, marker='*')

    ax.set_title(f"{name}\nFinal: {data['final_loss']:.4f}, Best: {best_loss:.4f}")
    ax.set_xlabel('Step')
    ax.set_ylabel('Loss')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('m2.5e_all_experiments.png', dpi=150)
plt.show()

In [None]:
# All experiments on one axis, plotted against the index of the log event.
plt.figure(figsize=(12, 6))

for name in results:
    data = results[name]
    curve_label = f"{name} (best: {data['best_loss']:.4f})"
    plt.plot(data['all_losses'], label=curve_label, color=colors[name], linewidth=2)

# Horizontal reference lines from the earlier phases.
reference_lines = (
    (1.2194, 'black', 'M2.5c Best (1.2194)'),
    (1.1615, 'gold', 'M2.5d Observed Min (1.1615)'),
)
for level, line_color, line_label in reference_lines:
    plt.axhline(y=level, color=line_color, linestyle='--', alpha=0.5, label=line_label)

plt.xlabel('Logging Step')
plt.ylabel('Loss')
plt.title('M2.5e: Curriculum Learning Optimization\nFinding the Optimal Point')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('m2.5e_combined.png', dpi=150)
plt.show()

In [None]:
# Summary table: one row per experiment, ordered from lowest to highest best loss.
banner = "=" * 80
print("\n" + banner)
print("üìä M2.5e RESULTS: CURRICULUM OPTIMIZATION")
print(banner)

PREV_BEST = 1.2194  # M2.5c
OBSERVED_MIN = 1.1615  # From M2.5d graph

table_header = f"\n{'Strategy':<25} {'Final Loss':>12} {'Best Loss':>12} {'Best Step':>10} {'vs M2.5c':>12}"
print(table_header)
print("-" * 75)

# Ascending by best loss, so the first entry is the overall winner.
sorted_results = sorted(results.items(), key=lambda item: item[1]['best_loss'])

for name, data in sorted_results:
    run_best = data['best_loss']
    vs_prev = (PREV_BEST - run_best) / PREV_BEST * 100
    # Trophy marks runs that beat the M2.5c reference.
    status = "üèÜ" if run_best < PREV_BEST else ""
    print(f"{name:<25} {data['final_loss']:>12.4f} {run_best:>12.4f} {data['best_step']:>10} {vs_prev:>+11.2f}% {status}")

print("\nReference points:")
print(f"  M2.5c HDC-Curated: {PREV_BEST}")
print(f"  M2.5d Observed minimum: {OBSERVED_MIN}")

# Overall winner = first row of the sorted table.
best_name, best_data = sorted_results[0]
overall_gain = (PREV_BEST - best_data['best_loss']) / PREV_BEST * 100
print(f"\nüèÜ OVERALL BEST: {best_name}")
print(f"   Best loss: {best_data['best_loss']:.4f} at step {best_data['best_step']}")
print(f"   Improvement over M2.5c: {overall_gain:.2f}%")

In [None]:
# Key insight analysis: LR-decay effect, best curriculum type, early-stopping gap.
rule = "=" * 80
print("\n" + rule)
print("üî¨ KEY INSIGHTS")
print(rule)

curriculum_types = ['sharp', 'gradual', 'three_phase']

# 1. Best loss of each curriculum without vs with LR decay.
print("\n1. Effect of LR Decay:")
for base in curriculum_types:
    no_decay = results[base]['best_loss']
    with_decay = results[f'{base}_lr_decay']['best_loss']
    diff = (no_decay - with_decay) / no_decay * 100
    if with_decay < no_decay:
        better = "‚úÖ LR decay helps"
    else:
        better = "‚ùå LR decay hurts"
    print(f"   {base}: {no_decay:.4f} ‚Üí {with_decay:.4f} ({diff:+.2f}%) {better}")

# 2. Lowest best loss per curriculum type, across both LR settings.
print("\n2. Best curriculum type:")
for ct in curriculum_types:
    candidates = (results[ct]['best_loss'], results[f'{ct}_lr_decay']['best_loss'])
    best_of_type = min(candidates)
    print(f"   {ct}: {best_of_type:.4f}")

# 3. Runs whose final loss is more than 1% above their best logged loss.
print("\n3. Early Stopping Potential (Final vs Best):")
for name, data in sorted_results:
    final, best = data['final_loss'], data['best_loss']
    potential = (final - best) / final * 100
    if potential > 1:
        print(f"   {name}: Could save {potential:.1f}% by stopping at step {data['best_step']}")

In [None]:
# Save results
import json

def _pct_vs_m25c(loss):
    """Improvement of ``loss`` relative to PREV_BEST, in percent."""
    return float(((PREV_BEST - loss) / PREV_BEST) * 100)


# Per-experiment summary, converted to plain Python numbers for JSON.
per_run_summary = {}
for name, data in results.items():
    per_run_summary[name] = {
        "final_loss": float(data['final_loss']),
        "best_loss": float(data['best_loss']),
        "best_step": int(data['best_step']),
        "lr_decay": data['lr_decay'],
        "vs_m2.5c_pct": _pct_vs_m25c(data['best_loss']),
    }

overall_best_loss = best_data['best_loss']

output = {
    "phase": "M2.5e",
    "experiment": "Curriculum Learning Optimization",
    "goal": "Capture the optimal point observed in M2.5d (1.1615)",
    "previous_best": PREV_BEST,
    "observed_minimum_m2.5d": OBSERVED_MIN,
    "results": per_run_summary,
    "overall_best": {
        "strategy": best_name,
        "best_loss": float(overall_best_loss),
        "best_step": int(best_data['best_step']),
        "improvement_over_m2.5c_pct": _pct_vs_m25c(overall_best_loss),
    },
    "beat_m2.5c": overall_best_loss < PREV_BEST,
    "beat_observed_minimum": overall_best_loss < OBSERVED_MIN,
}

# Write the report to disk, then echo the same JSON to the console.
with open('phase_m2.5e_results.json', 'w') as f:
    json.dump(output, f, indent=2)

print("\nüìÅ Results saved to phase_m2.5e_results.json")
print("\n" + json.dumps(output, indent=2))

In [None]:
# Download
from google.colab import files
# Offer the JSON report and both figures as browser downloads.
for artifact in (
    'phase_m2.5e_results.json',
    'm2.5e_combined.png',
    'm2.5e_all_experiments.png',
):
    files.download(artifact)