# 🧪 Resonance Protocol — M2.5d: Cluster Sampling Strategies

**Hypothesis 3:** Different sampling strategies from HDC clusters may improve fine-tuning results.

**Strategies to test:**
1. **Centroid** (baseline) — closest to cluster center (most typical)
2. **Boundary** — farthest from cluster center (most diverse/difficult)
3. **Mixed** — 70% centroids + 30% boundary
4. **Curriculum** — train on centroids first, then boundary

---

## Step 1: Setup

In [None]:
!pip install -q transformers datasets peft accelerate bitsandbytes
!pip install -q sentence-transformers scikit-learn
!pip install -q tqdm numpy matplotlib

In [None]:
import torch
# Report whether a CUDA device is visible and, if so, which one.
has_cuda = torch.cuda.is_available()
print(f"CUDA available: {has_cuda}")
if has_cuda:
    gpu_name = torch.cuda.get_device_name(0)
    print(f"GPU: {gpu_name}")

## Step 2: Load Data and Create HDC Embeddings

In [None]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
import numpy as np
import random

# Experiment sizes: candidate pool drawn from Alpaca, examples kept per
# strategy, and number of k-means clusters built over the pool.
POOL_SIZE = 2000
SUBSET_SIZE = 500
N_CLUSTERS = 500

# Fetch the Alpaca training split.
print("Loading Alpaca dataset...")
dataset = load_dataset("tatsu-lab/alpaca", split="train")

# Draw a seeded sample of row indices so the pool is reproducible.
random.seed(42)
pool_indices = random.sample(range(dataset.num_rows), POOL_SIZE)
pool = dataset.select(pool_indices)
print(f"Pool size: {pool.num_rows}")

In [None]:
# Format texts
def format_example(example):
    """Render one record as "Instruction / Input / Output" text.

    The "Input:" line is only emitted when the record carries a non-empty
    ``input`` value; the three sections are separated by newlines.
    """
    sections = [f"Instruction: {example['instruction']}"]
    extra_input = example.get('input')
    if extra_input:
        sections.append(f"Input: {extra_input}")
    sections.append(f"Output: {example['output']}")
    return "\n".join(sections)

# Render every pool record to plain text for the sentence encoder.
pool_texts = list(map(format_example, pool))

# Embed the rendered texts with a SentenceTransformer model.
print("\nEncoding with SentenceTransformer...")
st_model = SentenceTransformer('all-MiniLM-L6-v2')
st_embeddings = st_model.encode(pool_texts, show_progress_bar=True)
print(f"ST embeddings shape: {st_embeddings.shape}")

In [None]:
# HDC Encoder
class TernaryHDCEncoder:
    """Project dense embeddings into sparse ternary {-1, 0, +1} hypervectors.

    A fixed Gaussian random projection maps ``input_dim`` features to
    ``hd_dim`` dimensions. Each projected vector is then thresholded at its
    own ``sparsity`` percentile of absolute values: components whose
    magnitude exceeds the threshold keep their sign, the rest become 0.
    """

    def __init__(self, input_dim=384, hd_dim=10000, sparsity=0.7, seed=42):
        """Build the projection matrix.

        Args:
            input_dim: Dimensionality of the incoming embeddings.
            hd_dim: Dimensionality of the produced hypervectors.
            sparsity: Fraction (0..1) of components zeroed per vector.
            seed: Seed for the projection matrix.
        """
        self.hd_dim = hd_dim
        self.sparsity = sparsity
        # A private RandomState yields the same matrix as seeding the global
        # NumPy RNG, without resetting random state for the rest of the
        # notebook.
        rng = np.random.RandomState(seed)
        self.projection = rng.randn(input_dim, hd_dim).astype(np.float32)
        self.projection /= np.sqrt(input_dim)

    def encode(self, embeddings):
        """Encode one embedding or a batch of embeddings.

        Args:
            embeddings: Array of shape ``(n, input_dim)``, or a single vector
                of shape ``(input_dim,)``.

        Returns:
            Array of -1/0/+1 values with shape ``(n, hd_dim)``, or
            ``(hd_dim,)`` when a single vector was given.
        """
        embeddings = np.asarray(embeddings)
        # A lone vector used to be iterated component by component, which
        # produced an all-zero result; treat it as a batch of one instead.
        single_vector = embeddings.ndim == 1
        projected = np.atleast_2d(embeddings) @ self.projection

        magnitudes = np.abs(projected)
        # One threshold per row, kept as a column so it broadcasts.
        thresholds = np.percentile(
            magnitudes, self.sparsity * 100, axis=1, keepdims=True
        )
        ternary = np.where(magnitudes > thresholds, np.sign(projected), 0)
        ternary = ternary.astype(projected.dtype, copy=False)
        return ternary[0] if single_vector else ternary

# Turn the sentence embeddings into ternary hypervectors.
print("\nEncoding with HDC...")
hdc_encoder = TernaryHDCEncoder()
hdc_embeddings = hdc_encoder.encode(st_embeddings)
zero_fraction = np.mean(hdc_embeddings == 0)
print(f"HDC embeddings shape: {hdc_embeddings.shape}")
print(f"Sparsity: {zero_fraction:.1%}")

## Step 3: Create Different Sampling Strategies

In [None]:
# Partition the pool into N_CLUSTERS groups with k-means over the HDC vectors.
print("Clustering in HDC space...")
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(hdc_embeddings)
print(f"Created {N_CLUSTERS} clusters")

# Distance from every point (row) to every cluster centroid (column).
distances_to_centroids = pairwise_distances(hdc_embeddings, kmeans.cluster_centers_)

# Keep, for each point, only the distance to the centroid of its own cluster.
row_ids = np.arange(len(hdc_embeddings))
point_distances = distances_to_centroids[row_ids, cluster_labels]

print(f"Distance range: {point_distances.min():.2f} - {point_distances.max():.2f}")

In [None]:
def select_from_clusters(strategy='centroid', n_samples=500, labels=None, distances=None):
    """Pick one representative point per cluster according to ``strategy``.

    Strategies:
    - 'centroid': the member closest to its cluster center (most typical)
    - 'boundary': the member farthest from its cluster center (most diverse)
    - 'mixed': 70% of the chosen clusters contribute their centroid pick,
      the remaining 30% their boundary pick

    Args:
        strategy: One of 'centroid', 'boundary' or 'mixed'.
        n_samples: Maximum number of indices to return.
        labels: Cluster id of every point. Defaults to the module-level
            ``cluster_labels``.
        distances: Distance of every point to its own cluster center.
            Defaults to the module-level ``point_distances``.

    Returns:
        A list of point indices (plain ints) in ascending order, at most
        ``min(n_samples, number of non-empty clusters)`` long.

    Raises:
        ValueError: If ``strategy`` is unknown or ``labels`` and
            ``distances`` differ in shape.
    """
    if strategy not in ('centroid', 'boundary', 'mixed'):
        # Previously an unknown name silently produced an empty selection.
        raise ValueError(
            f"Unknown strategy {strategy!r}; expected 'centroid', 'boundary' or 'mixed'"
        )

    labels = np.asarray(cluster_labels if labels is None else labels)
    distances = np.asarray(point_distances if distances is None else distances)
    if labels.shape != distances.shape:
        raise ValueError("labels and distances must have the same shape")

    # (centroid pick, boundary pick) for every non-empty cluster, in
    # ascending cluster-id order.
    picks = []
    for cluster_id in np.unique(labels):
        members = np.flatnonzero(labels == cluster_id)
        member_distances = distances[members]
        picks.append((
            int(members[np.argmin(member_distances)]),
            int(members[np.argmax(member_distances)]),
        ))

    budget = max(0, min(n_samples, len(picks)))

    if strategy == 'centroid':
        chosen = [closest for closest, _ in picks]
    elif strategy == 'boundary':
        chosen = [farthest for _, farthest in picks]
    else:
        # A private, seeded generator keeps the assignment reproducible
        # without reseeding the global ``random`` module.
        random.Random(42).shuffle(picks)
        picks = picks[:budget]
        # Split 70/30 over the clusters actually used, so the ratio holds
        # even when there are fewer (or more) clusters than n_samples.
        n_centroid = int(budget * 0.7)
        chosen = [closest for closest, _ in picks[:n_centroid]]
        chosen += [farthest for _, farthest in picks[n_centroid:]]

    # Clusters are disjoint, so the picks are already unique; sorting gives
    # a well-defined order before truncation.
    return sorted(chosen)[:budget]

# Build one index list per single-pass strategy, in a fixed order.
(centroid_indices,
 boundary_indices,
 mixed_indices) = (select_from_clusters(kind, SUBSET_SIZE)
                   for kind in ('centroid', 'boundary', 'mixed'))

# Report how many examples each strategy produced.
n_centroid_picked = len(centroid_indices)
n_boundary_picked = len(boundary_indices)
n_mixed_picked = len(mixed_indices)
print(f"Centroid subset: {n_centroid_picked} samples")
print(f"Boundary subset: {n_boundary_picked} samples")
print(f"Mixed subset: {n_mixed_picked} samples")

In [None]:
# Create curriculum dataset (centroid first half, boundary second half).
# The two phases are concatenated into one dataset further down.
# NOTE(review): Hugging Face Trainer presumably shuffles training examples by
# default, in which case the phase order is not preserved during training —
# confirm before drawing conclusions about the curriculum strategy.

n_half = SUBSET_SIZE // 2
curriculum_centroid_indices = centroid_indices[:n_half]

# A cluster with a single member yields the same point as both its centroid
# and its boundary pick. Skip anything already used in phase 1 so the
# curriculum set does not contain duplicated examples.
_phase1_members = set(curriculum_centroid_indices)
curriculum_boundary_indices = [
    idx for idx in boundary_indices if idx not in _phase1_members
][:n_half]

print(f"\nCurriculum strategy:")
print(f"  Phase 1 (easy/centroid): {len(curriculum_centroid_indices)} samples")
print(f"  Phase 2 (hard/boundary): {len(curriculum_boundary_indices)} samples")

In [None]:
# Analyze the strategies: summarise how far each strategy's picks sit from
# the centroid of their own cluster.
# The header emoji was mis-encoded in the source and is restored here.
print("\n📊 Strategy Analysis:")
print("="*50)

for name, indices in [('Centroid', centroid_indices),
                       ('Boundary', boundary_indices),
                       ('Mixed', mixed_indices)]:
    # Distances of the selected points to their own cluster centroid.
    subset_distances = point_distances[indices]
    print(f"\n{name}:")
    print(f"  Mean distance to centroid: {subset_distances.mean():.3f}")
    print(f"  Std distance: {subset_distances.std():.3f}")
    print(f"  Min/Max: {subset_distances.min():.3f} / {subset_distances.max():.3f}")

## Step 4: Prepare Datasets

In [None]:
# Materialise each single-pass strategy's examples from the pool.
centroid_subset, boundary_subset, mixed_subset = (
    pool.select(picked)
    for picked in (centroid_indices, boundary_indices, mixed_indices)
)

# Curriculum: phase 1 (centroid) rows followed by phase 2 (boundary) rows.
from datasets import concatenate_datasets
curriculum_phase1 = pool.select(curriculum_centroid_indices)
curriculum_phase2 = pool.select(curriculum_boundary_indices)
curriculum_subset = concatenate_datasets([curriculum_phase1, curriculum_phase2])

# Report the size of every training set.
n_centroid_rows = len(centroid_subset)
n_boundary_rows = len(boundary_subset)
n_mixed_rows = len(mixed_subset)
n_curriculum_rows = len(curriculum_subset)
print(f"Centroid subset: {n_centroid_rows}")
print(f"Boundary subset: {n_boundary_rows}")
print(f"Mixed subset: {n_mixed_rows}")
print(f"Curriculum subset: {n_curriculum_rows}")

## Step 5: Load Model

In [None]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, TaskType

MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

print(f"Loading {MODEL_NAME}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Reuse EOS as the padding token so batches can be padded.
tokenizer.pad_token = tokenizer.eos_token

# 4-bit NF4 quantisation with float16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# trust_remote_code is deliberately not enabled: it lets code shipped in the
# Hub repository run locally.
# NOTE(review): this assumes the checkpoint uses an architecture built into
# transformers (TinyLlama is presumably stock Llama) — confirm.
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
)
print("Model loaded!")

# LoRA adapter on the attention projections; shared by every experiment.
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

In [None]:
# Tokenization
def tokenize_function(examples):
    """Build prompts for a batch of records and tokenize them.

    Each record is rendered as "### Instruction / ### Input / ### Response"
    (the Input section only when it is non-empty), then tokenized with
    truncation and padding to 512 tokens. ``labels`` is a copy of
    ``input_ids``.
    """
    prompts = []
    columns = zip(examples['instruction'], examples['input'], examples['output'])
    for instruction, extra_input, response in columns:
        pieces = [f"### Instruction:\n{instruction}\n\n"]
        if extra_input:
            pieces.append(f"### Input:\n{extra_input}\n\n")
        pieces.append(f"### Response:\n{response}")
        prompts.append("".join(pieces))

    encoded = tokenizer(prompts, truncation=True, max_length=512, padding="max_length")
    encoded["labels"] = encoded["input_ids"].copy()
    return encoded

def _tokenize_subset(subset):
    """Tokenize one subset in batches, dropping its original columns."""
    return subset.map(tokenize_function, batched=True, remove_columns=subset.column_names)


print("Tokenizing datasets...")
centroid_tokenized = _tokenize_subset(centroid_subset)
boundary_tokenized = _tokenize_subset(boundary_subset)
mixed_tokenized = _tokenize_subset(mixed_subset)
curriculum_tokenized = _tokenize_subset(curriculum_subset)
print("Done!")

## Step 6: Training

In [None]:
def train_and_evaluate(train_dataset, run_name):
    """Fine-tune a fresh LoRA adapter on ``train_dataset`` and report its loss.

    Args:
        train_dataset: Tokenized dataset to train on.
        run_name: Label used in the printed banner, the output directory
            name and the returned record.

    Returns:
        dict with ``name``, ``final_loss`` (last logged training loss),
        ``loss_history`` (all logged training losses, one per logging step)
        and ``train_samples``.

    NOTE(review): Trainer presumably shuffles training examples by default,
    so any ordering built into ``train_dataset`` (e.g. a curriculum) is
    likely not preserved — confirm.
    """
    print(f"\n{'='*60}")
    print(f"Training: {run_name}")
    print(f"{'='*60}")

    # get_peft_model injects the adapter layers into the shared base_model.
    model = get_peft_model(base_model, lora_config)
    trainer = None
    try:
        model.print_trainable_parameters()

        training_args = TrainingArguments(
            output_dir=f"./results_{run_name}",
            num_train_epochs=3,
            per_device_train_batch_size=4,
            gradient_accumulation_steps=4,
            learning_rate=2e-4,
            warmup_steps=50,
            logging_steps=10,
            save_strategy="no",
            fp16=True,
            report_to="none",
            seed=42
        )

        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            data_collator=data_collator
        )

        train_output = trainer.train()

        loss_history = [log['loss'] for log in trainer.state.log_history if 'loss' in log]
        # Very short runs may finish before the first logging step; fall back
        # to the run's average training loss instead of raising IndexError.
        final_loss = loss_history[-1] if loss_history else train_output.training_loss
        print(f"\nFinal loss: {final_loss:.4f}")
    finally:
        # Strip the adapter layers from the shared base model so the next
        # experiment starts from the untouched pretrained weights, then free
        # GPU memory — even when training failed.
        model.unload()
        del trainer
        del model
        torch.cuda.empty_cache()

    return {
        'name': run_name,
        'final_loss': final_loss,
        'loss_history': loss_history,
        'train_samples': len(train_dataset)
    }

In [None]:
# Train one adapter per strategy, in a fixed order, and collect the records
# under the strategy name. Results are stored as each run finishes.
results = {}

for run_name, tokenized_set in (
    ('centroid', centroid_tokenized),
    ('boundary', boundary_tokenized),
    ('mixed', mixed_tokenized),
    ('curriculum', curriculum_tokenized),
):
    results[run_name] = train_and_evaluate(tokenized_set, run_name)

## Step 7: Results

In [None]:
import matplotlib.pyplot as plt

# Plot the logged training loss of every strategy on one figure.
plt.figure(figsize=(12, 6))

colors = {'centroid': 'green', 'boundary': 'red', 'mixed': 'purple', 'curriculum': 'orange'}

for name, data in results.items():
    plt.plot(data['loss_history'], label=f"{name} (final: {data['final_loss']:.4f})",
             color=colors[name], linewidth=2)

# One point per logging step (every 10 optimiser steps); the multiplication
# sign was mis-encoded in the source and is restored here.
plt.xlabel('Training Steps (×10)')
plt.ylabel('Loss')
plt.title('M2.5d: Cluster Sampling Strategies\nCentroid vs Boundary vs Mixed vs Curriculum')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('m2.5d_strategies.png', dpi=150)
plt.show()

In [None]:
# Summary table: strategies ranked by final training loss and compared with
# the previous best. Emoji in the printed strings were mis-encoded in the
# source and are restored here.
print("\n" + "="*70)
print("📊 M2.5d RESULTS: CLUSTER SAMPLING STRATEGIES")
print("="*70)

# Previous best (HDC-Curated from M2.5c)
PREV_BEST = 1.2194

print(f"\n{'Strategy':<15} {'Final Loss':>12} {'vs M2.5c Best':>15} {'Status':>10}")
print("-" * 55)

# Lowest loss first.
sorted_results = sorted(results.items(), key=lambda x: x[1]['final_loss'])

for name, data in sorted_results:
    loss = data['final_loss']
    # Positive percentage = lower (better) loss than the previous best.
    vs_prev = ((PREV_BEST - loss) / PREV_BEST) * 100
    status = "👑 NEW BEST" if loss < PREV_BEST else ""
    print(f"{name:<15} {loss:>12.4f} {vs_prev:>+14.2f}% {status:>10}")

print(f"\nPrevious best (M2.5c HDC-Curated): {PREV_BEST}")

# Winner
winner_name, winner_data = sorted_results[0]
print(f"\n🏆 Winner: {winner_name.upper()} with loss {winner_data['final_loss']:.4f}")

# Analysis
print("\n" + "="*70)
print("🔬 ANALYSIS")
print("="*70)

centroid_loss = results['centroid']['final_loss']
boundary_loss = results['boundary']['final_loss']
mixed_loss = results['mixed']['final_loss']
curriculum_loss = results['curriculum']['final_loss']

# Each line is positive when the first-named strategy has the lower loss.
print(f"\nCentroid vs Boundary: {((boundary_loss - centroid_loss) / boundary_loss) * 100:+.2f}%")
print(f"Mixed vs Centroid: {((centroid_loss - mixed_loss) / centroid_loss) * 100:+.2f}%")
print(f"Curriculum vs Centroid: {((centroid_loss - curriculum_loss) / centroid_loss) * 100:+.2f}%")

In [None]:
# Save results
import json

# Assemble a JSON-serialisable record of the experiment.
output = {
    "phase": "M2.5d",
    "experiment": "Cluster Sampling Strategies",
    "hypothesis": "Different sampling strategies from HDC clusters may improve fine-tuning",
    "previous_best": PREV_BEST,
    "results": {
        name: {
            "final_loss": float(data['final_loss']),
            "samples": data['train_samples'],
            "vs_prev_best_pct": float(((PREV_BEST - data['final_loss']) / PREV_BEST) * 100)
        }
        for name, data in results.items()
    },
    "winner": winner_name,
    "winner_loss": float(winner_data['final_loss']),
    # bool() guards against a NumPy boolean, which json cannot serialise.
    "improved_over_m2.5c": bool(winner_data['final_loss'] < PREV_BEST)
}

# Explicit encoding keeps the file identical across platforms.
with open('phase_m2.5d_results.json', 'w', encoding='utf-8') as f:
    json.dump(output, f, indent=2)

# The folder emoji was mis-encoded in the source and is restored here.
print("\n📁 Results saved to phase_m2.5d_results.json")
print("\n" + json.dumps(output, indent=2))

In [None]:
# Download the result files when running inside Google Colab. Elsewhere the
# google.colab package is unavailable, so the files simply stay in the
# working directory.
try:
    from google.colab import files
except ImportError:
    files = None
    print("google.colab not available; results remain in the working directory.")

if files is not None:
    files.download('phase_m2.5d_results.json')
    files.download('m2.5d_strategies.png')