# 🧩 Testing BDH's Model Composability Claim

One of the most interesting claims from Adrian Kosowski:

> "You can concatenate trained BDH models and they just work."

This would be remarkable - normally you can't just combine neural networks!

## Why This Might Work in BDH

1. **Sparse activations**: Only ~5% of neurons active, so less interference
2. **Positive activations**: No cancellation between positive/negative
3. **Monosemantic neurons**: Each neuron = one concept, so concepts from different models don't clash

## What We'll Test

1. Train two models on different domains
2. Concatenate their latent spaces
3. See if the combined model retains both capabilities

In [None]:
# Setup
!pip install torch datasets matplotlib tqdm -q

import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np
from tqdm.auto import tqdm
import copy

# Prefer the GPU when PyTorch can see one; otherwise run everything on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Device: {device}')

In [None]:
# Clone repo if needed
# bdh.py is fetched only when it is not already in the working directory.
# The lines starting with `!` are IPython shell escapes, so this cell runs
# only inside a notebook kernel, not as a plain Python script.
# NOTE(review): the clone is not pinned to a commit or tag, so the BDH code
# that ends up imported can differ between runs — consider pinning.
import os
if not os.path.exists('bdh.py'):
    !git clone https://github.com/newsbubbles/bdh.git temp_bdh
    !cp temp_bdh/bdh.py .
    !rm -rf temp_bdh

from bdh import BDH, BDHConfig
print('BDH loaded!')

---
## Step 1: Create Two Specialized Datasets

We'll train one model on English prose and another on Python code.

In [None]:
# Build two small single-domain corpora and encode them as byte-level tokens.

# Corpus 1: ten English prose sentences, repeated 100 times for volume.
prose_samples = 100 * [
    "The quick brown fox jumps over the lazy dog. ",
    "In the beginning, there was nothing but darkness. ",
    "She walked through the garden, admiring the flowers. ",
    "The old man sat by the fire, telling stories of his youth. ",
    "Rain fell softly on the roof, creating a gentle rhythm. ",
    "The city lights twinkled in the distance like stars. ",
    "He opened the book and began to read aloud. ",
    "The waves crashed against the shore, endless and eternal. ",
    "Morning came with the sound of birds singing. ",
    "The forest was quiet, save for the rustling leaves. ",
]

# Corpus 2: ten one-line Python snippets, repeated the same way.
code_samples = 100 * [
    "def hello(): print('Hello, World!') ",
    "for i in range(10): x += i ",
    "if x > 0: return True ",
    "class MyClass: def __init__(self): pass ",
    "import numpy as np; arr = np.zeros(10) ",
    "def fib(n): return n if n < 2 else fib(n-1) + fib(n-2) ",
    "with open('file.txt') as f: data = f.read() ",
    "try: result = x / y except: result = 0 ",
    "lambda x: x * 2 ",
    "[x**2 for x in range(10) if x % 2 == 0] ",
]

# Flatten each corpus into one string. Every sample already ends in a space,
# so the ' ' separator leaves two spaces between consecutive samples.
prose_text, code_text = (
    ' '.join(samples) for samples in (prose_samples, code_samples)
)

# One token per UTF-8 byte, stored as int64.
prose_data, code_data = (
    torch.tensor(list(text.encode('utf-8')), dtype=torch.long)
    for text in (prose_text, code_text)
)

print(f'Prose data: {len(prose_data):,} bytes')
print(f'Code data: {len(code_data):,} bytes')

---
## Step 2: Train Two Specialized Models

In [None]:
def train_model(data, name, steps=1000, batch_size=32, block_size=128):
    """Train a fresh BDH model for next-token prediction on ``data``.

    Args:
        data: Sequence of token ids that supports ``len()`` and slicing; the
            callers in this notebook pass a 1-D ``torch.long`` byte tensor.
        name: Label shown in the progress bar.
        steps: Number of optimizer steps to run.
        batch_size: Number of windows sampled per step.
        block_size: Length of each training window.

    Returns:
        ``(model, losses)`` where ``losses`` is a list with the training loss
        of every step as a Python float.

    Raises:
        ValueError: If ``data`` is too short to cut one window together with
            its one-step-shifted target.
    """
    # Exclusive upper bound for window start positions. Checked up front so a
    # short corpus fails with a clear message instead of inside torch.randint.
    max_start = len(data) - block_size - 1
    if max_start <= 0:
        raise ValueError(
            f'data has {len(data)} tokens; need at least {block_size + 2} '
            f'for block_size={block_size}'
        )

    config = BDHConfig(
        n_layer=3,
        n_embd=128,
        n_head=4,
        mlp_internal_dim_multiplier=32,
        dropout=0.1
    )
    model = BDH(config).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

    losses = []
    pbar = tqdm(range(steps), desc=f'Training {name}')

    for step in pbar:
        # Random batch: inputs are windows, targets the same windows shifted by one.
        idx = torch.randint(0, max_start, (batch_size,))
        x = torch.stack([data[i:i+block_size] for i in idx]).to(device)
        y = torch.stack([data[i+1:i+block_size+1] for i in idx]).to(device)

        logits = model(x)
        # NOTE(review): the upstream BDH.forward is believed to return
        # (logits, loss); unwrap defensively — confirm against bdh.py.
        if isinstance(logits, tuple):
            logits = logits[0]
        # Take the class count from the logits instead of hard-coding 256.
        loss = F.cross_entropy(logits.reshape(-1, logits.size(-1)), y.reshape(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        losses.append(loss.item())

        # Refresh the displayed running mean only every 100 steps.
        if step % 100 == 0:
            pbar.set_postfix({'loss': f'{np.mean(losses[-100:]):.4f}'})

    return model, losses

In [None]:
# Fit the prose specialist, then report its mean loss over the last 50 steps.
print('Training prose model...')
prose_model, prose_losses = train_model(data=prose_data, name='Prose')
print(f'Final prose loss: {np.mean(prose_losses[-50:]):.4f}')

In [None]:
# Fit the code specialist, then report its mean loss over the last 50 steps.
print('Training code model...')
code_model, code_losses = train_model(data=code_data, name='Code')
print(f'Final code loss: {np.mean(code_losses[-50:]):.4f}')

In [None]:
# Plot training curves
# Each model gets a faint raw per-step curve plus a 50-step moving average in
# the same colour; only the smoothed curve carries a legend label.
fig, ax = plt.subplots(figsize=(10, 5))
for curve, colour, tag in (
    (prose_losses, 'blue', 'Prose'),
    (code_losses, 'red', 'Code'),
):
    ax.plot(curve, alpha=0.3, color=colour)
    ax.plot(np.convolve(curve, np.ones(50)/50, mode='valid'), color=colour, label=tag)
ax.set(xlabel='Step', ylabel='Loss', title='Training Specialized Models')
ax.legend()
plt.show()

---
## Step 3: Evaluate Specialization

Verify each model is specialized to its domain.

In [None]:
def evaluate_on_data(model, data, n_samples=100, block_size=128):
    """Estimate next-token loss of ``model`` on randomly sampled windows.

    The model is switched to eval mode and is left in eval mode on return.

    Args:
        model: Callable module mapping a ``(1, block_size)`` token batch to
            logits.
        data: Sequence of token ids that supports ``len()`` and slicing; the
            callers in this notebook pass a 1-D ``torch.long`` tensor.
        n_samples: Number of windows to score, one forward pass each.
        block_size: Length of each window.

    Returns:
        ``(mean, std)`` of the per-window cross-entropy losses.
    """
    model.eval()
    losses = []
    # Exclusive upper bound for window starts (leaves room for the shifted target).
    max_start = len(data) - block_size - 1

    with torch.no_grad():
        for _ in range(n_samples):
            # Convert to a plain int so the slices below do not depend on
            # slicing with a 1-element tensor.
            start = torch.randint(0, max_start, (1,)).item()
            x = data[start:start+block_size].unsqueeze(0).to(device)
            y = data[start+1:start+block_size+1].unsqueeze(0).to(device)

            logits = model(x)
            # NOTE(review): the upstream BDH.forward is believed to return
            # (logits, loss); unwrap defensively — confirm against bdh.py.
            if isinstance(logits, tuple):
                logits = logits[0]
            # Take the class count from the logits instead of hard-coding 256.
            loss = F.cross_entropy(logits.reshape(-1, logits.size(-1)), y.reshape(-1))
            losses.append(loss.item())

    return np.mean(losses), np.std(losses)

# Cross-evaluation
# Score each specialist on both corpora. Every call samples its own random
# windows, so the four numbers are estimates rather than exact losses.
print('Cross-evaluation (lower is better):')
print()

prose_on_prose, _ = evaluate_on_data(prose_model, prose_data)
prose_on_code, _ = evaluate_on_data(prose_model, code_data)
code_on_prose, _ = evaluate_on_data(code_model, prose_data)
code_on_code, _ = evaluate_on_data(code_model, code_data)

print(f'                  | Prose Data | Code Data')
print(f'------------------|------------|----------')
print(f'Prose Model       | {prose_on_prose:.4f}     | {prose_on_code:.4f}')
print(f'Code Model        | {code_on_prose:.4f}     | {code_on_code:.4f}')
print()

# Check specialization: each model should do better on its own domain.
prose_specialized = prose_on_prose < prose_on_code
code_specialized = code_on_code < code_on_prose

# The status symbols were mis-encoded (UTF-8 bytes decoded as cp1252) and
# printed as garbage; they are restored to the intended characters here.
if prose_specialized and code_specialized:
    print('✓ Both models are specialized to their domains!')
else:
    print('⚠ Models may not be sufficiently specialized')

---
## Step 4: Compose Models

Now the key test: can we combine these models?

In [None]:
class ComposedBDH(nn.Module):
    """
    Compose two models by a learnable weighted average of their logits.

    Both wrapped models receive the same input; only their output logits are
    mixed. No latent states are concatenated, despite the default ``mode``
    name, which is stored but never read by ``forward``.

    Args:
        model_a: First model; called as ``model_a(x)``.
        model_b: Second model; called as ``model_b(x)``. Its logits must be
            broadcast-compatible with those of ``model_a``.
        mode: Free-form label kept on the instance for reference.
    """

    def __init__(self, model_a, model_b, mode='concat_latent'):
        super().__init__()
        self.model_a = model_a
        self.model_b = model_b
        self.mode = mode

        # Raw (pre-sigmoid) mixing parameter. forward() squashes it with a
        # sigmoid, so the starting value must be 0.0 to get an exact 50/50
        # average; the previous 0.5 gave sigmoid(0.5) ~= 0.62 in favour of
        # model_a.
        self.combine_weight = nn.Parameter(torch.tensor(0.0))

    def forward(self, x):
        """Return ``w * logits_a + (1 - w) * logits_b`` with ``w`` in (0, 1)."""
        # Get predictions from both models
        logits_a = self.model_a(x)
        logits_b = self.model_b(x)

        # NOTE(review): the upstream BDH.forward is believed to return
        # (logits, loss); unwrap defensively — confirm against bdh.py.
        if isinstance(logits_a, tuple):
            logits_a = logits_a[0]
        if isinstance(logits_b, tuple):
            logits_b = logits_b[0]

        # Combine predictions
        w = torch.sigmoid(self.combine_weight)
        return w * logits_a + (1 - w) * logits_b

# Create composed model
# Wrap the two trained specialists in the logit-mixing module. No visible cell
# trains this wrapper; it is evaluated as constructed.
composed_model = ComposedBDH(prose_model, code_model).to(device)
print('Created composed model (simple averaging)')

In [None]:
# More sophisticated composition: merge at the layer level
class DeepComposedBDH(nn.Module):
    """
    Fuse two frozen models through a small trainable head.

    Each input runs through both models' embedding and layer stacks
    independently. Only the two final hidden states are concatenated, reduced
    back to ``n_embd`` by ``combiner`` and mapped to 256 output classes by
    ``output_proj``. Those two linear layers are the only trainable parts.

    Constructing this module sets ``requires_grad = False`` on every parameter
    of ``model_a`` and ``model_b`` in place.

    NOTE(review): assumes both models expose ``.embedding``, an iterable
    ``.layers`` and ``.config.n_embd`` — confirm against bdh.py.
    """

    def __init__(self, model_a, model_b):
        super().__init__()
        self.model_a = model_a
        self.model_b = model_b

        # Freeze both specialists so only the new head receives gradients.
        for specialist in (self.model_a, self.model_b):
            for param in specialist.parameters():
                param.requires_grad_(False)

        # Trainable head: 2 * n_embd -> n_embd -> 256.
        width = model_a.config.n_embd
        self.combiner = nn.Linear(2 * width, width)
        self.output_proj = nn.Linear(width, 256)

    def forward(self, x):
        """Return logits computed from the fused final hidden states."""
        hidden_a = self.model_a.embedding(x)
        hidden_b = self.model_b.embedding(x)

        # Advance both stacks in lockstep; zip stops at the shorter stack.
        for block_a, block_b in zip(self.model_a.layers, self.model_b.layers):
            hidden_a, hidden_b = block_a(hidden_a), block_b(hidden_b)

        # Concatenate along the feature axis, then reduce and project.
        fused = self.combiner(torch.cat((hidden_a, hidden_b), dim=-1))
        return self.output_proj(fused)

# Create deep composed model
# Constructing the wrapper freezes every parameter of prose_model and
# code_model in place, so the specialists can no longer be trained directly
# unless requires_grad is switched back on.
deep_composed = DeepComposedBDH(prose_model, code_model).to(device)
print('Created deep composed model')

---
## Step 5: Evaluate Composed Models

In [None]:
# Evaluate simple composed model (no training)
print('Simple Composed Model (averaging, no training):')
composed_on_prose, _ = evaluate_on_data(composed_model, prose_data)
composed_on_code, _ = evaluate_on_data(composed_model, code_data)
print(f'  Prose: {composed_on_prose:.4f} (specialist: {prose_on_prose:.4f})')
print(f'  Code:  {composed_on_code:.4f} (specialist: {code_on_code:.4f})')
print()

# Check if composed model is good at both: on each domain it must beat the
# mean loss of the two specialists on that same domain.
prose_ok = composed_on_prose < (prose_on_prose + code_on_prose) / 2
code_ok = composed_on_code < (prose_on_code + code_on_code) / 2

# The check mark was mis-encoded (UTF-8 bytes decoded as cp1252) and printed
# as garbage; it is restored to the intended character here.
if prose_ok and code_ok:
    print('✓ Composed model works reasonably on both domains!')
else:
    print('~ Composed model needs fine-tuning')

In [None]:
# Fine-tune the deep composed model on mixed data
print('Fine-tuning deep composed model on mixed data...')

# One stream with all prose followed by all code; windows are drawn uniformly
# from it, so a window can straddle the prose/code boundary.
mixed_data = torch.cat((prose_data, code_data))

# Hand the optimizer only the parameters that still require gradients.
trainable_params = [p for p in deep_composed.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(trainable_params, lr=1e-3)

finetune_losses = []
for step in tqdm(range(500), desc='Fine-tuning'):
    # 32 random windows of 128 tokens; targets are the inputs shifted by one.
    starts = torch.randint(0, len(mixed_data) - 129, (32,))
    inputs = torch.stack([mixed_data[s:s+128] for s in starts]).to(device)
    targets = torch.stack([mixed_data[s+1:s+129] for s in starts]).to(device)

    loss = F.cross_entropy(deep_composed(inputs).view(-1, 256), targets.view(-1))

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    finetune_losses.append(loss.item())

print(f'Fine-tuning complete. Final loss: {np.mean(finetune_losses[-50:]):.4f}')

In [None]:
# Evaluate fine-tuned composed model
print('\nFine-tuned Deep Composed Model:')
deep_on_prose, _ = evaluate_on_data(deep_composed, prose_data)
deep_on_code, _ = evaluate_on_data(deep_composed, code_data)
print('\n'.join((
    f'  Prose: {deep_on_prose:.4f}',
    f'  Code:  {deep_on_code:.4f}',
    '',
)))

# Compare all models: one table row per model with its loss on each corpus
# and the plain mean of the two.
summary_lines = (
    '='*60,
    'COMPARISON SUMMARY',
    '='*60,
    '',
    f'                    | Prose Data | Code Data | Avg',
    f'--------------------|------------|-----------|-------',
    f'Prose Specialist    | {prose_on_prose:.4f}     | {prose_on_code:.4f}    | {(prose_on_prose+prose_on_code)/2:.4f}',
    f'Code Specialist     | {code_on_prose:.4f}     | {code_on_code:.4f}    | {(code_on_prose+code_on_code)/2:.4f}',
    f'Simple Composed     | {composed_on_prose:.4f}     | {composed_on_code:.4f}    | {(composed_on_prose+composed_on_code)/2:.4f}',
    f'Deep Composed (FT)  | {deep_on_prose:.4f}     | {deep_on_code:.4f}    | {(deep_on_prose+deep_on_code)/2:.4f}',
)
print('\n'.join(summary_lines))

In [None]:
# Visualize results
fig, ax = plt.subplots(figsize=(10, 6))

models = ['Prose\nSpecialist', 'Code\nSpecialist', 'Simple\nComposed', 'Deep\nComposed']
prose_scores = [prose_on_prose, code_on_prose, composed_on_prose, deep_on_prose]
code_scores = [prose_on_code, code_on_code, composed_on_code, deep_on_code]

x = np.arange(len(models))
width = 0.35

bars1 = ax.bar(x - width/2, prose_scores, width, label='Prose Data', color='steelblue')
bars2 = ax.bar(x + width/2, code_scores, width, label='Code Data', color='coral')

ax.set_ylabel('Loss (lower is better)')
ax.set_title('Model Composability Test')
ax.set_xticks(x)
ax.set_xticklabels(models)
ax.legend()

# Add value labels
for bar in bars1 + bars2:
    height = bar.get_height()
    ax.annotate(f'{height:.2f}',
                xy=(bar.get_x() + bar.get_width() / 2, height),
                xytext=(0, 3), textcoords='offset points',
                ha='center', va='bottom', fontsize=8)

plt.tight_layout()
plt.savefig('composability_test.png', dpi=150)
plt.show()

---
## Step 6: Test Generation from Composed Model

In [None]:
def generate(model, prompt, max_new=100, temperature=0.8, block_size=128):
    """Sample a byte-level continuation of ``prompt`` from ``model``.

    The model is switched to eval mode and is left in eval mode on return.

    Args:
        model: Callable module mapping a ``(1, T)`` token batch to logits.
        prompt: Text to continue; it is encoded to UTF-8 bytes.
        max_new: Number of tokens to append.
        temperature: Divisor applied to the logits before sampling. ``0``
            selects greedy decoding instead of dividing by zero.
        block_size: Maximum number of trailing tokens fed to the model.

    Returns:
        Prompt plus generated bytes decoded as UTF-8, with invalid sequences
        replaced.

    Raises:
        ValueError: If ``prompt`` encodes to zero bytes, since there would be
            no position to predict from.
    """
    prompt_ids = list(prompt.encode('utf-8'))
    if not prompt_ids:
        raise ValueError('prompt must contain at least one byte')

    model.eval()
    tokens = torch.tensor([prompt_ids], dtype=torch.long, device=device)

    with torch.no_grad():
        for _ in range(max_new):
            # Feed only the trailing context window.
            logits = model(tokens[:, -block_size:])
            # NOTE(review): the upstream BDH.forward is believed to return
            # (logits, loss); unwrap defensively — confirm against bdh.py.
            if isinstance(logits, tuple):
                logits = logits[0]
            # Keep the prediction for the last position only.
            logits = logits[:, -1, :]

            if temperature == 0:
                next_token = torch.argmax(logits, dim=-1, keepdim=True)
            else:
                probs = F.softmax(logits / temperature, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)

            tokens = torch.cat([tokens, next_token], dim=1)

    return bytes(tokens[0].cpu().tolist()).decode('utf-8', errors='replace')

# Test generation
# Show the matching specialist next to the fine-tuned composed model for one
# prompt from each domain.
print('='*60, 'GENERATION TEST', '='*60, sep='\n')

# Prose prompt
prose_prompt = "The old man"
print(f'\nProse prompt: "{prose_prompt}"')
print(f'Prose model: {generate(prose_model, prose_prompt, max_new=50)}')
print(f'Composed:    {generate(deep_composed, prose_prompt, max_new=50)}')

# Code prompt
code_prompt = "def hello"
print(f'\nCode prompt: "{code_prompt}"')
print(f'Code model:  {generate(code_model, code_prompt, max_new=50)}')
print(f'Composed:    {generate(deep_composed, code_prompt, max_new=50)}')

---
## Summary & Verdict

In [None]:
print('='*70)
print('BDH MODEL COMPOSABILITY CLAIM VERDICT')
print('='*70)
print()

# Calculate metrics: each specialist on its own domain versus the fine-tuned
# composed model on both domains.
specialist_avg = (prose_on_prose + code_on_code) / 2
composed_avg = (deep_on_prose + deep_on_code) / 2

# Composed should be close to specialist average (ratio near 1.0).
composability_ratio = composed_avg / specialist_avg

print(f'Specialist average loss: {specialist_avg:.4f}')
print(f'Composed model loss:     {composed_avg:.4f}')
print(f'Ratio: {composability_ratio:.2f}x')
print()

# Verdict bands: under 1.2x counts as supported, under 1.5x as partial.
# The check and cross marks were mis-encoded (UTF-8 bytes decoded as cp1252)
# and printed as garbage; they are restored to the intended characters here.
if composability_ratio < 1.2:
    print('✓ CLAIM SUPPORTED: Composed model performs nearly as well as specialists!')
    print('  BDH models can be effectively combined.')
elif composability_ratio < 1.5:
    print('~ PARTIALLY SUPPORTED: Composed model works but with some degradation.')
    print('  Composition is possible but not seamless.')
else:
    print('✗ CLAIM NOT WELL SUPPORTED: Significant degradation when composing.')
    print('  Models do not combine as cleanly as claimed.')

print()
print('Note: This is a simplified test. The original claim may refer to')
print('more sophisticated composition methods or larger models.')