# TinyStories Language Model Experiments

This notebook compares two different approaches to language modeling on the TinyStories dataset:
1. A Deep Reservoir Computing-based Model
2. A Transformer-based Model

Both models are evaluated on the same dataset with the same evaluation metrics for a fair comparison.

In [None]:
import os
import torch

# -----------------------------------------------------------------------------
# Shared Configuration for Both Models
# -----------------------------------------------------------------------------

# Resolve the compute device once; models and batches are moved to it later.
_device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Settings read by both the reservoir and the transformer experiments.
SHARED_CONFIG = dict(
    # Data parameters
    MAX_STORIES=100,        # Number of stories to use for training

    # Training parameters
    EPOCHS=1,               # Number of training epochs
    BATCH_SIZE=32,          # Batch size for training
    BLOCK_SIZE=128,         # Context size for training
    DEVICE=_device,

    # Evaluation parameters
    EVAL_INTERVAL=200,      # Steps between evaluations
    EVAL_ITER=50,           # Number of batches for evaluation
    MAX_OUT_TOKENS=100,     # Number of tokens to generate for samples

    # File paths
    RESERVOIR_SAVE_PATH='models/deep_reservoir_trained.pt',
    TRANSFORMER_SAVE_PATH='models/tiny_lm_trained.pt',
)

print(f"Using device: {SHARED_CONFIG['DEVICE']}")

# Both checkpoint paths above point into "models/", so create it up front.
os.makedirs("models", exist_ok=True)

In [None]:
# Install required packages
!pip install reservoirpy

In [None]:
# -----------------------------------------------------------------------------
# Import Common Libraries
# -----------------------------------------------------------------------------
import numpy as np
import torch
import torch.nn as nn
from torch.utils import data
from tqdm import tqdm
from transformers import GPT2TokenizerFast
from datasets import load_dataset
import matplotlib.pyplot as plt
import reservoirpy as rpy
from reservoirpy.nodes import Reservoir

# Reduce verbosity for ReservoirPy
rpy.verbosity(0)

# -----------------------------------------------------------------------------
# Data Download and Tokenization
# -----------------------------------------------------------------------------

# Initialize the pretrained GPT-2 tokenizer. This module-level `tokenizer` is
# shared by the rest of the notebook: pre_tokenize_dataset() encodes with it,
# the generation helpers encode prompts and decode samples with it, and both
# models take their vocabulary size from tokenizer.vocab_size.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

def _write_stories(stories, path, limit=None):
    """Write the ``'text'`` field of each story to ``path``, newline-terminated.

    Stops after ``limit`` stories; ``limit=None`` writes every story.
    """
    with open(path, 'w', encoding='utf-8') as f:
        for i, story in enumerate(tqdm(stories)):
            if limit is not None and i >= limit:
                break
            f.write(story['text'] + '\n')


def download_and_save_dataset(max_stories=None):
    """Download TinyStories and save its train/validation splits as text files.

    The files are written under ``data/``. When both target files already
    exist they are reused and nothing is downloaded.

    Args:
        max_stories: Number of training stories to keep. A falsy value
            (``None`` or ``0``) keeps the full splits. When set, the
            validation file receives ``max_stories // 10`` stories, but never
            fewer than one.

    Returns:
        A ``(train_path, valid_path)`` tuple of the text file paths.
    """
    data_dir = "data"
    if max_stories:
        train_path = os.path.join(data_dir, f"TinyStories-train-{max_stories}.txt")
        valid_path = os.path.join(data_dir, f"TinyStories-valid-{max_stories}.txt")
        # Keep at least one validation story: for max_stories < 10 the integer
        # division yields 0, which used to be read as "no limit" and dumped
        # the whole validation split.
        val_stories_to_save = max(1, max_stories // 10)
    else:
        train_path = os.path.join(data_dir, "TinyStories-train.txt")
        valid_path = os.path.join(data_dir, "TinyStories-valid.txt")
        max_stories = None
        val_stories_to_save = None

    if os.path.exists(train_path) and os.path.exists(valid_path):
        print(f"Dataset files already exist: {train_path}, {valid_path}")
        return train_path, valid_path

    print("Downloading TinyStories dataset from Hugging Face...")
    os.makedirs(data_dir, exist_ok=True)
    ds = load_dataset("roneneldan/TinyStories")

    print(f"Saving training split to {train_path}...")
    _write_stories(ds['train'], train_path, max_stories)

    print(f"Saving validation split to {valid_path}...")
    _write_stories(ds['validation'], valid_path, val_stories_to_save)
    return train_path, valid_path

def pre_tokenize_dataset(path, save_path):
    """Encode a text file with the module-level ``tokenizer`` and save the ids.

    Args:
        path: UTF-8 text file to read in full.
        save_path: Destination handed to ``np.save``; the token ids are stored
            as an ``int32`` array.
    """
    print(f"Running tokenization for {path}...")
    with open(path, 'r', encoding='utf-8') as text_file:
        raw_text = text_file.read()

    # The whole file is encoded in a single call.
    token_ids = np.array(tokenizer.encode(raw_text), dtype=np.int32)
    np.save(save_path, token_ids)
    print(f"Saved tokenized file to binary {save_path}")

class TinyStoriesDataset(data.Dataset):
    """Sliding-window next-token dataset over a pre-tokenized ``.npy`` file.

    Item ``idx`` is the pair ``(tokens[idx:idx + block_size],
    tokens[idx + 1:idx + block_size + 1])``, i.e. the target is the source
    shifted one position to the right. Both are ``int64`` tensors.

    Args:
        tokenized_path: Path of the array written by ``np.save``.
        block_size: Number of tokens in each source/target window.
    """
    def __init__(self, tokenized_path, block_size: int):
        self.block_size = block_size
        # Memory-mapped read-only, so the file is not loaded up front.
        self.data = np.load(tokenized_path, mmap_mode='r')

    def __len__(self):
        # Every window needs block_size + 1 tokens (one extra for the shifted
        # target). Clamp at zero so a too-short file gives an empty dataset
        # instead of an invalid negative length.
        return max(0, len(self.data) - self.block_size)

    def __getitem__(self, idx):
        size = len(self)
        if idx < 0:
            idx += size
        if not 0 <= idx < size:
            # Without this, out-of-range indices silently returned truncated
            # windows and plain iteration over the dataset never terminated.
            raise IndexError(f"index {idx} out of range for dataset of size {size}")
        chunk = self.data[idx:idx + self.block_size + 1]
        # astype() copies out of the read-only memory map.
        source = torch.from_numpy(chunk[:-1].astype(np.int64))
        target = torch.from_numpy(chunk[1:].astype(np.int64))
        return source, target

In [None]:
# -----------------------------------------------------------------------------
# Download and Prepare Dataset
# -----------------------------------------------------------------------------

# Fetch the raw text splits (or reuse the files already on disk)
train_txt_path, val_txt_path = download_and_save_dataset(max_stories=SHARED_CONFIG["MAX_STORIES"])

# Each text split gets a sibling .npy file of token ids; build it only if missing
train_tokenized_path = train_txt_path.replace('.txt', '.npy')
val_tokenized_path = val_txt_path.replace('.txt', '.npy')
for _txt_path, _npy_path in (
    (train_txt_path, train_tokenized_path),
    (val_txt_path, val_tokenized_path),
):
    if not os.path.exists(_npy_path):
        pre_tokenize_dataset(_txt_path, _npy_path)

# Wrap the token files in datasets/loaders; only the training loader shuffles
train_dataset = TinyStoriesDataset(train_tokenized_path, SHARED_CONFIG['BLOCK_SIZE'])
val_dataset = TinyStoriesDataset(val_tokenized_path, SHARED_CONFIG['BLOCK_SIZE'])
train_loader = data.DataLoader(train_dataset, batch_size=SHARED_CONFIG['BATCH_SIZE'], shuffle=True)
val_loader = data.DataLoader(val_dataset, batch_size=SHARED_CONFIG['BATCH_SIZE'])

print(f"Data preparation complete! Tokenizer vocabulary size: {tokenizer.vocab_size}")

In [None]:
# -----------------------------------------------------------------------------
# Deep Reservoir Model Definition & Training
# -----------------------------------------------------------------------------

class ReservoirBlock(nn.Module):
    """Bank of parallel ReservoirPy reservoirs followed by a trainable MLP readout.

    ``config['reservoirs_per_block']`` lists one dict per reservoir with the
    keys ``reservoir_size``, ``spectral_radius`` and ``leaking_rate``. The
    readout maps the concatenated reservoir states to
    ``config['embedding_dim']`` through one hidden layer of
    ``config['readout_hidden_size']`` units.

    NOTE(review): the reservoirs are held in a plain Python list, so
    ``nn.Module`` does not register them and they are absent from
    ``parameters()`` and ``state_dict()``. Their weights are therefore
    presumably not stored in a checkpoint made from ``state_dict()`` --
    confirm before relying on a reloaded model.
    """
    def __init__(self, config):
        super().__init__()
        self.config = config
        reservoir_specs = config['reservoirs_per_block']

        self.reservoirs = [
            Reservoir(
                units=spec['reservoir_size'],
                sr=spec['spectral_radius'],
                lr=spec['leaking_rate'],
            )
            for spec in reservoir_specs
        ]

        # The readout consumes every reservoir's state side by side.
        readout_in = sum(spec['reservoir_size'] for spec in reservoir_specs)
        hidden = config['readout_hidden_size']
        self.readout = nn.Sequential(
            nn.Linear(readout_in, hidden),
            nn.LeakyReLU(),
            nn.Dropout(0.5),
            nn.Linear(hidden, config['embedding_dim']),
        )

    def forward(self, x):
        """Run every sequence of ``x`` through all reservoirs, then the readout.

        ``x`` must be 3-dimensional (the shape is unpacked into three values).
        The reservoirs receive a detached NumPy copy of each sequence, so no
        gradient flows back into ``x`` through them.
        """
        target_device = x.device
        batch_size, _, _ = x.shape
        sequences_np = x.detach().cpu().numpy()

        per_sequence_states = []
        for row in range(batch_size):
            # reset=True is passed on every run -- presumably so state does not
            # carry over from the previous sequence; confirm against ReservoirPy.
            states = [
                torch.from_numpy(res.run(sequences_np[row], reset=True)).float()
                for res in self.reservoirs
            ]
            per_sequence_states.append(torch.cat(states, dim=1))

        stacked = torch.stack(per_sequence_states).to(target_device)
        return self.readout(stacked)

class DeepReservoirModel(nn.Module):
    """Token embedding, a stack of residual ReservoirBlocks, and a vocabulary head.

    Args:
        vocab_size: Size of the embedding table and of the output logits.
        config: Dict providing ``embedding_dim`` and ``num_blocks``; it is
            also passed unchanged to every ``ReservoirBlock``.
    """
    def __init__(self, vocab_size, config):
        super().__init__()
        self.config = config
        width = config['embedding_dim']

        self.embedding = nn.Embedding(vocab_size, width)
        self.blocks = nn.ModuleList()
        for _ in range(config['num_blocks']):
            self.blocks.append(ReservoirBlock(config))
        self.final_head = nn.Linear(width, vocab_size)

    def forward(self, idx):
        """Return logits over the vocabulary for every position of ``idx``."""
        hidden = self.embedding(idx)
        for block in self.blocks:
            # Residual connection: each block contributes an additive update.
            hidden = hidden + block(hidden)
        return self.final_head(hidden)

@torch.no_grad()
def eval_reservoir_model(model, data_loader, config):
    """Return the mean cross-entropy loss of ``model`` over validation batches.

    Args:
        model: Module mapping a batch of token ids to 3-D logits.
        data_loader: Iterable of ``(source, target)`` batches.
        config: Dict with ``EVAL_ITER`` (maximum number of batches to score)
            and ``DEVICE`` (where each batch is moved).

    Returns:
        The average per-batch loss as a float, or ``float('inf')`` when no
        batch was evaluated.

    The model is switched to eval mode for the duration of the call and then
    returned to whichever mode it was in before, even if evaluation raises.
    """
    was_training = model.training
    model.eval()
    criterion = nn.CrossEntropyLoss()
    total_loss = 0.0
    num_batches = 0
    try:
        for x, y in data_loader:
            if num_batches >= config['EVAL_ITER']:
                break
            x, y = x.to(config['DEVICE']), y.to(config['DEVICE'])
            logits = model(x)
            B, T, C = logits.shape
            # Flatten batch and time so each position is one classification.
            loss = criterion(logits.view(B * T, C), y.view(B * T))
            total_loss += loss.item()
            num_batches += 1
    finally:
        # Restore the caller's mode instead of forcing train mode.
        model.train(was_training)
    return total_loss / num_batches if num_batches > 0 else float('inf')

@torch.no_grad()
def generate_from_reservoir(model, context_str, max_new_tokens, config):
    """Sample a continuation of ``context_str`` from the reservoir model.

    Args:
        model: Module mapping a batch of token ids to 3-D logits.
        context_str: Prompt text, encoded with the module-level ``tokenizer``.
        max_new_tokens: Number of tokens to sample and append.
        config: Dict with ``DEVICE`` and ``BLOCK_SIZE`` (the maximum number of
            trailing tokens fed to the model at each step).

    Returns:
        The decoded prompt plus the sampled continuation.

    The model is switched to eval mode while sampling and then returned to
    whichever mode it was in before, even if sampling raises.
    """
    was_training = model.training
    model.eval()
    start_indices = tokenizer.encode(context_str)
    context = torch.tensor(start_indices, dtype=torch.long, device=config['DEVICE']).unsqueeze(0)

    try:
        for _ in range(max_new_tokens):
            # Only the model input is cropped; `context` keeps the full history.
            window = context[:, -config['BLOCK_SIZE']:]
            next_token_logits = model(window)[:, -1, :]
            probs = torch.nn.functional.softmax(next_token_logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            context = torch.cat((context, idx_next), dim=1)
    finally:
        # Restore the caller's mode instead of forcing train mode.
        model.train(was_training)

    # Index the single batch row; squeeze() would collapse a one-token result
    # to a 0-d tensor and hand decode() a bare int instead of a list.
    return tokenizer.decode(context[0].tolist())

# Define Reservoir model specific configuration
# Reservoir-specific settings; the training/evaluation entries are copied from SHARED_CONFIG
reservoir_config = {
    # Model architecture
    'embedding_dim': 256,
    'num_blocks': 4,
    'reservoirs_per_block': [
        {'reservoir_size': 128, 'leaking_rate': 0.3, 'spectral_radius': 0.9},
        {'reservoir_size': 256, 'leaking_rate': 0.1, 'spectral_radius': 0.9},
    ],
    'readout_hidden_size': 128,

    # Training params from shared config
    **{
        shared_key: SHARED_CONFIG[shared_key]
        for shared_key in ('BATCH_SIZE', 'BLOCK_SIZE', 'EVAL_INTERVAL', 'EVAL_ITER', 'DEVICE')
    },
    'SAVE_PATH': SHARED_CONFIG['RESERVOIR_SAVE_PATH'],
    'LR': 0.001,
}

# Build the model and move it to the shared device
reservoir_model = DeepReservoirModel(vocab_size=tokenizer.vocab_size, config=reservoir_config)
reservoir_model = reservoir_model.to(SHARED_CONFIG['DEVICE'])

# Only parameters that require gradients are counted
reservoir_trainable_count = sum(
    param.numel() for param in reservoir_model.parameters() if param.requires_grad
)
print(f"Reservoir Model initialized on {SHARED_CONFIG['DEVICE']} with {reservoir_trainable_count:,} trainable parameters.")

In [None]:
# -----------------------------------------------------------------------------
# Train the Reservoir Model
# -----------------------------------------------------------------------------

# Loss, optimiser, and the metric histories read by the plots below
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(reservoir_model.parameters(), lr=reservoir_config['LR'])

train_losses, val_perplexities, steps = [], [], []
total_batches = 0

print("Starting Reservoir Model training...")
num_epochs = SHARED_CONFIG['EPOCHS']
for epoch in range(1, num_epochs + 1):
    print(f"--- Epoch {epoch}/{num_epochs} ---")
    pbar = tqdm(train_loader, desc=f"Epoch {epoch}")
    for x, y in pbar:
        total_batches += 1
        x = x.to(SHARED_CONFIG['DEVICE'])
        y = y.to(SHARED_CONFIG['DEVICE'])

        # One optimisation step on the flattened (batch * time) predictions
        optimizer.zero_grad()
        logits = reservoir_model(x)
        B, T, C = logits.shape
        loss = criterion(logits.view(B * T, C), y.view(B * T))
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        train_losses.append(batch_loss)
        pbar.set_postfix(loss=f"{batch_loss:.4f}")

        # Everything below runs only on every EVAL_INTERVAL-th batch
        if total_batches % SHARED_CONFIG['EVAL_INTERVAL'] != 0:
            continue

        val_loss = eval_reservoir_model(reservoir_model, val_loader, reservoir_config)
        perplexity = np.exp(val_loss)
        val_perplexities.append(perplexity)
        steps.append(total_batches)

        print("\n" + "-" * 50)
        print(f"Validation Loss: {val_loss:.4f}, Validation Perplexity: {perplexity:.4f}")
        generated_text = generate_from_reservoir(
            reservoir_model,
            "Once upon a time",
            SHARED_CONFIG['MAX_OUT_TOKENS'],
            reservoir_config,
        )
        print("--- Sample Generation ---")
        print(generated_text)
        print("-" * 50)

print("Reservoir Model training finished.")

# Left panel: loss per training batch. Right panel: perplexity at each evaluation step.
reservoir_fig, (loss_ax, ppl_ax) = plt.subplots(1, 2, figsize=(12, 5))

loss_ax.plot(train_losses)
loss_ax.set_title('Reservoir Model - Training Loss')
loss_ax.set_xlabel('Batch')
loss_ax.set_ylabel('Loss')
loss_ax.grid(True)

ppl_ax.plot(steps, val_perplexities, marker='o')
ppl_ax.set_title('Reservoir Model - Validation Perplexity')
ppl_ax.set_xlabel('Total Batches')
ppl_ax.set_ylabel('Perplexity')
ppl_ax.grid(True)

plt.tight_layout()
plt.show()

# Only the module's state_dict is written to disk
print(f"Saving Reservoir model to {reservoir_config['SAVE_PATH']}...")
torch.save(reservoir_model.state_dict(), reservoir_config['SAVE_PATH'])
print("Reservoir Model saved.")

In [None]:
# -----------------------------------------------------------------------------
# Transformer Model Definition
# -----------------------------------------------------------------------------

class TinyLM(nn.Module):
    """
    A small language model based on the Transformer architecture.

    Token and learned position embeddings are summed, passed through
    ``n_decoders`` ``TransformerDecoder`` blocks, layer-normalised, and
    projected to vocabulary logits.

    Args:
        vocab_size: Size of the token embedding table and of the output logits.
        emb_dim: Embedding width.
        block_size: Maximum sequence length (size of the position table).
        n_att_heads: Attention heads per decoder block.
        n_decoders: Number of decoder blocks.
        device: Stored on ``self.device`` for backward compatibility only;
            ``forward`` places its position indices on the input's device.
    """
    def __init__(self, vocab_size: int = 50257, emb_dim: int = 768, block_size: int = 256, n_att_heads: int = 12,
                 n_decoders: int = 12, device: str = 'cuda'):
        super().__init__()
        self.device = device
        self.block_size = block_size
        self.token_emb = nn.Embedding(vocab_size, emb_dim)
        self.pos_emb = nn.Embedding(block_size, emb_dim)
        self.decoders = nn.Sequential(*(TransformerDecoder(emb_dim, block_size, n_att_heads)
                                        for _ in range(n_decoders)))
        self.final_linear = nn.Linear(emb_dim, vocab_size)
        self.layer_norm = nn.LayerNorm(emb_dim)

    def generate(self, context: torch.Tensor, max_new_tokens: int) -> torch.Tensor:
        """Append ``max_new_tokens`` sampled tokens to ``context`` and return it.

        The returned tensor always contains the full prompt followed by every
        sampled token; only the model input is limited to the last
        ``block_size`` tokens.
        """
        for _ in range(max_new_tokens):
            # Crop a separate view for the model. Reassigning `context` here
            # used to drop the prompt and early tokens from the returned
            # sequence once it grew past block_size.
            window = context[:, -self.block_size:]
            logits = self(window)[:, -1, :]
            probabilities = nn.functional.softmax(logits, dim=-1)
            next_token = torch.multinomial(probabilities, num_samples=1)
            context = torch.cat([context, next_token], dim=1)
        return context

    def forward(self, x):
        """Return logits for every position of the token-id batch ``x``.

        Raises:
            ValueError: If the sequence is longer than ``block_size``.
        """
        seq_len = x.size(1)
        if seq_len > self.block_size:
            raise ValueError(
                f"Sequence length {seq_len} exceeds block size {self.block_size}"
            )
        # Follow the input's device so the model keeps working after .to().
        positions = torch.arange(seq_len, device=x.device)
        x = self.token_emb(x) + self.pos_emb(positions)
        x = self.decoders(x)
        x = self.layer_norm(x)
        logits = self.final_linear(x)
        return logits

class TransformerDecoder(nn.Module):
    """
    A single Transformer decoder block.

    Pre-norm layout: ``x + attention(ln_1(x))`` followed by
    ``x + feed_fwd(ln_2(x))``, with a causal mask so a position only attends
    to itself and earlier positions.

    Args:
        emb_dim: Embedding width.
        block_size: Maximum sequence length the causal mask covers.
        n_heads: Number of attention heads.
        dropout: Dropout probability for both the attention weights and the
            feed-forward output.
    """
    def __init__(self, emb_dim: int = 768, block_size: int = 256, n_heads: int = 12, dropout: float = 0.2):
        super().__init__()
        self.emb_dim = emb_dim
        self.n_heads = n_heads
        # Produces q, k and v in one matmul; nn.MultiheadAttention then applies
        # its own input projections on top of these.
        self.head_projection = nn.Linear(emb_dim, 3 * emb_dim, bias=False)
        # Boolean mask, True where attention is NOT allowed (strictly upper
        # triangle). The buffer keeps the name 'tril' so state_dict keys stay
        # unchanged.
        self.register_buffer('tril', ~torch.tril(torch.ones(block_size, block_size)).type(torch.bool))
        # Honour the `dropout` argument (it used to be hard-coded to 0.2 here).
        self.self_attention = nn.MultiheadAttention(emb_dim, n_heads, batch_first=True, dropout=dropout)
        self.feed_fwd = nn.Sequential(
            nn.Linear(emb_dim, emb_dim * 4),
            nn.ReLU(),
            nn.Linear(emb_dim * 4, emb_dim),
            nn.Dropout(dropout)
        )
        self.ln_1 = nn.LayerNorm(emb_dim)
        self.ln_2 = nn.LayerNorm(emb_dim)

    def forward(self, x):
        """Apply causal self-attention and the feed-forward net, each with a residual."""
        seq_len = x.size(1)
        # Normalise into a separate tensor: the residual must add the
        # un-normalised input, not ln_1(x).
        normed = self.ln_1(x)
        q, k, v = self.head_projection(normed).split(self.emb_dim, dim=-1)
        attn_out = self.self_attention(
            q, k, v, attn_mask=self.tril[:seq_len, :seq_len], need_weights=False
        )[0]
        x = x + attn_out
        x = x + self.feed_fwd(self.ln_2(x))
        return x

@torch.no_grad()
def eval_transformer_model(training_model: torch.nn.Module, val_loader: torch.utils.data.DataLoader, config: dict):
    """Return the mean cross-entropy loss of ``training_model`` over validation batches.

    Args:
        training_model: Module mapping a batch of token ids to 3-D logits.
        val_loader: Iterable of ``(source, target)`` batches.
        config: Dict with ``EVAL_ITER`` (maximum number of batches to score)
            and ``DEVICE`` (where each batch is moved).

    Returns:
        The average per-batch loss as a float, or ``float('inf')`` when no
        batch was evaluated (matching ``eval_reservoir_model``).

    The model is switched to eval mode for the duration of the call and then
    returned to whichever mode it was in before, even if evaluation raises.
    """
    was_training = training_model.training
    training_model.eval()
    losses = []
    criterion = nn.CrossEntropyLoss()
    try:
        for k, (s_val, t_val) in enumerate(val_loader):
            if k >= config['EVAL_ITER']:
                break
            s_val, t_val = s_val.to(config['DEVICE']), t_val.to(config['DEVICE'])
            val_logits = training_model(s_val)
            B, T, C = val_logits.shape
            val_loss = criterion(val_logits.view(B * T, C), t_val.view(B * T))
            losses.append(val_loss.item())
    finally:
        # Restore the caller's mode instead of forcing train mode.
        training_model.train(was_training)
    if not losses:
        # np.mean([]) would give NaN plus a RuntimeWarning.
        return float('inf')
    return float(np.mean(losses))

@torch.no_grad()
def generate_from_transformer(training_model: TinyLM, config: dict, prompt="Once upon a time") -> str:
    """Sample text from the transformer model and return it decoded.

    Args:
        training_model: Model whose ``generate`` method extends a token context.
        config: Dict with ``DEVICE`` and ``MAX_OUT_TOKENS`` (number of tokens
            to sample).
        prompt: Text to continue. When empty, generation starts from a single
            token with id 0.

    Returns:
        The decoded first row of the generated token tensor.

    The model is switched to eval mode while sampling and then returned to
    whichever mode it was in before, even if sampling raises.
    """
    was_training = training_model.training
    training_model.eval()
    try:
        if prompt:
            context = torch.tensor(tokenizer.encode(prompt), dtype=torch.long, device=config['DEVICE']).unsqueeze(0)
        else:
            context = torch.zeros((1, 1), dtype=torch.long, device=config['DEVICE'])
        out_tokens = training_model.generate(context, max_new_tokens=config['MAX_OUT_TOKENS'])
    finally:
        # Restore the caller's mode instead of forcing train mode.
        training_model.train(was_training)
    return tokenizer.decode(out_tokens[0].tolist())

# Define Transformer model specific configuration
# Transformer-specific settings; the training/evaluation entries are copied from SHARED_CONFIG
transformer_config = {
    # Model architecture
    'EMB_SIZE': 384,
    'N_ATTENTION_HEADS': 6,
    'N_DECODER_BLOCKS': 6,
    'VOCAB_SIZE': tokenizer.vocab_size,

    # Training params from shared config
    **{shared_key: SHARED_CONFIG[shared_key] for shared_key in ('MAX_OUT_TOKENS', 'EVAL_INTERVAL', 'EVAL_ITER')},
    'LR': 3e-4,
    **{shared_key: SHARED_CONFIG[shared_key] for shared_key in ('BATCH_SIZE', 'BLOCK_SIZE', 'DEVICE')},
    'SAVE_PATH': SHARED_CONFIG['TRANSFORMER_SAVE_PATH'],
}
# The embedding is split evenly across the attention heads
assert transformer_config['EMB_SIZE'] % transformer_config['N_ATTENTION_HEADS'] == 0, "Embedding size must be divisible by number of attention heads"

# Build the model and move it to the configured device
transformer_model = TinyLM(
    vocab_size=transformer_config['VOCAB_SIZE'],
    emb_dim=transformer_config['EMB_SIZE'],
    block_size=transformer_config['BLOCK_SIZE'],
    n_att_heads=transformer_config['N_ATTENTION_HEADS'],
    n_decoders=transformer_config['N_DECODER_BLOCKS'],
    device=transformer_config['DEVICE'],
)
transformer_model = transformer_model.to(transformer_config['DEVICE'])

# Only parameters that require gradients are counted
transformer_trainable_count = sum(
    param.numel() for param in transformer_model.parameters() if param.requires_grad
)
print(f"Transformer Model initialized on {SHARED_CONFIG['DEVICE']} with {transformer_trainable_count:,} trainable parameters.")

In [None]:
# -----------------------------------------------------------------------------
# Train the Transformer Model
# -----------------------------------------------------------------------------

optimizer = torch.optim.Adam(transformer_model.parameters(), lr=transformer_config['LR'])
criterion = nn.CrossEntropyLoss()

# Metric histories read by the plots below
transformer_train_losses, transformer_val_perplexities, transformer_steps = [], [], []
total_batches = 0

print("Starting Transformer Model training...")
num_epochs = SHARED_CONFIG['EPOCHS']
for epoch in range(1, num_epochs + 1):
    print(f"--- Epoch {epoch}/{num_epochs} ---")
    pbar = tqdm(train_loader, desc=f"Epoch {epoch}")
    for sources, targets in pbar:
        total_batches += 1
        sources = sources.to(SHARED_CONFIG['DEVICE'])
        targets = targets.to(SHARED_CONFIG['DEVICE'])

        # Forward pass and loss on the flattened (batch * time) predictions
        logits = transformer_model(sources)
        B, T, C = logits.shape
        loss = criterion(logits.view(B * T, C), targets.view(B * T))

        step_loss = loss.item()
        transformer_train_losses.append(step_loss)
        pbar.set_postfix(loss=f"{step_loss:.4f}")

        # Backward pass and parameter update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Everything below runs only on every EVAL_INTERVAL-th batch
        if total_batches % SHARED_CONFIG['EVAL_INTERVAL'] != 0:
            continue

        val_loss = eval_transformer_model(transformer_model, val_loader, transformer_config)
        perplexity = np.exp(val_loss)
        transformer_val_perplexities.append(perplexity)
        transformer_steps.append(total_batches)

        print("\n" + "-" * 50)
        print(f"Validation Loss: {val_loss:.4f}, Validation Perplexity: {perplexity:.4f}")
        generated_text = generate_from_transformer(
            transformer_model,
            transformer_config,
            prompt="Once upon a time",
        )
        print("--- Sample Generation ---")
        print(generated_text)
        print("-" * 50)

print("Transformer Model training finished.")

# Left panel: loss per training batch. Right panel: perplexity at each evaluation step.
transformer_fig, (loss_ax, ppl_ax) = plt.subplots(1, 2, figsize=(12, 5))

loss_ax.plot(transformer_train_losses)
loss_ax.set_title('Transformer Model - Training Loss')
loss_ax.set_xlabel('Batch')
loss_ax.set_ylabel('Loss')
loss_ax.grid(True)

ppl_ax.plot(transformer_steps, transformer_val_perplexities, marker='o')
ppl_ax.set_title('Transformer Model - Validation Perplexity')
ppl_ax.set_xlabel('Total Batches')
ppl_ax.set_ylabel('Perplexity')
ppl_ax.grid(True)

plt.tight_layout()
plt.show()

# The checkpoint bundles model weights, optimizer state and the config dict
print(f"Saving Transformer model to {transformer_config['SAVE_PATH']}...")
transformer_checkpoint = {
    'model_state_dict': transformer_model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'config': transformer_config,
}
torch.save(transformer_checkpoint, transformer_config['SAVE_PATH'])
print("Transformer Model saved.")

In [None]:
# -----------------------------------------------------------------------------
# Compare Model Performance
# -----------------------------------------------------------------------------

# Overlay the validation-perplexity curves of both models on one set of axes
plt.figure(figsize=(10, 6))
for curve_steps, curve_values, curve_marker, curve_label in (
    (steps, val_perplexities, 'o', 'Reservoir Model'),
    (transformer_steps, transformer_val_perplexities, 's', 'Transformer Model'),
):
    plt.plot(curve_steps, curve_values, marker=curve_marker, label=curve_label)
plt.title('Model Comparison - Validation Perplexity')
plt.xlabel('Training Steps')
plt.ylabel('Perplexity (lower is better)')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# Sample from both models with one shared prompt
banner = "=" * 80
rule = "-" * 50

print(banner)
print("Model Comparison - Text Generation")
print(banner)

prompt = "Once upon a time, there was a little"
print(f"Prompt: '{prompt}'\n")

# Reservoir model sample
reservoir_text = generate_from_reservoir(
    reservoir_model,
    prompt,
    SHARED_CONFIG['MAX_OUT_TOKENS'],
    reservoir_config,
)
print("Reservoir Model Output:")
print(rule)
print(reservoir_text)
print("\n")

# Transformer model sample
transformer_text = generate_from_transformer(transformer_model, transformer_config, prompt=prompt)
print("Transformer Model Output:")
print(rule)
print(transformer_text)

# Size comparison over trainable parameters only
reservoir_params = sum(p.numel() for p in reservoir_model.parameters() if p.requires_grad)
transformer_params = sum(p.numel() for p in transformer_model.parameters() if p.requires_grad)

smaller_name = 'Reservoir' if reservoir_params < transformer_params else 'Transformer'
fewer_params, more_params = sorted((reservoir_params, transformer_params))
size_gap_pct = abs(100 - (100 * fewer_params / more_params))

print("\n" + banner)
print(f"Reservoir Model: {reservoir_params:,} parameters")
print(f"Transformer Model: {transformer_params:,} parameters")
print(f"Size difference: {abs(reservoir_params - transformer_params):,} parameters")
print(f"The {smaller_name} model is smaller by {size_gap_pct:.1f}%")
print(banner)