# Pretraining Large Language Models

This notebook provides an interactive guide to pretraining LLMs from scratch, covering dataset preparation, distributed training strategies, and practical considerations for training at scale.

## 1. Introduction to LLM Pretraining

Pretraining is the foundation of modern LLMs. During this phase, models learn general language understanding from massive text corpora before being fine-tuned for specific tasks.

In [None]:
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
from typing import Dict, List, Tuple
import pandas as pd
from IPython.display import display, HTML
import seaborn as sns

# Notebook-wide plotting defaults: apply the Matplotlib style sheet first,
# then set seaborn's "husl" palette. The two calls are kept in this order.
# NOTE(review): the 'seaborn-v0_8-*' style names presumably require a recent
# Matplotlib release — confirm against the pinned environment.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

## 2. Understanding the Scale of Pretraining

Let's visualize the scale of modern LLM pretraining to understand the computational requirements.

In [None]:
# Headline numbers per model, one list position per model.
# The starred entry is an estimate (see the footnote printed below the figure).
models_data = {
    'Model': ['GPT-2', 'GPT-3', 'LLaMA-7B', 'LLaMA-65B', 'GPT-4*'],
    'Parameters': [1.5e9, 175e9, 7e9, 65e9, 1.7e12],
    'Training Tokens': [10e9, 300e9, 1e12, 1.4e12, 13e12],
    'Training FLOPs': [1.5e21, 3.14e23, 2e23, 1e24, 2e25],
    'GPU Hours (A100)': [100, 30000, 82000, 1e6, 1e7]
}
df_models = pd.DataFrame(models_data)

# One entry per panel: (grid position, column, divisor, y-axis label, title).
# A divisor of None means the column is plotted in its raw unit.
panel_specs = [
    ((0, 0), 'Parameters', 1e9, 'Parameters (Billions)', 'Model Size Comparison'),
    ((0, 1), 'Training Tokens', 1e12, 'Training Tokens (Trillions)', 'Training Data Scale'),
    ((1, 0), 'Training FLOPs', 1e23, 'FLOPs (×10²³)', 'Computational Requirements'),
    ((1, 1), 'GPU Hours (A100)', None, 'GPU Hours (A100)', 'Training Time'),
]

fig, axes = plt.subplots(2, 2, figsize=(15, 12))

for grid_pos, column, divisor, y_label, panel_title in panel_specs:
    panel = axes[grid_pos]
    heights = df_models[column] if divisor is None else df_models[column] / divisor
    panel.bar(df_models['Model'], heights)
    panel.set_ylabel(y_label)
    panel.set_title(panel_title)
    # Values span several orders of magnitude, so every panel is log-scaled.
    panel.set_yscale('log')

plt.tight_layout()
plt.show()

print("* GPT-4 numbers are estimates based on reports")

## 3. Dataset Preparation Pipeline

High-quality data is crucial for successful pretraining. Let's explore the data preparation pipeline.

In [None]:
class DataQualityAnalyzer:
    """Score text with cheap lexical heuristics for pretraining-data filtering.

    Tokens are whitespace-separated and compared case-sensitively with any
    attached punctuation; sentences are delimited by '.' only.

    Attributes:
        quality_scores: overall quality score of every text passed to
            ``analyze_text`` so far, in call order.
    """

    def __init__(self):
        # History of overall scores; appended to by analyze_text().
        self.quality_scores = []

    def analyze_text(self, text: str) -> Dict[str, float]:
        """Compute lexical quality metrics for ``text`` and record its score.

        Args:
            text: raw text to analyse; may be empty.

        Returns:
            Dict with keys ``word_count``, ``avg_word_length``,
            ``vocab_diversity`` (unique words / words), ``repetition_score``
            (share of repeated bigrams), ``sentence_variation`` (std of
            sentence lengths in words) and ``quality_score``.
        """
        words = text.split()
        sentences = text.split('.')

        # Basic metrics
        word_count = len(words)
        avg_word_length = np.mean([len(w) for w in words]) if words else 0

        # Vocabulary diversity (type/token ratio); 0 for empty text
        unique_words = len(set(words))
        vocab_diversity = unique_words / word_count if word_count > 0 else 0

        # Repetition score: 0 when every bigram is distinct (or there are none)
        bigrams = [' '.join(words[i:i+2]) for i in range(len(words)-1)]
        unique_bigrams = len(set(bigrams))
        repetition_score = 1 - (unique_bigrams / len(bigrams) if bigrams else 1)

        # Sentence length variation; needs at least two non-blank sentences
        sent_lengths = [len(s.split()) for s in sentences if s.strip()]
        sent_length_std = np.std(sent_lengths) if len(sent_lengths) > 1 else 0

        quality_score = self._compute_quality_score(
            vocab_diversity, repetition_score, sent_length_std
        )
        # Keep a running history so callers can inspect the score distribution.
        self.quality_scores.append(quality_score)

        return {
            'word_count': word_count,
            'avg_word_length': avg_word_length,
            'vocab_diversity': vocab_diversity,
            'repetition_score': repetition_score,
            'sentence_variation': sent_length_std,
            'quality_score': quality_score
        }

    def _compute_quality_score(self, diversity: float, repetition: float, 
                             variation: float) -> float:
        """Blend the three metrics into one score.

        Weights: 40% diversity, 40% (1 - repetition), 20% sentence variation,
        where variation is divided by 10 and capped at 1.
        """
        # Higher diversity is better
        # Lower repetition is better
        # Higher sentence variation is better (to a point)
        score = (diversity * 0.4 + 
                (1 - repetition) * 0.4 + 
                min(variation / 10, 1) * 0.2)
        return score

# Three samples: fluent technical prose, a heavily repeated sentence,
# and unpunctuated filler text.
texts = [
    """The transformer architecture has revolutionized natural language processing
    by introducing self-attention mechanisms that allow models to process sequences
    in parallel. This breakthrough has led to significant improvements in various
    NLP tasks including translation, summarization, and question answering.""",

    """The cat sat on the mat. The cat sat on the mat. The cat sat on the mat.
    The cat sat on the mat. The cat sat on the mat. The cat sat on the mat.""",

    """Lorem ipsum dolor sit amet consectetur adipiscing elit sed do eiusmod
    tempor incididunt ut labore et dolore magna aliqua ut enim ad minim veniam
    quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo."""
]

analyzer = DataQualityAnalyzer()

# One record per sample: a label followed by every metric from the analyzer.
quality_results = [
    {'Text': f'Sample {sample_no}', **analyzer.analyze_text(sample)}
    for sample_no, sample in enumerate(texts, start=1)
]

df_quality = pd.DataFrame(quality_results)
print("Text Quality Analysis:")
display(df_quality.round(3))

# Grouped bar chart: one group per sample, one bar per metric.
metrics_to_plot = ['vocab_diversity', 'repetition_score', 'quality_score']
x = np.arange(len(texts))
width = 0.25
tick_labels = [f'Sample {sample_no}' for sample_no in range(1, len(texts) + 1)]

plt.figure(figsize=(10, 6))
for slot, metric in enumerate(metrics_to_plot):
    # Shift each metric by one bar width so the bars sit side by side.
    plt.bar(x + slot * width, df_quality[metric], width, label=metric)

plt.xlabel('Text Sample')
plt.ylabel('Score')
plt.title('Text Quality Metrics Comparison')
plt.xticks(x + width, tick_labels)
plt.legend()
plt.show()

## 4. Data Deduplication Strategies

Deduplication is crucial to prevent memorization and improve generalization. Let's explore different deduplication methods.

In [None]:
import hashlib
from collections import defaultdict

class DeduplicationDemo:
    """Demonstrate exact (hash-based) and near-duplicate (n-gram) deduplication.

    Both strategies keep the first occurrence of a document and drop later
    duplicates, preserving input order.
    """

    @staticmethod
    def exact_dedup(documents: List[str]) -> Tuple[List[str], int]:
        """Drop byte-identical documents using SHA-256 digests.

        Args:
            documents: documents to deduplicate.

        Returns:
            ``(unique_docs, removed_count)``.
        """
        seen_hashes = set()
        unique_docs = []

        for doc in documents:
            doc_hash = hashlib.sha256(doc.encode()).hexdigest()
            if doc_hash not in seen_hashes:
                seen_hashes.add(doc_hash)
                unique_docs.append(doc)

        return unique_docs, len(documents) - len(unique_docs)

    @staticmethod
    def ngram_dedup(documents: List[str], n: int = 5, 
                   threshold: float = 0.8) -> Tuple[List[str], int]:
        """Drop documents whose word n-grams are mostly contained in a kept one.

        A document is a duplicate when the fraction of its n-grams that also
        occur in some previously kept document is strictly above ``threshold``.

        Args:
            documents: documents to deduplicate.
            n: n-gram size in whitespace-separated words.
            threshold: containment ratio above which a document is dropped.

        Returns:
            ``(unique_docs, removed_count)``.
        """
        def get_ngrams(text: str, n: int) -> set:
            words = text.split()
            if 0 < len(words) < n:
                # Too short to form a single n-gram: use the whole document as
                # one shingle so short duplicates are still detected.
                return {' '.join(words)}
            return set(' '.join(words[i:i+n]) for i in range(len(words)-n+1))

        def containment(candidate: set, reference: set) -> float:
            """Fraction of ``candidate`` n-grams that appear in ``reference``."""
            if not candidate:
                # A document without tokens can only duplicate another
                # token-less document; this also avoids dividing by zero.
                return 1.0 if not reference else 0.0
            return len(candidate & reference) / len(candidate)

        unique_docs = []
        all_ngrams = []  # n-gram sets of the kept documents, in order

        for doc in documents:
            doc_ngrams = get_ngrams(doc, n)

            # Check overlap with the documents kept so far
            is_duplicate = any(
                containment(doc_ngrams, existing_ngrams) > threshold
                for existing_ngrams in all_ngrams
            )

            if not is_duplicate:
                unique_docs.append(doc)
                all_ngrams.append(doc_ngrams)

        return unique_docs, len(documents) - len(unique_docs)

# Small corpus containing exact and near duplicates.
sample_docs = [
    "The transformer model uses self-attention to process sequences.",
    "The transformer model uses self-attention to process sequences.",  # Exact duplicate
    "The transformer architecture uses self-attention to process sequences.",  # Near duplicate
    "BERT is a bidirectional transformer model for NLP tasks.",
    "GPT is an autoregressive transformer model for text generation.",
    "BERT is a bidirectional transformer model for NLP tasks.",  # Exact duplicate
    "Attention mechanisms allow models to focus on relevant parts of input."
]

dedup = DeduplicationDemo()


def _print_dedup_report(heading, kept_docs, removed_count):
    """Print the before/after document counts for one deduplication run."""
    print(heading)
    print(f"  Original documents: {len(sample_docs)}")
    print(f"  Unique documents: {len(kept_docs)}")
    print(f"  Removed: {removed_count}\n")


# Hash-based exact matching
exact_unique, exact_removed = dedup.exact_dedup(sample_docs)
_print_dedup_report("Exact Deduplication:", exact_unique, exact_removed)

# 3-gram containment matching with a 60% threshold
ngram_unique, ngram_removed = dedup.ngram_dedup(sample_docs, n=3, threshold=0.6)
_print_dedup_report("N-gram Deduplication (3-grams, 60% threshold):",
                    ngram_unique, ngram_removed)

# Bar chart of how many documents survive each method
docs_remaining = {
    'Original': len(sample_docs),
    'Exact Dedup': len(exact_unique),
    'N-gram Dedup': len(ngram_unique),
}
methods = list(docs_remaining.keys())
counts = list(docs_remaining.values())

plt.figure(figsize=(8, 6))
bars = plt.bar(methods, counts, color=['gray', 'blue', 'green'])
plt.ylabel('Number of Documents')
plt.title('Deduplication Results')

# Write each count just above its bar
for bar, count in zip(bars, counts):
    label_x = bar.get_x() + bar.get_width()/2
    label_y = bar.get_height() + 0.1
    plt.text(label_x, label_y, str(count), ha='center', va='bottom')

plt.ylim(0, max(counts) + 1)
plt.show()

## 5. Training Objectives

Let's explore different training objectives used in LLM pretraining.

In [None]:
def visualize_training_objectives():
    """Draw a 2x2 figure contrasting four LLM pretraining objectives.

    Panels: causal-LM attention mask, masked-LM highlight map, prefix-LM
    attention mask, and a textual span-corruption example. In the mask panels
    each row is a query position and each column is a key position, i.e.
    ``mask[q, k] == 1`` means token ``q`` may attend to token ``k``.
    """

    # Sample sequence
    tokens = ["The", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog"]
    n_tokens = len(tokens)

    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # 1. Causal Language Modeling (CLM): lower-triangular mask
    ax = axes[0, 0]
    clm_mask = np.tril(np.ones((n_tokens, n_tokens)))
    ax.imshow(clm_mask, cmap='Blues', aspect='equal')
    ax.set_title('Causal Language Modeling (GPT-style)', fontsize=14)
    # imshow puts matrix rows on the y-axis and columns on the x-axis
    ax.set_xlabel('Key position (attended to)')
    ax.set_ylabel('Query position')
    ax.set_xticks(range(n_tokens))
    ax.set_yticks(range(n_tokens))
    ax.set_xticklabels(tokens, rotation=45, ha='right')
    ax.set_yticklabels(tokens)

    # Add arrows to show prediction direction (token i predicts token i+1)
    for i in range(n_tokens - 1):
        ax.annotate('', xy=(i+1, i), xytext=(i, i),
                   arrowprops=dict(arrowstyle='->', color='red', lw=2))

    # 2. Masked Language Modeling (MLM)
    ax = axes[0, 1]
    masked_positions = [2, 5, 7]  # Mask "brown", "over", "lazy"

    # Attention is fully bidirectional; rows/columns of masked tokens are
    # drawn darker purely to highlight where the predictions happen.
    mlm_visual = np.ones((n_tokens, n_tokens)) * 0.3
    for pos in masked_positions:
        mlm_visual[pos, :] = 1.0
        mlm_visual[:, pos] = 1.0

    ax.imshow(mlm_visual, cmap='Reds', aspect='equal')
    ax.set_title('Masked Language Modeling (BERT-style)', fontsize=14)
    ax.set_xlabel('Position')
    ax.set_ylabel('Position')
    ax.set_xticks(range(n_tokens))
    ax.set_yticks(range(n_tokens))

    # Mark masked tokens on the x-axis; the y-axis keeps the original tokens
    masked_tokens = tokens.copy()
    for pos in masked_positions:
        masked_tokens[pos] = "[MASK]"
    ax.set_xticklabels(masked_tokens, rotation=45, ha='right')
    ax.set_yticklabels(tokens)

    # 3. Prefix Language Modeling
    ax = axes[1, 0]
    prefix_len = 4
    # Start from a full causal mask so every suffix token sees the whole
    # prefix, all earlier suffix tokens and itself ...
    prefix_mask = np.tril(np.ones((n_tokens, n_tokens)))
    # ... then let the prefix attend bidirectionally within itself.
    prefix_mask[:prefix_len, :prefix_len] = 1

    ax.imshow(prefix_mask, cmap='Greens', aspect='equal')
    ax.set_title('Prefix Language Modeling (T5-style)', fontsize=14)
    ax.set_xlabel('Key position (attended to)')
    ax.set_ylabel('Query position')
    ax.set_xticks(range(n_tokens))
    ax.set_yticks(range(n_tokens))
    ax.set_xticklabels(tokens, rotation=45, ha='right')
    ax.set_yticklabels(tokens)
    # Dashed lines mark the prefix/suffix boundary on both axes
    ax.axvline(x=prefix_len-0.5, color='red', linestyle='--', lw=2)
    ax.axhline(y=prefix_len-0.5, color='red', linestyle='--', lw=2)

    # 4. Span Corruption (T5): text-only panel
    ax = axes[1, 1]
    original = "The quick brown fox jumps over the lazy dog"
    corrupted = "The quick <X> jumps over <Y> dog"
    target = "<X> brown fox <Y> the lazy"

    ax.text(0.5, 0.8, "Original:", transform=ax.transAxes, fontsize=12, ha='center')
    ax.text(0.5, 0.7, original, transform=ax.transAxes, fontsize=10, ha='center')

    ax.text(0.5, 0.5, "Input:", transform=ax.transAxes, fontsize=12, ha='center')
    ax.text(0.5, 0.4, corrupted, transform=ax.transAxes, fontsize=10, ha='center', color='blue')

    ax.text(0.5, 0.2, "Target:", transform=ax.transAxes, fontsize=12, ha='center')
    ax.text(0.5, 0.1, target, transform=ax.transAxes, fontsize=10, ha='center', color='red')

    ax.set_title('Span Corruption (T5)', fontsize=14)
    ax.axis('off')

    plt.tight_layout()
    plt.show()

visualize_training_objectives()

## 6. Learning Rate Scheduling

Proper learning rate scheduling is critical for stable training. Let's explore different schedules.

In [None]:
def plot_lr_schedules(num_steps: int = 10000, warmup_steps: int = 1000):
    """Plot four learning-rate schedules and print a recommendation table.

    Every schedule ramps linearly from 0 to 1 over ``warmup_steps`` and then
    follows its own rule (cosine, linear, inverse square root, constant). The
    y-axis is a multiplier on the peak learning rate.

    Args:
        num_steps: total number of training steps to plot.
        warmup_steps: number of linear warmup steps; 0 disables warmup.
    """

    steps = np.arange(num_steps)
    # Clamp the decay length so num_steps == warmup_steps cannot divide by zero.
    decay_span = max(1, num_steps - warmup_steps)

    def with_warmup(decay):
        """Prefix ``decay(step)`` with the shared linear warmup ramp."""
        def schedule(step):
            # Never true when warmup_steps == 0, so no division by zero here.
            if step < warmup_steps:
                return step / warmup_steps
            return decay(step)
        return schedule

    # Linear warmup + Cosine decay
    def cosine_decay(step):
        progress = (step - warmup_steps) / decay_span
        return 0.5 * (1 + np.cos(np.pi * progress))

    # Linear warmup + Linear decay
    def linear_decay(step):
        return 1 - (step - warmup_steps) / decay_span

    # Linear warmup + Inverse sqrt
    def inverse_sqrt_decay(step):
        # sqrt(warmup / step) is exactly 1.0 where warmup ends, so the curve
        # is continuous with the ramp and never exceeds the peak multiplier.
        return np.sqrt(max(warmup_steps, 1) / max(step, 1))

    # Linear warmup + Constant
    def constant_after_warmup(step):
        return 1.0

    decay_rules = {
        'Cosine': cosine_decay,
        'Linear': linear_decay,
        'Inverse Sqrt': inverse_sqrt_decay,
        'Constant': constant_after_warmup
    }
    schedules = {
        name: [with_warmup(rule)(s) for s in steps]
        for name, rule in decay_rules.items()
    }

    plt.figure(figsize=(12, 8))

    for name, schedule in schedules.items():
        plt.plot(steps, schedule, label=name, linewidth=2)

    plt.axvline(x=warmup_steps, color='red', linestyle='--', alpha=0.5, label='End of warmup')
    plt.xlabel('Training Step', fontsize=12)
    plt.ylabel('Learning Rate Multiplier', fontsize=12)
    plt.title('Learning Rate Schedules Comparison', fontsize=14)
    plt.legend(fontsize=10)
    plt.grid(True, alpha=0.3)

    # Add annotations: the arrow points at the midpoint of the warmup ramp
    plt.annotate('Warmup Phase', xy=(warmup_steps/2, 0.5), xytext=(warmup_steps/2, 0.3),
                arrowprops=dict(arrowstyle='->', color='red'),
                ha='center', fontsize=10)

    plt.show()

    # Show schedule recommendations
    recommendations = pd.DataFrame({
        'Schedule': ['Cosine', 'Linear', 'Inverse Sqrt', 'Constant'],
        'Best For': [
            'Fixed-length training, smooth decay',
            'Simple, predictable decay',
            'Continued training, no fixed end',
            'Fine-tuning, transfer learning'
        ],
        'Pros': [
            'Smooth, proven effective',
            'Easy to reason about',
            'Good for long training',
            'Stable, no decay'
        ],
        'Cons': [
            'Requires known total steps',
            'Can be too aggressive',
            'Never reaches zero',
            'No adaptation'
        ]
    })

    print("\nLearning Rate Schedule Recommendations:")
    display(recommendations)

plot_lr_schedules()

## 7. Distributed Training Strategies

Large models require distributed training. Let's visualize different parallelism strategies.

In [None]:
def visualize_parallelism_strategies():
    """Draw a 2x2 figure illustrating distributed-training strategies.

    Panels: data parallelism (full replica per GPU, sharded data), model
    parallelism (layers split across GPUs), a pipeline schedule with
    micro-batches, and an illustrative memory/communication comparison.
    The bar heights in the last panel are hand-picked relative values,
    not measurements.
    """

    fig, axes = plt.subplots(2, 2, figsize=(15, 12))

    # 1. Data Parallelism
    ax = axes[0, 0]
    n_gpus = 4

    # Draw GPUs
    for i in range(n_gpus):
        rect = plt.Rectangle((i*2, 0), 1.5, 3, fill=True, color='lightblue', ec='black')
        ax.add_patch(rect)
        ax.text(i*2 + 0.75, 1.5, f'GPU {i}\nFull Model', ha='center', va='center')

        # Data shards
        data_rect = plt.Rectangle((i*2, 3.5), 1.5, 1, fill=True, color='lightgreen', ec='black')
        ax.add_patch(data_rect)
        ax.text(i*2 + 0.75, 4, f'Data\nShard {i}', ha='center', va='center', fontsize=9)

    ax.set_xlim(-0.5, n_gpus*2)
    ax.set_ylim(-0.5, 5)
    ax.set_title('Data Parallelism (DDP)', fontsize=14)
    ax.axis('off')

    # 2. Model Parallelism
    ax = axes[0, 1]
    layers_per_gpu = 2

    for i in range(n_gpus):
        rect = plt.Rectangle((i*2, 0), 1.5, 3, fill=True, color='lightcoral', ec='black')
        ax.add_patch(rect)
        ax.text(i*2 + 0.75, 1.5, f'GPU {i}\nLayers\n{i*layers_per_gpu}-{(i+1)*layers_per_gpu-1}', 
                ha='center', va='center')

        # Show data flow
        if i < n_gpus - 1:
            ax.arrow(i*2 + 1.5, 1.5, 0.4, 0, head_width=0.2, head_length=0.1, fc='black')

    ax.set_xlim(-0.5, n_gpus*2)
    ax.set_ylim(-0.5, 5)
    ax.set_title('Model Parallelism', fontsize=14)
    ax.axis('off')

    # 3. Pipeline Parallelism
    ax = axes[1, 0]
    micro_batches = 4

    # Create pipeline schedule (all forwards, then all backwards)
    colors = plt.cm.tab10(np.linspace(0, 1, micro_batches))

    # Time slot at which the last stage has finished its last forward pass.
    forward_end = n_gpus + micro_batches - 1

    for gpu in range(n_gpus):
        for mb in range(micro_batches):
            # Forward pass: micro-batch mb reaches stage gpu after gpu hops
            start_time = mb + gpu
            rect = plt.Rectangle((start_time, gpu), 1, 0.4, 
                               fill=True, color=colors[mb], ec='black', alpha=0.7)
            ax.add_patch(rect)
            ax.text(start_time + 0.5, gpu + 0.2, f'F{mb}', ha='center', va='center', fontsize=8)

            # Backward pass: starts on the last stage once all forwards are
            # done and travels back one stage per slot, so it never shares a
            # slot with a forward pass on the same GPU and always follows the
            # backward of the same micro-batch on the next stage.
            back_time = forward_end + (n_gpus - 1 - gpu) + mb
            rect = plt.Rectangle((back_time, gpu), 1, 0.4, 
                               fill=True, color=colors[mb], ec='black', alpha=0.4)
            ax.add_patch(rect)
            ax.text(back_time + 0.5, gpu + 0.2, f'B{mb}', ha='center', va='center', fontsize=8)

    # The full schedule spans 2 * (n_gpus + micro_batches - 1) slots.
    ax.set_xlim(-0.5, 2 * forward_end + 0.5)
    ax.set_ylim(-0.5, n_gpus)
    ax.set_yticks(range(n_gpus))
    ax.set_xlabel('Time', fontsize=12)
    ax.set_ylabel('GPU', fontsize=12)
    ax.set_title('Pipeline Parallelism Schedule', fontsize=14)
    ax.grid(True, alpha=0.3)

    # 4. 3D Parallelism Comparison
    ax = axes[1, 1]

    strategies = ['Data\nParallel', 'Model\nParallel', 'Pipeline\nParallel', '3D\nParallel']
    memory_per_gpu = [100, 25, 30, 10]
    communication = [50, 100, 75, 85]

    x = np.arange(len(strategies))
    width = 0.35

    # Two bars per strategy, centred on the tick
    ax.bar(x - width/2, memory_per_gpu, width, label='Memory per GPU (%)')
    ax.bar(x + width/2, communication, width, label='Communication (%)')

    ax.set_ylabel('Relative Scale', fontsize=12)
    ax.set_title('Parallelism Strategy Comparison', fontsize=14)
    ax.set_xticks(x)
    ax.set_xticklabels(strategies)
    ax.legend()
    ax.grid(True, alpha=0.3, axis='y')

    plt.tight_layout()
    plt.show()

visualize_parallelism_strategies()

## 8. Training Monitoring and Debugging

Monitoring is crucial for successful pretraining. Let's simulate a training run and visualize key metrics.

In [None]:
def simulate_training_metrics(num_steps: int = 5000, seed=None):
    """Generate synthetic per-step training metrics for the dashboard demo.

    Args:
        num_steps: number of training steps to simulate; any value >= 0 works.
        seed: optional integer seed. When given, a private
            ``np.random.RandomState(seed)`` makes the output reproducible;
            when ``None`` the global NumPy random state is used.

    Returns:
        Dict of equal-length arrays keyed by ``steps``, ``loss``,
        ``grad_norm``, ``learning_rate``, ``gpu_memory`` and
        ``gpu_utilization`` (the last two clipped to [0, 100]).
    """

    # np.random and RandomState expose the same normal()/uniform() API.
    rng = np.random if seed is None else np.random.RandomState(seed)

    steps = np.arange(num_steps)

    # Simulate loss: exponential decay towards 2 plus Gaussian noise
    base_loss = 10 * np.exp(-steps / 1000) + 2
    noise = rng.normal(0, 0.1, num_steps)
    loss = base_loss + noise

    # Add some anomalies: 50-step plateaus raised by a random offset.
    # Slices beyond num_steps are empty, so short runs simply get fewer spikes.
    anomaly_steps = [1000, 2500, 4000]
    for step in anomaly_steps:
        loss[step:step+50] += rng.uniform(1, 3)

    # Gradient norm
    grad_norm = 2 * np.exp(-steps / 2000) + rng.normal(0.5, 0.2, num_steps)
    grad_norm[grad_norm < 0] = 0.1

    # Learning rate (cosine schedule)
    # Cap warmup at the run length so runs shorter than 500 steps still work.
    warmup = min(500, num_steps)
    decay_span = max(1, num_steps - warmup)
    lr = np.zeros(num_steps)
    lr[:warmup] = np.linspace(0, 1, warmup)
    lr[warmup:] = 0.5 * (1 + np.cos(np.pi * (steps[warmup:] - warmup) / decay_span))
    lr *= 3e-4  # Scale to actual LR

    # GPU memory and utilization (percent)
    gpu_memory = 70 + 10 * np.sin(steps / 100) + rng.normal(0, 2, num_steps)
    gpu_util = 85 + 10 * np.sin(steps / 150 + 1) + rng.normal(0, 3, num_steps)

    return {
        'steps': steps,
        'loss': loss,
        'grad_norm': grad_norm,
        'learning_rate': lr,
        'gpu_memory': np.clip(gpu_memory, 0, 100),
        'gpu_utilization': np.clip(gpu_util, 0, 100)
    }

# Simulate metrics
metrics = simulate_training_metrics()

# Create dashboard: loss on top, three diagnostics in the middle, throughput below
fig = plt.figure(figsize=(16, 12))
gs = fig.add_gridspec(3, 3, hspace=0.3, wspace=0.3)

# Loss plot
ax1 = fig.add_subplot(gs[0, :])
ax1.plot(metrics['steps'], metrics['loss'], 'b-', alpha=0.7, linewidth=1)
ax1.set_ylabel('Loss', fontsize=12)
ax1.set_title('Training Loss with Anomaly Detection', fontsize=14)
ax1.grid(True, alpha=0.3)
ax1.set_yscale('log')

# Mark anomalies.
# A global percentile threshold would only flag the early high-loss steps of a
# decaying curve, so spikes are detected relative to a local baseline instead:
# a centred rolling median tracks the trend, and a step is anomalous when it
# sits more than 5 robust standard deviations above that trend.
loss_series = pd.Series(metrics['loss'])
loss_baseline = loss_series.rolling(window=101, center=True, min_periods=1).median()
loss_residual = (loss_series - loss_baseline).to_numpy()
# 1.4826 * MAD estimates the standard deviation of Gaussian noise robustly.
noise_scale = 1.4826 * np.median(np.abs(loss_residual - np.median(loss_residual)))
anomaly_mask = loss_residual > 5 * noise_scale
ax1.scatter(metrics['steps'][anomaly_mask], metrics['loss'][anomaly_mask], 
           color='red', s=20, label='Anomalies')
ax1.legend()

# Gradient norm
ax2 = fig.add_subplot(gs[1, 0])
ax2.plot(metrics['steps'], metrics['grad_norm'], 'g-', alpha=0.7)
ax2.axhline(y=5, color='red', linestyle='--', label='Clip threshold')
ax2.set_ylabel('Gradient Norm', fontsize=12)
ax2.set_xlabel('Step', fontsize=12)
ax2.set_title('Gradient Norm', fontsize=14)
ax2.grid(True, alpha=0.3)
ax2.legend()

# Learning rate
ax3 = fig.add_subplot(gs[1, 1])
ax3.plot(metrics['steps'], metrics['learning_rate'], 'orange', alpha=0.7)
ax3.set_ylabel('Learning Rate', fontsize=12)
ax3.set_xlabel('Step', fontsize=12)
ax3.set_title('Learning Rate Schedule', fontsize=14)
ax3.grid(True, alpha=0.3)

# GPU metrics
ax4 = fig.add_subplot(gs[1, 2])
ax4.plot(metrics['steps'], metrics['gpu_memory'], 'purple', alpha=0.7, label='Memory')
ax4.plot(metrics['steps'], metrics['gpu_utilization'], 'brown', alpha=0.7, label='Utilization')
ax4.set_ylabel('Percentage (%)', fontsize=12)
ax4.set_xlabel('Step', fontsize=12)
ax4.set_title('GPU Metrics', fontsize=14)
ax4.grid(True, alpha=0.3)
ax4.legend()
ax4.set_ylim(0, 105)

# Training speed (synthetic: slow sinusoidal drift plus noise)
ax5 = fig.add_subplot(gs[2, :])
tokens_per_second = 50000 + 10000 * np.sin(metrics['steps'] / 500) + \
                   np.random.normal(0, 2000, len(metrics['steps']))
ax5.plot(metrics['steps'], tokens_per_second, 'teal', alpha=0.7)
ax5.set_ylabel('Tokens/Second', fontsize=12)
ax5.set_xlabel('Step', fontsize=12)
ax5.set_title('Training Throughput', fontsize=14)
ax5.grid(True, alpha=0.3)

# Add mean line
mean_throughput = np.mean(tokens_per_second)
ax5.axhline(y=mean_throughput, color='red', linestyle='--', 
           label=f'Mean: {mean_throughput:.0f} tokens/s')
ax5.legend()

plt.suptitle('LLM Pretraining Monitoring Dashboard', fontsize=16)
plt.show()

# Print summary statistics (the anomaly count is the number of flagged steps)
print("\nTraining Summary Statistics:")
print(f"Final Loss: {metrics['loss'][-1]:.4f}")
print(f"Average Gradient Norm: {np.mean(metrics['grad_norm']):.4f}")
print(f"Number of Anomalies Detected: {np.sum(anomaly_mask)}")
print(f"Average GPU Memory Usage: {np.mean(metrics['gpu_memory']):.1f}%")
print(f"Average Training Speed: {mean_throughput:.0f} tokens/second")

## 9. Cost Estimation and Optimization

Understanding and optimizing training costs is crucial for large-scale pretraining.

In [None]:
def calculate_training_costs(model_params: int, training_tokens: int, 
                           gpu_type: str = 'A100', mfu: float = 0.35) -> Dict:
    """Estimate training time and cost on each GPU type in the built-in table.

    Compute is estimated as ``6 * params * tokens`` FLOPs. The GPU count is
    the minimum needed to hold the training state, assuming 18 bytes per
    parameter (a rough estimate).

    Args:
        model_params: number of model parameters.
        training_tokens: number of training tokens.
        gpu_type: accepted for backward compatibility; results are returned
            for every GPU in the table regardless of this value.
        mfu: model FLOPs utilisation, the fraction of peak throughput that is
            actually achieved (typically 0.3-0.5 for real training).

    Returns:
        Dict keyed by GPU name. Each value holds ``gpu_hours`` (total
        GPU-hours across all devices), ``gpus_needed``, ``total_hours``
        (wall-clock hours when the work is spread over ``gpus_needed``
        GPUs), ``total_cost`` and ``cost_per_million_params``.

    Raises:
        ValueError: if ``mfu`` is not in the interval (0, 1].
    """
    if not 0 < mfu <= 1:
        raise ValueError(f"mfu must be in (0, 1], got {mfu}")

    # GPU specifications: peak TFLOPs, memory in GB, price in $ per GPU-hour
    gpu_specs = {
        'V100': {'tflops': 125, 'memory': 32, 'cost_per_hour': 1.5},
        'A100': {'tflops': 312, 'memory': 80, 'cost_per_hour': 2.5},
        'H100': {'tflops': 1000, 'memory': 80, 'cost_per_hour': 5.0}
    }

    # Estimate FLOPs (6 * params * tokens for transformer training)
    total_flops = 6 * model_params * training_tokens

    # Memory footprint does not depend on the GPU, so compute it once.
    model_memory_gb = model_params * 18 / 1e9  # Rough estimate

    results = {}

    for gpu_name, specs in gpu_specs.items():
        effective_tflops = specs['tflops'] * mfu

        # Total device time: the whole job expressed in single-GPU hours
        gpu_seconds = total_flops / (effective_tflops * 1e12)
        gpu_hours = gpu_seconds / 3600

        # Number of GPUs needed (based on memory)
        gpus_needed = max(1, int(np.ceil(model_memory_gb / specs['memory'])))

        # gpu_hours already counts every device-hour of the job, so the bill
        # is GPU-hours times the hourly rate. Adding GPUs shortens wall-clock
        # time (total_hours) but does not multiply the cost.
        total_cost = gpu_hours * specs['cost_per_hour']

        results[gpu_name] = {
            'gpu_hours': gpu_hours,
            'gpus_needed': gpus_needed,
            'total_hours': gpu_hours / gpus_needed,
            'total_cost': total_cost,
            'cost_per_million_params': total_cost / (model_params / 1e6)
        }

    return results

# Calculate costs for different model sizes
model_configs = [
    {'name': '1B Model', 'params': 1e9, 'tokens': 20e9},
    {'name': '7B Model', 'params': 7e9, 'tokens': 1e12},
    {'name': '13B Model', 'params': 13e9, 'tokens': 1e12},
    {'name': '70B Model', 'params': 70e9, 'tokens': 1.4e12},
]

# One record per (model, GPU type) pair
cost_data = []

for config in model_configs:
    costs = calculate_training_costs(config['params'], config['tokens'])

    for gpu_type, cost_info in costs.items():
        cost_data.append({
            'Model': config['name'],
            'GPU Type': gpu_type,
            'GPU Hours': cost_info['gpu_hours'],
            'GPUs Needed': cost_info['gpus_needed'],
            'Total Cost ($)': cost_info['total_cost']
        })

df_costs = pd.DataFrame(cost_data)

# Visualize costs
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Cost by model size and GPU type.
# pivot() sorts the index lexicographically ('13B', '1B', '70B', '7B'), so
# restore the configuration order to show models from smallest to largest.
model_order = [config['name'] for config in model_configs]
pivot_costs = (
    df_costs
    .pivot(index='Model', columns='GPU Type', values='Total Cost ($)')
    .reindex(model_order)
)
pivot_costs.plot(kind='bar', ax=ax1)
ax1.set_ylabel('Total Cost ($)', fontsize=12)
ax1.set_title('Training Cost by Model Size and GPU Type', fontsize=14)
ax1.set_yscale('log')
ax1.grid(True, alpha=0.3, axis='y')
ax1.legend(title='GPU Type')

# Time vs Cost tradeoff: wall-clock time is GPU-hours spread over the GPUs used
for model in ['7B Model', '70B Model']:
    model_data = df_costs[df_costs['Model'] == model]
    ax2.scatter(model_data['GPU Hours'] / model_data['GPUs Needed'], 
               model_data['Total Cost ($)'],
               label=model, s=100)

    # Add GPU type labels
    for _, row in model_data.iterrows():
        ax2.annotate(row['GPU Type'], 
                    (row['GPU Hours'] / row['GPUs Needed'], row['Total Cost ($)']),
                    xytext=(5, 5), textcoords='offset points', fontsize=8)

ax2.set_xlabel('Wall Clock Time (Hours)', fontsize=12)
ax2.set_ylabel('Total Cost ($)', fontsize=12)
ax2.set_title('Time vs Cost Tradeoff', fontsize=14)
ax2.set_xscale('log')
ax2.set_yscale('log')
ax2.grid(True, alpha=0.3)
ax2.legend()

plt.tight_layout()
plt.show()

# Cost optimization strategies (indicative ranges, not measurements)
print("\nCost Optimization Strategies:")
optimization_strategies = pd.DataFrame({
    'Strategy': [
        'Mixed Precision Training',
        'Gradient Checkpointing',
        'Efficient Attention',
        'Data Packing',
        'Learning Rate Tuning',
        'Spot Instances'
    ],
    'Cost Reduction': ['20-30%', '15-25%', '10-20%', '5-10%', '10-15%', '60-70%'],
    'Trade-off': [
        'Minimal quality impact',
        'Slower training',
        'Implementation complexity',
        'Slightly less randomness',
        'Requires experimentation',
        'Can be interrupted'
    ]
})

display(optimization_strategies)

## 10. Summary and Best Practices

Let's summarize the key takeaways for successful LLM pretraining.

In [None]:
# Stage-by-stage checklist: the four lists are aligned by position.
checklist = {
    'Stage': [
        '1. Data Preparation',
        '2. Model Architecture',
        '3. Training Setup',
        '4. Distributed Strategy',
        '5. Monitoring',
        '6. Checkpointing',
        '7. Evaluation'
    ],
    'Key Tasks': [
        'Collect, filter, deduplicate data',
        'Choose architecture, size, config',
        'Set hyperparameters, schedule',
        'Select parallelism strategy',
        'Setup logging, alerts, dashboards',
        'Regular saves, test recovery',
        'Validation loss, benchmarks'
    ],
    'Critical Considerations': [
        'Quality > Quantity, diversity matters',
        'Memory requirements, efficiency',
        'Learning rate, warmup crucial',
        'Communication overhead, scaling',
        'Catch issues early, track everything',
        'Failure recovery, reproducibility',
        'Downstream task performance'
    ],
    'Common Pitfalls': [
        'Low quality or biased data',
        'Model too large for hardware',
        'Unstable training, divergence',
        'Inefficient parallelization',
        'Missing critical failures',
        'Lost work from crashes',
        'Overfitting to validation'
    ]
}

df_checklist = pd.DataFrame(checklist)

section_rule = "=" * 80

print("🚀 LLM Pretraining Checklist")
print(section_rule)

# Walk the frame as plain tuples; fields arrive in column order.
for stage, tasks, considerations, pitfalls in df_checklist.itertuples(index=False, name=None):
    print(f"\n{stage}")
    print(f"  ✓ Tasks: {tasks}")
    print(f"  ⚠️  Consider: {considerations}")
    print(f"  ❌ Avoid: {pitfalls}")

# Key metrics to track
print("\n\n📊 Key Metrics to Track During Pretraining:")
print(section_rule)

# Each row pairs a metric with its healthy range and its warning signs.
metrics_table = pd.DataFrame({
    'Metric': [
        'Loss (train/val)',
        'Perplexity',
        'Gradient Norm',
        'Learning Rate',
        'Tokens/Second',
        'GPU Utilization',
        'Memory Usage'
    ],
    'Expected Range': [
        'Decreasing, gap < 0.5',
        '< 10 for good models',
        '0.1 - 5.0',
        'Following schedule',
        '> 10k for efficiency',
        '> 80%',
        '< 90% of available'
    ],
    'Warning Signs': [
        'Sudden spikes, divergence',
        'Not decreasing, > 100',
        '> 10 or near 0',
        'Not changing as expected',
        'Decreasing over time',
        '< 50%',
        'OOM errors'
    ]
})

display(metrics_table)

closing_lines = (
    "\n\n✅ Pretraining Pipeline Complete!",
    "\nRemember: Successful pretraining requires patience, careful monitoring, and robust infrastructure.",
    "Start small, validate your pipeline, then scale up!",
)
for closing_line in closing_lines:
    print(closing_line)