

<div style="background: linear-gradient(90deg, #17a2b8 0%, #0e5a63 60%, #0a3d44 100%); color: white; padding: 18px 25px; margin-bottom: 20px;">
    <div style="display: flex; justify-content: space-between; align-items: baseline;">
        <h1 style="font-family: 'Helvetica Neue', sans-serif; font-size: 24px; margin: 0; font-weight: 300;">
            Lab 2 Part B: Advanced Multi-Armed Bandits
        </h1>
        <span style="font-size: 11px; opacity: 0.9;">© Prof. Dehghani</span>
    </div>
    <p style="font-size: 13px; margin-top: 6px; margin-bottom: 0; opacity: 0.9;">
        IE 7295 Reinforcement Learning | Sutton & Barto Chapter 2 | Advanced Level | 90 minutes
    </p>
</div>

<div style="background: white; padding: 15px 20px; margin-bottom: 12px; border-left: 3px solid #17a2b8;">
    <h3 style="color: #17a2b8; font-size: 14px; margin: 0 0 8px 0; text-transform: uppercase; letter-spacing: 0.5px;">Advanced Bandit Methods</h3>
    <p style="color: #555; line-height: 1.6; margin: 0; font-size: 13px;">
        Building on basic ε-greedy and optimistic initialization, we explore more sophisticated approaches:
        <strong>Upper Confidence Bound (UCB)</strong> uses principled uncertainty estimates for exploration, while
        <strong>Gradient Bandits</strong> learn action preferences through policy gradients. These methods represent
        key advances in balancing exploration and exploitation.
    </p>
</div>

<table style="width: 100%; border-spacing: 12px;">
<tr>
<td style="background: white; padding: 12px 15px; border-top: 3px solid #17a2b8; vertical-align: top; width: 50%;">
    <h4 style="color: #17a2b8; font-size: 13px; margin: 0 0 8px 0; font-weight: 600;">Learning Objectives</h4>
    <ul style="color: #555; line-height: 1.4; margin: 0; padding-left: 18px; font-size: 12px;">
        <li>Understand confidence-based exploration (UCB)</li>
        <li>Implement gradient-based policy learning</li>
        <li>Analyze the role of baselines in gradient methods</li>
        <li>Compare all Chapter 2 methods</li>
        <li>Reproduce Figures 2.4, 2.5, and 2.6</li>
    </ul>
</td>
<td style="background: white; padding: 12px 15px; border-top: 3px solid #00acc1; vertical-align: top; width: 50%;">
    <h4 style="color: #00acc1; font-size: 13px; margin: 0 0 8px 0; font-weight: 600;">Key Concepts</h4>
    <div style="color: #555; font-size: 12px; line-height: 1.6;">
        <div style="padding: 2px 0;"><code style="background: #e0f7fa; padding: 1px 5px; color: #006064;">UCB</code> = upper confidence bound</div>
        <div style="padding: 2px 0;"><code style="background: #e0f7fa; padding: 1px 5px; color: #006064;">H_t(a)</code> = action preferences</div>
        <div style="padding: 2px 0;"><code style="background: #e0f7fa; padding: 1px 5px; color: #006064;">π_t(a)</code> = action probabilities</div>
        <div style="padding: 2px 0;"><code style="background: #e0f7fa; padding: 1px 5px; color: #006064;">baseline</code> = average reward</div>
    </div>
</td>
</tr>
</table>

## Environment Setup and Utilities

In [None]:
# ============================================
# CELL 1: Environment Configuration and Imports
# Purpose: Import libraries and configure experimental parameters for advanced bandits
# ============================================

import numpy as np
import matplotlib.pyplot as plt
from typing import Tuple, List, Optional
import warnings
warnings.filterwarnings('ignore')

# Define the pretty_print function directly in the notebook as the external utility module is not available.
def pretty_print(title: str, content: str, style: str = 'info'):
    """
    Print a titled, plain-text message box to stdout.

    Plain-text stand-in for a richer notebook display helper. Every ``<br>``
    marker in ``content`` is turned into a newline; any other markup
    (e.g. ``<strong>``) is printed verbatim.

    Args:
        title: Heading shown between the ``---`` markers.
        content: Message body, using ``<br>`` as the line separator.
        style: Style label such as 'info', 'success', 'warning', 'danger',
            'result' or 'note'. Accepted so call sites stay unchanged, but
            it has no effect on the plain-text output.
    """
    # `style` is intentionally unused: plain-text output has no colors or
    # borders, so there is nothing to look up for it.
    print(f"\n--- {title} ---")
    print(content.replace('<br>', '\n'))
    print("--------------------\n")


# Enhanced color scheme matching Sutton & Barto figures.
# Keys are looked up by the plotting cells via COLORS['<method>'].
COLORS = {
    'ucb': '#0000FF',           # Blue for UCB (matches Figure 2.4)
    'epsilon_greedy': '#808080', # Gray for ε-greedy (matches Figure 2.4)
    'gradient_with': '#0000FF',  # Blue for gradient with baseline (same hex value as 'ucb')
    'gradient_without': '#8B4513', # Brown for gradient without baseline
    'optimistic': '#00FF00',     # Green for optimistic initialization
    'greedy': '#FF0000'          # Red for greedy
}

# Standard experimental parameters from Sutton & Barto.
# K is read as a module-level global by the bandit/agent functions below;
# STEPS and RUNS are passed explicitly to the experiment runners.
K = 10          # Number of arms
STEPS = 1000    # Time steps per run
RUNS = 2000     # Number of independent runs for statistical significance

# Configure matplotlib for publication-quality plots
# (applies globally to every figure created after this cell runs).
plt.rcParams.update({
    'figure.dpi': 100,
    'font.size': 10,
    'axes.labelsize': 11,
    'axes.titlesize': 12,
    'legend.fontsize': 10,
    'lines.linewidth': 1.5
})

# Status message confirming the configuration that is now in effect.
pretty_print("Environment Initialized",
             f"Configuration: {K} arms, {STEPS} steps, {RUNS} runs<br>" +
             "Ready for advanced bandit algorithms: UCB and Gradient methods",
             style='success')

In [None]:
# ============================================
# CELL 2: Basic Bandit Environment Functions
# Purpose: Reusable functions for bandit problem generation and reward sampling
# ============================================

def create_bandit(mean_shift: float = 0.0) -> np.ndarray:
    """
    Draw the true action values for one k-armed bandit problem.

    Each of the K arms (module-level constant) gets q*(a) ~ N(mean_shift, 1).
    With the default shift of 0 this is the standard testbed; a non-zero
    shift moves every arm's mean by the same amount (e.g. +4 for Figure 2.5).

    Args:
        mean_shift: Constant added to every sampled action value.

    Returns:
        q_true: Array of K true action values.
    """
    standard_values = np.random.randn(K)
    return standard_values + mean_shift

def get_reward(action: int, q_true: np.ndarray) -> float:
    """
    Draw one noisy reward for the chosen arm.

    Reward model: R_t ~ N(q*(A_t), 1), i.e. the arm's true value plus
    unit-variance Gaussian noise.

    Args:
        action: Index of the pulled arm.
        q_true: True action values, indexed by arm.

    Returns:
        reward: The sampled reward.
    """
    noise = np.random.randn()
    return q_true[action] + noise

def softmax(preferences: np.ndarray) -> np.ndarray:
    """
    Turn action preferences into a probability distribution.

    π(a) = exp(H(a)) / Σ_b exp(H(b)). The largest preference is subtracted
    before exponentiating; this leaves the result unchanged mathematically
    but keeps exp() from overflowing for large preferences.

    Args:
        preferences: Action preference values H_t(a).

    Returns:
        probabilities: Action selection probabilities π_t(a), summing to 1.
    """
    shifted = preferences - np.max(preferences)
    weights = np.exp(shifted)
    return weights / weights.sum()

pretty_print("Core Functions Ready",
             "Bandit environment and utility functions loaded<br>" +
             "Functions: create_bandit(), get_reward(), softmax()",
             style='info')

## Upper Confidence Bound (UCB) Action Selection

### Theoretical Foundation

The **Upper Confidence Bound (UCB)** method addresses the exploration-exploitation dilemma by systematically considering uncertainty in action value estimates. Unlike ε-greedy's random exploration, UCB uses a **principled confidence interval approach**.

### The UCB Algorithm

UCB selects actions according to:
$$A_t = \arg\max_a \left[ Q_t(a) + c\sqrt{\frac{\ln t}{N_t(a)}} \right]$$

Where:
- **$Q_t(a)$**: Current value estimate (exploitation term)
- **$c\sqrt{\frac{\ln t}{N_t(a)}}$**: Upper confidence bound (exploration term)
- **$c > 0$**: Controls exploration level
- **$t$**: Current time step
- **$N_t(a)$**: Number of times action $a$ selected

### Why UCB Works

1. **Uncertainty Decreases with Experience**: The bonus scales as $\frac{1}{\sqrt{N_t(a)}}$, so it shrinks as an action is tried more often
2. **Time-Dependent Exploration**: $\ln t$ ensures exploration continues (slowly) over time  
3. **Optimism Under Uncertainty**: Always selects action with highest potential value
4. **Less Tuning-Sensitive**: Exploration is self-balancing and directed at uncertain actions rather than random as with ε, although the confidence level $c$ must still be chosen

### Theoretical Guarantees

UCB provides **logarithmic regret bounds**: cumulative regret grows as $O(\ln t)$, meaning the average per-step regret approaches zero as $t \to \infty$.

In [None]:
# ============================================
# CELL 3: Upper Confidence Bound Implementation
# Purpose: Implement UCB action selection with principled uncertainty-based exploration
# ============================================

def ucb_action_selection(Q: np.ndarray, N: np.ndarray, t: int, c: float = 2.0) -> int:
    """
    Pick an action by the Upper Confidence Bound rule.

    A_t = argmax_a [ Q_t(a) + c * sqrt(ln(t) / N_t(a)) ]

    The first term rewards arms that have paid off so far; the second is an
    uncertainty bonus that is large for rarely-tried arms and shrinks as
    N_t(a) grows. Any arm with a zero count is selected before the formula
    is applied at all, which also avoids dividing by zero.

    Args:
        Q: Current action value estimates Q_t(a).
        N: Action selection counts N_t(a).
        t: Current time step (must be > 0 so that ln(t) is defined).
        c: Confidence parameter scaling the exploration bonus.

    Returns:
        action: Selected action index (ties are broken uniformly at random).
    """
    # Arms that were never pulled take priority; choose among them at random.
    never_tried = np.flatnonzero(N == 0)
    if never_tried.size > 0:
        return np.random.choice(never_tried)

    # Estimated value plus exploration bonus for every arm.
    exploration_bonus = c * np.sqrt(np.log(t) / N)
    scores = Q + exploration_bonus

    # Uniform random choice among all arms sharing the top score.
    top_arms = np.flatnonzero(scores == scores.max())
    return np.random.choice(top_arms)

def update_action_values(Q: np.ndarray, N: np.ndarray, action: int, reward: float) -> None:
    """
    Fold one observed reward into the sample-average estimate of an arm.

    Incremental mean: Q_{n+1} = Q_n + (1/n) * (R_n - Q_n), which equals the
    arithmetic mean of all rewards seen for that arm without storing them.

    Args:
        Q: Action value estimates (modified in place).
        N: Action counts (modified in place).
        action: Index of the arm that was pulled.
        reward: Reward observed for that pull.
    """
    # Count this pull first so the step size is 1/n with n >= 1.
    N[action] += 1
    prediction_error = reward - Q[action]
    Q[action] += prediction_error / N[action]

pretty_print("UCB Algorithm Implemented",
             "Upper Confidence Bound with systematic uncertainty-based exploration<br>" +
             "Formula: A_t = argmax[Q_t(a) + c√(ln(t)/N_t(a))]<br>" +
             "Default c=2.0 provides theoretical guarantees",
             style='success')

In [None]:
# ============================================
# CELL 4: UCB vs ε-greedy Experiment Setup
# Purpose: Run comparative experiment to reproduce Figure 2.4 results
# ============================================

def run_ucb_experiment(c: float = 2.0, runs: int = 2000, steps: int = 1000) -> np.ndarray:
    """
    Average the reward curve of a UCB agent over many bandit problems.

    Every run draws a new standard testbed problem (create_bandit with no
    shift), starts from zero estimates and zero counts, and then for each
    step selects with ucb_action_selection, samples a reward and applies the
    sample-average update.

    Args:
        c: UCB confidence parameter.
        runs: Number of independent bandit problems.
        steps: Number of time steps per run.

    Returns:
        avg_rewards: Length-`steps` array; entry i is the reward at step i
        averaged across runs.
    """
    reward_log = np.zeros((runs, steps))

    for run_idx in range(runs):
        true_values = create_bandit()
        estimates = np.zeros(K)
        pull_counts = np.zeros(K)

        # t is 1-based because the UCB bonus uses ln(t).
        for t in range(1, steps + 1):
            chosen = ucb_action_selection(estimates, pull_counts, t, c)
            observed = get_reward(chosen, true_values)
            update_action_values(estimates, pull_counts, chosen, observed)
            reward_log[run_idx, t - 1] = observed

    return reward_log.mean(axis=0)

def run_epsilon_greedy_baseline(epsilon: float = 0.1, runs: int = 2000, steps: int = 1000) -> np.ndarray:
    """
    Average the reward curve of an ε-greedy agent over many bandit problems.

    Serves as the comparison curve for UCB. At each step the agent explores
    (uniform random arm) with probability `epsilon`, otherwise it takes a
    greedy arm with ties broken at random; estimates use sample averages.

    Args:
        epsilon: Exploration probability.
        runs: Number of independent bandit problems.
        steps: Number of time steps per run.

    Returns:
        avg_rewards: Length-`steps` array; entry i is the reward at step i
        averaged across runs.
    """
    reward_log = np.zeros((runs, steps))

    for run_idx in range(runs):
        true_values = create_bandit()
        estimates = np.zeros(K)
        pull_counts = np.zeros(K)

        for t in range(steps):
            explore = np.random.random() < epsilon
            if explore:
                chosen = np.random.randint(K)
            else:
                greedy_arms = np.flatnonzero(estimates == estimates.max())
                chosen = np.random.choice(greedy_arms)

            observed = get_reward(chosen, true_values)
            update_action_values(estimates, pull_counts, chosen, observed)
            reward_log[run_idx, t] = observed

    return reward_log.mean(axis=0)

pretty_print("Experiment Functions Ready",
             "UCB vs ε-greedy comparison experiments prepared<br>" +
             "Will reproduce Figure 2.4 results",
             style='info')

In [None]:
# ============================================
# CELL 5: Run UCB vs ε-greedy Experiments
# Purpose: Generate performance data for Figure 2.4 reproduction
# ============================================

# Announce the experiment configuration before the (slow) simulations start.
pretty_print(
    "Starting UCB Experiments",
    "<br>".join([
        "Running UCB vs ε-greedy comparison experiments",
        f"Configuration: {RUNS} runs × {STEPS} steps each",
    ]),
    style='info',
)

# UCB agent with the standard confidence level c=2.
print("  Running UCB (c=2.0)...")
ucb_rewards = run_ucb_experiment(c=2.0, runs=RUNS, steps=STEPS)

# ε-greedy comparison agent.
print("  Running ε-greedy (ε=0.1)...")
epsilon_rewards = run_epsilon_greedy_baseline(epsilon=0.1, runs=RUNS, steps=STEPS)

# Compare the run-averaged reward at the last time step of each curve;
# `improvement` is UCB's gain relative to ε-greedy, in percent.
ucb_final = ucb_rewards[-1]
epsilon_final = epsilon_rewards[-1]
improvement = 100 * ((ucb_final - epsilon_final) / epsilon_final)

pretty_print(
    "Experiment Results",
    "<br>".join([
        f"UCB final average reward: {ucb_final:.3f}",
        f"ε-greedy final average reward: {epsilon_final:.3f}",
        f"<strong>UCB improvement: {improvement:.1f}%</strong>",
    ]),
    style='result',
)

pretty_print(
    "Key Insight",
    "<br>".join([
        "UCB's principled uncertainty-based exploration outperforms ε-greedy's random exploration",
        "The confidence bounds guide exploration to promising actions",
    ]),
    style='note',
)

In [None]:
# ============================================
# CELL 6: Visualize UCB vs ε-greedy Performance
# Purpose: Create Figure 2.4 reproduction showing UCB superiority
# ============================================

plt.figure(figsize=(10, 6))

# Plot against an explicit 1-based step axis. Without explicit x values
# matplotlib places the first sample at x=0, which both shifts every point
# one step to the left and hides step 1 behind the x-limit that starts at 1.

# Plot UCB performance (blue line to match Figure 2.4)
plt.plot(np.arange(1, len(ucb_rewards) + 1), ucb_rewards,
         color=COLORS['ucb'], label='UCB c = 2', linewidth=2)

# Plot ε-greedy performance (gray line to match Figure 2.4)
plt.plot(np.arange(1, len(epsilon_rewards) + 1), epsilon_rewards,
         color=COLORS['epsilon_greedy'], label='ε-greedy ε = 0.1', linewidth=2)

# Formatting to match Sutton & Barto style
plt.xlabel('Steps', fontsize=12)
plt.ylabel('Average\nreward', fontsize=12)
# Derive the right-hand limit from the data so the plot stays correct when
# the experiments are run with a different number of steps.
plt.xlim(1, max(len(ucb_rewards), len(epsilon_rewards)))
plt.ylim(0, 1.5)
plt.legend(loc='lower right', fontsize=11)
plt.grid(True, alpha=0.3)

# Title matching Figure 2.4
plt.title('Figure 2.4: Average performance of UCB action selection on the 10-armed testbed.\n' +
          'As shown, UCB generally performs better than ε-greedy action selection, except in\n' +
          'the first k steps, when it selects randomly among the as-yet-untried actions.',
          fontsize=11, pad=20)

plt.tight_layout()
plt.show()

# Analysis of results (static commentary, not computed from the data above)
analysis = (
    "<strong>UCB Performance Analysis:</strong><br><br>" +
    "• <strong>Initial Phase (Steps 1-10):</strong> UCB tries each arm once (systematic initialization)<br>" +
    "• <strong>Learning Phase (Steps 10-200):</strong> Rapid improvement as confidence bounds guide exploration<br>" +
    "• <strong>Convergence (Steps 200+):</strong> Superior long-term performance due to principled exploration<br><br>" +
    "<strong>Why UCB Outperforms ε-greedy:</strong><br>" +
    "• Systematic vs random exploration<br>" +
    "• Uncertainty-based action selection<br>" +
    "• Self-tuning exploration (no ε parameter needed)"
)

pretty_print("Figure 2.4 Analysis", analysis, style='result')

## Gradient Bandit Algorithms

### Theoretical Foundation

**Gradient bandit algorithms** take a fundamentally different approach: instead of estimating action values, they learn **action preferences** and use **gradient ascent** to maximize expected reward.

### The Gradient Bandit Framework

**Action Preferences**: $H_t(a) \in \mathbb{R}$ (can be positive or negative)

**Action Probabilities**: $\pi_t(a) = \frac{e^{H_t(a)}}{\sum_{b=1}^k e^{H_t(b)}}$ (softmax distribution)

**Preference Updates**:
- For selected action: $H_{t+1}(A_t) = H_t(A_t) + \alpha(R_t - \bar{R}_t)(1 - \pi_t(A_t))$
- For other actions: $H_{t+1}(a) = H_t(a) - \alpha(R_t - \bar{R}_t)\pi_t(a)$

Where $\bar{R}_t$ is the **baseline** (average reward).

### The Role of Baselines

**With Baseline** ($\bar{R}_t$):
- Reduces variance in gradient estimates
- Provides relative reward signal: "better/worse than average"
- Essential for good performance

**Without Baseline** ($\bar{R}_t = 0$):
- Higher variance, slower convergence
- All positive rewards increase selected action preference
- Can work but suboptimal

### Policy Gradient Connection

This is actually the **REINFORCE** algorithm applied to bandits; each update is a one-sample estimate of the gradient:
$$\nabla_H \mathbb{E}[R_t] = \mathbb{E}\left[(R_t - \bar{R}_t)\nabla_H \ln \pi_t(A_t)\right]$$

In [None]:
# ============================================
# CELL 7: Gradient Bandit Algorithm Implementation
# Purpose: Implement policy gradient approach with preference learning
# ============================================

def gradient_bandit_action_selection(H: np.ndarray) -> Tuple[int, np.ndarray]:
    """
    Sample an action from the softmax policy defined by the preferences.

    Softmax policy: π_t(a) = exp(H_t(a)) / Σ_b exp(H_t(b))

    Selection is stochastic: arms with higher preference are drawn more
    often, but the choice is always a random draw from π_t.

    The number of arms is taken from the preference vector itself, so the
    function works for any number of arms and does not depend on the
    module-level K.

    Args:
        H: Action preferences H_t(a), one entry per arm.

    Returns:
        action: Selected action (sampled from the softmax distribution).
        probabilities: Action selection probabilities π_t(a).
    """
    probabilities = softmax(H)

    # Draw an arm index in [0, number of arms) according to π_t.
    action = np.random.choice(len(probabilities), p=probabilities)

    return action, probabilities

def update_preferences(H: np.ndarray, action: int, reward: float,
                      probabilities: np.ndarray, alpha: float,
                      baseline: Optional[float] = None) -> None:
    """
    Apply one gradient-bandit update to the action preferences.

    For the selected action A_t:
        H_{t+1}(A_t) = H_t(A_t) + α(R_t - baseline)(1 - π_t(A_t))
    For every other action a ≠ A_t:
        H_{t+1}(a)   = H_t(a)   - α(R_t - baseline)π_t(a)

    Both cases are the single vector update
        H += α(R_t - baseline)(onehot(A_t) - π_t),
    which is what the code applies. The number of arms is taken from the
    inputs, so the function does not depend on the module-level K.

    Intuition:
    - reward > baseline: the selected arm's preference rises, the others fall
    - reward < baseline: the selected arm's preference falls, the others rise

    Args:
        H: Action preferences (float array, modified in place).
        action: Index of the action that was selected.
        reward: Observed reward.
        probabilities: Action probabilities π_t(a) used to select `action`.
        alpha: Step size.
        baseline: Reference reward to subtract; None means no baseline
            (equivalent to a baseline of 0).
    """
    # Reward relative to the baseline; with no baseline the raw reward is used.
    advantage = reward if baseline is None else reward - baseline

    # Indicator of the selected arm: 1 at `action`, 0 elsewhere.
    selected = np.zeros_like(probabilities)
    selected[action] = 1.0

    # (selected - π) is (1 - π) for the chosen arm and -π for the rest.
    H += alpha * advantage * (selected - probabilities)

def update_baseline(baseline_sum: float, baseline_count: int, reward: float) -> Tuple[float, float, int]:
    """
    Extend the running mean of rewards with one more observation.

    The baseline is the plain average of every reward passed in so far,
    including the one supplied to this call:
    baseline_t = (1/t) * Σ_{i=1}^t R_i

    The caller keeps the running sum and count and passes them back in on
    the next call; nothing is stored inside the function.

    Args:
        baseline_sum: Sum of all rewards before this one.
        baseline_count: Number of rewards before this one.
        reward: New reward to include.

    Returns:
        baseline: Mean reward after including `reward`.
        baseline_sum: Updated sum.
        baseline_count: Updated count.
    """
    new_sum = baseline_sum + reward
    new_count = baseline_count + 1
    return new_sum / new_count, new_sum, new_count

pretty_print("Gradient Bandit Implemented",
             "Policy gradient method with preference learning<br>" +
             "Features: Softmax policy, REINFORCE updates, baseline for variance reduction<br>" +
             "Updates: H_t+1(a) = H_t(a) + α(R_t - baseline)∇ln(π_t(a))",
             style='success')

In [None]:
# ============================================
# CELL 8: Gradient Bandit Experiments Setup
# Purpose: Compare gradient bandits with and without baseline for Figure 2.5
# ============================================

def run_gradient_bandit_experiment(alpha: float, use_baseline: bool = True,
                                  mean_shift: float = 4.0, runs: int = 2000,
                                  steps: int = 1000) -> np.ndarray:
    """
    Measure how often a gradient bandit agent picks the optimal arm.

    Each run draws a new problem with q*(a) ~ N(mean_shift, 1); the default
    shift of +4 is the Figure 2.5 setting. Preferences start at zero. At
    every step the agent samples from its softmax policy, observes a reward
    and updates its preferences. When `use_baseline` is True the running
    mean of all rewards so far (including the current one) is used as the
    baseline; otherwise no baseline is passed to the update.

    Args:
        alpha: Step size for preference updates.
        use_baseline: Whether to subtract the running-mean reward baseline.
        mean_shift: Mean of the true action values (4.0 for Figure 2.5).
        runs: Number of independent bandit problems.
        steps: Number of time steps per run.

    Returns:
        pct_optimal: Length-`steps` array with the percentage of runs in
        which the optimal arm was chosen at each step.
    """
    optimal_hits = np.zeros((runs, steps))

    for run_idx in range(runs):
        true_values = create_bandit(mean_shift=mean_shift)
        best_arm = np.argmax(true_values)

        preferences = np.zeros(K)
        reward_total, reward_count = 0.0, 0

        for t in range(steps):
            chosen, policy = gradient_bandit_action_selection(preferences)
            observed = get_reward(chosen, true_values)

            if use_baseline:
                reference, reward_total, reward_count = update_baseline(
                    reward_total, reward_count, observed)
            else:
                reference = None

            update_preferences(preferences, chosen, observed, policy, alpha, reference)

            # Stored as 1.0 / 0.0 so the column mean is the hit rate.
            optimal_hits[run_idx, t] = (chosen == best_arm)

    return optimal_hits.mean(axis=0) * 100

pretty_print("Gradient Experiments Ready",
             "Prepared to test gradient bandits with/without baseline<br>" +
             "Will use shifted testbed (mean=+4) to demonstrate baseline importance<br>" +
             "This reproduces Figure 2.5 experimental setup",
             style='info')

In [None]:
# ============================================
# CELL 9: Execute Gradient Bandit Experiments
# Purpose: Generate data for Figure 2.5 showing baseline importance
# ============================================

# Announce what is about to run (the four simulations below are slow).
pretty_print(
    "Starting Gradient Bandit Experiments",
    "<br>".join([
        "Testing gradient bandits with and without reward baseline",
        "Using shifted testbed (q* near +4) to demonstrate baseline effect",
    ]),
    style='info',
)

# α=0.1, with baseline
print("  Running gradient bandit WITH baseline (α=0.1)...")
gradient_with_baseline = run_gradient_bandit_experiment(
    alpha=0.1,
    use_baseline=True,
    mean_shift=4.0,
    runs=RUNS,
    steps=STEPS,
)

# α=0.1, without baseline
print("  Running gradient bandit WITHOUT baseline (α=0.1)...")
gradient_without_baseline = run_gradient_bandit_experiment(
    alpha=0.1,
    use_baseline=False,
    mean_shift=4.0,
    runs=RUNS,
    steps=STEPS,
)

# Second step size (α=0.4) to complete the four curves of Figure 2.5
print("  Running gradient bandit WITH baseline (α=0.4)...")
gradient_with_baseline_04 = run_gradient_bandit_experiment(
    alpha=0.4,
    use_baseline=True,
    mean_shift=4.0,
    runs=RUNS,
    steps=STEPS,
)

print("  Running gradient bandit WITHOUT baseline (α=0.4)...")
gradient_without_baseline_04 = run_gradient_bandit_experiment(
    alpha=0.4,
    use_baseline=False,
    mean_shift=4.0,
    runs=RUNS,
    steps=STEPS,
)

# Compare the α=0.1 curves at the last time step; `improvement` is the
# gain of the baseline variant relative to the no-baseline variant, in percent.
with_final = gradient_with_baseline[-1]
without_final = gradient_without_baseline[-1]
improvement = 100 * ((with_final - without_final) / without_final)

pretty_print(
    "Gradient Bandit Results",
    "<br>".join([
        f"With baseline (α=0.1): {with_final:.1f}% optimal",
        f"Without baseline (α=0.1): {without_final:.1f}% optimal",
        f"<strong>Baseline improvement: {improvement:.1f}%</strong>",
    ]),
    style='result',
)

pretty_print(
    "Why Baselines Matter",
    "<br>".join([
        "With shifted rewards (all positive), no baseline means:",
        "• Every reward increases selected action preference",
        "• No relative comparison → poor exploration",
        "• Baseline provides 'better/worse than average' signal",
    ]),
    style='note',
)

In [None]:
# ============================================
# CELL 10: Visualize Gradient Bandit Results
# Purpose: Create Figure 2.5 reproduction showing baseline importance
# ============================================

plt.figure(figsize=(10, 6))

# One entry per curve of Figure 2.5: (data, line color, legend label).
# Blue tones are the baseline variants, brown tones the no-baseline variants.
figure_2_5_curves = [
    (gradient_with_baseline, COLORS['gradient_with'], 'α = 0.1\nwith baseline'),
    (gradient_with_baseline_04, 'lightblue', 'α = 0.4\nwith baseline'),
    (gradient_without_baseline, COLORS['gradient_without'], 'α = 0.1\nwithout baseline'),
    (gradient_without_baseline_04, 'tan', 'α = 0.4\nwithout baseline'),
]

# Plot against an explicit 1-based step axis. Without explicit x values
# matplotlib places the first sample at x=0, which both shifts every point
# one step to the left and hides step 1 behind the x-limit that starts at 1.
for curve_values, curve_color, curve_label in figure_2_5_curves:
    plt.plot(np.arange(1, len(curve_values) + 1), curve_values,
             color=curve_color, label=curve_label, linewidth=2)

# Formatting to match Figure 2.5
plt.xlabel('Steps', fontsize=12)
plt.ylabel('%\nOptimal\naction', fontsize=12)
# Derive the right-hand limit from the data so the plot stays correct when
# the experiments are run with a different number of steps.
plt.xlim(1, max(len(curve_values) for curve_values, _, _ in figure_2_5_curves))
plt.ylim(0, 100)
plt.legend(loc='lower right', fontsize=10)
plt.grid(True, alpha=0.3)

# Title matching Figure 2.5
plt.title('Figure 2.5: Average performance of the gradient bandit algorithm with and without\n' +
          'a reward baseline on the 10-armed testbed when the q*(a) are chosen to be near +4\n' +
          'rather than near zero.',
          fontsize=11, pad=20)

plt.tight_layout()
plt.show()

# Detailed analysis (static commentary, not computed from the data above)
analysis = (
    "<strong>Gradient Bandit Analysis:</strong><br><br>" +
    "<strong>With Baseline:</strong><br>" +
    "• Rapid learning and high performance<br>" +
    "• Baseline provides relative reward signal<br>" +
    "• Higher α (0.4) learns faster but converges to same level<br><br>" +
    "<strong>Without Baseline:</strong><br>" +
    "• Much slower learning and lower asymptotic performance<br>" +
    "• All positive rewards increase selected action preference<br>" +
    "• Poor exploration due to lack of relative comparison<br><br>" +
    "<strong>Key Insight:</strong> Baselines are crucial for gradient-based methods!"
)

pretty_print("Figure 2.5 Analysis", analysis, style='result')

## Comprehensive Method Comparison

### Parameter Study Overview

Figure 2.6 provides a **parameter study** comparing all bandit methods across their respective parameter ranges:

- **ε-greedy**: Parameter ε (exploration probability)
- **Gradient bandit**: Parameter α (learning rate)  
- **UCB**: Parameter c (confidence level)
- **Optimistic initialization**: Parameter Q₀ (initial values)

This analysis reveals the **sensitivity** of each method to its parameters and identifies optimal settings.

In [None]:
# ============================================
# CELL 11: Parameter Study Implementation
# Purpose: Comprehensive comparison of all methods across parameter ranges for Figure 2.6
# ============================================

def parameter_study_epsilon_greedy(epsilon_values: List[float], runs: int = 2000, steps: int = 1000) -> List[float]:
    """
    Score ε-greedy for each exploration probability in `epsilon_values`.

    For every ε the full ε-greedy experiment is run and its reward curve is
    collapsed to a single number: the mean reward over all steps and runs.

    Args:
        epsilon_values: Exploration probabilities to evaluate.
        runs: Number of independent bandit problems per setting.
        steps: Number of time steps per run.

    Returns:
        One mean reward per entry of `epsilon_values`, in the same order.
    """
    mean_rewards = []

    for epsilon in epsilon_values:
        print(f"    Testing ε-greedy with ε={epsilon:.4f}...")
        reward_curve = run_epsilon_greedy_baseline(epsilon=epsilon, runs=runs, steps=steps)
        # Averaging the per-step curve gives the mean reward over all steps.
        mean_rewards.append(reward_curve.mean())

    return mean_rewards

def parameter_study_ucb(c_values: List[float], runs: int = 2000, steps: int = 1000) -> List[float]:
    """
    Score UCB for each confidence parameter in `c_values`.

    For every c the full UCB experiment is run and its reward curve is
    collapsed to a single number: the mean reward over all steps and runs.
    Small c means little exploration bonus; large c means a lot.

    Args:
        c_values: Confidence parameters to evaluate.
        runs: Number of independent bandit problems per setting.
        steps: Number of time steps per run.

    Returns:
        One mean reward per entry of `c_values`, in the same order.
    """
    mean_rewards = []

    for confidence in c_values:
        print(f"    Testing UCB with c={confidence:.4f}...")
        reward_curve = run_ucb_experiment(c=confidence, runs=runs, steps=steps)
        mean_rewards.append(reward_curve.mean())

    return mean_rewards

def parameter_study_gradient(alpha_values: List[float], runs: int = 2000, steps: int = 1000) -> List[float]:
    """
    Sweep the gradient-bandit step size α and report the true average reward.

    For every α the gradient bandit algorithm (softmax over action
    preferences, with an average-reward baseline) is simulated directly so
    that the reported number is the actual mean reward over all runs and
    steps. This puts the gradient curve on the same scale as the other
    methods in the parameter study, instead of rescaling a
    percent-optimal-action curve by a rough constant.

    The simulation relies on the module-level helpers ``create_bandit``,
    ``get_reward`` and ``K`` exactly as ``parameter_study_optimistic`` does.

    Args:
        alpha_values: Step sizes to evaluate, in order.
        runs: Number of independent bandit problems per step size.
        steps: Number of time steps per run.

    Returns:
        One average reward per entry of ``alpha_values``, in the same order.
    """
    results = []

    for alpha in alpha_values:
        print(f"    Testing gradient bandit with α={alpha:.4f}...")

        all_rewards = np.zeros((runs, steps))

        for run in range(runs):
            q_true = create_bandit()
            H = np.zeros(K)  # Action preferences start out equal
            baseline = 0.0

            for step in range(steps):
                # Softmax policy; subtracting the max keeps exp() finite
                # even for the large step sizes in the sweep
                exp_H = np.exp(H - np.max(H))
                pi = exp_H / np.sum(exp_H)

                action = np.random.choice(K, p=pi)
                reward = get_reward(action, q_true)

                # Baseline is the mean of rewards seen before this step;
                # on the very first step it is defined as that first reward
                if step == 0:
                    baseline = reward
                advantage = reward - baseline

                # Preference update: chosen action moves by (1 - pi),
                # every other action moves by -pi
                H -= alpha * advantage * pi
                H[action] += alpha * advantage

                # Fold the current reward into the running mean
                baseline += (reward - baseline) / (step + 1)

                all_rewards[run, step] = reward

        results.append(np.mean(all_rewards))

    return results

def parameter_study_optimistic(q0_values: List[float], runs: int = 2000, steps: int = 1000,
                               alpha: float = 0.1) -> List[float]:
    """
    Sweep the initial action-value estimate Q₀ for a purely greedy agent.

    For every Q₀ a greedy agent (ties broken uniformly at random) is run on
    ``runs`` freshly created bandits for ``steps`` steps, updating its
    estimates with a constant step size.

    Args:
        q0_values: Initial action-value estimates to evaluate, in order.
        runs: Number of independent bandit problems per Q₀.
        steps: Number of time steps per run.
        alpha: Constant step size for the value update. Defaults to 0.1,
            the value that was previously hard-coded.

    Returns:
        One average reward (over all runs and steps) per entry of
        ``q0_values``, in the same order.
    """
    results = []

    for q0 in q0_values:
        print(f"    Testing optimistic with Q₀={q0:.4f}...")

        all_rewards = np.zeros((runs, steps))

        for run in range(runs):
            q_true = create_bandit()
            Q = np.full(K, q0, dtype=float)  # Every arm starts at Q₀

            for step in range(steps):
                # Greedy selection with uniform random tie-breaking
                best_actions = np.flatnonzero(Q == np.max(Q))
                action = np.random.choice(best_actions)

                reward = get_reward(action, q_true)

                # Constant step-size update; no per-action visit counter is
                # needed because the step size does not depend on it
                Q[action] += alpha * (reward - Q[action])

                all_rewards[run, step] = reward

        results.append(np.mean(all_rewards))

    return results

# Confirm that the sweep helpers defined in this cell are ready to use
pretty_print(
    "Parameter Study Functions Ready",
    "Prepared to test all methods across their parameter ranges<br>"
    "Will generate data for Figure 2.6 parameter study comparison",
    style='info',
)

In [None]:
# ============================================
# CELL 12: Execute Parameter Study Experiments
# Purpose: Sweep every method over its parameter grid and report the best setting
# ============================================

pretty_print(
    "Starting Parameter Study",
    "Testing all bandit methods across parameter ranges<br>"
    "This will take several minutes...",
    style='warning',
)

# Parameter grids, all powers of two (matching Figure 2.6)
epsilon_values = [1/128, 1/64, 1/32, 1/16, 1/8, 1/4]  # ε-greedy: exploration probability
alpha_values = [1/32, 1/16, 1/8, 1/4, 1/2, 1, 2, 4]   # Gradient bandit: step size
c_values = [1/16, 1/8, 1/4, 1/2, 1, 2, 4]             # UCB: exploration coefficient
q0_values = [1/4, 1/2, 1, 2, 4]                       # Optimistic greedy: initial estimate

# Fewer runs than the sweep functions' default of 2000, to save time
study_runs = 1000
study_steps = 1000

# The sweeps run in a fixed order: ε-greedy, UCB, gradient, optimistic
print("Testing ε-greedy methods...")
epsilon_results = parameter_study_epsilon_greedy(
    epsilon_values, runs=study_runs, steps=study_steps
)

print("Testing UCB methods...")
ucb_results = parameter_study_ucb(
    c_values, runs=study_runs, steps=study_steps
)

print("Testing gradient bandit methods...")
gradient_results = parameter_study_gradient(
    alpha_values, runs=study_runs, steps=study_steps
)

print("Testing optimistic initialization...")
optimistic_results = parameter_study_optimistic(
    q0_values, runs=study_runs, steps=study_steps
)

pretty_print(
    "Parameter Study Complete",
    "All methods tested across parameter ranges<br>"
    "Ready to create Figure 2.6 parameter comparison plot",
    style='success',
)

# For each method, pick the grid value whose result is highest
# (np.argmax returns the first index on ties)
best_epsilon = epsilon_values[int(np.argmax(epsilon_results))]
best_c = c_values[int(np.argmax(ucb_results))]
best_alpha = alpha_values[int(np.argmax(gradient_results))]
best_q0 = q0_values[int(np.argmax(optimistic_results))]

pretty_print(
    "Optimal Parameters Found",
    "<br>".join((
        f"Best ε-greedy: ε = {best_epsilon:.4f}",
        f"Best UCB: c = {best_c:.4f}",
        f"Best gradient: α = {best_alpha:.4f}",
        f"Best optimistic: Q₀ = {best_q0:.4f}",
    )),
    style='result',
)

In [None]:
# ============================================
# CELL 13: Create Parameter Study Visualization
# Purpose: Draw the Figure 2.6-style comparison of all methods across their parameters
# ============================================

plt.figure(figsize=(10, 6))

# One entry per curve: (x values, y values, line/marker format, colour, legend label).
# The list order fixes both the draw order and the legend order.
curve_specs = [
    (epsilon_values, epsilon_results, 'o-', COLORS['greedy'], 'ε-greedy'),
    (alpha_values, gradient_results, 's-', COLORS['optimistic'], 'gradient bandit'),
    (c_values, ucb_results, '^-', COLORS['ucb'], 'UCB'),
    (q0_values, optimistic_results, 'd-', 'black',
     'greedy with\noptimistic\ninitialization\nα = 0.1'),
]
for param_grid, scores, line_format, curve_color, legend_label in curve_specs:
    plt.plot(param_grid, scores, line_format, color=curve_color,
             label=legend_label, linewidth=2, markersize=6)

# Axis formatting to match Figure 2.6
plt.xlabel('Parameter Value', fontsize=12)
plt.ylabel('Average\nreward\nover first\n1000 steps', fontsize=11)
plt.xscale('log', base=2)  # All parameter grids are powers of two
plt.xlim(1/128, 4)
plt.ylim(1.0, 1.5)

# Label the ticks as fractions rather than decimals
xticks = [1/128, 1/64, 1/32, 1/16, 1/8, 1/4, 1/2, 1, 2, 4]
xtick_labels = ['1/128', '1/64', '1/32', '1/16', '1/8', '1/4', '1/2', '1', '2', '4']
plt.xticks(xticks, xtick_labels)

# Parameter symbols placed at fixed data coordinates: (x, y, symbol, colour)
symbol_specs = [
    (0.03, 1.3, 'ε', COLORS['greedy']),
    (0.08, 1.35, 'α', COLORS['optimistic']),
    (0.5, 1.45, 'c', COLORS['ucb']),
    (1.5, 1.42, 'Q₀', 'black'),
]
for text_x, text_y, symbol, symbol_color in symbol_specs:
    plt.text(text_x, text_y, symbol, fontsize=14, color=symbol_color, weight='bold')

plt.legend(loc='lower left', fontsize=10)
plt.grid(True, alpha=0.3)

# Caption-style title matching Figure 2.6
plt.title(
    'Figure 2.6: A parameter study of the various bandit algorithms presented in this chapter.\n'
    'Each point is the average reward obtained over 1000 steps with a particular algorithm\n'
    'at a particular setting of its parameter.',
    fontsize=11, pad=20,
)

plt.tight_layout()
plt.show()

# Closing summary shown beneath the figure (fixed text, not derived from the results)
final_analysis = (
    "<strong>Parameter Study Insights:</strong><br><br>"
    "• <strong>UCB</strong>: Most robust, good performance across wide c range<br>"
    "• <strong>Gradient Bandit</strong>: High peak performance but sensitive to α<br>"
    "• <strong>Optimistic Initialization</strong>: Good performance, needs proper Q₀<br>"
    "• <strong>ε-greedy</strong>: Simple but requires careful ε tuning<br><br>"
    "<strong>Practical Recommendations:</strong><br>"
    "• Use UCB when you want robust performance without tuning<br>"
    "• Use gradient methods when you can tune parameters carefully<br>"
    "• ε-greedy remains a simple, interpretable baseline"
)

pretty_print("Comprehensive Analysis", final_analysis, style='result')

<div style="background: #f8f9fa; padding: 15px 20px; margin-top: 30px; border-left: 3px solid #17a2b8;">
    <h3 style="color: #17a2b8; font-size: 14px; margin: 0 0 8px 0; text-transform: uppercase; letter-spacing: 0.5px;">Lab Summary</h3>
    <div style="color: #555; line-height: 1.6; font-size: 13px;">
        <p><strong>Advanced Methods Mastered:</strong></p>
        <ul style="margin: 10px 0; padding-left: 20px;">
            <li><strong>Upper Confidence Bound (UCB):</strong> Principled uncertainty-based exploration</li>
            <li><strong>Gradient Bandits:</strong> Policy gradient approach with preference learning</li>
            <li><strong>Baseline Importance:</strong> Critical for variance reduction in gradient methods</li>
            <li><strong>Parameter Sensitivity:</strong> Understanding optimal settings for each method</li>
        </ul>
        
        