# 📊 KL Divergence: Theory and Practice

This notebook explores:
1. KL divergence for discrete distributions
2. KL divergence for continuous distributions
3. The critical asymmetry (forward vs reverse KL)
4. Closed-form KL for Gaussians
5. Why KL matters for machine learning

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from scipy.special import kl_div as scipy_kl
import warnings
# Silence every warning so cell output stays tidy.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# NumPy/SciPy/matplotlib — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Default plot styling; individual figures may still pass their own figsize
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 12

# Set random seed for reproducibility
np.random.seed(42)

## 1. KL Divergence: Discrete Distributions

$$D_{KL}(P \| Q) = \sum_x P(x) \log \frac{P(x)}{Q(x)}$$

In [None]:
def kl_divergence_discrete(p: np.ndarray, q: np.ndarray, eps: float = 1e-10) -> float:
    """
    Compute D_KL(P || Q) = sum_x P(x) * log(P(x) / Q(x)) for discrete distributions.

    Both inputs are converted to float64 and rescaled to sum to one, so
    unnormalized weights are accepted.

    Args:
        p: Reference ("true") distribution P.
        q: Approximating distribution Q, same length as ``p``.
        eps: Floor applied to Q (after normalization) so the logarithm stays
            finite; entries of P at or below ``eps`` are dropped from the sum.

    Returns:
        The divergence in nats (natural logarithm).
    """
    p_norm = np.array(p, dtype=np.float64)
    q_norm = np.array(q, dtype=np.float64)
    p_norm = p_norm / p_norm.sum()
    q_norm = q_norm / q_norm.sum()

    # Floor Q so that zero entries cannot cause a division by zero / log(0).
    q_floored = np.clip(q_norm, eps, 1)

    # Outcomes with (numerically) zero P contribute nothing to the sum.
    support = p_norm > eps
    p_sup = p_norm[support]
    q_sup = q_floored[support]
    return np.sum(p_sup * np.log(p_sup / q_sup))


def kl_divergence_discrete_bits(p: np.ndarray, q: np.ndarray, eps: float = 1e-10) -> float:
    """
    KL divergence D_KL(P || Q) in bits (log base 2).

    Args:
        p: True distribution.
        q: Approximate distribution.
        eps: Forwarded unchanged to ``kl_divergence_discrete``; the default
            matches that function's own default, so existing two-argument
            calls behave exactly as before.

    Returns:
        The divergence in nats divided by ln(2), i.e. expressed in bits.
    """
    return kl_divergence_discrete(p, q, eps) / np.log(2)

In [None]:
# Example: Compare different distributions
distributions = {
    'P (true)': np.array([0.5, 0.3, 0.15, 0.05]),
    'Q1 (uniform)': np.array([0.25, 0.25, 0.25, 0.25]),
    'Q2 (close)': np.array([0.45, 0.32, 0.15, 0.08]),
    'Q3 (wrong mode)': np.array([0.05, 0.1, 0.35, 0.5])
}

P = distributions['P (true)']

# Derive the number of outcomes from the data so the bar positions always
# match the arrays, even if the distributions above are edited.
n_outcomes = len(P)
outcomes = range(n_outcomes)

fig, axes = plt.subplots(2, 2, figsize=(12, 10))

for ax, (name, dist) in zip(axes.flatten(), distributions.items()):
    is_reference = name == 'P (true)'
    ax.bar(outcomes, dist, color='steelblue' if is_reference else 'coral',
           edgecolor='black', alpha=0.7)

    if is_reference:
        ax.set_title(f'{name}\n(Reference distribution)', fontsize=12)
    else:
        # Overlay P as reference; the legend makes the overlay's label visible
        ax.bar(outcomes, P, color='steelblue', edgecolor='black', alpha=0.3, label='P (true)')
        ax.legend()
        kl = kl_divergence_discrete_bits(P, dist)
        ax.set_title(f'{name}\n$D_{{KL}}(P \\| Q)$ = {kl:.4f} bits', fontsize=12)

    ax.set_xlabel('Outcome')
    ax.set_ylabel('Probability')
    ax.set_ylim(0, 0.7)
    ax.set_xticks(outcomes)

    # Add probability labels just above each bar
    for i, prob in enumerate(dist):
        ax.text(i, prob + 0.02, f'{prob:.2f}', ha='center', fontsize=10)

fig.suptitle('KL Divergence: Extra Bits from Using Wrong Distribution', fontsize=14, y=1.02)
plt.tight_layout()
plt.show()

print("KL Divergence Analysis:")
print("=" * 50)
for name, dist in distributions.items():
    if name == 'P (true)':
        continue  # D_KL(P || P) is trivially zero; only report the approximations
    kl_nats = kl_divergence_discrete(P, dist)
    kl_bits = kl_divergence_discrete_bits(P, dist)
    print(f"{name}: {kl_nats:.4f} nats = {kl_bits:.4f} bits")

## 2. The Asymmetry: Forward vs Reverse KL

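This is **crucial** to understand: KL divergence is not symmetric, so the direction matters enormously. Forward KL, $D_{KL}(P \| Q)$, averages the log-ratio under the true distribution $P$, so $Q$ is penalized heavily wherever $P$ has mass but $Q$ does not. Reverse KL, $D_{KL}(Q \| P)$, averages under $Q$, so $Q$ is penalized only where it places mass that $P$ lacks.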

In [None]:
# Create a bimodal distribution
def bimodal_pdf(x, mu1=-2, mu2=2, sigma=0.8, weight=0.5):
    """
    Density of a two-component Gaussian mixture with a shared standard deviation.

    ``weight`` is the mixing proportion of the component centred at ``mu1``;
    the component centred at ``mu2`` receives ``1 - weight``.
    """
    first_mode = stats.norm.pdf(x, mu1, sigma)
    second_mode = stats.norm.pdf(x, mu2, sigma)
    return weight * first_mode + (1-weight) * second_mode

x = np.linspace(-6, 6, 1000)
p_bimodal = bimodal_pdf(x)

# np.trapz was deprecated in NumPy 2.0 in favour of np.trapezoid; use whichever
# this NumPy provides (np.trapz is only looked up when np.trapezoid is missing).
_trapezoid = getattr(np, 'trapezoid', None) or np.trapz
p_bimodal = p_bimodal / _trapezoid(p_bimodal, x)  # Normalize on the truncated grid

# Fit single Gaussians with different KL objectives
# Forward KL: mean-seeking (covers both modes)
# Reverse KL: mode-seeking (focuses on one mode)

# Forward KL solution (illustrative). The exact Gaussian minimizer matches the
# mixture's moments: mean 0 and sigma = sqrt(0.8**2 + 2**2) ≈ 2.15. The value
# 2.5 is a slightly wider, hand-picked stand-in.
mu_forward = 0  # Mean of the mixture
sigma_forward = 2.5  # Large variance to cover both modes

# Reverse KL solutions (mode-seeking): focuses on one mode
mu_reverse = -2  # Focuses on left mode
sigma_reverse = 0.8

q_forward = stats.norm.pdf(x, mu_forward, sigma_forward)
q_reverse_left = stats.norm.pdf(x, mu_reverse, sigma_reverse)
q_reverse_right = stats.norm.pdf(x, -mu_reverse, sigma_reverse)

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# One panel per candidate Q; every panel shows the same bimodal P for reference
panels = [
    (q_forward, 'Forward KL: $D_{KL}(P \\| Q)$\n"Mean-seeking" - covers both modes'),
    (q_reverse_left, 'Reverse KL: $D_{KL}(Q \\| P)$\n"Mode-seeking" - locks onto left mode'),
    (q_reverse_right, 'Reverse KL: $D_{KL}(Q \\| P)$\n"Mode-seeking" - locks onto right mode'),
]

for ax, (q_fit, panel_title) in zip(axes, panels):
    ax.fill_between(x, p_bimodal, alpha=0.3, color='blue', label='P (true bimodal)')
    ax.plot(x, p_bimodal, 'b-', linewidth=2)
    ax.plot(x, q_fit, 'r--', linewidth=2, label='Q (Gaussian fit)')
    ax.set_title(panel_title, fontsize=11)
    ax.set_xlabel('x')
    ax.set_ylabel('Density')
    ax.legend()
    ax.set_xlim(-6, 6)

plt.tight_layout()
plt.show()

print("\n🔑 Key Insight:")
print("Forward KL (used in ML): Q must cover all of P → overdispersed")
print("Reverse KL (used in VI): Q can ignore parts of P → underdispersed, mode-seeking")

In [None]:
# Numerical demonstration of asymmetry
def compute_kl_continuous(p_samples, q_samples, x_range, n_points=1000, eps=1e-10):
    """
    Approximate D_KL(P || Q) from samples using Gaussian kernel density estimates.

    Args:
        p_samples: Univariate samples drawn from P.
        q_samples: Univariate samples drawn from Q.
        x_range: ``(low, high)`` interval over which the integral is evaluated.
        n_points: Number of equally spaced grid points on ``x_range``
            (default 1000, the previously hard-coded value).
        eps: Floor applied to both estimated densities to avoid log(0)
            (default 1e-10, the previously hard-coded value).

    Returns:
        Riemann-sum estimate of the integral of p(x) * log(p(x) / q(x)) over
        ``x_range``, in nats.
    """
    p_kde = stats.gaussian_kde(p_samples)
    q_kde = stats.gaussian_kde(q_samples)

    x = np.linspace(x_range[0], x_range[1], n_points)

    # Floor both densities so neither the ratio nor the logarithm can blow up
    p_vals = np.clip(p_kde(x), eps, None)
    q_vals = np.clip(q_kde(x), eps, None)

    # Numerical integration on the uniform grid
    dx = x[1] - x[0]
    return np.sum(p_vals * np.log(p_vals / q_vals)) * dx

# Generate samples from bimodal P
n_samples = 10000
p_samples = np.concatenate([
    np.random.normal(-2, 0.8, n_samples//2),
    np.random.normal(2, 0.8, n_samples//2)
])

# Different Q distributions
q_wide = np.random.normal(0, 2.5, n_samples)  # Wide, covers both
q_left = np.random.normal(-2, 0.8, n_samples)  # Left mode
q_right = np.random.normal(2, 0.8, n_samples)  # Right mode

print("KL Divergence Asymmetry Demonstration")
print("=" * 50)
print("\nP = Bimodal (modes at -2 and +2)")

# Evaluate both KL directions for each candidate Q on the same integration range
kl_grid_range = (-8, 8)
comparisons = [
    ("1. Q = Wide Gaussian (μ=0, σ=2.5):", q_wide),
    ("2. Q = Left mode (μ=-2, σ=0.8):", q_left),
]
for heading, q_candidate in comparisons:
    forward_kl = compute_kl_continuous(p_samples, q_candidate, kl_grid_range)
    reverse_kl = compute_kl_continuous(q_candidate, p_samples, kl_grid_range)
    print(f"\n{heading}")
    print(f"   D_KL(P || Q) ≈ {forward_kl:.4f} nats")
    print(f"   D_KL(Q || P) ≈ {reverse_kl:.4f} nats")

print("\n🔑 Notice:")
print("- Forward KL penalizes missing modes heavily (Q_left has high D_KL(P||Q))")
print("- Reverse KL allows ignoring modes (Q_left has low D_KL(Q||P))")

## 3. KL Divergence for Gaussians: Closed Form

For two Gaussians $P = \mathcal{N}(\mu_1, \sigma_1^2)$ and $Q = \mathcal{N}(\mu_2, \sigma_2^2)$:

$$D_{KL}(P \| Q) = \log\frac{\sigma_2}{\sigma_1} + \frac{\sigma_1^2 + (\mu_1 - \mu_2)^2}{2\sigma_2^2} - \frac{1}{2}$$

In [None]:
def kl_gaussian(mu1: float, sigma1: float, mu2: float, sigma2: float) -> float:
    """
    Closed-form D_KL(N(mu1, sigma1^2) || N(mu2, sigma2^2)) for univariate Gaussians.

    The result is in nats. ``sigma1`` and ``sigma2`` are standard deviations,
    not variances.
    """
    log_scale_ratio = np.log(sigma2/sigma1)
    scaled_second_moment = (sigma1**2 + (mu1 - mu2)**2) / (2 * sigma2**2)
    return log_scale_ratio + scaled_second_moment - 0.5


def kl_to_standard_normal(mu: float, sigma: float) -> float:
    """
    D_KL(N(mu, sigma^2) || N(0, 1)) in nats.

    This is the per-dimension latent regularization term of the VAE objective.
    ``sigma`` is a standard deviation, not a variance.
    """
    variance = sigma**2
    return 0.5 * (mu**2 + variance - 1 - np.log(variance))


# Verify formula matches numerical integration
mu1, sigma1 = 1.0, 1.5
mu2, sigma2 = 0.0, 1.0

# Analytical
kl_analytical = kl_gaussian(mu1, sigma1, mu2, sigma2)

# Numerical: Riemann sum of p(x) * [log p(x) - log q(x)] on a uniform grid.
# Taking the difference of log-densities needs no epsilon inside the logarithm;
# an additive epsilon on q would bias the tails, where q(x) falls below it.
x = np.linspace(-10, 10, 10000)
p = stats.norm.pdf(x, mu1, sigma1)
q = stats.norm.pdf(x, mu2, sigma2)
dx = x[1] - x[0]
log_ratio = stats.norm.logpdf(x, mu1, sigma1) - stats.norm.logpdf(x, mu2, sigma2)
kl_numerical = np.sum(p * log_ratio) * dx

# Inline sanity check: the closed form and the quadrature must agree closely
assert abs(kl_analytical - kl_numerical) < 1e-6, "closed-form KL disagrees with numerical integration"

print("Verifying Gaussian KL Formula")
print("=" * 40)
print(f"P = N({mu1}, {sigma1}²), Q = N({mu2}, {sigma2}²)")
print(f"Analytical KL: {kl_analytical:.6f} nats")
print(f"Numerical KL:  {kl_numerical:.6f} nats")
print(f"Difference:    {abs(kl_analytical - kl_numerical):.2e}")

In [None]:
# Visualize how KL changes with parameters
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# Reference: Q = N(0, 1)
mu_q, sigma_q = 0, 1

# Plot 1: KL vs mean (fixed variance)
ax = axes[0]
mus = np.linspace(-4, 4, 100)
sigma_p = 1.0
# kl_gaussian is built from NumPy ufuncs, so it evaluates the whole grid at once
kls = kl_gaussian(mus, sigma_p, mu_q, sigma_q)

ax.plot(mus, kls, 'b-', linewidth=2)
ax.set_xlabel('μ of P')
ax.set_ylabel('$D_{KL}(P \\| Q)$ [nats]')
ax.set_title('KL vs Mean\n(P = N(μ, 1), Q = N(0, 1))')
ax.axvline(x=0, color='r', linestyle='--', alpha=0.5, label='Minimum at μ=0')
ax.legend()
ax.grid(True, alpha=0.3)

# Plot 2: KL vs variance (fixed mean)
ax = axes[1]
sigmas = np.linspace(0.1, 3, 100)
mu_p = 0
kls = kl_gaussian(mu_p, sigmas, mu_q, sigma_q)

ax.plot(sigmas, kls, 'b-', linewidth=2)
ax.set_xlabel('σ of P')
ax.set_ylabel('$D_{KL}(P \\| Q)$ [nats]')
ax.set_title('KL vs Std Dev\n(P = N(0, σ²), Q = N(0, 1))')
ax.axvline(x=1, color='r', linestyle='--', alpha=0.5, label='Minimum at σ=1')
ax.legend()
ax.grid(True, alpha=0.3)

# Plot 3: 2D heatmap
ax = axes[2]
mu_range = np.linspace(-3, 3, 50)
sigma_range = np.linspace(0.2, 2.5, 50)
MU, SIGMA = np.meshgrid(mu_range, sigma_range)
# Rows follow sigma_range and columns follow mu_range, matching the meshgrid layout
KL = kl_gaussian(MU, SIGMA, mu_q, sigma_q)

contour = ax.contourf(MU, SIGMA, KL, levels=20, cmap='viridis')
plt.colorbar(contour, ax=ax, label='$D_{KL}$ [nats]')
ax.plot(0, 1, 'r*', markersize=15, label='Minimum (μ=0, σ=1)')
ax.set_xlabel('μ of P')
ax.set_ylabel('σ of P')
# Raw string: \m and \s are not valid Python escape sequences
ax.set_title(r'$D_{KL}(\mathcal{N}(\mu, \sigma^2) \| \mathcal{N}(0, 1))$')
ax.legend()

plt.tight_layout()
plt.show()

## 4. VAE Latent Space KL: The Regularization Term

In VAEs, we compute:
$$D_{KL}(q(z|x) \| p(z)) = D_{KL}\left(\mathcal{N}(\mu, \operatorname{diag}(\sigma^2)) \,\|\, \mathcal{N}(0, I)\right)$$

$$= \frac{1}{2}\sum_j \left[\mu_j^2 + \sigma_j^2 - 1 - \log(\sigma_j^2)\right]$$

In [None]:
def vae_kl_loss(mu: np.ndarray, log_var: np.ndarray) -> float:
    """
    Closed-form VAE KL term: D_KL(N(mu, exp(log_var)) || N(0, I)).

    Uses the identity
        KL = 0.5 * sum(mu^2 + sigma^2 - 1 - log(sigma^2))
           = 0.5 * sum(mu^2 + exp(log_var) - 1 - log_var),
    summing over the last axis (the latent dimensions).

    Args:
        mu: Mean vector of shape (batch, latent_dim) or (latent_dim,)
        log_var: Log variance vector of same shape

    Returns:
        KL divergence (scalar or per-sample)
    """
    variance = np.exp(log_var)
    per_dimension = mu**2 + variance - 1 - log_var
    return 0.5 * np.sum(per_dimension, axis=-1)


# Visualize the components of VAE KL loss
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# Component 1: mu^2 (penalizes moving mean away from 0)
ax = axes[0]
mu_range = np.linspace(-3, 3, 100)
ax.plot(mu_range, 0.5 * mu_range**2, 'b-', linewidth=2)
ax.set_xlabel('μ')
# Raw strings below: \c, \m, \s and \l are not valid Python escape sequences
ax.set_ylabel(r'$0.5 \cdot \mu^2$')
ax.set_title('KL Component 1: Mean Penalty\n"Stay near origin"')
ax.axhline(y=0, color='r', linestyle='--', alpha=0.5)
ax.grid(True, alpha=0.3)

# Component 2: sigma^2 - 1 - log(sigma^2) (penalizes variance != 1)
ax = axes[1]
log_var_range = np.linspace(-4, 2, 100)
sigma_sq = np.exp(log_var_range)
penalty = 0.5 * (sigma_sq - 1 - log_var_range)
ax.plot(sigma_sq, penalty, 'b-', linewidth=2)
ax.set_xlabel('σ²')
ax.set_ylabel(r'$0.5 \cdot (\sigma^2 - 1 - \log \sigma^2)$')
ax.set_title('KL Component 2: Variance Penalty\n"Match unit variance"')
ax.axvline(x=1, color='r', linestyle='--', alpha=0.5, label='Minimum at σ²=1')
ax.axhline(y=0, color='gray', linestyle='-', alpha=0.3)
ax.set_xlim(0, 5)
ax.legend()
ax.grid(True, alpha=0.3)

# Combined effect in 2D
ax = axes[2]
mu_grid = np.linspace(-3, 3, 50)
logvar_grid = np.linspace(-2, 2, 50)
MU, LOGVAR = np.meshgrid(mu_grid, logvar_grid)
KL = 0.5 * (MU**2 + np.exp(LOGVAR) - 1 - LOGVAR)

# The y-axis shows sigma = exp(log_var / 2) rather than log_var itself
contour = ax.contourf(MU, np.exp(LOGVAR/2), KL, levels=20, cmap='viridis')
plt.colorbar(contour, ax=ax, label='KL [nats]')
ax.plot(0, 1, 'r*', markersize=15, label='Minimum (μ=0, σ=1)')
ax.set_xlabel('μ')
ax.set_ylabel('σ')
ax.set_title('Total VAE KL Loss\n$D_{KL}(q(z|x) \\| p(z))$')
ax.legend()

plt.tight_layout()
plt.show()

print("\n🔑 VAE KL Intuition:")
print("- μ² term: Pulls latent means toward 0 (centers the distribution)")
print("- σ² - 1 - log(σ²) term: Pulls latent variance toward 1")
print("- Together: Forces q(z|x) to stay close to standard normal p(z)")

In [None]:
# Simulate VAE latent distributions
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

# Different encoder outputs as (mu, log_var, panel title)
encoder_outputs = [
    (0, 0, 'Perfect match'),           # mu=0, sigma=1 → KL=0
    (2, 0, 'Shifted mean'),            # mu=2, sigma=1
    (0, 1, 'Large variance'),          # mu=0, sigma=e^0.5≈1.65
    (0, -1, 'Small variance'),         # mu=0, sigma=e^-0.5≈0.61
    (1, 0.5, 'Both shifted'),          # mu=1, sigma≈1.28
    (-1, -0.5, 'Both shifted (neg)')   # mu=-1, sigma≈0.78
]

x = np.linspace(-5, 5, 1000)
prior = stats.norm.pdf(x, 0, 1)

for ax, (mu, log_var, title) in zip(axes.flatten(), encoder_outputs):
    # The encoder emits log-variance; convert to a standard deviation for plotting
    sigma = np.exp(0.5 * log_var)
    posterior = stats.norm.pdf(x, mu, sigma)
    kl = vae_kl_loss(np.array([mu]), np.array([log_var]))

    ax.fill_between(x, prior, alpha=0.3, color='blue', label='p(z) = N(0,1)')
    ax.plot(x, prior, 'b-', linewidth=2)
    ax.fill_between(x, posterior, alpha=0.3, color='red', label=f'q(z|x) = N({mu},{sigma:.2f}²)')
    ax.plot(x, posterior, 'r--', linewidth=2)

    ax.set_title(f'{title}\nμ={mu}, σ={sigma:.2f}, KL={kl:.3f} nats')
    ax.set_xlabel('z')
    ax.set_ylabel('Density')
    ax.legend(loc='upper right', fontsize=9)
    ax.set_xlim(-5, 5)
    ax.grid(True, alpha=0.3)

fig.suptitle('VAE: Posterior q(z|x) vs Prior p(z)', fontsize=14, y=1.02)
plt.tight_layout()
plt.show()

## 5. Information-Theoretic View: KL as Information Gain

In [None]:
# Bayesian update example: KL measures information gained from data

# Prior: broad uncertainty
mu_prior, sigma_prior = 0, 3

# Illustrative posteriors as (mean, std, label): they tighten as more data is seen
posteriors = [
    (0.5, 2.5, '1 data point'),
    (0.8, 1.5, '10 data points'),
    (1.0, 0.8, '100 data points'),
    (1.0, 0.3, '1000 data points')
]

fig, ax = plt.subplots(figsize=(12, 6))

x = np.linspace(-8, 8, 1000)
prior_pdf = stats.norm.pdf(x, mu_prior, sigma_prior)
# Build the label from the prior's parameters so it cannot drift out of sync with them
ax.fill_between(x, prior_pdf, alpha=0.3, color='gray',
                label=f'Prior N({mu_prior}, {sigma_prior**2})')
ax.plot(x, prior_pdf, 'k--', linewidth=2)

colors = plt.cm.Reds(np.linspace(0.3, 0.9, len(posteriors)))

print("Information Gained from Data (KL from Prior to Posterior)")
print("=" * 60)

for (mu, sigma, label), color in zip(posteriors, colors):
    posterior_pdf = stats.norm.pdf(x, mu, sigma)
    # Information gain = D_KL(posterior || prior)
    kl = kl_gaussian(mu, sigma, mu_prior, sigma_prior)

    ax.plot(x, posterior_pdf, linewidth=2, color=color,
            label=f'{label}: KL = {kl:.2f} nats')

    print(f"{label}: Posterior N({mu}, {sigma}²) → KL = {kl:.3f} nats = {kl/np.log(2):.3f} bits")

ax.axvline(x=1, color='green', linestyle=':', alpha=0.7, label='True value = 1')
ax.set_xlabel('Parameter value')
ax.set_ylabel('Density')
ax.set_title('Bayesian Learning: KL Divergence = Information Gained\n(More data → more information → higher KL from prior)')
ax.legend(loc='upper right')
ax.set_xlim(-8, 8)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\n🔑 Key Insight:")
print("KL divergence from prior to posterior = information gained from data")
print("More data → narrower posterior → larger KL")

## 6. Maximum Likelihood = Minimizing Forward KL

In [None]:
# Demonstrate that ML = minimizing D_KL(P_data || Q_model)

# Ground-truth generating distribution (the model never sees these values)
true_mu, true_sigma = 2.0, 1.5

# Draw a reproducible sample from the true distribution
np.random.seed(42)
data = np.random.normal(loc=true_mu, scale=true_sigma, size=1000)

# Closed-form Gaussian MLE: sample mean and the ddof=0 (biased) standard deviation
ml_mu = data.mean()
ml_sigma = data.std(ddof=0)

# Parameter grid on which the likelihood surface is evaluated
mu_range = np.linspace(0, 4, num=50)
sigma_range = np.linspace(0.5, 3, num=50)

def neg_log_likelihood(mu, sigma, data):
    """
    Average negative log-likelihood of ``data`` under N(mu, sigma^2).

    Averaging over the sample makes this the empirical cross-entropy between
    the data and the Gaussian model.
    """
    log_densities = stats.norm.logpdf(data, mu, sigma)
    return -np.mean(log_densities)

# Compute NLL surface: rows follow sigma_range, columns follow mu_range,
# which is the layout np.meshgrid(mu_range, sigma_range) produces below.
NLL = np.array([
    [neg_log_likelihood(mu_candidate, sigma_candidate, data) for mu_candidate in mu_range]
    for sigma_candidate in sigma_range
])

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# NLL surface
ax = axes[0]
MU, SIGMA = np.meshgrid(mu_range, sigma_range)
contour = ax.contourf(MU, SIGMA, NLL, levels=30, cmap='viridis')
plt.colorbar(contour, ax=ax, label='Negative Log-Likelihood')
ax.plot(ml_mu, ml_sigma, 'r*', markersize=15, label=f'ML estimate ({ml_mu:.2f}, {ml_sigma:.2f})')
ax.plot(true_mu, true_sigma, 'g^', markersize=12, label=f'True params ({true_mu}, {true_sigma})')
ax.set_xlabel('μ')
ax.set_ylabel('σ')
ax.set_title('Negative Log-Likelihood Surface\n(Minimizing NLL = Minimizing KL)')
ax.legend()

# Show distributions
ax = axes[1]
x = np.linspace(-3, 7, 200)

ax.hist(data, bins=50, density=True, alpha=0.5, color='gray', label='Data')
ax.plot(x, stats.norm.pdf(x, true_mu, true_sigma), 'g-', linewidth=2, label=f'True: N({true_mu}, {true_sigma}²)')
ax.plot(x, stats.norm.pdf(x, ml_mu, ml_sigma), 'r--', linewidth=2, label=f'ML fit: N({ml_mu:.2f}, {ml_sigma:.2f}²)')

ax.set_xlabel('x')
ax.set_ylabel('Density')
ax.set_title('Maximum Likelihood Fit')
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"\nTrue parameters: μ = {true_mu}, σ = {true_sigma}")
print(f"ML estimates:    μ = {ml_mu:.4f}, σ = {ml_sigma:.4f}")
print("\nMaximum Likelihood finds parameters that minimize KL(P_data || Q_model)")

## Summary

### Key Formulas

| Distribution | KL Divergence Formula |
|-------------|----------------------|
| Discrete | $D_{KL}(P \| Q) = \sum_x P(x) \log \frac{P(x)}{Q(x)}$ |
| Continuous | $D_{KL}(P \| Q) = \int p(x) \log \frac{p(x)}{q(x)} dx$ |
| Gaussians | $\log\frac{\sigma_2}{\sigma_1} + \frac{\sigma_1^2 + (\mu_1 - \mu_2)^2}{2\sigma_2^2} - \frac{1}{2}$ |
| VAE (to N(0,1)) | $\frac{1}{2}\sum_j (\mu_j^2 + \sigma_j^2 - 1 - \log\sigma_j^2)$ |

### Key Insights

1. **KL is asymmetric**: Forward vs reverse KL give very different results
2. **Forward KL** (ML): Mean-seeking, covers all modes
3. **Reverse KL** (VI): Mode-seeking, can ignore modes
4. **VAE KL term**: Regularizes latent space toward standard normal
5. **Information view**: KL = information gained when updating beliefs