# 📈 Normal & Multivariate Normal Distributions

This notebook explores:
1. Why Gaussians maximize entropy
2. Univariate normal distribution
3. Multivariate normal distribution (covariance matrices and their geometry)
4. Mahalanobis distance
5. The reparameterization trick
6. KL divergence between Gaussians

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from scipy.linalg import cholesky
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.patches import Ellipse
import matplotlib.transforms as transforms

# Global plot styling applied to every figure created after this cell.
# NOTE(review): the 'seaborn-v0_8-*' style names presumably require a fairly
# recent matplotlib — confirm against the environment this notebook targets.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 12

# Seed NumPy's global RNG so the randomly sampled data below is repeatable.
np.random.seed(42)

## 1. Why Gaussians? Maximum Entropy!

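Among all continuous distributions with a fixed mean and variance, the Gaussian has the **maximum (differential) entropy**: it makes the fewest additional assumptions beyond those two moments.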

In [None]:
def differential_entropy_gaussian(sigma):
    """Return the differential entropy, in nats, of a Gaussian N(0, sigma^2).

    Evaluates the closed form 0.5 * log(2 * pi * e * sigma^2), where
    ``sigma`` is the standard deviation.
    """
    variance = sigma**2
    return 0.5 * np.log(2 * np.pi * np.e * variance)

def differential_entropy_uniform(a, b):
    """Return the differential entropy, in nats, of Uniform(a, b).

    The entropy is the log of the length of the support, log(b - a).
    """
    support_length = b - a
    return np.log(support_length)

def differential_entropy_laplace(b):
    """Return the differential entropy, in nats, of Laplace(0, b).

    ``b`` is the scale parameter; the closed form is 1 + log(2 * b).
    """
    log_term = np.log(2 * b)
    return 1 + log_term

# Compare three distributions that share the same variance.
# Change `target_variance` to rerun the comparison; every printed label below
# is derived from it, so the output cannot drift out of sync with the numbers.
target_variance = 1.0

# Gaussian: variance = sigma^2
sigma_gaussian = np.sqrt(target_variance)
h_gaussian = differential_entropy_gaussian(sigma_gaussian)

# Uniform: variance = (b-a)^2 / 12, so b-a = sqrt(12 * var)
width = np.sqrt(12 * target_variance)
h_uniform = differential_entropy_uniform(-width/2, width/2)

# Laplace: variance = 2*b^2, so b = sqrt(var/2)
b_laplace = np.sqrt(target_variance / 2)
h_laplace = differential_entropy_laplace(b_laplace)

# Inline sanity check of the claim printed below: at equal variance the
# Gaussian must have the largest differential entropy.
assert h_gaussian >= max(h_laplace, h_uniform), "Gaussian should maximize entropy"

# ':g' prints 1.0 as '1', so the default output is unchanged.
print(f"Comparing Distributions with Variance = {target_variance:g}")
print("=" * 50)
print(f"Gaussian N(0, {target_variance:g}):    H = {h_gaussian:.4f} nats")
print(f"Laplace(0, {b_laplace:.3f}):  H = {h_laplace:.4f} nats")
print(f"Uniform(-{width/2:.2f}, {width/2:.2f}): H = {h_uniform:.4f} nats")
print(f"\nüîë Gaussian has maximum entropy!")

In [None]:
# Two-panel figure: the three equal-variance densities (left) and the
# Gaussian entropy as a function of variance (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

x = np.linspace(-4, 4, 1000)

# Left panel: each entry is (density values, line format, legend text).
ax = axes[0]
density_curves = [
    (stats.norm.pdf(x, 0, sigma_gaussian), 'b-', f'Gaussian (H={h_gaussian:.3f})'),
    (stats.laplace.pdf(x, 0, b_laplace), 'r--', f'Laplace (H={h_laplace:.3f})'),
    (stats.uniform.pdf(x, -width/2, width), 'g:', f'Uniform (H={h_uniform:.3f})'),
]
for density, line_format, legend_text in density_curves:
    ax.plot(x, density, line_format, linewidth=2, label=legend_text)

ax.set(
    xlabel='x',
    ylabel='Density',
    title='Distributions with Same Variance\n(Gaussian has Maximum Entropy)',
)
ax.legend()
ax.set_xlim(-4, 4)
ax.grid(True, alpha=0.3)

# Right panel: Gaussian entropy as the variance grows.
ax = axes[1]
variances = np.linspace(0.1, 5, 100)
h_gauss = [differential_entropy_gaussian(np.sqrt(variance)) for variance in variances]

ax.plot(variances, h_gauss, 'b-', linewidth=2, label='Gaussian entropy')
# Shade everything below the curve (down to -2).
ax.fill_between(
    variances, h_gauss, -2,
    alpha=0.2, color='blue',
    label='Region of lower entropy\n(more assumptions)',
)
ax.set(
    xlabel='Variance œÉ¬≤',
    ylabel='Differential Entropy H [nats]',
    title='Gaussian Entropy: $H = \\frac{1}{2}\\log(2\\pi e \\sigma^2)$',
)
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 2. Univariate Normal Distribution

$$\mathcal{N}(x | \mu, \sigma^2) = \frac{1}{\sqrt{2\pi\sigma^2}} \exp\left(-\frac{(x-\mu)^2}{2\sigma^2}\right)$$

In [None]:
def gaussian_pdf(x, mu, sigma):
    """Evaluate the univariate Gaussian density N(x | mu, sigma^2).

    ``sigma`` is the standard deviation. ``x`` may be a scalar or a NumPy
    array; the result has the same shape.
    """
    z = (x - mu) / sigma
    normalizer = 1 / (np.sqrt(2 * np.pi) * sigma)
    return normalizer * np.exp(-0.5 * z**2)

def gaussian_log_pdf(x, mu, sigma):
    """Evaluate log N(x | mu, sigma^2) directly in log space.

    Working with the log density avoids the underflow that exponentiating
    and then taking the log would cause far out in the tails.
    """
    log_normalizer = -0.5 * np.log(2 * np.pi * sigma**2)
    z = (x - mu) / sigma
    return log_normalizer - 0.5 * z**2

# Three panels: effect of the mean, effect of the standard deviation, and
# the 68-95-99.7 rule for a standard normal.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
x = np.linspace(-8, 8, 1000)

# Panel 1: shifting the mean moves the curve without changing its shape.
ax = axes[0]
for mu in [-2, 0, 2]:
    ax.plot(x, gaussian_pdf(x, mu, 1), linewidth=2, label=f'Œº={mu}')
ax.set(xlabel='x', ylabel='p(x)', title='Varying Mean (œÉ=1)\n"Location parameter"')
ax.legend()
ax.grid(True, alpha=0.3)

# Panel 2: changing the standard deviation stretches or squeezes the curve.
ax = axes[1]
for sigma in [0.5, 1, 2]:
    ax.plot(x, gaussian_pdf(x, 0, sigma), linewidth=2, label=f'œÉ={sigma}')
ax.set(xlabel='x', ylabel='p(x)', title='Varying Std Dev (Œº=0)\n"Scale parameter"')
ax.legend()
ax.grid(True, alpha=0.3)

# Panel 3: shade the +/-3, +/-2 and +/-1 sigma bands under N(0, 1).
ax = axes[2]
mu, sigma = 0, 1
ax.plot(x, gaussian_pdf(x, mu, sigma), 'b-', linewidth=2)

# Widest band first so the narrower, more opaque bands are drawn on top.
shaded_bands = [
    (3, 0.2, 'green', '99.7% (¬±3œÉ)'),
    (2, 0.3, 'yellow', '95% (¬±2œÉ)'),
    (1, 0.4, 'red', '68% (¬±1œÉ)'),
]
for half_width, opacity, shade, legend_text in shaded_bands:
    x_band = np.linspace(-half_width, half_width, 100)
    ax.fill_between(x_band, gaussian_pdf(x_band, mu, sigma),
                    alpha=opacity, color=shade, label=legend_text)

ax.set(xlabel='x', ylabel='p(x)', title='The 68-95-99.7 Rule')
ax.legend()
ax.set_xlim(-4, 4)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# MSE Loss = Gaussian Likelihood
print("üîë Key Insight: MSE Loss assumes Gaussian noise!")
print("")
print("If y = f(x) + Œµ, where Œµ ~ N(0, œÉ¬≤)")
print("Then: p(y|x) = N(y | f(x), œÉ¬≤)")
print("")
print("Negative log-likelihood:")
print("-log p(y|x) = (1/2)log(2œÄœÉ¬≤) + (y - f(x))¬≤ / (2œÉ¬≤)")
print("            ‚àù (y - f(x))¬≤  [MSE!]")

# Demonstrate
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Standard deviation of the additive noise. It is used for sampling, for the
# reference curve and for its legend label, so it is defined exactly once.
noise_std = 2

# Generate noisy data around a known straight line.
np.random.seed(42)
x_data = np.linspace(0, 10, 50)
y_true = 2 * x_data + 1  # True line
y_noisy = y_true + np.random.normal(0, noise_std, len(x_data))  # Add Gaussian noise

# Left panel: observations scattered around the true function.
ax = axes[0]
ax.scatter(x_data, y_noisy, alpha=0.6, label='Noisy data')
ax.plot(x_data, y_true, 'r-', linewidth=2, label='True function')
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_title('Data with Gaussian Noise')
ax.legend()
ax.grid(True, alpha=0.3)

# Right panel: histogram of the residuals (measured against the true line,
# not a fitted one) with the generating noise density overlaid.
ax = axes[1]
residuals = y_noisy - y_true
ax.hist(residuals, bins=15, density=True, alpha=0.7, color='steelblue', edgecolor='black')

# Reference density drawn over +/- 3 noise standard deviations.
x_fit = np.linspace(-3 * noise_std, 3 * noise_std, 100)
ax.plot(x_fit, gaussian_pdf(x_fit, 0, noise_std), 'r-', linewidth=2,
        label=f'N(0, {noise_std:g}¬≤)')

ax.set_xlabel('Residual (y - f(x))')
ax.set_ylabel('Density')
ax.set_title('Residuals ‚âà Gaussian\n‚Üí MSE is appropriate loss')
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 3. Multivariate Normal Distribution

$$\mathcal{N}(\mathbf{x} | \boldsymbol{\mu}, \boldsymbol{\Sigma}) = \frac{1}{(2\pi)^{d/2}|\boldsymbol{\Sigma}|^{1/2}} \exp\left(-\frac{1}{2}(\mathbf{x}-\boldsymbol{\mu})^T \boldsymbol{\Sigma}^{-1}(\mathbf{x}-\boldsymbol{\mu})\right)$$

In [None]:
def multivariate_gaussian_pdf(x, mu, sigma):
    """Evaluate the multivariate Gaussian density N(x | mu, sigma).

    Parameters
    ----------
    x : array_like, shape (d,) or (..., d)
        A single point or a batch of points; the last axis is the feature axis.
    mu : array_like, shape (d,)
        Mean vector.
    sigma : array_like, shape (d, d)
        Covariance matrix (symmetric positive definite).

    Returns
    -------
    float or ndarray
        The density at ``x``: a scalar for a single point, otherwise an array
        of shape ``x.shape[:-1]``.

    Raises
    ------
    ValueError
        If ``sigma`` does not have a positive determinant.
    """
    x = np.asarray(x, dtype=float)
    mu = np.asarray(mu, dtype=float)
    sigma = np.asarray(sigma, dtype=float)
    d = mu.shape[-1]
    diff = x - mu

    # log|sigma| instead of |sigma|: the raw determinant overflows or
    # underflows quickly as the dimension grows.
    sign, log_det = np.linalg.slogdet(sigma)
    if sign <= 0:
        raise ValueError("sigma must be positive definite (determinant is not positive)")

    # Solve sigma @ y = diff rather than forming the explicit inverse. The
    # trailing axis makes each point a column vector, so batches broadcast.
    solved = np.linalg.solve(sigma, diff[..., np.newaxis])[..., 0]
    mahalanobis_sq = np.sum(diff * solved, axis=-1)

    # Combine in log space and exponentiate once at the end.
    log_norm_const = -0.5 * (d * np.log(2 * np.pi) + log_det)
    return np.exp(log_norm_const - 0.5 * mahalanobis_sq)


def plot_2d_gaussian(ax, mu, sigma, color='blue', label='', n_std=3):
    """Draw density contours, the mean and 1- and 2-sigma ellipses of a 2D Gaussian.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes to draw on.
    mu : array_like, shape (2,)
        Mean of the Gaussian.
    sigma : array_like, shape (2, 2)
        Covariance matrix.
    color : str
        Colour used for the contours, mean marker and ellipses.
    label : str
        Legend label attached to the mean marker. An empty string (the
        default) adds no legend entry.
    n_std : float
        Half-width of the evaluation grid along each axis, measured in
        marginal standard deviations.
    """
    mu = np.asarray(mu, dtype=float)
    sigma = np.asarray(sigma, dtype=float)

    # Size the grid from the marginal standard deviations so that `n_std`
    # really means "number of standard deviations", whatever the scale of sigma.
    half_width = n_std * np.sqrt(np.diag(sigma))
    x = np.linspace(mu[0] - half_width[0], mu[0] + half_width[0], 100)
    y = np.linspace(mu[1] - half_width[1], mu[1] + half_width[1], 100)
    X, Y = np.meshgrid(x, y)

    # Evaluate the density at every grid point.
    pos = np.dstack((X, Y))
    rv = stats.multivariate_normal(mu, sigma)
    Z = rv.pdf(pos)

    # Density contours and the mean marker.
    ax.contour(X, Y, Z, levels=5, colors=color, alpha=0.7)
    # `label or None` keeps matplotlib's default (no legend entry) when the
    # caller did not supply a label.
    ax.plot(*mu, 'o', color=color, markersize=10, label=label or None)

    # Overlay the 1-sigma and 2-sigma ellipses.
    for n in [1, 2]:
        add_confidence_ellipse(ax, mu, sigma, n, color=color)


def add_confidence_ellipse(ax, mu, sigma, n_std, color='blue'):
    """Add the ``n_std``-sigma ellipse of a 2D Gaussian to ``ax``.

    The ellipse is the set of points at Mahalanobis distance ``n_std`` from
    ``mu``: its axes point along the eigenvectors of ``sigma`` and its
    semi-axes are ``n_std * sqrt(eigenvalue)``. In two dimensions this region
    holds a fraction ``1 - exp(-n_std**2 / 2)`` of the probability mass
    (about 39% for 1 sigma, 86% for 2 sigma), not the 68% / 95% of the 1D rule.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes to draw on.
    mu : array_like, shape (2,)
        Centre of the ellipse.
    sigma : array_like, shape (2, 2)
        Symmetric covariance matrix.
    n_std : float
        Radius of the ellipse in standard deviations.
    color : str
        Edge colour of the ellipse.

    Returns
    -------
    matplotlib.patches.Ellipse
        The patch that was added to ``ax``.
    """
    sigma = np.asarray(sigma, dtype=float)

    # eigh is the symmetric solver: it guarantees real eigenvalues (in
    # ascending order) and orthonormal eigenvectors. The general `eig` can
    # return complex output under round-off.
    eigenvalues, eigenvectors = np.linalg.eigh(sigma)

    # Put the largest eigenvalue (the major axis) first.
    eigenvalues = eigenvalues[::-1]
    eigenvectors = eigenvectors[:, ::-1]

    # Rotation of the major axis relative to the x-axis.
    angle = np.degrees(np.arctan2(eigenvectors[1, 0], eigenvectors[0, 0]))

    # Full width and height are 2 * n_std * sqrt(eigenvalue). The clip guards
    # against tiny negative eigenvalues caused by round-off.
    width, height = 2 * n_std * np.sqrt(np.clip(eigenvalues, 0.0, None))

    ellipse = Ellipse(mu, width, height, angle=angle,
                      fill=False, color=color, linestyle='--', linewidth=1.5)
    ax.add_patch(ellipse)
    return ellipse

In [None]:
# Compare three covariance structures side by side: spherical, diagonal, full.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

mu = np.array([0, 0])

sigma_spherical = np.array([[1, 0], [0, 1]])
sigma_diagonal = np.array([[2, 0], [0, 0.5]])
sigma_full = np.array([[2, 1.2], [1.2, 1]])

# One (covariance, title) pair per panel, left to right. Samples are drawn in
# this same order, so the random stream is consumed in a fixed sequence.
panels = [
    (sigma_spherical, 'Spherical: Œ£ = I\n(Uncorrelated, equal variance)'),
    (sigma_diagonal, 'Diagonal: Œ£ = diag(œÉ‚ÇÅ¬≤, œÉ‚ÇÇ¬≤)\n(Uncorrelated, different variance)'),
    (sigma_full, 'Full: Œ£ with off-diagonal terms\n(Correlated)'),
]

for ax, (covariance, panel_title) in zip(axes, panels):
    # Scatter 500 draws, then overlay the analytic contours and ellipses.
    samples = np.random.multivariate_normal(mu, covariance, 500)
    ax.scatter(samples[:, 0], samples[:, 1], alpha=0.3, s=10)
    plot_2d_gaussian(ax, mu, covariance, 'red')
    ax.set_title(panel_title)
    ax.set_xlabel('x‚ÇÅ')
    ax.set_ylabel('x‚ÇÇ')
    ax.set_aspect('equal')
    ax.set_xlim(-4, 4)
    ax.set_ylim(-4, 4)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("Covariance matrices:")
print(f"\nSpherical:\n{sigma_spherical}")
print(f"\nDiagonal:\n{sigma_diagonal}")
print(f"\nFull:\n{sigma_full}")

In [None]:
# Surface and filled-contour views of one correlated bivariate Gaussian.
fig = plt.figure(figsize=(14, 5))

# Evaluation grid covering [-3, 3] x [-3, 3].
x = np.linspace(-3, 3, 100)
y = np.linspace(-3, 3, 100)
X, Y = np.meshgrid(x, y)
pos = np.dstack((X, Y))

# Unit variances with covariance 0.8.
mu = np.array([0, 0])
sigma = np.array([[1, 0.8], [0.8, 1]])
rv = stats.multivariate_normal(mu, sigma)
Z = rv.pdf(pos)

# Left: the density drawn as a 3D surface.
ax1 = fig.add_subplot(121, projection='3d')
ax1.plot_surface(X, Y, Z, cmap='viridis', alpha=0.8)
ax1.set_xlabel('x‚ÇÅ')
ax1.set_ylabel('x‚ÇÇ')
ax1.set_zlabel('p(x)')
ax1.set_title('3D View of Bivariate Gaussian\n(œÅ = 0.8)')

# Right: the same density viewed from above as filled contours.
ax2 = fig.add_subplot(122)
contour = ax2.contourf(X, Y, Z, levels=20, cmap='viridis')
plt.colorbar(contour, ax=ax2, label='p(x)')
ax2.set(
    xlabel='x‚ÇÅ',
    ylabel='x‚ÇÇ',
    title='Contour View\n(Elliptical level sets)',
)
ax2.set_aspect('equal')

plt.tight_layout()
plt.show()

## 4. Mahalanobis Distance

$$d_M(\mathbf{x}) = \sqrt{(\mathbf{x}-\boldsymbol{\mu})^T \boldsymbol{\Sigma}^{-1}(\mathbf{x}-\boldsymbol{\mu})}$$

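For a Gaussian, points at equal Mahalanobis distance from the mean have equal probability density, so the density's level sets are exactly the ellipsoids of constant $d_M$.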

In [None]:
def mahalanobis_distance(x, mu, sigma):
    """Compute the Mahalanobis distance of ``x`` from ``mu`` under ``sigma``.

    Parameters
    ----------
    x : array_like, shape (d,) or (..., d)
        A single point or a batch of points; the last axis is the feature axis.
    mu : array_like, shape (d,)
        Mean vector.
    sigma : array_like, shape (d, d)
        Covariance matrix (symmetric positive definite).

    Returns
    -------
    float or ndarray
        sqrt((x - mu)^T sigma^{-1} (x - mu)): a scalar for a single point,
        otherwise an array of shape ``x.shape[:-1]``.
    """
    diff = np.asarray(x, dtype=float) - np.asarray(mu, dtype=float)
    # Solve sigma @ y = diff instead of forming sigma^{-1} explicitly. The
    # trailing axis makes each point a column vector, so batches broadcast.
    solved = np.linalg.solve(np.asarray(sigma, dtype=float), diff[..., np.newaxis])[..., 0]
    return np.sqrt(np.sum(diff * solved, axis=-1))


def euclidean_distance(x, mu):
    """Return the ordinary (L2) distance between ``x`` and ``mu``."""
    offset = x - mu
    return np.linalg.norm(offset)


# Compare Euclidean vs Mahalanobis distance for two points that are equally
# far from the mean in the Euclidean sense.
mu = np.array([0, 0])
sigma = np.array([[4, 2], [2, 2]])

# Two points at same Euclidean distance but different Mahalanobis.
# Because sigma has a non-zero off-diagonal entry, its principal axes are
# rotated away from the coordinate axes; the points below lie on the
# coordinate axes, not on the principal axes.
point_a = np.array([2, 0])  # On the x1 axis, where the marginal variance is 4
point_b = np.array([0, 2])  # On the x2 axis, where the marginal variance is 2

# Print both distances for each point.
print("Comparing Euclidean vs Mahalanobis Distance")
print("=" * 50)
print(f"\nCovariance matrix:")
print(sigma)
print(f"\nPoint A = {point_a}:")
print(f"  Euclidean:    {euclidean_distance(point_a, mu):.3f}")
print(f"  Mahalanobis:  {mahalanobis_distance(point_a, mu, sigma):.3f}")
print(f"\nPoint B = {point_b}:")
print(f"  Euclidean:    {euclidean_distance(point_b, mu):.3f}")
print(f"  Mahalanobis:  {mahalanobis_distance(point_b, mu, sigma):.3f}")
print(f"\nüîë Same Euclidean distance, but different Mahalanobis!")
print(f"Point B is more 'surprising' given the covariance structure.")

In [None]:
# Contour maps of Euclidean and Mahalanobis distance from `mu`, with the two
# comparison points marked on each.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Create grid
x = np.linspace(-4, 4, 100)
y = np.linspace(-4, 4, 100)
X, Y = np.meshgrid(x, y)

# Offset of every grid point from the mean, shape (rows, cols, 2). Both
# distance maps are measured from `mu`, so they stay correct if it changes.
grid_diff = np.stack([X - mu[0], Y - mu[1]], axis=-1)

# Euclidean distance contours
ax = axes[0]
Z_euclidean = np.linalg.norm(grid_diff, axis=-1)
contour = ax.contourf(X, Y, Z_euclidean, levels=20, cmap='viridis')
plt.colorbar(contour, ax=ax, label='Euclidean Distance')

# Add points
ax.plot(*point_a, 'r^', markersize=15, label=f'A: d_E={euclidean_distance(point_a, mu):.1f}')
ax.plot(*point_b, 'rs', markersize=15, label=f'B: d_E={euclidean_distance(point_b, mu):.1f}')
ax.plot(*mu, 'ko', markersize=10)
ax.set_xlabel('x‚ÇÅ')
ax.set_ylabel('x‚ÇÇ')
ax.set_title('Euclidean Distance\n(Ignores covariance structure)')
ax.legend()
ax.set_aspect('equal')

# Mahalanobis distance contours
ax = axes[1]
# Invert sigma once and evaluate the quadratic form diff^T sigma^{-1} diff
# for the whole grid in a single einsum, rather than looping over every
# point and re-inverting sigma each time.
sigma_inv = np.linalg.inv(sigma)
quad_form = np.einsum('...i,ij,...j->...', grid_diff, sigma_inv, grid_diff)
# Clamp at zero so round-off near the mean cannot produce NaN in the sqrt.
Z_mahal = np.sqrt(np.maximum(quad_form, 0.0))

contour = ax.contourf(X, Y, Z_mahal, levels=20, cmap='viridis')
plt.colorbar(contour, ax=ax, label='Mahalanobis Distance')

# Add points
ax.plot(*point_a, 'r^', markersize=15, label=f'A: d_M={mahalanobis_distance(point_a, mu, sigma):.2f}')
ax.plot(*point_b, 'rs', markersize=15, label=f'B: d_M={mahalanobis_distance(point_b, mu, sigma):.2f}')
ax.plot(*mu, 'ko', markersize=10)
ax.set_xlabel('x‚ÇÅ')
ax.set_ylabel('x‚ÇÇ')
ax.set_title('Mahalanobis Distance\n(Accounts for covariance structure)')
ax.legend()
ax.set_aspect('equal')

plt.tight_layout()
plt.show()

## 5. The Reparameterization Trick

To sample $\mathbf{x} \sim \mathcal{N}(\boldsymbol{\mu}, \boldsymbol{\Sigma})$:

1. Sample $\boldsymbol{\epsilon} \sim \mathcal{N}(\mathbf{0}, \mathbf{I})$
2. Compute $\mathbf{x} = \boldsymbol{\mu} + \mathbf{L}\boldsymbol{\epsilon}$ where $\boldsymbol{\Sigma} = \mathbf{L}\mathbf{L}^T$

For diagonal covariance (VAE): $\mathbf{x} = \boldsymbol{\mu} + \boldsymbol{\sigma} \odot \boldsymbol{\epsilon}$

In [None]:
def sample_reparameterized(mu, sigma, n_samples):
    """
    Draw samples from N(mu, sigma) via the reparameterization trick.

    Standard-normal noise epsilon is mapped through x = mu + L @ epsilon,
    where L is the lower-triangular Cholesky factor of sigma.

    Returns a pair ``(samples, epsilon)``, each of shape (n_samples, d):
    the transformed draws and the noise that produced them.
    """
    dim = len(mu)
    chol_factor = cholesky(sigma, lower=True)

    # One row of independent standard-normal noise per sample.
    epsilon = np.random.randn(n_samples, dim)

    # Apply L to each noise row, then shift by the mean.
    correlated_noise = (chol_factor @ epsilon.T).T
    return mu + correlated_noise, epsilon


def sample_diagonal_reparameterized(mu, sigma_diag, n_samples):
    """
    Draw samples from N(mu, diag(sigma^2)) - the VAE case.

    Each coordinate is scaled independently: x = mu + sigma * epsilon.
    Returns a pair ``(samples, epsilon)``, each of shape (n_samples, d).
    """
    noise_shape = (n_samples, len(mu))
    epsilon = np.random.randn(*noise_shape)
    return mu + sigma_diag * epsilon, epsilon

In [None]:
# Visualize the reparameterization trick in three steps:
# standard-normal noise -> linear map by the Cholesky factor -> target samples.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

n_samples = 1000

# Step 1: Sample from standard normal
ax = axes[0]
mu_standard = np.array([0, 0])
sigma_standard = np.eye(2)
epsilon = np.random.randn(n_samples, 2)

ax.scatter(epsilon[:, 0], epsilon[:, 1], alpha=0.3, s=10, c='blue')
ax.set_xlim(-4, 4)
ax.set_ylim(-4, 4)
ax.set_aspect('equal')
ax.set_xlabel('Œµ‚ÇÅ')
ax.set_ylabel('Œµ‚ÇÇ')
ax.set_title('Step 1: Œµ ~ N(0, I)\n(Standard normal noise)')
ax.grid(True, alpha=0.3)

# Step 2: Transform
ax = axes[1]
mu_target = np.array([1, 2])
sigma_target = np.array([[2, 1], [1, 1.5]])
L = cholesky(sigma_target, lower=True)

# Draw the columns of L as arrows from the origin. Column k is where the
# k-th unit basis vector lands under the map epsilon -> L @ epsilon. The
# arrows are read from L itself, so they always match their labels and the
# matrix printed in the box below.
column_arrows = [
    (L[:, 0], 'red', 'L[:, 0]', 'left', 0.1),
    (L[:, 1], 'green', 'L[:, 1]', 'right', -0.1),
]
for column, arrow_color, tag, text_align, x_offset in column_arrows:
    ax.annotate('', xy=tuple(column), xytext=(0, 0),
                arrowprops=dict(arrowstyle='->', color=arrow_color, lw=2))
    # Put the label just beside the arrow tip, clear of the arrow.
    ax.text(column[0] + x_offset, column[1], tag, color=arrow_color,
            fontsize=12, ha=text_align, va='center')

ax.set_xlim(-2, 3)
ax.set_ylim(-2, 3)
ax.set_aspect('equal')
ax.set_xlabel('x‚ÇÅ')
ax.set_ylabel('x‚ÇÇ')
ax.set_title('Step 2: Transform\nx = Œº + L @ Œµ')
ax.grid(True, alpha=0.3)

# Show Cholesky matrix
ax.text(0.5, 2.5, f'L = \n{L.round(2)}', fontsize=10, family='monospace',
        bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

# Step 3: Result. Reuse the exact noise from step 1 so the three panels
# show the same draws before and after the transformation.
ax = axes[2]
samples = mu_target + (L @ epsilon.T).T

ax.scatter(samples[:, 0], samples[:, 1], alpha=0.3, s=10, c='purple')
plot_2d_gaussian(ax, mu_target, sigma_target, 'red')
ax.set_xlim(-4, 6)
ax.set_ylim(-2, 6)
ax.set_aspect('equal')
ax.set_xlabel('x‚ÇÅ')
ax.set_ylabel('x‚ÇÇ')
ax.set_title(f'Step 3: x ~ N(Œº, Œ£)\nŒº={mu_target}, det(Œ£)={np.linalg.det(sigma_target):.2f}')
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# VAE-style reparameterization (diagonal covariance).
# This cell prints the recipe as text, then draws two schematic flow diagrams
# using only text boxes and arrows; no data is plotted.
print("VAE Reparameterization Trick")
print("=" * 50)
print("\nEncoder outputs: Œº(x) and log_var(x)")
print("We want to sample: z ~ N(Œº, diag(exp(log_var)))")
print("\nReparameterization:")
print("  1. Œµ ~ N(0, I)")
print("  2. œÉ = exp(0.5 * log_var)")
print("  3. z = Œº + œÉ ‚äô Œµ")
print("\nüîë Key benefit: Gradients flow through Œº and log_var!")

# Demonstrate
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# All positions below are in axes-fraction coordinates (0..1 across each
# panel), via transform=ax.transAxes for text and 'axes fraction' for arrows.

# Left panel: without reparameterization (can't backprop).
# Vertical chain: input x -> encoder -> direct sampling of z.
ax = axes[0]
ax.text(0.5, 0.8, 'x', fontsize=20, ha='center', transform=ax.transAxes)
# Arrow from the input down to the encoder box.
ax.annotate('', xy=(0.5, 0.65), xytext=(0.5, 0.75),
            arrowprops=dict(arrowstyle='->', color='black'), 
            xycoords='axes fraction', textcoords='axes fraction')
ax.text(0.5, 0.55, 'Encoder\nŒº(x), œÉ(x)', fontsize=12, ha='center', transform=ax.transAxes,
        bbox=dict(boxstyle='round', facecolor='lightblue'))
# Arrow from the encoder down to the sampling node.
ax.annotate('', xy=(0.5, 0.4), xytext=(0.5, 0.5),
            arrowprops=dict(arrowstyle='->', color='black'),
            xycoords='axes fraction', textcoords='axes fraction')
ax.text(0.5, 0.3, 'z ~ N(Œº, œÉ¬≤)', fontsize=14, ha='center', transform=ax.transAxes,
        bbox=dict(boxstyle='round', facecolor='lightyellow'))
ax.text(0.5, 0.15, '‚ùå Can\'t backprop\nthrough sampling!', fontsize=12, ha='center', 
        transform=ax.transAxes, color='red')
# Hide ticks and frame: the panel is a diagram, not a plot.
ax.axis('off')
ax.set_title('Without Reparameterization', fontsize=14)

# Right panel: with reparameterization (can backprop).
# The noise enters as a separate input and z is a deterministic function.
ax = axes[1]
ax.text(0.5, 0.9, 'x', fontsize=20, ha='center', transform=ax.transAxes)
# Arrow from the input down to the encoder box.
ax.annotate('', xy=(0.5, 0.8), xytext=(0.5, 0.87),
            arrowprops=dict(arrowstyle='->', color='black'),
            xycoords='axes fraction', textcoords='axes fraction')
ax.text(0.5, 0.7, 'Encoder\nŒº(x), œÉ(x)', fontsize=12, ha='center', transform=ax.transAxes,
        bbox=dict(boxstyle='round', facecolor='lightblue'))

# Noise (epsilon) branch: a grey box on the left with a grey arrow towards the combine node.
ax.text(0.15, 0.55, 'Œµ ~ N(0,I)', fontsize=11, ha='center', transform=ax.transAxes,
        bbox=dict(boxstyle='round', facecolor='lightgray'))
ax.annotate('', xy=(0.35, 0.45), xytext=(0.2, 0.52),
            arrowprops=dict(arrowstyle='->', color='gray'),
            xycoords='axes fraction', textcoords='axes fraction')

# Mean and standard-deviation branch: two blue arrows fanning out of the encoder box.
ax.annotate('', xy=(0.4, 0.55), xytext=(0.5, 0.65),
            arrowprops=dict(arrowstyle='->', color='blue'),
            xycoords='axes fraction', textcoords='axes fraction')
ax.annotate('', xy=(0.6, 0.55), xytext=(0.5, 0.65),
            arrowprops=dict(arrowstyle='->', color='blue'),
            xycoords='axes fraction', textcoords='axes fraction')
ax.text(0.35, 0.52, 'Œº', fontsize=14, ha='center', transform=ax.transAxes, color='blue')
ax.text(0.65, 0.52, 'œÉ', fontsize=14, ha='center', transform=ax.transAxes, color='blue')

# Combine: both blue branches converge on the same point, where z is formed.
ax.annotate('', xy=(0.5, 0.35), xytext=(0.35, 0.48),
            arrowprops=dict(arrowstyle='->', color='blue'),
            xycoords='axes fraction', textcoords='axes fraction')
ax.annotate('', xy=(0.5, 0.35), xytext=(0.65, 0.48),
            arrowprops=dict(arrowstyle='->', color='blue'),
            xycoords='axes fraction', textcoords='axes fraction')

ax.text(0.5, 0.38, 'z = Œº + œÉ ‚äô Œµ', fontsize=14, ha='center', transform=ax.transAxes,
        bbox=dict(boxstyle='round', facecolor='lightgreen'))
ax.text(0.5, 0.2, '‚úÖ Gradients flow\nthrough Œº and œÉ!', fontsize=12, ha='center',
        transform=ax.transAxes, color='green')
# Hide ticks and frame: the panel is a diagram, not a plot.
ax.axis('off')
ax.set_title('With Reparameterization', fontsize=14)

plt.tight_layout()
plt.show()

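## 6. KL Divergence Between Gaussians

For two $d$-dimensional Gaussians the KL divergence has a closed form:

$$D_{KL}\big(\mathcal{N}(\boldsymbol{\mu}_1, \boldsymbol{\Sigma}_1) \,\|\, \mathcal{N}(\boldsymbol{\mu}_2, \boldsymbol{\Sigma}_2)\big) = \frac{1}{2}\left[\log\frac{|\boldsymbol{\Sigma}_2|}{|\boldsymbol{\Sigma}_1|} - d + \mathrm{tr}\left(\boldsymbol{\Sigma}_2^{-1}\boldsymbol{\Sigma}_1\right) + (\boldsymbol{\mu}_2-\boldsymbol{\mu}_1)^T \boldsymbol{\Sigma}_2^{-1}(\boldsymbol{\mu}_2-\boldsymbol{\mu}_1)\right]$$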

In [None]:
def kl_gaussian_multivariate(mu1, sigma1, mu2, sigma2):
    """
    KL divergence between two multivariate Gaussians.
    D_KL(N(mu1, sigma1) || N(mu2, sigma2))

    Parameters
    ----------
    mu1, mu2 : array_like, shape (d,)
        Mean vectors of the first and second Gaussian.
    sigma1, sigma2 : array_like, shape (d, d)
        Covariance matrices (symmetric positive definite).

    Returns
    -------
    float
        The divergence in nats.

    Raises
    ------
    ValueError
        If either covariance matrix does not have a positive determinant.
    """
    mu1 = np.asarray(mu1, dtype=float)
    mu2 = np.asarray(mu2, dtype=float)
    sigma1 = np.asarray(sigma1, dtype=float)
    sigma2 = np.asarray(sigma2, dtype=float)
    d = mu1.shape[0]

    # Work with log-determinants: the ratio det(sigma2) / det(sigma1)
    # overflows or underflows quickly as the dimension grows.
    sign1, log_det1 = np.linalg.slogdet(sigma1)
    sign2, log_det2 = np.linalg.slogdet(sigma2)
    if sign1 <= 0 or sign2 <= 0:
        raise ValueError("covariance matrices must be positive definite (determinant is not positive)")

    mean_diff = mu2 - mu1

    term1 = log_det2 - log_det1
    term2 = -d
    # tr(sigma2^{-1} sigma1) and the quadratic form, both via linear solves
    # rather than an explicit inverse of sigma2.
    term3 = np.trace(np.linalg.solve(sigma2, sigma1))
    term4 = mean_diff @ np.linalg.solve(sigma2, mean_diff)

    return 0.5 * (term1 + term2 + term3 + term4)


def kl_to_standard_normal_multivariate(mu, log_var):
    """
    VAE KL loss: D_KL(N(mu, diag(exp(log_var))) || N(0, I))

    ``mu`` and ``log_var`` are arrays of the same shape holding the mean and
    log-variance of each dimension; contributions are summed over all entries.
    """
    per_dimension = mu**2 + np.exp(log_var) - 1 - log_var
    return 0.5 * np.sum(per_dimension)


# Cross-check the closed-form KL against a Monte Carlo estimate.
# P = N(mu1, sigma1) is a correlated Gaussian; Q = N(mu2, sigma2) is the standard normal.
mu1 = np.array([1.0, 2.0])
sigma1 = np.array([[1.5, 0.5], [0.5, 1.0]])

mu2 = np.array([0.0, 0.0])
sigma2 = np.eye(2)

kl_analytical = kl_gaussian_multivariate(mu1, sigma1, mu2, sigma2)

# Monte Carlo: KL(P || Q) = E_P[log p(x) - log q(x)], estimated by averaging
# the log-density ratio over draws from P.
n_samples = 100000
samples = np.random.multivariate_normal(mu1, sigma1, n_samples)
log_p1 = stats.multivariate_normal.logpdf(samples, mu1, sigma1)
log_p2 = stats.multivariate_normal.logpdf(samples, mu2, sigma2)
kl_monte_carlo = np.mean(log_p1 - log_p2)

report_lines = [
    "KL Divergence Between Multivariate Gaussians",
    "=" * 50,
    f"P = N({mu1}, Œ£‚ÇÅ)",
    f"Q = N({mu2}, I)",
    f"\nAnalytical: {kl_analytical:.6f} nats",
    f"Monte Carlo: {kl_monte_carlo:.6f} nats",
    f"Difference: {abs(kl_analytical - kl_monte_carlo):.6f}",
]
for report_line in report_lines:
    print(report_line)

In [None]:
# Two views of the 1D VAE KL term: the full (mu, sigma) loss surface, and its
# decomposition into a variance part and a log part at mu = 0.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: KL over a grid of means and log-variances.
ax = axes[0]
mu_range = np.linspace(-3, 3, 100)
logvar_range = np.linspace(-3, 2, 100)
MU, LOGVAR = np.meshgrid(mu_range, logvar_range)

# Closed-form KL to N(0, 1) at every grid point.
KL = 0.5 * (MU**2 + np.exp(LOGVAR) - 1 - LOGVAR)

# The vertical axis shows sigma = exp(log_var / 2) rather than log_var.
contour = ax.contourf(MU, np.exp(LOGVAR/2), KL, levels=20, cmap='viridis')
plt.colorbar(contour, ax=ax, label='KL [nats]')
ax.plot(0, 1, 'r*', markersize=15, label='Minimum (Œº=0, œÉ=1)')
ax.set(
    xlabel='Œº',
    ylabel='œÉ',
    title='VAE KL Loss: $D_{KL}(q(z|x) \\| p(z))$\nPulls toward N(0,1)',
)
ax.legend()

# Right panel: split the mu = 0 loss into its two competing pieces.
ax = axes[1]
sigma_range = np.linspace(0.1, 3, 100)

# Components for mu=0
kl_total = 0.5 * (sigma_range**2 - 1 - np.log(sigma_range**2))
variance_term = 0.5 * (sigma_range**2 - 1)
log_term = -0.5 * np.log(sigma_range**2)

# Each entry is (curve values, line format, legend text).
component_curves = [
    (kl_total, 'b-', 'Total KL'),
    (variance_term, 'r--', '$\\frac{1}{2}(\\sigma^2 - 1)$'),
    (log_term, 'g--', '$-\\frac{1}{2}\\log(\\sigma^2)$'),
]
for curve_values, line_format, legend_text in component_curves:
    ax.plot(sigma_range, curve_values, line_format, linewidth=2, label=legend_text)

# Reference lines: the minimizing sigma and the zero level.
ax.axvline(x=1, color='gray', linestyle=':', alpha=0.7, label='œÉ=1 (minimum)')
ax.axhline(y=0, color='black', linestyle='-', alpha=0.3)
ax.set(
    xlabel='œÉ',
    ylabel='Value [nats]',
    title='KL Loss Components (Œº=0)\nVariance + Log terms balance at œÉ=1',
)
ax.legend()
ax.set_xlim(0.1, 3)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Summary

### Key Formulas

| Concept | Formula |
|---------|--------|
| Univariate PDF | $\frac{1}{\sqrt{2\pi\sigma^2}} \exp\left(-\frac{(x-\mu)^2}{2\sigma^2}\right)$ |
| Multivariate PDF | $\frac{1}{(2\pi)^{d/2}|\Sigma|^{1/2}} \exp\left(-\frac{1}{2}(x-\mu)^T\Sigma^{-1}(x-\mu)\right)$ |
| Mahalanobis | $d_M = \sqrt{(x-\mu)^T\Sigma^{-1}(x-\mu)}$ |
| Entropy | $\frac{d}{2}(1 + \log 2\pi) + \frac{1}{2}\log|\Sigma|$ |
| Reparameterization | $x = \mu + \sigma \odot \epsilon$, $\epsilon \sim N(0,I)$ |
| VAE KL | $\frac{1}{2}\sum_j(\mu_j^2 + \sigma_j^2 - 1 - \log\sigma_j^2)$ |

### Key Insights

1. **Gaussians maximize entropy** given mean and variance constraints
2. **Covariance structure** determines the shape of equal-probability contours
3. **Mahalanobis distance** is the natural metric that accounts for covariance
4. **Reparameterization trick** enables gradient-based learning with stochastic nodes
5. **Closed-form KL** makes VAE training efficient