# Tutorial 08: Positive Definite Matrices

Visual, hands-on demonstrations for understanding positive definite matrices and their role in machine learning.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Apply a white-grid theme to every figure created in this notebook.
# NOTE(review): the 'seaborn-v0_8-*' style names are only shipped by newer
# Matplotlib releases — confirm the installed version provides this style.
plt.style.use('seaborn-v0_8-whitegrid')
# Seed NumPy's global random generator so the random draws below repeat.
np.random.seed(42)

## 1. What Makes a Matrix Positive Definite?

In [None]:
def check_positive_definiteness(A, name="A"):
    """
    Classify a symmetric matrix by definiteness and print three diagnostics.

    The returned classification is based on the eigenvalue signs (Test 1).
    The Cholesky attempt (Test 2) and the leading principal minors
    (Test 3, Sylvester's criterion) are printed as cross-checks only.

    Parameters
    ----------
    A : array_like
        Square matrix, expected to be symmetric. Symmetry is not verified.
    name : str
        Label used in the printed header.

    Returns
    -------
    str
        One of "POSITIVE DEFINITE", "POSITIVE SEMI-DEFINITE",
        "NEGATIVE DEFINITE", "NEGATIVE SEMI-DEFINITE" or "INDEFINITE".
    """
    A = np.asarray(A)
    print(f"Matrix {name}:")
    print(A)
    print()

    # Test 1: Eigenvalues
    eigenvalues = np.linalg.eigvalsh(A)
    print(f"Eigenvalues: {eigenvalues}")

    # A mathematically zero eigenvalue usually comes back as +/-1e-16 rather
    # than exactly 0, so compare against a round-off-sized tolerance (scaled
    # by the largest eigenvalue magnitude and the matrix size).
    tol = (np.abs(eigenvalues).max(initial=0.0)
           * max(A.shape) * np.finfo(float).eps)

    if np.all(eigenvalues > tol):
        status = "POSITIVE DEFINITE"
    elif np.all(eigenvalues >= -tol):
        status = "POSITIVE SEMI-DEFINITE"
    elif np.all(eigenvalues < -tol):
        status = "NEGATIVE DEFINITE"
    elif np.all(eigenvalues <= tol):
        status = "NEGATIVE SEMI-DEFINITE"
    else:
        status = "INDEFINITE"

    print(f"Status: {status}")

    # Test 2: Try Cholesky (only the success/failure matters, not the factor)
    try:
        np.linalg.cholesky(A)
    except np.linalg.LinAlgError:
        print("Cholesky exists: No")
    else:
        print("Cholesky exists: Yes")

    # Test 3: Leading principal minors (Sylvester)
    n = A.shape[0]
    # float(...) keeps the printed list free of NumPy scalar wrappers.
    minors = [float(np.linalg.det(A[:k, :k])) for k in range(1, n + 1)]
    print(f"Leading principal minors: {minors}")

    print("-" * 40)
    return status

# Test different matrices: one example per definiteness class.
# Each label states the expected class so the printed status can be checked
# against it by eye.
A_pd = np.array([[2, 1], [1, 2]])  # Positive definite (eigenvalues 1 and 3)
A_psd = np.array([[1, 1], [1, 1]])  # Positive semi-definite (eigenvalues 0 and 2)
A_indef = np.array([[1, 2], [2, 1]])  # Indefinite (eigenvalues -1 and 3)

check_positive_definiteness(A_pd, "A (should be PD)")
check_positive_definiteness(A_psd, "B (should be PSD)")
check_positive_definiteness(A_indef, "C (should be indefinite)")

## 2. Geometric Visualization: Quadratic Forms

In [None]:
def visualize_quadratic_form(A, title="Quadratic Form"):
    """
    Visualize f(x) = x^T A x for a 2x2 symmetric matrix.

    Three panels are drawn on one figure:
      1. the 3D surface of f over the square [-2, 2] x [-2, 2],
      2. the contour lines of f with the eigenvectors of A overlaid,
      3. the 1D profiles f(t·v) = λ t² along each eigenvector v.

    Parameters
    ----------
    A : array_like
        2x2 matrix, expected to be symmetric (the eigen-decomposition uses
        `np.linalg.eigh`, which does not verify symmetry).
    title : str
        First line of the figure's super-title.

    Returns
    -------
    None
        The figure is shown with `plt.show()`.
    """
    A = np.asarray(A)
    fig = plt.figure(figsize=(15, 5))

    # Create grid
    x = np.linspace(-2, 2, 100)
    y = np.linspace(-2, 2, 100)
    X, Y = np.meshgrid(x, y)

    # Compute quadratic form (expanded x^T A x for a 2x2 matrix)
    Z = A[0, 0] * X**2 + (A[0, 1] + A[1, 0]) * X * Y + A[1, 1] * Y**2

    # Eigendecomposition
    eigenvalues, eigenvectors = np.linalg.eigh(A)

    # 3D Surface
    ax1 = fig.add_subplot(131, projection='3d')

    # Color based on definiteness. Eigenvalues that are zero in exact
    # arithmetic may come back as +/-1e-16, so signs are judged against a
    # round-off-sized tolerance instead of exactly 0.
    tol = (np.abs(eigenvalues).max(initial=0.0)
           * len(eigenvalues) * np.finfo(float).eps)
    if np.all(eigenvalues > tol):
        cmap = 'Greens'
        status = 'PD (bowl up)'
    elif np.all(eigenvalues >= -tol):
        cmap = 'Blues'
        status = 'PSD'
    elif np.all(eigenvalues < -tol):
        cmap = 'Reds'
        status = 'ND (bowl down)'
    elif np.all(eigenvalues <= tol):
        cmap = 'Oranges'
        status = 'NSD'
    else:
        cmap = 'RdYlBu'
        status = 'Indefinite (saddle)'

    ax1.plot_surface(X, Y, Z, cmap=cmap, alpha=0.8)
    ax1.set_xlabel('x₁')
    ax1.set_ylabel('x₂')
    ax1.set_zlabel('f(x)')
    ax1.set_title(f'Surface\n{status}')

    # Contour plot
    ax2 = fig.add_subplot(132)
    # Contour levels must be strictly increasing; a flat surface (e.g. the
    # zero matrix) has no contours to draw, so skip it in that case.
    if Z.max() > Z.min():
        levels = np.linspace(Z.min(), Z.max(), 20)
        contour = ax2.contour(X, Y, Z, levels=levels, cmap=cmap)
        ax2.clabel(contour, inline=True, fontsize=8)

    # Draw eigenvectors (columns of `eigenvectors`, hence the transpose)
    scale = 1.5
    colors = ['red', 'blue']
    for i, (val, vec) in enumerate(zip(eigenvalues, eigenvectors.T)):
        ax2.arrow(0, 0, scale*vec[0], scale*vec[1], head_width=0.1,
                  head_length=0.05, fc=colors[i], ec=colors[i], linewidth=2)
        ax2.text(scale*vec[0]*1.15, scale*vec[1]*1.15, f'λ={val:.2f}',
                 fontsize=10, color=colors[i], fontweight='bold')

    ax2.set_xlim(-2, 2)
    ax2.set_ylim(-2, 2)
    ax2.set_aspect('equal')
    ax2.set_xlabel('x₁')
    ax2.set_ylabel('x₂')
    ax2.set_title('Contours with Eigenvectors')
    ax2.grid(True, alpha=0.3)

    # Energy along eigenvector directions
    ax3 = fig.add_subplot(133)
    t = np.linspace(-2, 2, 100)

    for i, (val, vec) in enumerate(zip(eigenvalues, eigenvectors.T)):
        # For a unit eigenvector v, f(t·v) = t² vᵀAv = λ t²
        f_along = val * t**2
        ax3.plot(t, f_along, colors[i], linewidth=2, label=f'Along v{i+1} (λ={val:.2f})')

    ax3.axhline(y=0, color='k', linewidth=0.5)
    ax3.set_xlabel('t (position along eigenvector)')
    ax3.set_ylabel('f(t·v)')
    ax3.set_title('Quadratic Form Along Eigenvectors')
    ax3.legend()
    ax3.grid(True, alpha=0.3)

    plt.suptitle(f'{title}\nA = {A.tolist()}\nEigenvalues: {eigenvalues}', fontsize=12, y=1.02)
    plt.tight_layout()
    plt.show()

# Gallery of quadratic forms. Each entry pairs a console banner with the
# (matrix, figure title) cases shown under it:
#   - a positive definite matrix (bowl shape),
#   - an indefinite matrix (saddle shape),
#   - two diagonal matrices contrasting condition numbers 4 and 10.
_quadratic_form_demos = (
    ("POSITIVE DEFINITE: All eigenvalues > 0",
     [([[3, 1], [1, 2]], "Positive Definite Matrix")]),
    ("\nINDEFINITE: Mixed sign eigenvalues",
     [([[1, 2], [2, 1]], "Indefinite Matrix")]),
    ("\nEFFECT OF CONDITION NUMBER:",
     [([[4, 0], [0, 1]], "Well-conditioned (κ=4)"),
      ([[10, 0], [0, 1]], "Ill-conditioned (κ=10)")]),
)

for _banner, _cases in _quadratic_form_demos:
    print(_banner)
    for _entries, _caption in _cases:
        visualize_quadratic_form(np.array(_entries), _caption)

## 3. Cholesky Decomposition

In [None]:
def cholesky_step_by_step(A):
    """
    Compute the Cholesky factor of A, printing every intermediate step.

    Builds the lower-triangular L with A = L @ L.T row by row. Only the
    lower triangle of A (entries A[i, j] with j <= i) is read.

    Parameters
    ----------
    A : array_like
        Square, symmetric positive definite matrix.

    Returns
    -------
    numpy.ndarray
        Lower-triangular float matrix L such that L @ L.T equals A.

    Raises
    ------
    numpy.linalg.LinAlgError
        If A is not a square 2-D matrix, or if a pivot is not strictly
        positive (A is not positive definite).
    """
    A = np.asarray(A)
    if A.ndim != 2 or A.shape[0] != A.shape[1]:
        raise np.linalg.LinAlgError(
            "Cholesky decomposition requires a square matrix")

    print("Cholesky Decomposition: A = L @ L.T")
    print("="*50)
    print(f"Input A:\n{A}\n")

    n = A.shape[0]
    L = np.zeros((n, n))

    for i in range(n):
        for j in range(i + 1):
            if i == j:
                # Diagonal element
                sum_sq = sum(L[i, k]**2 for k in range(j))
                val = A[i, i] - sum_sq
                # A non-positive pivot means A is not positive definite.
                # Without this check sqrt() would quietly yield NaN (or a 0
                # that later causes a division by zero).
                if val <= 0:
                    raise np.linalg.LinAlgError(
                        f"Matrix is not positive definite: "
                        f"pivot {i} is {val}")
                L[i, j] = np.sqrt(val)
                print(f"L[{i},{j}] = sqrt(A[{i},{i}] - sum(L[{i},0:{j}]²))")
                print(f"       = sqrt({A[i,i]} - {sum_sq}) = sqrt({val}) = {L[i,j]:.4f}")
            else:
                # Off-diagonal element
                sum_prod = sum(L[i, k] * L[j, k] for k in range(j))
                L[i, j] = (A[i, j] - sum_prod) / L[j, j]
                print(f"L[{i},{j}] = (A[{i},{j}] - sum(L[{i},k]*L[{j},k])) / L[{j},{j}]")
                print(f"       = ({A[i,j]} - {sum_prod}) / {L[j,j]:.4f} = {L[i,j]:.4f}")
        print()

    print(f"Result L:\n{L}\n")
    print(f"Verification L @ L.T:\n{L @ L.T}")

    return L

# Worked example: factor a 3x3 symmetric positive definite matrix by hand.
A = np.array(
    [
        [4, 2, -2],
        [2, 5, -4],
        [-2, -4, 14],
    ],
    dtype=float,
)

L = cholesky_step_by_step(A)

# Cross-check the hand-rolled factor against NumPy's implementation.
print("\nNumPy Cholesky:", np.linalg.cholesky(A), sep="\n")

## 4. Application: Sampling from Multivariate Gaussian

In [None]:
def sample_and_visualize_gaussian(mu, Sigma, n_samples=1000):
    """
    Sample from a 2D Gaussian N(mu, Sigma) using Cholesky and visualize.

    Standard-normal draws z are mapped to x = mu + L z, where L is the
    Cholesky factor of Sigma. The figure shows the raw draws, the
    transformation parameters, and the transformed samples with 1σ/2σ/3σ
    covariance ellipses. Sample statistics are printed afterwards.

    Parameters
    ----------
    mu : array_like of length 2
        Mean of the target distribution.
    Sigma : numpy.ndarray of shape (2, 2)
        Covariance matrix; must be positive definite, otherwise
        `np.linalg.cholesky` raises `numpy.linalg.LinAlgError`.
    n_samples : int
        Number of points to draw from NumPy's global random generator.

    Returns
    -------
    None
        The figure is shown with `plt.show()`.
    """
    # Cholesky decomposition
    L = np.linalg.cholesky(Sigma)

    # Sample from standard normal (one sample per row)
    z = np.random.randn(n_samples, 2)

    # Transform: x = mu + L @ z, written for row vectors as z @ L.T
    samples = mu + z @ L.T

    fig, axes = plt.subplots(1, 3, figsize=(15, 5))

    # Plot standard normal samples
    ax1 = axes[0]
    ax1.scatter(z[:, 0], z[:, 1], alpha=0.3, s=10)
    ax1.set_xlim(-4, 4)
    ax1.set_ylim(-4, 4)
    ax1.set_aspect('equal')
    ax1.set_title('Standard Normal N(0, I)\n(Before transformation)')
    ax1.set_xlabel('z₁')
    ax1.set_ylabel('z₂')
    circle = plt.Circle((0, 0), 2, fill=False, color='red', linewidth=2)
    ax1.add_patch(circle)

    # Plot transformation matrix effect (text-only panel)
    ax2 = axes[1]
    ax2.text(0.1, 0.9, f'L (Cholesky):', transform=ax2.transAxes, fontsize=12)
    ax2.text(0.1, 0.7, f'{L}', transform=ax2.transAxes, fontsize=10, family='monospace')
    ax2.text(0.1, 0.5, f'\nΣ = L @ L.T:', transform=ax2.transAxes, fontsize=12)
    ax2.text(0.1, 0.3, f'{Sigma}', transform=ax2.transAxes, fontsize=10, family='monospace')
    ax2.text(0.1, 0.1, f'\nμ = {mu}', transform=ax2.transAxes, fontsize=12)
    ax2.axis('off')
    ax2.set_title('Transformation Parameters')

    # Plot transformed samples
    ax3 = axes[2]
    ax3.scatter(samples[:, 0], samples[:, 1], alpha=0.3, s=10, color='green')

    # Draw covariance ellipses: a unit circle stretched by sqrt(eigenvalue)
    # along each principal axis, rotated by the eigenvectors, moved to mu.
    eigenvalues, eigenvectors = np.linalg.eigh(Sigma)
    theta = np.linspace(0, 2*np.pi, 100)
    unit_circle = np.array([np.cos(theta), np.sin(theta)])

    # One line style per contour so the legend entries can be told apart.
    for n_std, linestyle in zip([1, 2, 3], ['-', '--', ':']):
        scaled = np.diag(n_std * np.sqrt(eigenvalues)) @ unit_circle
        rotated = eigenvectors @ scaled
        shifted = rotated + np.array(mu).reshape(-1, 1)
        ax3.plot(shifted[0], shifted[1], color='red', linestyle=linestyle,
                 linewidth=2, label=f'{n_std}σ')

    ax3.scatter([mu[0]], [mu[1]], color='red', s=100, marker='x', linewidths=3)
    ax3.set_aspect('equal')
    ax3.set_title(f'Transformed N(μ, Σ)\n(x = μ + L @ z)')
    ax3.set_xlabel('x₁')
    ax3.set_ylabel('x₂')
    # Without this call the contour labels above are never displayed.
    ax3.legend()

    plt.suptitle('Sampling from Multivariate Gaussian via Cholesky', fontsize=14, y=1.02)
    plt.tight_layout()
    plt.show()

    # Verify sample statistics
    print(f"Target mean: {mu}")
    print(f"Sample mean: {samples.mean(axis=0)}")
    print(f"\nTarget covariance:\n{Sigma}")
    print(f"Sample covariance:\n{np.cov(samples.T)}")

# Demo: a correlated 2D Gaussian centred at (1, 2).
mu, Sigma = np.array([1, 2]), np.array([[2.0, 0.8], [0.8, 1.0]])

sample_and_visualize_gaussian(mu=mu, Sigma=Sigma)

## 5. Application: Covariance Matrices

In [None]:
def visualize_covariance_matrix():
    """
    Illustrate that a sample covariance matrix is positive semi-definite.

    Draws 100 correlated 2D points (global RNG re-seeded with 42), centers
    them, forms the sample covariance, and plots the centered data, a heat
    map of the covariance entries, and a bar chart of its eigenvalues.
    The covariance and its eigenvalues are printed afterwards.
    """
    np.random.seed(42)
    num_points, num_features = 100, 2

    # Correlate standard-normal draws through the Cholesky factor.
    population_cov = np.array([[2, 1], [1, 1.5]])
    chol_factor = np.linalg.cholesky(population_cov)
    raw = np.random.randn(num_points, num_features) @ chol_factor.T

    # Center, then estimate the covariance with the unbiased n-1 divisor.
    centered = raw - raw.mean(axis=0)
    sample_cov = centered.T @ centered / (num_points - 1)
    spectrum = np.linalg.eigvalsh(sample_cov)

    _, (scatter_ax, matrix_ax, eig_ax) = plt.subplots(1, 3, figsize=(15, 5))

    # Panel 1: the centered point cloud with axes through the origin.
    scatter_ax.scatter(centered[:, 0], centered[:, 1], alpha=0.5)
    scatter_ax.axhline(y=0, color='k', linewidth=0.5)
    scatter_ax.axvline(x=0, color='k', linewidth=0.5)
    scatter_ax.set_aspect('equal')
    scatter_ax.set_title('Centered Data')
    scatter_ax.set_xlabel('x₁')
    scatter_ax.set_ylabel('x₂')

    # Panel 2: heat map of the covariance with each entry written in its cell.
    heat = matrix_ax.imshow(sample_cov, cmap='RdBu_r', vmin=-2, vmax=2)
    matrix_ax.set_title(f'Sample Covariance Σ\n(det = {np.linalg.det(sample_cov):.3f})')
    for (row, col), entry in np.ndenumerate(sample_cov):
        # imshow puts the column index on x and the row index on y.
        matrix_ax.text(col, row, f'{entry:.2f}', ha='center', va='center', fontsize=14)
    matrix_ax.set_xticks([0, 1])
    matrix_ax.set_yticks([0, 1])
    plt.colorbar(heat, ax=matrix_ax)

    # Panel 3: eigenvalues as bars, each annotated with its value.
    eig_ax.bar([0, 1], spectrum, color=['blue', 'orange'])
    eig_ax.axhline(y=0, color='k', linewidth=0.5)
    eig_ax.set_xticks([0, 1])
    eig_ax.set_xticklabels(['λ₁', 'λ₂'])
    eig_ax.set_title(f'Eigenvalues\nAll ≥ 0 → PSD')
    eig_ax.set_ylabel('Eigenvalue')
    for position, eig in enumerate(spectrum):
        eig_ax.text(position, eig + 0.1, f'{eig:.3f}', ha='center', fontsize=12)

    plt.suptitle('Covariance Matrix is Always Positive Semi-Definite\n'
                 'Σ = XᵀX/(n-1), so vᵀΣv = ||Xv||²/(n-1) ≥ 0', fontsize=12, y=1.02)
    plt.tight_layout()
    plt.show()

    print(f"Sample covariance:\n{sample_cov}")
    print(f"\nEigenvalues: {spectrum}")
    # A small negative tolerance absorbs floating-point round-off.
    print(f"All eigenvalues ≥ 0: {np.all(spectrum >= -1e-10)}")

# Run the demo: shows the three-panel covariance figure and prints the
# sample covariance together with its eigenvalues.
visualize_covariance_matrix()

## 6. Application: Optimization and Hessian

In [None]:
def visualize_optimization_landscape(A, b, lr=0.1, max_iters=50, tol=1e-6):
    """
    Visualize optimization of f(x) = 1/2 x^T A x - b^T x in two dimensions.

    Shows the objective surface with the stationary point x* = A⁻¹b, the
    path of fixed-step gradient descent started at (4, 4), and the
    eigenvalues of the Hessian (which equals A). Positive eigenvalues mean
    the stationary point is the unique minimum.

    Parameters
    ----------
    A : array_like of shape (2, 2)
        Symmetric matrix defining the quadratic term. Must be non-singular,
        otherwise `np.linalg.solve` raises `numpy.linalg.LinAlgError`.
    b : array_like of length 2
        Linear term.
    lr : float
        Gradient-descent step size.
    max_iters : int
        Maximum number of gradient-descent steps.
    tol : float
        Gradient norm below which the iterate counts as converged.

    Returns
    -------
    None
        The figure is shown with `plt.show()`.
    """
    A = np.asarray(A)
    b = np.asarray(b)
    fig = plt.figure(figsize=(15, 5))

    # Create grid
    x = np.linspace(-3, 5, 100)
    y = np.linspace(-3, 5, 100)
    X, Y = np.meshgrid(x, y)

    # Compute objective function on the whole grid at once:
    # 1/2 x^T A x - b^T x expanded for a 2x2 matrix.
    Z = (0.5 * (A[0, 0] * X**2 + (A[0, 1] + A[1, 0]) * X * Y + A[1, 1] * Y**2)
         - (b[0] * X + b[1] * Y))

    # Optimal solution (stationary point of f)
    x_opt = np.linalg.solve(A, b)
    f_opt = 0.5 * x_opt @ A @ x_opt - b @ x_opt

    # Eigenvalues of Hessian (= A)
    eigenvalues = np.linalg.eigvalsh(A)

    # 3D surface
    ax1 = fig.add_subplot(131, projection='3d')
    ax1.plot_surface(X, Y, Z, cmap='viridis', alpha=0.8)
    ax1.scatter([x_opt[0]], [x_opt[1]], [f_opt], color='red', s=100, marker='*')
    ax1.set_xlabel('x₁')
    ax1.set_ylabel('x₂')
    ax1.set_zlabel('f(x)')
    ax1.set_title('Objective Function')

    # Contours with gradient descent
    ax2 = fig.add_subplot(132)
    ax2.contour(X, Y, Z, levels=30, cmap='viridis')

    # Gradient descent: stop as soon as the gradient at the current iterate
    # is small, so no extra step is taken after convergence.
    x_gd = np.array([4.0, 4.0])
    path = [x_gd.copy()]

    for _ in range(max_iters):
        grad = A @ x_gd - b
        if np.linalg.norm(grad) < tol:
            break
        x_gd = x_gd - lr * grad
        path.append(x_gd.copy())

    # Judge convergence from the final iterate, so running out of
    # iterations is not reported as success.
    converged = np.linalg.norm(A @ x_gd - b) < tol

    path = np.array(path)
    n_steps = len(path) - 1
    if converged:
        outcome = f'converged in {n_steps} steps'
    else:
        outcome = f'not converged after {n_steps} steps'

    ax2.plot(path[:, 0], path[:, 1], 'r.-', markersize=8, label='GD path')
    ax2.scatter([x_opt[0]], [x_opt[1]], color='green', s=100, marker='*', label='Optimal')
    ax2.set_xlabel('x₁')
    ax2.set_ylabel('x₂')
    ax2.set_title(f'Gradient Descent\n({outcome})')
    ax2.legend()
    ax2.set_aspect('equal')

    # Eigenvalue analysis
    ax3 = fig.add_subplot(133)
    ax3.bar([0, 1], eigenvalues, color=['blue', 'orange'])
    ax3.axhline(y=0, color='k', linewidth=0.5)
    ax3.set_xticks([0, 1])
    ax3.set_xticklabels(['λ₁', 'λ₂'])
    # Magnitudes keep κ meaningful when an eigenvalue is negative; for a
    # positive definite A this equals max(λ)/min(λ).
    eig_magnitudes = np.abs(eigenvalues)
    kappa = eig_magnitudes.max() / eig_magnitudes.min()
    ax3.set_title(f'Hessian Eigenvalues\nκ = {kappa:.1f}')
    ax3.set_ylabel('Eigenvalue')

    for i, ev in enumerate(eigenvalues):
        ax3.text(i, ev + 0.1, f'{ev:.2f}', ha='center', fontsize=12)

    # The premise shown must match the verdict.
    if np.all(eigenvalues > 0):
        verdict = 'All λ > 0 → UNIQUE MINIMUM'
    else:
        verdict = 'Some λ ≤ 0 → NO UNIQUE MIN'
    ax3.text(0.5, 0.1, verdict, transform=ax3.transAxes,
             ha='center', fontsize=10, fontweight='bold')

    plt.suptitle(f'Optimization: f(x) = ½xᵀAx - bᵀx\n'
                 f'A = {A.tolist()}, b = {b.tolist()}\n'
                 f'Optimal x* = A⁻¹b = {x_opt}', fontsize=11, y=1.05)
    plt.tight_layout()
    plt.show()

# Well-conditioned problem.
# A_good has eigenvalues (3 ± √2)/2 ≈ 2.21 and 0.79, so κ ≈ 2.8 — the same
# value the figure title reports.
print("Well-conditioned problem (κ ≈ 2.8):")
A_good = np.array([[2, 0.5], [0.5, 1]])
b = np.array([1, 1])
visualize_optimization_landscape(A_good, b)

# Ill-conditioned problem: diagonal matrix, so κ = 10 / 1.
print("\nIll-conditioned problem (κ = 10):")
A_bad = np.array([[10, 0], [0, 1]])
visualize_optimization_landscape(A_bad, b)

## 7. Regularization: Making Matrices Positive Definite

In [None]:
def demonstrate_regularization():
    """
    Show how ridge regularization (A + λI) improves conditioning.

    Builds a 50x50 matrix from a rank-5 term plus a 0.01·I floor (global
    RNG re-seeded with 42), so its smallest eigenvalues sit near 0.01.
    Three panels compare the eigenvalue spectrum before and after adding
    λI, and sweep λ to show the condition number falling while the
    smallest eigenvalue rises.

    Returns
    -------
    None
        The figure is shown with `plt.show()`.
    """
    # Create a nearly singular matrix
    np.random.seed(42)
    n = 50
    k = 5  # True rank of the low-rank part

    # Low-rank matrix + small diagonal floor that keeps it positive definite
    U = np.random.randn(n, k)
    A = U @ U.T / k + 0.01 * np.eye(n)

    eigenvalues_orig = np.linalg.eigvalsh(A)

    fig, axes = plt.subplots(1, 3, figsize=(15, 5))

    # Original eigenvalue spectrum (sorted largest first)
    ax1 = axes[0]
    ax1.semilogy(range(1, n+1), np.sort(eigenvalues_orig)[::-1], 'b.-', markersize=5)
    # A reference line at y=0 cannot appear on a logarithmic axis, so mark
    # the smallest eigenvalue instead.
    ax1.axhline(y=eigenvalues_orig.min(), color='r', linewidth=0.5, linestyle='--')
    ax1.set_xlabel('Index')
    ax1.set_ylabel('Eigenvalue (log scale)')
    ax1.set_title(f'Original Eigenvalues\nMin = {eigenvalues_orig.min():.2e}\n'
                  f'Condition = {eigenvalues_orig.max()/eigenvalues_orig.min():.2e}')
    ax1.grid(True, alpha=0.3)

    # Ridge regularization
    lambda_reg = 0.1
    A_ridge = A + lambda_reg * np.eye(n)
    eigenvalues_ridge = np.linalg.eigvalsh(A_ridge)

    ax2 = axes[1]
    ax2.semilogy(range(1, n+1), np.sort(eigenvalues_orig)[::-1], 'b.-',
                 markersize=5, alpha=0.5, label='Original')
    ax2.semilogy(range(1, n+1), np.sort(eigenvalues_ridge)[::-1], 'g.-',
                 markersize=5, label=f'Ridge (λ={lambda_reg})')
    ax2.set_xlabel('Index')
    ax2.set_ylabel('Eigenvalue (log scale)')
    ax2.set_title(f'After Ridge Regularization\nMin = {eigenvalues_ridge.min():.2e}\n'
                  f'Condition = {eigenvalues_ridge.max()/eigenvalues_ridge.min():.2e}')
    ax2.legend()
    ax2.grid(True, alpha=0.3)

    # Effect of regularization strength
    ax3 = axes[2]
    lambdas = np.logspace(-4, 0, 50)
    conditions = []
    min_eigs = []

    for lam in lambdas:
        A_reg = A + lam * np.eye(n)
        eigs = np.linalg.eigvalsh(A_reg)
        conditions.append(eigs.max() / eigs.min())
        min_eigs.append(eigs.min())

    cond_line, = ax3.loglog(lambdas, conditions, 'b-', linewidth=2, label='Condition Number')
    ax3.set_xlabel('Regularization λ')
    ax3.set_ylabel('Condition Number', color='blue')
    ax3.tick_params(axis='y', labelcolor='blue')

    # Second y-axis sharing the same x-axis for the smallest eigenvalue
    ax3_twin = ax3.twinx()
    eig_line, = ax3_twin.loglog(lambdas, min_eigs, 'r-', linewidth=2, label='Min Eigenvalue')
    ax3_twin.set_ylabel('Min Eigenvalue', color='red')
    ax3_twin.tick_params(axis='y', labelcolor='red')

    # One legend for both curves; it is attached to the twin axes because
    # they are drawn on top of ax3.
    ax3_twin.legend(handles=[cond_line, eig_line], loc='best')

    ax3.set_title('Effect of Regularization Strength')
    ax3.grid(True, alpha=0.3)

    plt.suptitle('Regularization: A + λI\n'
                 'Shifts all eigenvalues by λ, improving conditioning', fontsize=12, y=1.02)
    plt.tight_layout()
    plt.show()

# Run the demo: shows the three-panel regularization figure.
demonstrate_regularization()

## 8. Mahalanobis Distance

In [None]:
def visualize_mahalanobis_distance():
    """
    Contrast Euclidean and Mahalanobis distance on correlated 2D data.

    Draws 300 zero-mean samples with covariance [[2, 1.5], [1.5, 2]]
    (global RNG re-seeded with 42) and four probe points. The left panel
    overlays Euclidean circles of radius 1-3; the right panel overlays the
    Mahalanobis ellipses d_M = 1-3. Every probe is annotated with its
    distance from the origin in the panel's metric.
    """
    np.random.seed(42)

    # Correlated cloud: standard-normal draws pushed through the Cholesky factor.
    cov = np.array([[2, 1.5], [1.5, 2]])
    chol = np.linalg.cholesky(cov)
    cloud = np.random.randn(300, 2) @ chol.T

    # The covariance has its major axis along (1, 1) and its minor axis
    # along (1, -1).
    probes = np.array([
        [2, 0],       # On the x₁ axis (between the principal axes)
        [0, 2],       # On the x₂ axis (between the principal axes)
        [1.5, 1.5],   # Along the major axis
        [-1.5, -1.5]  # Along the major axis, opposite side
    ])

    def mark_probes(ax, tag, distance):
        # Star each probe and label it with `tag=<distance from origin>`.
        for probe in probes:
            ax.scatter([probe[0]], [probe[1]], s=100, marker='*', c='green',
                       edgecolors='black', linewidths=1)
            ax.annotate(f'{tag}={distance(probe):.2f}', xy=probe,
                        xytext=(probe[0]+0.3, probe[1]+0.3),
                        fontsize=10, fontweight='bold')

    fig, (euclid_ax, mahal_ax) = plt.subplots(1, 2, figsize=(14, 6))

    # Left panel: Euclidean metric, iso-distance curves are circles.
    euclid_ax.scatter(cloud[:, 0], cloud[:, 1], alpha=0.3, s=20)
    for radius in (1, 2, 3):
        euclid_ax.add_patch(
            plt.Circle((0, 0), radius, fill=False, color='red',
                       linewidth=2, linestyle='--')
        )
    mark_probes(euclid_ax, 'd_E', np.linalg.norm)

    # Right panel: Mahalanobis metric, iso-distance curves are ellipses
    # obtained by stretching a unit circle by d·sqrt(eigenvalue) along each
    # eigenvector of the covariance.
    mahal_ax.scatter(cloud[:, 0], cloud[:, 1], alpha=0.3, s=20)
    axis_vars, axis_dirs = np.linalg.eigh(cov)
    angles = np.linspace(0, 2*np.pi, 100)
    for level in (1, 2, 3):
        unit_circle = np.array([np.cos(angles), np.sin(angles)])
        outline = axis_dirs @ (np.diag(level * np.sqrt(axis_vars)) @ unit_circle)
        mahal_ax.plot(outline[0], outline[1], 'r-', linewidth=2)

    precision = np.linalg.inv(cov)
    mark_probes(mahal_ax, 'd_M', lambda p: np.sqrt(p @ precision @ p))

    # Identical framing for both panels so they can be compared directly.
    panel_titles = (
        (euclid_ax, 'Euclidean Distance\n(Ignores correlation structure)'),
        (mahal_ax, 'Mahalanobis Distance\n(Accounts for correlation)'),
    )
    for ax, heading in panel_titles:
        ax.set_xlim(-5, 5)
        ax.set_ylim(-5, 5)
        ax.set_aspect('equal')
        ax.set_title(heading)
        ax.set_xlabel('x₁')
        ax.set_ylabel('x₂')
        ax.grid(True, alpha=0.3)

    plt.suptitle('Mahalanobis vs Euclidean Distance\n'
                 'd_M(x) = √(xᵀΣ⁻¹x) — uses covariance structure', fontsize=12, y=1.02)
    plt.tight_layout()
    plt.show()

    print("Note: Points equidistant in Euclidean space have different Mahalanobis distances.")
    print("Points along the major axis of the ellipse (high variance direction) have smaller d_M.")

# Run the demo: shows the two-panel distance comparison and prints two
# closing notes.
visualize_mahalanobis_distance()

## 9. Summary

In [None]:
# Print a plain-text recap of the concepts covered in this notebook.
print("""
KEY CONCEPTS: POSITIVE DEFINITE MATRICES
=========================================

1. DEFINITION AND TESTS
   - x^T A x > 0 for all x ≠ 0
   - All eigenvalues > 0
   - All leading principal minors > 0 (Sylvester)
   - Cholesky decomposition exists (A = LL^T)

2. GEOMETRIC MEANING
   - Quadratic form x^T A x is bowl-shaped
   - Level sets are ellipsoids
   - Unique minimum at origin

3. CHOLESKY DECOMPOSITION
   - A = L L^T (L lower triangular, positive diagonal)
   - "Square root" of a matrix
   - 2× faster than LU, no pivoting needed
   - Use for: solving systems, sampling Gaussians

4. ML APPLICATIONS
   - Covariance matrices: always PSD (Σ = X^T X / n)
   - Optimization: PD Hessian → unique minimum
   - Gaussian sampling: x = μ + L z where z ~ N(0,I)
   - Ridge regression: X^T X + λI is always PD

5. CONDITION NUMBER
   - κ(A) = λ_max / λ_min
   - Large κ → ill-conditioned → slow optimization
   - Regularization (A + λI) improves conditioning

6. MAHALANOBIS DISTANCE
   - d_M(x) = √(x^T Σ^{-1} x)
   - Accounts for covariance structure
   - Standard measure for Gaussian distributions
""")