# Tutorial 07: Orthogonality and Projections

Interactive visualizations for understanding orthogonality in ML.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# The 'seaborn-v0_8-*' style names were introduced in Matplotlib 3.6; older
# releases ship the same sheet as 'seaborn-whitegrid'. Use whichever one this
# installation provides and keep Matplotlib's default style if neither exists.
for _style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break

## 1. Orthogonal Vectors

In [None]:
def visualize_orthogonal_vectors():
    """
    Compare orthogonal and non-orthogonal vector pairs side by side.

    Three fixed 2-D pairs are drawn, one per panel, as unit-length arrows.
    Each panel title reports the dot product of the original
    (un-normalized) pair and the angle between them; a short green arc
    near the origin marks that angle.
    """
    labelled_pairs = (
        ('Orthogonal', [[1, 0], [0, 1]]),
        ('Orthogonal (rotated)', [[1, 1], [1, -1]]),
        ('Not orthogonal', [[1, 0], [1, 1]]),
    )
    # Keyword arguments shared by every arrow: data coordinates, no rescaling.
    arrow_kwargs = dict(angles='xy', scale_units='xy', scale=1, width=0.03)
    arc_radius = 0.3

    fig, axes = plt.subplots(1, 3, figsize=(15, 5))

    for panel, (title, pair) in zip(axes, labelled_pairs):
        first, second = (np.array(vec) for vec in pair)
        first_len = np.linalg.norm(first)
        second_len = np.linalg.norm(second)

        # Arrows are drawn at unit length so every panel shares one scale.
        first_hat = first / first_len
        second_hat = second / second_len

        panel.quiver(0, 0, first_hat[0], first_hat[1],
                     color='blue', label='v₁', **arrow_kwargs)
        panel.quiver(0, 0, second_hat[0], second_hat[1],
                     color='red', label='v₂', **arrow_kwargs)

        # Dot product of the raw vectors; the cosine is clipped so rounding
        # error cannot push it outside the domain of arccos.
        inner = np.dot(first, second)
        cosine = inner / (first_len * second_len)
        degrees_between = np.degrees(np.arccos(np.clip(cosine, -1, 1)))

        panel.set_xlim(-1.5, 1.5)
        panel.set_ylim(-1.5, 1.5)
        panel.set_aspect('equal')
        panel.axhline(y=0, color='k', linewidth=0.5)
        panel.axvline(x=0, color='k', linewidth=0.5)
        panel.set_title(f'{title}\nv₁·v₂ = {inner:.2f}, angle = {degrees_between:.1f}°')
        panel.legend()
        panel.grid(True, alpha=0.3)

        # Angle arc, skipped when the two vectors are (almost) parallel.
        if degrees_between > 1:
            start = np.arctan2(first_hat[1], first_hat[0])
            stop = np.arctan2(second_hat[1], second_hat[0])
            sweep = np.linspace(start, stop, 20)
            panel.plot(arc_radius * np.cos(sweep), arc_radius * np.sin(sweep),
                       'g-', linewidth=2)

    plt.tight_layout()
    plt.show()

visualize_orthogonal_vectors()

## 2. Orthogonal Matrices: Rotations and Reflections

In [None]:
def visualize_orthogonal_matrices():
    """
    Apply the identity, a 45° rotation and a reflection to the unit square.

    Each panel draws the transformed outline, shades its interior, marks one
    vertex in red so a change of orientation is visible, and prints det(Q)
    in the title.
    """
    # Closed outline of the unit square: row 0 holds x, row 1 holds y, and
    # the last column repeats the first vertex to close the path.
    unit_square = np.array([[0, 1, 1, 0, 0], [0, 0, 1, 1, 0]])

    angle = np.pi / 4
    rotation = np.array([[np.cos(angle), -np.sin(angle)],
                         [np.sin(angle), np.cos(angle)]])
    # Flips the sign of x, i.e. mirrors across the y-axis.
    mirror = np.array([[-1, 0], [0, 1]])

    panels = (
        ('Original', np.eye(2)),
        ('Rotation (det=+1)', rotation),
        ('Reflection (det=-1)', mirror),
    )

    fig, axes = plt.subplots(1, 3, figsize=(15, 5))

    for ax, (label, Q) in zip(axes, panels):
        xs, ys = Q @ unit_square

        ax.plot(xs, ys, 'b-', linewidth=2)
        # The duplicated closing vertex is dropped for the filled polygon.
        ax.fill(xs[:-1], ys[:-1], alpha=0.3)

        # Image of the original vertex (1, 0), highlighted to show orientation.
        ax.plot(xs[1], ys[1], 'ro', markersize=10)

        ax.set_xlim(-2, 2)
        ax.set_ylim(-1, 2)
        ax.set_aspect('equal')
        ax.axhline(y=0, color='k', linewidth=0.5)
        ax.axvline(x=0, color='k', linewidth=0.5)
        ax.set_title(f'{label}\ndet(Q) = {np.linalg.det(Q):.0f}')
        ax.grid(True, alpha=0.3)

    plt.suptitle('Orthogonal Matrices Preserve Shape and Size', fontsize=14)
    plt.tight_layout()
    plt.show()

visualize_orthogonal_matrices()

## 3. Projection onto a Line

In [None]:
def visualize_projection_onto_line(b, a):
    """
    Visualize projection of b onto the line spanned by a.

    Draws a, b, the projection of b onto span(a) and the residual
    b - proj, together with a right-angle marker at the foot of the
    perpendicular. The title reports proj · (b - proj), which should be
    numerically zero.

    Parameters
    ----------
    b : array-like of two numbers
        Vector being projected.
    a : array-like of two numbers
        Non-zero vector spanning the target line.

    Raises
    ------
    ValueError
        If a is the zero vector: it spans no line and the projection
        formula would divide by zero.
    """
    b, a = np.array(b), np.array(a)

    # Checked before any figure is created, so a bad call leaves no empty
    # figure behind and never plots NaNs.
    if not np.any(a):
        raise ValueError("a must be a non-zero vector to span a line")

    a_norm = np.linalg.norm(a)

    # Projection formula: proj = (a^T b / a^T a) * a
    coeff = np.dot(a, b) / np.dot(a, a)
    proj = coeff * a
    perp = b - proj
    proj_norm = np.linalg.norm(proj)
    perp_norm = np.linalg.norm(perp)

    fig, ax = plt.subplots(figsize=(10, 8))

    # Draw the line spanned by a (unit direction times t, stretched by 3)
    t = np.linspace(-1, 2, 100)
    line = np.outer(a / a_norm, t)
    ax.plot(line[0] * 3, line[1] * 3, 'k--', alpha=0.3, linewidth=2, label='Line span(a)')

    # Draw vectors; the residual starts at the tip of the projection
    arrow = dict(angles='xy', scale_units='xy', scale=1, width=0.02)
    ax.quiver(0, 0, a[0], a[1], color='blue', label=f'a = {a}', **arrow)
    ax.quiver(0, 0, b[0], b[1], color='green', label=f'b = {b}', **arrow)
    ax.quiver(0, 0, proj[0], proj[1], color='red',
              label=f'proj = ({coeff:.2f})a', **arrow)
    ax.quiver(proj[0], proj[1], perp[0], perp[1], color='orange',
              label='b - proj', **arrow)

    # Draw right angle marker (skipped when either leg is too short to see)
    if proj_norm > 0.1 and perp_norm > 0.1:
        scale = 0.15 * min(proj_norm, perp_norm)
        a_unit = a / a_norm * scale
        perp_unit = perp / perp_norm * scale
        # Two sides of a small square sitting on the line at the projection.
        corner = proj - a_unit + perp_unit
        ax.plot([proj[0] - a_unit[0], corner[0]],
                [proj[1] - a_unit[1], corner[1]], 'k-', linewidth=1.5)
        ax.plot([corner[0], proj[0] + perp_unit[0]],
                [corner[1], proj[1] + perp_unit[1]], 'k-', linewidth=1.5)

    # Dashed line from b to proj
    ax.plot([b[0], proj[0]], [b[1], proj[1]], 'g--', alpha=0.5)

    # Symmetric limits that fit both input vectors, never narrower than ±1.
    max_val = max(np.abs([*a, *b]).max() * 1.3, 1)
    ax.set_xlim(-max_val, max_val)
    ax.set_ylim(-max_val, max_val)
    ax.set_aspect('equal')
    ax.axhline(y=0, color='k', linewidth=0.5)
    ax.axvline(x=0, color='k', linewidth=0.5)
    ax.legend(loc='upper left')
    ax.set_title(f'Projection onto a Line\nproj · (b-proj) = {np.dot(proj, perp):.6f} (should be ≈0)')
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

# Examples
visualize_projection_onto_line([3, 4], [1, 0])  # Onto x-axis
visualize_projection_onto_line([3, 4], [1, 1])  # Onto diagonal

## 4. Projection onto a Plane (3D)

In [None]:
def visualize_projection_onto_plane():
    """
    Project a fixed 3-D vector onto the xy-plane and display the projector.

    Left panel: the plane, the vector b, its projection and the residual
    b - proj. Right panel: the projection matrix P = A(AᵀA)⁻¹Aᵀ printed as
    text, followed by idempotence, symmetry and eigenvalue checks.
    """
    # Columns of `basis` span the target plane (here the xy-plane).
    basis = np.column_stack([np.array([1, 0, 0]), np.array([0, 1, 0])])
    b = np.array([2, 3, 4])

    # P = A (A^T A)^{-1} A^T
    gram_inverse = np.linalg.inv(basis.T @ basis)
    P = basis @ gram_inverse @ basis.T
    shadow = P @ b
    residual = b - shadow

    fig = plt.figure(figsize=(12, 5))
    scene = fig.add_subplot(121, projection='3d')

    # The plane itself: a flat z = 0 sheet over [-1, 5] x [-1, 5].
    ticks = np.linspace(-1, 5, 10)
    grid_x, grid_y = np.meshgrid(ticks, ticks)
    scene.plot_surface(grid_x, grid_y, np.zeros_like(grid_x), alpha=0.3, color='blue')

    # b and its projection start at the origin; the residual starts at the
    # tip of the projection.
    scene.quiver(0, 0, 0, b[0], b[1], b[2], color='green', linewidth=2, label='b')
    scene.quiver(0, 0, 0, shadow[0], shadow[1], shadow[2],
                 color='red', linewidth=2, label='proj')
    scene.quiver(shadow[0], shadow[1], shadow[2],
                 residual[0], residual[1], residual[2],
                 color='orange', linewidth=2, label='b - proj')

    # Dashed connector between the tip of b and the tip of its projection.
    scene.plot([b[0], shadow[0]], [b[1], shadow[1]], [b[2], shadow[2]],
               'g--', alpha=0.5)

    scene.set_xlabel('X')
    scene.set_ylabel('Y')
    scene.set_zlabel('Z')
    scene.set_title(f'Projection onto xy-plane\nb = {b}, proj = {shadow}')
    scene.legend()

    # Text-only panel: (vertical position, text, extra text() keywords).
    notes = fig.add_subplot(122)
    rows = (
        (0.8, 'Projection Matrix P = A(AᵀA)⁻¹Aᵀ', dict(fontsize=12, fontweight='bold')),
        (0.6, f'P = \n{P}', dict(fontsize=10, family='monospace')),
        (0.35, 'Properties:', dict(fontsize=11)),
        (0.25, f'P² = P? {np.allclose(P @ P, P)}', dict(fontsize=10)),
        (0.15, f'Pᵀ = P? {np.allclose(P.T, P)}', dict(fontsize=10)),
        (0.05, f'Eigenvalues: {np.linalg.eigvals(P).round(2)}', dict(fontsize=10)),
    )
    for height, message, extra in rows:
        notes.text(0.5, height, message, ha='center',
                   transform=notes.transAxes, **extra)
    notes.axis('off')

    plt.tight_layout()
    plt.show()

visualize_projection_onto_plane()

## 5. Gram-Schmidt Process

In [None]:
def visualize_gram_schmidt():
    """
    Step-by-step visualization of Gram-Schmidt orthonormalization.

    Panel 1 shows the two starting vectors, panel 2 the projection of v₂
    onto q₁ together with the remainder u₂ = v₂ - (q₁·v₂)q₁, and panel 3
    the resulting orthonormal pair q₁, q₂ against the unit circle.
    """
    # Arrow styling shared by every quiver call (data coordinates, no rescale).
    arrow = dict(angles='xy', scale_units='xy', scale=1, width=0.03)

    def finish_panel(ax, xlim, ylim, title, legend_fontsize=None):
        """Apply the limits, title, legend, grid and axis lines common to all panels."""
        ax.set_xlim(*xlim)
        ax.set_ylim(*ylim)
        ax.set_aspect('equal')
        ax.set_title(title)
        ax.legend(fontsize=legend_fontsize)
        ax.grid(True, alpha=0.3)
        ax.axhline(y=0, color='k', linewidth=0.5)
        ax.axvline(x=0, color='k', linewidth=0.5)

    # Original vectors (not orthogonal)
    v1 = np.array([2, 1])
    v2 = np.array([1, 2])

    fig, axes = plt.subplots(1, 3, figsize=(15, 5))

    # Step 1: Original vectors
    ax1 = axes[0]
    ax1.quiver(0, 0, v1[0], v1[1], color='blue', label='v₁', **arrow)
    ax1.quiver(0, 0, v2[0], v2[1], color='red', label='v₂', **arrow)
    dot = np.dot(v1, v2)
    finish_panel(ax1, (-0.5, 3), (-0.5, 3),
                 f'Step 1: Original vectors\nv₁·v₂ = {dot} ≠ 0 (not orthogonal)')

    # Step 2: Normalize v1, subtract projection from v2
    q1 = v1 / np.linalg.norm(v1)
    proj = np.dot(q1, v2) * q1
    u2 = v2 - proj

    ax2 = axes[1]
    ax2.quiver(0, 0, q1[0], q1[1], color='blue', label='q₁ = v₁/||v₁||', **arrow)
    ax2.quiver(0, 0, v2[0], v2[1], color='red', alpha=0.3,
               label='v₂ (original)', **arrow)
    ax2.quiver(0, 0, proj[0], proj[1], color='green',
               label='proj of v₂ onto q₁', **arrow)
    ax2.quiver(0, 0, u2[0], u2[1], color='orange',
               label='u₂ = v₂ - proj', **arrow)
    # u₂ = [-0.6, 1.2] for these inputs, so the left limit has to reach past
    # -0.6; a limit of -0.5 would cut the tip of the arrow off.
    finish_panel(ax2, (-1, 3), (-1, 2.5),
                 'Step 2: Subtract projection\nu₂ = v₂ - (q₁·v₂)q₁',
                 legend_fontsize=8)

    # Step 3: Final orthonormal basis
    q2 = u2 / np.linalg.norm(u2)

    ax3 = axes[2]
    ax3.quiver(0, 0, q1[0], q1[1], color='blue', label='q₁', **arrow)
    ax3.quiver(0, 0, q2[0], q2[1], color='orange', label='q₂ = u₂/||u₂||', **arrow)

    # Draw unit circle: both q vectors should end exactly on it
    theta = np.linspace(0, 2*np.pi, 100)
    ax3.plot(np.cos(theta), np.sin(theta), 'k--', alpha=0.3)

    dot_final = np.dot(q1, q2)
    finish_panel(ax3, (-1.5, 1.5), (-1.5, 1.5),
                 f'Step 3: Orthonormal basis\nq₁·q₂ = {dot_final:.2e} ≈ 0')

    plt.suptitle('Gram-Schmidt Orthonormalization', fontsize=14)
    plt.tight_layout()
    plt.show()

visualize_gram_schmidt()

## 6. QR Decomposition

In [None]:
def demonstrate_qr():
    """
    Factor a fixed 3x3 matrix as A = QR and inspect both factors.

    Prints A, Q and R along with the checks QᵀQ and QR, then draws the
    columns of Q as 3-D arrows next to a heat map of |R| annotated with
    the signed entries of R.
    """
    # Seeds NumPy's global RNG; no random numbers are drawn in this function.
    np.random.seed(42)

    A = np.array([[1, 1, 0], [1, 0, 1], [0, 1, 1]])
    Q, R = np.linalg.qr(A)

    separator = "=" * 50
    print("QR Decomposition: A = QR")
    print(separator)
    print(f"\nA =\n{A}")
    print(f"\nQ (orthonormal columns) =\n{Q.round(4)}")
    print(f"\nR (upper triangular) =\n{R.round(4)}")
    print(f"\nVerification:")
    gram = Q.T @ Q
    print(f"QᵀQ =\n{gram.round(10)}")
    rebuilt = Q @ R
    print(f"\nQR =\n{rebuilt.round(10)}")

    fig = plt.figure(figsize=(12, 5))

    # Left: one arrow from the origin per column of Q.
    vectors_ax = fig.add_subplot(121, projection='3d')
    for idx, colour in enumerate(['blue', 'red', 'green']):
        column = Q[:, idx]
        vectors_ax.quiver(0, 0, 0, column[0], column[1], column[2],
                          color=colour, linewidth=2, label=f'q{idx+1}')
    vectors_ax.set_xlabel('X')
    vectors_ax.set_ylabel('Y')
    vectors_ax.set_zlabel('Z')
    vectors_ax.set_title('Columns of Q (orthonormal)')
    vectors_ax.legend()

    # Right: magnitudes of R as colours, signed values as cell labels.
    heat_ax = fig.add_subplot(122)
    image = heat_ax.imshow(np.abs(R), cmap='Blues')
    plt.colorbar(image, ax=heat_ax)
    for row in range(3):
        for col in range(3):
            # text() takes (x, y), i.e. (column, row).
            heat_ax.text(col, row, f'{R[row,col]:.2f}',
                         ha='center', va='center', fontsize=12)
    heat_ax.set_title('R (upper triangular)')
    heat_ax.set_xticks(range(3))
    heat_ax.set_yticks(range(3))

    plt.tight_layout()
    plt.show()

demonstrate_qr()

## 7. Least Squares via QR

In [None]:
def least_squares_comparison():
    """
    Compare least squares methods: Normal equations vs QR.

    Fits a straight line to 20 noisy samples of y = 2 + 0.5x in three ways
    (normal equations, QR factorization, np.linalg.lstsq), plots the first
    two fits over the data, plots Aᵀ(y - Aθ) to show that the residual is
    orthogonal to the columns of A, and prints all three coefficient
    vectors plus the largest gap between the normal-equation and QR
    answers.
    """
    np.random.seed(42)

    # Generate data
    x = np.linspace(0, 5, 20)
    y_true = 2 + 0.5 * x
    y = y_true + 0.5 * np.random.randn(len(x))

    # Design matrix: a column of ones (intercept) next to x (slope)
    A = np.column_stack([np.ones_like(x), x])

    # Method 1: Normal equations (AᵀA)θ = Aᵀy, solved as a linear system.
    # Solving the system avoids forming an explicit inverse, which costs
    # more and loses accuracy for no benefit.
    theta_normal = np.linalg.solve(A.T @ A, A.T @ y)

    # Method 2: QR decomposition, A = QR turns the problem into Rθ = Qᵀy
    Q, R = np.linalg.qr(A)
    theta_qr = np.linalg.solve(R, Q.T @ y)

    # Method 3: numpy lstsq (only the solution vector is needed)
    theta_lstsq = np.linalg.lstsq(A, y, rcond=None)[0]

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Plot data and fits
    ax1 = axes[0]
    ax1.scatter(x, y, s=50, label='Data')
    x_line = np.linspace(0, 5, 100)
    ax1.plot(x_line, theta_normal[0] + theta_normal[1] * x_line, 'r-',
            linewidth=2, label=f'Normal eq: y = {theta_normal[0]:.3f} + {theta_normal[1]:.3f}x')
    ax1.plot(x_line, theta_qr[0] + theta_qr[1] * x_line, 'g--',
            linewidth=2, label=f'QR: y = {theta_qr[0]:.3f} + {theta_qr[1]:.3f}x')
    ax1.set_xlabel('x')
    ax1.set_ylabel('y')
    ax1.set_title('Least Squares Fit (both methods give same result)')
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # Show orthogonality of residual: both entries of Aᵀr should be ~0
    residual = y - A @ theta_qr
    ax2 = axes[1]
    ax2.bar(['Aᵀr[0] (const)', 'Aᵀr[1] (x)'], A.T @ residual)
    ax2.axhline(y=0, color='r', linestyle='--')
    ax2.set_ylabel('Value')
    ax2.set_title('Residual is Orthogonal to Column Space\nAᵀ(y - Aθ) ≈ 0')
    ax2.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    print(f"Normal equations: θ = {theta_normal}")
    print(f"QR decomposition: θ = {theta_qr}")
    print(f"NumPy lstsq:      θ = {theta_lstsq}")
    print(f"\nMax difference: {np.max(np.abs(theta_normal - theta_qr)):.2e}")

least_squares_comparison()

## 8. Ill-Conditioned Systems: QR vs Normal Equations

In [None]:
def ill_conditioned_comparison():
    """
    Show when normal equations fail but QR succeeds.

    Builds a 3x2 matrix whose second column differs from the first by
    multiples of 1e-8, prints the condition numbers of A and AᵀA, then
    solves the same least-squares problem via the normal equations, via QR
    and via np.linalg.lstsq, printing each solution and its residual norm.
    A singular normal-equation system is reported instead of raised.
    """
    # Create ill-conditioned matrix (nearly collinear columns)
    epsilon = 1e-8
    A = np.array([[1, 1], [1, 1 + epsilon], [1, 1 + 2*epsilon]])
    b = np.array([2, 2.1, 2.2])

    print("Ill-Conditioned Least Squares")
    print("="*50)
    print(f"\nA =\n{A}")
    print(f"\nCondition number of A: {np.linalg.cond(A):.2e}")
    print(f"Condition number of AᵀA: {np.linalg.cond(A.T @ A):.2e}")
    print("(Note: condition number squared!)")

    # Normal equations (may be inaccurate). Only the solve can fail, and only
    # with LinAlgError when AᵀA is numerically singular, so the try block is
    # limited to that call and any other error still surfaces.
    ATA = A.T @ A
    ATb = A.T @ b
    try:
        x_normal = np.linalg.solve(ATA, ATb)
    except np.linalg.LinAlgError as e:
        print(f"\nNormal equations failed: {e}")
    else:
        print(f"\nNormal equations solution: {x_normal}")
        print(f"Residual norm: {np.linalg.norm(A @ x_normal - b):.6f}")

    # QR (more stable): works on A directly, so the conditioning is not squared
    Q, R = np.linalg.qr(A)
    x_qr = np.linalg.solve(R, Q.T @ b)
    print(f"\nQR solution: {x_qr}")
    print(f"Residual norm: {np.linalg.norm(A @ x_qr - b):.6f}")

    # NumPy lstsq (uses SVD, most stable)
    x_lstsq = np.linalg.lstsq(A, b, rcond=None)[0]
    print(f"\nNumPy lstsq (SVD): {x_lstsq}")
    print(f"Residual norm: {np.linalg.norm(A @ x_lstsq - b):.6f}")

ill_conditioned_comparison()

## 9. Orthogonal Initialization for Neural Networks

In [None]:
def orthogonal_init_demo(n=100, depth=50):
    """
    Show why orthogonal initialization helps deep networks.

    Multiplies `depth` freshly drawn n-by-n weight matrices together, once
    with scaled Gaussian entries and once with orthogonal factors, and
    plots the largest singular value, the smallest singular value and
    their ratio for the running product after each layer.

    Parameters
    ----------
    n : int, default 100
        Layer size (every weight matrix is n x n).
    depth : int, default 50
        Number of layers multiplied together.

    Raises
    ------
    ValueError
        If n or depth is smaller than 1.
    """
    if n < 1 or depth < 1:
        raise ValueError("n and depth must both be at least 1")

    np.random.seed(42)

    def random_init(n):
        # Gaussian entries scaled by 1/sqrt(n).
        return np.random.randn(n, n) / np.sqrt(n)

    def orthogonal_init(n):
        # The Q factor of a Gaussian matrix has orthonormal columns.
        A = np.random.randn(n, n)
        Q, _ = np.linalg.qr(A)
        return Q

    # Track singular values of product of matrices
    random_singular = []
    ortho_singular = []

    W_random = np.eye(n)
    W_ortho = np.eye(n)

    for _ in range(depth):
        W_random = random_init(n) @ W_random
        W_ortho = orthogonal_init(n) @ W_ortho

        # svd returns singular values in descending order, so index 0 is the
        # largest and index -1 the smallest.
        random_singular.append(np.linalg.svd(W_random, compute_uv=False))
        ortho_singular.append(np.linalg.svd(W_ortho, compute_uv=False))

    # Entry k of the histories describes a product of k + 1 matrices, so the
    # "Number of layers" axis must start at 1 rather than at the list index 0.
    layer_counts = list(range(1, depth + 1))

    # (y-axis label, panel title, statistic taken from one singular-value vector)
    panels = [
        ('Largest singular value', 'Largest Singular Value of W₁W₂...Wₖ',
         lambda s: s[0]),
        ('Smallest singular value', 'Smallest Singular Value',
         lambda s: s[-1]),
        ('Condition number', 'Condition Number (σ_max/σ_min)',
         lambda s: s[0] / s[-1]),
    ]

    fig, axes = plt.subplots(1, 3, figsize=(15, 5))

    for ax, (ylabel, title, statistic) in zip(axes, panels):
        ax.semilogy(layer_counts, [statistic(s) for s in random_singular],
                    'b-', label='Random init')
        ax.semilogy(layer_counts, [statistic(s) for s in ortho_singular],
                    'r-', label='Orthogonal init')
        ax.set_xlabel('Number of layers')
        ax.set_ylabel(ylabel)
        ax.set_title(title)
        ax.legend()
        ax.grid(True, alpha=0.3)

    plt.suptitle('Orthogonal Init Preserves Signal Through Deep Networks', fontsize=14)
    plt.tight_layout()
    plt.show()

    print("\nKey Insight:")
    print("Orthogonal matrices preserve norms: ||Qx|| = ||x||")
    print("So signal magnitude stays constant through layers!")

orthogonal_init_demo()

## 10. Summary

In [None]:
# Plain-text recap of the tutorial. Section 1 is worded so that it stays true
# in the edge case of the zero vector, which is orthogonal to everything but
# never part of a linearly independent set.
print("""
KEY CONCEPTS SUMMARY
====================

1. ORTHOGONAL VECTORS
   - u · v = 0 means u and v are perpendicular directions
   - Orthogonal sets of nonzero vectors are always linearly independent

2. ORTHOGONAL MATRICES
   - Q^T Q = I (columns are orthonormal)
   - Preserve norms: ||Qx|| = ||x||
   - Represent rotations (det=+1) or reflections (det=-1)

3. PROJECTION
   - Onto line: proj = (a^T b / a^T a) * a
   - Onto subspace: proj = A(A^T A)^{-1} A^T b
   - Residual is orthogonal to subspace

4. GRAM-SCHMIDT
   - Orthogonalizes a set of vectors
   - q_k = (v_k - sum of projections) / norm
   - Modified version is more numerically stable

5. QR DECOMPOSITION
   - A = QR where Q orthonormal, R upper triangular
   - Gram-Schmidt in matrix form
   - More stable than normal equations for least squares

6. ML APPLICATIONS
   - Least squares: More stable via QR than normal equations
   - Neural networks: Orthogonal init preserves signal magnitude
   - Regularization: Soft orthogonality constraints
""")