# Tutorial 05: Why Logarithm is Fundamental in ML

This notebook demonstrates **experimentally** why the logarithm is not just a computational convenience but is mathematically fundamental to how neural networks learn.

**Central Question**: If we had perfect computers with infinite precision, would we still need log?

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from decimal import Decimal, getcontext
import torch
import torch.nn as nn
import torch.optim as optim

# Set high precision for Decimal experiments: all Decimal arithmetic in this
# context is carried out with 100 significant digits.
getcontext().prec = 100

# Plot styling for every figure in the notebook.
# NOTE(review): the 'seaborn-v0_8-*' style names presumably require a recent
# Matplotlib release — confirm against the installed version.
plt.style.use('seaborn-v0_8-whitegrid')
# IPython magic (not plain Python): render figures inline in the notebook.
%matplotlib inline

## Part 1: The Uniqueness of Logarithm

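**Theorem**: Up to a constant factor (equivalently, the choice of base), the logarithm is the ONLY continuous function on $(0, \infty)$ satisfying $f(xy) = f(x) + f(y)$; every continuous solution has the form $f(x) = c \log x$.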

Let's verify this numerically and see why it matters.

In [None]:
# Test various functions for the multiplicative-to-additive property
def test_additivity(f, name, x_vals):
    """Test if f(xy) = f(x) + f(y)"""
    errors = []
    for x in x_vals:
        for y in x_vals:
            if x > 0 and y > 0:  # Avoid domain issues
                lhs = f(x * y)
                rhs = f(x) + f(y)
                error = abs(lhs - rhs)
                errors.append(error)
    return np.mean(errors)

# Strictly positive sample points shared by every candidate function.
x_vals = np.linspace(0.1, 5, 20)

# Candidates keyed by the label shown in the results table.
functions = {
    'log(x)': np.log,
    'sqrt(x)': np.sqrt,
    'x^2': lambda v: v**2,
    '1/x': lambda v: 1/v,
    'x': lambda v: v,
    'exp(x)': np.exp,
}

print("Testing f(xy) = f(x) + f(y):")
print("-" * 40)
for name in functions:
    f = functions[name]
    error = test_additivity(f, name, x_vals)
    # A candidate passes only if its mean additivity gap is numerically zero.
    if error < 1e-10:
        status = "✓ SATISFIES"
    else:
        status = "✗ Fails"
    print(f"{name:12} | Mean error: {error:.2e} | {status}")

Of the candidates tested, **only the logarithm** converts multiplication into addition. This is the key property we need when combining probabilities.

## Part 2: The Gradient Scaling Problem

### The Core Issue: Gradients Scale with Likelihood

Let's see what happens to gradients when we DON'T use log.

In [None]:
def compute_gradients_comparison(n_samples, prob_per_sample):
    """
    Contrast the gradient of the raw likelihood with that of its logarithm.

    Toy model: each of the ``n_samples`` observations has the same
    probability ``p = prob_per_sample``, so ``L = p ** n``.

    Returns a dict with keys ``'L'``, ``'log_L'``, ``'grad_direct'`` (dL/dp),
    ``'grad_log'`` (d(log L)/dp) and ``'grad_ratio'`` (grad_direct / grad_log,
    or ``np.inf`` when grad_log is zero).
    """
    n, p = n_samples, prob_per_sample

    # Objective values: L = p^n and log L = n * log(p)
    likelihood = p ** n
    log_likelihood = n * np.log(p)

    # dL/dp = n * p^(n-1)
    direct = n * (p ** (n - 1))
    # d(log L)/dp = n / p
    via_log = n / p

    if via_log != 0:
        ratio = direct / via_log
    else:
        ratio = np.inf

    return {
        'L': likelihood,
        'log_L': log_likelihood,
        'grad_direct': direct,
        'grad_log': via_log,
        'grad_ratio': ratio,
    }

# Test with different number of samples
# Prints one table row per sample count, holding the per-sample probability
# fixed at 0.9, so the direct and log gradients can be compared side by side.
print("Gradient comparison (probability per sample = 0.9):")
print("=" * 80)
print(f"{'n_samples':>10} | {'L':>15} | {'grad_direct':>15} | {'grad_log':>12} | {'ratio':>12}")
print("-" * 80)

for n in [10, 100, 1000, 10000]:
    result = compute_gradients_comparison(n, 0.9)
    # Column widths mirror the header row above; scientific notation is used
    # for every column except grad_log.
    print(f"{n:>10} | {result['L']:>15.2e} | {result['grad_direct']:>15.2e} | {result['grad_log']:>12.2f} | {result['grad_ratio']:>12.2e}")

### Key Observation

- **Direct gradient** shrinks exponentially with more samples
- **Log gradient** stays O(n) — perfectly usable!
- This happens even with very confident predictions (p=0.9)

This is NOT a numerical precision issue — it's a **mathematical scaling problem**.

In [None]:
# Visualize the gradient scaling
n_samples_range = np.arange(1, 501)
prob = 0.9

# Closed-form gradients of the toy model L = p^n, one entry per sample count n.
grad_direct = [n * (prob ** (n-1)) for n in n_samples_range]
grad_log = [n / prob for n in n_samples_range]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Linear scale (log gradient)
axes[0].plot(n_samples_range, grad_log, 'b-', linewidth=2, label='∇ log L')
axes[0].set_xlabel('Number of samples', fontsize=12)
axes[0].set_ylabel('Gradient magnitude', fontsize=12)
axes[0].set_title('Log-Likelihood Gradient (Linear Scale)', fontsize=14)
axes[0].legend(fontsize=12)

# Log scale comparison
axes[1].semilogy(n_samples_range, grad_direct, 'r-', linewidth=2, label='∇ L (direct)')
axes[1].semilogy(n_samples_range, grad_log, 'b-', linewidth=2, label='∇ log L')
# The reference line has to exist before legend() is called: legend() only
# lists labelled artists already on the axes, so drawing the line afterwards
# would silently drop 'Float64 limit' from the legend.
axes[1].axhline(y=1e-300, color='gray', linestyle='--', alpha=0.5, label='Float64 limit')
axes[1].set_xlabel('Number of samples', fontsize=12)
axes[1].set_ylabel('Gradient magnitude (log scale)', fontsize=12)
axes[1].set_title('Gradient Comparison (Log Scale)', fontsize=14)
axes[1].legend(fontsize=12)

plt.tight_layout()
plt.show()

# Summarize the right-hand end of the plotted range using the same n and p
# that produced the curves, so the text cannot drift from the figure.
n_max = int(n_samples_range[-1])
direct_at_max = n_max * (prob ** (n_max - 1))
log_at_max = n_max / prob

print(f"\nAt n={n_max} samples with p={prob}:")
print(f"  Direct gradient: {direct_at_max:.2e}")
print(f"  Log gradient: {log_at_max:.2f}")
print(f"  Ratio: {direct_at_max / log_at_max:.2e}")

## Part 3: High-Precision Experiment

Let's use Python's `Decimal` for arbitrary precision to show the problem persists even with perfect arithmetic.

In [None]:
# Use 100 digits of precision
getcontext().prec = 100

def high_precision_gradient_comparison(n_samples, prob):
    """Evaluate L = p^n and both gradients using Decimal arithmetic.

    ``prob`` and ``n_samples`` are converted to Decimal through their string
    representation; powers use ``int(n_samples)`` as the exponent.

    Returns:
        (L, grad_direct, grad_log) as Decimal values, where
        grad_direct = n * p^(n-1) and grad_log = n / p.
    """
    p = Decimal(str(prob))
    n = Decimal(str(n_samples))
    exponent = int(n_samples)

    likelihood = p ** exponent
    direct = n * (p ** (exponent - 1))
    via_log = n / p

    return likelihood, direct, via_log

print("High-Precision Gradient Comparison (100 decimal digits):")
print("=" * 90)

for n in [100, 1000, 10000]:
    L, grad_d, grad_l = high_precision_gradient_comparison(n, 0.9)

    # Format the Decimal values directly. Converting to float first would
    # underflow to 0.0 for n = 10000 (0.9**10000 is roughly 3e-458, far below
    # the float64 range) and hide exactly the values this experiment is about.
    print(f"\nn = {n}:")
    print(f"  L = {L:.6e}")
    print(f"  ∇L (direct) = {grad_d:.6e}")
    print(f"  ∇log(L) = {grad_l:.2f}")

    # The key insight: gradient ratio
    if grad_l != 0:
        ratio = grad_d / grad_l
        print(f"  Ratio ∇L/∇log(L) = {ratio:.6e}")

### The Problem ISN'T Precision

Even with 100 decimal digits:
- Direct gradients still vanish exponentially
- The RELATIVE magnitude between good and bad models is astronomically different

**This breaks learning**, not because we can't represent the numbers, but because the gradient landscape is pathological.

## Part 4: Actual Training Comparison

Let's train a simple model using:
1. Cross-entropy loss (uses log)
2. Direct probability product (no log)

We'll use PyTorch's autograd for exact gradients.

In [None]:
# Create a simple binary classification problem
# Both RNGs are seeded; the data below is drawn with torch only.
np.random.seed(42)
torch.manual_seed(42)

# Generate linearly separable data
n_samples = 100
X = torch.randn(n_samples, 2)
# Label is 1.0 when x1 + x2 > 0, else 0.0, so the true boundary is x1 + x2 = 0.
y = (X[:, 0] + X[:, 1] > 0).float()

# Visualize
plt.figure(figsize=(8, 6))
# One scatter call per class so each gets its own colour and legend entry.
plt.scatter(X[y==0, 0], X[y==0, 1], c='blue', label='Class 0', alpha=0.6)
plt.scatter(X[y==1, 0], X[y==1, 1], c='red', label='Class 1', alpha=0.6)
plt.xlabel('$x_1$', fontsize=12)
plt.ylabel('$x_2$', fontsize=12)
plt.title('Binary Classification Dataset', fontsize=14)
plt.legend()
plt.show()

In [None]:
class SimpleLogisticRegression(nn.Module):
    """Two-feature logistic regression: sigmoid(w·x + b).

    Weights are drawn from a normal distribution with std 0.01 and the bias
    starts at exactly zero.
    """

    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(2, 1)
        # Initialize with small weights
        nn.init.normal_(self.linear.weight, std=0.01)
        nn.init.zeros_(self.linear.bias)

    def forward(self, x):
        """Return the predicted probability of class 1 for each row of ``x``.

        Only the trailing size-1 output dimension is removed, so a batch
        holding a single sample keeps shape ``(1,)`` instead of collapsing to
        a 0-d scalar (which a bare ``squeeze()`` would do).
        """
        return torch.sigmoid(self.linear(x)).squeeze(-1)

def train_with_log_loss(X, y, epochs=100, lr=0.1):
    """Fit a fresh SimpleLogisticRegression by minimizing cross-entropy.

    Runs ``epochs`` full-batch SGD steps with learning rate ``lr``.

    Returns:
        (model, losses, grad_norms): the trained model, the per-epoch loss
        (evaluated before that epoch's parameter update) and the per-epoch
        norm of the weight gradient (the bias gradient is not included).
    """
    model = SimpleLogisticRegression()
    optimizer = optim.SGD(model.parameters(), lr=lr)

    loss_history, grad_history = [], []

    for _ in range(epochs):
        optimizer.zero_grad()

        p = model(X)
        # Mean binary cross-entropy; the 1e-10 offset keeps log() away from 0.
        log_pos = torch.log(p + 1e-10)
        log_neg = torch.log(1 - p + 1e-10)
        loss = -torch.mean(y * log_pos + (1 - y) * log_neg)

        loss.backward()

        # Gradient magnitude is sampled after backward() and before the step.
        grad_history.append(model.linear.weight.grad.norm().item())

        optimizer.step()
        loss_history.append(loss.item())

    return model, loss_history, grad_history

def train_with_direct_likelihood(X, y, epochs=100, lr=1e10):
    """Fit a fresh SimpleLogisticRegression by maximizing the raw likelihood.

    The objective is the product over samples of the probability assigned to
    the true class (no logarithm); SGD minimizes its negative.

    Returns:
        (model, losses, grad_norms, likelihoods): the trained model, the
        per-epoch cross-entropy (tracked for comparison only, computed from
        the pre-update predictions), the per-epoch weight-gradient norm, and
        the per-epoch likelihood value.
    """
    model = SimpleLogisticRegression()
    optimizer = optim.SGD(model.parameters(), lr=lr)

    ce_history, grad_history, likelihood_history = [], [], []

    for _ in range(epochs):
        optimizer.zero_grad()

        p = model(X)

        # Probability the model assigns to each sample's true label.
        p_true = y * p + (1 - y) * (1 - p)
        likelihood = torch.prod(p_true)

        # SGD minimizes, so descend on the negated likelihood.
        (-likelihood).backward()

        grad_history.append(model.linear.weight.grad.norm().item())
        likelihood_history.append(likelihood.item())

        optimizer.step()

        # Cross-entropy of the same predictions, so both training runs can be
        # plotted on a common loss axis.
        with torch.no_grad():
            ce = -torch.mean(y * torch.log(p + 1e-10) + (1 - y) * torch.log(1 - p + 1e-10))
        ce_history.append(ce.item())

    return model, ce_history, grad_history, likelihood_history

# Train both variants on the same data for the same number of epochs.
# The direct-likelihood run is given a far larger learning rate (1e20 vs 1.0).
log_run = train_with_log_loss(X, y, epochs=200, lr=1.0)
model_log, losses_log, grads_log = log_run

direct_run = train_with_direct_likelihood(X, y, epochs=200, lr=1e20)
model_direct, losses_direct, grads_direct, likelihoods = direct_run

In [None]:
# Compare training
# 2x2 grid: loss curves, gradient norms, likelihood values; the fourth panel
# (axes[1, 1]) is left empty here for the decision-boundary plot.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Loss comparison
# Both curves are cross-entropy values, so they share one axis.
axes[0, 0].plot(losses_log, 'b-', linewidth=2, label='Log loss (cross-entropy)')
axes[0, 0].plot(losses_direct, 'r-', linewidth=2, label='Direct likelihood', alpha=0.7)
axes[0, 0].set_xlabel('Epoch', fontsize=12)
axes[0, 0].set_ylabel('Cross-Entropy Loss', fontsize=12)
axes[0, 0].set_title('Training Loss Comparison', fontsize=14)
axes[0, 0].legend(fontsize=11)
# Upper limit: 10% headroom above the larger of the two first-epoch losses.
axes[0, 0].set_ylim(0, max(losses_log[0], losses_direct[0]) * 1.1)

# Gradient norms
axes[0, 1].semilogy(grads_log, 'b-', linewidth=2, label='Log loss gradients')
axes[0, 1].semilogy(grads_direct, 'r-', linewidth=2, label='Direct likelihood gradients', alpha=0.7)
axes[0, 1].set_xlabel('Epoch', fontsize=12)
axes[0, 1].set_ylabel('Gradient Norm (log scale)', fontsize=12)
axes[0, 1].set_title('Gradient Magnitude Comparison', fontsize=14)
axes[0, 1].legend(fontsize=11)

# Likelihood values
axes[1, 0].semilogy(likelihoods, 'r-', linewidth=2)
axes[1, 0].set_xlabel('Epoch', fontsize=12)
axes[1, 0].set_ylabel('Likelihood (log scale)', fontsize=12)
axes[1, 0].set_title('Direct Likelihood Values', fontsize=14)
# NOTE(review): the likelihoods come from torch tensors, which are presumably
# float32 by default, so a 1e-300 "Float64" reference line may not match the
# dtype actually used — confirm.
axes[1, 0].axhline(y=1e-300, color='gray', linestyle='--', label='Float64 underflow')
axes[1, 0].legend(fontsize=11)
# Final decision boundaries
def plot_decision_boundary(model, ax, title):
    """Shade ``model``'s predicted probability over the data's bounding box.

    Draws filled probability contours, the 0.5 decision contour and the
    training points on ``ax``, then sets ``title``. Reads the module-level
    ``X`` and ``y``.
    """
    # Bounding box of the data, padded by one unit on every side.
    x_lo, x_hi = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_lo, y_hi = X[:, 1].min() - 1, X[:, 1].max() + 1

    # 100 x 100 evaluation grid.
    grid_x, grid_y = np.meshgrid(
        np.linspace(x_lo, x_hi, 100),
        np.linspace(y_lo, y_hi, 100),
    )
    grid_points = np.c_[grid_x.ravel(), grid_y.ravel()]

    with torch.no_grad():
        prob_map = model(torch.FloatTensor(grid_points)).numpy().reshape(grid_x.shape)

    ax.contourf(grid_x, grid_y, prob_map, levels=20, cmap='RdBu', alpha=0.6)
    ax.contour(grid_x, grid_y, prob_map, levels=[0.5], colors='black', linewidths=2)

    # Overlay the training points, class 0 first, then class 1.
    for label, color in ((0, 'blue'), (1, 'red')):
        members = y == label
        ax.scatter(X[members, 0], X[members, 1], c=color, edgecolors='white', s=50)

    ax.set_title(title, fontsize=14)

plot_decision_boundary(model_log, axes[1, 1], 'Log Loss - Decision Boundary')

plt.tight_layout()
plt.show()

# Final accuracies: fraction of training points whose thresholded prediction
# (probability > 0.5) equals the label.
with torch.no_grad():
    hits_log = (model_log(X) > 0.5) == y
    hits_direct = (model_direct(X) > 0.5) == y
acc_log = hits_log.float().mean().item()
acc_direct = hits_direct.float().mean().item()

print(f"\nFinal Accuracies:")
print(f"  Log loss model: {acc_log:.1%}")
print(f"  Direct likelihood model: {acc_direct:.1%}")

### Analysis

Notice:
1. **Direct likelihood gradients** are astronomically small (often underflow to 0)
2. **Log loss gradients** stay in a reasonable range throughout training
3. Even with a huge learning rate (1e20!), direct likelihood barely learns
4. The problem isn't precision — it's that gradients scale with likelihood magnitude

## Part 5: The Additivity Requirement

Independent samples MUST contribute additively to learning. Let's demonstrate why.

In [None]:
# Demonstrate that log makes contributions additive

# Three independent samples with probabilities
p1, p2, p3 = 0.8, 0.6, 0.9

# Direct likelihood
# Independence means the joint likelihood is the product of the three terms.
L = p1 * p2 * p3
print("Direct Likelihood (multiplicative):")
print(f"  L = p1 × p2 × p3 = {p1} × {p2} × {p3} = {L:.4f}")
print(f"  Contribution of sample 1: p1 = {p1}")
print(f"  Contribution of sample 2: p2 = {p2}")
print(f"  How do they combine? Multiplication (not additive!)\n")

# Log-likelihood
# The same quantity on the log scale: a sum with one term per sample.
log_L = np.log(p1) + np.log(p2) + np.log(p3)
print("Log-Likelihood (additive):")
print(f"  log L = log(p1) + log(p2) + log(p3)")
print(f"        = {np.log(p1):.4f} + {np.log(p2):.4f} + {np.log(p3):.4f} = {log_L:.4f}")
print(f"  Contribution of sample 1: log(p1) = {np.log(p1):.4f}")
print(f"  Contribution of sample 2: log(p2) = {np.log(p2):.4f}")
print(f"  How do they combine? Addition (additive!)")

In [None]:
# Visualize: per-sample contributions
np.random.seed(42)
# NOTE: rebinds the module-level names n_samples and probs.
n_samples = 20
# Per-sample probabilities drawn uniformly from [0.5, 0.95).
probs = np.random.uniform(0.5, 0.95, n_samples)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Direct: cumulative product
# cumulative_L[k] is the likelihood of the first k+1 samples.
cumulative_L = np.cumprod(probs)
axes[0].bar(range(n_samples), probs, alpha=0.5, label='Individual p_i')
axes[0].plot(range(n_samples), cumulative_L, 'r-o', linewidth=2, markersize=6, label='Cumulative L')
axes[0].set_xlabel('Sample index', fontsize=12)
axes[0].set_ylabel('Value', fontsize=12)
axes[0].set_title('Direct Likelihood: Products → Vanishing', fontsize=14)
axes[0].legend(fontsize=11)
# Log y-axis so the shrinking product stays visible next to the bars.
axes[0].set_yscale('log')

# Log: cumulative sum
# cumulative_log_L[k] is the log-likelihood of the first k+1 samples.
log_probs = np.log(probs)
cumulative_log_L = np.cumsum(log_probs)
axes[1].bar(range(n_samples), log_probs, alpha=0.5, label='Individual log(p_i)')
axes[1].plot(range(n_samples), cumulative_log_L, 'b-o', linewidth=2, markersize=6, label='Cumulative log L')
axes[1].set_xlabel('Sample index', fontsize=12)
axes[1].set_ylabel('Value', fontsize=12)
axes[1].set_title('Log-Likelihood: Sums → Well-behaved', fontsize=14)
axes[1].legend(fontsize=11)

plt.tight_layout()
plt.show()

# Final values of both running totals.
print(f"After {n_samples} samples:")
print(f"  Direct likelihood: {cumulative_L[-1]:.2e}")
print(f"  Log-likelihood: {cumulative_log_L[-1]:.2f}")

## Part 6: Information Theory Perspective

Shannon PROVED that any measure of information satisfying a few natural axioms MUST be logarithmic. Let's see why.

In [None]:
# Shannon's requirements for information measure I(p):
# 1. I(p) ≥ 0 (non-negative)
# 2. I(1) = 0 (certain events have no information)
# 3. I(p) decreasing in p (rarer events have more information)
# 4. I(p·q) = I(p) + I(q) for independent events (ADDITIVITY)

# Let's test candidate functions
def test_information_axioms(f, name, p_vals):
    """Test if f(p) satisfies Shannon's axioms."""
    results = {
        'non_negative': True,
        'I(1)=0': abs(f(1.0)) < 1e-10,
        'decreasing': True,
        'additive': True,
    }
    
    for p in p_vals:
        if p > 0:
            if f(p) < -1e-10:
                results['non_negative'] = False
    
    # Check decreasing
    for i in range(len(p_vals) - 1):
        if p_vals[i] < p_vals[i+1] and p_vals[i] > 0 and p_vals[i+1] > 0:
            if f(p_vals[i]) < f(p_vals[i+1]):
                results['decreasing'] = False
    
    # Check additivity: I(p·q) = I(p) + I(q)
    for p in [0.2, 0.5, 0.8]:
        for q in [0.3, 0.6, 0.9]:
            if abs(f(p*q) - f(p) - f(q)) > 1e-10:
                results['additive'] = False
    
    return results

# Probabilities from 0.01 up to and including 1.0 (0 is excluded from the grid).
p_vals = np.linspace(0.01, 1.0, 100)

# Test different functions
# Keys are the labels printed in the table; values map a probability to a
# candidate "information" value.
candidates = {
    '-log(p)': lambda p: -np.log(p),
    '1-p': lambda p: 1 - p,
    '1/p - 1': lambda p: 1/p - 1,
    # Guarded so that p = 0 yields 0 instead of evaluating log(0).
    '-p·log(p)': lambda p: -p * np.log(p) if p > 0 else 0,
    'sqrt(1-p)': lambda p: np.sqrt(1 - p),
}

print("Testing Shannon's Axioms for Information Measure:")
print("=" * 70)
print(f"{'Function':>15} | {'I≥0':>6} | {'I(1)=0':>7} | {'Decreasing':>10} | {'Additive':>10}")
print("-" * 70)

for name, f in candidates.items():
    results = test_information_axioms(f, name, p_vals)
    # One row per candidate: a check mark or cross for each axiom flag.
    print(f"{name:>15} | {'✓' if results['non_negative'] else '✗':>6} | {'✓' if results['I(1)=0'] else '✗':>7} | {'✓' if results['decreasing'] else '✗':>10} | {'✓' if results['additive'] else '✗':>10}")

# NOTE: this summary line is printed unconditionally; it is not derived from
# the results computed above.
print("\n→ Only -log(p) satisfies ALL axioms!")

In [None]:
# Visualize the candidates
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

p_vals = np.linspace(0.01, 1.0, 100)

# Left panel: every candidate evaluated point by point (some of the lambdas
# contain a scalar `if`, so they are not applied to the whole array at once).
for name, f in candidates.items():
    y_vals = [f(p) for p in p_vals]
    axes[0].plot(p_vals, y_vals, linewidth=2, label=name)

axes[0].set_xlabel('Probability p', fontsize=12)
axes[0].set_ylabel('Information I(p)', fontsize=12)
axes[0].set_title('Candidate Information Functions', fontsize=14)
axes[0].legend(fontsize=10)
# Fixed y-range: values above 5 are cut off.
axes[0].set_ylim(-0.5, 5)

# Additivity test visualization
p_range = np.linspace(0.1, 0.9, 50)
q = 0.5

# Right panel: additivity error for two of the candidates, with q held fixed.
for name, f in [('-log(p)', lambda p: -np.log(p)), ('1-p', lambda p: 1-p)]:
    # I(p·q) vs I(p) + I(q)
    lhs = [f(p * q) for p in p_range]  # I(p·q)
    rhs = [f(p) + f(q) for p in p_range]  # I(p) + I(q)
    error = [abs(l - r) for l, r in zip(lhs, rhs)]
    axes[1].plot(p_range, error, linewidth=2, label=f'{name}: |I(pq) - I(p) - I(q)|')

axes[1].set_xlabel('Probability p (with q=0.5 fixed)', fontsize=12)
axes[1].set_ylabel('Additivity Error', fontsize=12)
axes[1].set_title('Additivity Test: I(p·q) = I(p) + I(q)?', fontsize=14)
axes[1].legend(fontsize=11)

plt.tight_layout()
plt.show()

## Part 7: The Fisher Information Connection

The logarithm has a special role here: the **Fisher Information** is defined through the log-likelihood — it is the variance of the score $\nabla_\theta \log p(x|\theta)$.

In [None]:
# Fisher Information: I(θ) = Var[∇_θ log p(x|θ)] = -E[∇²_θ log p(x|θ)]
# This fundamental quantity in statistics ONLY works with log!

# Example: Bernoulli distribution p(x|θ) = θ^x (1-θ)^(1-x)
# log p = x log(θ) + (1-x) log(1-θ)
# ∇_θ log p = x/θ - (1-x)/(1-θ)
# Fisher Info = 1/(θ(1-θ))

# θ grid stops short of 0 and 1, where 1/(θ(1-θ)) would divide by zero.
theta_vals = np.linspace(0.01, 0.99, 100)
fisher_info = 1 / (theta_vals * (1 - theta_vals))

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Fisher Information
axes[0].plot(theta_vals, fisher_info, 'b-', linewidth=2)
axes[0].set_xlabel('θ (probability parameter)', fontsize=12)
axes[0].set_ylabel('Fisher Information I(θ)', fontsize=12)
axes[0].set_title('Fisher Information for Bernoulli', fontsize=14)
axes[0].fill_between(theta_vals, fisher_info, alpha=0.3)

# Score function variance = Fisher Information
# Simulate: compute variance of ∇log p for samples from Bernoulli(θ)
n_simulations = 10000
theta_test = [0.2, 0.5, 0.8]
colors = ['red', 'green', 'blue']

for theta, color in zip(theta_test, colors):
    # Generate samples
    # binomial(1, θ) yields 0/1 draws, i.e. Bernoulli(θ) samples.
    samples = np.random.binomial(1, theta, n_simulations)

    # Compute score (gradient of log-likelihood)
    # Each score is either 1/θ (sample = 1) or -1/(1-θ) (sample = 0).
    scores = samples / theta - (1 - samples) / (1 - theta)

    # Variance should equal Fisher Information
    empirical_var = np.var(scores)
    theoretical_fi = 1 / (theta * (1 - theta))

    # The legend entry reports the empirical variance next to the closed form.
    axes[1].hist(scores, bins=50, alpha=0.5, density=True, color=color,
                 label=f'θ={theta}: Var={empirical_var:.2f}, I(θ)={theoretical_fi:.2f}')

axes[1].set_xlabel('Score: ∇_θ log p(x|θ)', fontsize=12)
axes[1].set_ylabel('Density', fontsize=12)
axes[1].set_title('Score Function Distribution\n(Variance = Fisher Information)', fontsize=14)
axes[1].legend(fontsize=10)

plt.tight_layout()
plt.show()

print("Key insight: The Fisher Information defines the FUNDAMENTAL LIMIT")
print("on how well we can estimate θ. This only works because of log!")

## Part 8: The Softmax + Cross-Entropy Magic

The log in cross-entropy exactly cancels the exp in softmax, giving beautiful linear gradients.

In [None]:
# Softmax: p_i = exp(z_i) / Σ exp(z_j)
# Cross-entropy: L = -Σ y_i log(p_i)
# For one-hot y with class k: L = -log(p_k) = -z_k + log(Σ exp(z_j))

# Gradient: ∂L/∂z_i = p_i - y_i
# This is BEAUTIFUL: gradient = predicted - actual

# Let's verify this
def softmax(z, axis=None):
    """Return the softmax of ``z``.

    Args:
        z: Logits; any array-like.
        axis: Axis along which to normalize. The default ``None`` normalizes
            over all entries, which is the right choice for a single 1-D
            logit vector. Pass ``axis=-1`` to normalize each row of a batch
            independently.

    Returns:
        Array of the same shape as ``z`` whose entries sum to 1 along ``axis``.
    """
    z = np.asarray(z)
    # Subtracting the maximum leaves the result unchanged mathematically but
    # keeps exp() from overflowing for large logits.
    exp_z = np.exp(z - np.max(z, axis=axis, keepdims=True))
    return exp_z / exp_z.sum(axis=axis, keepdims=True)

def cross_entropy_loss(z, y_true_idx):
    """Return the cross-entropy of softmax(z) against class ``y_true_idx``.

    Evaluated in log-sum-exp form, ``-z_k + log(sum_j exp(z_j))``, instead of
    taking ``log`` of a softmax probability. The two are mathematically
    equal, but this form stays finite when the true class's probability
    underflows to zero.

    Args:
        z: 1-D array-like of logits.
        y_true_idx: Index of the true class.

    Returns:
        The scalar loss ``-log(softmax(z)[y_true_idx])``.
    """
    z = np.asarray(z)
    # Shift by the maximum so the largest exponent is exp(0) = 1.
    shifted = z - np.max(z)
    return -shifted[y_true_idx] + np.log(np.sum(np.exp(shifted)))

def gradient_numerical(z, y_true_idx, eps=1e-5):
    """Estimate d(loss)/dz with central finite differences.

    Args:
        z: 1-D array-like of logits (integer or list input is accepted).
        y_true_idx: Index of the true class.
        eps: Half-width of the finite-difference step.

    Returns:
        Float array with one gradient estimate per logit.
    """
    # Work on a float copy: with integer logits the in-place ``+= eps`` below
    # cannot be stored in an integer array, and an integer gradient buffer
    # would truncate the estimates.
    z = np.array(z, dtype=float)
    grad = np.zeros_like(z)
    for i in range(len(z)):
        z_plus = z.copy()
        z_plus[i] += eps
        z_minus = z.copy()
        z_minus[i] -= eps
        # Central difference: (f(z + eps*e_i) - f(z - eps*e_i)) / (2*eps)
        grad[i] = (cross_entropy_loss(z_plus, y_true_idx) - cross_entropy_loss(z_minus, y_true_idx)) / (2 * eps)
    return grad

def gradient_analytical(z, y_true_idx):
    """Closed-form gradient of softmax cross-entropy w.r.t. the logits: p - y."""
    predicted = softmax(z)
    # One-hot encoding of the true class, with the same shape as z.
    one_hot = np.zeros_like(z)
    one_hot[y_true_idx] = 1
    return predicted - one_hot

# Test
z = np.array([2.0, 1.0, 0.5, -1.0])  # Logits
y_true = 2  # True class

# Same gradient computed two ways: finite differences and the closed form.
grad_num = gradient_numerical(z, y_true)
grad_ana = gradient_analytical(z, y_true)

print("Softmax + Cross-Entropy Gradient:")
print("=" * 50)
print(f"Logits z: {z}")
print(f"True class: {y_true}")
print(f"Softmax probabilities: {softmax(z).round(4)}")
print(f"")
print(f"Numerical gradient:  {grad_num.round(6)}")
print(f"Analytical (p - y):  {grad_ana.round(6)}")
# allclose compares within NumPy's default tolerances.
print(f"Match: {np.allclose(grad_num, grad_ana)}")
print(f"")
print("The gradient is simply: (predicted probability) - (true label)")
print("This elegant formula exists ONLY because of log!")

In [None]:
# Visualize the gradient
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Bar chart
x_pos = np.arange(len(z))
width = 0.35

# Predicted distribution and the one-hot target for the true class.
p = softmax(z)
y_onehot = np.zeros_like(z)
y_onehot[y_true] = 1

# Side-by-side bars: each pair is offset by half a bar width around x_pos.
axes[0].bar(x_pos - width/2, p, width, label='Predicted p', color='steelblue', alpha=0.7)
axes[0].bar(x_pos + width/2, y_onehot, width, label='True y', color='coral', alpha=0.7)
axes[0].set_xlabel('Class', fontsize=12)
axes[0].set_ylabel('Probability', fontsize=12)
axes[0].set_title('Predicted vs True', fontsize=14)
axes[0].legend(fontsize=11)
axes[0].set_xticks(x_pos)

# Gradient = p - y
axes[1].bar(x_pos, grad_ana, color='green', alpha=0.7)
# Zero line separates positive from negative gradient entries.
axes[1].axhline(y=0, color='black', linestyle='-', linewidth=0.5)
axes[1].set_xlabel('Class', fontsize=12)
axes[1].set_ylabel('Gradient (∂L/∂z)', fontsize=12)
axes[1].set_title('Gradient = p - y\n(Decrease logits for overconfident classes)', fontsize=14)
axes[1].set_xticks(x_pos)

plt.tight_layout()
plt.show()

print("\nInterpretation:")
for i in range(len(z)):
    # Gradient descent moves against the gradient: a negative entry means the
    # logit goes up, anything else (including exactly zero) is reported as
    # "decrease".
    direction = "↑ increase" if grad_ana[i] < 0 else "↓ decrease"
    print(f"  Class {i}: gradient = {grad_ana[i]:.4f} → {direction} logit")

## Summary

### The Answer to Our Central Question

> **"Would MLE still work without logarithm if we had perfect computers?"**

**NO.** The logarithm is **mathematically fundamental**, not just a computational convenience.

### Why Logarithm is Fundamental:

| Reason | Why It Matters |
|--------|----------------|
| **Gradient scaling** | Without log, gradients scale with $L$ → bad models get no learning signal |
| **Additivity** | Independent samples must contribute additively; only log converts × to + |
| **Shannon's proof** | Information MUST be logarithmic (uniqueness theorem) |
| **Fisher Information** | Fundamental statistics rely on log-likelihood |
| **Softmax gradient** | Log cancels exp → beautiful p - y gradients |

### The Deep Insight

The logarithm is the **unique mathematical bridge** between:
- The **multiplicative** world of probabilities
- The **additive** world of learning and information

Without this bridge, neural networks as we know them would not work — regardless of computational precision.