<a href="https://colab.research.google.com/github/mahirbarot/thrifty-ai/blob/main/task_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# backprop implementation for a 2-layer binary classifier (NumPy only)

import numpy as np

'''  Creating utilities '''

def sigmoid(z):
    """
    Numerically stable logistic function 1 / (1 + exp(-z)), elementwise.

    z: scalar or array-like of real numbers
    Returns a NumPy scalar for scalar input, otherwise an array shaped like z.

    The naive form evaluates exp(-z), which overflows (RuntimeWarning) for
    large negative z. Here only exp(-|z|) is evaluated, which lies in (0, 1].
    """
    z = np.asarray(z)
    decay = np.exp(-np.abs(z))  # never overflows: the exponent is <= 0
    # z >= 0: 1 / (1 + e^-z)          (same expression as the naive form)
    # z <  0: e^z / (1 + e^z)         (algebraically identical, no overflow)
    out = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () turns a 0-d result back into a scalar and leaves
    # n-d results as arrays.
    return out[()]

def relu(z):
    """Rectified linear unit: elementwise maximum of 0 and z."""
    zero_floor = 0
    return np.maximum(zero_floor, z)

def relu_derivative(z):
    """Derivative of ReLU as a float array: 1.0 where z > 0, otherwise 0.0."""
    is_active = z > 0
    return is_active.astype(float)

# ---------- Forward pass ----------
def forward(x, params, y=None):
    """
    Run one sample through the network: affine -> ReLU -> affine -> sigmoid.

    x: input vector, shape (3,)
    params: dict with 'W1' (4x3), 'b1' (4,), 'W2' (1x4), 'b2' (1,)
    y: optional label; when given, the binary cross-entropy loss is computed

    Returns the cache dict (keys 'x', 'z1', 'a1', 'z2', 'a2') when y is None,
    otherwise the tuple (cache, loss) with loss as a Python float.
    """
    W1, b1, W2, b2 = (params[key] for key in ('W1', 'b1', 'W2', 'b2'))

    hidden_pre = np.dot(W1, x) + b1        # (4,) hidden pre-activation
    hidden_act = relu(hidden_pre)          # (4,) hidden activation
    out_pre = np.dot(W2, hidden_act) + b2  # (1,) output pre-activation
    prob = sigmoid(out_pre)                # (1,) predicted probability

    cache = dict(x=x, z1=hidden_pre, a1=hidden_act, z2=out_pre, a2=prob)

    if y is not None:
        # Keep the probability away from exactly 0 and 1 so log() stays finite.
        tiny = 1e-12
        safe_prob = np.clip(prob, tiny, 1 - tiny)
        log_likelihood = y * np.log(safe_prob) + (1 - y) * np.log(1 - safe_prob)
        loss = -log_likelihood.item()
        return cache, loss

    return cache

# ---------- Backward pass ----------
def backward(x, y, params, cache):
    """
    Backpropagate the single-sample BCE loss through the 2-layer network.

    x: accepted for interface compatibility only; the input actually used is
       cache['x'], i.e. the one the forward pass stored
    y: label used in the forward pass
    params: dict of parameters; only 'W2' is needed here
    cache: dict produced by forward(), providing 'x', 'z1', 'a1', 'a2'

    Returns a dict of gradients: dW1 (4x3), db1 (4,), dW2 (1x4), db2 (1,)
    Key derivatives:
    - For sigmoid + BCE output, dL/dz2 = a2 - y
    - dW2 = (dL/dz2) * a1^T
    - For ReLU hidden, derivative is 1 where z1>0 else 0:
      dz1 = (W2^T * dz2) * relu'(z1)
    """
    W2 = params['W2']
    x_in = cache['x']
    z1, a1, a2 = cache['z1'], cache['a1'], cache['a2']

    # dL/dz2: for sigmoid output with BCE loss, simplifies to (a2 - y)
    dz2 = (a2 - y).reshape(1,)            # shape (1,)

    # Output layer gradients
    dW2 = np.outer(dz2, a1)               # (1,) outer (4,) -> (1,4)
    db2 = dz2.copy()                      # shape (1,)

    # Backprop into hidden layer: dL/da1 = W2^T * dz2
    da1 = W2.T.reshape(-1) * dz2.item()   # shape (4,)
    # ReLU gate: gradient only flows where the unit was active
    dz1 = da1 * relu_derivative(z1)       # shape (4,)

    # Hidden layer gradients
    dW1 = np.outer(dz1, x_in)             # (4,) outer (3,) -> (4,3)
    db1 = dz1.copy()                      # shape (4,)

    return {'dW1': dW1, 'db1': db1, 'dW2': dW2, 'db2': db2}

# ---------- SGD update ----------
def sgd_update(params, grads, lr=0.1):
    """
    Apply one gradient-descent step to the parameters, in place.

    params: dict with 'W1', 'b1', 'W2', 'b2'; each entry is decremented
    grads: dict with the matching 'dW1', 'db1', 'dW2', 'db2'
    lr: learning rate (step size)
    Returns None; params is modified.
    """
    pairs = (('W1', 'dW1'), ('b1', 'db1'), ('W2', 'dW2'), ('b2', 'db2'))
    for param_key, grad_key in pairs:
        params[param_key] -= lr * grads[grad_key]

# ---------- Numerical gradient check ----------
def numerical_gradient(params, x, y, eps=1e-5):
    """
    Estimate dLoss/dW1 and dLoss/dW2 with central finite differences.

    params: parameter dict; entries of 'W1' and 'W2' are perturbed in place
            one at a time and restored before moving on
    x, y: the sample and label passed to forward()
    eps: perturbation half-width
    Returns (num_dW1, num_dW2), each shaped like the corresponding weights.
    """

    def central_difference(key):
        # Finite-difference gradient of the loss w.r.t. every entry of params[key].
        weights = params[key]
        estimate = np.zeros_like(weights)
        walker = np.nditer(weights, flags=['multi_index'], op_flags=['readwrite'])
        for _ in walker:
            idx = walker.multi_index
            saved = weights[idx].copy()

            weights[idx] = saved + eps
            _, loss_hi = forward(x, params, y)

            weights[idx] = saved - eps
            _, loss_lo = forward(x, params, y)

            estimate[idx] = (loss_hi - loss_lo) / (2 * eps)
            weights[idx] = saved  # put the original value back
        return estimate

    grad_W1 = central_difference('W1')
    grad_W2 = central_difference('W2')
    return grad_W1, grad_W2

# ---------- Script to run everything ----------
def run_demo():
    """
    End-to-end demo on one sample: forward pass, analytic backprop, numerical
    gradient check, one SGD step, then a second forward pass. Prints each
    intermediate result and whether the loss went down.
    """
    # Seeded generator; W1 is drawn before W2 so the values are reproducible.
    rng = np.random.RandomState(42)
    first_layer = rng.randn(4, 3)    # shape (4,3)
    second_layer = rng.randn(1, 4)   # shape (1,4)
    params = dict(W1=first_layer, b1=np.zeros(4), W2=second_layer, b2=np.zeros(1))

    # Single training sample and hyper-parameters
    x = np.array([0.2, -0.4, 0.1])
    y, lr, eps = 1, 0.1, 1e-5

    # Loss before any update
    cache, loss_before = forward(x, params, y)
    print("Initial loss:", loss_before)

    # Analytic gradients from backprop
    grads = backward(x, y, params, cache)
    print("\nAnalytic gradient dW1 (4x3):\n", grads['dW1'])
    print("\nAnalytic gradient dW2 (1x4):\n", grads['dW2'])

    # Compare against finite-difference estimates
    num_dW1, num_dW2 = numerical_gradient(params, x, y, eps=eps)
    gap_W1 = np.abs(grads['dW1'] - num_dW1).max()
    gap_W2 = np.abs(grads['dW2'] - num_dW2).max()
    print("\nMax absolute difference between analytic and numerical gradients:")
    print("W1:", gap_W1)
    print("W2:", gap_W2)

    # One gradient-descent step
    sgd_update(params, grads, lr=lr)
    print("\nUpdated W1:\n", params['W1'])
    print("\nUpdated W2:\n", params['W2'])

    # Loss after the update
    _, loss_after = forward(x, params, y)
    print("\nFinal loss after one SGD step:", loss_after)
    verdict = (
        "Loss decreased after the SGD update."
        if loss_after < loss_before
        else "Loss did not decrease after the SGD update. (Possible with a single step)"
    )
    print(verdict)

# Run the demo. This is a top-level call, so it executes whenever the
# cell/module is run (including on import).
run_demo()


Initial loss: 1.1732865037089109

Analytic gradient dW1 (4x3):
 [[-0.03342232  0.06684463 -0.01671116]
 [ 0.26428194 -0.52856388  0.13214097]
 [ 0.         -0.          0.        ]
 [ 0.07766893 -0.15533787  0.03883447]]

Analytic gradient dW2 (1x4):
 [[-0.15154094 -0.25889318 -0.         -0.17080231]]

Max absolute difference between analytic and numerical gradients:
W1: 1.5192513913575567e-11
W2: 6.006972697036872e-12

Updated W1:
 [[ 0.50005638 -0.14494876  0.64935965]
 [ 1.49660166 -0.18129699 -0.24735105]
 [ 1.57921282  0.76743473 -0.46947439]
 [ 0.53479315 -0.44788391 -0.4696132 ]]

Updated W2:
 [[ 0.25711637 -1.88739093 -1.72491783 -0.5452073 ]]

Final loss after one SGD step: 0.9036177230801858
Loss decreased after the SGD update.
