In [1]:
import numpy as np

# Reproducible synthetic regression data: 400 samples, 3 features drawn
# uniformly from [-2, 2).
np.random.seed(42)
# Stored feature-major, shape (3, 400): one row per feature, one column per sample.
X = np.random.uniform(-2, 2, (400, 3)).T

# Target sin(x0) + 0.5 * x1^2 - 0.8 * x2, kept as a (1, 400) row vector so it
# lines up with the network output.
y = (np.sin(X[0]) + 0.5 * (X[1] ** 2) - 0.8 * X[2]).reshape(1, -1)

def relu(z):
    """Rectified linear unit: element-wise ``max(0, z)``."""
    floor = 0
    return np.maximum(floor, z)

def relu_deriv(z):
    """Derivative of ReLU: 1.0 where ``z > 0`` and 0.0 elsewhere (including z == 0).

    Args:
        z: Pre-activation values; an ndarray or anything ``np.asarray`` accepts
            (Python scalars and lists included).

    Returns:
        A float ndarray with the same shape as ``z`` (0-d for a scalar input).
    """
    # np.asarray lets plain Python scalars through; a bare `z > 0` on a float
    # yields a Python bool, which has no `.astype`.
    return (np.asarray(z) > 0).astype(float)

def sigmoid(z):
    """Logistic function ``1 / (1 + exp(-z))``.

    The input is clamped to [-500, 500] before exponentiating so that
    ``np.exp`` cannot overflow.
    """
    bounded = np.clip(z, -500, 500)
    decay = np.exp(-bounded)
    return 1 / (1 + decay)

def sigmoid_deriv(z):
    """Derivative of the logistic function: ``s * (1 - s)`` with ``s = sigmoid(z)``."""
    s = sigmoid(z)
    complement = 1 - s
    return s * complement

def tanh(z):
    """Hyperbolic tangent activation (thin wrapper around ``np.tanh``)."""
    squashed = np.tanh(z)
    return squashed
def tanh_deriv(z):
    """Derivative of tanh: ``1 - tanh(z)^2``."""
    squashed = np.tanh(z)
    return 1 - squashed ** 2

def leaky_relu(z, alpha=0.01):
    """Leaky ReLU: ``z`` where ``z > 0``, otherwise ``alpha * z``."""
    is_positive = z > 0
    leaked = alpha * z
    return np.where(is_positive, z, leaked)

def leaky_relu_deriv(z, alpha=0.01):
    """Derivative of leaky ReLU: 1.0 where ``z > 0``, otherwise ``alpha``."""
    is_positive = z > 0
    return np.where(is_positive, 1.0, alpha)

def softplus(z):
    """Softplus activation ``log(1 + exp(z))``, evaluated without overflow.

    Args:
        z: Pre-activation values (scalar or ndarray).

    Returns:
        ``log(1 + exp(z))`` element-wise, as floats.
    """
    # log(exp(0) + exp(z)) == log(1 + exp(z)). np.logaddexp evaluates this
    # stably, so large positive z returns ~z (instead of saturating at a clip
    # bound) and very negative z returns a tiny positive value instead of
    # rounding to exactly 0.
    return np.logaddexp(0, z)

def softplus_deriv(z):
    """Derivative of softplus, which is the logistic sigmoid of ``z``."""
    slope = sigmoid(z)
    return slope

In [2]:
class Layer:
    """One fully connected layer: ``Z = W @ A_prev + b`` followed by an activation.

    Weights are drawn uniformly from [-0.5, 0.5) with the global NumPy RNG and
    biases start at zero. ``Z``, ``A_prev``, ``dW`` and ``db`` are caches that
    start as ``None``; nothing in this class writes them, so they are expected
    to be filled in by whatever code runs the forward and backward passes.

    Args:
        neurons_in: Number of inputs to the layer (columns of ``W``).
        neurons_out: Number of units in the layer (rows of ``W`` and ``b``).
        activation_name: One of ``SUPPORTED_ACTIVATIONS``.

    Raises:
        ValueError: If ``activation_name`` is not a supported activation.
    """

    # Checked at construction so a typo fails immediately instead of making
    # activate() quietly return None and breaking a later matrix operation.
    SUPPORTED_ACTIVATIONS = (
        "relu", "sigmoid", "tanh", "leaky_relu", "softplus", "linear",
    )

    def __init__(self, neurons_in, neurons_out, activation_name="relu"):
        if activation_name not in self.SUPPORTED_ACTIVATIONS:
            raise ValueError(self._unknown_activation_message(activation_name))

        self.W = np.random.uniform(-0.5, 0.5, (neurons_out, neurons_in))
        self.b = np.zeros((neurons_out, 1))
        self.activation_name = activation_name

        self.Z = None
        self.A_prev = None
        self.dW = None
        self.db = None

    def _unknown_activation_message(self, name):
        """Build the error text used when ``name`` is not a supported activation."""
        supported = ", ".join(self.SUPPORTED_ACTIVATIONS)
        return f"Unknown activation {name!r}; expected one of: {supported}"

    def activate(self, Z):
        """Apply this layer's activation function element-wise to ``Z``.

        Raises:
            ValueError: If ``activation_name`` was changed to an unsupported value.
        """
        name = self.activation_name
        if name == "relu":
            return relu(Z)
        if name == "sigmoid":
            return sigmoid(Z)
        if name == "tanh":
            return tanh(Z)
        if name == "leaky_relu":
            return leaky_relu(Z)  # uses leaky_relu's default alpha
        if name == "softplus":
            return softplus(Z)
        if name == "linear":
            return Z
        raise ValueError(self._unknown_activation_message(name))

    def activate_deriv(self, Z):
        """Return the element-wise derivative of the activation evaluated at ``Z``.

        Raises:
            ValueError: If ``activation_name`` was changed to an unsupported value.
        """
        name = self.activation_name
        if name == "relu":
            return relu_deriv(Z)
        if name == "sigmoid":
            return sigmoid_deriv(Z)
        if name == "tanh":
            return tanh_deriv(Z)
        if name == "leaky_relu":
            return leaky_relu_deriv(Z)  # uses leaky_relu_deriv's default alpha
        if name == "softplus":
            return softplus_deriv(Z)
        if name == "linear":
            return np.ones_like(Z)
        raise ValueError(self._unknown_activation_message(name))

class DeepNetwork:
    """Feed-forward stack of ``Layer`` objects trained with plain gradient descent.

    ``layer_dims`` lists the width of every layer including input and output,
    e.g. ``[3, 8, 8, 1]``. Every layer uses ``hidden_activation`` except the
    last one, which is always "linear".
    """

    def __init__(self, layer_dims, hidden_activation="relu"):
        output_index = len(layer_dims) - 2
        self.layers = [
            Layer(
                layer_dims[idx],
                layer_dims[idx + 1],
                hidden_activation if idx < output_index else "linear",
            )
            for idx in range(len(layer_dims) - 1)
        ]

    def forward(self, X):
        """Propagate ``X`` through every layer, caching each layer's input and Z."""
        signal = X
        for layer in self.layers:
            layer.A_prev = signal
            layer.Z = layer.W.dot(signal) + layer.b
            signal = layer.activate(layer.Z)
        return signal

    def backward(self, y, y_hat):
        """Backpropagate the squared-error gradient and store ``dW``/``db`` per layer.

        Uses the ``Z`` and ``A_prev`` cached by the most recent ``forward`` call.
        """
        n_samples = y.shape[1]
        # Gradient of the squared error w.r.t. the prediction, averaged over
        # the n_samples columns.
        upstream = -2 * (y - y_hat) / n_samples
        for depth in range(len(self.layers) - 1, -1, -1):
            layer = self.layers[depth]
            delta = upstream * layer.activate_deriv(layer.Z)
            layer.dW = delta.dot(layer.A_prev.T)
            layer.db = delta.sum(axis=1, keepdims=True)
            # The first layer has nothing upstream of it to pass a gradient to.
            if depth > 0:
                upstream = layer.W.T.dot(delta)

    def update(self, lr):
        """Take one gradient-descent step of size ``lr`` on every layer, in place."""
        for layer in self.layers:
            weight_step = lr * layer.dW
            bias_step = lr * layer.db
            layer.W -= weight_step
            layer.b -= bias_step

In [3]:
def train_model(model_name, layer_dims, hidden_act="relu", epochs=1000, lr=0.01):
    """Train a ``DeepNetwork`` on the notebook-level ``X``/``y`` and print a summary.

    Runs full-batch gradient descent on the mean squared error, then prints
    the loss recorded at epoch 200, the final loss, and the Frobenius norm of
    the weight gradient of the first and of the last hidden layer. The
    gradients are those stored by the last ``backward`` call, i.e. computed
    just before the final weight update.

    Args:
        model_name: Label used in the printed header.
        layer_dims: Layer widths including input and output, e.g. ``[3, 8, 1]``.
        hidden_act: Activation name passed on to ``DeepNetwork`` for hidden layers.
        epochs: Number of full-batch gradient steps (default 1000).
        lr: Learning rate for each step (default 0.01).

    Returns:
        None. All results are printed.
    """
    # 1-based epoch reported as "Loss @ 200". The loss is measured at the
    # start of that epoch, before its weight update.
    checkpoint_epoch = 200

    print(f"\nTraining {model_name} with {hidden_act.capitalize()}...")
    model = DeepNetwork(layer_dims, hidden_activation=hidden_act)
    loss_200 = None

    for epoch in range(epochs):
        y_hat = model.forward(X)
        loss = np.mean((y - y_hat) ** 2)

        if epoch == checkpoint_epoch - 1:
            loss_200 = loss

        model.backward(y, y_hat)
        model.update(lr)

    final_loss = np.mean((y - model.forward(X)) ** 2)

    # layers[-1] is the linear output layer, so the last hidden layer is
    # layers[-2]. With no hidden layer at all, fall back to the only layer
    # rather than raising IndexError after training has already finished.
    first_layer = model.layers[0] if model.layers else None
    last_hidden_layer = model.layers[-2] if len(model.layers) > 1 else first_layer

    def _format_grad_norm(layer):
        """Format the Frobenius norm of ``layer.dW``, or "n/a" if no gradient exists."""
        if layer is None or layer.dW is None:
            return "n/a"
        return f"{np.sqrt(np.sum(layer.dW ** 2)):.6f}"

    # loss_200 stays None when training ran for fewer than 200 epochs, and
    # formatting None with ".4f" would raise a TypeError.
    loss_200_text = "n/a" if loss_200 is None else f"{loss_200:.4f}"

    print(f"Loss @ 200: {loss_200_text}")
    print(f"Final Loss: {final_loss:.4f}")
    print(f"Grad Norm L1: {_format_grad_norm(first_layer)}")
    print(f"Grad Norm Last: {_format_grad_norm(last_hidden_layer)}")

In [4]:
# Layer widths for each architecture; every model maps 3 inputs to 1 output.
model_A = [3] + [4] * 1 + [1]
model_B = [3] + [6] * 2 + [1]
model_C = [3] + [8] * 4 + [1]
model_D = [3] + [8] * 8 + [1]

# (label, architecture, hidden activation), trained in this order.
experiments = (
    ("Model A - Shallow", model_A, "relu"),
    ("Model B - Medium", model_B, "relu"),
    ("Model C - Deep", model_C, "relu"),
    ("Model D - Very Deep (ReLU)", model_D, "relu"),
    ("Model D - Very Deep (Sigmoid)", model_D, "sigmoid"),
)

for label, dims, activation in experiments:
    train_model(label, dims, activation)


Training Model A - Shallow with Relu...
Loss @ 200: 0.4938
Final Loss: 0.1115
Grad Norm L1: 0.045217
Grad Norm Last: 0.045217

Training Model B - Medium with Relu...
Loss @ 200: 0.3220
Final Loss: 0.0728
Grad Norm L1: 0.036609
Grad Norm Last: 0.021441

Training Model C - Deep with Relu...
Loss @ 200: 0.8620
Final Loss: 0.0304
Grad Norm L1: 0.023876
Grad Norm Last: 0.016801

Training Model D - Very Deep (ReLU) with Relu...
Loss @ 200: 1.6349
Final Loss: 0.0528
Grad Norm L1: 0.429784
Grad Norm Last: 0.621290

Training Model D - Very Deep (Sigmoid) with Sigmoid...
Loss @ 200: 1.7439
Final Loss: 1.7439
Grad Norm L1: 0.000006
Grad Norm Last: 0.000006


## Reflections

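- Did deeper always reduce loss faster: No. Although deeper networks have more capacity, they learned more slowly at first: at epoch 200 the loss was 0.49 for Model A and 0.32 for Model B, but 0.86 for Model C and 1.63 for Model D (ReLU). The deeper ReLU models only overtook the shallow ones later (final loss 0.030 for C and 0.053 for D, versus 0.112 for A), because gradients take longer to propagate effectively through 8 hidden layers.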

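- Did gradients in early layers stay similar to later layers: No, although the runs above do not show a simple "earlier is always smaller" pattern. Because each additional layer multiplies the gradient during backpropagation, it can shrink (or grow) by the time it reaches the earlier layers. At the end of training, Grad Norm L1 was smaller than Grad Norm Last only for Model D with ReLU (0.43 vs 0.62). For Models B and C the first layer's norm was actually larger (0.037 vs 0.021 and 0.024 vs 0.017), for Model A the two values refer to the same layer, and for Model D with Sigmoid both had collapsed to about 0.000006.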

- Was training equally stable for all activations: No. Model D with Sigmoid failed to learn: its loss was 1.7439 at epoch 200 and still 1.7439 at the end of training, while the same architecture with ReLU went from 1.6349 to 0.0528.

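- Which activation behaved more stably in deeper networks: ReLU was significantly more stable. Sigmoid's derivative has a maximum value of 0.25, so backpropagating through 8 sigmoid layers scales the gradient by at most $0.25^8 \approx 1.5 \times 10^{-5}$ (before the weight factors), which causes the gradient at Layer 1 to vanish almost entirely.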

- Did some models improve very slowly even though the learning rate was the same: Yes. Model D with Sigmoid did not improve measurably after epoch 200, because its gradient norms were about 0.000006 — so small that the weight updates practically froze those layers. Model D with ReLU also started at a snail's pace (loss 1.63 at epoch 200) before eventually converging.