In [1]:
import numpy as np

# Task 1.1 - Input generation
# X represents a single feature (e.g., time, distance) measured from 0 to 10.
# This range ensures we see both early rapid change and later saturation, which a linear model cannot fit well.
# 50 evenly spaced points, reshaped into a (50, 1) column so each row is one sample.
X = np.linspace(0, 10, 50).reshape(-1, 1)

# Task 1.2 - Target generation
# The logarithmic shape grows fast initially and slows later.
# A straight line has only one slope and cannot match both regions simultaneously.
# The noise comes from a locally seeded generator so the dataset is identical on
# every "Restart & Run All"; a local generator leaves the global NumPy random
# state untouched for the cells that follow.
data_rng = np.random.default_rng(0)
# Gaussian noise (mean 0, standard deviation 0.2), one value per sample in X.
noise = data_rng.normal(0, 0.2, size=X.shape)
y = np.log(X + 1) + noise

In [2]:
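# How many hidden units: 3.
# Why more than 1: With a ReLU activation, each hidden unit contributes one bend (kink) to the fitted curve, so several units let the model combine linear pieces with different slopes.
# Why not too many: Too many units add unnecessary complexity and can make training unstable, while too few cannot bend enough to follow the curve.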

In [3]:
# Seed the global NumPy generator so the initial weights are the same on every run.
np.random.seed(42)

# Input -> hidden layer: weights drawn uniformly from [-1, 1), biases start at zero.
W1 = np.random.uniform(low=-1, high=1, size=(1, 3))
b1 = np.zeros_like(W1)

# Hidden -> output layer: same scheme, drawn after W1 from the same seeded stream.
W2 = np.random.uniform(low=-1, high=1, size=(3, 1))
b2 = np.zeros((1, W2.shape[1]))

In [4]:
def activation(z):
    """Apply the ReLU non-linearity element-wise.

    Args:
        z: Pre-activation values (NumPy array or scalar).

    Returns:
        The element-wise maximum of 0 and ``z``: negative entries become 0,
        all other entries pass through unchanged.
    """
    floor = 0
    rectified = np.maximum(floor, z)
    return rectified

def activation_slope(z):
    """Return the element-wise slope used for ``activation`` in backpropagation.

    Args:
        z: Pre-activation values as a NumPy array.

    Returns:
        A float array of the same shape as ``z`` holding 1.0 where ``z > 0``
        and 0.0 elsewhere (including at exactly 0).
    """
    is_positive = z > 0
    return is_positive.astype(float)

In [5]:
# Full-batch gradient descent on mean squared error for the 1 -> 3 -> 1 network.
epochs = 2000
learning_rate = 0.01

for epoch in range(epochs):
    # Forward pass: affine -> activation -> affine.
    z1 = np.matmul(X, W1) + b1
    h = activation(z1)
    y_hat = np.matmul(h, W2) + b2

    # Mean squared error of the current predictions (measured before this epoch's update).
    error = y_hat - y
    loss = np.square(error).mean()

    # Backward pass. Every gradient is computed from the pre-update parameters,
    # so all of them must be finished before any weight is changed below.
    dL_dy = 2 * error / len(X)
    dL_dh = dL_dy @ W2.T
    dL_dz1 = dL_dh * activation_slope(z1)

    dL_dW2 = h.T @ dL_dy
    dL_db2 = dL_dy.sum(axis=0, keepdims=True)
    dL_dW1 = X.T @ dL_dz1
    dL_db1 = dL_dz1.sum(axis=0, keepdims=True)

    # Gradient step, applied in place so W1, b1, W2, b2 keep referring to the same arrays.
    for param, grad in ((W1, dL_dW1), (b1, dL_db1), (W2, dL_dW2), (b2, dL_db2)):
        param -= learning_rate * grad

    # Report progress every 200 epochs.
    if not epoch % 200:
        print(f"Epoch {epoch} | Loss: {loss:.4f}")

# `loss` here is the value computed at the start of the last epoch.
print(f"Final Loss: {loss:.4f}")

Epoch 0 | Loss: 51.6074
Epoch 200 | Loss: 0.1032
Epoch 400 | Loss: 0.0985
Epoch 600 | Loss: 0.0980
Epoch 800 | Loss: 0.0979
Epoch 1000 | Loss: 0.0979
Epoch 1200 | Loss: 0.0979
Epoch 1400 | Loss: 0.0979
Epoch 1600 | Loss: 0.0979
Epoch 1800 | Loss: 0.0979
Final Loss: 0.0979
