In [1]:
!pip install numpy 



In [2]:
import numpy as np
# Print the version of NumPy that this kernel actually imported.
print(np.__version__)

2.4.0


In [3]:

class Linear:
    """Fully connected (affine) layer computing ``X @ W + b``.

    Weights start as standard-normal noise scaled by 0.01 and biases start at
    zero. ``backward`` returns the gradient with respect to the layer input
    and also applies a plain gradient-descent update to ``W`` and ``b`` in
    place.
    """

    def __init__(self, input_dim, output_dim, rng=None):
        """Create the layer parameters.

        Args:
            input_dim: Number of input features (rows of ``W``).
            output_dim: Number of output features (columns of ``W``).
            rng: Optional ``numpy.random.Generator`` used to draw the initial
                weights, which makes initialisation reproducible without
                touching global state. When omitted, the legacy global
                ``np.random`` state is used, exactly as before.
        """
        if rng is None:
            noise = np.random.randn(input_dim, output_dim)
        else:
            noise = rng.standard_normal((input_dim, output_dim))
        self.W = noise * 0.01
        self.b = np.zeros((1, output_dim))
        # Input cached by forward(); stays None until the first forward pass.
        self.X = None

    def forward(self, X):
        """Return ``X @ W + b`` and remember ``X`` for the backward pass.

        Args:
            X: Array whose last dimension matches ``input_dim``; ``backward``
                reads ``X.shape[0]`` as the batch size.
        """
        self.X = X
        return np.dot(X, self.W) + self.b

    def backward(self, dZ, learning_rate=0.01):
        """Backpropagate ``dZ`` through the layer and update its parameters.

        Args:
            dZ: Gradient of the loss with respect to this layer's output.
            learning_rate: Step size for the in-place update of ``W`` and ``b``.

        Returns:
            Gradient of the loss with respect to the input given to the most
            recent ``forward`` call.

        Raises:
            RuntimeError: If called before any ``forward`` call.
        """
        if self.X is None:
            raise RuntimeError("Linear.backward() called before forward()")

        m = self.X.shape[0]
        # Parameter gradients are averaged over the batch here. If dZ already
        # carries a 1/N factor (as the output of mse_loss_grad in this
        # notebook does), this divides a second time, which only shrinks the
        # effective learning rate by the batch size.
        dW = np.dot(self.X.T, dZ) / m
        db = np.sum(dZ, axis=0, keepdims=True) / m
        # Computed with the pre-update weights, before W is modified below.
        dX = np.dot(dZ, self.W.T)

        # Update weights and biases
        self.W -= learning_rate * dW
        self.b -= learning_rate * db

        return dX

In [4]:
class Sequential:
    """Container that chains layers, feeding each layer's output to the next.

    Every layer must expose ``forward(x)`` and ``backward(grad, learning_rate)``.
    """

    def __init__(self, *layers):
        """Store the given layers (as a tuple) in the order they are applied."""
        self._activations = None
        self.layers = layers

    def forward(self, x):
        """Run ``x`` through every layer in order and return the final output.

        The original input followed by each layer's output is recorded in
        ``self._activations``; nothing in this class reads it back.
        """
        trace = [x]
        for stage in self.layers:
            trace.append(stage.forward(trace[-1]))
        self._activations = trace
        return trace[-1]

    def backward(self, grad_output, learning_rate=0.01):
        """Propagate ``grad_output`` from the last layer back to the first.

        Each layer receives the gradient returned by the layer after it,
        together with ``learning_rate``. The gradient returned by the first
        layer is the result.
        """
        grad = grad_output
        for position in range(len(self.layers) - 1, -1, -1):
            grad = self.layers[position].backward(grad, learning_rate)
        return grad

In [5]:
def mse_loss(y_pred, y_true):
    """Return the mean of the squared element-wise difference ``y_pred - y_true``."""
    residual = y_pred - y_true
    return np.mean(residual ** 2)
def mse_loss_grad(y_pred, y_true):
    """Return the gradient of the mean squared error with respect to ``y_pred``.

    Args:
        y_pred: Predictions (array or array-like).
        y_true: Targets (array or array-like), broadcastable against ``y_pred``.

    Returns:
        ``2 * (y_pred - y_true) / N`` where ``N`` is the number of elements in
        the broadcast difference, i.e. the number of terms ``np.mean``
        averages over in the matching loss. The result has the broadcast shape.
    """
    residual = np.asarray(y_pred) - np.asarray(y_true)
    # Normalise by the residual's size, not y_true's: the two differ when
    # y_true broadcasts (e.g. a scalar target), and the loss is a mean over
    # the broadcast residual.
    return 2 * residual / residual.size

In [6]:
class Adam_optimizer:
    """Adam optimiser that updates a collection of NumPy arrays in place.

    Keeps exponentially decayed running averages of each gradient (``m``) and
    of each squared gradient (``v``), and applies the bias-corrected update on
    every ``step`` call.
    """

    def __init__(self, parameters, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        """Set up optimiser state.

        Args:
            parameters: Iterable of arrays to optimise. They are modified in
                place by ``step``, so they must support in-place ``-=``.
            learning_rate: Step size.
            beta1: Decay rate of the first-moment (mean) estimate.
            beta2: Decay rate of the second-moment (squared-gradient) estimate.
            epsilon: Small constant added to the denominator of the update.
        """
        # Materialise once: the parameters are iterated several times, and a
        # one-shot iterable (e.g. a generator) would otherwise be exhausted
        # after building ``m``, leaving ``v`` empty.
        self.parameters = list(parameters)
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.m = [np.zeros_like(p) for p in self.parameters]
        self.v = [np.zeros_like(p) for p in self.parameters]
        self.t = 0

    def step(self, grads):
        """Apply one Adam update.

        Args:
            grads: Iterable of gradients, one per parameter and in the same
                order as the parameters given to the constructor.

        Raises:
            ValueError: If the number of gradients differs from the number of
                parameters. No state is modified in that case.
        """
        grads = list(grads)
        if len(grads) != len(self.parameters):
            # zip() would silently skip the unmatched parameters otherwise.
            raise ValueError(
                f"expected {len(self.parameters)} gradients, got {len(grads)}"
            )

        self.t += 1
        for i, (param, grad) in enumerate(zip(self.parameters, grads)):
            self.m[i] = self.beta1 * self.m[i] + (1 - self.beta1) * grad
            self.v[i] = self.beta2 * self.v[i] + (1 - self.beta2) * (grad ** 2)

            # Bias correction: m and v start at zero, so early estimates are
            # scaled up by 1 / (1 - beta ** t).
            m_hat = self.m[i] / (1 - self.beta1 ** self.t)
            v_hat = self.v[i] / (1 - self.beta2 ** self.t)

            # In-place so the caller's array object is the one that changes.
            param -= self.learning_rate * m_hat / (np.sqrt(v_hat) + self.epsilon)

In [7]:

# Toy regression data: 8 samples with 2 integer features each, the rows being
# consecutive pairs (1, 2), (3, 4), ..., (15, 16).
x = np.arange(1, 17).reshape(8, 2)
# Target is the sum of the two features, kept as a column vector of shape (8, 1).
y = x.sum(axis=1, keepdims=True)

In [8]:
# Seed the global NumPy RNG before the layers draw their initial weights so
# that a fresh "Restart & Run All" reproduces the same loss curve.
np.random.seed(0)

# Two stacked Linear layers with no activation in between, so the model as a
# whole is still a linear map from the 2 input features to 1 output.
model = Sequential(
    Linear(2, 4),
    Linear(4, 1),
)

# use a safer learning rate and fewer epochs for quick verification
learning_rate = 1e-3
num_epochs = 1000
log_every = 100  # print the loss once every this many epochs

for epoch in range(num_epochs):
    # Forward pass
    y_pred = model.forward(x)

    # Compute loss (measured before this epoch's parameter update)
    loss = mse_loss(y_pred, y)

    # Compute gradient of loss w.r.t. predictions
    grad_loss = mse_loss_grad(y_pred, y)

    # Backprop (layers update their own parameters)
    model.backward(grad_loss, learning_rate)

    if epoch % log_every == 0:
        print(f'Epoch {epoch}, Loss: {loss}')

Epoch 0, Loss: 372.9603290634257
Epoch 100, Loss: 0.09759495755232186
Epoch 200, Loss: 0.03790063751543473
Epoch 300, Loss: 0.036616410873902425
Epoch 400, Loss: 0.035375485083339625
Epoch 500, Loss: 0.03417639901527445
Epoch 600, Loss: 0.033017741701052074
Epoch 700, Loss: 0.031898150514211715
Epoch 800, Loss: 0.03081630942684192
Epoch 900, Loss: 0.02977094733628844
