In [2]:
!pip install numpy 

Collecting numpy
  Downloading numpy-2.4.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (6.6 kB)
Downloading numpy-2.4.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (16.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: numpy
Successfully installed numpy-2.4.0


In [32]:
import numpy as np
print(np.__version__)

2.4.0


In [33]:
class Linear: 
    def __init__(self, input_dim, output_dim):

        # Use smaller random initialization to reduce chance of overflow/exploding gradients
        self.weights = 0.01 * np.random.randn(input_dim, output_dim)
        self.bias = 0.01 * np.random.randn(output_dim)
    
    def forward(self, x):
        return np.dot(x, self.weights) + self.bias
    
    def backward(self, x, grad_output, learning_rate=0.01, clip_value=1.0):
        # grad_input: gradient w.r.t. this layer's input
        grad_input = np.dot(grad_output, self.weights.T)
        # grad_weights: gradient w.r.t. weights (input^T dot grad_output)
        grad_weights = np.dot(x.T, grad_output)
        grad_bias = np.sum(grad_output, axis=0)
        
        # Clip gradients elementwise to avoid exploding updates
        if clip_value is not None:
            grad_weights = np.clip(grad_weights, -clip_value, clip_value)
            grad_bias = np.clip(grad_bias, -clip_value, clip_value)
            grad_input = np.clip(grad_input, -1e6, 1e6)
        
        self.weights -= learning_rate * grad_weights
        self.bias -= learning_rate * grad_bias
        
        return grad_input

In [34]:
class Sequential:
    def __init__(self, *layers):
        self.layers = layers
        self._activations = None

    def forward(self, x):
        # Save activations (inputs to each layer) for correct backward pass
        activations = [x]
        for layer in self.layers:
            x = layer.forward(x)
            activations.append(x)
        self._activations = activations
        return x

    def backward(self, grad_output, learning_rate=0.01):
        # Use saved activations: activations[i] is the input to layer i
        if self._activations is None:
            raise RuntimeError("No forward pass found. Call forward() before backward().")
        for i in range(len(self.layers) - 1, -1, -1):
            layer = self.layers[i]
            input_to_layer = self._activations[i]
            grad_output = layer.backward(input_to_layer, grad_output, learning_rate)
        return grad_output

In [35]:
def mse_loss(y_pred, y_true):
    return np.mean((y_pred - y_true) ** 2)

In [23]:
class Adam_optimizer:
    def __init__(self, parameters, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.parameters = parameters
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.m = [np.zeros_like(p) for p in parameters]
        self.v = [np.zeros_like(p) for p in parameters]
        self.t = 0
    
    def step(self, grads):
        self.t += 1
        for i, (param, grad) in enumerate(zip(self.parameters, grads)):
            self.m[i] = self.beta1 * self.m[i] + (1 - self.beta1) * grad
            self.v[i] = self.beta2 * self.v[i] + (1 - self.beta2) * (grad ** 2)

            m_hat = self.m[i] / (1 - self.beta1 ** self.t)
            v_hat = self.v[i] / (1 - self.beta2 ** self.t)

            param -= self.learning_rate * m_hat / (np.sqrt(v_hat) + self.epsilon)

In [36]:
#data 

x = np.array([[1,2], [3,4], [5,6], [7,8], [9,10], [11,12], [13,14], [15,16]])
y = np.array([[3], [7], [11], [15], [19], [23], [27], [31]])

In [39]:
#traning Time

model = Sequential(
    Linear(input_dim=2, output_dim=4),
    Linear(input_dim=4, output_dim=1)
 )
# lower learning rate to reduce risk of overflow/exploding gradients
learning_rate = 1e-3
epochs = 100000
for epoch in range(epochs):
    y_pred = model.forward(x)
    loss = mse_loss(y_pred, y)
    
    grad_output = 2 * (y_pred - y) / y.shape[0]
    # pass learning_rate through; Layer.backward supports gradient clipping
    model.backward(grad_output, learning_rate)
    
    if epoch % 100 == 0:
        print(f"Epoch {epoch}, Loss: {loss}")

Epoch 0, Loss: 373.63489705741847
Epoch 100, Loss: 341.02181106096344
Epoch 200, Loss: 255.7477014586863
Epoch 300, Loss: 141.71018164890438
Epoch 400, Loss: 38.59272336021151
Epoch 500, Loss: 0.39889969748568455
Epoch 600, Loss: 0.31901020059700796
Epoch 700, Loss: 0.2564899326807999
Epoch 800, Loss: 0.20704375267953778
Epoch 900, Loss: 0.1676287876373398
Epoch 1000, Loss: 0.13602303841774402
Epoch 1100, Loss: 0.11056472785314651
Epoch 1200, Loss: 0.08998736946400729
Epoch 1300, Loss: 0.0733111953106719
Epoch 1400, Loss: 0.05976920547375527
Epoch 1500, Loss: 0.04875529165957233
Epoch 1600, Loss: 0.039786906436145644
Epoch 1700, Loss: 0.032477603414941054
Epoch 1800, Loss: 0.026516454973062188
Epoch 1900, Loss: 0.021652376376192377
Epoch 2000, Loss: 0.017682024644671095
Epoch 2100, Loss: 0.014440351034181152
Epoch 2200, Loss: 0.011793155930787507
Epoch 2300, Loss: 0.009631176452646193
Epoch 2400, Loss: 0.007865361679470806
Epoch 2500, Loss: 0.006423077737004531
Epoch 2600, Loss: 0.0052