In [15]:
import numpy as np 

np.random.seed(42)

N = 2000
X = np.random.randn(N,2)
y = (X[:,0] * X[:,1] > 0).astype(int)

print(X.shape, y.shape)

(2000, 2) (2000,)


In [16]:
def one_hot(y, num_classes):
    Y = np.zeros((y.size, num_classes))
    Y[np.arange(y.size),y] = 1
    return Y

def softmax(z):
    z -= np.max(z, axis=1, keepdims=True)
    exp = np.exp(z)
    return exp/ np.sum(exp, axis=1, keepdims=True)


In [17]:
class Linear:
    def __init__(self, in_dims, out_dims):
        self.W = np.random.randn(in_dims, out_dims) * np.sqrt(2. / in_dims)
        self.b = np.zeros((1, out_dims))
    def forward(self,x):
        self.x = x
        return x @ self.W + self.b
    def backward(self, grad_out, lr):
        dW = self.x.T @ grad_out / self.x.shape[0]
        db = np.mean(grad_out, axis=0, keepdims=True)
        grad_x = grad_out @ self.W.T
        # update weights
        self.W -= lr * dW
        self.b -= lr * db 
        return grad_x 

class ReLU:
    def forward(self, x):
        self.mask = x > 0
        return x * self.mask 
    def backward(self, grad_out, lr):
        return grad_out * self.mask

In [18]:
class SoftmaxCrossEntropy:
    def forward(self, logits, y_true):
        self.probs = softmax(logits)
        self.y_true = y_true 
        N = logits.shape[0]
        # compute loss
        log_likelihood = -np.log(self.probs[np.arange(N),y_true] + 1e-12) #basically we will take the value from probs , at the index at y_true where it is 1
        return np.sum(log_likelihood) / N
    def backward(self):
        N = self.y_true.shape[0]
        grad = self.probs.copy()
        grad[np.arange(N), self.y_true] -= 1
        return grad / N

In [19]:
class MLP:
    def __init__(self, in_dims, hidden_dims, out_dims, lr=0.1):
        self.lr = lr
        self.l1 = Linear(in_dims, hidden_dims)
        self.relu = ReLU()
        self.l2 = Linear(hidden_dims, out_dims)
        self.loss_fn = SoftmaxCrossEntropy()

    def forward(self,x):
        out = self.l1.forward(x)
        out = self.relu.forward(out)
        out = self.l2.forward(out)
        return out 
    
    def backward(self,grad):
        grad = self.l2.backward(grad, self.lr)
        grad = self.relu.backward(grad, self.lr)
        grad = self.l1.backward(grad, self.lr)
        return grad
    
    def predict(self, x):
        logits = self.forward(x)
        probs = softmax(logits)
        return np.argmax(probs, axis=1)
    
    def train_batch(self, x, y):
        logits = self.forward(x)
        loss = self.loss_fn.forward(logits, y)
        grad = self.loss_fn.backward()
        self.backward(grad)
        return loss

In [20]:
# Hyperparameters
in_dim = 2
hidden_dim = 8
out_dim = 2
lr = 0.1
epochs = 2000

model = MLP(in_dim, hidden_dim, out_dim, lr)

for epoch in range(epochs):
    loss = model.train_batch(X, y)
    if (epoch + 1) % 20 == 0:
        preds = model.predict(X)
        acc = np.mean(preds == y)
        print(f"Epoch {epoch+1:03d}: Loss={loss:.4f}, Acc={acc:.3f}")


Epoch 020: Loss=0.9945, Acc=0.608
Epoch 040: Loss=0.9941, Acc=0.608
Epoch 060: Loss=0.9936, Acc=0.608
Epoch 080: Loss=0.9931, Acc=0.609
Epoch 100: Loss=0.9927, Acc=0.609
Epoch 120: Loss=0.9922, Acc=0.609
Epoch 140: Loss=0.9917, Acc=0.609
Epoch 160: Loss=0.9913, Acc=0.609
Epoch 180: Loss=0.9908, Acc=0.609
Epoch 200: Loss=0.9904, Acc=0.609
Epoch 220: Loss=0.9899, Acc=0.609
Epoch 240: Loss=0.9894, Acc=0.609
Epoch 260: Loss=0.9890, Acc=0.609
Epoch 280: Loss=0.9885, Acc=0.609
Epoch 300: Loss=0.9881, Acc=0.609
Epoch 320: Loss=0.9876, Acc=0.609
Epoch 340: Loss=0.9872, Acc=0.609
Epoch 360: Loss=0.9867, Acc=0.609
Epoch 380: Loss=0.9862, Acc=0.609
Epoch 400: Loss=0.9858, Acc=0.610
Epoch 420: Loss=0.9853, Acc=0.610
Epoch 440: Loss=0.9849, Acc=0.610
Epoch 460: Loss=0.9844, Acc=0.610
Epoch 480: Loss=0.9840, Acc=0.609
Epoch 500: Loss=0.9835, Acc=0.609
Epoch 520: Loss=0.9830, Acc=0.610
Epoch 540: Loss=0.9826, Acc=0.610
Epoch 560: Loss=0.9821, Acc=0.610
Epoch 580: Loss=0.9816, Acc=0.610
Epoch 600: Los

In [21]:
def numerical_grad_check(layer, x, grad_out, eps=1e-5):
    analytic = layer.x.T @ grad_out / x.shape[0]
    num_grad = np.zeros_like(layer.W)
    for i in range(layer.W.shape[0]):
        for j in range(layer.W.shape[1]):
            old = layer.W[i, j]
            layer.W[i, j] = old + eps
            loss1 = np.sum(layer.forward(x))
            layer.W[i, j] = old - eps
            loss2 = np.sum(layer.forward(x))
            layer.W[i, j] = old
            num_grad[i, j] = (loss1 - loss2) / (2 * eps)
    print("Diff:", np.linalg.norm(analytic - num_grad))
