# In [2]:
import numpy as np

class Module:
    """Common interface for the layers and losses defined in this file.

    Subclasses are expected to override both methods; the versions here
    only signal that no implementation was supplied.
    """

    def forward(self, input):
        """Run the forward computation on ``input``.

        Raises:
            NotImplementedError: always, in this base class.
        """
        message = "Forward pass not implemented."
        raise NotImplementedError(message)

    def backward(self, grad_output):
        """Propagate ``grad_output`` backwards through the module.

        Raises:
            NotImplementedError: always, in this base class.
        """
        message = "Backward pass not implemented."
        raise NotImplementedError(message)


class Linear(Module):
    """Fully connected layer computing ``W @ x + b`` on column-wise batches.

    Attributes:
        W: weight matrix of shape (out_features, in_features), drawn from a
            standard normal and scaled by 0.01.
        b: bias column of shape (out_features, 1), initialised to zeros.
        x: input cached by the most recent ``forward`` call.
        dW: gradient w.r.t. ``W`` from the most recent ``backward`` call,
            ``None`` until ``backward`` has run.
        db: gradient w.r.t. ``b`` from the most recent ``backward`` call,
            ``None`` until ``backward`` has run.
    """

    def __init__(self, in_features, out_features):
        self.W = np.random.randn(out_features, in_features) * 0.01
        self.b = np.zeros((out_features, 1))
        self.x = None  # Store input for backward
        # Declared here so the attributes exist (as None) on a fresh layer
        # instead of appearing only after the first backward() call.
        self.dW = None
        self.db = None

    def forward(self, x):
        """
        x: (in_features, batch_size)
        Returns: (out_features, batch_size)
        """
        # Keep the input around: backward() needs it to form dW.
        self.x = x
        return self.W @ x + self.b

    def backward(self, grad_output):
        """
        grad_output: (out_features, batch_size)
        Returns: (in_features, batch_size)
        Also computes gradients w.r.t W and b (stored in self.dW, self.db)

        Raises RuntimeError if called before forward().
        """
        if self.x is None:
            raise RuntimeError("Linear.backward() called before forward().")

        # Sum over the batch axis for both parameter gradients; any 1/B
        # averaging is expected to be already folded into grad_output.
        self.dW = grad_output @ self.x.T
        self.db = grad_output.sum(axis=1, keepdims=True)

        # Gradient w.r.t. the layer input, passed on to the previous layer.
        return self.W.T @ grad_output


class ReLU(Module):
    """Element-wise rectifier: passes positive entries, zeroes the rest."""

    def __init__(self):
        # Boolean map of strictly positive inputs; filled in by forward().
        self.mask = None

    def forward(self, x):
        """Return ``max(0, x)`` element-wise and remember where ``x > 0``."""
        positive = x > 0
        self.mask = positive
        activated = np.maximum(0, x)
        return activated

    def backward(self, grad_output):
        """Let the gradient through only where the forward input was > 0."""
        gated = grad_output * self.mask
        return gated


class CrossEntropy(Module):
    """Softmax followed by batch-averaged cross-entropy.

    Inputs are laid out column-wise: one column per sample, one row per class.
    """

    def __init__(self):
        self.y_pred = None  # softmax probabilities cached by forward()
        self.y_true = None  # labels cached by forward()

    def softmax(self, x):
        """Column-wise softmax of ``x`` (normalises along axis 0)."""
        # Subtracting the per-column max leaves the result unchanged but
        # keeps every exponent <= 0, so np.exp cannot overflow.
        shift = x - np.max(x, axis=0, keepdims=True)
        exps = np.exp(shift)
        return exps / np.sum(exps, axis=0, keepdims=True)

    def forward(self, logits, labels):
        """
        logits: (num_classes, batch_size)
        labels: (num_classes, batch_size) one-hot encoded
        Returns: scalar loss
        """
        self.y_true = labels
        self.y_pred = self.softmax(logits)

        eps = 1e-12  # keeps log() finite if a probability underflows to 0
        batch_size = logits.shape[1]
        return -np.sum(labels * np.log(self.y_pred + eps)) / batch_size

    def backward(self, grad_output=None):
        """
        grad_output: optional upstream gradient of the scalar loss; accepted
            so the signature is compatible with Module.backward. When omitted
            (None) the result is exactly the unscaled gradient.
        Returns: gradient of loss w.r.t. logits, (num_classes, batch_size)

        Raises RuntimeError if called before forward().
        """
        if self.y_pred is None:
            raise RuntimeError(
                "CrossEntropy.backward() called before forward()."
            )

        batch_size = self.y_pred.shape[1]
        # Softmax + cross-entropy with one-hot labels collapses to
        # (probabilities - labels); divide by B to match the mean in forward().
        grad_logits = (self.y_pred - self.y_true) / batch_size
        if grad_output is not None:
            grad_logits = grad_logits * grad_output
        return grad_logits