Here I implement the layers listed in Question 1 as Python classes, each with a forward and a backward method.
For Question 3, I load the Iris data and split it into training and test sets.
I then chain the layers one after another and train the model on the training set.
Finally, I evaluate the model on the test set that was held out at the start.

In [8]:
import numpy as np

class MatrixMultiplicationLayer:
    """Bias-free linear layer that multiplies its input by a weight matrix."""

    def __init__(self, W):
        # The array is stored by reference (no copy), so in-place updates the
        # caller applies to ``W`` are visible to this layer.
        self.W = W

    def forward(self, X):
        """Return ``np.dot(X, W)``."""
        weights = self.W
        return np.dot(X, weights)

    def backward(self, X, grad_output):
        """Back-propagate ``grad_output`` through the matrix product.

        Args:
            X: the input that was given to ``forward``.
            grad_output: gradient with respect to the output of ``forward``.

        Returns:
            A ``(grad_input, grad_W)`` tuple: ``np.dot(grad_output, W.T)``
            (gradient with respect to ``X``) and ``np.dot(X.T, grad_output)``
            (gradient with respect to ``W``).
        """
        grad_W = np.dot(X.T, grad_output)
        grad_input = np.dot(grad_output, self.W.T)
        return grad_input, grad_W

class BiasAdditionLayer:
    """Layer that adds a bias ``b`` to its input via NumPy broadcasting."""

    def __init__(self, b):
        # Stored by reference (no copy), so in-place updates the caller
        # applies to ``b`` are visible to this layer.
        self.b = b

    def forward(self, X):
        """Return ``X + b``."""
        return X + self.b

    def backward(self, X, grad_output):
        """Back-propagate ``grad_output`` through the bias addition.

        Args:
            X: the input that was given to ``forward`` (unused; kept so all
                layers share the same ``backward`` signature).
            grad_output: gradient with respect to the output of ``forward``,
                with samples along axis 0.

        Returns:
            A ``(grad_input, grad_b)`` tuple. ``grad_input`` is
            ``grad_output`` unchanged. ``grad_b`` is ``grad_output`` summed
            over the batch axis and, when the element counts agree, reshaped
            to the shape of ``b`` so that ``b -= lr * grad_b`` works directly.
        """
        grad_input = grad_output
        # The bias is shared by every sample, so its gradient accumulates over
        # the batch axis (axis 0), not over the feature axis.
        grad_b = np.sum(grad_output, axis=0)
        bias_shape = np.shape(self.b)
        if grad_b.size == np.prod(bias_shape, dtype=int):
            grad_b = grad_b.reshape(bias_shape)
        return grad_input, grad_b
    
class MeanSquaredLoss:
    """Half mean-squared-error loss."""

    def forward(self, y_pred, y_true):
        """Return half the mean of the squared element-wise differences."""
        residual = y_pred - y_true
        return 0.5 * np.mean(np.square(residual))

    def backward(self, y_pred, y_true):
        """Return the element-wise difference ``y_pred - y_true``.

        NOTE(review): the result is not divided by the number of elements,
        so it is the derivative of ``forward`` scaled up by that count.
        Callers effectively absorb the factor into their learning rate.
        """
        residual = y_pred - y_true
        return residual

class Softmax:
    """Row-wise softmax over 2-D input (one sample per row)."""

    def forward(self, X):
        """Return the softmax of each row of ``X``.

        The row maximum is subtracted before exponentiating. Softmax is
        invariant to that shift, and it keeps ``np.exp`` from overflowing
        (which would otherwise yield ``inf / inf = nan``) for large logits.
        """
        shifted = X - np.max(X, axis=1, keepdims=True)
        exp_X = np.exp(shifted)
        return exp_X / np.sum(exp_X, axis=1, keepdims=True)

    def backward(self, X, grad_output):
        """Back-propagate ``grad_output`` through the softmax.

        Args:
            X: the input that was given to ``forward``.
            grad_output: gradient with respect to the softmax output.

        Returns:
            Gradient with respect to ``X``, same shape as ``X``.

        The softmax Jacobian for one row is ``diag(s) - s sᵀ``; multiplying
        it by the upstream gradient ``g`` gives ``s * (g - sum(g * s))``.
        Using only the diagonal ``s * (1 - s)`` would drop the cross terms
        that couple the outputs of a row.
        """
        softmax = self.forward(X)
        weighted_sum = np.sum(grad_output * softmax, axis=1, keepdims=True)
        return softmax * (grad_output - weighted_sum)

class Sigmoid:
    """Element-wise logistic sigmoid activation."""

    def forward(self, X):
        """Return ``1 / (1 + exp(-X))`` element-wise."""
        denominator = 1 + np.exp(-X)
        return 1 / denominator

    def backward(self, X, grad_output):
        """Back-propagate ``grad_output`` through the sigmoid.

        The activation is recomputed from ``X`` (nothing is cached by
        ``forward``) and the upstream gradient is scaled by the local
        derivative ``s * (1 - s)``.
        """
        activation = self.forward(X)
        complement = 1 - activation
        return grad_output * activation * complement

class CrossEntropyLoss:
    """Element-wise binary cross-entropy, averaged over every element."""

    def forward(self, y_pred, y_true):
        """Return the mean of ``-(t*log(p) + (1-t)*log(1-p))`` over all elements.

        Predictions are clipped to ``[1e-10, 1 - 1e-10]`` first so that
        neither logarithm is evaluated at zero.
        """
        eps = 1e-10
        probs = np.clip(y_pred, eps, 1 - eps)
        positive_term = y_true * np.log(probs)
        negative_term = (1 - y_true) * np.log(1 - probs)
        return -np.mean(positive_term + negative_term)

    def backward(self, y_pred, y_true):
        """Return ``y_pred - y_true``.

        NOTE(review): this is not the derivative of ``forward`` with respect
        to ``y_pred``. It has the form of the combined gradient with respect
        to the pre-activation inputs when a softmax or sigmoid output feeds a
        cross-entropy loss. Presumably callers are meant to skip the
        activation's own ``backward`` -- confirm against usage.
        """
        difference = y_pred - y_true
        return difference




In [2]:
# Load the Iris dataset bundled with scikit-learn. The cell that builds and
# trains the model repeats this load, so this cell is not required for it.
from sklearn import datasets
iris = datasets.load_iris()
import numpy as np

In [9]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Hyper-parameters of the full-batch gradient-descent loop below.
LEARNING_RATE = 0.01
N_EPOCHS = 1000

iris = datasets.load_iris()
X = iris.data
y = iris.target

# One-hot encode the target labels. Newer scikit-learn releases renamed the
# ``sparse`` keyword to ``sparse_output``; fall back to the old spelling when
# the installed version does not know the new one.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)
y_onehot = encoder.fit_transform(y.reshape(-1, 1))

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y_onehot, test_size=0.33, random_state=42
)

# Define the weight matrix and bias (4 input features -> 3 classes)
np.random.seed(0)
W = np.random.randn(4, 3)
b = np.random.randn(1, 3)

# Initialize the layers. The layers keep references to W and b, so the
# in-place ``-=`` updates in the loop below also update the layers.
matrix_mult_layer = MatrixMultiplicationLayer(W)
bias_add_layer = BiasAdditionLayer(b)
softmax_layer = Softmax()
cross_entropy_loss = CrossEntropyLoss()

# Train the model
for epoch in range(N_EPOCHS):
    # Forward pass: linear map -> bias -> softmax
    linear_out = matrix_mult_layer.forward(X_train)
    X_train_out = bias_add_layer.forward(linear_out)
    y_train_pred = softmax_layer.forward(X_train_out)

    # Loss of the current epoch (not used for the update; kept so the final
    # value can be inspected after training).
    loss = cross_entropy_loss.forward(y_train_pred, y_train)

    # Backward pass. The loss' backward returns ``y_pred - y_true``, which is
    # applied directly to the softmax input, so the softmax layer's own
    # backward is skipped.
    grad_output = cross_entropy_loss.backward(y_train_pred, y_train)

    # Propagate in reverse order of the forward pass: bias first, then matmul.
    grad_linear_out = bias_add_layer.backward(linear_out, grad_output)[0]
    grad_input, grad_W = matrix_mult_layer.backward(X_train, grad_linear_out)

    # Bias gradient: sum over the batch axis, keeping the (1, 3) shape of b.
    grad_b = np.sum(grad_output, axis=0, keepdims=True)

    # Update the weight matrix and bias in place
    W -= LEARNING_RATE * grad_W
    b -= LEARNING_RATE * grad_b



In [10]:
# Run the trained layers on the held-out test split to get class probabilities.

X_test_out = bias_add_layer.forward(matrix_mult_layer.forward(X_test))
y_test_pred = softmax_layer.forward(X_test_out)

In [11]:
# Convert probabilities / one-hot rows to integer class labels by taking the
# index of the largest entry in each row.
# NOTE: both names are rebound to 1-D arrays, so this cell must not be run
# twice without re-running the cells that produce the 2-D arrays.

y_test_pred, y_test = np.argmax(y_test_pred, axis=1), np.argmax(y_test, axis=1)

In [12]:
# Fraction of test samples whose predicted label equals the true label.

from sklearn.metrics import accuracy_score

test_accuracy = accuracy_score(y_test, y_test_pred)
print("Accuracy:", test_accuracy)

Accuracy: 1.0
