In [1]:
class DeepNN:
    """Feed-forward network with two hidden layers, trained by mini-batch gradient descent.

    Every layer, including the output layer, is passed through ``relu``.
    ``relu`` and ``relu_derivative`` are module-level helpers that are not
    defined in this class; the notes below assume they implement the
    standard ReLU and its derivative -- TODO confirm against their definitions.

    Attributes set by ``__init__``:
        weights1, weights2, weights3: weight matrices of shape
            (input_size, hidden_size1), (hidden_size1, hidden_size2) and
            (hidden_size2, output_size).
        bias1, bias2, bias3: row-vector biases of shape (1, layer_width).
        learning_rate: step size applied to every parameter update.

    Attributes set by ``forward``:
        z1, z2, z3: pre-activation values of each layer.
        a1, a2, a3: activations of each layer (``relu`` of the matching z).
    """

    def __init__(self, input_size, hidden_size1, hidden_size2, output_size, learning_rate=0.01):
        """Create the parameters, each drawn uniformly from [-0.5, 0.5).

        The draw order (weights1, bias1, weights2, bias2, weights3, bias3)
        is fixed so that seeding ``np.random`` reproduces the same network.
        """
        self.weights1 = np.random.rand(input_size, hidden_size1) - 0.5
        self.bias1 = np.random.rand(1, hidden_size1) - 0.5
        self.weights2 = np.random.rand(hidden_size1, hidden_size2) - 0.5
        self.bias2 = np.random.rand(1, hidden_size2) - 0.5
        self.weights3 = np.random.rand(hidden_size2, output_size) - 0.5
        self.bias3 = np.random.rand(1, output_size) - 0.5
        self.learning_rate = learning_rate

    def forward(self, X):
        """Run a forward pass and cache the intermediates on the instance.

        Args:
            X: input batch whose last dimension is ``input_size``.

        Returns:
            The output-layer activations ``a3``.
        """
        self.z1 = np.dot(X, self.weights1) + self.bias1
        self.a1 = relu(self.z1)
        self.z2 = np.dot(self.a1, self.weights2) + self.bias2
        self.a2 = relu(self.z2)
        self.z3 = np.dot(self.a2, self.weights3) + self.bias3
        # ReLU on the output layer is unusual for a classifier; it is kept
        # here because changing it would change the model's behavior.
        self.a3 = relu(self.z3)
        return self.a3

    def backward(self, X, y, output):
        """Back-propagate the error for one batch and update all parameters in place.

        Must be called right after ``forward`` on the same ``X``: it reads the
        ``a1`` and ``a2`` activations cached by that call.

        Args:
            X: the input batch that produced ``output``.
            y: targets, broadcastable against ``output``.
            output: the activations returned by ``forward(X)``.
        """
        # Accept array-likes (e.g. nested lists); ``X.T`` below needs an ndarray.
        X = np.asarray(X)
        y = np.asarray(y)

        # The error is (target - prediction), so the updates below use ``+=``.
        # Together they amount to gradient descent on a summed (not averaged)
        # squared-error loss, assuming relu_derivative is the ReLU derivative.
        output_error = y - output
        # relu_derivative is evaluated on activations rather than on the
        # pre-activations; for a standard ReLU both give the same mask.
        output_delta = output_error * relu_derivative(output)

        # Propagate through the second hidden layer.
        a2_error = output_delta.dot(self.weights3.T)
        a2_delta = a2_error * relu_derivative(self.a2)

        # Propagate through the first hidden layer.
        a1_error = a2_delta.dot(self.weights2.T)
        a1_delta = a1_error * relu_derivative(self.a1)

        # All deltas were computed from the pre-update weights above, so the
        # order of the in-place updates below does not matter.
        # keepdims=True keeps every bias gradient shaped (1, width) like the
        # bias itself instead of relying on implicit broadcasting.
        self.weights3 += self.learning_rate * self.a2.T.dot(output_delta)
        self.bias3 += self.learning_rate * np.sum(output_delta, axis=0, keepdims=True)
        self.weights2 += self.learning_rate * self.a1.T.dot(a2_delta)
        self.bias2 += self.learning_rate * np.sum(a2_delta, axis=0, keepdims=True)
        self.weights1 += self.learning_rate * X.T.dot(a1_delta)
        self.bias1 += self.learning_rate * np.sum(a1_delta, axis=0, keepdims=True)

    def fit(self, X, y, epochs=10000, batch_size=32):
        """Train on ``(X, y)`` with sequential, unshuffled mini-batches.

        Args:
            X: training inputs, one sample per row.
            y: training targets, one row per sample.
            epochs: number of full passes over the data.
            batch_size: number of rows per update; the final batch of an
                epoch may be smaller.

        Raises:
            ValueError: if ``batch_size`` is not positive, or if ``X`` and
                ``y`` do not contain the same number of samples.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if batch_size < 1:
            # A negative step would make the batch loop silently do nothing.
            raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
        n_samples = X.shape[0]
        if y.shape[0] != n_samples:
            raise ValueError(
                f"X and y must have the same number of samples, got {n_samples} and {y.shape[0]}"
            )

        for _ in range(epochs):
            for start in range(0, n_samples, batch_size):
                stop = start + batch_size
                X_batch = X[start:stop]
                y_batch = y[start:stop]
                output = self.forward(X_batch)
                self.backward(X_batch, y_batch, output)

    def predict(self, X):
        """Return, for each row of ``X``, the index of the largest output activation."""
        output = self.forward(X)
        return np.argmax(output, axis=1)

    def evaluate(self, X, y):
        """Return the fraction of rows whose predicted index equals ``argmax(y, axis=1)``.

        ``y`` is expected to hold one score/indicator per class in each row
        (e.g. one-hot labels), since it is reduced with ``argmax`` over axis 1.
        """
        predictions = self.predict(X)
        accuracy = np.mean(predictions == np.argmax(y, axis=1))
        return accuracy