In [1]:
import numpy as np

In [9]:
class DeepMLP:
    """Fully connected feed-forward network trained with mini-batch gradient descent.

    Every layer (including the output layer) uses the same activation:
    ReLU when ``activation == 'relu'``, otherwise a logistic sigmoid.

    Parameters
    ----------
    layers : sequence of int
        Layer widths from input to output, e.g. ``[2, 4, 1]``.
    learning_rate : float, default 0.01
        Step size applied to every gradient update in ``fit``.
    activation : str, default 'relu'
        ``'relu'`` selects ReLU; any other value selects the sigmoid.

    Attributes
    ----------
    weights : list of ndarray
        ``weights[i]`` has shape ``(layers[i], layers[i + 1])``.
    biases : list of ndarray
        ``biases[i]`` has shape ``(1, layers[i + 1])`` and starts at zero.
    """

    def __init__(self, layers, learning_rate=0.01, activation='relu'):
        self.layers = layers
        self.learning_rate = learning_rate
        self.activation = activation
        self.weights = []
        self.biases = []

        # He scaling (variance 2 / fan_in) for ReLU, variance 1 / fan_in otherwise.
        variance_numerator = 2.0 if activation == 'relu' else 1.0
        for fan_in, fan_out in zip(layers[:-1], layers[1:]):
            std = np.sqrt(variance_numerator / fan_in)
            self.weights.append(np.random.randn(fan_in, fan_out) * std)
            self.biases.append(np.zeros((1, fan_out)))

    def relu(self, x):
        """Return the element-wise ``max(0, x)``."""
        return np.maximum(0, x)

    def relu_derivative(self, x):
        """Return 1.0 where ``x > 0`` and 0.0 elsewhere."""
        return (x > 0).astype(float)

    def sigmoid(self, x):
        """Return the logistic sigmoid of ``x``.

        The input is clipped to ``[-250, 250]`` so ``np.exp`` cannot overflow.
        """
        return 1 / (1 + np.exp(-np.clip(x, -250, 250)))

    def _activation_derivative(self, a):
        """Return the activation derivative expressed through the activation output ``a``.

        For ReLU, ``a > 0`` holds exactly where the pre-activation was positive,
        so the output can stand in for the pre-activation. For the sigmoid the
        derivative is ``a * (1 - a)``.
        """
        if self.activation == 'relu':
            return self.relu_derivative(a)
        return a * (1 - a)

    def forward(self, x):
        """Run a forward pass.

        Parameters
        ----------
        x : ndarray
            Input batch; its last dimension must equal ``layers[0]``.

        Returns
        -------
        list of ndarray
            ``[x, a_1, ..., a_L]``: the input followed by the output of each layer.
        """
        activations = [x]
        for w, b in zip(self.weights, self.biases):
            z = np.dot(activations[-1], w) + b
            if self.activation == 'relu':
                a = self.relu(z)
            else:
                # The sigmoid must act on this layer's pre-activation z,
                # not on the network input x.
                a = self.sigmoid(z)
            activations.append(a)
        return activations

    def backward(self, activations, y):
        """Back-propagate the output error and return per-layer gradients.

        Parameters
        ----------
        activations : list of ndarray
            The list returned by ``forward`` for the current batch.
        y : ndarray
            Targets with the same shape as ``activations[-1]``.

        Returns
        -------
        (list of ndarray, list of ndarray)
            Weight gradients and bias gradients, ordered like ``self.weights``
            and ``self.biases`` and averaged over the ``y.shape[0]`` samples.
        """
        m = y.shape[0]
        n_layers = len(self.weights)
        gradients_w = [None] * n_layers
        gradients_b = [None] * n_layers

        # Output-layer error signal: (prediction - target) * activation'.
        delta = (activations[-1] - y) * self._activation_derivative(activations[-1])

        for i in range(n_layers - 1, -1, -1):
            gradients_w[i] = np.dot(activations[i].T, delta) / m
            gradients_b[i] = np.sum(delta, axis=0, keepdims=True) / m

            if i > 0:
                # Push the error signal back through layer i to the previous layer.
                delta = np.dot(delta, self.weights[i].T)
                delta = delta * self._activation_derivative(activations[i])

        return gradients_w, gradients_b

    def fit(self, x, y, epochs=1000, batch_size=32):
        """Train with shuffled mini-batch gradient descent.

        Each epoch draws a fresh permutation of the samples, walks it in
        chunks of ``batch_size`` and applies one update per chunk. Every
        100th epoch (starting with epoch 0) the mean squared error over the
        full ``x``/``y`` is printed.

        Parameters
        ----------
        x : ndarray
            Training inputs, one sample per row.
        y : ndarray
            Training targets, one row per sample.
        epochs : int, default 1000
            Number of passes over the data.
        batch_size : int, default 32
            Maximum number of samples per update.
        """
        n_samples = len(x)
        for epoch in range(epochs):
            indices = np.random.permutation(n_samples)
            for i in range(0, n_samples, batch_size):
                batch_indices = indices[i:i+batch_size]
                batch_x = x[batch_indices]
                batch_y = y[batch_indices]

                activations = self.forward(batch_x)
                grad_w, grad_b = self.backward(activations, batch_y)

                for j in range(len(self.weights)):
                    self.weights[j] -= self.learning_rate * grad_w[j]
                    self.biases[j] -= self.learning_rate * grad_b[j]

            if epoch % 100 == 0:
                activations = self.forward(x)
                loss = np.mean((activations[-1] - y) ** 2)
                print(f"Epoch: {epoch}, Loss: {loss:.4f}")

    def predict(self, x):
        """Return the output-layer activations for ``x``."""
        activations = self.forward(x)
        return activations[-1]

In [10]:
# XOR truth table: each 2-bit input row paired with its expected output,
# with the targets stored as a (4, 1) column.
X_xor = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
Y_xor = np.array([[0], [1], [1], [0]])

# 2-4-1 network using the default ReLU activation.
deepmlp = DeepMLP(layers=[2, 4, 1], learning_rate=0.1)
deepmlp.fit(X_xor, Y_xor, epochs=100)

# Threshold the raw network outputs at 0.5 to get hard 0/1 labels.
deepmlp_xor = deepmlp.predict(X_xor)
pred_labels = (deepmlp_xor >= 0.5).astype(int)

# One report line per sample, marking whether the prediction matches the target.
for x, y_true, y_pred in zip(X_xor, Y_xor, pred_labels):
    print(f"  Input: {x}, Target: {y_true}, Prediction: {y_pred}, {'✓' if y_true == y_pred else '✗'}")

Epoch: 0, Loss: 0.3499
  Input: [0 0], Target: [0], Prediction: [0], ✓
  Input: [0 1], Target: [1], Prediction: [1], ✓
  Input: [1 0], Target: [1], Prediction: [1], ✓
  Input: [1 1], Target: [0], Prediction: [0], ✓
