# Multi-Layer Perceptron for the XOR problem in MLX

Markus Enzweiler, markus.enzweiler@hs-esslingen.de

This is a demo used in a Computer Vision & Machine Learning lecture. Feel free to use and contribute.

We build and train a perceptron to act as a simple XOR gate with two inputs and one output. 
XOR gates have the following behavior:

- If both inputs are identical, the output is 0 (off).
- If the two inputs differ, the output is 1 (on).

| observation # | input 1 | input 2 | output |
|---------------|---------|---------|--------|
| 0             | 0       | 0       | 0      |
| 1             | 0       | 1       | 1      |
| 2             | 1       | 0       | 1      |
| 3             | 1       | 1       | 0      |


**Note: This requires a machine with an Apple SoC, e.g. M1/M2/M3 etc.**

See: https://github.com/ml-explore/mlx

## Setup

Adapt `package_path` to point to the directory containing this notebook.

In [2]:
# Imports
import sys
import os

In [3]:
# Package path: directory that is searched for requirements.txt (local checkout)
package_path = "./"
print("Package path: {}".format(package_path))

Package path: ./


In [4]:
# Install requirements in the current Jupyter kernel.
# `sys.executable` is the Python interpreter running this kernel, so pip
# installs into the same environment the notebook imports from.
# NOTE: the leading `!` is IPython shell-escape syntax; this cell only runs
# inside Jupyter/IPython, not as a plain Python script.
req_file = os.path.join(package_path, "requirements.txt")
if os.path.exists(req_file):
    !{sys.executable} -m pip install -r {req_file}
else:
    # Nothing is installed in this case; only a message is printed.
    print(f"Requirements file not found: {req_file}")



In [5]:
# Now we should be able to import the additional packages
import numpy as np
import matplotlib.pyplot as plt
import mlx
import mlx.core as mx
import mlx.nn as nn
import mlx.optimizers as optim

# Set the random seed for reproducibility: NumPy's and MLX's random number
# generators are both seeded with the same fixed value.
np.random.seed(42)
mx.random.seed(42)


## Create the training data

In [6]:
# Training data for the XOR problem, first defined as NumPy arrays:
# each row of X is one observation (input 1, input 2) and Y holds the
# corresponding XOR output.
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
Y = np.array([0, 1, 1, 0])

# Convert both NumPy arrays to float32 MLX arrays
X, Y = (mx.array(values, dtype=mx.float32) for values in (X, Y))

# Show every sample next to its label
print("Training data X with labels y:")
for i in range(len(X)):
    print(f"{X[i]} -> {Y[i]}")

Training data X with labels y:
array([0, 0], dtype=float32) -> array(0, dtype=float32)
array([0, 1], dtype=float32) -> array(1, dtype=float32)
array([1, 0], dtype=float32) -> array(1, dtype=float32)
array([1, 1], dtype=float32) -> array(0, dtype=float32)


# Define the Multi-Layer Perceptron (MLP)

## MLP class

In [7]:
class MultiLayerPerceptron(nn.Module):
    """Perceptron with one hidden layer and a single sigmoid output.

    The network applies two linear layers, each followed by a sigmoid:
    input -> layer1 -> sigmoid -> layer2 -> sigmoid -> output.
    """

    def __init__(self, num_inputs, num_hidden_layer_neurons=2):
        """Create the two linear layers.

        Args:
            num_inputs: Number of input features fed into the first layer.
            num_hidden_layer_neurons: Number of neurons in the hidden layer
                (defaults to 2).
        """
        # Initialise the nn.Module base class before adding layers
        super().__init__()

        # Input -> hidden layer
        self.layer1 = nn.Linear(
            input_dims=num_inputs,
            output_dims=num_hidden_layer_neurons,
        )
        # Hidden layer -> single output value
        self.layer2 = nn.Linear(
            input_dims=num_hidden_layer_neurons,
            output_dims=1,
        )
        # Activation function used after both layers
        self.sigmoid = mx.sigmoid

    def __call__(self, x):
        """Make the model callable; simply delegates to `forward`."""
        return self.forward(x)

    def forward(self, x):
        """Run the forward pass and return the output of the last sigmoid."""
        hidden = self.sigmoid(self.layer1(x))
        output = self.sigmoid(self.layer2(hidden))
        return output

    

# MLP training with gradient descent

## Training and testing functions

In [8]:
# Training function
def train(model, X, Y, optimizer, loss_and_grad_fn, num_epochs):
    """Train `model` with one parameter update per training sample.

    Args:
        model: Model to train; it is passed to `loss_and_grad_fn` and to
            `optimizer.update`, and its `parameters()` are evaluated after
            every update.
        X: Indexable collection of training inputs; `X[i]` is one sample.
        Y: Indexable collection of labels; `Y[i]` is reshaped to shape (1,)
            before it is passed to `loss_and_grad_fn`.
        optimizer: Object providing `update(model, gradients)` and a `state`
            attribute.
        loss_and_grad_fn: Callable `(model, x, y) -> (loss, gradients)`.
        num_epochs: Number of passes over the training data.

    Returns:
        None. Progress is printed roughly ten times during training and
        always for the last epoch.
    """
    # Report about ten times per run. The interval is clamped to at least 1
    # so that runs with fewer than 10 epochs do not end up computing
    # `epoch % 0`, which raises ZeroDivisionError.
    report_interval = max(1, num_epochs // 10)

    # Loop over epochs
    for epoch in range(num_epochs):

        # Reset the accumulated (summed) per-sample loss for this epoch
        acc_loss = 0

        # Loop over all training data, one sample at a time
        for i in range(len(X)):

            # Forward and backward pass
            loss, gradients = loss_and_grad_fn(model, X[i], Y[i].reshape(1,))
            acc_loss += loss

            # Update the model with the gradients. So far no computation has happened.
            optimizer.update(model, gradients)

            # Compute the new parameters and also the new optimizer state.
            mx.eval(model.parameters(), optimizer.state)

        # Print the loss accumulated over the epoch once in a while
        if epoch % report_interval == 0 or epoch == num_epochs - 1:
            print(f"Epoch {epoch:5d}: loss = {mx.mean(acc_loss).item():.5f}")

In [9]:
# Testing function
def test(model, X, Y):
    """Print the model's prediction next to the label for every sample.

    Args:
        model: Callable applied to one sample `X[i]` at a time.
        X: Indexable collection of input samples.
        Y: Indexable collection of labels, aligned with `X`.
    """
    print("Testing ...")
    for idx in range(len(X)):
        sample = X[idx]
        prediction = model(sample)
        print(f"{sample} -> {prediction} (label: {Y[idx]})")

## Train and test

In [10]:
# Hyperparameters: number of training epochs and learning rate (eta)
num_epochs = 10_000
eta = 0.25

# Multi-layer perceptron for our XOR problem (two inputs)
model = MultiLayerPerceptron(num_inputs=2)
# mlx uses lazy evaluation, so explicitly evaluate the initial parameters
mx.eval(model.parameters())

# Loss function
def loss_fn(model, X, y):
    """Return the mean squared error between `model(X)` and the target `y`."""
    prediction = model(X)
    return nn.losses.mse_loss(prediction, y)

# Stochastic gradient descent (SGD) optimizer using the learning rate eta
optimizer = optim.SGD(learning_rate=eta)

# Function that returns both the loss and the gradients for the model
loss_and_grad_fn = nn.value_and_grad(model, loss_fn)

# Train the model on the full data set
train(
    model=model,
    X=X,
    Y=Y,
    optimizer=optimizer,
    loss_and_grad_fn=loss_and_grad_fn,
    num_epochs=num_epochs,
)

# Print the trained model's prediction for every training sample
test(model=model, X=X, Y=Y)

NameError: name 'perceptron' is not defined

# Visualize decision boundary

In [None]:
import matplotlib.cm as cm
import matplotlib.gridspec as gridspec


def show_decision_boundary(model, data, labels, subplot_spec=None):
    """Plot the model output over a 2D grid together with the labelled data.

    Args:
        model: Callable evaluated once on all grid points, passed as a
            float32 MLX array with one (x, y) coordinate pair per row.
        data: Samples; converted with `np.array` and indexed as `[:, 0]`
            and `[:, 1]`, i.e. two coordinates per sample.
        labels: Class label per sample; converted with `np.array`.
        subplot_spec: Optional matplotlib subplot spec to draw into. When
            None, a new 1x2 grid (plot + colorbar) is created.
    """
    points = np.array(data)
    targets = np.array(labels)

    # Wide axis for the plot, narrow axis for the colorbar
    width_ratios = (15, 1)
    if subplot_spec is not None:
        grid = gridspec.GridSpecFromSubplotSpec(
            1, 2, subplot_spec=subplot_spec, width_ratios=width_ratios
        )
    else:
        grid = gridspec.GridSpec(1, 2, width_ratios=width_ratios)

    ax = plt.subplot(grid[0])
    ax.set_title('Dataset and decision function')

    # Plot range: extent of the data plus a margin of 1 on every side
    x_min, x_max = points[:, 0].min() - 1, points[:, 0].max() + 1
    y_min, y_max = points[:, 1].min() - 1, points[:, 1].max() + 1
    step = 0.01  # small step size for a high-resolution grid
    xx, yy = np.meshgrid(
        np.arange(x_min, x_max, step),
        np.arange(y_min, y_max, step),
    )

    # Evaluate the model on every grid point, then restore the grid shape
    grid_points = np.c_[xx.ravel(), yy.ravel()]
    Z = model(mx.array(grid_points, dtype=mx.float32)).reshape(xx.shape)

    # Many contour levels give smooth color transitions between 0 and 1
    levels = np.linspace(0, 1, 100)
    ctr = ax.contourf(xx, yy, np.array(Z), levels, cmap=cm.gray, vmin=0, vmax=1)

    # One scatter series per class; colors are assigned in the order of the
    # sorted unique labels (only two colors are defined).
    colors = ['red', 'blue']
    flat_targets = targets.flatten()
    for color_index, class_value in enumerate(np.unique(targets)):
        color = colors[color_index]
        members = np.where(flat_targets == class_value)
        ax.scatter(
            points[members, 0],
            points[members, 1],
            color=color,
            linewidth=0,
            label='Class %d (y=%d)' % (class_value, class_value),
        )
    ax.legend()
    ax.set_xlim((x_min, x_max))
    ax.set_ylim((y_min, y_max))

    # Colorbar in the narrow second grid cell, with ticks every 0.1
    colorbar_axis = plt.subplot(grid[1])
    cbar = plt.colorbar(ctr, cax=colorbar_axis)
    cbar.set_ticks(np.arange(0, 1.1, 0.1))
    cbar.set_label('Decision value')

In [None]:
# Plot the decision boundary of the trained network over the XOR data.
# The trained MultiLayerPerceptron is bound to `model`; no variable named
# `perceptron` exists in this notebook.
show_decision_boundary(model, X, Y)