# Multi-Layer Perceptron Demo

This file serves as a comprehensive overview of the implementation of the Multi-Layer Perceptron (MLP).

In [17]:
from mlp.activations import Sigmoid, Linear, ReLU, Softmax
from mlp.losses import MeanSquaredError, CrossEntropy
from mlp.optimizers import Optimizer
from mlp.mlp import MLP, MLPLayersBuilder
from mlp.constants import SEED

import numpy as np

from sklearn.datasets import load_iris
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier  # For comparison

# Profiling
from cProfile import Profile
from pstats import SortKey, Stats

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## XOR Problem

- non-linear decision boundary

<img src="xor.png" alt="XOR" width="400px">

(Credit: Aniruddha Karajgi - [How Neural Networks Solve the XOR Problem](https://towardsdatascience.com/how-neural-networks-solve-the-xor-problem-59763136bdd7))

In [2]:
# XOR truth table: one row per input pair, targets as a (4, 1) column vector.
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.array([[0], [1], [1], [0]])

### Vanilla Gradient Descent

- with one hidden layer

In [3]:
# 2 inputs -> 5 sigmoid units -> 1 sigmoid output.
layers = (
    MLPLayersBuilder()
    .add_input(2)
    .add_dense(5, Sigmoid())
    .add_dense(1, Sigmoid())
    .build()
)

# batch_size equals the number of samples, so each batch is the whole dataset.
mlp = MLP(
    layers,
    loss_function=MeanSquaredError(),
    n_epochs=10_000,
    learning_rate=0.9,
    print_frequency=1_000,
    batch_size=len(X),
)
print(mlp)
mlp.fit(X, y)
print(mlp.predict(X))

Multi-layer Perceptron
 - Layer 0: Input with Linear activation and 2 neurons
 - Layer 1: Dense with Sigmoid activation and 5 neurons
 - Layer 2: Dense with Sigmoid activation and 1 neurons
Epoch 1/10000 | Loss: [0.50015956]
Epoch 1001/10000 | Loss: [0.49999688]
Epoch 2001/10000 | Loss: [0.49998976]
Epoch 3001/10000 | Loss: [0.49997352]
Epoch 4001/10000 | Loss: [0.49991528]
Epoch 5001/10000 | Loss: [0.49932249]
Epoch 6001/10000 | Loss: [0.43331883]
Epoch 7001/10000 | Loss: [0.15882902]
Epoch 8001/10000 | Loss: [0.0210697]
Epoch 9001/10000 | Loss: [0.00880619]
[[0.04675456]
 [0.95023446]
 [0.950628  ]
 [0.05898119]]


- with two hidden layers (training takes longer, and the learning rate must also be set higher)

In [4]:
# 2 inputs -> 5 sigmoid -> 5 sigmoid -> 1 sigmoid output.
layers = (
    MLPLayersBuilder()
    .add_input(2)
    .add_dense(5, Sigmoid())
    .add_dense(5, Sigmoid())
    .add_dense(1, Sigmoid())
    .build()
)

# Full-dataset batches again, with twice the epochs and a much larger learning rate.
mlp = MLP(
    layers,
    loss_function=MeanSquaredError(),
    n_epochs=20_000,
    learning_rate=20,
    print_frequency=2_000,
    batch_size=len(X),
)
mlp.fit(X, y)
print(mlp.predict(X))

Epoch 1/20000 | Loss: [0.50063226]
Epoch 2001/20000 | Loss: [0.49999995]
Epoch 4001/20000 | Loss: [0.49999988]
Epoch 6001/20000 | Loss: [0.49999978]
Epoch 8001/20000 | Loss: [0.49999961]
Epoch 10001/20000 | Loss: [0.49999923]
Epoch 12001/20000 | Loss: [0.49999802]
Epoch 14001/20000 | Loss: [0.49998582]
Epoch 16001/20000 | Loss: [0.00036631]
Epoch 18001/20000 | Loss: [0.00010878]
[[0.0059796 ]
 [0.99496663]
 [0.99467777]
 [0.00606359]]


- with cross-entropy (more appropriate loss yields more accurate results)

In [5]:
# Same single-hidden-layer topology as the first XOR run; only the loss changes.
layers = (
    MLPLayersBuilder()
    .add_input(2)
    .add_dense(5, Sigmoid())
    .add_dense(1, Sigmoid())
    .build()
)

mlp = MLP(
    layers,
    loss_function=CrossEntropy(),
    n_epochs=10_000,
    learning_rate=0.9,
    print_frequency=1_000,
    batch_size=len(X),
)
mlp.fit(X, y)
print(mlp.predict(X))

Epoch 1/10000 | Loss: 1.4067851199793036
Epoch 1001/10000 | Loss: 1.3862286448305612
Epoch 2001/10000 | Loss: 0.09770425849200409
Epoch 3001/10000 | Loss: 0.015338943583119637
Epoch 4001/10000 | Loss: 0.007875949782712728
Epoch 5001/10000 | Loss: 0.005227523662116129
Epoch 6001/10000 | Loss: 0.003888745652833281
Epoch 7001/10000 | Loss: 0.0030856721208831657
Epoch 8001/10000 | Loss: 0.00255223684094427
Epoch 9001/10000 | Loss: 0.0021730116011985786
[[0.00106148]
 [0.99900329]
 [0.99910735]
 [0.0011468 ]]


- with cross-entropy and two hidden layers

In [6]:
# Two hidden sigmoid layers of 5 units each, trained with cross-entropy.
layers = (
    MLPLayersBuilder()
    .add_input(2)
    .add_dense(5, Sigmoid())
    .add_dense(5, Sigmoid())
    .add_dense(1, Sigmoid())
    .build()
)

mlp = MLP(
    layers,
    loss_function=CrossEntropy(),
    n_epochs=20_000,
    learning_rate=5,
    print_frequency=2_000,
    batch_size=len(X),
)
mlp.fit(X, y)
print(mlp.predict(X))

Epoch 1/20000 | Loss: 1.3163776701063878
Epoch 2001/20000 | Loss: 1.3862939592350552
Epoch 4001/20000 | Loss: 1.3862949375159754
Epoch 6001/20000 | Loss: 1.3862961577317217
Epoch 8001/20000 | Loss: 1.3862982396128865
Epoch 10001/20000 | Loss: 1.3863026869540414
Epoch 12001/20000 | Loss: 1.386307854173105
Epoch 14001/20000 | Loss: 0.001685145353581686
Epoch 16001/20000 | Loss: 0.00042766392481325766
Epoch 18001/20000 | Loss: 0.00022687644638765556
[[1.87916076e-04]
 [9.99924926e-01]
 [9.99924883e-01]
 [6.94430793e-05]]


### Mini-Batch Stochastic Gradient Descent

- we can notice a significant reduction in learning time

In [7]:
# Single hidden layer; the difference from the earlier runs is batch_size=1.
layers = (
    MLPLayersBuilder()
    .add_input(2)
    .add_dense(5, Sigmoid())
    .add_dense(1, Sigmoid())
    .build()
)

mlp = MLP(
    layers,
    loss_function=CrossEntropy(),
    n_epochs=1_000,
    learning_rate=0.9,
    print_frequency=100,
    batch_size=1,
)
mlp.fit(X, y)
print(mlp.predict(X))

Epoch 1/1000 | Loss: 1.6714886430374924
Epoch 101/1000 | Loss: 1.3650505844525338
Epoch 201/1000 | Loss: 1.1129079741919605
Epoch 301/1000 | Loss: 0.31537916754668416
Epoch 401/1000 | Loss: 0.04218939371778174
Epoch 501/1000 | Loss: 0.021854677011071036
Epoch 601/1000 | Loss: 0.014538872185703725
Epoch 701/1000 | Loss: 0.010807560612850186
Epoch 801/1000 | Loss: 0.008559904725761808
Epoch 901/1000 | Loss: 0.00706486227628214
[[0.00221109]
 [0.9970236 ]
 [0.99697411]
 [0.00523788]]


### Mini-Batch Stochastic Gradient Descent with Momentum

- again a significant reduction in learning time

In [8]:
# Single hidden layer, batch_size=1, now with the momentum optimizer.
layers = (
    MLPLayersBuilder()
    .add_input(2)
    .add_dense(5, Sigmoid())
    .add_dense(1, Sigmoid())
    .build()
)

mlp = MLP(
    layers,
    loss_function=CrossEntropy(),
    optimizer=Optimizer.SGD_MOMENTUM,
    n_epochs=100,
    learning_rate=0.5,
    print_frequency=10,
    batch_size=1,
    momentum=0.9,
)
mlp.fit(X, y)
print(mlp.predict(X))

Epoch 1/100 | Loss: 0.8961455735130106
Epoch 11/100 | Loss: 1.2533403574210196
Epoch 21/100 | Loss: 1.1675980006302695
Epoch 31/100 | Loss: 0.886864352843159
Epoch 41/100 | Loss: 0.8110626725144197
Epoch 51/100 | Loss: 0.5712211272437716
Epoch 61/100 | Loss: 0.09101475544093689
Epoch 71/100 | Loss: 0.03786048573408496
Epoch 81/100 | Loss: 0.024727643261475016
Epoch 91/100 | Loss: 0.018424634668718878
[[0.00705071]
 [0.99255338]
 [0.99253324]
 [0.01193854]]


### ADAGrad Optimizer

In [9]:
# Two hidden sigmoid layers, trained with the ADAGRAD optimizer on shuffled single-sample batches.
layers = (
    MLPLayersBuilder()
    .add_input(2)
    .add_dense(5, Sigmoid())
    .add_dense(5, Sigmoid())
    .add_dense(1, Sigmoid())
    .build()
)

# NOTE(review): momentum is passed here as well; whether ADAGRAD reads it is not visible
# from this notebook - confirm against the MLP implementation.
mlp = MLP(
    layers,
    loss_function=CrossEntropy(),
    optimizer=Optimizer.ADAGRAD,
    n_epochs=10000,
    learning_rate=0.05,
    print_frequency=1000,
    batch_size=1,
    momentum=0.9,
    shuffle=True,
)
mlp.fit(X, y)
print(mlp.predict(X))

Epoch 1/10000 | Loss: 1.4626224155806873
Epoch 1001/10000 | Loss: 1.3873309186595386
Epoch 2001/10000 | Loss: 1.3863562324303977
Epoch 3001/10000 | Loss: 1.3229542803125782
Epoch 4001/10000 | Loss: 1.1275731991225544
Epoch 5001/10000 | Loss: 1.0112479974960298
Epoch 6001/10000 | Loss: 0.9507905436284405
Epoch 7001/10000 | Loss: 0.8464785107348616
Epoch 8001/10000 | Loss: 0.665893381829449
Epoch 9001/10000 | Loss: 0.47828222132909143
[[0.03621863]
 [0.84495366]
 [0.84538056]
 [0.16930388]]


### RMSProp Optimizer

- note that the initial cache (`Layer._weight_v` and `Layer._bias_v`) is initialized to `0` (Keras and PyTorch also start from `0`; the legacy TensorFlow 1.x `RMSPropOptimizer` starts from `1`)
    - due to this decision, the learning rate must be set to a lower value

In [10]:
# Single hidden layer trained with the RMSPROP optimizer on single-sample batches.
layers = (
    MLPLayersBuilder()
    .add_input(2)
    .add_dense(5, Sigmoid())
    .add_dense(1, Sigmoid())
    .build()
)

mlp = MLP(
    layers,
    loss_function=CrossEntropy(),
    optimizer=Optimizer.RMSPROP,
    n_epochs=1000,
    learning_rate=0.05,
    print_frequency=100,
    batch_size=1,
    momentum=0.9,
)
mlp.fit(X, y)
print(mlp.predict(X))

Epoch 1/1000 | Loss: 1.4646204715448579
Epoch 101/1000 | Loss: 0.864252842853261
Epoch 201/1000 | Loss: 0.8499606271877937
Epoch 301/1000 | Loss: 0.8447740392373606
Epoch 401/1000 | Loss: 0.8421744885468587
Epoch 501/1000 | Loss: 0.8406749740472536
Epoch 601/1000 | Loss: 0.8396413848587594
Epoch 701/1000 | Loss: 0.020630150958757434
Epoch 801/1000 | Loss: 0.004501011888488405
Epoch 901/1000 | Loss: 0.002190113940384297
[[3.28362961e-04]
 [9.99328577e-01]
 [9.99344572e-01]
 [1.87349269e-02]]


### Adam Optimizer

In [11]:
# Two hidden sigmoid layers trained with the ADAM optimizer on single-sample batches.
layers = (
    MLPLayersBuilder()
    .add_input(2)
    .add_dense(5, Sigmoid())
    .add_dense(5, Sigmoid())
    .add_dense(1, Sigmoid())
    .build()
)

mlp = MLP(
    layers,
    loss_function=CrossEntropy(),
    optimizer=Optimizer.ADAM,
    n_epochs=1000,
    learning_rate=0.01,
    print_frequency=100,
    batch_size=1,
    momentum=0.9,
)
mlp.fit(X, y)
print(mlp.predict(X))

Epoch 1/1000 | Loss: 1.4448380950968098
Epoch 101/1000 | Loss: 1.3850383090736456
Epoch 201/1000 | Loss: 1.3852650591116098
Epoch 301/1000 | Loss: 1.2150826084504405
Epoch 401/1000 | Loss: 0.6393716795037153
Epoch 501/1000 | Loss: 0.19141023764266474
Epoch 601/1000 | Loss: 0.066097244413905
Epoch 701/1000 | Loss: 0.03314648804662379
Epoch 801/1000 | Loss: 0.019897991653148824
Epoch 901/1000 | Loss: 0.013153088740467493
[[0.00170673]
 [0.99537612]
 [0.99539795]
 [0.00784896]]


## Iris Dataset

### SGD Optimizer

In [26]:
# Load Iris; the targets are reshaped to a column so OneHotEncoder can consume them.
data = load_iris()
X = data.data
y = data.target.reshape(-1, 1)

# Stratified 75/25 split with a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=SEED, stratify=y
)

# Encoder and scaler are fitted on the training split only, then applied to both splits.
ohe = OneHotEncoder(sparse_output=False).fit(y_train)
y_train, y_test = ohe.transform(y_train), ohe.transform(y_test)

mms = MinMaxScaler().fit(X_train)
X_train, X_test = mms.transform(X_train), mms.transform(X_test)

# 4 features -> 10 sigmoid units -> 3-way softmax.
layers = (
    MLPLayersBuilder()
    .add_input(4)
    .add_dense(10, Sigmoid())
    .add_dense(3, Softmax())
    .build()
)
mlp = MLP(
    layers,
    loss_function=CrossEntropy(),
    optimizer=Optimizer.SGD_MOMENTUM,
    n_epochs=500,
    learning_rate=0.01,
    print_frequency=100,
    batch_size=8,
    momentum=0.9,
    shuffle=False,
)
mlp.fit(X_train, y_train)
y_pred = mlp.predict(X_test)

# Compare decoded true labels with the index of the largest predicted output per row.
print(
    "Accuracy:",
    accuracy_score(
        y_true=ohe.inverse_transform(y_test)[:, 0],
        y_pred=np.argmax(y_pred, axis=1),
    ),
)

Epoch 1/500 | Loss: 122.91449639267151
Epoch 101/500 | Loss: 35.06830216300894
Epoch 201/500 | Loss: 16.515480014895942
Epoch 301/500 | Loss: 10.997864307437471
Epoch 401/500 | Loss: 8.747593837531937
Accuracy: 0.9736842105263158


In [13]:
# Show the first ten rows of the network output, rounded to two decimals.
print("First 10 predictions:")
y_pred[:10].round(2)

First 10 predictions:


array([[0.99, 0.01, 0.  ],
       [0.  , 0.92, 0.08],
       [0.01, 0.99, 0.  ],
       [0.  , 0.98, 0.02],
       [0.99, 0.01, 0.  ],
       [0.  , 0.96, 0.03],
       [0.  , 0.43, 0.57],
       [0.  , 0.01, 0.99],
       [0.  , 0.02, 0.98],
       [0.  , 0.01, 0.99]])

In [14]:
# scikit-learn reference model for the SGD-with-momentum run above
mlp = MLPClassifier(
    hidden_layer_sizes=(10,),  # One hidden layer with 10 units
    activation='logistic',  # Sigmoid activation for the hidden layer
    solver='sgd',  # Stochastic Gradient Descent
    max_iter=500,  # Maximum number of epochs
    learning_rate_init=0.5,
    momentum=0.9,
    random_state=42,
    batch_size=8
)

# y_train is one-hot encoded, so scikit-learn fits this as a multilabel problem.
mlp.fit(X_train, y_train)

# Score on per-class probabilities instead of predict(): with multilabel targets,
# predict() thresholds every output independently, so a row can come back all-zero
# (or with several ones) and argmax would then silently fall back to class 0 / the
# first flagged class.
y_pred_proba = mlp.predict_proba(X_test)

print("Accuracy:", accuracy_score(y_true=ohe.inverse_transform(y_test).T[0],
                                  y_pred=np.argmax(y_pred_proba, axis=1)))

Accuracy: 0.9736842105263158


### Adam Optimizer

In [15]:
# Same 4 -> 10 -> 3 topology as the SGD run, trained with the ADAM optimizer.
layers = (
    MLPLayersBuilder()
    .add_input(4)
    .add_dense(10, Sigmoid())
    .add_dense(3, Softmax())
    .build()
)
mlp = MLP(
    layers,
    loss_function=CrossEntropy(),
    optimizer=Optimizer.ADAM,
    n_epochs=1000,
    learning_rate=0.01,
    print_frequency=100,
    batch_size=16,
    momentum=0.9,
)
mlp.fit(X_train, y_train)
y_pred = mlp.predict(X_test)

# Compare decoded true labels with the index of the largest predicted output per row.
print(
    "Accuracy:",
    accuracy_score(
        y_true=ohe.inverse_transform(y_test)[:, 0],
        y_pred=np.argmax(y_pred, axis=1),
    ),
)

Epoch 1/1000 | Loss: 122.39550140647741
Epoch 101/1000 | Loss: 12.341061873161818
Epoch 201/1000 | Loss: 7.0985408458733
Epoch 301/1000 | Loss: 5.815926382614126
Epoch 401/1000 | Loss: 5.230317150056784
Epoch 501/1000 | Loss: 4.884673764719647
Epoch 601/1000 | Loss: 4.653043718285834
Epoch 701/1000 | Loss: 4.484943715656308
Epoch 801/1000 | Loss: 4.355734765046648
Epoch 901/1000 | Loss: 4.25174120642312
Accuracy: 0.9473684210526315


In [16]:
# scikit-learn reference model for the Adam run above
mlp = MLPClassifier(
    hidden_layer_sizes=(10,),  # One hidden layer with 10 units
    activation='logistic',  # Sigmoid activation for the hidden layer
    solver='adam',  # Adam optimizer
    max_iter=1000,  # Maximum number of epochs
    learning_rate_init=0.01,
    # `momentum` is omitted on purpose: scikit-learn only uses it when solver='sgd'.
    random_state=42,
    batch_size=16,
    shuffle=False
)

# y_train is one-hot encoded, so scikit-learn fits this as a multilabel problem.
mlp.fit(X_train, y_train)

# Score on per-class probabilities instead of predict(): with multilabel targets,
# predict() thresholds every output independently, so a row can come back all-zero
# (or with several ones) and argmax would then silently fall back to class 0 / the
# first flagged class.
y_pred_proba = mlp.predict_proba(X_test)

print("Loss:", mlp.best_loss_)
print("Accuracy:", accuracy_score(y_true=ohe.inverse_transform(y_test).T[0],
                                  y_pred=np.argmax(y_pred_proba, axis=1)))

Loss: 0.1084840638154925
Accuracy: 0.9473684210526315


### Performance Profiling

In [24]:
def demo():
    """Train the Iris SGD-with-momentum network for 10,000 epochs and print test accuracy.

    Reads ``X_train``, ``y_train``, ``X_test``, ``y_test`` and ``ohe`` from the
    notebook's global namespace. Wrapping the train/predict/score cycle in one
    function lets the profiling cell measure it as a single call.
    """
    architecture = (
        MLPLayersBuilder()
        .add_input(4)
        .add_dense(10, Sigmoid())
        .add_dense(3, Softmax())
        .build()
    )
    network = MLP(
        architecture,
        loss_function=CrossEntropy(),
        optimizer=Optimizer.SGD_MOMENTUM,
        n_epochs=10_000,
        learning_rate=0.01,
        print_frequency=1_000,
        batch_size=8,
        momentum=0.9,
        shuffle=False,
    )
    network.fit(X_train, y_train)
    predictions = network.predict(X_test)

    # Decode the one-hot test targets and compare with the arg-max class per row.
    true_labels = ohe.inverse_transform(y_test)[:, 0]
    predicted_labels = np.argmax(predictions, axis=1)
    print("Accuracy:", accuracy_score(y_true=true_labels, y_pred=predicted_labels))

In [28]:
# Profiling recipe from https://realpython.com/python-profiling/

# Run demo() under cProfile, then print the collected stats sorted by call count.
with Profile() as profile:
    demo()
    (
        Stats(profile)
        .strip_dirs()
        .sort_stats(SortKey.CALLS)
        .print_stats()
    )

Epoch 1/10000 | Loss: 122.91449639267151
Epoch 1001/10000 | Loss: 5.544500233816098
Epoch 2001/10000 | Loss: 4.509193578863244
Epoch 3001/10000 | Loss: 4.10527220264018
Epoch 4001/10000 | Loss: 3.8728374558054828
Epoch 5001/10000 | Loss: 3.718829683253311
Epoch 6001/10000 | Loss: 3.608919951260028
Epoch 7001/10000 | Loss: 3.526448931972931
Epoch 8001/10000 | Loss: 3.462159883645847
Epoch 9001/10000 | Loss: 3.410465374150089
Accuracy: 0.9210526315789473
         25484857 function calls in 30.441 seconds

   Ordered by: call count

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
  1680000    0.359    0.000    0.532    0.000 enum.py:769(__hash__)
  1680000    0.173    0.000    0.173    0.000 {built-in method builtins.hash}
  1400283    0.184    0.000    0.184    0.000 {built-in method builtins.isinstance}
  1120305    3.078    0.000    3.078    0.000 {method 'reduce' of 'numpy.ufunc' objects}
  1120004    0.354    0.000    1.126    0.000 {built-in method builtins.n