In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Lab 9 - Multi-layer Perceptron Forward Pass & Backpropagation

## Part I
For this exercise you will implement a simple 2-layer perceptron with the forward pass and the backpropagation to learn the weights

For the first part you'll build and train a 2-layer neural network that predicts the prices of houses, using the usual Boston housing dataset.

In [3]:
path = 'data/BostonHousing.txt'
boston = pd.read_csv(path,  delim_whitespace=True)

  boston = pd.read_csv(path,  delim_whitespace=True)


As usual, consider the MEDV as your target variable. 
* Split the data into training, validation and testing (70,15,15)%
* Experiment with different number of neurons per layer for your network, using the validation set

In [4]:
X = boston.values[:,:-1]
y = boston.iloc[:,-1:]
X = np.array(X, dtype=float)
y = np.array(y, dtype=float)
X_tv, X_test, y_tv, y_test = train_test_split(X, y,test_size=0.15, random_state=189, shuffle=True)
X_train, X_validation, y_train, y_validation = train_test_split(X_tv, y_tv, test_size=0.15/0.85 , random_state=123, shuffle=True) 

In [5]:
def sigmoid_activation(z):
    return 1 / (1 + np.exp(-z))

def sigmoid_inverted(z):
    sig = sigmoid_activation(z)
    return sig*(1 - sig)

def identity(z):
    return z

def standardize(X, mean=None, std=None):
    X = np.array(X, dtype=float)
    if mean is None or std is None:
        mean = np.mean(X, axis=0)  # Per feature
        std = np.std(X, axis=0)
        std = np.where(std == 0, 1, std)  # Avoid division by zero
    X_scaled = (X - mean) / std
    return X_scaled, mean, std
    

In [6]:
def two_layer_perceptron(X, W_list, activation, outputActivation):
    """
    Implements the forward pass of a two-layer fully connected perceptron.
    
    Parameters
    ----------
    X : a 2-dimensional array
        the input data
    W_list : list
        An list with teh weights matrices
    activation : function
        the activation function to be used for the hidden layer
    outputActivation : funcition
        the activation function to be used for the output layer
    Returns
    -------
    y_pred : float
        the output of the computation of the forward pass of the network
    """
    # pushing the column of bias on X:
    X = np.array(X, dtype=float)
    X_bias = np.c_[X, np.ones(X.shape[0])]
    W_1 = W_list[0]
    W_2 = W_list[1]

    A1 = X_bias @ W_1
    Z1 = activation(A1)
    Z1 = np.c_[Z1, np.ones(Z1.shape[0])]
    Y = Z1 @ W_2

    y_pred = outputActivation(Y)
    return y_pred



In [7]:
def RMSE_foward_pass(y_pred, y_target):
    if (type(y_target) == pd.core.frame.DataFrame):
        y_target = np.array(y_target)

    dif = y_pred - y_target
    dif_squared = dif * dif
    RMSE = (dif_squared.sum() / dif_squared.size) ** (1 / 2)

    return RMSE


In [None]:
def train_two_layer_perceptron(X, T, activation, outputActivation, dim_input, dim_hidden, dim_output, maxIter=100, learning_rate = 1e-3):
    """
    Implements the forward pass of a two-layer fully connected perceptron.
    
    Parameters
    ----------
    X : a 2-dimensional array
        the input data
    activation : function
        the activation function to be used for the hidden layer
    dim_input : int
        the dimensionality of the input layer
    dim_hidden : int
        the dimensionality of the hidden layer
    dim_output : int
        the dimensionality of the output layer
    maxIter : int
        the max number of iterations
    Returns
    -------
    y_pred : float
        the output of the computation of the forward pass of the network
    """
    # pushing the column of bias on X:
    X = np.array(X, dtype=float)
    T = np.array(T, dtype=float)
    X_bias = np.c_[X, np.ones(X.shape[0])]
    
    # Initializing the weights with random:
    # We have two layers, so
    np.random.seed(42)
    W_1 = np.random.randn(dim_input + 1, dim_hidden) * np.sqrt(2.0 / (dim_input + dim_hidden))
    W_2 = np.random.randn(dim_hidden + 1, dim_output) * np.sqrt(2.0 / (dim_hidden + dim_output))
    
    print(X_bias.shape)
    print(W_1.shape)
    for iteration in range(maxIter):
        # Computing the foward pass:
        A1 = X_bias @ W_1   
        Z1 = activation(A1)
        Z1 = np.c_[Z1, np.ones(Z1.shape[0])]
        A2 = Z1 @ W_2
        Y = outputActivation(A2)

        # Computing the errors 'delta': 
        Delta_2 = Y - T
        # Delta_1 = sigmoid_inverted(A1)
        Delta_1 = (Delta_2 @ W_2[:-1,:].T) * sigmoid_inverted(A1)
       
        # Adjusting the wheights:

        W_2 -= learning_rate*(Z1.T @ Delta_2)
        W_1 -= learning_rate*(X_bias.T @ Delta_1)

    return Y, [W_1, W_2] 
        

dim_input = X_train.shape[1]
dim_hidden = 32
dim_output = 1
X_train_scaled, train_mean, train_std = standardize(X_train)
y, W_list = train_two_layer_perceptron(X_train_scaled, y_train, sigmoid_activation, identity, dim_input, dim_hidden, dim_output, maxIter=1000, learning_rate=1e-4)


In [9]:
# Testing:
X_test_stand, _, _ = standardize(X_test, mean=train_mean, std=train_std)
y_pred_test = two_layer_perceptron(X_test_stand, W_list, sigmoid_activation, identity)
print(RMSE_foward_pass(y_test,y_pred_test))


3.4350121192066574


In [10]:
dim_input = X_train.shape[1]
dim_output = 1

for dim_hidden in [1, 10, 20, 32, 64, 128]:
    X_train_scaled, train_mean, train_std = standardize(X_train)
    y, W_list = train_two_layer_perceptron(X_train_scaled, y_train, sigmoid_activation, identity, dim_input, dim_hidden, dim_output, maxIter=1000, learning_rate=1e-3)
    # Testing:
    X_test_stand, _, _ = standardize(X_test, mean=train_mean, std=train_std)
    y_pred_test = two_layer_perceptron(X_test_stand, W_list, sigmoid_activation, identity)
    print(f"Neurônios ocultos: {dim_hidden} = {RMSE_foward_pass(y_test,y_pred_test)}")

Neurônios ocultos: 1 = 5.823651888537768
Neurônios ocultos: 10 = 3.5719031655752533
Neurônios ocultos: 20 = 3.3323005590158536
Neurônios ocultos: 32 = 5.970269468562682
Neurônios ocultos: 64 = 8.11303159997741


  return 1 / (1 + np.exp(-z))


Neurônios ocultos: 128 = 21.92576382372181


## Part II 

For this exercise you will build and train a 2-layer neural network that predicts the exact digit from a hand-written image, using the MNIST dataset. 
For this exercise, add weight decay to your network.

In [11]:
from sklearn.datasets import load_digits

In [12]:
digits = load_digits()

In [13]:
X = digits.data
y = digits.target

In [14]:
X.shape
X = np.array(X, dtype=float)

Again, you will split the data into training, validation and testing.

In [15]:
X_tv2, X_test2, y_tv2, y_test2 = train_test_split(X, y,test_size=0.15, random_state=189, shuffle=True)
X_train2, X_validation2, y_train2, y_validation2 = train_test_split(X_tv2, y_tv2, test_size=0.15/0.85 , random_state=123, shuffle=True) 

In [16]:
def softmax_activation(z):
    z = np.exp(z)
    softmax = z / np.sum(z, axis=1, keepdims=True)

    return softmax

def adjust_y_target(y_target):
    if (type(y_target) == pd.core.frame.DataFrame):
        y_target = np.array(y_target)
    Y_target = np.zeros(shape=(y_target.shape[0], 10))
    for l in range(y_target.shape[0]):
        Y_target[l, y_target[l]] = 1.
    
    return Y_target


def cross_entropy_loss(Y_pred, Y_target):
    E = -np.sum(Y_target * np.log(Y_pred)) / Y_pred.shape[0]
    return E

In [17]:
def multiclass_two_layer_perceptron_weight_decay(X, T, dim_hidden, dim_output,activation, outputActivation, lmb1, lmb2, learning_rate=1e-3, maxIter=100):
    X = np.array(X, dtype=float)
    dim_input = X.shape[1]

    T = adjust_y_target(T)

    # Adding the bias
    X_bias = np.c_[X, np.ones(X.shape[0])]
    
    # Initializing the weights with random numbers:
    # We have two layers, so
    np.random.seed(42) 
    W_1 = np.random.randn(dim_input + 1, dim_hidden) * np.sqrt(2.0 / (dim_input + dim_hidden))
    W_2 = np.random.randn(dim_hidden + 1, dim_output) * np.sqrt(2.0 / (dim_hidden + dim_output))
    
    for iteration in range(maxIter):
        # Computing the foward pass:
        A1 = X_bias @ W_1   
        Z1 = activation(A1)
        Z1 = np.c_[Z1, np.ones(Z1.shape[0])]
        A2 = Z1 @ W_2
        Y = outputActivation(A2)

        # Computing the errors 'delta': 
        Delta_2 = Y - T

        Delta_1 = (Delta_2 @ W_2[:-1,:].T) * sigmoid_inverted(A1)

        # Adjusting the wheights:

        W_2[:-1, :] -= learning_rate*(Z1.T @ Delta_2 + lmb2 * W_2)[:-1,:]
        W_2[-1, :] -= learning_rate*(Z1.T @ Delta_2)[-1, :]
        W_1[:-1, :] -= learning_rate*(X_bias.T @ Delta_1 + lmb1 * W_1)[:-1, :]
        W_1[-1, :] -= learning_rate*(X_bias.T @ Delta_1)[-1, :]

        if iteration % 100 == 0:
            print(cross_entropy_loss(Y, T))

    return Y, [W_1, W_2] 


dim_hidden=64
dim_output=len(np.unique(y))


y_pred_train, W_list_msti = multiclass_two_layer_perceptron_weight_decay(X_train2, y_train2, dim_hidden, dim_output,sigmoid_activation, softmax_activation, lmb1=1e-3, lmb2=1e-4, learning_rate=1e-3, maxIter=1000)




2.6237390029850745
0.09846686318026181
0.03735177181221458
0.02093960672925308
0.014893741768411578
0.011604991619001519
0.00950541769232497
0.008022276691455573
0.006919287409911968
0.006071962230965158


In [18]:
def forward_pass_multi_class(X, activation, activation_output, W_list):
    """
    Implements the forward pass of a two-layer fully connected perceptron.
    
    Parameters
    ----------
    X : a 2-dimensional array
        the input data
    activation : function
        the activation function to be used for the hidden layer
    activation_output : function
        the activation function to be used for the output layer
    dim_input : int
        the dimensionality of the input layer
    dim_hidden : int
        the dimensionality of the hidden layer
    dim_output : int
        the dimensionality of the output layer
    Returns
    -------
    y_pred : nd.array
        the output of the computation of the forward pass of the network
    """
    W_1 = W_list[0]
    W_2 = W_list[1]
    
    # pushing the column of bias on X:
    X = np.array(X, dtype=float)
    X_bias = np.c_[X, np.ones(X.shape[0])]


    A1 = X_bias @ W_1
    Z1 = activation(A1)
    Z1 = np.c_[Z1, np.ones(Z1.shape[0])]
    Y = Z1 @ W_2

    y_pred = activation_output(Y)
    
    return y_pred

dim_input = X_train.shape[1]
dim_hidden = 64
dim_output = 10
y_pred2 = forward_pass_multi_class(X_test2, sigmoid_activation,softmax_activation, W_list_msti)
# print(y_pred2)
y_test2 = adjust_y_target(y_test2)
print(cross_entropy_loss(y_pred2,y_test2))



0.12156794047663314
