![](https://www.ucd.ie/t4media/ucdmaincore_logo-footer.png)

#### COMP30320 - Connectionist Computing

#### Student Name: Michael Jordan

#### Student Number: 14376516



-------------------------------------------------------------------------------------------

# CODING

Imports

In [1]:
import numpy as np
import math

Main Class

In [2]:
class MLP:
    def __init__(self, NI, NH, NO, activation='tanh', max_epochs=5000, learning_rate=0.1, verbose=2): #activation=(tanh, linear)
        """ 
        Initialise network.
        NI: Number of inputs
        NH: Number of hidden units
        NO: Number of outputs
        activation: Activation function
        max_epochs: Maximum number of iterations for training
        learning_rate: Delta applied to weights
        verbose: Level of detail printed
        """
        self.NI = NI
        self.NH = NH
        self.NO = NO
        self.activation, self.d_activation = self.__init_activation(activation) 
        self.max_epochs = max_epochs
        self.learning_rate = learning_rate
        self.verbose = np.power(10, verbose-1) 
        
        self.loss_function = self.__init_loss_function()
            
        #Initialise value of units with ones. Will hold activation results in future.
        self.I = np.ones(self.NI + 1) #Extra 1 for bias node.
        self.H = np.ones(self.NH)
        self.O = np.ones(self.NO)
        
        self.W1, self.W2, self.dW1, self.dW2 = self.randomise()
        
    def __init_activation(self, activation):
        if activation == "tanh":
            return tanh, d_tanh
        if activation.lower() == "relu":
            return ReLU, d_ReLU
#         return tanh, d_tanh

    def __init_loss_function(self):
        """ 
        Initialize the loss function.
        If multiple number of outputs, use cross entropy.
        If a single output, use squared error.
        """
        if self.NO > 1:
            return cross_entropy
        if self.NO == 1:
            return squared_error
        else:
            print("Edge case: 49587323")
            
    def randomise(self):
        """ 
        Randomly initialize weights between -1 and +1 
        for lower layer and upper layer.
        """
        #Lower layer (between input and hidden layer)
        W1 = np.random.uniform(-0.5, 0.5, (self.I.size, self.H.size)) #Values between -1 and 1 were too varied. Output inconclusive.
        dW1 = np.zeros((self.I.size, self.H.size))
        #Upper layer (between hidden and output layer)
        W2 = np.random.uniform(-0.5, 0.5, (self.H.size, self.O.size))
        dW2 = np.zeros((self.H.size, self.O.size))
        return W1, W2, dW1, dW2

    def forward(self, inputs):
        """ 
        Propagate the inputs forward 
        from the input layer to the output layer
        """
        #Load training inputs into the form of our environment
        self.I[:-1] = inputs #-1 because last one is a bias node, and has already been initialised to 1 in __init__
        
        #Activate hidden layer
        self.H = self.activation(np.dot(self.I, self.W1))
        
        #Activate output layer
        if self.NO > 1:
            # Classification - using SoftMax
            self.O = softmax(np.dot(self.H, self.W2))
        else:
            # Regression
            self.O = self.activation(np.dot(self.H, self.W2))
#         self.O = softmax(np.dot(self.H, self.W2)) #Want to keep tanh/relu away from the output layer. It's best to restrict it to hidden layer.
        
        return self.O
    
    def backward(self, expected):
        # Error on the output layer.
        error = expected - self.O
        
        # Delta activation of output layer.
        if self.NO > 1:
            # Classification
            dW2 = error * d_softmax(self.O, softmax)
        else:
            # Regression
            dW2 = error * self.d_activation(self.O)
        
        # Delta activation of hidden layer.
        dW1 = np.dot(dW2, self.W2.T) * self.d_activation(self.H) #Restricting tanh/relu to the hidden layer again.

        # Update the weights
        self.update_weights(dW1, dW2)
        
    def update_weights(self, dW1, dW2):
        """ 
        Update weights on the lower layer and the upper layer.
        """
        #Lower Layer
        dW1 = np.dot(np.atleast_2d(self.I).T, np.atleast_2d(dW1))
        self.W1 += self.learning_rate * dW1
        
        #Upper Layer
        dW2 = np.dot(np.atleast_2d(self.H).T, np.atleast_2d(dW2))
        self.W2 += self.learning_rate * dW2

    def fit(self, X, Y):
        """ 
        Training the MLP
        X: The features of the training set
        Y: The labels/Outputs of the training set
        """
        for e in range(1, self.max_epochs):
            cost = 0.0
            for j, row in enumerate(X):
                #Feedforward inputs to the output layer.
                O = self.forward(row)
                
                #Sum the error of each example computed.
                cost += self.loss_function(O, Y[j])
                
                # Backpropagate the error signal.
                self.backward(Y[j])
            
            #Print details during training.
            # Classification
            if self.NO > 1:
                prediction = self.predict(X) 
                accuracy = 0.0
                for k, row in enumerate(Y):
                    if prediction[k] == np.argmax(Y[k]):
                        accuracy += 1
                        
                if e % self.verbose == 0:
                    print('Epoch: %d \t|\t Error: %.6f \t|\t Accuracy: %.3f' %(e, cost/len(X), accuracy/len(X)))
            
            # Regression
            else:
                if e % self.verbose == 0:
                    print('Epoch: %d \t|\t Error: %.6f' % (e, cost/(len(X))))

        return self
    
    def predict(self, X):
        """ 
        Predict on the test set
        X: Unknown features of the test set
        """
        Y = list()
        for j, row in enumerate(X):
            if self.NO > 1:
                # Classification - using one hot encoding,
                #   so find the index of output units with the max output
                Y.append(np.argmax(self.forward(row)))
            else:
                Y.append(self.forward(row))
        
        return np.array(Y)

Definitions of activation functions and their derivatives

In [3]:
def tanh(x):
    return np.tanh(x)

def d_tanh(x):
    return 1.0 - np.tanh(x)**2


# def ReLU(x):
#     if x[0] < 0:
#         x[0] = 0
#     if x[1] < 0:
#         x[1] = 0
#     return x

# def d_ReLU(x):
#     if x[0] < 0:
#         x[0] = 0
# #     if x[1] < 0:
# #         x[1] = 0
#     return x


def softmax(x):
    """
    x: input that needs to be activated
    return: softmax on x
    """
    x_exp = [math.exp(i) for i in x]
    sum_x_exp = sum(x_exp)
    
    if sum_x_exp != 0:
        return [i / sum_x_exp for i in x_exp]
    else:
        return 0

def d_softmax(output, function):
    """ 
    output: Output layer
    function: The activation function of the output layer - SoftMax in classification
    return: derivative of softmax
    """
    return function(output) * (1 - function(output))

Loss functions

In [4]:
def cross_entropy(o, y):
    return np.sum(np.nan_to_num(-y * np.log(o) - (1-y) * np.log(1-o)))


def squared_error(o, y):
    return 0.5 * ((y-o) ** 2).sum()

-------------------------------------------------------------------------------------------

# TESTS

## 1. XOR

### Train an MLP with 2 inputs, two hidden units and one output on the following examples (XOR function):
   ((0, 0), 0)
   
   ((0, 1), 1)
   
   ((1, 0), 1)
   
   ((1, 1), 0)

In [5]:
# Initialize the XOR inputs
XOR_inputs = np.array([
    [0, 0, 0],
    [0, 1, 1],
    [1, 0, 1],
    [1, 1, 0]
])

# Split the inputs and outputs
x = XOR_inputs[:, :-1]
y = XOR_inputs[:, -1]

# Initialize the MLP network
mlp = MLP(2, 2, 1, max_epochs=2000, verbose=3)

# Training the MLP
mlp.fit(x, y)

# Predict the MLP on XOR inputs
prediction = mlp.predict(x)
print("-----" * 13)
for i, l in enumerate(y):
    print('Input: %s \t|\t Expected: %.f \t|\t Output: %.0f' % (str(x[i]), y[i], prediction[i]))

Epoch: 100 	|	 Error: 0.132236
Epoch: 200 	|	 Error: 0.131917
Epoch: 300 	|	 Error: 0.131774
Epoch: 400 	|	 Error: 0.131724
Epoch: 500 	|	 Error: 0.131716
Epoch: 600 	|	 Error: 0.131709
Epoch: 700 	|	 Error: 0.131612
Epoch: 800 	|	 Error: 0.131103
Epoch: 900 	|	 Error: 0.128999
Epoch: 1000 	|	 Error: 0.120203
Epoch: 1100 	|	 Error: 0.082431
Epoch: 1200 	|	 Error: 0.053369
Epoch: 1300 	|	 Error: 0.037276
Epoch: 1400 	|	 Error: 0.028075
Epoch: 1500 	|	 Error: 0.022301
Epoch: 1600 	|	 Error: 0.018391
Epoch: 1700 	|	 Error: 0.015588
Epoch: 1800 	|	 Error: 0.013490
Epoch: 1900 	|	 Error: 0.011868
-----------------------------------------------------------------
Input: [0 0] 	|	 Expected: 0 	|	 Output: 0
Input: [0 1] 	|	 Expected: 1 	|	 Output: 1
Input: [1 0] 	|	 Expected: 1 	|	 Output: 1
Input: [1 1] 	|	 Expected: 0 	|	 Output: 0


### At the end of training, check if the MLP predicts correctly all the examples.
   Check

## Conclusion of Test 1
Here, we trained a model to identify the XOR solution. This is a good problem for neural networks as it's something that can't be solved with a single linear function. 

I think it ran very well. Ending with an error of only 0.011868, and predicting the outputs correctly, I can deem this a success. 

One thing to note, I found altering the weights (W1 and W2) to be between -0.5 and +0.5 to be much more efficient and conclusive. 

## 2. Sin function

#### Generate 200 vectors containing 4 components each. 
#### The value of each component should be a random number between -1 and 1. 
#### These will be your input vectors. 
#### The corresponding output for each vector should be the sin() of a combination of the components.
 
 Example:
 
    Input:  [x1 x2 x3 x4]
 
    Output: sin(x1-x2+x3-x4)

In [6]:
# Initialize the MLP network
mlp = MLP(4, 10, 1, learning_rate=0.1, max_epochs=3000, verbose=3)

# Initialize the training set and test set
sample_num = 200
train_num = 150
test_num = 50

# Inputs
X = np.random.uniform(-1, 1, (sample_num, 4))

# Output
Y = list(map(lambda a: np.sin(a[0] - a[1] + a[2] - a[3]), X))

# Combine input and output
ds = np.column_stack((X, Y))

# Split the training set and test set
train = ds[:train_num]
test = ds[train_num:]

#### Now train an MLP with 4 inputs, at least 5 hidden units and one output on 150 of these examples and keep the remaining 50 for testing.

In [7]:
# Train the MLP network
mlp.fit(train[:, :-1], train[:, -1])

Epoch: 100 	|	 Error: 0.008666
Epoch: 200 	|	 Error: 0.008743
Epoch: 300 	|	 Error: 0.007680
Epoch: 400 	|	 Error: 0.006302
Epoch: 500 	|	 Error: 0.006097
Epoch: 600 	|	 Error: 0.005568
Epoch: 700 	|	 Error: 0.004433
Epoch: 800 	|	 Error: 0.001439
Epoch: 900 	|	 Error: 0.000521
Epoch: 1000 	|	 Error: 0.000357
Epoch: 1100 	|	 Error: 0.000379
Epoch: 1200 	|	 Error: 0.000444
Epoch: 1300 	|	 Error: 0.000545
Epoch: 1400 	|	 Error: 0.000677
Epoch: 1500 	|	 Error: 0.000755
Epoch: 1600 	|	 Error: 0.000730
Epoch: 1700 	|	 Error: 0.000696
Epoch: 1800 	|	 Error: 0.000692
Epoch: 1900 	|	 Error: 0.000700
Epoch: 2000 	|	 Error: 0.000691
Epoch: 2100 	|	 Error: 0.000608
Epoch: 2200 	|	 Error: 0.000433
Epoch: 2300 	|	 Error: 0.000315
Epoch: 2400 	|	 Error: 0.000236
Epoch: 2500 	|	 Error: 0.000180
Epoch: 2600 	|	 Error: 0.000158
Epoch: 2700 	|	 Error: 0.000139
Epoch: 2800 	|	 Error: 0.000113
Epoch: 2900 	|	 Error: 0.000095


<__main__.MLP at 0x117575f98>

#### Predict on test set.

In [8]:
test_x = test[:, :-1]
test_y = test[:, -1]

prediction = mlp.predict(test_x).flatten()

cost = 0.0
accuracy = 0
total_i = 0
for i, k in enumerate(test_x):
    cost += 0.5 * (test_y[i] - prediction[i]) ** 2
    print('%s \t|\t Expected: %.f \t|\t Output: %.0f' % (str(k), test_y[i], prediction[i]))
    
    if (int(test_y[i]) == int(prediction[i])):
        accuracy += 1
    total_i += 1
        
print('-----' * 20)
print('Error on test set: %.5f' % (cost / len(test_x)))
print('Accuracy: %.2f' %(accuracy/total_i))

[ 0.74742818 -0.52278696  0.1004962   0.36804979] 	|	 Expected: 1 	|	 Output: 1
[ 0.95390476 -0.48334988  0.08922534  0.68245714] 	|	 Expected: 1 	|	 Output: 1
[0.49959348 0.55808257 0.82440938 0.58854471] 	|	 Expected: 0 	|	 Output: 0
[ 0.05079309 -0.18673182 -0.22610956  0.35493546] 	|	 Expected: -0 	|	 Output: -0
[-0.57611149 -0.30889683  0.73289491 -0.80030568] 	|	 Expected: 1 	|	 Output: 1
[-0.28978418  0.52940615 -0.8974755  -0.43615098] 	|	 Expected: -1 	|	 Output: -1
[-0.88746991  0.71273133 -0.05571403  0.63652154] 	|	 Expected: -1 	|	 Output: -1
[ 0.36789789 -0.35591926  0.67324562 -0.15543255] 	|	 Expected: 1 	|	 Output: 1
[ 0.4163947   0.59717439  0.90459803 -0.47961734] 	|	 Expected: 1 	|	 Output: 1
[ 0.72685379 -0.74594506  0.74759361 -0.61553623] 	|	 Expected: 0 	|	 Output: 0
[-0.20072192  0.71258938  0.2548554   0.21476807] 	|	 Expected: -1 	|	 Output: -1
[-0.93481544 -0.21645127  0.53328865  0.90192025] 	|	 Expected: -1 	|	 Output: -1
[ 0.60966682  0.12865448  0.448532

### What is the error on training at the end? How does it compare with the error on the test set? Do you think you have learned satisfactorily?

Error at end of training is 0.000129, compared to the error at end of testing 0.00026.

While all small errors, you could say there is a significant difference, as error during training is half that of testing.

This is as expected, as there are bound to be more errors when predicting on unseen  data.

I believe the learning has been satisfactory. The accuracy on the test set came out to 100%. Couldn't have gone better. 

## Conclusion of Test 2
These outcomes proved that our MLP is capable of solving for quite complex functions. 

Here we solved for the sin of a combination of four attributes. 

I was surprised by how efficient it was with a sample size of only 200 (very small in my opnion). Impressive performance!