## Exercise - DL Tutorial 5

## student name: 

In [11]:
# Equation numbers refer to handout 5

import numpy as np

np.random.seed(42)

In [12]:
def sigmoid(X):
    return 1/(1 +np.exp(-X))

def del_sigmoid(h):
    return h * (1 - h)

Implement a method that creates binary addition data.

In [21]:
def generate_data(num_examples, max_len):
    # Generate num_examples * 2 ints.
    rand_numbers = np.random.randint(0, 2**(max_len-1)-1, size=(num_examples * 2), dtype=np.uint8)
    rand_numbers_bits = np.unpackbits(rand_numbers)
    rand_numbers = rand_numbers.reshape(num_examples, 2)
    rand_numbers_bits = rand_numbers_bits.reshape(num_examples, 2, max_len)
    rand_results = np.sum(rand_numbers, axis=1, dtype=np.uint8)
    # Add 3rd dimension to tensor.
    rand_results_bits = np.unpackbits(rand_results).reshape(num_examples, max_len, 1)
    # Data should be of form (num_examples, sequence_length, num_features).
    rand_numbers_bits = np.transpose(rand_numbers_bits, axes=[0,2,1])

    return rand_numbers_bits, rand_results_bits

In [22]:
trainX, trainY = generate_data(100, 8)
print("-----------------------------------")
print(trainX[0,:,0])
print("+")
print(trainX[0,:,1])
print("=")
print(trainY[0,:,0])
print("-----------------------------------")

-----------------------------------
[0 0 0 1 1 0 1 0]
+
[0 0 0 0 1 0 1 1]
=
[0 0 1 0 0 1 0 1]
-----------------------------------


Implement the mean squared error as a loss function

In [15]:
def mean_square_error(pred, y):
    return np.mean((pred-y)**2)

Iimplement the accuracy of the predictions

In [16]:
def accuracy(pred, y):
    rounded = np.rint(pred)
    return np.mean(rounded==y)

Implement the RNN class, implement the forward propagation, implement the BPTT and implement the gradient step

In [17]:
class one_layer_rnn:
    def __init__(self, n_input, n_hidden, n_out):
        # Initialisation of weights, no bias.
        self.W_1 = np.random.randn(n_input, n_hidden)
        self.U = np.random.randn(n_hidden, n_hidden)
        self.W_2 = np.random.randn(n_hidden, n_out)
        self.X = None
        self.H = None
        self.out = None
        self.dW1 = None
        self.db_h1 = None
        self.dU = None
        self.dW2 = None
        self.db_out = None

    def forward_propagation(self, X):
        num_sequence = X.shape[1]
        # Reverse sequence order to process bits from low-valued to high-valued
        self.X = np.flip(X, axis=1)
        
        # "dot" multiplication of X and W_1 is performed over the last dimension of X 
        # (the features for one sequence and one training example) and W_1. 
        # Result: H without any horizontal information flow.
        # (2)
        self.H = np.dot(self.X, self.W_1)
        prev = np.zeros(self.H[:, 0, :].shape)
        # Loop over sequence. Numbers have to be added (after reversal) from left to right
        for i in range(num_sequence):
            # Matrix multiplication of ith element of sequence. Adding of horizontal information flow.
            # (2)
            self.H[:, i, :] = sigmoid(self.H[:, i, :] + np.dot(prev, self.U))
            prev = self.H[:, i, :]
        # (3)
 
        self.out = sigmoid(np.dot(self.H, self.W_2))
        
        # Reverse sequence back to the original order
        return np.flip(self.out, axis=1)

    def backprop_through_time(self, Y):
        num_examples, num_sequence = self.X.shape[:2]
        
        # Derivative of mean-square error see (6).
        # Note: target labels are flipped along the sequence axis to match the sequence reversal.
        self.d_out = 2 * (self.out - np.flip(Y, axis=1)) * del_sigmoid(self.out)
        
        
        self.dW2 = np.zeros(self.W_2.shape)
        # Backprop: left to right.
        for i in range(num_sequence):
            # Sum up contribution of all sequence results to dW2.
            # Basically, sum over (5), like in (14).
            #self.dW2 += (self.d_out[:, i, :].T @ self.H[:, i, :]).T
            self.dW2 += self.H[:, i, :].T @ self.d_out[:, i, :]
        # Average gradient for every sequence element.
        self.dW2 /= num_examples
        
        
            
        # (13): W^n \delta^{n,\tau} part, vertical backprop.
        self.d_hidden = np.dot(self.d_out, self.W_2.T)
        prev = np.zeros(self.d_hidden[:, 0, :].shape)
        # backpropagation trhough time
        for i in range(num_sequence - 1, -1, -1):
            # (13): U^{n-1} \delta^{n-1,\tau + 1} part, horizontal backprop.
            self.d_hidden[:, i, :] += prev @ self.U.T
            self.d_hidden[:, i, :] *= del_sigmoid(self.H[:, i, :])
            prev = self.d_hidden[:, i, :]
        
        
        self.dW1 = np.zeros(self.W_1.shape)
        for i in range(num_sequence):
            # (13) only vertical backprop necessary.
            self.dW1 += self.X[:, i, :].T @ self.d_hidden[:, i, :]
        self.dW1/= num_examples
        
        
        self.dU =  np.zeros(self.U.shape)
        for i in range(1, num_sequence):
            self.dU += self.H[:, i - 1, :].T @ self.d_hidden[:, i, :]
        self.dU /= num_examples
        

    def gradient_step(self, learning_rate):
        self.U -= learning_rate*self.dU
        self.W_2 -= learning_rate * self.dW2
        self.W_1 -= learning_rate * self.dW1


Implement the learning routine


In [20]:
learning_rate = 0.1
train_iters = 5000
print_iters = 100


trainX, trainY = generate_data(100, 8)
testX, testY = generate_data(10000, 8)
print("-----------------------------------")
print(trainX[0,:,0])
print("+")
print(trainX[0,:,1])
print("=")
print(trainY[0,:,0])
print("-----------------------------------")
print("Shapes")
net = one_layer_rnn(2, 16, 1)
result = net.forward_propagation(trainX)
net.backprop_through_time(trainY)
print("X        shape: {}".format(net.X.shape))
print("H        shape: {}".format(net.H.shape))
print("out      shape: {}".format(net.out.shape))
print("d_out    shape: {}".format(net.d_out.shape))
print("d_hidden shape: {}".format(net.d_hidden.shape))
print("dW2      shape: {}".format(net.dW2.shape))
print("dW1      shape: {}".format(net.dW1.shape))
print("dU       shape: {}".format(net.dU.shape))
print("-----------------------------------")

for i in range(train_iters):
    if i% print_iters == 0:
        result = net.forward_propagation(testX)
        print("Iteration: {}".format(i))
        print("Test loss: \t{}".format(mean_square_error(result, testY)))
        print("Test acc: \t{}".format(accuracy(result, testY)))
        result = net.forward_propagation(trainX)
        print("Train loss: \t{}".format(mean_square_error(result, trainY)))
        print("Train acc: \t{}".format(accuracy(result, trainY)))
    result = net.forward_propagation(trainX)
    net.backprop_through_time(trainY)
    net.gradient_step(learning_rate)

-----------------------------------
[0 1 0 0 0 0 1 1]
+
[0 0 1 1 1 1 1 1]
=
[1 0 0 0 0 0 1 0]
-----------------------------------
Shapes
X        shape: (100, 8, 2)
H        shape: (100, 8, 16)
out      shape: (100, 8, 1)
d_out    shape: (100, 8, 1)
d_hidden shape: (100, 8, 16)
dW2      shape: (16, 1)
dW1      shape: (2, 16)
dU       shape: (16, 16)
-----------------------------------
Iteration: 0
Test loss: 	0.2591587365780634
Test acc: 	0.50305
Train loss: 	0.2588415805614865
Train acc: 	0.5025
Iteration: 100
Test loss: 	0.24870291017759869
Test acc: 	0.5259
Train loss: 	0.2461162712411344
Train acc: 	0.54875
Iteration: 200
Test loss: 	0.24494756383286723
Test acc: 	0.57655
Train loss: 	0.24089750093059656
Train acc: 	0.605
Iteration: 300
Test loss: 	0.2411127881903506
Test acc: 	0.582775
Train loss: 	0.2356216443745122
Train acc: 	0.62
Iteration: 400
Test loss: 	0.2362729685075586
Test acc: 	0.6013625
Train loss: 	0.2290455232504128
Train acc: 	0.64
Iteration: 500
Test loss: 	0.2301