## Exercise - DL Tutorial 04: FCNN - BP 

Please complete the following notebook and submit your solutions to manuel.milling@informatik.uni-augsburg.de and maurice.gerczuk@informatik.uni-augsburg.de till 10 May 14:15.

## student name: Pavlo Mospan, Anastasia Karsten

Solutions from exercise sheet 3 (class methods below).

In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

In [2]:
import numpy as np
#numpy random seed
np.random.seed(42)

trainx, trainy, testx, testy = np.load('../03/mnist.npy', allow_pickle=True)
print("Trainx shape: {}".format(trainx.shape))
print("Trainy shape: {}".format(trainy.shape))
print("Testx shape:  {}".format(testx.shape))
print("Testy shape:  {}".format(testy.shape))

def sigmoid(X):
    return 1/(1 + np.exp(-X))

def softmax(X):
    #more stable
    eps = X.max()
    return np.exp(X + eps)/(np.sum(np.exp(X + eps), axis=1).reshape((X.shape[0],1)))

def fcc_one_layer(X, W, b, activation):
    return activation(np.matmul(X, W) + b)

def cross_entropy(pred_logits, y):
    num_data_points = pred_logits.shape[0]
    correct_logits = pred_logits[np.arange(num_data_points),y]
    return np.mean(-np.log(correct_logits))

def accuracy(logits, labels):
    class_predictions = np.argmax(logits, axis=1)
    return np.mean(class_predictions == labels)

Trainx shape: (60000, 784)
Trainy shape: (60000,)
Testx shape:  (10000, 784)
Testy shape:  (10000,)


1.   Implement the error of the last layer.

In [3]:
def delta_last_layer(H, y):
    """
    :param H: softmax activations of shape (num_examples, num_classes)
    :param y: correct labels of shape (num_examples,)
    :return: delta of of last layer, i.e. derivative of cross entropy times derivative of softmax
    """
    m = y.shape[0]
    gradient = H
    gradient[range(m),y] -= 1
    gradient = gradient/m
    return gradient

2.   Implement the derivative of the sigmoid function in terms of the sigmoid function.

In [4]:
def del_sigmoid(H):
    """
    :param H: output of the sigmoid function shape (num_examples, num_units)
    :return: element-wise derivative of the sigmoid function
    """
    return H * (1 - H)

3.   Implement the backpropagation as a class method.
4.   Implement the the optimisation step as a class method.


In [5]:
class fcc:
    def __init__(self, n_input, n_hidden1, n_hidden2, n_out):
        #parameters
        self.W_i_h1 = np.random.randn(n_input, n_hidden1)
        self.b_h1 = np.random.randn(n_hidden1)
        self.W_h1_h2 = np.random.randn(n_hidden1, n_hidden2)
        self.b_h2 = np.random.randn(n_hidden2)
        self.W_h2_o = np.random.randn(n_hidden2, n_out)
        self.b_out = np.random.randn(n_out)
        #neuron activations and input H^n
        self.X = None
        self.h1 = None
        self.h2 = None
        self.out = None
        #components of the gradient
        self.dW_i_h1 = None
        self.db_h1 = None
        
        self.dW_h1_h2 = None
        self.db_h2 = None
        
        self.dW_h2_o = None
        self.db_out = None

        n_trainable_bias = self.b_h1.shape[0] + self.b_h2.shape[0] + self.b_out.shape[0]
        n_trainable_weights = self.W_i_h1.shape[0] * self.W_i_h1.shape[1] + self.W_h1_h2.shape[0] * self.W_h1_h2.shape[1] + self.W_h2_o.shape[0] * self.W_h2_o.shape[1]
        print("Number of parameters: {}".format(n_trainable_bias + n_trainable_weights))

    def forward_propagation(self, X):
        self.X = X
        self.h1 = fcc_one_layer(X, self.W_i_h1, self.b_h1, sigmoid)
        self.h2 = fcc_one_layer(self.h1, self.W_h1_h2, self.b_h2, sigmoid)
        self.out = fcc_one_layer(self.h2, self.W_h2_o, self.b_out, softmax)
        return self.out

    def backprop(self, y):
        """
        :param y: labels, i.e. numbers of correct classes of shape (num_examples,)
        """
        # outer layer (Dims : 400 x 10) + bias (Dims : 10 x 1)
        
        delta = delta_last_layer(self.out, y)
        self.db_out = np.mean(delta) # * self.b_out
        self.dW_h2_o = np.matmul(np.transpose (self.h2), delta)
             
        # hidden2 layer (Dims : 400 x 400 ) + bias (Dims : 400 x 1)
        
        sigmoid = del_sigmoid(self.h2)
        self.db_h2 = np.mean(sigmoid) # * self.b_h2
        self.dW_h1_h2 = np.matmul(np.transpose(self.h1), sigmoid)
        
        # hidden1 layer (Dims : 784 x 400) + bias (Dims : 400 x 1)
        
        sigmoid2 =  del_sigmoid(self.h1)
        self.db_h1 = np.mean(sigmoid2) # * self.b_h1
        self.dW_i_h1 = np.matmul(np.transpose(self.X), sigmoid)
        
    
    def gradient_step(self, learning_rate):
        """
        :param learning_rate: learning_rate for training
        """
        # input -> hidden1
        self.W_i_h1 = self.W_i_h1 - self.dW_i_h1 * learning_rate
        self.b_h1 = self.b_h1 - self.db_h1 * learning_rate
        
        # hidden1 -> hidden2
        self.W_h1_h2 = self.W_h1_h2 - self.dW_h1_h2 * learning_rate
        self.b_h2 = self.b_h2 - self.db_h2 * learning_rate
        
        # hidden2 -> out
        self.W_h2_o = self.W_h2_o - self.dW_h2_o * learning_rate
        self.b_out = self.b_out - self.db_out * learning_rate


5.   Implement the training routine.

In [6]:
import csv 


learning_rate = 0.01
num_iterations = 1000

neural_net = fcc(784, 400, 400, 10)

for i in range(num_iterations + 1):
    logits = neural_net.forward_propagation(trainx)
    
    if i % 10 == 0:
        print('Iteration : \t', i)
        ce = cross_entropy(logits, trainy)
        print("Train Loss:\t{}".format(ce))
        
        acc = accuracy(logits, trainy)
        print("Train Accuracy:\t{} \n".format(acc))
        
        with open('results_1000_iterations.csv', mode='a') as file:
            file_writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            file_writer.writerow([i, ce, acc])

    neural_net.backprop(trainy)
    neural_net.gradient_step(learning_rate)

Number of parameters: 478410
Iteration : 	 0
Train Loss:	20.01954426412725
Train Accuracy:	0.11021666666666667 



  return 1/(1 + np.exp(-X))


Iteration : 	 10
Train Loss:	12.699778525652373
Train Accuracy:	0.09863333333333334 

Iteration : 	 20
Train Loss:	9.89477099552828
Train Accuracy:	0.10218333333333333 

Iteration : 	 30
Train Loss:	7.779723004971075
Train Accuracy:	0.10218333333333333 

Iteration : 	 40
Train Loss:	6.3727033076220865
Train Accuracy:	0.10218333333333333 

Iteration : 	 50
Train Loss:	5.527972248463615
Train Accuracy:	0.10218333333333333 

Iteration : 	 60
Train Loss:	4.779275364251287
Train Accuracy:	0.11236666666666667 

Iteration : 	 70
Train Loss:	4.066664014413192
Train Accuracy:	0.11236666666666667 

Iteration : 	 80
Train Loss:	3.450223898681594
Train Accuracy:	0.11236666666666667 

Iteration : 	 90
Train Loss:	3.0247587639349685
Train Accuracy:	0.11236666666666667 

Iteration : 	 100
Train Loss:	2.7446440925807383
Train Accuracy:	0.11236666666666667 

Iteration : 	 110
Train Loss:	2.5426907564509755
Train Accuracy:	0.11236666666666667 

Iteration : 	 120
Train Loss:	2.4114366235093003
Train Accu

Iteration : 	 960
Train Loss:	2.3021080378931336
Train Accuracy:	0.11236666666666667 

Iteration : 	 970
Train Loss:	2.3021176656350555
Train Accuracy:	0.11236666666666667 

Iteration : 	 980
Train Loss:	2.302127289809603
Train Accuracy:	0.11236666666666667 

Iteration : 	 990
Train Loss:	2.3021369088963013
Train Accuracy:	0.11236666666666667 

Iteration : 	 1000
Train Loss:	2.302146521466598
Train Accuracy:	0.11236666666666667 



6. Bonus: Stochastic Gradient Descent 

### Answer to the question: Results are obtained so much faster. Like, in seconds

In [9]:
learning_rate = 0.01
num_iterations = 1000
neural_net = fcc(784, 400, 400, 10)

for i in range(num_iterations + 1):
    
    batch = np.random.randint(low=0, high=len(trainy), size=(64,))
    batchx = np.empty((64, 784))
    batchy = np.empty((64, ))
    
    for j in range(len(batch)):
        batchx[j] = trainx[batch[j]]
        batchy[j] = trainy[batch[j]]
        
    batchx = batchx.astype(int)
    batchy = batchy.astype(int)
    logits = neural_net.forward_propagation(batchx)
    
    if i % 100 == 0:
        print('Iteration : \t', i)
        ce = cross_entropy(logits, batchy)
        print("Train Loss:\t{}".format(ce))
        acc = accuracy(logits, batchy)
        print("Train Accuracy:\t{} \n".format(acc))
        
    neural_net.backprop(batchy)
    neural_net.gradient_step(learning_rate)

Number of parameters: 478410
Iteration : 	 0
Train Loss:	28.127218229806303
Train Accuracy:	0.0625 

Iteration : 	 100
Train Loss:	5.058496631008965
Train Accuracy:	0.078125 

Iteration : 	 200
Train Loss:	2.9614718473165316
Train Accuracy:	0.09375 

Iteration : 	 300
Train Loss:	2.5677177934732813
Train Accuracy:	0.046875 

Iteration : 	 400
Train Loss:	2.403050325894572
Train Accuracy:	0.125 

Iteration : 	 500
Train Loss:	2.7639809759734715
Train Accuracy:	0.125 

Iteration : 	 600
Train Loss:	2.462344417162109
Train Accuracy:	0.109375 

Iteration : 	 700
Train Loss:	2.4316533825475686
Train Accuracy:	0.203125 

Iteration : 	 800
Train Loss:	2.4815518921598794
Train Accuracy:	0.15625 

Iteration : 	 900
Train Loss:	2.624125829059529
Train Accuracy:	0.078125 

Iteration : 	 1000
Train Loss:	2.4189620561990366
Train Accuracy:	0.125 



7. Bonus: Derivative of the Softmax Function 

### Check 04_dl_tut_Pavlo_Mospan_Anastasia_Karsten.pdf 

In [8]:
%reset -f