In [1]:
%matplotlib notebook
import matplotlib.pyplot as plt
import numpy as np
from numba import vectorize

class Learn_model:
    """The class trains a 3 layer network based on the given data using backpropagation. It takes the data and prepares it by
    normalizing the numbers, taking out the answer column and adding ones for bias. It generates random weights(depends if 
    the seed is set). Then it splits the data into mini-batches and trains using backpropagation. The class main function
    is to return(the method returns) the cost and success rate with specific parameters."""
    
    np.random.seed(42)
    
    def __init__(self, wLayer1, wLayer2, trainData, testData, position_of_expected_output, over_fit, num_iterations,
                 learning_rate, batch_size, rate_decreasing_speed):
        """The class takes these inputs:
        1. the size of the weight of the first layer.
        2. the size of the weight of the second layer.
        3. the training data in numpy array.
        4. testing data in numpy array
        5. position of expected output.
        6. a variable to protect from overfitting
        7. number of iterations the training will repeat
        8. learning rate
        9. the batch size
        10. how much overtime does the learning rate will decrease"""
        self.wLayer1 = wLayer1
        self.wLayer2 = wLayer2
        self.trainData = trainData
        self.testData = testData
        self.position_of_expected_output = position_of_expected_output
        self.over_fit = over_fit
        self.num_iterations = num_iterations
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.rate_decreasing_speed = rate_decreasing_speed
    
    def prepare_data_for_training(self):
        """regularizes the data and removes expected outputs from the data set, adds ones for the bias unit.
        returns training output is 2d array and 1d array of testing output"""
        
        expected_output1D = self.trainData[:, self.position_of_expected_output].copy()#list of expected Output
        expected_output1D_test = self.testData[:, self.position_of_expected_output].copy()
        #array of expected outputs. 1s are placed in rows that correspond with expected_output1D
        expected_output2D = np.zeros((len(expected_output1D), self.wLayer2[1]))
        for i in range(len(expected_output1D)):
            expected_output2D[i, expected_output1D[i]] = 1
        
        #regularizes the data and removes expected outputs from the training set
        self.trainData = np.delete(self.trainData, self.position_of_expected_output, 1)
        self.trainData = (self.trainData - np.average(self.trainData)) / np.std(self.trainData)
        self.trainData = np.append(np.ones((len(self.trainData), 1)), self.trainData, axis=1)
        
        self.testData = np.delete(self.testData, self.position_of_expected_output, 1)
        self.testData = (self.testData - np.average(self.testData)) / np.std(self.testData)
        self.testData = np.append(np.ones((len(self.testData), 1)), self.testData, axis=1)
        
        return expected_output2D, expected_output1D_test
    
    def generate_random_weights(self):
        """Creates random weights for network. Returns first and second layers weights"""
        w1 = np.random.rand(self.wLayer1[0], self.wLayer1[1]) - 1
        w2 = np.random.rand(self.wLayer2[0], self.wLayer2[1]) - 1
        return w1, w2
    
    def sigmoid(self, prediction):
        """takes each layers predictions and maps it between 0 and 1"""
        #prediction = prediction.astype(float)
        prediction = prediction * -1
        sigmoid = 1 / (1 + np.exp(prediction))
        return sigmoid
    
    def feedforward(self, w1, w2, input_layer):
        """Takes in weights of the layer 1 and 2 and the inputs and outputs 1 and 2 layers calculations"""
        layer2 = self.sigmoid(np.dot(input_layer, w1))#multiplies input layer with weights
        #add ones because the bias is in the weights in w[:, 0]. By multiplying by one it is the same as adding the bias
        z2 = np.append(np.ones((len(layer2), 1)), layer2, axis=1)
        layer3 = self.sigmoid(np.dot(z2, w2))
        #a3 -= 1e-15
        return layer2, layer3
    
    def backpropagation(self, w2, layer3, layer2, input_layer, expected_output):
        """Uses partial derivatives to calculate each layers partial derivative.
        Takes in second layers weights, layers 2 and 3 predictions, input layer,
        expected 2d array output. Returns the gradient of weights of layers 1 and 2."""
        delta3 = (layer3 - expected_output) #the derivative of the cost function
        delta2 = np.dot(delta3, w2.T) #the partial derivative of the second layer sigmoid
        #removing the bias layer. The partial derivative of bias is equal to delta3 because in layer 2 it is equal to 1.
        delta2 = np.delete(delta2, 0, 1)
        sigmoid_layer2_derivative = (layer2 * (1 - layer2))#the partial derivative of the sigmoid function
        delta2 = delta2 * sigmoid_layer2_derivative #the partial derivative of the second layer
        w1_gradient = (np.dot(input_layer.T, delta2)) / len(input_layer)#the layer 1 weights gradient
        layer2 = np.append(np.ones((len(layer2), 1)), layer2, axis=1) #this is needed to calculate with the bias gradient
        w2_gradient = (np.dot(layer2.T, delta3)) / len(input_layer)#the layer 2 weights gradient
        return w1_gradient, w2_gradient
    
    def cost(self, w1, w2, prediction, expected_output, over_fit):
        """Calculate the error in network prediction. Takes weights of layer 1 and 2, models prediction, 2d expected output
        and overfit unit. Returns the error of the function"""
        #temp_w_all is for calculating the sum of all weights, this helps to avoid overfitting
        temp_w_all = w1[:, 1:].ravel()
        temp_w_all = np.append(w2[:, 1:].ravel(), temp_w_all)
        #some number is too small or too big and overflow happens and it turns out either 1 or 0. The cost function has log()
        #and log(0) is invalid so the for loop adds or subtracts a very small number if it is 1 or 0.
        for i in range(len(prediction)):
            for j in range(len(prediction[0])):
                if prediction[i][j] == 1.:
                    #print(prediction[i][j])
                    prediction[i][j] -= 1e-15
                elif prediction[i][j] == 0.:
                    #print(prediction[i][j])
                    prediction[i][j] += 1e-15
                    
        #the cost function. The log punishes more if the answer is incorrect.
        cost = -(sum(sum(expected_output * np.log(prediction) + (1 - expected_output) * np.log(1 - prediction))) /
                 len(expected_output)) + (sum(np.power(temp_w_all, 2))) * over_fit / (2 * len(expected_output))
        return cost
    
    def succes_rate(self, layer3, expected_output):
        """cheeks how well the model is doing on the test dataset, returns the success rate."""
        #converts to a list for easy comparison
        expected_output = expected_output.reshape((1, len(expected_output))).tolist()
        predictions = np.argmax(layer3, axis=1)#takes the biggest models prediction for each example
        succes_rate = (expected_output == predictions).astype(int)
        succes_rate = sum(sum(succes_rate)) / len(expected_output[0])
        return succes_rate
    
    def shuffling_data(self, expected_output2D):
        """Combines train data and train datas expected output arrays in one and then shuffles the columns. Then restores
        data into seperate arrays. Takes in train data 2d array expected output. Returns train data expected_output2D"""
        combined_data = np.c_[self.trainData.reshape(len(self.trainData), - 1), #combines the arrays
                  expected_output2D.reshape(len(expected_output2D), - 1)]
        np.random.shuffle(combined_data)
        #splits the data back into separate arrays
        self.trainData = combined_data[:,0:len(self.trainData[0])]
        expected_output2D = combined_data[:, (len(combined_data[0]) - len(expected_output2D[0])):]
        return expected_output2D
        
    def update_weights(self, expected_output2D, w1, w2, mini_batch):
        """using feedforward and backpropagation methods updates the weights. update_weights takes train expected output
        2d array, 1 and 2 weights to change and the mini-batch. Returns modified weights of layers 1 and 2."""
        [layer2, layer3] = self.feedforward(w1, w2, mini_batch)
        [w1_gradient, w2_gradient] = self.backpropagation(w2, layer3, layer2, mini_batch, expected_output2D)
        w1 = (w1 * (1 - (self.over_fit * self.learning_rate / len(expected_output2D)))) - (w1_gradient * self.learning_rate)
        w2 = (w2 * (1 - (self.over_fit * self.learning_rate / len(expected_output2D)))) - (w2_gradient * self.learning_rate)
        return w1, w2
    
    def training(self):
        """Combines all of the methods together. Prepares the data, generates new weights, shuffles the data and splits
        it to mini batches. Returns how the model did on the train and test data set."""
        original_lerning_rate = self.learning_rate
        [expected_output2D, expected_output1D_test] = self.prepare_data_for_training()
        [w1, w2] = self.generate_random_weights()
        
        for i in range(self.num_iterations):
            expected_output2D = self.shuffling_data(expected_output2D)
            #his decreases the learning rate as iterations increase. Over time the gradient gets smaller so the steps also have
            #to get smaller, this helps to not overshoot.
            self.learning_rate = original_lerning_rate / (1 + (i / self.rate_decreasing_speed)) 
            for j in range(int(len(self.trainData) / self.batch_size)):
                #splitting data into mini batches
                mini_batch = self.trainData[j * self.batch_size : j * self.batch_size + self.batch_size, :]
                batch_output2D = expected_output2D[j * self.batch_size : j * self.batch_size + self.batch_size, :]
                [w1, w2] = self.update_weights(batch_output2D, w1, w2, mini_batch)
        
        #how well did the model does
        [layer2, layer3] = self.feedforward(w1, w2, self.trainData)
        cost_result = self.cost(w1, w2, layer3, expected_output2D, self.over_fit)
        [layer2, layer3] = self.feedforward(w1, w2, self.testData)
        succes_rate_result = self.succes_rate(layer3, expected_output1D_test)
        
        return cost_result, succes_rate_result  

# Training

In [2]:
import pandas as pd
%matplotlib notebook
import numpy as np

In [3]:
mnist_test = pd.read_csv("mnist_test.csv")
mnist_test_np = mnist_test.to_numpy()
mnist_train = pd.read_csv("mnist_train.csv")
mnist_train = mnist_train.to_numpy()

In [None]:
#this will test a wide range of different parameters
overfit = 0
learning_rate = 1.1
batch_size = 2
learning_rate_decreasing_speed = 70

different_parameters_tracker = np.zeros((2*5*6+1, 6))
index = 0
for i in range(2): #learning_rate
    for j in range(5): # overfit
        
        for k in range(6): # batch_size
            mnist = Learn_model([785, 78], [79, 10], mnist_train, mnist_test_np, 0, overfit, 50, 
                                learning_rate, batch_size, learning_rate_decreasing_speed)
                
            [cost, succes_rate] = mnist.training()
            #saving the parameters and models performance
            different_parameters_tracker[index] = [cost, succes_rate, overfit, learning_rate, batch_size, learning_rate_decreasing_speed]
            index += 1
            #tweking parameters
            batch_size *= 2
            
        batch_size = 2
        overfit += 0.5
        
    overfit = 0
    learning_rate += 0.3

In [8]:
succes_results = different_parameters_tracker[:, 1]
result = np.where(succes_results == np.amax(succes_results))

In [9]:
print("cost:", different_parameters_tracker[result, 0], "succes rate:", different_parameters_tracker[result, 1],
     "overfit value:", different_parameters_tracker[result, 2], "training rate:", different_parameters_tracker[result, 3],
     "batch size:", different_parameters_tracker[result, 4], "learning rate decreasing speed", different_parameters_tracker[result, 5])

cost: [[0.14153398]] succes rate: [[0.9606]] overfit value: [[0.]] training rate: [[1.1]] batch size: [[8.]] learning rate decreasing speed [[70.]]
