In [5]:
import time

import numpy as np
import pandas as pd

from copy import deepcopy

from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

from matplotlib import pyplot as plt

In [6]:
def cross_entropy_loss(reference_Y, predicted_Y):
    
    loss = 0
    m = reference_Y.shape[0]
    
    for yt, yp in zip(reference_Y, predicted_Y):
        
        value = -1 * np.sum(yt * np.log(yp) + (1 - yt) * np.log(1 - yp))
        
        loss += value
    
    loss = (1 / m) * loss
    
    return loss


class NeuralNetwork:
    
    def __init__(self,
                 input_nodes,
                 first_hidden_nodes,
                 second_hidden_nodes,
                 output_nodes,
                 learning_rate,
                 hidden_activation_function,
                 hidden_activation_derivative):
        
        self.input_nodes = input_nodes
        self.first_hidden_nodes = first_hidden_nodes
        self.second_hidden_nodes = second_hidden_nodes
        self.output_nodes = output_nodes
        
        self.learning_rate = learning_rate
        
        self.bwih = np.random.uniform(-0.1, 0.1, self.first_hidden_nodes).reshape(self.first_hidden_nodes, 1)
        self.bwhh = np.random.uniform(-0.1, 0.1, self.second_hidden_nodes).reshape(self.second_hidden_nodes, 1)
        self.bwho = np.random.uniform(-0.1, 0.1, self.output_nodes).reshape(self.output_nodes, 1)
        
        self.wih = np.random.uniform(-0.1, 0.1, self.first_hidden_nodes * self.input_nodes).reshape(self.first_hidden_nodes, self.input_nodes)
        self.whh = np.random.uniform(-0.1, 0.1, self.second_hidden_nodes * self.first_hidden_nodes).reshape(self.second_hidden_nodes, self.first_hidden_nodes)
        
        hidden_nodes = self.first_hidden_nodes if (self.second_hidden_nodes == 0) else self.second_hidden_nodes
        
        self.who = np.random.uniform(-0.1, 0.1, self.output_nodes * hidden_nodes).reshape(self.output_nodes, hidden_nodes)
        
        self.hidden_activation_function = hidden_activation_function
        self.hidden_activation_derivative = hidden_activation_derivative
    
    def _softmax(self, x):
        
        value = np.exp(x - np.max(x)) / np.sum(np.exp(x - np.max(x))) 
        return value
    
    def partial_fit(self, X, y):
        
        for inputs, targets in zip(X_normalized_training, y_normalized_training):

            self._train(inputs, targets)
    
    def _train(self, inputs, targets):
        
        if (self.second_hidden_nodes == 0):
            
            self._one_hidden_layer_train(inputs, targets)
            
        else:
            
            self._two_hidden_layer_train(inputs, targets)
        
    
    def _one_hidden_layer_train(self, inputs, targets):
        
        inputs = np.array(inputs, ndmin = 2).T
        targets = np.array(targets, ndmin = 2).T
                

        hidden_inputs = np.dot(self.wih, inputs) + (self.bwih * 1)
        hidden_outputs = self.hidden_activation_function(hidden_inputs)
        
        final_inputs = np.dot(self.who, hidden_outputs) + (self.bwho * 1)
        outputs = self._softmax(final_inputs)
        
        output_errors = targets - outputs
        
        self.bwho += self.learning_rate * output_errors
        self.who += self.learning_rate * np.dot(output_errors, hidden_outputs.T)
                
        hidden_errors = np.dot(self.who.T, output_errors)
        
        self.bwih += self.learning_rate * (hidden_errors * self.hidden_activation_derivative(hidden_inputs))        
        self.wih += self.learning_rate * np.dot((hidden_errors * self.hidden_activation_derivative(hidden_inputs)), inputs.T)
        
    
    def _two_hidden_layer_train(self, inputs, targets):
        
        inputs = np.array(inputs, ndmin = 2).T
        targets = np.array(targets, ndmin = 2).T
        
        first_hidden_inputs = np.dot(self.wih, inputs) + (self.bwih * 1)
        first_hidden_outputs = self.hidden_activation_function(first_hidden_inputs)
        
        second_hidden_inputs = np.dot(self.whh, first_hidden_outputs) + (self.bwhh * 1)
        second_hidden_outputs = self.hidden_activation_function(second_hidden_inputs)
        
        final_inputs = np.dot(self.who, second_hidden_outputs) + (self.bwho * 1)
        outputs  = self._softmax(final_inputs)
        
        output_errors = targets - outputs
        
        self.bwho += self.learning_rate * output_errors
        self.who += self.learning_rate * np.dot(output_errors, second_hidden_outputs.T)
        
        second_hidden_errors = np.dot(self.who.T, output_errors)
        
        self.bwhh += self.learning_rate * (second_hidden_errors * self.hidden_activation_derivative(second_hidden_inputs))
        self.whh += self.learning_rate * np.dot((second_hidden_errors * self.hidden_activation_derivative(second_hidden_inputs)), first_hidden_outputs.T)
        
        first_hidden_errors = np.dot(self.whh.T, second_hidden_errors)
        
        self.bwih += self.learning_rate * (first_hidden_errors * self.hidden_activation_derivative(first_hidden_inputs))
        self.wih += self.learning_rate * np.dot((first_hidden_errors * self.hidden_activation_derivative(first_hidden_inputs)), inputs.T)
    
    def predict(self, X):
        
        outputs = []
        
        for inputs in X:
            
            output = self._query(inputs)
            
            output = np.argmax(output)
            
            outputs.append(output)
            
        outputs = np.array(outputs)
        
        return outputs
    
    def predict_proba(self, X):
        
        outputs = []
        
        for inputs in X:
            
            output = self._query(inputs)
            
            outputs.append(output)
            
        outputs = np.array(outputs)
        
        return outputs
        
    
    def _query(self, inputs):
        
        result = np.array([])
        
        if (self.second_hidden_nodes == 0):
            
            result = self._one_hidden_layer_query(inputs)
        
        else:
            
            result = self._two_hidden_layer_query(inputs)
    
        return result
    
    def _one_hidden_layer_query(self, inputs):
        
        inputs = np.array(inputs, ndmin = 2).T
        
        hidden_inputs = np.dot(self.wih, inputs) + (self.bwih * 1)
        hidden_outputs = self.hidden_activation_function(hidden_inputs)
        
        final_inputs = np.dot(self.who, hidden_outputs) + (self.bwho * 1)
        final_outputs  = self._softmax(final_inputs)
        
        return final_outputs.ravel()
    
    def _two_hidden_layer_query(self, inputs):
        
        inputs = np.array(inputs, ndmin = 2).T
        
        first_hidden_inputs = np.dot(self.wih, inputs) + (self.bwih * 1)
        first_hidden_outputs = self.hidden_activation_function(first_hidden_inputs)
        
        second_hidden_inputs = np.dot(self.whh, first_hidden_outputs) + (self.bwhh * 1)
        second_hidden_outputs = self.hidden_activation_function(second_hidden_inputs)
        
        final_inputs = np.dot(self.who, second_hidden_outputs) + (self.bwho * 1)
        final_outputs  = self._softmax(final_inputs)
        
        return final_outputs.ravel()

Read Training Data

In [7]:
training_set = pd.read_csv('fashion-mnist_train.csv', sep = ',')

Split Features and Labels

In [8]:
X, y = training_set.iloc[:, 1:].values, training_set.iloc[:, 0].values

K-Fold Cross Validation With Logistic Activation Function and One Hidden Layer

In [9]:
k_folds = 5
epochs = 60
learning_rate = 0.001

input_nodes = 784
first_hidden_nodes = 15
second_hidden_nodes = 0
output_nodes = 10

activation_function = lambda x : 1 / (1 + np.exp(-x))
activation_derivative = lambda x : activation_function(x) * (1 - activation_function(x))

k_fold = KFold(n_splits = k_folds, random_state = None, shuffle = False)

accuracies = np.array([])

current_fold = 0

total_start = time.time()

for train_index, validation_index in k_fold.split(X, y):
    
    start = time.time()
    
    current_fold += 1
    
    X_training, X_validation = X[train_index], X[validation_index]
    y_training, y_validation = y[train_index], y[validation_index] 
    
    # An grayscale image must have 255 as maximum value
    maximum_value = 255 
    
    # Executes normalization between 0.01 and 0.99 to avoid lose of neurons
    X_normalized_training = (X_training / maximum_value * 0.99) + 0.01
    X_normalized_validation = (X_validation / maximum_value * 0.99) + 0.01
    
    # Executes one-hot-encoding on labels
    y_normalized_training = pd.get_dummies(y_training).values
    y_normalized_validation = pd.get_dummies(y_validation).values
    
    n = NeuralNetwork(input_nodes,
                      first_hidden_nodes,
                      second_hidden_nodes,
                      output_nodes,
                      learning_rate,
                      activation_function,
                      activation_derivative)
    
    for epoch in range(epochs):
        
        n.partial_fit(X_normalized_training, y_normalized_training)
        
    y_validation_predicted = n.predict(X_normalized_validation)
    
    accuracy = accuracy_score(y_validation, y_validation_predicted)
    
    accuracies = np.append(accuracies, [accuracy])
    
    finish = time.time()
    
    print("Fold: " + str(current_fold) + " Accuracy: " + str(accuracy) + " Time: " + str((finish - start)) + "s")

total_finish = time.time()    

print("=====================================================================")
print("Fold Mean: " + str(accuracies.mean()) + " Time: " + str(total_finish - total_start) + "S")

Fold: 1 Accuracy: 0.87275 Time: 383.8347337245941s
Fold: 2 Accuracy: 0.8628333333333333 Time: 305.77174735069275s
Fold: 3 Accuracy: 0.8686666666666667 Time: 297.05134868621826s
Fold: 4 Accuracy: 0.8606666666666667 Time: 303.55727887153625s
Fold: 5 Accuracy: 0.8621666666666666 Time: 299.03385615348816s
Fold Mean: 0.8654166666666668 Time: 1589.2534823417664S


K-Fold Cross Validation With Logistic Activation Function and Two Hidden Layer

In [10]:
k_folds = 5
epochs = 60
learning_rate = 0.001

input_nodes = 784
first_hidden_nodes = 15
second_hidden_nodes = 5
output_nodes = 10

activation_function = lambda x : 1 / (1 + np.exp(-x))
activation_derivative = lambda x : activation_function(x) * (1 - activation_function(x))

k_fold = KFold(n_splits = k_folds, random_state = None, shuffle = False)

accuracies = np.array([])

current_fold = 0

total_start = time.time()

for train_index, validation_index in k_fold.split(X, y):
    
    start = time.time()
    
    current_fold += 1
    
    X_training, X_validation = X[train_index], X[validation_index]
    y_training, y_validation = y[train_index], y[validation_index] 
    
    # An grayscale image must have 255 as maximum value
    maximum_value = 255 
    
    # Executes normalization between 0.01 and 0.99 to avoid lose of neurons
    X_normalized_training = (X_training / maximum_value * 0.99) + 0.01
    X_normalized_validation = (X_validation / maximum_value * 0.99) + 0.01
    
    # Executes one-hot-encoding on labels
    y_normalized_training = pd.get_dummies(y_training).values
    y_normalized_validation = pd.get_dummies(y_validation).values
    
    n = NeuralNetwork(input_nodes,
                      first_hidden_nodes,
                      second_hidden_nodes,
                      output_nodes,
                      learning_rate,
                      activation_function,
                      activation_derivative)
    
    for epoch in range(epochs):
        
        n.partial_fit(X_normalized_training, y_normalized_training)
        
    y_validation_predicted = n.predict(X_normalized_validation)
    
    accuracy = accuracy_score(y_validation, y_validation_predicted)
    
    accuracies = np.append(accuracies, [accuracy])
    
    finish = time.time()
    
    print("Fold: " + str(current_fold) + " Accuracy: " + str(accuracy) + " Time: " + str((finish - start)) + "s")

total_finish = time.time()    

print("=====================================================================")
print("Fold Mean: " + str(accuracies.mean()) + " Time: " + str(total_finish - total_start) + "S")

Fold: 1 Accuracy: 0.8511666666666666 Time: 402.18348836898804s
Fold: 2 Accuracy: 0.8490833333333333 Time: 381.5738859176636s
Fold: 3 Accuracy: 0.84975 Time: 388.8062937259674s
Fold: 4 Accuracy: 0.8484166666666667 Time: 409.5522663593292s
Fold: 5 Accuracy: 0.83925 Time: 403.2087891101837s
Fold Mean: 0.8475333333333334 Time: 1985.3287348747253S


K-Fold Cross Validation With ReLU Activation Function and One Hidden Layer

In [11]:
k_folds = 5
epochs = 60
learning_rate = 0.001

input_nodes = 784
first_hidden_nodes = 15
second_hidden_nodes = 0
output_nodes = 10

activation_function = lambda x : np.maximum(x, 0) 
activation_derivative = lambda x : 1 * (x > 0)

k_fold = KFold(n_splits = k_folds, random_state = None, shuffle = False)

accuracies = np.array([])

current_fold = 0

total_start = time.time()

for train_index, validation_index in k_fold.split(X, y):
    
    start = time.time()
    
    current_fold += 1
    
    X_training, X_validation = X[train_index], X[validation_index]
    y_training, y_validation = y[train_index], y[validation_index] 
    
    # An grayscale image must have 255 as maximum value
    maximum_value = 255 
    
    # Executes normalization between 0.01 and 0.99 to avoid lose of neurons
    X_normalized_training = (X_training / maximum_value * 0.99) + 0.01
    X_normalized_validation = (X_validation / maximum_value * 0.99) + 0.01
    
    # Executes one-hot-encoding on labels
    y_normalized_training = pd.get_dummies(y_training).values
    y_normalized_validation = pd.get_dummies(y_validation).values
    
    n = NeuralNetwork(input_nodes,
                      first_hidden_nodes,
                      second_hidden_nodes,
                      output_nodes,
                      learning_rate,
                      activation_function,
                      activation_derivative)
    
    for epoch in range(epochs):
        
        n.partial_fit(X_normalized_training, y_normalized_training)
        
    y_validation_predicted = n.predict(X_normalized_validation)
    
    accuracy = accuracy_score(y_validation, y_validation_predicted)
    
    accuracies = np.append(accuracies, [accuracy])
    
    finish = time.time()
    
    print("Fold: " + str(current_fold) + " Accuracy: " + str(accuracy) + " Time: " + str((finish - start)) + "s")

total_finish = time.time()    

print("=====================================================================")
print("Fold Mean: " + str(accuracies.mean()) + " Time: " + str(total_finish - total_start) + "S")

Fold: 1 Accuracy: 0.6004166666666667 Time: 268.31643986701965s
Fold: 2 Accuracy: 0.5605 Time: 274.70882296562195s
Fold: 3 Accuracy: 0.5483333333333333 Time: 271.36923360824585s
Fold: 4 Accuracy: 0.6384166666666666 Time: 268.75539541244507s
Fold: 5 Accuracy: 0.583 Time: 267.8313226699829s
Fold Mean: 0.5861333333333333 Time: 1350.9842262268066S


K-Fold Cross Validation With ReLU Activation Function and Two Hidden Layer

In [12]:
k_folds = 5
epochs = 60
learning_rate = 0.001

input_nodes = 784
first_hidden_nodes = 15
second_hidden_nodes = 5
output_nodes = 10

activation_function = lambda x : np.maximum(x, 0) 
activation_derivative = lambda x : 1 * (x > 0)

k_fold = KFold(n_splits = k_folds, random_state = None, shuffle = False)

accuracies = np.array([])

current_fold = 0

total_start = time.time()

for train_index, validation_index in k_fold.split(X, y):
    
    start = time.time()
    
    current_fold += 1
    
    X_training, X_validation = X[train_index], X[validation_index]
    y_training, y_validation = y[train_index], y[validation_index] 
    
    # An grayscale image must have 255 as maximum value
    maximum_value = 255 
    
    # Executes normalization between 0.01 and 0.99 to avoid lose of neurons
    X_normalized_training = (X_training / maximum_value * 0.99) + 0.01
    X_normalized_validation = (X_validation / maximum_value * 0.99) + 0.01
    
    # Executes one-hot-encoding on labels
    y_normalized_training = pd.get_dummies(y_training).values
    y_normalized_validation = pd.get_dummies(y_validation).values
    
    n = NeuralNetwork(input_nodes,
                      first_hidden_nodes,
                      second_hidden_nodes,
                      output_nodes,
                      learning_rate,
                      activation_function,
                      activation_derivative)
    
    for epoch in range(epochs):
        
        n.partial_fit(X_normalized_training, y_normalized_training)
        
    y_validation_predicted = n.predict(X_normalized_validation)
    
    accuracy = accuracy_score(y_validation, y_validation_predicted)
    
    accuracies = np.append(accuracies, [accuracy])
    
    finish = time.time()
    
    print("Fold: " + str(current_fold) + " Accuracy: " + str(accuracy) + " Time: " + str((finish - start)) + "s")

total_finish = time.time()    

print("=====================================================================")
print("Fold Mean: " + str(accuracies.mean()) + " Time: " + str(total_finish - total_start) + "S")

  # Remove the CWD from sys.path while we load stuff.
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  # This is added back by InteractiveShellApp.init_path()


Fold: 1 Accuracy: 0.10125 Time: 401.4645824432373s
Fold: 2 Accuracy: 0.10141666666666667 Time: 390.5203847885132s
Fold: 3 Accuracy: 0.0985 Time: 405.6593062877655s
Fold: 4 Accuracy: 0.09891666666666667 Time: 396.02067041397095s
Fold: 5 Accuracy: 0.09991666666666667 Time: 385.85765314102173s
Fold Mean: 0.1 Time: 1979.5276100635529S


K-Fold Cross Validation With TanH Activation Function and One Hidden Layer

In [13]:
k_folds = 5
epochs = 60
learning_rate = 0.001

input_nodes = 784
first_hidden_nodes = 15
second_hidden_nodes = 0
output_nodes = 10

activation_function = lambda x : np.tanh(x)
activation_derivative = lambda x : 1.0 - np.tanh(x) ** 2

k_fold = KFold(n_splits = k_folds, random_state = None, shuffle = False)

accuracies = np.array([])

current_fold = 0

total_start = time.time()

for train_index, validation_index in k_fold.split(X, y):
    
    start = time.time()
    
    current_fold += 1
    
    X_training, X_validation = X[train_index], X[validation_index]
    y_training, y_validation = y[train_index], y[validation_index] 
    
    # An grayscale image must have 255 as maximum value
    maximum_value = 255 
    
    # Executes normalization between 0.01 and 0.99 to avoid lose of neurons
    X_normalized_training = (X_training / maximum_value * 0.99) + 0.01
    X_normalized_validation = (X_validation / maximum_value * 0.99) + 0.01
    
    # Executes one-hot-encoding on labels
    y_normalized_training = pd.get_dummies(y_training).values
    y_normalized_validation = pd.get_dummies(y_validation).values
    
    n = NeuralNetwork(input_nodes,
                      first_hidden_nodes,
                      second_hidden_nodes,
                      output_nodes,
                      learning_rate,
                      activation_function,
                      activation_derivative)
    
    for epoch in range(epochs):
        
        n.partial_fit(X_normalized_training, y_normalized_training)
        
    y_validation_predicted = n.predict(X_normalized_validation)
    
    accuracy = accuracy_score(y_validation, y_validation_predicted)
    
    accuracies = np.append(accuracies, [accuracy])
    
    finish = time.time()
    
    print("Fold: " + str(current_fold) + " Accuracy: " + str(accuracy) + " Time: " + str((finish - start)) + "s")

total_finish = time.time()    

print("=====================================================================")
print("Fold Mean: " + str(accuracies.mean()) + " Time: " + str(total_finish - total_start) + "S")

Fold: 1 Accuracy: 0.8648333333333333 Time: 250.76187443733215s
Fold: 2 Accuracy: 0.8618333333333333 Time: 250.45608687400818s
Fold: 3 Accuracy: 0.8613333333333333 Time: 258.0417287349701s
Fold: 4 Accuracy: 0.8645833333333334 Time: 249.24977087974548s
Fold: 5 Accuracy: 0.8571666666666666 Time: 252.72898936271667s
Fold Mean: 0.86195 Time: 1261.241458415985S


K-Fold Cross Validation With TanH Activation Function and Two Hidden Layer

In [14]:
k_folds = 5
epochs = 60
learning_rate = 0.001

input_nodes = 784
first_hidden_nodes = 15
second_hidden_nodes = 5
output_nodes = 10

activation_function = lambda x : np.tanh(x)
activation_derivative = lambda x : 1.0 - np.tanh(x) ** 2

k_fold = KFold(n_splits = k_folds, random_state = None, shuffle = False)

accuracies = np.array([])

current_fold = 0

total_start = time.time()

for train_index, validation_index in k_fold.split(X, y):
    
    start = time.time()
    
    current_fold += 1
    
    X_training, X_validation = X[train_index], X[validation_index]
    y_training, y_validation = y[train_index], y[validation_index] 
    
    # An grayscale image must have 255 as maximum value
    maximum_value = 255 
    
    # Executes normalization between 0.01 and 0.99 to avoid lose of neurons
    X_normalized_training = (X_training / maximum_value * 0.99) + 0.01
    X_normalized_validation = (X_validation / maximum_value * 0.99) + 0.01
    
    # Executes one-hot-encoding on labels
    y_normalized_training = pd.get_dummies(y_training).values
    y_normalized_validation = pd.get_dummies(y_validation).values
    
    n = NeuralNetwork(input_nodes,
                      first_hidden_nodes,
                      second_hidden_nodes,
                      output_nodes,
                      learning_rate,
                      activation_function,
                      activation_derivative)
    
    for epoch in range(epochs):
        
        n.partial_fit(X_normalized_training, y_normalized_training)
        
    y_validation_predicted = n.predict(X_normalized_validation)
    
    accuracy = accuracy_score(y_validation, y_validation_predicted)
    
    accuracies = np.append(accuracies, [accuracy])
    
    finish = time.time()
    
    print("Fold: " + str(current_fold) + " Accuracy: " + str(accuracy) + " Time: " + str((finish - start)) + "s")

total_finish = time.time()    

print("=====================================================================")
print("Fold Mean: " + str(accuracies.mean()) + " Time: " + str(total_finish - total_start) + "S")

Fold: 1 Accuracy: 0.86175 Time: 316.42951703071594s
Fold: 2 Accuracy: 0.85675 Time: 321.8556385040283s
Fold: 3 Accuracy: 0.8565 Time: 308.57216143608093s
Fold: 4 Accuracy: 0.856 Time: 292.59344124794006s
Fold: 5 Accuracy: 0.8525833333333334 Time: 293.26931071281433s
Fold Mean: 0.8567166666666667 Time: 1532.7240707874298S
