In [1]:
#importing essential libraries

import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from keras.datasets import mnist
from keras.utils import to_categorical

#loading the datasets

(x_training_set, y_training_set), (x_testing_set, y_testing_set) = mnist.load_data()

#storing different classes in a list

classes = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat', 'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']
'''
#function definition to plot the one image from different classes

def plot_img(images, labels, classes):
  image_list = [] #list to store one image from each class
  class_num = len(classes)

  for i in range(class_num):
    indx = np.where(labels == i)[0][0]
    image_list.append(images[indx])

  #plotting the images

  plt.figure(figsize = (10,10))
  for i in range(class_num):
    plt.subplot(5,5,i+1)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    plt.imshow(image_list[i], cmap=plt.cm.binary)
    plt.xlabel(classes[i])
  plt.show()


#calling the function

plot_img(x_training_set, y_training_set, classes)'''

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


'\n#function definition to plot the one image from different classes\n\ndef plot_img(images, labels, classes):\n  image_list = [] #list to store one image from each class\n  class_num = len(classes)\n\n  for i in range(class_num):\n    indx = np.where(labels == i)[0][0]\n    image_list.append(images[indx])\n\n  #plotting the images\n\n  plt.figure(figsize = (10,10))\n  for i in range(class_num):\n    plt.subplot(5,5,i+1)\n    plt.xticks([])\n    plt.yticks([])\n    plt.grid(False)\n    plt.imshow(image_list[i], cmap=plt.cm.binary)\n    plt.xlabel(classes[i])\n  plt.show()\n\n\n#calling the function\n\nplot_img(x_training_set, y_training_set, classes)'

In [2]:
#splitting the data for cross validation

x_validation_set = x_training_set[50000:]
y_validation_set = y_training_set[50000:]     # validation set has 10000 data

x_training_set = x_training_set[:50000]
y_training_set = y_training_set[:50000]


#vactorising the data

x_training_set = x_training_set.reshape(x_training_set.shape[0], 784)  #28x28=784
x_testing_set = x_testing_set.reshape(x_testing_set.shape[0], 784)
x_validation_set = x_validation_set.reshape(x_validation_set.shape[0], 784)

#normalising the data

x_train = x_training_set/255  # since, pixel range from 0 to 255
x_test = x_testing_set/255
x_valid = x_validation_set/255

#one hot encoding for labels to represent categorical variables as numerical values

y_train = to_categorical(y_training_set)
y_test = to_categorical(y_testing_set)
y_valid = to_categorical(y_validation_set)


#default_x_train = x_train
#default_y_train = y_train

# some useful functions

#for hidden layer
def sigmoid(x):
  return 1 / (1 + np.exp(-x))

def relu(x):
  return (x>0)*(x)

def tanh(x):
  return np.tanh(x)

def gradient_sigmoid(x):
  return sigmoid(x) * (1 - sigmoid(x))

def gradient_tanh(x):
  return 1 - np.tanh(x) ** 2

def gradient_relu(x):
  return np.where(x > 0, 1, 0)

#for output layer
def softmax(x):
  exponents = np.exp(x - np.max(x, axis=1, keepdims=True))
  return exponents / np.sum(exponents, axis=1, keepdims=True)

'''
#loss function(cross-entropy)
def loss_func(x,y):
  L = -np.mean(np.sum(x * np.log(y), axis=1))
  return L'''

# loss function (cross-entropy)
def loss_func(x, y):
    epsilon = 1e-10  # small epsilon value to avoid log overflow
    clipped_y = np.clip(y, epsilon, 1 - epsilon)  # clip predicted probabilities
    L = -np.mean(np.sum(x * np.log(clipped_y), axis=1))
    return L


#function to choose the activation functions

def choose_activation(x, activation_function):
  if activation_function == 'sigmoid':
    return sigmoid(x)

  elif activation_function == 'tanh':
    return tanh(x)

  elif activation_function == 'relu':
    return relu(x)


#function for derivatives

def activation_derivative(x, activation_function):
    if activation_function == 'sigmoid':
        return gradient_sigmoid(x)
    elif activation_function == 'relu':
        return gradient_relu(x)
    elif activation_function == 'tanh':
        return gradient_tanh(x)
    else:
        raise ValueError("Invalid activation function. Please choose from 'sigmoid', 'relu', or 'tanh'.")

In [3]:
#class definition

class Network:
    def __init__(self, neuron_sizes, weight_initialiser, activation_function, momentum, beta1, beta2):

        self.train_loss_sgd = []
        self.train_loss_momentum = []


        self.total_layers = len(neuron_sizes)
        self.momentum = momentum
        #self.weight_initialiser = weight_initialiser
        self.activation_function = activation_function
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = 1e-8

        # Initializing the weights and biases
        # After initializing, store weights and biases in separate lists
        if weight_initialiser == 'xavier':
            if activation_function != 'relu':
                self.Weights = [-1/np.sqrt(neuron_sizes[i])+np.random.randn(neuron_sizes[i], neuron_sizes[i+1])*2/np.sqrt(neuron_sizes[i]) for i in range(self.total_layers-1)]
                self.biases = [-1/np.sqrt(neuron_sizes[i])+np.random.randn(1, neuron_sizes[i+1])*2/np.sqrt(neuron_sizes[i]) for i in range(self.total_layers-1)]
            else:
                self.Weights = [np.random.randn(neuron_sizes[i], neuron_sizes[i+1])*(np.sqrt(2/(neuron_sizes[i]))) for i in range(self.total_layers-1)]
                self.biases = [np.random.randn(1, neuron_sizes[i+1])*(np.sqrt(2/(neuron_sizes[i]))) for i in range(self.total_layers-1)]
        else:
            self.Weights = [np.random.randn(neuron_sizes[i], neuron_sizes[i+1]) for i in range(self.total_layers-1)]
            self.biases = [np.random.randn(1, neuron_sizes[i+1])*0.05 for i in range(self.total_layers-1)]

        # Initializing momentum weights and biases
        self.Weights_moment = [np.zeros_like(x) for x in self.Weights]
        self.biases_moment = [np.zeros_like(x) for x in self.biases]

        # Initializing Nesterov momentum adjusted weights and biases
        self.momentum_adjusted_Weights = None
        self.momentum_adjusted_biases = None


        # Initializing for adam optimizer
        self.Weights_moment_adam1 = [np.zeros_like(x) for x in self.Weights]
        self.biases_moment_adam1 = [np.zeros_like(x) for x in self.biases]
        self.Weights_moment_adam2 = [np.zeros_like(x) for x in self.Weights]  #rmsprop
        self.biases_moment_adam2 = [np.zeros_like(x) for x in self.biases]   #rmsprop


        # Initializing for nadam optimizer
        self.Weights_moment_nadam1 = [np.zeros_like(x) for x in self.Weights]
        self.biases_moment_nadam1 = [np.zeros_like(x) for x in self.biases]
        self.Weights_moment_nadam2 = [np.zeros_like(x) for x in self.Weights]
        self.biases_moment_nadam2 = [np.zeros_like(x) for x in self.biases]





    # Defining function for forward propagation
    def forward_prop(self, X):
        self.pre_activations_A = [None]*(self.total_layers)  # List to store the pre-activations
        self.activations_H = [X]  # List to store the activations

        for i in range(self.total_layers-1):
            self.pre_activations_A[i+1] = np.dot(self.activations_H[i], self.Weights[i]) + self.biases[i]
            if i == self.total_layers-2:  # For output layer: activation function = softmax
                h = softmax(self.pre_activations_A[i+1])
                self.activations_H.append(h)
            else:  # For hidden layers: activation function = sigmoid
                h = choose_activation(self.pre_activations_A[i+1], self.activation_function)
                #h = sigmoid(self.pre_activations_A[i+1])
                self.activations_H.append(h)
        return self.activations_H[-1]

    #defining the training method for stochastic gradient method
    def train_sgd(self, x_train, y_train, learning_rate_eta, total_epochs, batch_size):
        for epoch in range(total_epochs):
            loss_epo = 0
            accuracy = 0
            for i in range(0, x_train.shape[0], batch_size):
                # Forward pass
                Xbatch = x_train[i:i+batch_size]
                Ybatch = y_train[i:i+batch_size]
                y_predicted = self.forward_prop(Xbatch)

                #calculate loss
                loss = loss_func(Ybatch, y_predicted)
                loss_epo = loss_epo+loss

                #calculate accuracy
                acc = accuracy_score(np.argmax(y_predicted, axis=1), np.argmax(Ybatch, axis=1))
                accuracy = accuracy + acc

                # Applying backpropagation algorithm
                loss_gradient = y_predicted - Ybatch
                for j in range(self.total_layers - 1, 0, -1):
                    gradient_W = np.dot(self.activations_H[j-1].T, loss_gradient)
                    gradient_b = np.sum(loss_gradient, axis=0, keepdims=True)
                    if j > 1:
                        derivative_activation = activation_derivative(self.pre_activations_A[j-1], self.activation_function)
                        loss_gradient = np.dot(loss_gradient, self.Weights[j-1].T) * derivative_activation
                        #loss_gradient = np.dot(loss_gradient, self.Weights[j-1].T) * (self.activations_H[j-1] * (1 - self.activations_H[j-1]))

                    #updation of parameters
                    self.Weights[j-1] = self.Weights[j-1] - learning_rate_eta * gradient_W
                    self.biases[j-1] = self.biases[j-1] - learning_rate_eta * gradient_b


            #computing average train accuracy
            training_accuracy = accuracy / (x_train.shape[0] / batch_size)
            print(f'Epoch Number {epoch+1}, training accuracy: {training_accuracy:.4f}')
            wandb.log({'train-accuracy':training_accuracy*100})


            #computing average epoch loss
            loss_epo = loss_epo / (x_train.shape[0] / batch_size)
            print(f'Epoch Number {epoch+1}, training loss: {loss_epo:.4f}')
            wandb.log({'train-loss':loss_epo})
            #self.train_loss_sgd.append(loss_epo)
            '''
            #computing training loss
            train_loss = loss_func(y_train, y_predicted)
            print(f'Epoch Number {epoch+1}, training loss: {valid_accuracy:.4f}')'''

            #computing accuracy on validation set
            y_valid_predicted = self.forward_prop(x_valid)
            valid_accuracy = accuracy_score(np.argmax(y_valid_predicted, axis=1), np.argmax(y_valid, axis=1))
            print(f'Epoch Number {epoch+1}, validation accuracy: {valid_accuracy:.4f}')
            wandb.log({'val_accuracy':valid_accuracy*100})
            wandb.log({'epoch':epoch+1})

            #computing validation loss
            val_loss = loss_func(y_valid, y_valid_predicted)
            print(f'Epoch Number {epoch+1}, validation loss: {val_loss:.4f}')
            wandb.log({'val-loss':val_loss})


        #checking the efficiency of the model by passing test set
        y_test_predicted = self.forward_prop(x_test)
        test_accuracy = accuracy_score(np.argmax(y_test_predicted, axis = 1), np.argmax(y_test, axis = 1))
        print(f'Test Accuracy: {test_accuracy:.4f}')

    # Defining the momentum-based gradient descent training method
    def train_momentum(self, x_train, y_train, learning_rate_eta, total_epochs, batch_size):
        for epoch in range(total_epochs):
            loss_epo = 0
            accuracy = 0
            for i in range(0, x_train.shape[0], batch_size):

                #performing the forward pass
                Xbatch = x_train[i:i+batch_size]
                Ybatch = y_train[i:i+batch_size]
                y_predicted = self.forward_prop(Xbatch)

                #calculate loss
                loss = loss_func(Ybatch, y_predicted)
                loss_epo = loss_epo+loss

                #calculate accuracy
                acc = accuracy_score(np.argmax(y_predicted, axis=1), np.argmax(Ybatch, axis=1))
                accuracy = accuracy + acc

                #performing the back-propagation
                loss_gradient = y_predicted - Ybatch
                for j in range(self.total_layers - 1, 0, -1):
                    gradient_W = np.dot(self.activations_H[j-1].T, loss_gradient)
                    gradient_b = np.sum(loss_gradient, axis=0, keepdims=True)
                    if j > 1:
                        derivative_activation = activation_derivative(self.pre_activations_A[j-1], self.activation_function)
                        loss_gradient = np.dot(loss_gradient, self.Weights[j-1].T) * derivative_activation
                        #loss_gradient = np.dot(loss_gradient, self.Weights[j-1].T) * (self.activations_H[j-1] * (1 - self.activations_H[j-1]))

                    #updation of momentum
                    self.Weights_moment[j-1] = self.momentum * self.Weights_moment[j-1] + learning_rate_eta * gradient_W
                    self.biases_moment[j-1] = self.momentum * self.biases_moment[j-1] + learning_rate_eta * gradient_b

                    #updation of parameters
                    self.Weights[j-1] = self.Weights[j-1] - self.Weights_moment[j-1]
                    self.biases[j-1] = self.biases[j-1] - self.biases_moment[j-1]

            #computing average train accuracy
            training_accuracy = accuracy / (x_train.shape[0] / batch_size)
            print(f'Epoch Number {epoch+1}, training accuracy: {training_accuracy:.4f}')
            wandb.log({'train-accuracy':training_accuracy*100})


            #computing average epoch(training loss) loss
            loss_epo = loss_epo / (x_train.shape[0] / batch_size)
            print(f'Epoch Number {epoch+1}, training loss: {loss_epo:.4f}')
            wandb.log({'train-loss':loss_epo})
            #self.train_loss_sgd.append(loss_epo)

            #computing accuracy on validation set
            y_valid_predicted = self.forward_prop(x_valid)
            valid_accuracy = accuracy_score(np.argmax(y_valid_predicted, axis=1), np.argmax(y_valid, axis=1))
            print(f'Epoch Number {epoch+1}, validation accuracy: {valid_accuracy:.4f}')
            wandb.log({'val_accuracy':valid_accuracy*100})
            wandb.log({'epoch':epoch+1})

            #computing validation loss
            val_loss = loss_func(y_valid, y_valid_predicted)
            print(f'Epoch Number {epoch+1}, validation loss: {val_loss:.4f}')
            wandb.log({'val-loss':val_loss})

        #checking the efficiency of the model by passing test set
        y_test_predicted = self.forward_prop(x_test)
        test_accuracy = accuracy_score(np.argmax(y_test_predicted, axis = 1), np.argmax(y_test, axis = 1))
        print(f'Test Accuracy: {test_accuracy:.4f}')

    #defining the training method for nesterov accelerated gradient descent method
    def train_nag(self, x_train, y_train, learning_rate_eta, total_epochs, batch_size):
        for epoch in range(total_epochs):
            loss_epo = 0
            accuracy = 0
            for i in range(0, x_train.shape[0], batch_size):
                #performing forward pass
                Xbatch = x_train[i:i+batch_size]
                Ybatch = y_train[i:i+batch_size]

                # Nesterov accelerated gradient descent: lookahead
                '''
                self.momentum_adjusted_Weights = [self.Weights[j-1] - self.momentum * self.Weights_moment[j-1] for j in range(self.total_layers - 1, 0, -1)]
                self.momentum_adjusted_biases = [self.biases[j-1] - self.momentum * self.biases_moment[j-1] for j in range(self.total_layers - 1, 0, -1)]'''

                self.momentum_adjusted_Weights = [self.Weights[j] - self.momentum * self.Weights_moment[j] for j in range(self.total_layers-1)]
                self.momentum_adjusted_biases = [self.biases[j] - self.momentum * self.biases_moment[j] for j in range(self.total_layers-1)]
                '''
                print(self.momentum_adjusted_Weights[-1].shape)
                print(self.momentum_adjusted_Weights[-2].shape)
                print(self.momentum_adjusted_Weights[-3].shape)'''

                y_predicted = self.forward_prop(Xbatch)


                #calculate loss
                loss = loss_func(Ybatch, y_predicted)
                loss_epo = loss_epo+loss

                #calculate accuracy
                acc = accuracy_score(np.argmax(y_predicted, axis=1), np.argmax(Ybatch, axis=1))
                accuracy = accuracy + acc


                #performing back propagation
                loss_gradient = y_predicted - Ybatch
                for j in range(self.total_layers - 1, 0, -1):
                    gradient_W = np.dot(self.activations_H[j-1].T, loss_gradient)
                    gradient_b = np.sum(loss_gradient, axis=0, keepdims=True)
                    if j > 1:
                        #derivative_activation = 1 - self.activations_H[j-1] ** 2
                        derivative_activation = activation_derivative(self.pre_activations_A[j-1], self.activation_function)
                        loss_gradient = np.dot(loss_gradient, self.momentum_adjusted_Weights[j-1].T) * derivative_activation

                        #loss_gradient = np.dot(loss_gradient, self.momentum_adjusted_Weights[j-1].T) * (self.activations_H[j-1] * (1 - self.activations_H[j-1]))

                    self.Weights[j-1] = self.momentum_adjusted_Weights[j-1] - learning_rate_eta * gradient_W
                    self.biases[j-1] = self.momentum_adjusted_biases[j-1] - learning_rate_eta * gradient_b


            #computing average train accuracy
            training_accuracy = accuracy / (x_train.shape[0] / batch_size)
            print(f'Epoch Number {epoch+1}, training accuracy: {training_accuracy:.4f}')
            wandb.log({'train-accuracy':training_accuracy*100})


            #computing average epoch(training loss) loss
            loss_epo = loss_epo / (x_train.shape[0] / batch_size)
            print(f'Epoch Number {epoch+1}, training loss: {loss_epo:.4f}')
            wandb.log({'train-loss':loss_epo})
            #self.train_loss_sgd.append(loss_epo)

            #computing accuracy on validation set
            y_valid_predicted = self.forward_prop(x_valid)
            valid_accuracy = accuracy_score(np.argmax(y_valid_predicted, axis=1), np.argmax(y_valid, axis=1))
            print(f'Epoch Number {epoch+1}, validation accuracy: {valid_accuracy:.4f}')
            wandb.log({'val_accuracy':valid_accuracy*100})
            wandb.log({'epoch':epoch+1})

            #computing validation loss
            val_loss = loss_func(y_valid, y_valid_predicted)
            print(f'Epoch Number {epoch+1}, validation loss: {val_loss:.4f}')
            wandb.log({'val-loss':val_loss})

        #checking the efficiency of the model by passing test set
        y_test_predicted = self.forward_prop(x_test)
        test_accuracy = accuracy_score(np.argmax(y_test_predicted, axis = 1), np.argmax(y_test, axis = 1))
        print(f'Test Accuracy: {test_accuracy:.4f}')



    def train_adam(self, x_train, y_train, learning_rate_eta, total_epochs, batch_size):
      for epoch in range(total_epochs):
        loss_epo = 0
        accuracy = 0
        for i in range(0, x_train.shape[0], batch_size):
          #performing forward pass
          Xbatch = x_train[i:i+batch_size]
          Ybatch = y_train[i:i+batch_size]
          y_predicted = self.forward_prop(Xbatch)

          #calculate loss
          loss = loss_func(Ybatch, y_predicted)
          loss_epo = loss_epo + loss

          #calculate accuracy
          acc = accuracy_score(np.argmax(y_predicted, axis=1), np.argmax(Ybatch, axis=1))
          accuracy = accuracy + acc

          #backpropagation
          loss_gradient = y_predicted-Ybatch
          for j in range(self.total_layers-1, 0, -1):
            gradient_W = np.dot(self.activations_H[j-1].T, loss_gradient)
            gradient_b = np.sum(loss_gradient, axis=0, keepdims=True)

            if j > 1:
              derivative_activation = activation_derivative(self.pre_activations_A[j-1], self.activation_function)
              loss_gradient = np.dot(loss_gradient, self.Weights[j-1].T)*derivative_activation

            #compute 1st momentum term
            self.Weights_moment_adam1[j-1] = self.beta1 * self.Weights_moment_adam1[j-1] + (1-self.beta1) * gradient_W
            self.biases_moment_adam1[j-1] = self.beta1 * self.biases_moment_adam1[j-1] + (1-self.beta1) * gradient_b

            #compute 2nd moment term
            self.Weights_moment_adam2[j-1] = self.beta2 * self.Weights_moment_adam2[j-1] + (1-self.beta2) * np.square(gradient_W)
            self.biases_moment_adam2[j-1] = self.beta2 * self.biases_moment_adam2[j-1] + (1-self.beta2) * np.square(gradient_b)

            #corrected terms in 1st moment
            corrected_weight_adam1 = self.Weights_moment_adam1[j-1] / (1-self.beta1 ** (epoch+1))
            corrected_bias_adam1 = self.biases_moment_adam1[j-1] / (1-self.beta1 ** (epoch+1))

            #corrected terms in 2nd moment
            corrected_weight_adam2 = self.Weights_moment_adam2[j-1] / (1-self.beta2 ** (epoch+1))
            corrected_bias_adam2 = self.biases_moment_adam2[j-1] / (1-self.beta2 ** (epoch+1))

            #updating weights and biases
            self.Weights[j-1] = self.Weights[j-1] - learning_rate_eta * corrected_weight_adam1 / (np.sqrt(corrected_weight_adam2)+self.epsilon)
            self.biases[j-1] = self.biases[j-1] - learning_rate_eta * corrected_bias_adam1 / (np.sqrt(corrected_bias_adam2)+self.epsilon)


        #computing average train accuracy
        training_accuracy = accuracy / (x_train.shape[0] / batch_size)
        print(f'Epoch Number {epoch+1}, training accuracy: {training_accuracy:.4f}')
        wandb.log({'train-accuracy':training_accuracy*100})



        #computing average epoch(training loss) loss
        loss_epo = loss_epo / (x_train.shape[0] / batch_size)
        print(f'Epoch Number {epoch+1}, training loss: {loss_epo:.4f}')
        wandb.log({'train-loss':loss_epo})


        #computing accuracy on validation set
        y_valid_predicted = self.forward_prop(x_valid)
        valid_accuracy = accuracy_score(np.argmax(y_valid_predicted, axis=1), np.argmax(y_valid, axis=1))
        print(f'Epoch Number {epoch+1}, validation accuracy: {valid_accuracy:.4f}')
        wandb.log({'val_accuracy':valid_accuracy*100})
        wandb.log({'epoch':epoch+1})

        #computing validation loss
        val_loss = loss_func(y_valid, y_valid_predicted)
        print(f'Epoch Number {epoch+1}, validation loss: {val_loss:.4f}')
        wandb.log({'val-loss':val_loss})

      #checking the efficiency of the model by passing test set
      y_test_predicted = self.forward_prop(x_test)
      #return y_test_predicted

      test_accuracy = accuracy_score(np.argmax(y_test_predicted, axis = 1), np.argmax(y_test, axis = 1))
      print(f'Test Accuracy: {test_accuracy:.4f}')



    def train_rmsprop(self, x_train, y_train, learning_rate_eta, total_epochs, batch_size):
      for epoch in range(total_epochs):
        loss_epo = 0
        accuracy = 0
        for i in range(0, x_train.shape[0], batch_size):
          Xbatch = x_train[i:i+batch_size]
          Ybatch = y_train[i:i+batch_size]
          y_predicted = self.forward_prop(Xbatch)

          #calculate loss
          loss = loss_func(Ybatch, y_predicted)
          loss_epo = loss_epo + loss

          #calculate accuracy
          acc = accuracy_score(np.argmax(y_predicted, axis=1), np.argmax(Ybatch, axis=1))
          accuracy = accuracy + acc

          #backpropagation
          loss_gradient = y_predicted-Ybatch
          for j in range(self.total_layers-1, 0, -1):
            gradient_W = np.dot(self.activations_H[j-1].T, loss_gradient)
            gradient_b = np.sum(loss_gradient, axis=0, keepdims=True)

            if j > 1:
              derivative_activation = activation_derivative(self.pre_activations_A[j-1], self.activation_function)
              loss_gradient = np.dot(loss_gradient, self.Weights[j-1].T)*derivative_activation
            '''
            #compute 1st momentum term
            self.Weights_moment_adam1[j-1] = self.beta1 * self.Weights_moment_adam1[j-1] + (1-self.beta1) * gradient_W
            self.biases_moment_adam1[j-1] = self.beta1 * self.biases_moment_adam1[j-1] + (1-self.beta1) * gradient_b'''

            #compute moment term
            self.Weights_moment_adam2[j-1] = self.beta2 * self.Weights_moment_adam2[j-1] + (1-self.beta2) * np.square(gradient_W)
            self.biases_moment_adam2[j-1] = self.beta2 * self.biases_moment_adam2[j-1] + (1-self.beta2) * np.square(gradient_b)
            '''
            #corrected terms in 1st moment
            corrected_weight_adam1 = self.Weights_moment_adam1[j-1] / (1-self.beta1 ** (epoch+1))
            corrected_bias_adam1 = self.biases_moment_adam1[j-1] / (1-self.beta1 ** (epoch+1))

            #corrected terms in 2nd moment
            corrected_weight_adam2 = self.Weights_moment_adam2[j-1] / (1-self.beta2 ** (epoch+1))
            corrected_bias_adam2 = self.biases_moment_adam2[j-1] / (1-self.beta2 ** (epoch+1))'''

            #updating weights and biases
            '''
            self.Weights[j-1] = self.Weights[j-1] - learning_rate_eta * corrected_weight_adam1 / (np.sqrt(corrected_weight_adam2)+self.epsilon)
            self.biases[j-1] = self.biases[j-1] - learning_rate_eta * corrected_bias_adam1 / (np.sqrt(corrected_bias_adam2)+self.epsilon)'''

            self.Weights[j-1] = self.Weights[j-1] - learning_rate_eta * gradient_W / (np.sqrt(self.Weights_moment_adam2[j-1]) + self.epsilon)
            self.biases[j-1] = self.biases[j-1] - learning_rate_eta * gradient_b / (np.sqrt(self.biases_moment_adam2[j-1]) + self.epsilon)

        #computing average train accuracy
        training_accuracy = accuracy / (x_train.shape[0] / batch_size)
        print(f'Epoch Number {epoch+1}, training accuracy: {training_accuracy:.4f}')
        wandb.log({'train-accuracy':training_accuracy*100})


        #computing average epoch(training loss) loss
        loss_epo = loss_epo / (x_train.shape[0] / batch_size)
        print(f'Epoch Number {epoch+1}, training loss: {loss_epo:.4f}')
        wandb.log({'train-loss':loss_epo})
        #self.train_loss_sgd.append(loss_epo)

        #computing accuracy on validation set
        y_valid_predicted = self.forward_prop(x_valid)
        valid_accuracy = accuracy_score(np.argmax(y_valid_predicted, axis=1), np.argmax(y_valid, axis=1))
        print(f'Epoch Number {epoch+1}, validation accuracy: {valid_accuracy:.4f}')
        wandb.log({'val_accuracy':valid_accuracy*100})
        wandb.log({'epoch':epoch+1})

        #computing validation loss
        val_loss = loss_func(y_valid, y_valid_predicted)
        print(f'Epoch Number {epoch+1}, validation loss: {val_loss:.4f}')
        wandb.log({'val-loss':val_loss})

      #checking the efficiency of the model by passing test set
      y_test_predicted = self.forward_prop(x_test)
      test_accuracy = accuracy_score(np.argmax(y_test_predicted, axis = 1), np.argmax(y_test, axis = 1))
      print(f'Test Accuracy: {test_accuracy:.4f}')



    def train_nadam(self, x_train, y_train, learning_rate_eta, total_epochs, batch_size):
      for epoch in range(total_epochs):
        loss_epo = 0
        accuracy = 0
        for i in range(0, x_train.shape[0], batch_size):
          Xbatch = x_train[i:i+batch_size]
          Ybatch = y_train[i:i+batch_size]
          y_predicted = self.forward_prop(Xbatch)

          #calculate loss
          loss = loss_func(Ybatch, y_predicted)
          loss_epo = loss_epo + loss

          #calculate accuracy
          acc = accuracy_score(np.argmax(y_predicted, axis=1), np.argmax(Ybatch, axis=1))
          accuracy = accuracy + acc

          #backpropagation
          loss_gradient = y_predicted-Ybatch
          for j in range(self.total_layers-1, 0, -1):
            gradient_W = np.dot(self.activations_H[j-1].T, loss_gradient)
            gradient_b = np.sum(loss_gradient, axis=0, keepdims=True)

            if j > 1:
              derivative_activation = activation_derivative(self.pre_activations_A[j-1], self.activation_function)
              loss_gradient = np.dot(loss_gradient, self.Weights[j-1].T)*derivative_activation

            #compute 1st momentum term
            self.Weights_moment_nadam1[j-1] = self.beta1 * self.Weights_moment_nadam1[j-1] + (1-self.beta1) * gradient_W
            self.biases_moment_nadam1[j-1] = self.beta1 * self.biases_moment_nadam1[j-1] + (1-self.beta1) * gradient_b

            #compute 2nd moment term
            self.Weights_moment_nadam2[j-1] = self.beta2 * self.Weights_moment_nadam2[j-1] + (1-self.beta2) * np.square(gradient_W)
            self.biases_moment_nadam2[j-1] = self.beta2 * self.biases_moment_nadam2[j-1] + (1-self.beta2) * np.square(gradient_b)

            #corrected terms in 1st moment
            corrected_weight_nadam1 = self.Weights_moment_nadam1[j-1] / (1-self.beta1 ** (epoch+1))
            corrected_bias_nadam1 = self.biases_moment_nadam1[j-1] / (1-self.beta1 ** (epoch+1))

            #corrected terms in 2nd moment
            corrected_weight_nadam2 = self.Weights_moment_nadam2[j-1] / (1-self.beta2 ** (epoch+1))
            corrected_bias_nadam2 = self.biases_moment_nadam2[j-1] / (1-self.beta2 ** (epoch+1))

            #netsrov momentum update
            momentum_updated_weight = self.beta1*corrected_weight_nadam1 + ((1-self.beta1)*gradient_W) / (1 - self.beta1**(epoch+1))
            momentum_updated_bias = self.beta1 * corrected_bias_nadam1 + ((1-self.beta1)*gradient_b) / (1 - self.beta1**(epoch+1))

            #updating weights and biases
            self.Weights[j-1] = self.Weights[j-1] - learning_rate_eta * momentum_updated_weight / (np.sqrt(corrected_weight_nadam2)+self.epsilon)
            self.biases[j-1] = self.biases[j-1] - learning_rate_eta * momentum_updated_bias / (np.sqrt(corrected_bias_nadam2)+self.epsilon)

        #computing average train accuracy
        training_accuracy = accuracy / (x_train.shape[0] / batch_size)
        print(f'Epoch Number {epoch+1}, training accuracy: {training_accuracy:.4f}')
        wandb.log({'train-accuracy':training_accuracy*100})


        #computing average epoch(training loss) loss
        loss_epo = loss_epo / (x_train.shape[0] / batch_size)
        print(f'Epoch Number {epoch+1}, training loss: {loss_epo:.4f}')
        #self.train_loss_sgd.append(loss_epo)
        wandb.log({'train-loss':loss_epo})

        #computing accuracy on validation set
        y_valid_predicted = self.forward_prop(x_valid)
        valid_accuracy = accuracy_score(np.argmax(y_valid_predicted, axis=1), np.argmax(y_valid, axis=1))
        print(f'Epoch Number {epoch+1}, validation accuracy: {valid_accuracy:.4f}')
        wandb.log({'val_accuracy':valid_accuracy*100})
        wandb.log({'epoch':epoch+1})

        #computing validation loss
        val_loss = loss_func(y_valid, y_valid_predicted)
        print(f'Epoch Number {epoch+1}, validation loss: {val_loss:.4f}')
        wandb.log({'val-loss':val_loss})

      #checking the efficiency of the model by passing test set
      y_test_predicted = self.forward_prop(x_test)
      test_accuracy = accuracy_score(np.argmax(y_test_predicted, axis = 1), np.argmax(y_test, axis = 1))
      print(f'Test Accuracy: {test_accuracy:.4f}')



In [4]:
!pip install wandb

Collecting wandb
  Downloading wandb-0.16.4-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.42-py3-none-any.whl (195 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m195.4/195.4 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-1.42.0-py2.py3-none-any.whl (263 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m263.5/263.5 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)
Collecting gitdb<5,>=4.0.1 (from GitPython!=3.1.29,>=1.0.0->w

In [5]:
import wandb
import numpy as np
from types import SimpleNamespace
import random

In [6]:
wandb.login(key='cd7a6c2259e8886dc269bbf6f0f9e55089d3beeb')

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [7]:
#for best combination

# You need to define a config file in the form of dictionary or yaml
sweep_config = {
    'method': 'random',
    'name' : 'best comb',
    'metric': {
      'name': 'val_accuracy',
      'goal': 'maximize'
    },
    'parameters': {
        'epochs': {
            'values': [10]
        },
        'hidden_layers':{
            'values':[5]
        },
        'optimizer': {
            'values':[ 'adam']
        },
         'hidden_size':{
            'values':[128]
        },
        'batch_size': {
            'values':[16]
        },
        'learning_rate': {
            'values':[1e-3]
        },
        'weight_init': {
           'values' :['xavier']
        },
        'activation': {
            'values': ['tanh']
        },
        'weight_decay': {
            'values': [0]
        },
    }
}

sweep_id = wandb.sweep(sweep=sweep_config, project='Deep_Learning_Assignment1')

Create sweep with ID: ehcdyzno
Sweep URL: https://wandb.ai/prabhat-kumar/Deep_Learning_Assignment1/sweeps/ehcdyzno


In [9]:
def main():
    '''
    WandB calls main function each time with differnet combination.

    We can retrive the same and use the same values for our hypermeters.

    '''


    with wandb.init(entity = 'prabhat-kumar') as run:

        run_name="-ac_"+wandb.config.activation+"-hs"+str(wandb.config.hidden_size)+'-wi'+wandb.config.weight_init+'-hl'+str(wandb.config.hidden_layers)+'-op'+wandb.config.optimizer+'-ep'+str(wandb.config.epochs)+'lr'+str(wandb.config.learning_rate)+'bs'+str(wandb.config.batch_size) +'wd'+str(wandb.config.weight_decay)
        wandb.run.name=run_name

        model = Network([784,wandb.config.hidden_size,10], wandb.config.weight_init, wandb.config.activation, 0.9, 0.9, 0.999)

        if wandb.config.optimizer == 'nesterov':
          model.train_nag(x_train, y_train, wandb.config.learning_rate, wandb.config.epochs, wandb.config.batch_size)
        if wandb.config.optimizer == 'momentum':
          model.train_momentum(x_train, y_train, wandb.config.learning_rate, wandb.config.epochs, wandb.config.batch_size)
        if wandb.config.optimizer == 'sgd':
          model.train_sgd(x_train, y_train, wandb.config.learning_rate, wandb.config.epochs, wandb.config.batch_size)
        if wandb.config.optimizer == 'adam':
          model.train_adam(x_train, y_train, wandb.config.learning_rate, wandb.config.epochs, wandb.config.batch_size)
        if wandb.config.optimizer == 'rmsprop':
          model.train_rmsprop(x_train, y_train, wandb.config.learning_rate, wandb.config.epochs, wandb.config.batch_size)
        if wandb.config.optimizer == 'nadam':
          model.train_nadam(x_train, y_train, wandb.config.learning_rate, wandb.config.epochs, wandb.config.batch_size)

wandb.agent(sweep_id, function=main,count=1) # calls main function for count number of times.
wandb.finish()

[34m[1mwandb[0m: Agent Starting Run: 9js8zqny with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_layers: 5
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: xavier


Epoch Number 1, training accuracy: 0.9181
Epoch Number 1, training loss: 0.2836
Epoch Number 1, validation accuracy: 0.9539
Epoch Number 1, validation loss: 0.1706
Epoch Number 2, training accuracy: 0.9550
Epoch Number 2, training loss: 0.1558
Epoch Number 2, validation accuracy: 0.9630
Epoch Number 2, validation loss: 0.1391
Epoch Number 3, training accuracy: 0.9641
Epoch Number 3, training loss: 0.1258
Epoch Number 3, validation accuracy: 0.9667
Epoch Number 3, validation loss: 0.1229
Epoch Number 4, training accuracy: 0.9700
Epoch Number 4, training loss: 0.1067
Epoch Number 4, validation accuracy: 0.9686
Epoch Number 4, validation loss: 0.1126
Epoch Number 5, training accuracy: 0.9741
Epoch Number 5, training loss: 0.0929
Epoch Number 5, validation accuracy: 0.9699
Epoch Number 5, validation loss: 0.1053
Epoch Number 6, training accuracy: 0.9776
Epoch Number 6, training loss: 0.0821
Epoch Number 6, validation accuracy: 0.9711
Epoch Number 6, validation loss: 0.0998
Epoch Number 7, 

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▂▃▃▄▅▆▆▇█
train-accuracy,▁▅▆▆▇▇▇███
train-loss,█▄▃▃▂▂▂▁▁▁
val-loss,█▅▄▃▃▂▂▁▁▁
val_accuracy,▁▄▅▆▆▇▇▇██

0,1
epoch,10.0
train-accuracy,98.646
train-loss,0.05374
val-loss,0.0866
val_accuracy,97.48


In [11]:
#for 2nd best combination


# You need to define a config file in the form of dictionary or yaml
sweep_config = {
    'method': 'random',
    'name' : '2nd best comb',
    'metric': {
      'name': 'val_accuracy',
      'goal': 'maximize'
    },
    'parameters': {
        'epochs': {
            'values': [10]
        },
        'hidden_layers':{
            'values':[4]
        },
        'optimizer': {
            'values':[ 'rmsprop']
        },
         'hidden_size':{
            'values':[128]
        },
        'batch_size': {
            'values':[32]
        },
        'learning_rate': {
            'values':[1e-3]
        },
        'weight_init': {
           'values' :['xavier']
        },
        'activation': {
            'values': ['tanh']
        },
        'weight_decay': {
            'values': [0]
        },
    }
}

sweep_id = wandb.sweep(sweep=sweep_config, project='Deep_Learning_Assignment1')

Create sweep with ID: bp0k023t
Sweep URL: https://wandb.ai/prabhat-kumar/Deep_Learning_Assignment1/sweeps/bp0k023t


In [12]:
def main():
    '''
    WandB calls main function each time with differnet combination.

    We can retrive the same and use the same values for our hypermeters.

    '''


    with wandb.init(entity = 'prabhat-kumar') as run:

        run_name="-ac_"+wandb.config.activation+"-hs"+str(wandb.config.hidden_size)+'-wi'+wandb.config.weight_init+'-hl'+str(wandb.config.hidden_layers)+'-op'+wandb.config.optimizer+'-ep'+str(wandb.config.epochs)+'lr'+str(wandb.config.learning_rate)+'bs'+str(wandb.config.batch_size) +'wd'+str(wandb.config.weight_decay)
        wandb.run.name=run_name

        model = Network([784,wandb.config.hidden_size,10], wandb.config.weight_init, wandb.config.activation, 0.9, 0.9, 0.999)

        if wandb.config.optimizer == 'nesterov':
          model.train_nag(x_train, y_train, wandb.config.learning_rate, wandb.config.epochs, wandb.config.batch_size)
        if wandb.config.optimizer == 'momentum':
          model.train_momentum(x_train, y_train, wandb.config.learning_rate, wandb.config.epochs, wandb.config.batch_size)
        if wandb.config.optimizer == 'sgd':
          model.train_sgd(x_train, y_train, wandb.config.learning_rate, wandb.config.epochs, wandb.config.batch_size)
        if wandb.config.optimizer == 'adam':
          model.train_adam(x_train, y_train, wandb.config.learning_rate, wandb.config.epochs, wandb.config.batch_size)
        if wandb.config.optimizer == 'rmsprop':
          model.train_rmsprop(x_train, y_train, wandb.config.learning_rate, wandb.config.epochs, wandb.config.batch_size)
        if wandb.config.optimizer == 'nadam':
          model.train_nadam(x_train, y_train, wandb.config.learning_rate, wandb.config.epochs, wandb.config.batch_size)

wandb.agent(sweep_id, function=main,count=1) # calls main function for count number of times.
wandb.finish()

[34m[1mwandb[0m: Agent Starting Run: q5kdkukl with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_layers: 4
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: xavier


Epoch Number 1, training accuracy: 0.9361
Epoch Number 1, training loss: 0.2241
Epoch Number 1, validation accuracy: 0.9616
Epoch Number 1, validation loss: 0.1391
Epoch Number 2, training accuracy: 0.9696
Epoch Number 2, training loss: 0.1092
Epoch Number 2, validation accuracy: 0.9684
Epoch Number 2, validation loss: 0.1110
Epoch Number 3, training accuracy: 0.9795
Epoch Number 3, training loss: 0.0778
Epoch Number 3, validation accuracy: 0.9720
Epoch Number 3, validation loss: 0.0976
Epoch Number 4, training accuracy: 0.9854
Epoch Number 4, training loss: 0.0575
Epoch Number 4, validation accuracy: 0.9726
Epoch Number 4, validation loss: 0.0893
Epoch Number 5, training accuracy: 0.9904
Epoch Number 5, training loss: 0.0428
Epoch Number 5, validation accuracy: 0.9741
Epoch Number 5, validation loss: 0.0833
Epoch Number 6, training accuracy: 0.9939
Epoch Number 6, training loss: 0.0314
Epoch Number 6, validation accuracy: 0.9745
Epoch Number 6, validation loss: 0.0803
Epoch Number 7, 

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▂▃▃▄▅▆▆▇█
train-accuracy,▁▅▆▆▇▇████
train-loss,█▄▃▃▂▂▁▁▁▁
val-loss,█▅▃▂▂▁▁▁▁▂
val_accuracy,▁▄▆▆▇▇▇███

0,1
epoch,10.0
train-accuracy,99.984
train-loss,0.0076
val-loss,0.08313
val_accuracy,97.74


In [15]:
#for 3rd best combination


# You need to define a config file in the form of dictionary or yaml
sweep_config = {
    'method': 'random',
    'name' : '3rd best comb',
    'metric': {
      'name': 'val_accuracy',
      'goal': 'maximize'
    },
    'parameters': {
        'epochs': {
            'values': [10]
        },
        'hidden_layers':{
            'values':[3]
        },
        'optimizer': {
            'values':[ 'nadam']
        },
         'hidden_size':{
            'values':[128]
        },
        'batch_size': {
            'values':[64]
        },
        'learning_rate': {
            'values':[1e-3]
        },
        'weight_init': {
           'values' :['xavier']
        },
        'activation': {
            'values': ['tanh']
        },
        'weight_decay': {
            'values': [0]
        },
    }
}

sweep_id = wandb.sweep(sweep=sweep_config, project='Deep_Learning_Assignment1')

Create sweep with ID: rt8vu9ze
Sweep URL: https://wandb.ai/prabhat-kumar/Deep_Learning_Assignment1/sweeps/rt8vu9ze


In [14]:
def main():
    '''
    WandB calls main function each time with differnet combination.

    We can retrive the same and use the same values for our hypermeters.

    '''


    with wandb.init(entity = 'prabhat-kumar') as run:

        run_name="-ac_"+wandb.config.activation+"-hs"+str(wandb.config.hidden_size)+'-wi'+wandb.config.weight_init+'-hl'+str(wandb.config.hidden_layers)+'-op'+wandb.config.optimizer+'-ep'+str(wandb.config.epochs)+'lr'+str(wandb.config.learning_rate)+'bs'+str(wandb.config.batch_size) +'wd'+str(wandb.config.weight_decay)
        wandb.run.name=run_name

        model = Network([784,wandb.config.hidden_size,10], wandb.config.weight_init, wandb.config.activation, 0.9, 0.9, 0.999)

        if wandb.config.optimizer == 'nesterov':
          model.train_nag(x_train, y_train, wandb.config.learning_rate, wandb.config.epochs, wandb.config.batch_size)
        if wandb.config.optimizer == 'momentum':
          model.train_momentum(x_train, y_train, wandb.config.learning_rate, wandb.config.epochs, wandb.config.batch_size)
        if wandb.config.optimizer == 'sgd':
          model.train_sgd(x_train, y_train, wandb.config.learning_rate, wandb.config.epochs, wandb.config.batch_size)
        if wandb.config.optimizer == 'adam':
          model.train_adam(x_train, y_train, wandb.config.learning_rate, wandb.config.epochs, wandb.config.batch_size)
        if wandb.config.optimizer == 'rmsprop':
          model.train_rmsprop(x_train, y_train, wandb.config.learning_rate, wandb.config.epochs, wandb.config.batch_size)
        if wandb.config.optimizer == 'nadam':
          model.train_nadam(x_train, y_train, wandb.config.learning_rate, wandb.config.epochs, wandb.config.batch_size)

wandb.agent(sweep_id, function=main,count=1) # calls main function for count number of times.
wandb.finish()

[34m[1mwandb[0m: Agent Starting Run: kcfu3f17 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_layers: 3
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: xavier


Epoch Number 1, training accuracy: 0.9132
Epoch Number 1, training loss: 0.2989
Epoch Number 1, validation accuracy: 0.9542
Epoch Number 1, validation loss: 0.1729
Epoch Number 2, training accuracy: 0.9533
Epoch Number 2, training loss: 0.1662
Epoch Number 2, validation accuracy: 0.9612
Epoch Number 2, validation loss: 0.1465
Epoch Number 3, training accuracy: 0.9619
Epoch Number 3, training loss: 0.1386
Epoch Number 3, validation accuracy: 0.9638
Epoch Number 3, validation loss: 0.1323
Epoch Number 4, training accuracy: 0.9674
Epoch Number 4, training loss: 0.1211
Epoch Number 4, validation accuracy: 0.9653
Epoch Number 4, validation loss: 0.1227
Epoch Number 5, training accuracy: 0.9712
Epoch Number 5, training loss: 0.1080
Epoch Number 5, validation accuracy: 0.9672
Epoch Number 5, validation loss: 0.1155
Epoch Number 6, training accuracy: 0.9740
Epoch Number 6, training loss: 0.0975
Epoch Number 6, validation accuracy: 0.9683
Epoch Number 6, validation loss: 0.1097
Epoch Number 7, 

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▂▃▃▄▅▆▆▇█
train-accuracy,▁▅▆▆▇▇▇███
train-loss,█▄▃▃▂▂▂▁▁▁
val-loss,█▆▄▄▃▂▂▂▁▁
val_accuracy,▁▄▅▅▆▆▇▇██

0,1
epoch,10.0
train-accuracy,98.238
train-loss,0.06947
val-loss,0.09459
val_accuracy,97.28
