<a href="https://colab.research.google.com/github/oikn2018/CS6910_assignment_1/blob/main/Q7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
#importing required packages
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from keras.datasets import fashion_mnist
# import wandb
import warnings
warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning) 

np.random.seed(42)

In [4]:
# Load the fashion_mnist dataset
(x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()

# Split the training data into training and validation sets
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.1, shuffle=True)

print(f"Training data shape: {x_train.shape}, Training label shape: {y_train.shape}")
print(f"Validation data shape: {x_val.shape}, Validation label shape: {y_val.shape}")

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz
Training data shape: (54000, 28, 28), Training label shape: (54000,)
Validation data shape: (6000, 28, 28), Validation label shape: (6000,)


In [5]:
# Reshape the input data for training, validation, and testing sets
X_train = np.reshape(x_train, (x_train.shape[0], -1)).T
X_val = np.reshape(x_val, (x_val.shape[0], -1)).T
X_test = np.reshape(x_test, (x_test.shape[0], -1)).T

# Normalize the input data to have values between 0 and 1
X_train = X_train / 255.
X_val = X_val / 255.
X_test = X_test / 255.

# Convert the target labels into one-hot encoded vectors
Y_train = np.eye(np.max(y_train) + 1)[y_train].T
Y_val = np.eye(np.max(y_val) + 1)[y_val].T
Y_test = np.eye(np.max(y_test) + 1)[y_test].T

print(f"Training data shape: {X_train.shape}, Training label shape: {Y_train.shape}")
print(f"Validation data shape: {X_val.shape}, Validation label shape: {Y_val.shape}")
print(f"Testing data shape: {X_test.shape}, Testing label shape: {Y_test.shape}")

Training data shape: (784, 54000), Training label shape: (10, 54000)
Validation data shape: (784, 6000), Validation label shape: (10, 6000)
Testing data shape: (784, 10000), Testing label shape: (10, 10000)


In [6]:
class FeedForwardNN:
    def __init__(self,config=None,epochs=20,hidden_layers=[512, 512, 512, 512, 512],weight_decay=0,learning_rate=0.005,optimizer='nadam',batch_size=64,weight_initialization='xavier',activations='sigmoid',loss_function='cross-entropy',output_function='softmax',gamma=0.9,beta=0.9,beta1=0.9,beta2=0.999,eps=1e-8):
        
        # self.run_name = "lr_{}_ac_{}_in_{}_op_{}_bs_{}_ep_{}_nn_{}_nh_{}".format(self.learning_rate, self.activations, self.weight_initialization, self.optimizer, self.batch_size, self.epochs, self.hidden_layers, len(self.hidden_layers))

        # Constructor that initializes the neural network
        if config is not None:
            # If a config dictionary is passed, use its values to initialize the parameters
            self.epochs = config["epochs"]
            self.learning_rate = config["learning_rate"]
            self.weight_decay = config["weight_decay"]
            self.loss_function = config["loss_function"]
            self.optimizer = config["optimizer"]
            self.batch_size = config["batch_size"]
            self.weight_initialization = config["weight_initialization"]
            self.activations = config["activations"]
            self.hidden_layers = [config["hidden_layers_size"] for x in range(config["no_hidden_layers"])]
        else:
            # If no config dictionary is passed, use the default values to initialize the parameters
            self.epochs = epochs
            self.learning_rate = learning_rate
            self.weight_decay = weight_decay
            self.loss_function = loss_function
            self.optimizer = optimizer
            self.batch_size = batch_size
            self.weight_initialization = weight_initialization
            self.activations = activations
            self.hidden_layers = hidden_layers

        # Set the remaining parameters for the neural network
        # self.loss_function = loss_function
        self.output_function = output_function
        self.gamma = gamma
        self.beta = beta
        self.beta1 = beta1
        self.beta2 = beta2
        self.eps = eps

        self.run_name = "loss_{}_lr_{}_ac_{}_in_{}_op_{}_bs_{}_ep_{}_nn_{}".format(self.loss_function, self.learning_rate, self.activations, self.weight_initialization, self.optimizer, self.batch_size, self.epochs, self.hidden_layers)
        # Initialize the neural network
        self.initialize()

    def initialize(self):
        # Set the number of neurons in each layer of the neural network
        layers = self.hidden_layers + [Y_train.shape[0]]

        # Initialize the weights and biases for each layer of the neural network
        self.theta = self.initialize_parameters(X_train.shape[0],layers,self.weight_initialization)

        # Calculate the regularization parameter
        self.lambd = self.weight_decay/self.learning_rate

        # Set the number of layers in the neural network
        self.L = len(layers)



    def sigmoid(self,x):
        return (1.0 / (1.0 + np.exp(-x)) )
    
    def tanh(self,x):
        return np.tanh(x)

    def relu(self,x):
        return np.maximum(0, x)

    def d_sigmoid(self,x):
        return (self.sigmoid(x)*(1.0 - self.sigmoid(x)))
    
    def d_tanh(self,x):
        return 1.0 - (self.tanh(x)**2)

    def d_relu(self,x):
        return np.greater(x, 0).astype(int)

    def activation(self,x,activation_function='sigmoid'):
        if activation_function == 'sigmoid':
              return self.sigmoid(x)
        elif activation_function == 'tanh':
              return self.tanh(x)
        elif activation_function == 'relu':
              return self.relu(x)

    def d_activation(self,x,activation_function='sigmoid'):
        if activation_function == 'sigmoid':
              return self.d_sigmoid(x)
        elif activation_function == 'tanh':
              return self.d_tanh(x)
        elif activation_function=='relu':
              return self.d_relu(x)


    def softmax(self,x):
        e = np.exp(x)
        return e / np.sum(e,axis=0)

    def output(self, x, output_function='softmax'):
        if output_function == 'softmax':
            return self.softmax(x)

    def cross_entropy_error(self, Y, inputs):
        Y_hat = inputs[1][-1]
        return -1 * np.sum(Y * (np.log(Y_hat)))
    
    def squared_error(self, Y, inputs):
        Y_hat = inputs[1][-1]
        return (1 / 2) * np.sum((Y_hat - Y) ** 2)

    def error(self, Y, inputs, loss_function='cross-entropy'):
        if loss_function == 'cross-entropy':
            return self.cross_entropy_error(Y, inputs)
        elif loss_function == 'squared-error':
            return self.squared_error(Y, inputs) 

    def squared_error_val(self, Y, inputs):
        Y_hat = inputs[1][-1]
        W, B = self.theta
        m = Y.shape[1]
        return (1 / (2 * m)) * np.sum((Y_hat - Y) ** 2) + (self.lambd / (2 * m)) * (self.frobenius(W ** 2) + self.frobenius(B ** 2))

    def cross_entropy_error_val(self, Y, inputs):
        W, B = self.theta
        Y_hat = inputs[1][-1]
        m = Y.shape[1]
        return (-1/m) * np.sum(Y * (np.log(Y_hat))) + (self.lambd/(2*m)) * (self.frobenius(W ** 2) + self.frobenius(B ** 2))

    def val_error(self, Y, inputs, loss_function='cross-entropy'):
        if loss_function == 'cross-entropy':
            return self.cross_entropy_error_val(Y, inputs)
        if loss_function == 'squared-error':
            return self.squared_error_val(Y, inputs)

#     def frobenius(self, M):
#         return np.sqrt(np.sum(np.square(M)))

    def initialize_params_random(self, n, layers):
        L = len(layers)
        biases = [np.float128(np.zeros((layers[i], 1))) for i in range(L)]
        weights = [np.float128(np.random.randn(layers[i], n) if i == 0 else np.random.randn(layers[i], layers[i - 1])) for i in range(L)]
        return (np.array(weights), np.array(biases))

    def initialize_params_xavier(self, n, layers):
        
        L=len(layers)
        biases = []
        weights = []
        for i in range(L):
            bias=np.float128(np.zeros((layers[i], 1)))
            if i==0:
                 weight=np.float128(np.random.randn(layers[i],n))
            else:
                 weight = np.float128(np.random.randn(layers[i],layers[i - 1]) * np.sqrt(1 / layers[i - 1]))
            biases.append(bias)
            weights.append(weight)

        return (np.array(weights),np.array(biases))
#         biases = [np.float128(np.zeros((layer, 1))) for layer in layers]
#         weights = [np.float128(np.random.randn(layers[i], n if i == 0 else layers[i - 1]) * np.sqrt(1 / (n if i == 0 else layers[i - 1]))) for i in range(len(layers))]
#         return np.array(weights), np.array(biases)

    def initialize_parameters(self,n,layers,param_init_type):
        if param_init_type == 'random':
            return self.initialize_params_random(n,layers)
        elif param_init_type == 'xavier':
            return self.initialize_params_xavier(n,layers)

    def frobenius(self,X):
        s=0
        for x in X:
          s += np.sum(x)
        return s
#     def frobenius(self, X):
#         return np.linalg.norm(X, ord='fro')

    def feedforward(self,X,theta,L):
        H = X
        weights ,biases = theta
        activations = []
        pre_activations = []
        for k in range(L-1):
              A = biases[k] + (weights[k] @ H)
              H = self.activation(A,self.activations)
              pre_activations.append(A)
              activations.append(H)
        
        AL = biases[L-1] + (weights[L-1] @ H)
        Y_hat = self.output(AL,self.output_function)
        pre_activations.append(AL)
        activations.append(Y_hat)
        return (np.array(pre_activations),np.array(activations))

    def backprop(self,X,Y,inputs,theta,batch_size,L):
        # Initialize empty lists for storing gradients
        d_biases, d_weights = [], []
        d_biases2 = []
        d_weights2 = []
        # Extract pre-activations and activations from the inputs
        pre_activations, activations = inputs
        # Get the predicted output
        Y_hat = activations[-1]
#         # Retrieve the weights and biases from the current model parameters
#         weights, biases = theta

        if self.loss_function == 'squared-error':
          d_AL = Y_hat*(Y_hat - Y)*(1 - Y_hat)
        elif self.loss_function == 'cross-entropy':
          d_AL = Y_hat - Y
        # Loop over the layers in reverse order to calculate the gradients
        for k in range(L-1, -1, -1):
            # Calculate the gradients for the weights and biases
            d_W = (1/batch_size)*(d_AL @ activations[k-1].T) if k > 0 else (1/batch_size)*(d_AL @ X.T)
            d_W2 = (1 / batch_size) * (d_AL ** 2 @ (activations[k-1].T) ** 2) if k>0 else (1 / batch_size) * (d_AL ** 2 @ (X.T) ** 2)
            d_B = (1/batch_size)*np.sum(d_AL, axis=1, keepdims=True)
            d_B2 = (1 / batch_size) * np.sum(d_AL ** 2, axis=1, keepdims=True)

            # Calculate the derivative of the activation function and backpropagate the error to the previous layer
            if k > 0:
                d_AL = (theta[0][k].T @ d_AL) * self.d_activation(pre_activations[k-1], self.activations)
            # Add the gradients to the lists
            d_weights.insert(0, d_W)
            d_biases.insert(0, d_B)
            d_weights2.insert(0, d_W2)
            d_biases2.insert(0, d_B2)
        d_theta = (np.array(d_weights),np.array(d_biases))
        d_theta2 = (np.array(d_weights2), np.array(d_biases2))
        
        return (d_theta, d_theta2)

    # Function to update weights and biases based on the calculated gradients and learning rate
    def update_params(self, theta, d_theta, learning_rate):
        weights, biases = theta
        d_weights, d_biases = d_theta
        updated_weights = (1 - self.weight_decay)*weights - learning_rate*d_weights
        updated_biases = (1 - self.weight_decay)*biases - learning_rate*d_biases
        return updated_weights, updated_biases

    # Function to perform mini-batch gradient descent on the given data
    def sgd(self, X, Y, theta, learning_rate, batch_size, L):
        m = X.shape[1]
        total_error = 0
        # loop over mini-batches
        for i in range(0, m, batch_size):
            start = i
            stop = i + batch_size
            inputs = self.feedforward(X[:, start:stop], theta, L) # compute output of the network
            W, B = theta
            # compute L2 regularization term
            regularization = (self.lambd / 2) * (self.frobenius(W**2) + self.frobenius(B**2) )
            # compute error
            total_error += self.error(Y[:, start:stop], inputs, self.loss_function) + regularization
            # compute gradients
            d_theta, _ = self.backprop(X[:, start:stop], Y[:, start:stop], inputs, theta, batch_size, L) 
            # update weights and biases                                                                    
            theta = self.update_params(theta, d_theta, learning_rate)
        # handle the last mini-batch if it is not a multiple of batch_size
        if m % batch_size != 0:
            start = m - m % batch_size
            stop = m
#             start = i*batch_size
            inputs = self.feedforward(X[:, start:stop], theta, L) # compute output of the network
            d_theta, _ = self.backprop(X[:, start:stop], Y[:, start:stop], inputs, theta, m % batch_size, L) # compute gradients
            theta = self.update_params(theta, d_theta, learning_rate) # update weights and biases
            W, B = theta

            regularization = (self.lambd / 2) * (self.frobenius(W**2) + self.frobenius(B**2) )
            total_error += self.error(Y[:, start:stop], inputs, self.loss_function) + regularization 
            
        # Calculate the average error
        avg_err = total_error/m
        # Return the updated theta and average error
        return (theta, avg_err)
    
    def update_params_momentum(self, theta, d_theta, learning_rate, gamma, prev_weight, prev_bias):
        weights, biases = theta
        d_weights,d_biases = d_theta
        
        # Calculate the velocity for weights and biases
        v_weight = gamma * prev_weight + learning_rate * d_weights
        v_bias = gamma * prev_bias + learning_rate * d_biases

        # Apply weight decay to the weights
        decay = (1 - self.weight_decay)

        # Update weights and biases using the velocity and decayed weights
        updated_weights = decay*weights - v_weight
        updated_biases = decay*biases - v_bias

        return (updated_weights, updated_biases, v_weight, v_bias)

    def gd_momentum(self, X, Y, theta, learning_rate, batch_size, gamma, L):
        m = X.shape[1] # number of training examples
        prev_weights = 0 # initialize previous weights to zero
        prev_biases = 0 # initialize previous biases to zero
        total_error = 0 # initialize total error to zero

        # loop over mini-batches
        for i in range(0, m, batch_size):
            start = i
            stop = i + batch_size
            inputs = self.feedforward(X[:, start:stop], theta, L) # compute output of the network
            W, B = theta
            # compute L2 regularization term
            regularization = (self.lambd / 2) * (self.frobenius(W**2) + self.frobenius(B**2) )
            # compute error
            total_error += self.error(Y[:, start:stop], inputs, self.loss_function) + regularization
            # compute gradients
            d_theta, _ = self.backprop(X[:, start:stop], Y[:, start:stop], inputs, theta, batch_size, L) 
            # update weights and biases using momentum                                                                     
            weights, biases, prev_weights, prev_biases = self.update_params_momentum(theta, d_theta, learning_rate, gamma, prev_weights, prev_biases)
            theta = weights, biases
        # handle the last mini-batch if it is not a multiple of batch_size
        if m % batch_size != 0:
            start = m - m % batch_size
            stop = m
#             start = i*batch_size
            inputs = self.feedforward(X[:, start:stop], theta, L) # compute output of the network
            d_theta = self.backprop(X[:, start:stop], Y[:, start:stop], inputs, theta, m % batch_size, L) # compute gradients
        
            weights, biases, prev_weights, prev_biases = self.update_params_momentum(theta, d_theta, learning_rate, gamma, prev_weights, prev_biases) # update weights and biases using momentum
            regularization = (self.lambd / 2) * (self.frobenius(weights**2) + self.frobenius(biases**2) )
            total_error += self.error(Y[:, start:stop], inputs, self.loss_function) + regularization 
            
            

            theta = weights, biases
        
        total_error /= m # average total error across all mini-batches
        return (theta, total_error) # return updated weights and biases and the total error

    def update_params_nesterov(self, theta, d_theta, learning_rate, gamma, prev_weight, prev_bias):
        weights, biases = theta
        d_weights,d_biases = d_theta
        
        # Calculate the velocity for weights and biases
        v_weight = gamma * prev_weight + learning_rate * d_weights
        v_bias = gamma * prev_bias + learning_rate * d_biases

        # Apply weight decay to the weights
        decay = (1 - self.weight_decay)

        # Update weights and biases using the velocity and decayed weights
        updated_weights = decay*weights - v_weight
        updated_biases = decay*biases - v_bias

        return (updated_weights, updated_biases, v_weight, v_bias)

    def gd_nesterov(self, X, Y, theta, learning_rate, batch_size, gamma, L):
        m = X.shape[1] # number of training examples
        prev_weights = 0 # initialize previous weights to zero
        prev_biases = 0 # initialize previous biases to zero
        total_error = 0 # initialize total error to zero

        weights, biases = theta
        # loop over mini-batches
        for i in range(0, m, batch_size):
            start = i
            stop = i + batch_size
            # compute output of the network
            inputs = self.feedforward(X[:, start:stop], theta, L) 
            W, B = theta
            # compute L2 regularization term
            regularization = (self.lambd / 2) * (self.frobenius(W**2) + self.frobenius(B**2) )
            # compute error
            total_error += self.error(Y[:, start:stop], inputs, self.loss_function) + regularization
            
            # Compute gradients using backpropagation
            v_weight=gamma*prev_weights
            v_biases=gamma*prev_biases
            theta2=weights-v_weight,biases-v_biases
            d_theta, _ = self.backprop(X[:, start:stop], Y[:, start:stop], inputs, theta2, batch_size, L) 
            
            # update weights and biases using momentum                                                                     
            weights, biases, prev_weights, prev_biases = self.update_params_nesterov(theta, d_theta, learning_rate, gamma, prev_weights, prev_biases)
            theta = weights, biases
        # handle the last mini-batch if it is not a multiple of batch_size
        if m % batch_size != 0:
            start = m - m % batch_size
            stop = m
            inputs = self.feedforward(X[:, start:stop], theta, L) # compute output of the network
            
            # Compute gradients using backpropagation
            v_weight=gamma*prev_weights
            v_biases=gamma*prev_biases
            theta2=weights-v_weight,biases-v_biases
            d_theta, _= self.backprop(X[:, start:stop], Y[:, start:stop], inputs, theta2, m % batch_size, L) # compute gradients
        
            weights, biases, prev_weights, prev_biases = self.update_params_nesterov(theta, d_theta, learning_rate, gamma, prev_weights, prev_biases) # update weights and biases using momentum
            regularization = (self.lambd / 2) * (self.frobenius(weights**2) + self.frobenius(biases**2) )
            total_error += self.error(Y[:, start:stop], inputs, self.loss_function) + regularization 

            theta = weights, biases
        
        total_error /= m # average total error across all mini-batches
        return (theta, total_error) # return updated weights and biases and the total error
    
    def update_params_rmsprop(self, theta, d_theta, d_theta2, prev_weights2, prev_biases2, learning_rate, beta, eps):
        weights, biases = theta
        d_weights, d_biases = d_theta
        d_weights2, d_biases2 = d_theta2

        # Compute the exponential moving averages of squared gradients
        prev_weights2 = beta * prev_weights2 + (1 - beta) * ((d_weights)**2)
        prev_biases2 = beta * prev_biases2 + (1 - beta) * ((d_biases)**2)

        # Compute the RMSProp update
        W_ = learning_rate / ((prev_weights2)**0.5 + eps)
        B_ = learning_rate / ((prev_biases2)**0.5 + eps)

        # Update the parameters
        # Apply weight decay to the weights
        decay = (1 - self.weight_decay)
        # Update weights and biases using the velocity and decayed weights
        updated_weights = decay * weights - W_ * d_weights
        updated_biases = decay * biases - B_ * d_biases

        return ((np.array(updated_weights), np.array(updated_biases)), prev_weights2, prev_biases2)

    def rmsprop(self, X, Y, theta, learning_rate, beta, eps, batch_size, L):
        m = X.shape[1] # number of training examples
        prev_weights2 = 0 # initialize previous weights to zero
        prev_biases2 = 0 # initialize previous biases to zero
        total_error = 0 # initialize total error to zero

        # loop over mini-batches
        for i in range(0, m, batch_size):
            start = i
            stop = i + batch_size
            # compute output of the network
            inputs = self.feedforward(X[:, start:stop], theta, L) 
            W, B = theta
            # compute L2 regularization term
            regularization = (self.lambd / 2) * (self.frobenius(W**2) + self.frobenius(B**2) )
            # compute error
            total_error += self.error(Y[:, start:stop], inputs, self.loss_function) + regularization
            
            # Compute gradients using backpropagation
            d_theta, d_theta2 = self.backprop(X[:, start:stop], Y[:, start:stop], inputs, theta, batch_size, L) 
            
            # update weights and biases using RMSProp                                                                    
            theta, prev_weights, prev_biases = self.update_params_rmsprop(theta, d_theta, d_theta2, prev_weights2, prev_biases2, learning_rate, beta, eps)

        # handle the last mini-batch if it is not a multiple of batch_size
        if m % batch_size != 0:
            start = m - m % batch_size
            stop = m
            inputs = self.feedforward(X[:, start:stop], theta, L) # compute output of the network
            
            # Compute gradients using backpropagation
            d_theta, d_theta2= self.backprop(X[:, start:stop], Y[:, start:stop], inputs, theta, batch_size, L) # compute gradients
        
            theta, prev_weights2, prev_biases2 = self.update_params_rmsprop(theta, d_theta, d_theta2, prev_weights2, prev_biases2, learning_rate, beta, eps)
            W, B = theta
            regularization = (self.lambd / 2) * (self.frobenius(W**2) + self.frobenius(B**2) )
            total_error += self.error(Y[:, start:stop], inputs, self.loss_function) + regularization 

        
        total_error /= m # average total error across all mini-batches
        return (theta, total_error) # return updated weights and biases and the total error
  

    def update_params_adam(self,theta,d_theta,d_theta2,prev_weights,prev_bias,prev_weights2,prev_biases2,learning_rate,beta1,beta2,eps,t):
        weights, biases = theta
        d_weights,d_biases = d_theta
        d_weights2,d_biases2 = d_theta2
        
        # update the exponentially weighted averages of the gradients
        prev_weights = beta1*prev_weights + (1-beta1)*d_weights
        prev_bias = beta1*prev_bias + (1-beta1)*d_biases

         # update the exponentially weighted averages of the squared gradients
        prev_weights2=beta2*prev_weights2 + (1-beta2)*(d_weights2)
        prev_biases2=beta2*prev_biases2 + (1-beta2)*(d_biases2)
    
        # bias correction to the weighted averages of the gradients
        corr_m_w = prev_weights/(1-(beta1**t))
        corr_m_b = prev_bias/(1-(beta1**t))

        # bias correction to the weighted averages of the squared gradients
        corr_v_w = prev_weights2/(1-(beta2**t))
        corr_v_b = prev_biases2/(1-(beta2**t))

        # calculate the update parameters using the bias-corrected averages of the gradients and squared gradients
        corr_v_w = learning_rate/((corr_v_w)**0.5 + eps)
        corr_v_b = learning_rate/((corr_v_b)**0.5 + eps)

        # update the weights and biases using the update parameters and L2 regularization
        weights = (1 - self.weight_decay)*weights - corr_v_w*corr_m_w
        biases = (1 - self.weight_decay)*biases - corr_v_b*corr_m_b

        theta = (np.array(weights),np.array(biases))
        return (theta,prev_weights,prev_bias,prev_weights2,prev_biases2)

    def adam(self,X,Y,theta,learning_rate,beta1,beta2,eps,batch_size,L):
        m = X.shape[1] # number of training examples
        prev_weights, prev_weights2 = 0,0 # initialize previous weights to zero
        prev_biases, prev_biases2 = 0,0 # initialize previous biases to zero
        total_error = 0 # initialize total error to zero

        # loop over mini-batches
        for i in range(0, m, batch_size):
            start = i
            stop = i + batch_size
            # compute output of the network
            inputs = self.feedforward(X[:, start:stop], theta, L) 
            W, B = theta
            # compute L2 regularization term
            regularization = (self.lambd / 2) * (self.frobenius(W**2) + self.frobenius(B**2) )
            # compute error
            total_error += self.error(Y[:, start:stop], inputs, self.loss_function) + regularization
            
            # Compute gradients using backpropagation
            d_theta, d_theta2 = self.backprop(X[:, start:stop], Y[:, start:stop], inputs, theta, batch_size, L) 
            
            # update weights and biases using Adam                                                                    
            theta, prev_weights, prev_biases, prev_weights2, prev_biases2 = self.update_params_adam(theta, d_theta, d_theta2, prev_weights, prev_biases, prev_weights2, prev_biases2, learning_rate, beta1, beta2, eps, i+1)

        # handle the last mini-batch if it is not a multiple of batch_size
        if m % batch_size != 0:
            start = m - m % batch_size
            stop = m
            inputs = self.feedforward(X[:, start:stop], theta, L) # compute output of the network
            
            # Compute gradients using backpropagation
            d_theta, d_theta2= self.backprop(X[:, start:stop], Y[:, start:stop], inputs, theta, batch_size, L) # compute gradients
        
            theta, prev_weights, prev_biases, prev_weights2, prev_biases2 = self.update_params_adam(theta, d_theta, d_theta2, prev_weights, prev_biases, prev_weights2, prev_biases2, learning_rate, beta1, beta2, eps, i+1)
            W, B = theta
            regularization = (self.lambd / 2) * (self.frobenius(W**2) + self.frobenius(B**2) )
            total_error += self.error(Y[:, start:stop], inputs, self.loss_function) + regularization 

        
        total_error /= m # average total error across all mini-batches
        return (theta, total_error) # return updated weights and biases and the total error
  

    def update_params_nadam(self,theta,d_theta,d_theta2,prev_weights,prev_bias,prev_weights2,prev_biases2,learning_rate,beta1,beta2,eps,t):
        weights, biases = theta
        d_weights,d_biases = d_theta
        d_weights2,d_biases2 = d_theta2
        
        # update the exponentially weighted averages of the gradients
        prev_weights = beta1*prev_weights + (1-beta1)*d_weights
        prev_bias = beta1*prev_bias + (1-beta1)*d_biases

         # update the exponentially weighted averages of the squared gradients
        prev_weights2=beta2*prev_weights2 + (1-beta2)*(d_weights2)
        prev_biases2=beta2*prev_biases2 + (1-beta2)*(d_biases2)
    
        beta_t = 1-(beta1**t)
        beta2_t = 1-(beta2**t)
        # bias correction to the weighted averages of the gradients
        corr_m_w = beta1*prev_weights/beta_t + ((1-beta1)/beta_t)*d_weights
        corr_m_b = beta1*prev_bias/beta_t + ((1-beta1)/beta_t)*d_biases

        # bias correction to the weighted averages of the squared gradients
        corr_v_w = prev_weights2/beta2_t
        corr_v_b = prev_biases2/beta2_t

        # calculate the update parameters using the bias-corrected averages of the gradients and squared gradients
        corr_v_w = learning_rate/((corr_v_w)**0.5 + eps)
        corr_v_b = learning_rate/((corr_v_b)**0.5 + eps)

        # update the weights and biases using the update parameters and L2 regularization
        weights = (1 - self.weight_decay)*weights - corr_v_w*corr_m_w
        biases = (1 - self.weight_decay)*biases - corr_v_b*corr_m_b

        theta = (np.array(weights),np.array(biases))
        return (theta,prev_weights,prev_bias,prev_weights2,prev_biases2)

    def nadam(self,X,Y,theta,learning_rate,beta1,beta2,eps,batch_size,L):
        m = X.shape[1] # number of training examples
        prev_weights, prev_weights2 = 0,0 # initialize previous weights to zero
        prev_biases, prev_biases2 = 0,0 # initialize previous biases to zero
        total_error = 0 # initialize total error to zero

        weights, biases = theta
        
        # loop over mini-batches
        for i in range(0, m, batch_size):
            start = i
            stop = i + batch_size
            # compute output of the network
            inputs = self.feedforward(X[:, start:stop], theta, L) 
            W, B = theta
            # compute L2 regularization term
            regularization = (self.lambd / 2) * (self.frobenius(W**2) + self.frobenius(B**2) )
            # compute error
            total_error += self.error(Y[:, start:stop], inputs, self.loss_function) + regularization
            
            # Compute gradients using backpropagation
            d_theta, d_theta2 = self.backprop(X[:, start:stop], Y[:, start:stop], inputs, theta, batch_size, L) 
            
            # update weights and biases using Adam                                                                    
            theta, prev_weights, prev_biases, prev_weights2, prev_biases2 = self.update_params_nadam(theta, d_theta, d_theta2, prev_weights, prev_biases, prev_weights2, prev_biases2, learning_rate, beta1, beta2, eps, i+1)

        # handle the last mini-batch if it is not a multiple of batch_size
        if m % batch_size != 0:
            start = m - m % batch_size
            stop = m
            inputs = self.feedforward(X[:, start:stop], theta, L) # compute output of the network
            
            # Compute gradients using backpropagation
            d_theta, d_theta2= self.backprop(X[:, start:stop], Y[:, start:stop], inputs, theta, batch_size, L) # compute gradients
        
            theta, prev_weights, prev_biases, prev_weights2, prev_biases2 = self.update_params_nadam(theta, d_theta, d_theta2, prev_weights, prev_biases, prev_weights2, prev_biases2, learning_rate, beta1, beta2, eps, i+1)
            W, B = theta
            regularization = (self.lambd / 2) * (self.frobenius(W**2) + self.frobenius(B**2) )
            total_error += self.error(Y[:, start:stop], inputs, self.loss_function) + regularization 

        
        total_error /= m # average total error across all mini-batches
        return (theta, total_error) # return updated weights and biases and the total error
  
    # Function to perform optimization based on the specified optimizer
    def optimizations(self, theta, L):
        # If optimizer is stochastic gradient descent
        if self.optimizer == 'sgd':
            # Perform mini-batch gradient descent on the training data
            return self.sgd(X_train, Y_train, theta, self.learning_rate, 1, L)
        elif self.optimizer == 'momentum':
            return self.gd_momentum(X_train,Y_train,theta,self.learning_rate,self.batch_size,self.gamma,L)
        elif self.optimizer == 'nesterov':
            return self.gd_nesterov(X_train,Y_train,theta,self.learning_rate,self.batch_size,self.gamma,L)
        elif self.optimizer == 'rmsprop':
            return self.rmsprop(X_train,Y_train,theta,self.learning_rate,self.beta,self.eps,self.batch_size,L)
        elif self.optimizer == 'adam':
            return self.adam(X_train,Y_train,theta,self.learning_rate,self.beta1,self.beta2,self.eps,self.batch_size,L)
        elif self.optimizer == 'nadam':
            return self.nadam(X_train,Y_train,theta,self.learning_rate,self.beta1,self.beta2,self.eps,self.batch_size,L)
      


    def fit(self):
        # perform optimization on the model's parameters (theta) and get train loss
        self.theta, train_loss = self.optimizations(self.theta, self.L)

        # make predictions on the training set
        outputs_train = self.feedforward(X_train, self.theta, self.L)
        Y_pred_train = np.argmax(outputs_train[1][-1], axis=0)
        Y_true_train = np.argmax(Y_train, axis=0)

        # calculate training accuracy
        train_acc = accuracy_score(Y_true_train, Y_pred_train)

        # make predictions on the validation set
        outputs_val = self.feedforward(X_val, self.theta, self.L)

        # calculate validation loss
        val_loss = self.val_error(Y_val, outputs_val, self.loss_function)
        Y_pred_val = np.argmax(outputs_val[1][-1], axis=0)
        Y_true_val = np.argmax(Y_val, axis=0)

        # calculate validation accuracy
        val_acc = accuracy_score(Y_true_val, Y_pred_val)

        # return training and validation accuracies and losses
        return train_acc, train_loss, val_acc, val_loss
    
    # def fit_test(self):
    #     # perform optimization on the model's parameters (theta) and get train loss
    #     self.theta, train_loss = self.optimizations(self.theta, self.L)

    #     # make predictions on the training set
    #     outputs_train = self.feedforward(X_train, self.theta, self.L)
    #     Y_pred_train = np.argmax(outputs_train[1][-1], axis=0)
    #     Y_true_train = np.argmax(Y_train, axis=0)

    #     # calculate training accuracy
    #     train_acc = accuracy_score(Y_true_train, Y_pred_train)

    #     # make predictions on the test set
    #     outputs_val = self.feedforward(X_test, self.theta, self.L)

    #     # calculate test loss
    #     test_loss = self.val_error(Y_test, outputs_val, self.loss_function)
    #     Y_pred_val = np.argmax(outputs_val[1][-1], axis=0)
    #     Y_true_val = np.argmax(Y_test, axis=0)

    #     # calculate test accuracy
    #     test_acc = accuracy_score(Y_true_val, Y_pred_val)

    #     # return training and test accuracies and losses
    #     return train_acc, train_loss, test_acc, test_loss

    def predict(self, X_test):
        # get the number of hidden layers
        L = len(self.hidden_layers) + 1

        # make predictions on the test set
        outputs = self.feedforward(X_test, self.theta, L)
        Y_pred = np.argmax(outputs[1][-1], axis=0)

        # return predicted labels
        return Y_pred

In [7]:
tuned_models  = []

In [None]:
def best_model(tuned_models):
  sorted_list = sorted(tuned_models, key = lambda d : d['validation_accuracy'],reverse=True)
  best_model = sorted_list[0]["model"]
  return (sorted_list, best_model)


def train():
    # with wandb.init() as run:
    model = FeedForwardNN(config=None)
    # run.name = model.run_name
    train_acc,train_loss,val_acc,val_loss = 0,0,0,0
    for epoch in range(20):
        train_acc,train_loss,val_acc,val_loss = model.fit()  # model training code here
        print({
        "epochs": epoch,
        "accuracy":train_acc,
        "loss":train_loss,
        "validation_accuracy": val_acc,
        "validation_loss": val_loss,
        })
        # wandb.log(metrics) 
    # print(run.name)
    tuned_models.append({
        "accuracy":train_acc,
        "loss":train_loss,
        "validation_accuracy": val_acc,
        "validation_loss": val_loss,
        "model":model
    })  



train()  

{'epochs': 0, 'accuracy': 0.8535185185185186, 'loss': 0.6628250572097459586, 'validation_accuracy': 0.8446666666666667, 'validation_loss': 0.43420532847324388792}
{'epochs': 1, 'accuracy': 0.8757962962962963, 'loss': 0.40235168603769215316, 'validation_accuracy': 0.8636666666666667, 'validation_loss': 0.37643314530730460817}
{'epochs': 2, 'accuracy': 0.8859444444444444, 'loss': 0.35092108303638460857, 'validation_accuracy': 0.8665, 'validation_loss': 0.36216592755619782343}
{'epochs': 3, 'accuracy': 0.8935, 'loss': 0.32180806416134436463, 'validation_accuracy': 0.8705, 'validation_loss': 0.35697027751231945263}
{'epochs': 4, 'accuracy': 0.900425925925926, 'loss': 0.29761846765460826697, 'validation_accuracy': 0.8746666666666667, 'validation_loss': 0.35043470261584015396}


In [None]:
(models_sorted, bestmodel) = best_model(tuned_models)

NameError: ignored

In [None]:
Y_true_test = np.argmax(Y_test,0)
Y_true_pred = bestmodel.predict(X_test)

In [None]:
confusion_matrix_test = confusion_matrix(Y_true_test, Y_true_pred, normalize='true')
fig = plt.figure(figsize=(12, 8))
ax = sns.heatmap(bern_cnf_matrix_test, annot=bern_cnf_matrix_test,xticklabels=class_names, yticklabels=class_names)
ax.set_title("Confusion Matrix (Test set)", size=16)
ax.set_xlabel("Predicted Class", size=14)
ax.set_ylabel("True Class", size=14)
plt.savefig("testmatrix_best")
img2 = plt.imread("testmatrix_best.png")
#wandb.init(project="CS6910 ASSIGNMENT 1", entity="dlstack", name="CONFUSION_MATRIX")
#wandb.log({"Confusion Matrix - Test set 3": wandb.sklearn.plot_confusion_matrix(Y_true_test, Y_true_pred, class_names)})
wandb.init(project="Testing", entity="dl_research", name="Confusion Matrix")
wandb.log({"Confusion Matrix Best": wandb.Image(img2)})
wandb.finish()
# wandb.sklearn.plot_confusion_matrix(Y_true_test, Y_true_pred, class_names)