## Installation

In [None]:
pip install wandb numpy pandas matplotlib

## Q1: fashion-MNIST dataset

In [4]:
# %%writefile Q1_fashion_mnist_class_images.py
import wandb
from keras.datasets import fashion_mnist

# Load Fashion-MNIST through keras; the result is unpacked into a train pair and a test pair.
# Only x_test / y_test are read by logClassImages below.
(x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()

# Label id -> human readable class name; the position in the list is the label id.
classes = dict(enumerate([
    "T-shirt/top",
    "Trouser",
    "Pullover",
    "Dress",
    "Coat",
    "Sandal",
    "Shirt",
    "Sneaker",
    "Bag",
    "Ankle boot",
]))

def logClassImages(project_name:str):
  """
    Log one sample image per class to Weights & Biases.

    Starts a run in `project_name`, picks the first image of every label 0..9 from the
    module-level `x_test` / `y_test`, logs them (captioned via `classes`) under the key
    "Sample images for each class" and finishes the run.
  """
  wandb.init(project=project_name)

  # First test image found for each label, in label order.
  first_images = []
  for label in range(10):
    for position, target in enumerate(y_test):
      if target == label:
        first_images.append(x_test[position])
        break

  wandb_images = []
  for label in range(10):
    wandb_images.append(wandb.Image(first_images[label], caption = classes[label]))

  wandb.log({"Sample images for each class": wandb_images})
  wandb.finish()

# Log the per-class sample images to the "da6401_assignment1" W&B project.
logClassImages("da6401_assignment1")

Writing Q1_fashion_mnist_class_images.py


## Feedforward neural network



### Libraries required

In [34]:
# %%writefile libraries.py
import wandb
from keras.datasets import fashion_mnist, mnist
import numpy
import copy
from sklearn.model_selection import train_test_split
import math
import matplotlib.pyplot
from sklearn.metrics import confusion_matrix
import seaborn
import argparse

Overwriting libraries.py


### Activation functions

In [82]:
# %%writefile activation_functions.py
import numpy as np
"""
  ACTIVATION FUNCTIONS
"""
def identity(x):
    """Identity activation: hand the input back unchanged (same object, no copy)."""
    return x

def sigmoid(x):
    """
    Logistic sigmoid 1 / (1 + exp(-x)), evaluated element-wise.

    The naive formula overflows in exp(-x) for large negative x. Here only
    exp(-|x|) is computed, which always lies in (0, 1], and the two algebraically
    equivalent forms are selected by sign:
        x >= 0 : 1 / (1 + exp(-x))
        x <  0 : exp(x) / (1 + exp(x))

    x may be a scalar or an array; the result has the same shape as x.
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))      # never overflows
    return np.where(x >= 0, 1.0, decay) / (1.0 + decay)

def tanh(x):
    """Hyperbolic tangent activation, applied element-wise."""
    squashed = np.tanh(x)
    return squashed

def relu(x):
    """Rectified linear unit: element-wise max(0, x)."""
    rectified = np.maximum(0, x)
    return rectified

def softmax(x):
    """
    Softmax along the last axis.

    The maximum along that axis is subtracted before exponentiating so that
    the largest exponent is 0 (numerical stability); the result is unchanged
    mathematically.
    """
    peak = np.max(x, axis=-1, keepdims=True)
    exponentials = np.exp(x - peak)
    normaliser = np.sum(exponentials, axis=-1, keepdims=True)
    return exponentials / normaliser

Overwriting activation_functions.py


### Loss Functions

In [54]:
# %%writefile loss_functions.py
import numpy as np
"""
  LOSS FUNCTIONS
"""
def mean_squared_error(y_true, y_pred):
    """Mean of the squared differences, taken over every element (returns one scalar)."""
    residual = y_true - y_pred
    return np.mean(residual ** 2)

def cross_entropy_loss(y_true, y_pred):
    """
    Cross-entropy summed over the last axis (one value per sample for 2-D input).

    1e-9 is added to y_pred before the logarithm so that log(0) never occurs.
    """
    log_probabilities = np.log(y_pred + 1e-9)
    return -np.sum(y_true * log_probabilities, axis=-1)

Overwriting loss_functions.py


### Derivatives

In [86]:
# %%writefile derivatives.py
import numpy as np
# from activation_functions import *
# from loss_functions import *
"""
  DERIVATIVES OF ACTIVATION AND LOSS FUNCTIONS
"""
def identity_derivative(x):
    """Derivative of the identity activation: an array of ones shaped (and typed) like x."""
    ones = np.ones_like(x)
    return ones

def sigmoid_derivative(x):
    """Derivative of the sigmoid at pre-activation x: s * (1 - s) with s = sigmoid(x)."""
    activated = sigmoid(x)
    complement = 1 - activated
    return activated * complement

def tanh_derivative(x):
    """Derivative of tanh at pre-activation x: 1 - tanh(x)^2."""
    squashed = np.tanh(x)
    return 1 - squashed ** 2

def relu_derivative(x):
    """Derivative of ReLU: 1 where x is strictly positive, 0 elsewhere (including x == 0)."""
    is_positive = x > 0
    return np.where(is_positive, 1, 0)

def mean_squared_error_derivative(y_true, y_pred):
    """Gradient of the squared-error loss w.r.t. y_pred, returned as (y_pred - y_true) without any scaling factor."""
    error = y_pred - y_true
    return error

def cross_entropy_loss_derivative(y_true, y_pred):
    """Gradient of the cross-entropy w.r.t. y_pred: -y_true / (y_pred + 1e-9); the 1e-9 guards against division by zero."""
    stabilised = y_pred + 1e-9
    return -y_true / stabilised

def softmax_derivative(inp:np.array):
    """
    Jacobian of softmax, given the softmax OUTPUT `inp`.

    For a probability vector s the Jacobian is diag(s) - s s^T.
      * 1-D input (n,)   -> one (n, n) Jacobian
      * 2-D input (B, n) -> (B, n, n), one Jacobian per row
      * any other rank   -> an empty array
    """
    def jacobian(probabilities):
        column = probabilities.reshape(-1, 1)
        return np.diag(probabilities) - np.dot(column, column.T)

    rank = len(inp.shape)
    if rank == 1:
        return np.array(jacobian(inp))
    if rank == 2:
        return np.array([jacobian(inp[row]) for row in range(inp.shape[0])])
    return np.array([])

Overwriting derivatives.py


### Optimizers

In [56]:
# %%writefile optimizers.py
import numpy as np
"""
  OPTIMIZERS UPDATE RULES
"""

# STOCHASTIC GRADIENT DESCENT
def sgd(optimizer_input_dict, wts_bias_history_dict, itr=None):
  """
    Plain gradient-descent step: param <- param - learning_rate * gradient.

    Reads "learning_rate" and "n_hiddenLayers" from optimizer_input_dict and the
    per-layer lists "weights", "biases", "dw", "db" from wts_bias_history_dict.
    The list entries are replaced by new arrays; `itr` is unused.
  """
  step_size = optimizer_input_dict["learning_rate"]
  state = wts_bias_history_dict

  # Layers are updated one at a time because their parameter shapes differ.
  for layer in range(optimizer_input_dict["n_hiddenLayers"]+1):
    for param_key, grad_key in (("weights", "dw"), ("biases", "db")):
      state[param_key][layer] = state[param_key][layer] - step_size * (state[grad_key][layer])

# MOMENTUM BASED GRADIENT DESCENT
def momentumGradientDescent(optimizer_input_dict, wts_bias_history_dict, itr=None):
  """
    Momentum update, applied per layer to weights and biases:
        velocity <- momentum * velocity + gradient
        param    <- param - learning_rate * velocity

    Velocities are kept in "history_weights" / "history_biases"; `itr` is unused.
  """
  gamma = optimizer_input_dict["momentum"]
  eta = optimizer_input_dict["learning_rate"]
  state = wts_bias_history_dict
  groups = (("weights", "history_weights", "dw"), ("biases", "history_biases", "db"))

  for layer in range(optimizer_input_dict["n_hiddenLayers"]+1):
    for param_key, velocity_key, grad_key in groups:
      velocity = (gamma * state[velocity_key][layer]) + state[grad_key][layer]
      state[velocity_key][layer] = velocity
      state[param_key][layer] = state[param_key][layer] - eta * velocity

# NAG(NESTEROV ACCELERATED GRADIENT DESCENT)
def nag(optimizer_input_dict, wts_bias_history_dict, itr=None):
  """
    Nesterov accelerated gradient step.

    The arithmetic is the momentum rule (velocity <- momentum * velocity + gradient,
    param <- param - learning_rate * velocity); what makes it Nesterov is that the
    caller is expected to supply "dw" / "db" already evaluated at the look-ahead
    point. `itr` is unused.
  """
  layers = range(optimizer_input_dict["n_hiddenLayers"]+1)
  momentum = optimizer_input_dict["momentum"]
  learning_rate = optimizer_input_dict["learning_rate"]

  weights = wts_bias_history_dict["weights"]
  biases = wts_bias_history_dict["biases"]
  weight_velocity = wts_bias_history_dict["history_weights"]
  bias_velocity = wts_bias_history_dict["history_biases"]

  for layer in layers:
    weight_velocity[layer] = (momentum * weight_velocity[layer]) + wts_bias_history_dict["dw"][layer]
    weights[layer] = weights[layer] - learning_rate * (weight_velocity[layer])

    bias_velocity[layer] = (momentum * bias_velocity[layer]) + wts_bias_history_dict["db"][layer]
    biases[layer] = biases[layer] - learning_rate * bias_velocity[layer]

# RMSPROP
def rmsProp(optimizer_input_dict, wts_bias_history_dict, itr=None):
  """
    RMSProp update, applied per layer to weights and biases:
        avg   <- beta * avg + (1 - beta) * gradient^2
        param <- param - (learning_rate / sqrt(avg + epsilon)) * gradient

    The running averages live in "history_weights" / "history_biases".
    Note that epsilon is added inside the square root. `itr` is unused.
  """
  decay = optimizer_input_dict["beta"]
  eta = optimizer_input_dict["learning_rate"]
  eps = optimizer_input_dict["epsilon"]
  state = wts_bias_history_dict
  groups = (("weights", "history_weights", "dw"), ("biases", "history_biases", "db"))

  for layer in range(optimizer_input_dict["n_hiddenLayers"]+1):
    for param_key, average_key, grad_key in groups:
      gradient = state[grad_key][layer]
      running_average = (decay * state[average_key][layer]) + (1 - decay) * (gradient ** 2)
      state[average_key][layer] = running_average
      state[param_key][layer] = state[param_key][layer] - ((eta/np.sqrt(running_average + eps)) * (gradient))

# ADAM
def adam(optimizer_input_dict, wts_bias_history_dict, itr=None):
  """
    Adam update, applied per layer to weights and biases:
        m <- beta1 * m + (1 - beta1) * gradient
        v <- beta2 * v + (1 - beta2) * gradient^2
        m_hat = m / (1 - beta1^itr),  v_hat = v / (1 - beta2^itr)
        param <- param - (learning_rate / (sqrt(v_hat) + epsilon)) * m_hat

    m is stored in "history_*", v in "second_history_*".
    `itr` is the 1-based update counter used for bias correction and must be given.
  """
  cfg = optimizer_input_dict
  state = wts_bias_history_dict
  groups = (("weights", "dw", "history_weights", "second_history_weights"),
            ("biases", "db", "history_biases", "second_history_biases"))

  for layer in range(cfg["n_hiddenLayers"]+1):
    for param_key, grad_key, first_key, second_key in groups:
      gradient = state[grad_key][layer]

      # exponential moving averages of the gradient and of its square
      state[first_key][layer] = (cfg["beta1"] * state[first_key][layer]) + (1 - cfg["beta1"]) * (gradient)
      state[second_key][layer] = (cfg["beta2"] * state[second_key][layer]) + (1 - cfg["beta2"]) * (gradient ** 2)

      # bias correction for the zero-initialised averages
      first_hat = state[first_key][layer] / (1 - (cfg["beta1"] ** (itr)))
      second_hat = state[second_key][layer] / (1 - (cfg["beta2"] ** (itr)))

      state[param_key][layer] = state[param_key][layer] - ((cfg["learning_rate"]/(np.sqrt(second_hat) + cfg["epsilon"])) * (first_hat))

# NADAM
def nadam(optimizer_input_dict, wts_bias_history_dict, itr=None):
  """
    Nadam (Adam with a Nesterov look-ahead), applied per layer to weights and biases:
        m <- beta1 * m + (1 - beta1) * gradient
        v <- beta2 * v + (1 - beta2) * gradient^2
        m_hat = m / (1 - beta1^itr),  v_hat = v / (1 - beta2^itr)
        lookahead = beta1 * m_hat + ((1 - beta1) / (1 - beta1^itr)) * gradient
        param <- param - (learning_rate / (sqrt(v_hat) + epsilon)) * lookahead

    m is stored in "history_*", v in "second_history_*".
    `itr` is the 1-based update counter used for bias correction and must be given.
  """
  cfg = optimizer_input_dict
  state = wts_bias_history_dict
  groups = (("weights", "dw", "history_weights", "second_history_weights"),
            ("biases", "db", "history_biases", "second_history_biases"))

  for layer in range(cfg["n_hiddenLayers"]+1):
    for param_key, grad_key, first_key, second_key in groups:
      gradient = state[grad_key][layer]

      # exponential moving averages of the gradient and of its square
      state[first_key][layer] = (cfg["beta1"] * state[first_key][layer]) + (1 - cfg["beta1"]) * (gradient)
      state[second_key][layer] = (cfg["beta2"] * state[second_key][layer]) + (1 - cfg["beta2"]) * (gradient ** 2)

      # bias correction for the zero-initialised averages
      first_hat = state[first_key][layer] / (1 - (cfg["beta1"] ** itr))
      second_hat = state[second_key][layer] / (1 - (cfg["beta2"] ** itr))

      # Nesterov term: blend the corrected momentum with the bias-corrected current gradient
      lookahead = cfg["beta1"] * first_hat + (((1-cfg["beta1"])/(1-(cfg["beta1"] ** itr))) * gradient)
      state[param_key][layer] = state[param_key][layer] - ((cfg["learning_rate"]/(np.sqrt(second_hat) + cfg["epsilon"])) * (lookahead))


Overwriting optimizers.py


### Network (forward and back propagation)

In [84]:
# %%writefile neural_network.py
# import libraries
# from activation_functions import *
# from loss_functions import *
# from optimizers import *
# from derivatives import *
import numpy as np
import copy
import math

class FeedForwardNeuralNetwork:
  """
    Fully connected feed-forward classifier trained with mini-batch gradient descent.

    All hidden layers have `n_neuronsPerLayer` units and share one activation function;
    the output layer always applies softmax. Parameters are stored per layer in
    `self.weights` / `self.biases` and are updated by the optimizer selected at
    construction time. Those optimizer functions operate on `self.wts_bias_history_dict`,
    whose "weights" / "biases" entries are the very same list objects.
  """
  # class variables: name -> callable lookup tables
  optimizersMap = {"sgd": sgd, "momentum": momentumGradientDescent, "nag": nag, "rmsprop": rmsProp, "adam": adam, "nadam": nadam}
  lossFunctionsMap = {"mean_squared_error": mean_squared_error, "cross_entropy" : cross_entropy_loss}
  activationFunctionsMap = {"identity":identity, "sigmoid":sigmoid, "tanh":tanh, "ReLU":relu, "softmax": softmax}
  # keyed by the __name__ of the activation / loss function (this is how backwardPropagation looks them up)
  derivatesFuncMap = {"mean_squared_error": mean_squared_error_derivative, "cross_entropy_loss": cross_entropy_loss_derivative, "identity": identity_derivative,
                      "sigmoid": sigmoid_derivative, "tanh": tanh_derivative, "relu": relu_derivative, "softmax": softmax_derivative}

  def __init__(self,
               input_size=784, output_size=10,
               n_hiddenLayers=3, n_neuronsPerLayer=32,
               activationFun="sigmoid",
               weight_init="random",
               batch_size=64,
               lossFunc="cross_entropy",
               optimizer="adam",
               learning_rate=0.001,
               momentum=0.5,
               beta=0.9, beta1=0.9, beta2=0.99,
               epsilon=1e-8, weight_decay=0.01,
               epochs=10):
    """
      Build the network and initialise its parameters and optimizer state.

      activationFun, lossFunc and optimizer are keys of activationFunctionsMap,
      lossFunctionsMap and optimizersMap (a KeyError is raised for unknown names).
      weight_init is "random" (standard normal) or "Xavier" (Xavier normal);
      anything else raises ValueError.
    """
    # Initialization parameters
    self.input_size = input_size  # no of features
    self.output_size = output_size
    self.n_hiddenLayers = n_hiddenLayers
    self.n_neuronsPerLayer = n_neuronsPerLayer
    self.weight_init = weight_init
    self.epochs = epochs

    self.activationFun = FeedForwardNeuralNetwork.activationFunctionsMap[activationFun]
    self.lossFunc = FeedForwardNeuralNetwork.lossFunctionsMap[lossFunc]
    self.optimizer = FeedForwardNeuralNetwork.optimizersMap[optimizer]

    # parameters required for optimizers
    self.batch_size = batch_size
    # only NAG evaluates the gradients at the look-ahead point (see forwardPropagation)
    self.isLookAhead = (optimizer == "nag")

    self.optimizer_input_dict = { "learning_rate" : learning_rate,
                                  "momentum" : momentum,                  # used by momentumGD & nag
                                  "beta" : beta,                          # used by rmsprop
                                  "beta1" : beta1,                        # used by adam & nadam
                                  "beta2" : beta2,                        # used by adam & nadam
                                  "epsilon" : epsilon,
                                  "weight_decay" : weight_decay,
                                  "n_hiddenLayers": n_hiddenLayers}

    # weights and biases matrices
    self.weights = []
    self.biases = []
    self.lookAheadWeights = []
    self.lookAheadBiases = []

    # The history entries are placeholders: initializeWeightsAndBiases replaces them with
    # correctly shaped zero arrays, and "dw" / "db" are overwritten on every updateWeights call.
    n_layers = self.n_hiddenLayers + 1
    self.wts_bias_history_dict = {"weights": self.weights, "biases": self.biases,
                                  "history_weights": [np.zeros(1) for _ in range(n_layers)],
                                  "history_biases": [np.zeros(1) for _ in range(n_layers)],
                                  "second_history_weights": [np.zeros(1) for _ in range(n_layers)],
                                  "second_history_biases": [np.zeros(1) for _ in range(n_layers)],
                                  "dw": [np.empty(1) for _ in range(n_layers)],
                                  "db": [np.empty(1) for _ in range(n_layers)]}

    self.initializeWeightsAndBiases()
    self.wts_bias_history_dict["second_history_weights"] = copy.deepcopy(self.wts_bias_history_dict["history_weights"])
    self.wts_bias_history_dict["second_history_biases"] = copy.deepcopy(self.wts_bias_history_dict["history_biases"])

    # pre-activation(a) and post-activation(h) values
    self.a = []
    self.h = []

  def initializeWeightsAndBiases(self):
    """
      Weights, biases initialization based on the weight_init parameter.

      weights[0]: input layer to first hidden layer  : input_size x n_neuronsPerLayer
      weights[1]: first hidden layer to second hidden layer : n_neuronsPerLayer x n_neuronsPerLayer
      ...
      weights[n_hiddenLayers]: last hidden layer to output layer : n_neuronsPerLayer x output_size

      biases[i] : bias for ith layer : n_neuronsPerLayer   (i: 0 to n_hiddenLayers-1)
      biases[n_hiddenLayers]: output_size

      Biases start at zero. The optimizer history arrays ("history_weights",
      "history_biases") are reset to zeros of matching shape.

      Raises ValueError for an unsupported weight_init or for n_hiddenLayers < 1.
    """
    # Fail early instead of leaving the network without weights (which would only
    # surface later as an IndexError inside forwardPropagation).
    if self.weight_init not in ("random", "Xavier"):
      raise ValueError(f"Unsupported weight_init {self.weight_init!r}; expected 'random' or 'Xavier'")
    if self.n_hiddenLayers < 1:
      raise ValueError("n_hiddenLayers must be at least 1")

    history = self.wts_bias_history_dict

    # biases (same for both initialisation types)
    for i in range(self.n_hiddenLayers):
      self.biases.append(np.zeros(self.n_neuronsPerLayer))
      history["history_biases"][i] = np.zeros(self.n_neuronsPerLayer)

    self.biases.append(np.zeros(self.output_size))   # biases[n_hiddenLayers]
    history["history_biases"][self.n_hiddenLayers] = np.zeros(self.output_size)

    # (fan_in, fan_out) of every weight matrix, from the input side to the output side
    layer_shapes = ([(self.input_size, self.n_neuronsPerLayer)]
                    + [(self.n_neuronsPerLayer, self.n_neuronsPerLayer)] * (self.n_hiddenLayers - 1)
                    + [(self.n_neuronsPerLayer, self.output_size)])

    for i, (fan_in, fan_out) in enumerate(layer_shapes):
      if(self.weight_init == "random"):   # Random Normal
        layer_weights = np.random.randn(fan_in, fan_out)
      else:                               # Xavier Normal: mean = 0, variance = 2/(n_input + n_output)
        layer_weights = np.random.normal(loc=0.0, scale=np.sqrt(2/(fan_in + fan_out)), size=(fan_in, fan_out))

      self.weights.append(layer_weights)
      history["history_weights"][i] = np.zeros((fan_in, fan_out))

  def forwardPropagation(self, x_batch, isValidation=False):
    """
      Forward propagation through the neural network for a whole batch at once.

      x_batch: B x n numpy array where B - batch size, n - no of features = input_size

      When the optimizer is NAG and isValidation is False, the pass runs with the
      look-ahead parameters (param - momentum * velocity); they are remembered in
      self.lookAheadWeights / self.lookAheadBiases for the following backward pass.

      Returns (a_pre_activation, h_post_activation): two lists with one entry per
      layer. Entry 0 of both is x_batch itself; the last entry of h_post_activation
      is the softmax output.
    """
    # considering a0,h0 as X values as a1: first layer  (it is calculated from x values)
    a_pre_activation = [x_batch]
    h_post_activation = [x_batch]

    if(self.isLookAhead and not isValidation):
      momentum = self.optimizer_input_dict["momentum"]
      wt = []
      b = []
      for i in range(self.n_hiddenLayers+1):
        wt.append(self.weights[i] - (momentum * self.wts_bias_history_dict["history_weights"][i]))
        b.append(self.biases[i] - (momentum * self.wts_bias_history_dict["history_biases"][i]))

      self.lookAheadWeights = wt
      self.lookAheadBiases = b
    else:
      # The parameters are only read below, so no defensive copy is needed.
      wt = self.weights
      b = self.biases

    # Except last layer since activation function could be different
    for i in range(self.n_hiddenLayers):
      # ai: B x n_neuronsPerLayer, b[i]: n_neuronsPerLayer (it is broadcast while adding)
      ai = np.matmul(h_post_activation[-1], wt[i]) + b[i]
      hi = self.activationFun(ai)

      a_pre_activation.append(ai)
      h_post_activation.append(hi)

    # aL: last layer (activation function is softmax)
    aL = np.matmul(h_post_activation[-1], wt[self.n_hiddenLayers]) + b[self.n_hiddenLayers]
    hL = softmax(aL)   # y_batch

    a_pre_activation.append(aL)
    h_post_activation.append(hL)

    return a_pre_activation, h_post_activation

  def backwardPropagation(self, a_pre_activation, h_post_activation, y_batch, y_pred_batch):
    """
      Backward propagation through the neural network for a whole batch.

      a_pre_activation / h_post_activation are the lists returned by
      forwardPropagation for the same batch, y_batch the one-hot targets (B x output_size)
      and y_pred_batch the softmax outputs.

      Returns (grad_w, grad_b): per-layer gradients ordered from the first layer to
      the output layer, each summed over the samples of the batch. The weight-decay
      term is already added to grad_w.
    """
    # With NAG the gradients are taken at the look-ahead weights used by the forward pass.
    if(self.isLookAhead):
      wt = self.lookAheadWeights
    else:
      wt = self.weights   # read-only here, no copy needed

    # Gradient w.r.t. the output pre-activation aL
    loss_name = self.lossFunc.__name__
    if(loss_name == "cross_entropy_loss"):
      # softmax + cross-entropy simplifies to (y_pred - y); avoids building the Jacobian
      grad_a_current = y_pred_batch - y_batch
    else:
      grad_hL = self.derivatesFuncMap[loss_name](y_batch, y_pred_batch)
      # the softmax Jacobian of each sample is output_size x output_size, so the
      # vector-Jacobian product is formed sample by sample
      # (use y_batch.shape[0], not self.batch_size: the last batch can be smaller)
      grad_aL = np.array([grad_hL[i] @ softmax_derivative(y_pred_batch[i]) for i in range(y_batch.shape[0])])
      # NOTE(review): only this branch averages over the batch; the cross-entropy
      # shortcut above does not — confirm this asymmetry is intended.
      grad_a_current = grad_aL / y_batch.shape[0]

    n_layers = self.n_hiddenLayers + 1
    grad_w = [None] * n_layers
    grad_b = [None] * n_layers
    activation_derivative = self.derivatesFuncMap[self.activationFun.__name__]

    # Walk from the output layer down to the first layer
    for k in range(self.n_hiddenLayers, -1, -1):
      # Sum over the batch of outer(h_k[n], grad_a[n]) is exactly h_k^T @ grad_a
      grad_w[k] = h_post_activation[k].T @ grad_a_current
      grad_b[k] = grad_a_current.sum(axis=0)

      if(k > 0):
        # gradients w.r.t layer below (post-activation, then pre-activation)
        grad_h_below = grad_a_current @ wt[k].T
        grad_a_current = grad_h_below * activation_derivative(a_pre_activation[k])

    # L2 weight decay.
    # NOTE(review): the loop stops before the output-layer weights (index n_hiddenLayers),
    # so those are never decayed — confirm this is intended.
    decay = self.optimizer_input_dict["weight_decay"]
    for i in range(self.n_hiddenLayers):
      grad_w[i] = grad_w[i] + (decay * wt[i])

    return grad_w, grad_b

  def updateWeights(self, grad_w, grad_b, itr):
    """
      Clip every gradient entry to [-10, 10] and apply one optimizer step.

      itr is the 1-based update counter; adam and nadam use it for bias correction.
    """
    grad_w = [np.clip(dw, -10,10) for dw in grad_w]
    # the clipped bias gradients must be the ones handed to the optimizer
    grad_b = [np.clip(db, -10,10) for db in grad_b]
    self.wts_bias_history_dict["dw"] = grad_w
    self.wts_bias_history_dict["db"] = grad_b
    self.optimizer(self.optimizer_input_dict, self.wts_bias_history_dict, itr)

Overwriting neural_network.py


## Loading data

In [58]:
# %%writefile dataset_load.py
import numpy as np
from keras.datasets import fashion_mnist, mnist
# import numpy

# Dataset name (as accepted by load_data) -> keras dataset module whose load_data() is called.
datasets = {"fashion_mnist": fashion_mnist, "mnist": mnist}

def load_data(dataset_name):
  """
    Load a dataset registered in `datasets` and prepare it for the network.

    Images are flattened to one row per sample and divided by 255; labels are
    one-hot encoded. Everything is returned as float64.

    Returns: x_train, y_train, x_test, y_test, num_classes
    (num_classes = number of distinct labels in the training set).
  """
  (train_images, train_labels), (test_images, test_labels) = datasets[dataset_name].load_data()
  num_classes = len(np.unique(train_labels))

  def prepare_inputs(images):
    flattened = images.reshape(images.shape[0], -1)
    return np.array(flattened/255, dtype=np.float64)

  def prepare_targets(labels):
    one_hot = np.eye(num_classes)[labels]
    return np.array(one_hot, dtype=np.float64)

  return (prepare_inputs(train_images), prepare_targets(train_labels),
          prepare_inputs(test_images), prepare_targets(test_labels),
          num_classes)

Overwriting dataset_load.py


## Sweep Configuration

In [None]:
# W&B sweep definition named "final_sweep": "bayes" search that maximises the
# logged "validation_accuracy". The keys under "parameters" are the ones
# trainNeuralNetwork_sweep reads from wandb.config.
sweep_configuration = {
    "method": "bayes",
    "name" : "final_sweep",
    "metric": {"name": "validation_accuracy", "goal": "maximize"},
    "parameters": {
        # searched hyper-parameters (more than one candidate value)
        "learning_rate": {"values": [1e-3, 1e-4]},
        "optimizer": {"values": ["sgd", "momentum", "nag", "rmsprop",  "adam", "nadam"]},
        "num_layers": {"values": [3, 4, 5]},          # number of hidden layers
        "hidden_size": {"values": [32, 64, 128]},     # neurons per hidden layer
        "batch_size": {"values": [16, 32, 64]},
        "activation": {"values": ["sigmoid", "tanh", "ReLU"]},
        "weight_decay": {"values": [0, 0.0005, 0.5]},
        "weight_init": {"values": ["random", "Xavier"]},
        "epochs": {"values": [10, 5]},
        # held fixed (single candidate value)
        "loss": {"values": ["cross_entropy"]},
        "momentum": {"values": [0.9]},
        "beta": {"values": [0.9]},
        "beta1": {"values":[0.9]},
        "beta2": {"values": [0.999]},
        "epsilon": {"values": [1e-8]},
        "dataset": {"values":["fashion_mnist"]}
    }
}

In [None]:
# sweep_configuration = {
#     "method": "grid",
#     "name" : "fashion_mnist_nadam_relu_confusion_matrix",
#     "parameters": {
#         "learning_rate": {"values": [1e-3]},
#         "optimizer": {"values": ["nadam"]},
#         "num_layers": {"values": [4]},
#         "hidden_size": {"values": [128]},
#         "batch_size": {"values": [16]},
#         "activation": {"values": ["ReLU"]},
#         "weight_decay": {"values": [0]},
#         "weight_init": {"values": ["Xavier"]},
#         "epochs": {"values": [10]},
#         "loss": {"values": ["cross_entropy"]},
#         "momentum": {"values": [0.9]},
#         "beta": {"values": [0.9]},
#         "beta1": {"values":[0.9]},
#         "beta2": {"values": [0.999]},
#         "epsilon": {"values": [1e-8]},
#         "dataset": {"values":["fashion_mnist"]},
#         "isConfusionMatrix": {"values": ["True"]}
#     }
# }

## Training the Feed Forward Neural Network

In [77]:
# %%writefile train_sweep.py
# from neural_network import *
# from dataset_load import *
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import wandb
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# W&B sweep definition named "train_sweep": "bayes" search that maximises the
# logged "validation_accuracy".
# Note: this assignment rebinds `sweep_configuration`, replacing the "final_sweep"
# definition from the earlier cell; the two differ only in "name".
# `isConfusionMatrix` is not part of this search space.
sweep_configuration = {
    "method": "bayes",
    "name" : "train_sweep",
    "metric": {"name": "validation_accuracy", "goal": "maximize"},
    "parameters": {
        # searched hyper-parameters (more than one candidate value)
        "learning_rate": {"values": [1e-3, 1e-4]},
        "optimizer": {"values": ["sgd", "momentum", "nag", "rmsprop",  "adam", "nadam"]},
        "num_layers": {"values": [3, 4, 5]},          # number of hidden layers
        "hidden_size": {"values": [32, 64, 128]},     # neurons per hidden layer
        "batch_size": {"values": [16, 32, 64]},
        "activation": {"values": ["sigmoid", "tanh", "ReLU"]},
        "weight_decay": {"values": [0, 0.0005, 0.5]},
        "weight_init": {"values": ["random", "Xavier"]},
        "epochs": {"values": [10, 5]},
        # held fixed (single candidate value)
        "loss": {"values": ["cross_entropy"]},
        "momentum": {"values": [0.9]},
        "beta": {"values": [0.9]},
        "beta1": {"values":[0.9]},
        "beta2": {"values": [0.999]},
        "epsilon": {"values": [1e-8]},
        "dataset": {"values":["fashion_mnist"]}
    }
}

def calculateAccuracy(y_true, y_pred):
  """
    Classification accuracy in percent.

    Both arguments are 2-D (samples x classes); the class of a row is the
    index of its largest entry.
  """
  predicted_classes = np.argmax(y_pred, axis=1)
  expected_classes = np.argmax(y_true, axis=1)
  hits = predicted_classes == expected_classes
  return np.mean(hits)*100

def trainNeuralNetwork_sweep():
  """
    Train and evaluate one network for the current W&B sweep run.

    Hyper-parameters are read from wandb.config. 10% of the training data is held
    out for validation (fixed random_state=42). After every epoch the train and
    validation loss/accuracy are logged; after training the test loss/accuracy are
    logged. A confusion matrix of the test predictions is logged only when the
    config sets `isConfusionMatrix` to "True"; sweeps that do not define the key
    simply skip it.
  """
  import math   # local import: this cell's own import list does not bring `math` into scope

  wandb.init(mode="online")
  args = wandb.config
  x_train, y_train, x_test, y_test, num_classes = load_data(args["dataset"])
  input_size = len(x_train[0])
  output_size = num_classes
  n_hiddenLayers = args["num_layers"]
  n_neuronsPerLayer = args["hidden_size"]
  activationFun = args["activation"]
  weight_init = args["weight_init"]
  batch_size = args["batch_size"]
  lossFunc = args["loss"]
  optimizer = args["optimizer"]
  learning_rate = args["learning_rate"]
  momentum = args["momentum"]
  beta = args["beta"]
  beta1 = args["beta1"]
  beta2 = args["beta2"]
  epsilon = args["epsilon"]
  weight_decay = args["weight_decay"]
  epochs = args["epochs"]

  wandb.run.name = f"{optimizer}_{activationFun}_{n_hiddenLayers}_{n_neuronsPerLayer}_{epochs}_{weight_init}"

  fnn = FeedForwardNeuralNetwork(input_size, output_size, n_hiddenLayers, n_neuronsPerLayer,
                                 activationFun, weight_init, batch_size, lossFunc,
                                 optimizer, learning_rate, momentum,
                                 beta, beta1, beta2,
                                 epsilon, weight_decay, epochs)

  x_train, x_validation, y_train, y_validation = train_test_split(x_train, y_train, test_size=0.1, random_state=42)
  num_batches = math.ceil(len(x_train)/batch_size)   # the last batch may be smaller than batch_size

  for epochNum in range(epochs):
    for batchNum in range(num_batches):
      start_idx = batchNum * batch_size
      end_idx = start_idx + batch_size

      x_batch = x_train[start_idx:end_idx]
      y_batch = y_train[start_idx:end_idx]

      # Forward Propagation
      a_pre_activation, h_post_activation = fnn.forwardPropagation(x_batch)
      y_pred_batch = h_post_activation[-1]

      # Back Propagation
      grad_w, grad_b = fnn.backwardPropagation(a_pre_activation, h_post_activation, y_batch, y_pred_batch)

      # Update weights (itr: 1-based count of updates across all epochs)
      itr = epochNum * num_batches + batchNum + 1
      fnn.updateWeights(grad_w, grad_b, itr)

    # Validation accuracy (isValidation=True: evaluate with the actual, not look-ahead, weights)
    _, h_validation = fnn.forwardPropagation(x_validation, isValidation=True)
    y_pred_validation = h_validation[-1]
    validation_accuracy = calculateAccuracy(y_validation, y_pred_validation)
    # NOTE(review): "metric_name" looks like a placeholder key; "validation_accuracy"
    # is already logged below — confirm whether this summary entry is needed.
    wandb.run.summary["metric_name"] = validation_accuracy

    # Train accuracy
    _, h_train = fnn.forwardPropagation(x_train, isValidation=True)
    y_pred_train = h_train[-1]
    train_accuracy = calculateAccuracy(y_train, y_pred_train)

    wandb.log({
        "epoch": epochNum + 1,
        "validation_loss": np.mean(fnn.lossFunc(y_validation, y_pred_validation)),
        "validation_accuracy": validation_accuracy,
        "train_loss": np.mean(fnn.lossFunc(y_train, y_pred_train)),
        "train_accuracy": train_accuracy
        },commit=True)

  # Test accuracy
  _,h_test = fnn.forwardPropagation(x_test, isValidation=True)
  y_pred_test = h_test[-1]
  test_accuracy = calculateAccuracy(y_test, y_pred_test)
  wandb.log({ "test_accuracy": test_accuracy,
             "test_loss": np.mean(fnn.lossFunc(y_test, y_pred_test))})

  # Confusion matrix (opt-in). `.get` with a default keeps sweeps that do not define
  # the flag from failing with a KeyError after training has already finished.
  if(str(args.get("isConfusionMatrix", "False")) == "True"):
    class_names = []
    if(args["dataset"] == "fashion_mnist"):
      class_names = ["T-shirt/top", "Trouser", "Pullover", "Dress","Coat","Sandal", "Shirt", "Sneaker","Bag","Ankle boot"]
    elif(args["dataset"] == "mnist"):
      class_names = [str(i) for i in range(10)]

    # Predictions are passed first, so rows are predicted classes and columns true
    # classes; the axis labels below follow that orientation.
    confusion_mat = confusion_matrix(y_pred_test.argmax(axis=1), y_test.argmax(axis=1))

    # plot
    plt.figure(figsize=(8,8))
    sns.heatmap(confusion_mat, annot=True, fmt="d", xticklabels=class_names, yticklabels=class_names, cmap="Greens")
    plt.xlabel("y_true")
    plt.ylabel("y_pred")
    plt.title("Confusion Matrix")

    wandb.log({"confusion_matrix": wandb.Image(plt)})
    plt.close()

  wandb.finish()

# Authenticate with W&B, register the sweep defined by `sweep_configuration` in the
# "DA6401_Assignment1" project, and start an agent that calls
# trainNeuralNetwork_sweep once per run of the sweep.
wandb.login()
wandb_id = wandb.sweep(sweep_configuration, project="DA6401_Assignment1")
wandb.agent(wandb_id, function=trainNeuralNetwork_sweep)

Overwriting train_sweep.py


In [None]:
import os
from getpass import getpass

# Never store a real key in the notebook: take it from the WANDB_API_KEY environment
# variable, or prompt for it without echoing when the variable is not set.
API_KEY = os.environ.get("WANDB_API_KEY") or getpass("W&B API key: ")
wandb.login(key=API_KEY)

### Squared error loss config

In [None]:
# W&B grid sweep named "squared_error": compares the two loss functions.
# Only "loss" (2 values), "optimizer" (2) and "activation" (3) vary, which gives
# 2 x 2 x 3 = 12 combinations; every other parameter has a single value.
sweep_configuration_2 = {
    "method": "grid",
    "name" : "squared_error",
    "parameters": {
        "learning_rate": {"values": [1e-3]},
        "num_layers": {"values": [4]},
        "hidden_size": {"values": [128]},
        "batch_size": {"values": [16]},
        "weight_decay": {"values": [0]},
        "weight_init": {"values": ["Xavier"]},
        "epochs": {"values": [10]},
        # the varied parameters
        "loss": {"values": ["mean_squared_error", "cross_entropy"]},
        "optimizer": {"values": ["nadam", "sgd"]},
        "activation": {"values": ["ReLU", "tanh", "sigmoid"]},
        "momentum": {"values": [0.9]},
        "beta": {"values": [0.9]},
        "beta1": {"values":[0.9]},
        "beta2": {"values": [0.999]},
        "epsilon": {"values": [1e-8]},
        "dataset": {"values":["fashion_mnist"]},
        # a string, not a bool: trainNeuralNetwork_sweep compares it against "True"
        "isConfusionMatrix": {"values": ["False"]}
    }
}

In [None]:
# Register the "squared_error" grid sweep in the "DA6401_Assignment1" project and
# start an agent that calls trainNeuralNetwork_sweep once per run of the sweep.
wandb_id = wandb.sweep(sweep_configuration_2, project="DA6401_Assignment1")
wandb.agent(wandb_id, function=trainNeuralNetwork_sweep)

## Argument Parser

In [98]:
# %%writefile argument_parser.py
import argparse

def parse_arguments(argv=None):
    """Build the command-line interface of the training script and parse it.

    Args:
        argv: Optional list of argument strings to parse. With the default
            (None) argparse reads ``sys.argv[1:]``, exactly as before. Passing
            an explicit list (for example ``[]`` for all defaults) makes the
            parser usable from a notebook cell or a test, where ``sys.argv``
            holds arguments that do not belong to this script.

    Returns:
        argparse.Namespace with one attribute per option: wandb_project,
        wandb_entity, dataset, epochs, batch_size, loss, optimizer,
        learning_rate, momentum, beta, beta1, beta2, epsilon, weight_decay,
        weight_init, num_layers, hidden_size, activation, confusion_matrix.

    Raises:
        SystemExit: raised by argparse for unknown options, values outside an
            option's ``choices`` or values that fail type conversion.
    """
    parser = argparse.ArgumentParser()

    # Experiment tracking
    parser.add_argument("-wp", "--wandb_project", type=str, default="DA6401_Assignment1",
                        help="Project name used to track experiments in Weights & Biases dashboard")
    parser.add_argument("-we", "--wandb_entity", type=str, default="nikhithaa-iit-madras",
                        help="Wandb Entity used to track experiments in the Weights & Biases dashboard.")

    # Data and training schedule
    parser.add_argument("-d", "--dataset", type=str, choices=["mnist", "fashion_mnist"], default="fashion_mnist",
                        help="Choose one among these datasets: ['mnist', 'fashion_mnist']")
    parser.add_argument("-e", "--epochs", type=int, default=10,
                        help="Number of epochs to train neural network")
    parser.add_argument("-b", "--batch_size", type=int, default=16,
                        help="Batch size used to train neural network")

    # Loss and optimizer settings
    parser.add_argument("-l", "--loss", type=str, choices=["mean_squared_error", "cross_entropy"], default="cross_entropy",
                        help="Choose one among these loss functions: ['mean_squared_error', 'cross_entropy']")
    parser.add_argument("-o", "--optimizer", type=str, choices=["sgd", "momentum", "nag", "rmsprop", "adam", "nadam"], default="nadam",
                        help="Choose one among these optimizers: ['sgd', 'momentum', 'nag', 'rmsprop', 'adam', 'nadam']")
    parser.add_argument("-lr", "--learning_rate", type=float, default=0.001,
                        help="Learning rate used to optimize model parameters")
    parser.add_argument("-m", "--momentum", type=float, default=0.9,
                        help="Momentum used by momentum and nag optimizers")
    parser.add_argument("-beta", "--beta", type=float, default=0.9,
                        help="Beta used by rmsprop optimizer")
    parser.add_argument("-beta1", "--beta1", type=float, default=0.9,
                        help="Beta1 used by adam and nadam optimizers")
    parser.add_argument("-beta2", "--beta2", type=float, default=0.999,
                        help="Beta2 used by adam and nadam optimizers")
    parser.add_argument("-eps", "--epsilon", type=float, default=0.00000001,
                        help="Epsilon used by optimizers")
    parser.add_argument("-w_d", "--weight_decay", type=float, default=0.0005,
                        help="Weight decay used by optimizers")

    # Network architecture
    parser.add_argument("-w_i", "--weight_init", type=str, choices=["random", "Xavier"], default="Xavier",
                        help="Choose one among these weight initialization methods: ['random', 'Xavier']")
    parser.add_argument("-nhl", "--num_layers", type=int, default=4,
                        help="Number of hidden layers used in feedforward neural network")
    parser.add_argument("-sz", "--hidden_size", type=int, default=64,
                        help="Number of hidden neurons in a feedforward layer")
    parser.add_argument("-a", "--activation", type=str, choices=["identity", "sigmoid", "tanh", "ReLU"], default="sigmoid",
                        help="Choose one among these activation functions: ['identity', 'sigmoid', 'tanh', 'ReLU']")

    # Reporting. The flag is kept as the strings "True"/"False" (not a bool),
    # so callers compare it against the string "True".
    parser.add_argument("-cm", "--confusion_matrix", type=str, choices=["True", "False"], default="False",
                        help="Set true if confusion matrix to be logged")

    # argv=None makes argparse fall back to sys.argv[1:].
    return parser.parse_args(argv)


Overwriting argument_parser.py


## Training (argparser included) for train.py file

In [104]:
# %%writefile train.py
# from neural_network import *
# from dataset_load import *
# from argument_parser import *
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import wandb
from sklearn.model_selection import train_test_split
import math
from sklearn.metrics import confusion_matrix

def calculateAccuracy(y_true, y_pred):
  """Return, as a percentage, how often the row-wise arg-max of y_pred equals that of y_true.

  Both inputs are reduced with ``np.argmax(..., axis=1)``, so each row is read
  as one sample and each column as one class score.
  """
  predicted, expected = (np.argmax(scores, axis=1) for scores in (y_pred, y_true))
  return np.mean(predicted == expected) * 100


def trainNeuralNetwork(args):
  """Train a feed-forward network from parsed CLI options and log the run to W&B.

  Steps: log in and open a W&B run, load the dataset named by ``args.dataset``,
  hold out 10% of the training data for validation (fixed ``random_state=42``),
  train with mini-batches for ``args.epochs`` epochs, log train/validation loss
  and accuracy after every epoch, log test loss and accuracy once at the end
  and, when ``args.confusion_matrix == "True"``, log a confusion-matrix heatmap
  of the test predictions.

  Args:
    args: Namespace of options as returned by ``parse_arguments()``. The
      attributes read here are wandb_project, wandb_entity, dataset,
      num_layers, hidden_size, activation, weight_init, batch_size, loss,
      optimizer, learning_rate, momentum, beta, beta1, beta2, epsilon,
      weight_decay, epochs and confusion_matrix.

  Returns:
    None. All results are reported through W&B.
  """
  wandb.login()
  # Store every option in the run config so the hyperparameters that are not
  # part of the run name (learning rate, batch size, loss, ...) are still
  # recorded with the run.
  wandb.init(project=args.wandb_project, entity=args.wandb_entity, config=vars(args))

  # load_data and FeedForwardNeuralNetwork are not defined in this cell; their
  # imports are commented out at the top of the cell.
  x_train, y_train, x_test, y_test, num_classes = load_data(args.dataset)
  input_size = len(x_train[0])
  output_size = num_classes
  n_hiddenLayers = args.num_layers
  n_neuronsPerLayer = args.hidden_size
  activationFun = args.activation
  weight_init = args.weight_init
  batch_size = args.batch_size
  lossFunc = args.loss
  optimizer = args.optimizer
  learning_rate = args.learning_rate
  momentum = args.momentum
  beta = args.beta
  beta1 = args.beta1
  beta2 = args.beta2
  epsilon = args.epsilon
  weight_decay = args.weight_decay
  epochs = args.epochs

  wandb.run.name = f"train_run_{optimizer}_{activationFun}_{n_hiddenLayers}_{n_neuronsPerLayer}_{epochs}_{weight_init}"

  # Pass all of the hyperparameters above to the network constructor.
  fnn = FeedForwardNeuralNetwork(input_size, output_size, n_hiddenLayers, n_neuronsPerLayer,
                                 activationFun, weight_init, batch_size, lossFunc,
                                 optimizer, learning_rate, momentum,
                                 beta, beta1, beta2,
                                 epsilon, weight_decay, epochs)

  # Fixed seed so the 90/10 train/validation split is the same on every run.
  x_train, x_validation, y_train, y_validation = train_test_split(x_train, y_train, test_size=0.1, random_state=42)
  # ceil() keeps the final, possibly smaller, batch; slicing past the end is safe.
  num_batches = math.ceil(len(x_train)/batch_size)

  for epochNum in range(epochs):
    for batchNum in range(num_batches):
      start_idx = batchNum * batch_size
      end_idx = start_idx + batch_size

      x_batch = x_train[start_idx:end_idx]
      y_batch = y_train[start_idx:end_idx]

      # Forward Propagation
      a_pre_activation, h_post_activation = fnn.forwardPropagation(x_batch)
      y_pred_batch = h_post_activation[-1]

      # Back Propagation
      grad_w, grad_b = fnn.backwardPropagation(a_pre_activation, h_post_activation, y_batch, y_pred_batch)

      # Update weights; itr is the 1-based update count across all epochs.
      itr = epochNum * num_batches + batchNum + 1
      fnn.updateWeights(grad_w, grad_b, itr)

    # Validation accuracy
    # NOTE(review): the effect of isValidation=True is defined by
    # forwardPropagation, which is not visible from this cell.
    _, h_validation = fnn.forwardPropagation(x_validation, isValidation=True)
    y_pred_validation = h_validation[-1]
    validation_accuracy = calculateAccuracy(y_validation, y_pred_validation)
    # Keep the latest validation accuracy in the run summary under the metric's
    # own name (this used to be written under the placeholder key "metric_name").
    wandb.run.summary["validation_accuracy"] = validation_accuracy

    # Train accuracy
    _, h_train = fnn.forwardPropagation(x_train, isValidation=True)
    y_pred_train = h_train[-1]
    train_accuracy = calculateAccuracy(y_train, y_pred_train)

    wandb.log({
        "epoch": epochNum + 1,
        "validation_loss": np.mean(fnn.lossFunc(y_validation, y_pred_validation)),
        "validation_accuracy": validation_accuracy,
        "train_loss": np.mean(fnn.lossFunc(y_train, y_pred_train)),
        "train_accuracy": train_accuracy
        },commit=True)

  # Test accuracy
  _,h_test = fnn.forwardPropagation(x_test, isValidation=True)
  y_pred_test = h_test[-1]
  test_accuracy = calculateAccuracy(y_test, y_pred_test)
  wandb.log({ "test_accuracy": test_accuracy,
             "test_loss": np.mean(fnn.lossFunc(y_test, y_pred_test))})

  # Confusion matrix
  class_names = []
  # The flag is the string "True"/"False", not a bool.
  if(args.confusion_matrix == "True"):
      if(args.dataset == "fashion_mnist"):
          class_names = ["T-shirt/top", "Trouser", "Pullover", "Dress","Coat","Sandal", "Shirt", "Sneaker","Bag","Ankle boot"]
      elif(args.dataset == "mnist"):
          class_names = [str(i) for i in range(10)]

      # Predictions are passed first on purpose: rows of the matrix are then
      # the predicted class and columns the true class, which matches the axis
      # labels set below (y axis = y_pred, x axis = y_true).
      confusion_mat = confusion_matrix(y_pred_test.argmax(axis=1), y_test.argmax(axis=1))

      # plot
      plt.figure(figsize=(8,8))
      sns.heatmap(confusion_mat, annot=True, fmt="d", xticklabels=class_names, yticklabels=class_names, cmap="Greens")
      plt.xlabel("y_true")
      plt.ylabel("y_pred")
      plt.title("Confusion Matrix")
      plt.xticks(rotation=45)
      plt.yticks(rotation=45)
      plt.tight_layout()

      wandb.log({"confusion_matrix": wandb.Image(plt)})
      # Close the figure so it does not stay open after it has been logged.
      plt.close()

  wandb.finish()


# Script entry point: parse the command-line options and start one training run.
if __name__=="__main__":
  args = parse_arguments()
  trainNeuralNetwork(args)

Overwriting train.py


In [None]:
!python3 train.py -wp DA6401_Assignment1 -we nikhithaa-iit-madras -b 16 -beta1 0.9 -beta2 0.999 -lr 0.001 -e 10 --dataset fashion_mnist -o nadam -a ReLU -w_d 0 -cm True -nhl 4 -sz 128

2025-03-17 18:03:34.367855: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742234614.394038   39909 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742234614.401654   39909 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[34m[1mwandb[0m: Currently logged in as: [33mnikhithaa[0m ([33mnikhithaa-iit-madras[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.19.8
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/content/wandb/run-20250317_180339-9uxj2zq5[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: S

In [None]:
import os
from getpass import getpass

# Never store the W&B API key in the notebook source: take it from the
# WANDB_API_KEY environment variable and, when that is unset or empty, ask for
# it interactively without echoing the typed value.
API_KEY = os.environ.get("WANDB_API_KEY") or getpass("Weights & Biases API key: ")
wandb.login(key=API_KEY)