## Installation

In [None]:
pip install wandb numpy pandas matplotlib

## Q1: fashion-MNIST dataset

In [None]:
import wandb
from keras.datasets import fashion_mnist

# Fashion-MNIST via Keras: (images, labels) for the train and the test split.
# The archives are downloaded on first use (see the download log in the cell output).
(x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()

# Human-readable name for each integer label 0-9; used as image captions in logClassImages.
classes = {
    0: "T-shirt/top",
    1: "Trouser",
    2: "Pullover",
    3: "Dress",
    4: "Coat",
    5: "Sandal",
    6: "Shirt",
    7: "Sneaker",
    8: "Bag",
    9: "Ankle boot"
}

def logClassImages(project_name:str):
  """Log one sample test image per class to a Weights & Biases run.

  For every label in `classes`, the first image of `x_test` whose entry in
  `y_test` equals that label is logged, captioned with the class name.
  A label that never occurs in `y_test` is skipped, so the captions of the
  remaining classes stay attached to the right images.

  Args:
    project_name: name of the W&B project the run is created in.
  """
  wandb.init(project=project_name)
  try:
    wandb_images = []
    for classNumber, className in classes.items():
      # index of the first test sample carrying this label, or None if there is none
      first_match = next((j for j in range(len(y_test)) if y_test[j] == classNumber), None)
      if first_match is None:
        continue
      wandb_images.append(wandb.Image(x_test[first_match], caption = className))

    wandb.log({"Sample images for each class": wandb_images})
  finally:
    # close the run even when building or logging the images fails
    wandb.finish()

# logClassImages("da6401_assignment1")

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
[1m29515/29515[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
[1m26421880/26421880[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
[1m5148/5148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz
[1m4422102/4422102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


## Feedforward neural network



In [137]:
import wandb
from keras.datasets import fashion_mnist
import numpy as np
import copy

In [81]:
"""
  ACTIVATION FUNCTIONS
"""
def identity(x):
    """Linear (pass-through) activation: hand the input back unchanged."""
    passthrough = x
    return passthrough

def sigmoid(x):
    """Logistic sigmoid 1 / (1 + exp(-x)), evaluated without overflow.

    The naive formula calls exp(-x), which overflows for large negative x.
    Here exp is only ever applied to a non-positive number.

    Args:
        x: scalar or array-like of pre-activations.

    Returns:
        Sigmoid of `x`, element-wise, with the same shape as `x`.
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))  # lies in (0, 1], so it can never overflow
    # x >= 0: 1 / (1 + e^-x)    x < 0: e^x / (1 + e^x)  -- the same function, rearranged
    numerator = np.where(x >= 0, 1.0, decay)
    return numerator / (1.0 + decay)

def tanh(x):
    """Hyperbolic tangent activation, applied element-wise through NumPy."""
    activated = np.tanh(x)
    return activated

def relu(x):
    """Rectified linear unit: element-wise maximum of 0 and the input."""
    floor = 0
    return np.maximum(floor, x)

def softmax(x):
    """Softmax along the last axis; every slice along that axis sums to 1."""
    # Subtracting the per-slice maximum does not change the result but keeps exp() from overflowing.
    shifted = x - np.max(x, axis=-1, keepdims=True)
    unnormalised = np.exp(shifted)
    normaliser = np.sum(unnormalised, axis=-1, keepdims=True)
    return unnormalised / normaliser

In [99]:
"""
  LOSS FUNCTIONS
"""
def mean_squared_error(y_true, y_pred):
    """Mean of the squared element-wise differences between targets and predictions."""
    residual = y_true - y_pred
    return np.mean(residual ** 2)

import numpy as np

def cross_entropy_loss(y_true, y_pred, epsilon=1e-12):
    """Cross-entropy between target distributions and predicted probabilities.

    Predicted probabilities are floored at `epsilon` before the logarithm so
    that an exact 0 in `y_pred` yields a large finite loss instead of
    inf (or NaN when the matching target entry is 0).

    Args:
        y_true: targets (e.g. one-hot rows); classes on the last axis.
        y_pred: predicted probabilities, same shape as `y_true`.
        epsilon: smallest probability passed to the logarithm.

    Returns:
        -sum(y_true * log(y_pred)) over the last axis: one value per sample
        for batched input, a scalar for a single vector.
    """
    safe_pred = np.maximum(y_pred, epsilon)
    return -np.sum(y_true * np.log(safe_pred), axis=-1)

In [132]:
"""
  DERIVATIVES OF ACTIVATION AND LOSS FUNCTIONS
"""
def identity_derivative(x):
    """Derivative of the identity activation: an array of ones shaped like `x`."""
    ones = np.ones_like(x)
    return ones

def sigmoid_derivative(x):
    """Derivative of the sigmoid at pre-activation `x`: s * (1 - s) with s = sigmoid(x)."""
    activation = sigmoid(x)
    complement = 1 - activation
    return activation * complement

def tanh_derivative(x):
    """Derivative of tanh at pre-activation `x`: 1 - tanh(x)^2."""
    squared = np.tanh(x) ** 2
    return 1 - squared

def relu_derivative(x):
    """Derivative of ReLU at pre-activation `x`: 1 where x > 0, otherwise 0 (including at x == 0)."""
    is_positive = x > 0
    return np.where(is_positive, 1, 0)

def mean_squared_error_derivative(y_true, y_pred):
    """Gradient direction of the squared error w.r.t. the predictions: y_pred - y_true.

    The constant factor that differentiating a mean of squares would introduce
    is not applied here.
    """
    error = y_pred - y_true
    return error

def cross_entropy_loss_derivative(y_true, y_pred, epsilon=1e-12):
    """Derivative of the cross-entropy loss w.r.t. the predicted probabilities.

    Predicted probabilities are floored at `epsilon` before dividing so that an
    exact 0 in `y_pred` gives a large finite gradient instead of inf/NaN.

    Args:
        y_true: targets (e.g. one-hot rows).
        y_pred: predicted probabilities, same shape as `y_true`.
        epsilon: smallest probability used as divisor.

    Returns:
        -y_true / y_pred, element-wise.
    """
    safe_pred = np.maximum(y_pred, epsilon)
    return -y_true / safe_pred

def softmax_derivative(inp:np.array):
    """Jacobian of softmax, computed from softmax *outputs*.

    For a probability vector s the Jacobian is diag(s) - s s^T. The last axis
    of `inp` indexes the classes; all leading axes are treated as batch axes,
    so any number of batch dimensions is supported without a Python loop.

    Args:
        inp: softmax outputs (not pre-activations), at least 1-D.

    Returns:
        Array of shape inp.shape + (inp.shape[-1],): (n, n) for a single
        vector, (B, n, n) for a batch of B vectors.

    Raises:
        ValueError: if `inp` is 0-dimensional.
    """
    probs = np.asarray(inp)
    if probs.ndim == 0:
        raise ValueError("softmax_derivative expects at least a 1-D array of softmax outputs")

    n_classes = probs.shape[-1]
    column = probs[..., :, np.newaxis]   # (..., n, 1)
    row = probs[..., np.newaxis, :]      # (..., 1, n)
    diagonal = column * np.eye(n_classes, dtype=probs.dtype)   # diag(s) for every batch entry
    outer = column * row                                       # s s^T for every batch entry
    return diagonal - outer

In [None]:
"""
  OPTIMIZERS UPDATE RULES
"""

# STOCHASTIC GRADIENT DESCENT
def sgd(optimizer_input_dict, wts_bias_history_dict):
  """Plain gradient-descent step with L2 weight decay, applied in place.

  Every layer present in wts_bias_history_dict["weights"] is updated, the
  output layer included.

  Args:
    optimizer_input_dict: reads "learning_rate" and "weight_decay".
    wts_bias_history_dict: per-layer lists "weights", "biases", "dw", "db";
      the entries of "weights" and "biases" are replaced by the updated arrays.
  """
  learning_rate = optimizer_input_dict["learning_rate"]
  weight_decay = optimizer_input_dict["weight_decay"]
  weights = wts_bias_history_dict["weights"]
  biases = wts_bias_history_dict["biases"]
  dw = wts_bias_history_dict["dw"]
  db = wts_bias_history_dict["db"]

  # cant update weights in one single matrix op as dimensions of weights can be different in each layer
  for i in range(len(weights)):
    # weight decay term added additionally to the formula in slides
    weights[i] = weights[i] - learning_rate * (dw[i] + (weight_decay * weights[i]))
    biases[i] = biases[i] - learning_rate * db[i]

# MOMENTUM BASED GRADIENT DESCENT
def momentumGradientDescent(optimizer_input_dict, wts_bias_history_dict):
  """Momentum gradient-descent step with L2 weight decay, applied in place.

  For every layer (output layer included):
    history = momentum * history + gradient (+ weight_decay * weights for the weights)
    parameter = parameter - learning_rate * history

  Args:
    optimizer_input_dict: reads "learning_rate", "momentum" and "weight_decay".
    wts_bias_history_dict: per-layer lists "weights", "biases", "dw", "db",
      "history_weights", "history_biases"; parameters and histories are updated.
  """
  learning_rate = optimizer_input_dict["learning_rate"]
  momentum = optimizer_input_dict["momentum"]
  weight_decay = optimizer_input_dict["weight_decay"]
  weights = wts_bias_history_dict["weights"]
  biases = wts_bias_history_dict["biases"]
  history_weights = wts_bias_history_dict["history_weights"]
  history_biases = wts_bias_history_dict["history_biases"]

  for i in range(len(weights)):
    history_weights[i] = (momentum * history_weights[i]) + wts_bias_history_dict["dw"][i] + (weight_decay * weights[i])
    weights[i] = weights[i] - learning_rate * history_weights[i]

    history_biases[i] = (momentum * history_biases[i]) + wts_bias_history_dict["db"][i]
    biases[i] = biases[i] - learning_rate * history_biases[i]

# NAG(NESTEROV ACCELERATED GRADIENT DESCENT)
def nag(optimizer_input_dict, wts_bias_history_dict):
  """Nesterov accelerated gradient step with L2 weight decay, applied in place.

  The update rule is the momentum rule; what makes it Nesterov is that "dw"
  and "db" are expected to hold gradients evaluated at the look-ahead
  parameters (the forward and backward passes are responsible for that).
  Every layer is updated, the output layer included.

  Args:
    optimizer_input_dict: reads "learning_rate", "momentum" and "weight_decay".
    wts_bias_history_dict: per-layer lists "weights", "biases", "dw", "db",
      "history_weights", "history_biases"; parameters and histories are updated.
  """
  learning_rate = optimizer_input_dict["learning_rate"]
  momentum = optimizer_input_dict["momentum"]
  weight_decay = optimizer_input_dict["weight_decay"]
  weights = wts_bias_history_dict["weights"]
  biases = wts_bias_history_dict["biases"]
  history_weights = wts_bias_history_dict["history_weights"]
  history_biases = wts_bias_history_dict["history_biases"]

  for i in range(len(weights)):
    # dw,db will contain lookahead gradients only since forward and backward propagations are implemented accordingly
    history_weights[i] = (momentum * history_weights[i]) + wts_bias_history_dict["dw"][i] + (weight_decay * weights[i])
    weights[i] = weights[i] - learning_rate * history_weights[i]

    history_biases[i] = (momentum * history_biases[i]) + wts_bias_history_dict["db"][i]
    biases[i] = biases[i] - learning_rate * history_biases[i]

# RMSPROP


In [133]:
class FeedForwardNeuralNetwork:
  """Fully connected feed-forward classifier with a softmax output layer.

  Layer layout (L = n_hiddenLayers, so there are L + 1 weight matrices):
    weights[0]      : input_size        x n_neuronsPerLayer
    weights[1..L-1] : n_neuronsPerLayer x n_neuronsPerLayer
    weights[L]      : n_neuronsPerLayer x output_size
    biases[i]       : n_neuronsPerLayer   (i = 0..L-1)
    biases[L]       : output_size

  `wts_bias_history_dict` is the mutable state handed to the optimizer update
  rules. Its "weights"/"biases" entries are the very same list objects as
  `self.weights`/`self.biases`, so in-place updates made by an optimizer are
  visible on the network. It also carries one buffer per layer for
  "history_weights", "history_biases", "dw" and "db".
  """

  # class variables
  # Only the update rules defined in this notebook are registered. "rmsprop", "adam" and
  # "nadam" have to be implemented and added here before they can be selected.
  optimizersMap = {"sgd": sgd, "momentum": momentumGradientDescent, "nag": nag}
  lossFunctionsMap = {"mean_squared_error": mean_squared_error, "cross_entropy" : cross_entropy_loss}
  activationFunctionsMap = {"identity":identity, "sigmoid":sigmoid, "tanh":tanh, "ReLU":relu, "softmax": softmax}
  # keyed by the __name__ of the forward function whose derivative is stored
  derivatesFuncMap = {"mean_squared_error": mean_squared_error_derivative, "cross_entropy_loss": cross_entropy_loss_derivative, "identity": identity_derivative,
                      "sigmoid": sigmoid_derivative, "tanh": tanh_derivative, "relu": relu_derivative, "softmax": softmax_derivative}

  def __init__(self,
               input_size=784, output_size=10,
               n_hiddenLayers=3, n_neuronsPerLayer=32,
               activationFun="sigmoid",
               weight_init="random",
               batch_size=64,
               lossFunc="cross_entropy",
               optimizer="adam",
               learning_rate=0.001,
               momentum=0.5,
               beta=0.9, beta1=0.9, beta2=0.99,
               epsilon=1e-8, weight_decay=0.01,
               epochs=10):
    """Store the hyper-parameters and allocate weights, biases and optimizer buffers.

    Args:
      input_size: number of input features.
      output_size: number of output classes.
      n_hiddenLayers: number of hidden layers.
      n_neuronsPerLayer: width of every hidden layer.
      activationFun: key of `activationFunctionsMap`, used for the hidden layers.
      weight_init: "random" (standard normal) or "Xavier" (Xavier normal).
      batch_size: mini-batch size.
      lossFunc: key of `lossFunctionsMap`.
      optimizer: key of `optimizersMap`.
      learning_rate, momentum, beta, beta1, beta2, epsilon, weight_decay:
        optimizer hyper-parameters, collected in `optimizer_input_dict`.
      epochs: number of training epochs.

    Raises:
      KeyError: if `activationFun`, `lossFunc` or `optimizer` is not registered
        in the corresponding map. This includes the default optimizer "adam"
        until an Adam update rule is added to `optimizersMap`.
      ValueError: if `weight_init` is not a supported scheme.
    """
    # Initialization parameters
    self.input_size = input_size  # no of features
    self.output_size = output_size
    self.n_hiddenLayers = n_hiddenLayers
    self.n_neuronsPerLayer = n_neuronsPerLayer
    self.weight_init = weight_init
    self.epochs = epochs

    self.activationFun = self._lookup(self.activationFunctionsMap, activationFun, "activation function")
    self.lossFunc = self._lookup(self.lossFunctionsMap, lossFunc, "loss function")
    self.optimizer = self._lookup(self.optimizersMap, optimizer, "optimizer")

    # parameters required for optimizers
    self.batch_size = batch_size
    # NAG evaluates gradients at look-ahead parameters (see forwardPropagation)
    self.isLookAhead = (optimizer == "nag")

    self.optimizer_input_dict = { "learning_rate" : learning_rate,
                                  "momentum" : momentum,                  # used by momentumGD
                                  "beta" : beta,                          # used by rmsprop
                                  "beta1" : beta1,                        # used by adam & nadam
                                  "beta2" : beta2,                        # used by adam & nadam
                                  "epsilon" : epsilon,
                                  "weight_decay" : weight_decay,
                                  "n_hiddenLayers": n_hiddenLayers}

    # weights and biases matrices
    self.weights = []
    self.biases = []
    self.lookAheadWeights = []
    self.lookAheadBiases = []

    # The per-layer buffers are (re)built by initializeWeightsAndBiases with one entry
    # for each of the n_hiddenLayers + 1 layers.
    self.wts_bias_history_dict = {"weights": self.weights, "biases": self.biases,
                                  "history_weights": [],
                                  "history_biases": [],
                                  "dw": [],
                                  "db": []}

    self.initializeWeightsAndBiases()

    # pre-activation(a) and post-activation(h) values
    self.a = []
    self.h = []

  @staticmethod
  def _lookup(mapping, key, kind):
    """Return mapping[key]; raise a KeyError that names the valid choices otherwise."""
    if key not in mapping:
      raise KeyError(f"Unknown {kind} '{key}'; available: {sorted(mapping)}")
    return mapping[key]

  @staticmethod
  def _backpropThroughSoftmax(grad_h, softmax_out):
    """Turn dL/dh into dL/da for h = softmax(a), row by row.

    Uses J^T g = s * (g - <g, s>) for the Jacobian J = diag(s) - s s^T, which
    avoids materialising one Jacobian per sample.
    """
    weighted_sum = np.sum(grad_h * softmax_out, axis=-1, keepdims=True)
    return softmax_out * (grad_h - weighted_sum)

  def initializeWeightsAndBiases(self):
    """(Re)create weights, biases and optimizer buffers according to `weight_init`.

    "random" draws standard-normal weights; "Xavier" draws normal weights with
    mean 0 and variance 2 / (fan_in + fan_out). Biases, histories and gradient
    buffers start at zero. The shapes follow the layout in the class docstring.

    Raises:
      ValueError: if `weight_init` is neither "random" nor "Xavier".
    """
    layer_sizes = [self.input_size] + [self.n_neuronsPerLayer] * self.n_hiddenLayers + [self.output_size]
    layer_shapes = list(zip(layer_sizes[:-1], layer_sizes[1:]))   # (fan_in, fan_out) per layer

    if(self.weight_init == "random"):   # Random Normal
      new_weights = [np.random.randn(fan_in, fan_out) for fan_in, fan_out in layer_shapes]
    elif(self.weight_init == "Xavier"):   # Xavier Normal: mean = 0, variance = 2/(n_input + n_output)
      new_weights = [np.random.normal(loc=0.0, scale=np.sqrt(2/(fan_in + fan_out)), size=(fan_in, fan_out))
                     for fan_in, fan_out in layer_shapes]
    else:
      raise ValueError(f"Unknown weight_init '{self.weight_init}'; expected 'random' or 'Xavier'")

    # Slice-assign so the lists shared with wts_bias_history_dict keep their identity.
    self.weights[:] = new_weights
    self.biases[:] = [np.zeros(fan_out) for _, fan_out in layer_shapes]

    state = self.wts_bias_history_dict
    state["history_weights"] = [np.zeros(shape) for shape in layer_shapes]
    state["history_biases"] = [np.zeros(fan_out) for _, fan_out in layer_shapes]
    state["dw"] = [np.zeros(shape) for shape in layer_shapes]
    state["db"] = [np.zeros(fan_out) for _, fan_out in layer_shapes]

  def forwardPropagation(self, x_batch):
    """Forward pass for a whole batch.

    Args:
      x_batch: NumPy array of shape B x input_size (B = batch size).

    Returns:
      (a_pre_activation, h_post_activation): two lists with n_hiddenLayers + 2
      entries each. Entry 0 of both lists is `x_batch`; entry i + 1 holds the
      pre-/post-activation of layer i. The last post-activation is the softmax
      output, i.e. the predicted class probabilities.

    When the optimizer is "nag" the pass runs on look-ahead parameters, which
    are also stored in `lookAheadWeights` / `lookAheadBiases`.
    """
    if(self.isLookAhead):
      state = self.wts_bias_history_dict
      # The optimizer moves parameters by learning_rate * history, so the look-ahead
      # point is parameter - momentum * learning_rate * history.
      step = self.optimizer_input_dict["momentum"] * self.optimizer_input_dict["learning_rate"]
      wt = [w - (step * hw) for w, hw in zip(self.weights, state["history_weights"])]
      b = [bias - (step * hb) for bias, hb in zip(self.biases, state["history_biases"])]

      self.lookAheadWeights = wt
      self.lookAheadBiases = b
    else:
      # The forward pass only reads the parameters, so no copy is needed.
      wt = self.weights
      b = self.biases

    # considering a0,h0 as X values as a1: first layer  (it is calculated from x values)
    a_pre_activation = [x_batch]
    h_post_activation = [x_batch]

    # Except last layer since activation function could be different
    for i in range(self.n_hiddenLayers):
      # ai: B x n_neuronsPerLayer, biases[i]: n_neuronsPerLayer (broadcast over the batch)
      ai = np.matmul(h_post_activation[-1], wt[i]) + b[i]
      a_pre_activation.append(ai)
      h_post_activation.append(self.activationFun(ai))

    # aL: last layer (activation function is softmax)
    aL = np.matmul(h_post_activation[-1], wt[self.n_hiddenLayers]) + b[self.n_hiddenLayers]
    a_pre_activation.append(aL)
    h_post_activation.append(softmax(aL))   # y_batch

    return a_pre_activation, h_post_activation

  def backwardPropagation(self, a_pre_activation, h_post_activation, y_batch, y_pred_batch):
    """Backward pass for a whole batch.

    Args:
      a_pre_activation, h_post_activation: lists returned by forwardPropagation.
      y_batch: targets with the same shape as `y_pred_batch` (e.g. one-hot rows).
      y_pred_batch: softmax output of the forward pass, shape B x output_size.

    Returns:
      (grad_w, grad_b): per-layer gradient lists shaped like `weights` and
      `biases`, averaged over the batch. The same lists are stored under "dw"
      and "db" in `wts_bias_history_dict` for the optimizer update rules.
    """
    # Gradients must be taken at the parameters the forward pass actually used.
    wt = self.lookAheadWeights if (self.isLookAhead and self.lookAheadWeights) else self.weights
    n_layers = self.n_hiddenLayers + 1
    n_samples = y_pred_batch.shape[0]

    # Output gradient (wrt aL)
    loss_name = self.lossFunc.__name__
    if(loss_name == "cross_entropy_loss"):
      # Closed form of (-y / y_hat) pushed through the softmax Jacobian; never divides by y_hat.
      grad_a = y_pred_batch * np.sum(y_batch, axis=-1, keepdims=True) - y_batch
    else:
      grad_hL = self.derivatesFuncMap[loss_name](y_batch, y_pred_batch)
      grad_a = self._backpropThroughSoftmax(grad_hL, y_pred_batch)

    activation_name = self.activationFun.__name__
    grad_w = [None] * n_layers
    grad_b = [None] * n_layers

    # Walk from the output layer back to the first layer.
    for k in range(n_layers - 1, -1, -1):
      # gradients w.r.t parameters; h_post_activation[k] is the input of layer k
      grad_w[k] = np.matmul(h_post_activation[k].T, grad_a) / n_samples
      grad_b[k] = np.sum(grad_a, axis=0) / n_samples

      if(k > 0):
        # gradient w.r.t. the output of the layer below, then through its activation
        grad_h = np.matmul(grad_a, wt[k].T)
        if(activation_name == "softmax"):
          grad_a = self._backpropThroughSoftmax(grad_h, h_post_activation[k])
        else:
          grad_a = grad_h * self.derivatesFuncMap[activation_name](a_pre_activation[k])

    self.wts_bias_history_dict["dw"] = grad_w
    self.wts_bias_history_dict["db"] = grad_b
    return grad_w, grad_b

In [6]:
import numpy as np

# # Create a column vector (shape: (3,1))
# v1 = np.array([[1,2], [2,3], [3,4]])  # Shape (3,1)
# v2 = np.array([[4], [5], [6]])  # Shape (3,1)

# # Append to a list
# vector_list = []
# vector_list.append(v1)
# vector_list.append(v2)

# print(vector_list)

# # flatten each element of x_train, after flattening store it as np array
# x_ftrain = np.array([x.flatten() for x in x_train])
# x1 = [x.flatten() for x in x_train]
# # print(x_ftrain[0].shape)
# # print(x_ftrain.shape)
# # print(x_ftrain[:20].shape)
# # print(y_train[:20].shape)
# # print(x_test.shape)
# # print(y_test.shape)
# m = np.array(np.zeros((784,3)))
# y = np.matmul(x_ftrain[:20],m)
# print(y.shape)


# Scratch check of the element-wise expression -y / y_hat on a small hand-made batch.
# NOTE(review): the first row (0, 1, 0.5) is not one-hot.
y_batch = np.array([
    [0, 1, 0.5],  # Example 1
    [1, 0, 0],  # Example 2
    [0, 0, 1]   # Example 3
])

# Predictions for the three examples; every row sums to 1.
y_hat_batch = np.array([
    [0.2, 0.5, 0.3],
    [0.8, 0.1, 0.1],
    [0.3, 0.3, 0.4]
])

grad_batch = - (y_batch / y_hat_batch)  # Element-wise division
print(grad_batch)


[[-0.         -2.         -1.66666667]
 [-1.25       -0.         -0.        ]
 [-0.         -0.         -2.5       ]]


In [93]:
import numpy as np

def identity(x):
    """Linear (pass-through) activation: hand the input back unchanged."""
    passthrough = x
    return passthrough

def identity_derivative(x):
    """Derivative of the identity activation: an array of ones shaped like `x`."""
    ones = np.ones_like(x)
    return ones

# Shared scratch inputs: a scalar, a 1-D vector and a 2 x 2 batch.
# x_single and x_vector are reused by the following activation check cells.
x_single = 2.0
x_vector = np.array([1.0, 2.0, -3.0])
x_batch = np.array([[1.0, 1.0], [3.0, 4.0]])

# identity returns the batch unchanged; its derivative is all ones with the same shape.
print(identity(x_batch))  # Works
print(identity_derivative(x_batch))  # Works


[[1. 1.]
 [3. 4.]]
[[1. 1.]
 [1. 1.]]


In [66]:
def sigmoid(x):
    """Logistic sigmoid 1 / (1 + exp(-x)), evaluated without overflow.

    The naive formula calls exp(-x), which overflows for large negative x.
    Here exp is only ever applied to a non-positive number.

    Args:
        x: scalar or array-like of pre-activations.

    Returns:
        Sigmoid of `x`, element-wise, with the same shape as `x`.
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))  # lies in (0, 1], so it can never overflow
    # x >= 0: 1 / (1 + e^-x)    x < 0: e^x / (1 + e^x)  -- the same function, rearranged
    numerator = np.where(x >= 0, 1.0, decay)
    return numerator / (1.0 + decay)

def sigmoid_derivative(x):
    """Derivative of the sigmoid at pre-activation `x`: s * (1 - s) with s = sigmoid(x)."""
    activation = sigmoid(x)
    complement = 1 - activation
    return activation * complement

# NOTE(review): the saved output shows exactly 1.0 for the middle entry, which sigmoid(2.0)
# (about 0.88) cannot produce, so x_vector held a different value when this cell last ran.
print(sigmoid(x_vector))  # Works
print(sigmoid_derivative(x_vector))  # Works

[0.73105858 1.         0.04742587]
[0.19661193 0.         0.04517666]


In [67]:
def tanh(x):
    """Hyperbolic tangent activation, applied element-wise through NumPy."""
    activated = np.tanh(x)
    return activated

def tanh_derivative(x):
    """Derivative of tanh at pre-activation `x`: 1 - tanh(x)^2."""
    squared = np.tanh(x) ** 2
    return 1 - squared

# NOTE(review): the saved output shows exactly 1.0 for the middle entry, which tanh(2.0)
# (about 0.96) cannot produce, so x_vector held a different value when this cell last ran.
print(tanh(x_vector))  # Works
# the derivative is checked on the scalar input here, not on the vector
print(tanh_derivative(x_single))  # Works

[ 0.76159416  1.         -0.99505475]
0.07065082485316443


In [68]:
def relu(x):
    """Rectified linear unit: element-wise maximum of 0 and the input."""
    floor = 0
    return np.maximum(floor, x)

def relu_derivative(x):
    """Derivative of ReLU at pre-activation `x`: 1 where x > 0, otherwise 0 (including at x == 0)."""
    is_positive = x > 0
    return np.where(is_positive, 1, 0)

# NOTE(review): the saved output shows 2.e+03 for the middle entry, so x_vector was not
# [1.0, 2.0, -3.0] when this cell last ran.
print(relu(x_vector))  # Works
print(relu_derivative(x_vector))  # Works

[1.e+00 2.e+03 0.e+00]
[1 1 0]


In [74]:
# # def softmax(x):
# #     exp_x = np.exp(x - np.max(x, axis=-1, keepdims=True))  # Stability trick
# #     return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

# # Softmax needs a vector input
# x_vector = np.array([1.0, 20000.0, 3.0])
# x_batch = np.array([[1.0, 2.0, 3000.0], [1.5, 2.5, 3.5]])

# print(softmax(x_vector))  # Works
# print(softmax(x_batch))  # Works (batch-wise softmax)

In [39]:
# Raw training labels: a 1-D array of integer class ids (see `classes` for their names).
print(y_train)

[9 0 0 ... 3 0 5]


In [75]:
import numpy as np

num_classes = 10
# Indexing the identity matrix with the integer labels yields one one-hot row per sample.
y_train_one_hot = np.eye(num_classes)[y_train]
y_test_one_hot = np.eye(num_classes)[y_test]

print(y_train_one_hot[-1])  # one-hot row of the last training label (a length-10 vector, not the array shape)

[0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]


In [95]:
print(x_vector)
# NOTE(review): this rebinds x_vector from a NumPy array to a plain Python list, so cells
# that negate or compare x_vector element-wise will behave differently if run after this one.
x_vector=[1,2,3]

[ 1.  2. -3.]


In [110]:
# Single-sample check. The result of this call is not displayed, because only the last
# expression of a cell is echoed.
y_true_single = np.array([0, 0, 1])  # One-hot encoded true label
y_pred_single = np.array([0.1, 0.7, 0.2])
cross_entropy_loss_derivative(y_true_single, y_pred_single)

# Batch check: this is the value shown in the cell output.
y_true_batch = np.array([[0, 1, 0], [1, 0, 0]])  # One-hot encoded true labels for two examples
y_pred_batch = np.array([[0.1, 0.7, 0.2], [0.8, 0.1, 0.1]])
cross_entropy_loss_derivative(y_true_batch, y_pred_batch)

array([[ 0.        , -1.42857143,  0.        ],
       [-1.25      ,  0.        ,  0.        ]])

In [131]:
# softmax_derivative is fed probability vectors here (y_pred_*), not pre-activations.
# Single vector of 3 classes -> one 3 x 3 Jacobian.
x = softmax_derivative(y_pred_single)
print(x)
print(x.shape)

# Batch of 2 vectors -> a stack of two 3 x 3 Jacobians, shape (2, 3, 3).
y = softmax_derivative(y_pred_batch)
print(y)
print(y[0].shape)
print(y.shape)
# len(y_true_batch.shape)
# len(y_true_batch.shape)

[[ 0.09 -0.07 -0.02]
 [-0.07  0.21 -0.14]
 [-0.02 -0.14  0.16]]
(3, 3)
[[[ 0.09 -0.07 -0.02]
  [-0.07  0.21 -0.14]
  [-0.02 -0.14  0.16]]

 [[ 0.16 -0.08 -0.08]
  [-0.08  0.09 -0.01]
  [-0.08 -0.01  0.09]]]
(3, 3)
(2, 3, 3)


In [136]:
def f(dictx):
  """Scratch helper: write two fixed entries into `dictx` in place, then print it."""
  dictx.update({"key": 1, "dgy": 2})
  print(dictx)

# Demonstrates that a dict passed to a function is mutated in place: the caller's
# `dicty` shows the keys written inside f.
dicty = {}
f(dicty)
print(dicty)
# A key added by the caller survives the second call; f only overwrites its own two keys.
dicty["2"] = 2
f(dicty)

{'key': 1, 'dgy': 2}
{'key': 1, 'dgy': 2}
{'key': 1, 'dgy': 2, '2': 2}
