<a href="https://colab.research.google.com/github/pankajrawat9075/CS6910_Assignment_1/blob/main/Final_DL_Question1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Import the Libraries

In [1]:
!pip install wandb -qU

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import wandb
from keras.datasets import fashion_mnist
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [3]:
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mcs22m062[0m ([33miitmadras[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

##Question 1

###Load Dataset

In [4]:
(trainX, trainY), (testX, testY) = fashion_mnist.load_data()

# Split the X_train into a training set and validation set
trainX, valX, trainY, valY = train_test_split(trainX, trainY, test_size=0.1, random_state=42)

###Summarize loaded dataset

In [5]:
print('Train: X = %s, y = %s' % (trainX.shape, trainY.shape))
print('Validation: X = %s, y = %s' % (valX.shape, valY.shape))
print('Test: X = %s, y = %s' % (testX.shape, testY.shape))

Train: X = (54000, 28, 28), y = (54000,)
Validation: X = (6000, 28, 28), y = (6000,)
Test: X = (10000, 28, 28), y = (10000,)


### Display all labels images

In [6]:
# start a new wandb run to track this script
wandb.init(
    # set the wandb project where this run will be logged
    project="dl_assignment_1"
)
# Get the unique labels
class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat', 
               'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']
unique_labels = np.unique(trainY)
print("unique_labels = %s" % (unique_labels))

# Create a subplot for each label
image_list = []

# Loop through the labels and display an image for each label
for i, label in enumerate(unique_labels):
    # Get the first image with this label
    img = trainX[trainY == label][0]
    # wandb.log({class_names[label] : axes[i].inshoe(img, cmap = 'gray')})
    # Plot the image
    image_list.append(wandb.Image(img, caption=class_names[label]))
    

wandb.log({"Dataset":image_list})   


unique_labels = [0 1 2 3 4 5 6 7 8 9]


### Data Preprocessing

In [7]:
# Normalize the pixel values to the range [0, 1]
trainX = trainX.astype('float32') / 255.0
testX = testX.astype('float32') / 255.0

trainSize = trainY.shape[0]
testSize = testY.shape[0]

#One-Hot encoding for trainY and testY and valY
y_train = np.zeros(( 10, trainSize ))
y_val = np.zeros(( 10, 6000 ))
y_test = np.zeros(( 10, testSize ))

for i in range(0, trainSize ):
    y_train[trainY[i]][i] = 1

for i in range(6000):
    y_val[valY[i]][i] = 1

for i in range(0, testSize ):
    y_test[testY[i]][i] = 1

trainY = y_train
valY = y_val
testY = y_test

# reshape the X matrices
trainX = trainX.reshape(trainX.shape[0], 784)
valX = valX.reshape(6000, 784)
testX = testX.reshape(10000, 784)

trainX = trainX.T
valX = valX.T
testX = testX.T

In [8]:
trainX.shape

(784, 54000)

##Neural network 

### Activation functions

In [9]:
# sigmoid function that handles overflow
def sigmoid(x):
    return np.where(x >= 0, 1 / (1 + np.exp(-x)), np.exp(x) / (1 + np.exp(x)))

def sigmoid_deriv(x):
    s = sigmoid(x)
    return np.multiply(s, np.subtract(1, s))

def relu( x):
    """
    Rectified Linear Unit (ReLU) activation function that avoids overflow.
    """
    return np.maximum(0, x)

def relu_deriv( x):
    """
    Derivative of the ReLU activation function that avoids overflow.
    """
    return np.where(x > 0, 1, 0)

def tanh(x):
    if x >= 0:
        return 1 - 2 / (np.exp(2*x) + 1)
    else:
        return 2 / (np.exp(-2*x) + 1) - 1

def tanh_deriv(x):
    tanh_x = tanh(x)
    return 1 - tanh_x**2

def softmax(x):
    x -= np.max(x, axis = 0)
    exp_x = np.exp(x)
    return exp_x / np.sum(exp_x, axis = 0)

### Forward Propogation

In [10]:
def forward_prop(W, B, x, act_func = 'sigmoid'):

    '''
    Returns
      -------

      output: numpy array
          contains the output probabilities for each class and each data sample after 1 pass
      A: numpy array
          contains all activations
      H: numpy array
          consists all pre-activations
      --------
    '''
    L = W.shape[0]+1
    A = [None] * (L - 1)

    H = [None] * (L - 1)
    Y_ = [None]

    A[0] = B[0].reshape(-1, 1) + np.matmul(W[0], x) # for the first pre-activation layer x is the input 

    for i in range(0, L-2):
        if act_func == "sigmoid":
            H[i] = sigmoid(A[i])         # computing the activation layer 

        elif act_func == "relu":
            H[i] = relu(A[i]) 

        elif act_func == "tanh":
            H[i] = tanh(A[i]) 

        A[i+1] = B[i+1].reshape(-1, 1) + np.matmul(W[i+1], H[i])

    Y_ = softmax(A[L-2])

    return A, H, Y_


### Backward Propogation

In [11]:
def back_prop(W, B,  x, y, A, H, Y_, act_func, loss_type):
    L = W.shape[0]+1

    del_A,  del_H = A,  H  # creating gradient variables
    del_W = [None] * (L-1)
    del_B = [None] * (L-1)

    if loss_type == "cross_entropy":
        del_A[-1] = -(y - Y_)

    elif loss_type == "MSE":
        del_A[-1] = (2 / y.shape[1]) * (Y_ - y)

    for i in range(L-2, 0, -1):
        del_W[i] = np.matmul(del_A[i],  np.transpose( H[i-1]))   # compute gradients with respect to weihts and bias
        del_B[i] = del_A[i]
        del_B[i] = np.array(np.sum(del_B[i], axis = 1))

        del_H[i-1] = np.matmul(np.transpose(W[i]), del_A[i])

        if act_func == "sigmoid":
            del_A[i-1] = del_H[i-1] * sigmoid_deriv(A[i-1])
        elif act_func == "relu":
            del_A[i-1] = del_H[i-1] * relu_deriv(A[i-1])
        elif act_func == "tanh":
            del_A[i-1] = del_H[i-1] * tanh_deriv(A[i-1])  

    del_W[0] = np.matmul(del_A[0] , np.transpose(x))  # compute gradients with respect to weihts and bias
    del_B[0] = del_A[0]

    del_B[0] = np.array(np.sum(del_B[0], axis = 1))


    return del_W, del_B

### Training the network

In [12]:
def train(layers, X, Y, epochs, alpha, activation_func, optimizer, batch_size, weight_init, weight_decay, loss_type, momentum = 0.9, 
          beta = 0.9, beta1 = 0.9, beta2 = 0.999, eps = 0.00001):   
    """
    alpha : learning rate

    """
    L = len(layers) # no. of layers

    # initialize the weights and biases
    W = []
    B = []
    u_b, u_w, v_w, v_b = [], [], [], []
    
    # initialize the u_b, u_w, v_w, v_b

    for i in range(1, L):
        temp_w = np.zeros((layers[i], layers[i-1]))
        temp_b = np.zeros((layers[i]))
        u_w.append(temp_w)
        v_w.append(temp_w)
        u_b.append(temp_b)
        v_b.append(temp_b)
    np.random.seed(42)
    # initialize weights and biases
    if weight_init == "random":
        for i in range(1, L):
            w = np.random.randn(layers[i], layers[i-1])
            b = np.random.randn(layers[i]) # b is bias vector
            W.append(w)
            B.append(b)
    if weight_init == "Xavier":
        for i in range(1, L):
            w = np.random.randn(layers[i], layers[i-1]) * np.sqrt(2.0 / (layers[i] + layers[i-1]))
            b = np.random.randn(layers[i]) * np.sqrt(2.0 / (layers[i] + layers[i-1]))       # b is bias vector
            W.append(w)
            B.append(b)
    
        
    W = np.array(W)
    B = np.array(B)
    u_w = np.array(u_w)
    v_w = np.array(v_w)
    u_b = np.array(u_b)
    v_b = np.array(v_b)
    lr = alpha
    

    for epoch in range(epochs):
        alpha = lr / (epoch+1)
        for i in range(0, X.shape[1], batch_size):
            batch_count = batch_size

            if i + batch_size > X.shape[1]: # the last mini-batch might contain fewer than "batch_size" examples
                batch_count = X.shape[1] - i + 1

            if optimizer == 'sgd':

                A, H, Y_ = forward_prop(W, B, X[:, i:i+batch_count], activation_func)
                del_w, del_b = back_prop(W, B, X[:, i:i+batch_count], Y[:, i:i+batch_count], A, H, Y_, activation_func, loss_type)

                W, B = update_parms_sgd(W, B, alpha, del_w, del_b, weight_decay)

            elif optimizer == 'momentum':

                A, H, Y_ = forward_prop(W, B, X[:, i:i+batch_count], activation_func)
                del_w, del_b = back_prop(W, B, X[:, i:i+batch_count], Y[:, i:i+batch_count], A, H, Y_, activation_func, loss_type) # compute the gradient at the lookahead

                u_w = momentum * u_w + del_w
                u_b = momentum * u_b + del_b
                W, B = update_parms_momentum(W, B, alpha, u_w, u_b,  weight_decay)

            elif optimizer == 'nag':

                A, H, Y_ = forward_prop(W - beta * u_w, B - alpha * u_b, X[:, i:i+batch_count], activation_func)
                del_w, del_b = back_prop(W - beta * u_w, B - alpha * u_b, X[:, i:i+batch_count], Y[:, i:i+batch_count], A, H, Y_, activation_func, loss_type) # compute the gradient at the lookahead

                u_w = momentum * u_w + del_w
                u_b = momentum * u_b + del_b
                
                W, B = update_parms_nag(W, B, alpha, u_w, u_b,  weight_decay)

            elif optimizer == 'RMSprop':

                A, H, Y_ = forward_prop(W, B, X[:, i:i+batch_count], activation_func)
                del_w, del_b = back_prop(W, B, X[:, i:i+batch_count], Y[:, i:i+batch_count], A, H, Y_, activation_func, loss_type) # compute the gradient at the lookahead

                
                W, B, u_w, u_b = update_parms_RMSprop(W, B, alpha, u_w, u_b, del_w, del_b, eps, beta,  weight_decay)

            elif optimizer == 'adam':

                A, H, Y_ = forward_prop(W, B, X[:, i:i+batch_count], activation_func)
                del_w, del_b = back_prop(W, B, X[:, i:i+batch_count], Y[:, i:i+batch_count], A, H, Y_, activation_func, loss_type) # compute the gradient at the lookahead

                
                W, B, u_w, u_b, v_w, v_b = update_parms_adam(W, B, alpha, u_w, u_b, v_w, v_b, del_w, del_b, eps, beta1, beta2, epoch,  weight_decay)

            elif optimizer == 'nadam':

                A, H, Y_ = forward_prop(W, B, X[:, i:i+batch_count], activation_func)
                del_w, del_b = back_prop(W, B, X[:, i:i+batch_count], Y[:, i:i+batch_count], A, H, Y_, activation_func, loss_type) # compute the gradient at the lookahead

                W, B, u_w, u_b, v_w, v_b = update_parms_nadam(W, B, alpha, u_w, u_b, v_w, v_b, del_w, del_b, eps, beta1, beta2, epoch,  weight_decay)

        show(W, B, activation_func, epoch, loss_type, weight_decay)

    return W, B


In [13]:
def show(W, B, activ_f, epoch, loss_type, wd):
    wandb.log({"epoch": epoch+1})

    if(loss_type == "cross_entropy"):

        # print the cross entropy validation loss
        val_loss = loss(valX, valY, W, B, activ_f, wd)
        wandb.log({"Validation_Loss": val_loss})

        # print the cross entropy validation accuracy
        val_accuracy = accuracy(valX, valY, W, B, activ_f)
        wandb.log({"val_accuracy": val_accuracy})

        # print the cross entropy training loss
        training_loss = loss(trainX, trainY, W, B, activ_f, wd)
        wandb.log({"Training_Loss": training_loss})

        # print the cross entropy training accuracy
        training_accuracy = accuracy(trainX, trainY,W, B, activ_f)
        wandb.log({"Training_accuracy": training_accuracy})

    elif(loss_type == "MSE"):

        # print the MSE validation loss
        val_loss = MSE(valX, valY, W, B, activ_f, wd)
        wandb.log({"Validation_Loss": val_loss})

        # print the MSE validation accuracy
        val_accuracy = accuracy(valX, valY, W, B, activ_f)
        wandb.log({"val_accuracy": val_accuracy})

        # print the MSE training loss
        training_loss = MSE(trainX, trainY, W, B, activ_f, wd)
        wandb.log({"Training_Loss": training_loss})

        # print the MSE training accuracy
        training_accuracy = accuracy(trainX, trainY,W, B, activ_f)
        wandb.log({"Training_accuracy": training_accuracy})
    


### Optimizers

In [14]:

def update_parms_sgd(W, B, alpha, del_w, del_b, wd):
    W = W - alpha * np.array(np.array(del_w)) - wd * W
    B = B - alpha * np.array(del_b) - wd * B
    return W, B

def update_parms_nag(W, B, alpha, u_w, u_b, wd):
    W = W - alpha * np.array(np.array(u_w)) - wd * W
    B = B - alpha * np.array(u_b)- wd * B
    return W, B

def update_parms_momentum(W, B, alpha, u_w, u_b, wd):
    W = W - alpha * np.array(np.array(u_w)) - wd * W
    B = B - alpha * np.array(u_b) - wd * B
    return W, B 

def update_parms_RMSprop(W, B, alpha, u_w, u_b, del_w, del_b, eps, beta, wd):

    for i in range(W.shape[0]):
        u_w[i] = beta * u_w[i] + (1-beta)*del_w[i]**2
        u_b[i] = beta * u_b[i] + (1-beta)*del_b[i]**2
        W[i] = W[i] - alpha * np.array(np.array(del_w[i])) / (np.sqrt(u_w[i]) + eps) - wd * W[i]
        B[i] = B[i] - alpha * np.array(del_b[i]) / (np.sqrt(u_b[i]) + eps) - wd * B[i]
    return W, B, u_w, u_b

def update_parms_adam(W, B, alpha, u_w, u_b, v_w, v_b, del_w, del_b, eps, beta, beta2, epoch, wd):

    for i in range(W.shape[0]):
        v_w[i] = beta * v_w[i] + (1-beta) * del_w[i]
        v_b[i] = beta * v_b[i] + (1-beta) * del_b[i]

        v_w_hat = v_w[i] / (1 - beta ** (epoch+1))
        v_b_hat = v_b[i] / (1 - beta ** (epoch+1))

        u_w[i] = beta2 * u_w[i] + (1-beta2)*del_w[i]**2
        u_b[i] = beta2 * u_b[i] + (1-beta2)*del_b[i]**2

        u_w_hat = u_w[i] / (1- beta2 ** (epoch+1))
        u_b_hat = u_b[i] / (1- beta2 ** (epoch+1))

        W[i] = W[i] - alpha * np.array(np.array(v_w_hat)) / (np.sqrt(u_w_hat) + eps) - wd * W[i]
        B[i] = B[i] - alpha * np.array(v_b_hat) / (np.sqrt(u_b_hat) + eps) - wd * B[i]

    return W, B, u_w, u_b, v_w, v_b

def update_parms_nadam(W, B, alpha, u_w, u_b, v_w, v_b, del_w, del_b, eps, beta, beta2, epoch, wd):

    for i in range(W.shape[0]):
        v_w[i] = beta * v_w[i] + (1-beta) * del_w[i]
        v_b[i] = beta * v_b[i] + (1-beta) * del_b[i]

        v_w_hat = v_w[i] / (1 - beta ** (epoch+1))
        v_b_hat = v_b[i] / (1 - beta ** (epoch+1))

        u_w[i] = beta2 * u_w[i] + (1-beta2)*del_w[i]**2
        u_b[i] = beta2 * u_b[i] + (1-beta2)*del_b[i]**2

        u_w_hat = u_w[i] / (1- beta2 ** (epoch+1))
        u_b_hat = u_b[i] / (1- beta2 ** (epoch+1))

        W[i] = W[i] - (alpha / np.sqrt(u_w_hat + eps)) * (beta * v_w_hat + (1-beta)*del_w[i]/(1-beta**(epoch+1))) - wd * W[i]
        B[i] = B[i] - (alpha / np.sqrt(u_b_hat + eps)) * (beta * v_b_hat + (1-beta)*del_b[i]/(1-beta**(epoch+1))) - wd * B[i]

    return W, B, u_w, u_b, v_w, v_b



### Predictions and Evaluations and Loss function

In [15]:
def predict(X, W, B, act_fun):
    '''
    forward propagate once and calculate labels

    '''
    _, _, output = forward_prop(W, B, X, act_fun)
    predictions = np.argmax(output, axis=0)
    return predictions

def accuracy(X, Y, W, B, activation_f):
    test_predictions = predict(X, W, B, activation_f)
    y_test = np.argmax(Y, axis=0)
    return accuracy_score(y_test, test_predictions) * 100

def loss(X, Y, W, B, activation_f, wd):

    _, _, output = forward_prop(W, B, X, activation_f)
    output = output.T
    Y = Y.T
    eps = 1e-12
    loss = -np.mean(np.sum(Y * np.log(output + eps), axis=1))
    return loss + 0.5 * wd ** 2

# added the squared loss funtion
def MSE(X, Y, W, B, activation_f, wd):

    _, _, output = forward_prop(W, B, X, activation_f)
    output = output.T
    Y = Y.T
    loss = np.mean(np.sum((Y - output) ** 2, axis = 1))
    return loss + 0.5 * wd ** 2

def evaluate(X_train, y_train, X_test, y_test, W, B, activation_f):
    '''
    print train,test accuracies and the classification report using sklearn

    '''
    y_train = np.argmax(y_train, axis=0)
    train_predictions = predict(X_train, W, B, activation_f)
    y_test = np.argmax(y_test, axis=0)
    test_predictions = predict(X_test, W, B, activation_f)

    print("Training accuracy = ", accuracy_score(y_train, train_predictions))
    print("Test accuracy = ", accuracy_score(y_test, test_predictions))

    return train_predictions, test_predictions

## Running the Experiments

### Train

#### sweep config for wandb

In [16]:
sweep_config = {
    'method' : 'random',
    'name' : 'first sweep',
    'metric': {
        'name': 'val_accuracy',
        'goal': 'maximize'
    },
    'parameters': {
        'weight_init': {
            'values': ['random', "Xavier"]
        },
        'epochs': {
            'values' : [5, 10,20, 40]
        },
        "hidden_layers": {
            "values": [ 3,4,5,6]
        },
        "size_of_layer": {
            "values": [ 32, 64,128]
        },
        'learning_rate': {
            'values': [0.001, 0.0001,0.00001]
        },
        
        'batch_size': {
            'values': [32, 64, 128]
        },
        'optimizer': {
            'values': ['adam', 'nadam', 'sgd', 'RMSprop', 'nestrov', 'momentum']
        },
        'activation': {
            'values': ['relu', 'sigmoid', 'tanh']
        },
        'weight_decay':{
            'values': [0.0001, 0.0005, 0]
        }
    }
}

#### train_wand function

In [17]:
def train_wand(config=None):
    config_defaults = {
        'epochs': 10,
        'batch_size': 64,
        'learning_rate': 1e-3,
        'activation_f': 'relu',
        'optimizer': 'adam',
        'init_mode': 'xavier',
        'L2_lamb': 0,
        'num_neurons': 64,
        'num_hidden': 3
    }
    wandb.init(config = config)
    config = wandb.config
    layers = [config.size_of_layer] * (config.hidden_layers+1)
    layers[0] = 28*28
    layers.append(10)
    name='hl_'+str(config.hidden_layers)+"_lr_"+str(config.learning_rate)+"_bs_"+str(config.batch_size)+"_opt_"+str(config.optimizer)+ '_act_'+str(config.activation)
    wandb.init(name = name)
    W, B = train(layers, trainX, trainY, epochs=config.epochs, alpha = config.learning_rate, activation_func=config.activation, 
              optimizer =config.optimizer, batch_size=config.batch_size, weight_init = config.weight_init,weight_decay = config.weight_decay, loss_type = "cross_entropy" )




#### Wand Sweep

In [None]:
# Initialize WandB sweep
sweep_id = wandb.sweep(sweep_config,project="dl_assignment_1")
wandb.agent(sweep_id=sweep_id,function=train_wand)



Create sweep with ID: nfcyt612
Sweep URL: https://wandb.ai/iitmadras/dl_assignment_1/sweeps/nfcyt612


[34m[1mwandb[0m: Agent Starting Run: qd0ordup with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 40
[34m[1mwandb[0m: 	hidden_layers: 4
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	size_of_layer: 64
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


Exception in thread Exception in thread ChkStopThr:
Traceback (most recent call last):
  File "/usr/lib/python3.9/threading.py", line 980, in _bootstrap_inner
NetStatThr:
Traceback (most recent call last):
  File "/usr/lib/python3.9/threading.py", line 980, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.9/threading.py", line 917, in run
    self._target(*self._args, **self._kwargs)    
self.run()  File "/usr/local/lib/python3.9/dist-packages/wandb/sdk/wandb_run.py", line 276, in check_stop_status

      File "/usr/lib/python3.9/threading.py", line 917, in run
self._loop_check_status(    
  File "/usr/local/lib/python3.9/dist-packages/wandb/sdk/wandb_run.py", line 214, in _loop_check_status
    local_handle = request()
  File "/usr/local/lib/python3.9/dist-packages/wandb/sdk/interface/interface.py", line 787, in deliver_stop_status
    return self._deliver_stop_status(status)
  File "/usr/local/lib/python3.9/dist-packages/wandb/sdk/interface/interface_shared.py", line 585, 

    self._sendall_with_error_handle(header + data)
  File "/usr/local/lib/python3.9/dist-packages/wandb/sdk/lib/sock_client.py", line 130, in _sendall_with_error_handle
    sent = self._sock.send(data)
BrokenPipeError: [Errno 32] Broken pipe
self.send_server_request(server_req)
  File "/usr/local/lib/python3.9/dist-packages/wandb/sdk/lib/sock_client.py", line 155, in send_server_request
    self._send_message(msg)
  File "/usr/local/lib/python3.9/dist-packages/wandb/sdk/lib/sock_client.py", line 152, in _send_message
    self._sendall_with_error_handle(header + data)
  File "/usr/local/lib/python3.9/dist-packages/wandb/sdk/lib/sock_client.py", line 130, in _sendall_with_error_handle
    sent = self._sock.send(data)
BrokenPipeError: [Errno 32] Broken pipe


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

  W = np.array(W)
  B = np.array(B)
  u_w = np.array(u_w)
  v_w = np.array(v_w)
  u_b = np.array(u_b)
  v_b = np.array(v_b)
  u_w = momentum * u_w + del_w
  u_b = momentum * u_b + del_b
  return np.where(x >= 0, 1 / (1 + np.exp(-x)), np.exp(x) / (1 + np.exp(x)))
  return np.where(x >= 0, 1 / (1 + np.exp(-x)), np.exp(x) / (1 + np.exp(x)))
  u_w = momentum * u_w + del_w
  u_b = momentum * u_b + del_b
  return np.where(x >= 0, 1 / (1 + np.exp(-x)), np.exp(x) / (1 + np.exp(x)))
  return np.where(x >= 0, 1 / (1 + np.exp(-x)), np.exp(x) / (1 + np.exp(x)))
  u_w = momentum * u_w + del_w
  u_b = momentum * u_b + del_b
  return np.where(x >= 0, 1 / (1 + np.exp(-x)), np.exp(x) / (1 + np.exp(x)))
  return np.where(x >= 0, 1 / (1 + np.exp(-x)), np.exp(x) / (1 + np.exp(x)))
  u_w = momentum * u_w + del_w
  u_b = momentum * u_b + del_b
  return np.where(x >= 0, 1 / (1 + np.exp(-x)), np.exp(x) / (1 + np.exp(x)))
  return np.where(x >= 0, 1 / (1 + np.exp(-x)), np.exp(x) / (1 + np.exp(x)))
  u_w = mome

0,1
Training_Loss,█▅▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Training_accuracy,▁▄▄▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████████████
Validation_Loss,█▅▄▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
val_accuracy,▁▅▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇███████████████████

0,1
Training_Loss,0.41439
Training_accuracy,85.03148
Validation_Loss,0.5387
epoch,40.0
val_accuracy,80.31667


[34m[1mwandb[0m: Agent Starting Run: k2w8ets3 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	hidden_layers: 3
[34m[1mwandb[0m: 	learning_rate: 1e-05
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	size_of_layer: 128
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


  W = np.array(W)
  B = np.array(B)
  u_w = np.array(u_w)
  v_w = np.array(v_w)
  u_b = np.array(u_b)
  v_b = np.array(v_b)
  return np.where(x >= 0, 1 / (1 + np.exp(-x)), np.exp(x) / (1 + np.exp(x)))
  return np.where(x >= 0, 1 / (1 + np.exp(-x)), np.exp(x) / (1 + np.exp(x)))
  return np.where(x >= 0, 1 / (1 + np.exp(-x)), np.exp(x) / (1 + np.exp(x)))
  return np.where(x >= 0, 1 / (1 + np.exp(-x)), np.exp(x) / (1 + np.exp(x)))


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Training_Loss,█▅▄▃▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁
Training_accuracy,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Validation_Loss,█▅▅▅▄▄▃▂▂▂▂▁▁▁▁▁▁▁▁▁
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
val_accuracy,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
Training_Loss,2.3026
Training_accuracy,10.09259
Validation_Loss,2.30266
epoch,20.0
val_accuracy,9.16667


[34m[1mwandb[0m: Agent Starting Run: lj06r750 with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 40
[34m[1mwandb[0m: 	hidden_layers: 4
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	size_of_layer: 64
[34m[1mwandb[0m: 	weight_decay: 0.0001
[34m[1mwandb[0m: 	weight_init: Xavier


  W = np.array(W)
  B = np.array(B)
  u_w = np.array(u_w)
  v_w = np.array(v_w)
  u_b = np.array(u_b)
  v_b = np.array(v_b)
  W = W - alpha * np.array(np.array(del_w)) - wd * W
  B = B - alpha * np.array(del_b) - wd * B
  W = W - alpha * np.array(np.array(del_w)) - wd * W
  B = B - alpha * np.array(del_b) - wd * B
  W = W - alpha * np.array(np.array(del_w)) - wd * W
  B = B - alpha * np.array(del_b) - wd * B
  W = W - alpha * np.array(np.array(del_w)) - wd * W
  B = B - alpha * np.array(del_b) - wd * B
  W = W - alpha * np.array(np.array(del_w)) - wd * W
  B = B - alpha * np.array(del_b) - wd * B
  W = W - alpha * np.array(np.array(del_w)) - wd * W
  B = B - alpha * np.array(del_b) - wd * B
  W = W - alpha * np.array(np.array(del_w)) - wd * W
  B = B - alpha * np.array(del_b) - wd * B
  W = W - alpha * np.array(np.array(del_w)) - wd * W
  B = B - alpha * np.array(del_b) - wd * B
  W = W - alpha * np.array(np.array(del_w)) - wd * W
  B = B - alpha * np.array(del_b) - wd * B
  W = W - al

0,1
Training_Loss,▂▁▁▁▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇███
Training_accuracy,▅▇██████████▇▇▇▇▇▇▇▆▆▆▆▆▅▅▅▄▄▄▃▃▃▃▂▂▂▂▁▁
Validation_Loss,▄▃▂▁▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇██
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
val_accuracy,▆▇████████▇▇▇▇▇▇▇▆▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▃▃▃▃▂▂▁

0,1
Training_Loss,1.65872
Training_accuracy,61.85741
Validation_Loss,10.2492
epoch,40.0
val_accuracy,54.33333


[34m[1mwandb[0m: Agent Starting Run: 85syp8va with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_layers: 3
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	optimizer: nestrov
[34m[1mwandb[0m: 	size_of_layer: 128
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


  W = np.array(W)
  B = np.array(B)
  u_w = np.array(u_w)
  v_w = np.array(v_w)
  u_b = np.array(u_b)
  v_b = np.array(v_b)


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁

0,1
epoch,1


[34m[1mwandb[0m: [32m[41mERROR[0m Run 85syp8va errored: ValueError('The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()')
[34m[1mwandb[0m: Agent Starting Run: jozlmlss with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_layers: 3
[34m[1mwandb[0m: 	learning_rate: 1e-05
[34m[1mwandb[0m: 	optimizer: RMSprop
[34m[1mwandb[0m: 	size_of_layer: 64
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


[34m[1mwandb[0m: [32m[41mERROR[0m Run jozlmlss errored: ValueError('The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()')
[34m[1mwandb[0m: Agent Starting Run: 2wsx6me1 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	hidden_layers: 5
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	size_of_layer: 128
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


[34m[1mwandb[0m: [32m[41mERROR[0m Run 2wsx6me1 errored: ValueError('The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()')
[34m[1mwandb[0m: Agent Starting Run: hyzrixzd with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_layers: 6
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: nestrov
[34m[1mwandb[0m: 	size_of_layer: 32
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁

0,1
epoch,1


[34m[1mwandb[0m: [32m[41mERROR[0m Run hyzrixzd errored: ValueError('The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()')
[34m[1mwandb[0m: Agent Starting Run: daakquq9 with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 40
[34m[1mwandb[0m: 	hidden_layers: 4
[34m[1mwandb[0m: 	learning_rate: 1e-05
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	size_of_layer: 32
[34m[1mwandb[0m: 	weight_decay: 0.0001
[34m[1mwandb[0m: 	weight_init: random


  u_w = momentum * u_w + del_w
  u_b = momentum * u_b + del_b
  u_w = momentum * u_w + del_w
  u_b = momentum * u_b + del_b
  u_w = momentum * u_w + del_w
  u_b = momentum * u_b + del_b
  u_w = momentum * u_w + del_w
  u_b = momentum * u_b + del_b
  u_w = momentum * u_w + del_w
  u_b = momentum * u_b + del_b
  u_w = momentum * u_w + del_w
  u_b = momentum * u_b + del_b
  u_w = momentum * u_w + del_w
  u_b = momentum * u_b + del_b
  u_w = momentum * u_w + del_w
  u_b = momentum * u_b + del_b
  u_w = momentum * u_w + del_w
  u_b = momentum * u_b + del_b
  u_w = momentum * u_w + del_w
  u_b = momentum * u_b + del_b
  u_w = momentum * u_w + del_w
  u_b = momentum * u_b + del_b
  u_w = momentum * u_w + del_w
  u_b = momentum * u_b + del_b
  u_w = momentum * u_w + del_w
  u_b = momentum * u_b + del_b
  u_w = momentum * u_w + del_w
  u_b = momentum * u_b + del_b
  u_w = momentum * u_w + del_w
  u_b = momentum * u_b + del_b
  u_w = momentum * u_w + del_w
  u_b = momentum * u_b + del_b
  u_w = 

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Training_Loss,▅▃▁▁▁▁▂▂▃▄▅▆▇▇██████████████████████████
Training_accuracy,▅▇▇████▇▇▇▇▆▆▆▆▄▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Validation_Loss,█▇▇▇▇▇▇▇▆▆▆▆▅▄▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
val_accuracy,▄▆▇█████▇▆▇▇▆▆▆▅▅▅▄▄▄▄▄▄▂▁▁▃▄▁▁▁▁▁▁▁▁▁▁▁

0,1
Training_Loss,2.30258
Training_accuracy,10.00556
Validation_Loss,2.30259
epoch,40.0
val_accuracy,9.95


[34m[1mwandb[0m: Agent Starting Run: zxrx1b0s with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_layers: 6
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: nestrov
[34m[1mwandb[0m: 	size_of_layer: 128
[34m[1mwandb[0m: 	weight_decay: 0.0001
[34m[1mwandb[0m: 	weight_init: random


  W = np.array(W)
  B = np.array(B)
  u_w = np.array(u_w)
  v_w = np.array(v_w)
  u_b = np.array(u_b)
  v_b = np.array(v_b)


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Training_Loss,▁▁▁▁▁
Training_accuracy,▁▁▁▁▁
Validation_Loss,▁▁▁▁▁
epoch,▁▃▅▆█
val_accuracy,▁▁▁▁▁

0,1
Training_Loss,23.72226
Training_accuracy,14.1463
Validation_Loss,23.94228
epoch,5.0
val_accuracy,13.35


[34m[1mwandb[0m: Agent Starting Run: cuj8nqu2 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_layers: 5
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	size_of_layer: 128
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


  W = np.array(W)
  B = np.array(B)
  u_w = np.array(u_w)
  v_w = np.array(v_w)
  u_b = np.array(u_b)
  v_b = np.array(v_b)
  return np.where(x >= 0, 1 / (1 + np.exp(-x)), np.exp(x) / (1 + np.exp(x)))
  return np.where(x >= 0, 1 / (1 + np.exp(-x)), np.exp(x) / (1 + np.exp(x)))
  return np.where(x >= 0, 1 / (1 + np.exp(-x)), np.exp(x) / (1 + np.exp(x)))
  return np.where(x >= 0, 1 / (1 + np.exp(-x)), np.exp(x) / (1 + np.exp(x)))
  return np.where(x >= 0, 1 / (1 + np.exp(-x)), np.exp(x) / (1 + np.exp(x)))
  return np.where(x >= 0, 1 / (1 + np.exp(-x)), np.exp(x) / (1 + np.exp(x)))
  return np.where(x >= 0, 1 / (1 + np.exp(-x)), np.exp(x) / (1 + np.exp(x)))
  return np.where(x >= 0, 1 / (1 + np.exp(-x)), np.exp(x) / (1 + np.exp(x)))
  return np.where(x >= 0, 1 / (1 + np.exp(-x)), np.exp(x) / (1 + np.exp(x)))
  return np.where(x >= 0, 1 / (1 + np.exp(-x)), np.exp(x) / (1 + np.exp(x)))


0,1
Training_Loss,▁▁▃▅█
Training_accuracy,██▇▆▁
Validation_Loss,▁▁▂▅█
epoch,▁▃▅▆█
val_accuracy,███▆▁

0,1
Training_Loss,1.76822
Training_accuracy,58.89815
Validation_Loss,1.67947
epoch,5.0
val_accuracy,55.13333


[34m[1mwandb[0m: Agent Starting Run: ui8t33ps with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	hidden_layers: 6
[34m[1mwandb[0m: 	learning_rate: 1e-05
[34m[1mwandb[0m: 	optimizer: RMSprop
[34m[1mwandb[0m: 	size_of_layer: 64
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


  W = np.array(W)
  B = np.array(B)
  u_w = np.array(u_w)
  v_w = np.array(v_w)
  u_b = np.array(u_b)
  v_b = np.array(v_b)


0,1
Training_Loss,█████▆▃▁▁▁▁▁▁▁▁▁▁▁▁▁
Training_accuracy,▁▁▁▂▃▅▇█▆▃▃▃▃▃▃▃▃▃▃▃
Validation_Loss,███████▇▇▅▂▁▁▁▁▁▁▁▁▁
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
val_accuracy,▁▁▁▂▂▃▄▅▆▇▇██▅▃▂▂▂▂▂

0,1
Training_Loss,2.30265
Training_accuracy,9.97963
Validation_Loss,2.30282
epoch,20.0
val_accuracy,10.18333


#### training the network

In [None]:
W, B = train([28*28, 32, 32, 32, 10], trainX, trainY, epochs=10, alpha = 0.0001, activation_func="relu", optimizer="adam", batch_size=16, weight_init = 'Xavier', loss_type = "MSE", weight_decay = 0)
evaluate(trainX, trainY, valX, valY, W, B, "relu")

## Confusion matrix

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Generate example data
y_true = np.random.randint(0, 5, size=100)
y_pred = np.random.randint(0, 5, size=100)

# Calculate the confusion matrix
cm = confusion_matrix(y_true, y_pred)

# Define class labels
labels = ['Class 0', 'Class 1', 'Class 2', 'Class 3', 'Class 4']

# Define marker shapes for each class
markers = ['o', 's', '^', 'd', 'v']

# Create scatter plot of predicted vs true labels, with markers and colors based on class
fig, ax = plt.subplots()
for i in range(len(labels)):
    ax.scatter(y_pred[y_true==i], y_true[y_true==i], marker=markers[i], label=labels[i])
ax.legend()
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Confusion Matrix')
plt.show()


### Test

## 