<a href="https://colab.research.google.com/github/manasdeshpande125/da6401_assignment1/blob/main/DL_ASG1_Q3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from keras.datasets import fashion_mnist
import numpy as np
from  matplotlib import pyplot as plt
import time
import math
from sklearn.model_selection import train_test_split
import wandb

In [2]:
dataset= fashion_mnist.load_data()
(X_train_and_validation, y_train_and_validation), (X_test, y_test) = dataset
X_train, X_validation, y_train, y_validation = train_test_split(X_train_and_validation, y_train_and_validation, test_size=0.1, random_state=42)
X_train = (X_train/255.0).astype(np.float32)
X_validation = (X_validation/255.0).astype(np.float32)
X_test = (X_test/255.0).astype(np.float32)

print("Train Dataset Shape: ", X_train.shape)
print("Train Target Vector Shape: ", y_train.shape)
print("Test Dataset Shape:", X_test.shape)
print("Test Target Vector Shape", y_test.shape)
print("Validation Dataset Shape:", X_validation.shape)
print("Validation Target Vector Shape", y_validation.shape)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
[1m29515/29515[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
[1m26421880/26421880[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
[1m5148/5148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz
[1m4422102/4422102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Train Dataset Shape:  (54000, 28, 28)
Train Target Vector Shape:  (54000,)
Test Dataset Shape: (10000, 28, 28)
Test Target Vector Shape (10000,)
Validation Dataset Shape: (6000, 28, 28)
Validation Target Vector Shape (6000,)


In [3]:
X_train = np.array(X_train.reshape(X_train.shape[0], 784,1))
X_test = np.array(X_test.reshape(X_test.shape[0], 784,1))
X_validation = np.array(X_validation.reshape(X_validation.shape[0], 784,1))

In [4]:
def layer_init(arr,n1,n2,init_type):
    np.random.seed(10)
    if init_type=="random":
        arr.append(np.random.randn(n1,n2)*0.1)
    elif init_type=="xavier":
        arr.append(np.random.randn(n1,n2)*np.sqrt(2/(n1+n2)))
    return arr

def param(layers,init_type):
    W=[]
    B=[]
    for i in range(len(layers)-1):
        W=layer_init(W,layers[i+1],layers[i],init_type)
        B=layer_init(B,layers[i+1],1,init_type)
    return W,B

#Activation function
def activation(activation_function):
    if activation_function == 'sigmoid':
        return sigmoid
    if activation_function == 'tanh':
        return tanh
    if activation_function == 'ReLU':
        return relu

def sigmoid(x, derivative = False):
    if derivative:
        return sigmoid(x)*(1-sigmoid(x))
    return 1/(1 + np.exp(-x))

def tanh(x, derivative = False):
    if derivative:
        return 1 - tanh(x)**2
    return (np.exp(x)-np.exp(-x))/(np.exp(x)+np.exp(-x))

def relu(x, derivative = False):
    if derivative:
        return (x>0)*1
    return x*(x>0)

def softmax(x,derivative = False):
    if derivative:
        return softmax(x)*(1- softmax(x))
    return np.exp(x)/np.sum(np.exp(x), axis = 0)

def one_hot(y, num_output_nodes):
    v = np.zeros((num_output_nodes, len(y)))
    for i,j in enumerate(y):
        v[j,i] = 1
    return v


def forward(x, W, B, activation_type):
    h = []
    a = []
    sigma = activation(activation_type)  #activation
    h.append(x)   #h0 = x
    a.append(np.dot(W[0], h[0]) + B[0])
    for i in range(len(W)-1):
        h.append(sigma(a[-1]))
        a.append(np.dot(W[i+1], h[-1]) + B[i+1])
    y_hat = softmax(a[-1])

    return y_hat, h, a

In [5]:
def loss(y,y_hat,l_type,W,reg,n_class):
    if l_type=='cross_entropy':
        err=-1*np.sum(np.multiply(one_hot(y,n_class),np.log(y_hat)))/one_hot(y,n_class).shape[1]
    elif l_type=='squared_error':
        err=np.sum((one_hot(y,n_class)-y_hat)**2)/(2*one_hot(y,n_class)).shape[1]

    if W:
        r=0
        for i in range(len(W)):
            r+=np.sum((np.array(W,dtype=object)**2)[i])
        err=err+reg*r
    return err

def eval_acc(y_hat, y_true):
    return np.mean(np.argmax(y_hat, axis = 0) ==y_true )*100


In [6]:
def back_prop(x, y, y_hat, a, h , W, B, batch_size,l_type,act_type):
    grad_h,grad_a,grad_W,grad_B = [0]*len(h),[0]*len(a),[0]*len(W),[0]*len(B)
    sigma = activation(act_type)

    if l_type == "cross_entropy":
        grad_h[-1] = -1*(y/y_hat)
        grad_a[-1] = -1*(y-y_hat)

    for i in range(len(W)-1, -1, -1):
        grad_W[i] = np.dot(grad_a[i], h[i].T)
        grad_B[i] = np.dot(grad_a[i], np.ones((batch_size,1)))
        if i > 0:
            grad_h[i-1] = np.dot(W[i].T, grad_a[i])
            grad_a[i-1]  = np.multiply(grad_h[i-1],sigma(a[i-1], derivative = True))

    return grad_W, grad_B, grad_h, grad_a

In [7]:
def sgd_step(W,B,grad_W,grad_B,lr,reg):
    W=np.array(W,dtype=object)
    B=np.array(B,dtype=object)
    W-=lr*reg*W+lr*np.array(grad_W,dtype=object)
    B-=lr*reg*B+lr*np.array(grad_B,dtype=object)

    return W.tolist(),B.tolist()

In [8]:
def momentum_setp(w, b, gW, gB, lr=0.001, gamma=0.9, reg=0):
    params = {'w': w, 'b': b}

    Wmoments = [np.zeros_like(p) for p in params['w']]
    Bmoments = [np.zeros_like(p) for p in params['b']]

    Wmoments = gamma * np.array(Wmoments,dtype=object) + lr * np.array(gW,dtype=object)
    W = (1 - lr * reg) * np.array(params['w'],dtype=object) - Wmoments
    Wmoments = Wmoments.tolist()

    Bmoments = gamma * np.array(Bmoments,dtype=object) + lr * np.array(gB,dtype=object)
    B = (1 - lr * reg) * np.array(params['b'],dtype=object) - Bmoments
    Bmoments = Bmoments.tolist()

    return W.tolist(), B.tolist()

In [9]:
def RMSprop_step(w, b, gW, gB, lr=0.01, beta=0.99):
    params = {'w': w, 'b': b}

    vW = [np.zeros_like(p) for p in params['w']]
    vB = [np.zeros_like(p) for p in params['b']]

    vW = beta * np.array(vW, dtype=object) + (1 - beta) * (np.array(gW, dtype=object) ** 2)
    W = np.array(params['w'], dtype=object) - (lr / ((vW + 1e-7) ** 0.5)) * np.array(gW, dtype=object)

    vB = beta * np.array(vB, dtype=object) + (1 - beta) * (np.array(gB, dtype=object) ** 2)
    B = np.array(params['b'], dtype=object) - (lr / ((vB + 1e-7) ** 0.5)) * np.array(gB, dtype=object)

    return W.tolist(), B.tolist()

In [10]:
def nesterov_sgd_step(w, b, gW, gB, lr=0.001, gamma=0.9, reg=0):
    params = {'w': w, 'b': b}

    Wmoments = [np.zeros_like(p) for p in params['w']]
    Bmoments = [np.zeros_like(p) for p in params['b']]

    # Lookahead step
    lookahead_W = np.array(params['w'], dtype=object) - gamma * np.array(Wmoments, dtype=object)
    lookahead_B = np.array(params['b'], dtype=object) - gamma * np.array(Bmoments, dtype=object)

    # Compute gradients at lookahead position
    Wmoments = gamma * np.array(Wmoments, dtype=object) + lr * np.array(gW, dtype=object)
    W = (1 - lr * reg) * lookahead_W - Wmoments
    Wmoments = Wmoments.tolist()

    Bmoments = gamma * np.array(Bmoments, dtype=object) + lr * np.array(gB, dtype=object)
    B = (1 - lr * reg) * lookahead_B - Bmoments
    Bmoments = Bmoments.tolist()

    return W.tolist(), B.tolist()


In [11]:
def adam_sgd_step(w, b, gW, gB, lr=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8, reg=0, t=1):
    params = {'w': w, 'b': b}

    Wm = [np.zeros_like(p) for p in params['w']]
    Wv = [np.zeros_like(p) for p in params['w']]
    Bm = [np.zeros_like(p) for p in params['b']]
    Bv = [np.zeros_like(p) for p in params['b']]

    # Update biased first moment estimate
    Wm = beta1 * np.array(Wm, dtype=object) + (1 - beta1) * np.array(gW, dtype=object)
    Bm = beta1 * np.array(Bm, dtype=object) + (1 - beta1) * np.array(gB, dtype=object)

    # Update biased second raw moment estimate
    Wv = beta2 * np.array(Wv, dtype=object) + (1 - beta2) * (np.array(gW, dtype=object) ** 2)
    Bv = beta2 * np.array(Bv, dtype=object) + (1 - beta2) * (np.array(gB, dtype=object) ** 2)

    # Compute bias-corrected moment estimates
    Wm_hat = Wm / (1 - beta1 ** t)
    Wv_hat = Wv / (1 - beta2 ** t)
    Bm_hat = Bm / (1 - beta1 ** t)
    Bv_hat = Bv / (1 - beta2 ** t)

    # Update parameters
    W = (1 - lr * reg) * np.array(params['w'], dtype=object) - lr * (Wm_hat / (np.sqrt(Wv_hat) + epsilon))
    B = (1 - lr * reg) * np.array(params['b'], dtype=object) - lr * (Bm_hat / (np.sqrt(Bv_hat) + epsilon))

    return W.tolist(), B.tolist()


In [12]:
def nadam_sgd_step(w, b, gW, gB, lr=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8, reg=0, t=1):
    params = {'w': w, 'b': b}

    Wm = [np.zeros_like(p) for p in params['w']]
    Wv = [np.zeros_like(p) for p in params['w']]
    Bm = [np.zeros_like(p) for p in params['b']]
    Bv = [np.zeros_like(p) for p in params['b']]

    # Compute lookahead momentum term for Nesterov-like update
    Wm = beta1 * np.array(Wm, dtype=object) + (1 - beta1) * np.array(gW, dtype=object)
    Bm = beta1 * np.array(Bm, dtype=object) + (1 - beta1) * np.array(gB, dtype=object)

    Wm_nesterov = beta1 * Wm + (1 - beta1) * np.array(gW, dtype=object)
    Bm_nesterov = beta1 * Bm + (1 - beta1) * np.array(gB, dtype=object)

    # Update biased second raw moment estimate
    Wv = beta2 * np.array(Wv, dtype=object) + (1 - beta2) * (np.array(gW, dtype=object) ** 2)
    Bv = beta2 * np.array(Bv, dtype=object) + (1 - beta2) * (np.array(gB, dtype=object) ** 2)

    # Compute bias-corrected moment estimates
    Wm_hat = Wm_nesterov / (1 - beta1 ** t)
    Wv_hat = Wv / (1 - beta2 ** t)
    Bm_hat = Bm_nesterov / (1 - beta1 ** t)
    Bv_hat = Bv / (1 - beta2 ** t)

    # Update parameters
    W = (1 - lr * reg) * np.array(params['w'], dtype=object) - lr * (Wm_hat / (np.sqrt(Wv_hat) + epsilon))
    B = (1 - lr * reg) * np.array(params['b'], dtype=object) - lr * (Bm_hat / (np.sqrt(Bv_hat) + epsilon))

    return W.tolist(), B.tolist()


In [19]:
def train(X_train, y_train, x_val, y_val, num_inputs_nodes, hidden_layers, out_num, init_type, epochs,
          batch_size, l_type, act_type, op_name, lr_rate, reg=0):
    in_node = [num_inputs_nodes]
    layers = []
    layers.extend(in_node)
    layers.extend(hidden_layers)
    layers.extend([out_num])
    W, B = param(layers, init_type)
    N = X_train.shape[0]
    n_batches = int(np.floor(N / batch_size))


    for epoch in range(epochs):

        train_loss = []
        train_accuracy = []
        val_loss = []
        val_accuracy = []
        l = 0
        acc = 0
        temp = 0
        ds = 0
        steps = 1
        while ds < N:
            mini_batch_size = min((N - ds), batch_size)
            x = np.squeeze(X_train[ds:ds + mini_batch_size]).T
            y = one_hot(y_train[ds:ds + mini_batch_size], out_num)
            y_hat, h, a = forward(x, W, B, act_type)
            grad_W, grad_B, grad_h, grad_a = back_prop(x, y, y_hat, a, h, W, B, batch_size, l_type, act_type)
            if op_name=='sgd':
                W,B=sgd_step(W, B, grad_W, grad_B,lr_rate,reg)
            elif op_name=='momentum':
                W,B=momentum_setp(W, B, grad_W, grad_B,lr_rate,reg)
            elif op_name=='rmsprop':
                W,B=RMSprop_step(W, B, grad_W, grad_B,lr_rate,reg)
            elif op_name=="nesterov":
                W,B=nesterov_sgd_step(W, B, grad_W, grad_B,lr_rate,reg)
            elif op_name=="adam":
                W,B=adam_sgd_step(W, B, grad_W, grad_B,lr_rate,reg)
            elif op_name=="nadam":
                W,B=nadam_sgd_step(W, B, grad_W, grad_B,lr_rate,reg)
            l += loss(y_train[ds:ds + mini_batch_size], y_hat, l_type, W, reg, out_num)
            acc += eval_acc(y_hat, y_train[ds:ds + mini_batch_size])
            steps += 1
            if mini_batch_size == (N - ds):
                sample_size = mini_batch_size
            ds += batch_size
        l = l / (n_batches + sample_size)
        acc = acc / steps

        train_loss.append(l)
        train_accuracy.append(acc)

        y_val_hat, _, _ = forward(np.squeeze(x_val).T, W, B, act_type)
        val_acc = eval_acc(y_val_hat, y_val)
        val_l = loss(y_val, y_val_hat, l_type, W=None, reg=reg, n_class=out_num)
        val_accuracy.append(val_acc)
        val_loss.append(val_l)

        wandb.log({"epoch": epoch, "Train_loss": l, "Train_acc": acc, "val_loss": val_l, "val_Accuracy": val_acc})
#         print('Epoch', epoch)
#         print('Train loss', l)
#         print('Train acc', acc)
#         print('Val loss', val_l)
#         print('Val accu', val_acc)
    return W, B, train_loss, train_accuracy, val_loss, val_accuracy


In [20]:
sweep_configuration = {'method'    : "grid",
                       'metric'    : {'name': 'val_Accuracy','goal':'maximize'},
                       'parameters': {'epochs':{'values':[5,10]},
                                      'hidden_layers':{'values':[[32,64,128]] },
                                      'learning_rate':{'values':[1e-3,1e-4]},
                                      'weight_decay':{'values':[0, 0.0005, 0.5]},
                                      'optimizer_name': {'values':['sgd', 'momentum', 'nesterov', 'rmsprop', 'adam', 'nadam']},
                                      'batch_size':{'values':[16, 32, 64]},
                                      'init_type': {'values':['random','xavier']},
                                      'activation_type':{'values':['sigmoid','tanh','ReLU']},
                                      'loss_type': {'values':['cross_entropy','squared_error']} }}

In [21]:
def sweep_train():

  # hyperparameters=dict(epochs = 5,
  #                     hidden_layers= [64,32],
  #                     learning_rate=1e-4,
  #                     weight_decay=0,
  #                     optimizer_name='sgd',
  #                     batch_size=16,
  #                     init_type='random',
  #                     activation_type='sigmoid',
  #                     loss_type='cross_entropy',
  #                     reg_lamda=0)

  wandb.init(project="DA6401-Assignment-1", entity="Manas")
  config=wandb.config
  epochs=config.epochs
  hidden_layers=config.hidden_layers
  learning_rate=config.learning_rate
  weight_decay=config.weight_decay
  optimizer_name=config.optimizer_name
  batch_size=config.batch_size
  init_type=config.init_type
  activation_type=config.activation_type
  loss_type=config.loss_type
  #reg_lamda=config.reg_lamda
  wandb.run.name = "e_{}_hl_{}_lr_{}_wd_{}_o_{}_bs_{}_winit_{}_ac_{}_los_{}".format(epochs,\
                                                                                    hidden_layers,\
                                                                                    learning_rate,\
                                                                                    weight_decay,\
                                                                                    optimizer_name,\
                                                                                    batch_size,\
                                                                                    init_type,\
                                                                                    activation_type,\
                                                                                    loss_type)

  _,_,train_loss, train_accuracy, val_loss, val_accuracy = train(X_train, y_train, X_validation, y_validation, 784, hidden_layers, 10, init_type, epochs, batch_size, loss_type, activation_type, optimizer_name, learning_rate)

In [22]:
sweep_id = wandb.sweep(sweep_configuration,project='DA6401-Assignment-1')
wandb.agent(sweep_id,function=sweep_train,project='DA6401-Assignment-1',count=1)

Create sweep with ID: eqxtf5vg
Sweep URL: https://wandb.ai/manasdeshpande4902-iit-madras/DA6401-Assignment-1/sweeps/eqxtf5vg


[34m[1mwandb[0m: Agent Starting Run: hg6mh9fz with config:
[34m[1mwandb[0m: 	activation_type: sigmoid
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_layers: [32, 64, 128]
[34m[1mwandb[0m: 	init_type: random
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss_type: cross_entropy
[34m[1mwandb[0m: 	optimizer_name: sgd
[34m[1mwandb[0m: 	weight_decay: 0


0,1
Train_acc,▁▃▆▇█
Train_loss,█▆▃▂▁
epoch,▁▃▅▆█
val_Accuracy,▁▂▅▇█
val_loss,█▄▃▂▁

0,1
Train_acc,64.08286
Train_loss,0.95479
epoch,4.0
val_Accuracy,65.98333
val_loss,0.89419
