In [None]:
!pip install wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wandb
  Downloading wandb-0.14.0-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m25.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting setproctitle
  Downloading setproctitle-1.3.2-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)
Collecting appdirs>=1.4.3
  Downloading appdirs-1.4.4-py2.py3-none-any.whl (9.6 kB)
Collecting GitPython!=3.1.29,>=1.0.0
  Downloading GitPython-3.1.31-py3-none-any.whl (184 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m184.3/184.3 KB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pathtools
  Downloading pathtools-0.1.2.tar.gz (11 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentry-sdk>=1.0.0
  Downloading sentry_sdk-1.17.0-py2.py3-none-any.whl (189 kB)
[2K     [90m━━━━━━━━━━━━━━━━

In [None]:
from keras.datasets import fashion_mnist, mnist
import numpy as np
import math
import wandb

In [None]:
def process(x) :
  """Flatten each sample to a vector and rescale values by 1/255.

  x is reshaped to (len(x), -1), cast to float64 and divided by 255.0;
  the resulting 2-D array is returned.
  """
  n_samples = len(x)
  flattened = x.reshape(n_samples, -1).astype('float64')
  return flattened / 255.0

In [None]:
def load_data(dataset = "fashion_mnist"):
  """Load a Keras image dataset and return preprocessed train/valid/test splits.

  Parameters
  ----------
  dataset : "fashion_mnist" or "mnist"

  Returns
  -------
  x_train, y_train, x_valid, y_valid, x_test, y_test
      Inputs are passed through process(); labels are one-hot encoded
      with 10 classes.  The first 90% of the original training set (in
      stored order, no shuffling) becomes the training split and the
      remaining 10% the validation split.

  Raises
  ------
  ValueError
      If dataset is not one of the supported names.
  """
  if dataset == "fashion_mnist" :
    source = fashion_mnist
  elif dataset == "mnist" :
    source = mnist
  else :
    # Without this guard an unknown name used to surface as an
    # UnboundLocalError on x_train further down.
    raise ValueError(f"unknown dataset {dataset!r}; expected 'fashion_mnist' or 'mnist'")

  (x_train, y_train), (x_test, y_test) = source.load_data()

  # Hold out the tail of the training set for validation.
  split = int(len(x_train) * 0.9)
  x_train, x_valid = x_train[:split], x_train[split:]
  y_train, y_valid = y_train[:split], y_train[split:]

  x_train = process(x_train)
  x_valid = process(x_valid)
  x_test = process(x_test)

  k = 10
  one_hot = np.eye(k)  # row i is the one-hot vector for class i
  y_train = one_hot[y_train]
  y_valid = one_hot[y_valid]
  y_test = one_hot[y_test]

  return x_train, y_train, x_valid, y_valid, x_test, y_test

In [None]:
def sigmoid(x) :
  """Element-wise logistic function 1 / (1 + exp(-x)).

  Evaluated as exp(-log(1 + exp(-x))) via np.logaddexp, so large negative
  inputs no longer overflow in exp (the direct formula emitted a
  RuntimeWarning there).
  """
  return np.exp(-np.logaddexp(0., -x))

def tanh(x) :
  """Element-wise hyperbolic tangent.

  Delegates to np.tanh: the previous hand-written form
  2 / (1 + exp(-2x)) - 1 overflowed in exp for large negative x and lost
  precision to cancellation for inputs close to zero.
  """
  return np.tanh(x)

def relu(x) : # do not use relu with random
  """Element-wise rectifier: keep entries that are >= 0, replace the rest with 0."""
  keep = x >= 0
  return np.where(keep, x, 0.)

def softmax(x) :
  """Softmax along axis 0.

  The maximum along axis 0 is subtracted before exponentiating so the
  exponentials cannot overflow; the result is normalised along axis 0.
  """
  shifted = x - np.max(x, axis=0)
  exps = np.exp(shifted)
  return exps / np.sum(exps, axis=0)

In [None]:
class my_nn :
  """Fully connected feed-forward classifier with a softmax output layer.

  Layers are numbered 1..L with L = nhl + 1 (nhl hidden layers of width sz
  plus the output layer).  Every per-layer list has L + 1 slots so that
  layer i lives at index i; slot 0 is a placeholder, except h[0], which
  holds the current input batch.  Batches are processed column-wise: the
  training loops pass X[j:r].T and y[j:r].T to the forward/backward pass.

  Parameters
  ----------
  n_feature, n_class : input dimension and number of output classes.
  nhl, sz            : number of hidden layers and their width.
  weight_init        : "random" or "Xavier" (ignored when act_fun is "ReLU").
  act_fun            : "sigmoid", "tanh" or "ReLU".
  loss               : "cross_entropy" or "mean_squared_error".
  epochs, b_sz       : number of passes over the data and mini-batch size.
  optimizer          : "sgd", "momentum", "nag", "rmsprop", "adam" or "nadam".
  lr                 : learning rate.
  mom                : momentum coefficient (momentum / nag).
  beta               : decay rate of the squared-gradient average (rmsprop).
  beta1, beta2       : decay rates of the first / second moment (adam / nadam).
  epsilon            : constant added under the square root (rmsprop / adam / nadam).
  w_d                : L2 weight-decay coefficient added to the weight gradients.
  """

  def __init__(self, n_feature = 784, n_class = 10, nhl = 1, sz = 4, weight_init = "random", act_fun = "sigmoid", loss = "cross_entropy", 
               epochs = 1, b_sz = 4, optimizer = "sgd", lr = 0.1, mom = 0.9, beta = 0.9, beta1 = 0.9, beta2 = 0.999, epsilon = 0.000001, w_d = 0.005) :
    self.n_feature = n_feature
    self.n_class = n_class
    self.nhl = nhl
    self.L = nhl + 1
    self.sz = sz
    self.weight_init = weight_init
    self.act_fun = act_fun
    self.loss = loss
    self.epochs = epochs
    self.b_sz = b_sz
    self.optimizer = optimizer
    self.lr = lr
    self.mom = mom
    self.beta = beta
    self.beta1 = beta1
    self.beta2 = beta2
    self.epsilon = epsilon
    self.w_d = w_d

    slots = self.L + 1

    # parameters
    self.W = [0] * slots
    self.b = [0] * slots

    # gradients w.r.t. pre-activations, biases and weights
    self.d_a = [0] * slots
    self.d_b = [0] * slots
    self.d_W = [0] * slots

    # pre-activations and activations of the last forward pass
    self.a = [0] * slots
    self.h = [0] * slots

    # momentum / nag velocity
    self.u_W = [0] * slots
    self.u_b = [0] * slots

    # nag look-ahead parameters
    self.W_look = [0] * slots
    self.b_look = [0] * slots

    # running average of squared gradients (rmsprop / adam / nadam)
    self.v_W = [0] * slots
    self.v_b = [0] * slots

    # running average of gradients (adam / nadam)
    self.m_W = [0] * slots
    self.m_b = [0] * slots

    self.initialization()

  ######################################################

  def initialization(self) :
    """Draw the initial weights and zero the biases.

    Weights are standard-normal samples.  With act_fun == "ReLU" (whatever
    weight_init says) or weight_init == "Xavier" they are additionally
    scaled by sqrt(2 / fan_in); with weight_init == "random" they are left
    unscaled.  Raises ValueError for any other weight_init.
    """
    if self.act_fun == "ReLU" or self.weight_init == "Xavier" :
      scale_by_fan_in = True
    elif self.weight_init == "random" :
      scale_by_fan_in = False
    else :
      # Previously the weights silently stayed at the integer placeholder 0.
      raise ValueError(f"unsupported weight_init {self.weight_init!r}; expected 'random' or 'Xavier'")

    fan_in = self.n_feature
    for i in range(1, self.L + 1) :
      # The last layer maps onto the classes, every other layer onto sz units.
      fan_out = self.n_class if i == self.L else self.sz
      weights = np.random.randn(fan_out, fan_in)
      if scale_by_fan_in :
        weights = weights * math.sqrt(2.0 / fan_in)
      self.W[i] = weights
      self.b[i] = np.zeros((fan_out, 1))
      fan_in = fan_out

  #########################################################

  def _activate(self, a) :
    """Apply the configured hidden-layer activation to pre-activations a."""
    if self.act_fun == "sigmoid" :
      return sigmoid(a)
    if self.act_fun == "tanh" :
      return tanh(a)
    if self.act_fun == "ReLU" :
      return relu(a)
    raise ValueError(f"unsupported act_fun {self.act_fun!r}; expected 'sigmoid', 'tanh' or 'ReLU'")

  def _activation_grad(self, h) :
    """Derivative of the hidden activation, expressed through its output h."""
    if self.act_fun == "sigmoid" :
      return h * (1. - h)
    if self.act_fun == "tanh" :
      return 1. - h**2
    if self.act_fun == "ReLU" :
      return np.where(h > 0., 1., 0.)
    raise ValueError(f"unsupported act_fun {self.act_fun!r}; expected 'sigmoid', 'tanh' or 'ReLU'")

  def _forward(self, x, W, b, a, h) :
    """Forward pass with parameters (W, b), writing into the lists a and h.

    x holds one sample per column.  After the call h[self.L] contains the
    softmax output for every column.
    """
    h[0] = x
    for i in range(1, self.L) :
      a[i] = b[i] + np.dot(W[i], h[i-1])
      h[i] = self._activate(a[i])

    a[self.L] = b[self.L] + np.dot(W[self.L], h[self.L-1])
    h[self.L] = softmax(a[self.L]) # h[L] = y_hat

  def _backward(self, y, W) :
    """Backward pass for the activations stored in self.h, using weights W.

    y holds one one-hot label per column.  Fills self.d_a, self.d_b and
    self.d_W; gradients are summed (not averaged) over the batch and the
    weight gradient includes the w_d * W decay term.
    """
    y_hat = self.h[self.L]
    if self.loss == "cross_entropy" :
      self.d_a[self.L] = y_hat - y
    elif self.loss == "mean_squared_error" :
      # Gradient of 0.5 * sum((y_hat - y)**2) through the softmax.  The
      # softmax Jacobian is not diagonal, so besides the element-wise term
      # every output contributes -y_hat_j * sum_k(err_k * y_hat_k).
      err = y_hat - y
      self.d_a[self.L] = y_hat * (err - np.sum(err * y_hat, axis=0, keepdims=True))
    else :
      raise ValueError(f"unsupported loss {self.loss!r}; expected 'cross_entropy' or 'mean_squared_error'")

    for i in range(self.L, 0, -1) :
      if i < self.L :
        d_h_i = np.dot(W[i+1].T, self.d_a[i+1])
        self.d_a[i] = d_h_i * self._activation_grad(self.h[i])
      self.d_b[i] = np.sum(self.d_a[i], axis=1, keepdims=True)
      self.d_W[i] = np.dot(self.d_a[i], self.h[i-1].T) + self.w_d * W[i]

  #########################################################

  def forward_propagation(self, x) :
    """Forward pass with the current parameters; results go to self.a / self.h."""
    self._forward(x, self.W, self.b, self.a, self.h)

  def back_propagation(self, y) :
    """Backward pass with the current weights; results go to self.d_a / d_b / d_W."""
    self._backward(y, self.W)

  def nag_forward_propagation(self, x) :
    """Forward pass evaluated at the look-ahead parameters W_look / b_look."""
    self._forward(x, self.W_look, self.b_look, self.a, self.h)

  def nag_back_propagation(self, y) :
    """Backward pass evaluated at the look-ahead weights W_look."""
    self._backward(y, self.W_look)

  ############################################################

  def predict_prob(self, x) :
    """Return class probabilities for x (one sample per column).

    Uses temporary buffers, so the activations cached in self.a / self.h
    are left untouched.  The result has one row per sample.
    """
    a_temp = [0] * (self.L + 1)
    h_temp = [0] * (self.L + 1)
    self._forward(x, self.W, self.b, a_temp, h_temp)
    return h_temp[self.L].T

  #############################################################

  def loss_val(self, y_hat, y) :
    """Mean loss of predictions y_hat against one-hot labels y (one row per sample).

    cross_entropy: mean of -log(probability of the true class).  The
    probability is floored at the smallest positive normal float so a
    predicted 0 yields a large finite loss instead of a math domain error.
    mean_squared_error: sum of squared differences divided by the number
    of samples.
    """
    N = y.shape[0]

    if self.loss == "cross_entropy" :
      true_class = y.argmax(axis=1)
      p_true = y_hat[np.arange(N), true_class]
      p_true = np.clip(p_true, np.finfo(np.float64).tiny, None)
      return float(-np.sum(np.log(p_true)) / N)

    if self.loss == "mean_squared_error" :
      return np.sum((y - y_hat)**2) / N

    raise ValueError(f"unsupported loss {self.loss!r}; expected 'cross_entropy' or 'mean_squared_error'")

  ##############################################################

  def accuracy(self, y_hat, y) :
    """Percentage of rows whose arg-max prediction matches the arg-max label."""
    N = y.shape[0]
    n_correct = int(np.count_nonzero(y.argmax(axis=1) == y_hat.argmax(axis=1)))
    return 100 * n_correct / N

  ###############################################################

  def _report_epoch(self, t, X, y, X_valid, y_valid) :
    """Evaluate on the training and validation sets, print and log the metrics."""
    y_hat = self.predict_prob(X.T)
    tr_loss = self.loss_val(y_hat, y)
    tr_acc = self.accuracy(y_hat, y)

    y_val_hat = self.predict_prob(X_valid.T)
    val_loss = self.loss_val(y_val_hat, y_valid)
    val_acc = self.accuracy(y_val_hat, y_valid)

    print(f"epoch {t + 1} : train_loss = {tr_loss:.2f} valid_loss = {val_loss:.2f}, train accuracy = {tr_acc:.2f} valid_accuracy = {val_acc:.2f}")
    wandb.log({'tr_loss' : tr_loss, 'tr_accuracy' : tr_acc, 'val_loss' : val_loss, 'val_accuracy' : val_acc})

  def _fit(self, X, y, X_valid, y_valid, step) :
    """Shared epoch / mini-batch loop.

    step(x_batch, y_batch, n_step) performs one parameter update; the
    batches are already transposed to one sample per column and n_step
    counts updates from 1 across all epochs of this call.
    """
    N = X.shape[0]
    n_step = 0
    t = 0

    while t < self.epochs :
      for j in range(0, N, self.b_sz) :
        n_step += 1
        r_idx = min(j + self.b_sz, N)  # the last batch may be shorter
        step(X[j:r_idx].T, y[j:r_idx].T, n_step)

      self._report_epoch(t, X, y, X_valid, y_valid)
      t += 1

  def _momentum_update(self, n_step) :
    """Velocity update and parameter step shared by mgd and nagd.

    The velocity is restarted on the first step of a training call.
    """
    for idx in range(1, self.L + 1) :
      for params, grads, velocity in ((self.W, self.d_W, self.u_W), (self.b, self.d_b, self.u_b)) :
        scaled_grad = self.lr * grads[idx]
        if n_step == 1 :
          velocity[idx] = scaled_grad
        else :
          velocity[idx] = (self.mom * velocity[idx]) + scaled_grad
        params[idx] = params[idx] - velocity[idx]

  def _update_moments(self, n_step) :
    """Refresh the first (m) and second (v) moment estimates used by adam / nadam.

    Both estimates are restarted on the first step of a training call.
    """
    for idx in range(1, self.L + 1) :
      for grads, first, second in ((self.d_W, self.m_W, self.v_W), (self.d_b, self.m_b, self.v_b)) :
        new_first = (1. - self.beta1) * grads[idx]
        new_second = (1. - self.beta2) * (grads[idx]**2)
        if n_step == 1 :
          first[idx] = new_first
          second[idx] = new_second
        else :
          first[idx] = (self.beta1 * first[idx]) + new_first
          second[idx] = (self.beta2 * second[idx]) + new_second

  ###############################################################

  def sgd(self, X, y, X_valid, y_valid) :
    """Train with plain mini-batch gradient descent."""
    def step(x_batch, y_batch, n_step) :
      self.forward_propagation(x_batch)
      self.back_propagation(y_batch)

      for idx in range(1, self.L + 1) :
        self.W[idx] = self.W[idx] - (self.lr * self.d_W[idx])
        self.b[idx] = self.b[idx] - (self.lr * self.d_b[idx])

    self._fit(X, y, X_valid, y_valid, step)

  #################################################################

  def mgd(self, X, y, X_valid, y_valid) :
    """Train with momentum-based gradient descent."""
    def step(x_batch, y_batch, n_step) :
      self.forward_propagation(x_batch)
      self.back_propagation(y_batch)
      self._momentum_update(n_step)

    self._fit(X, y, X_valid, y_valid, step)

  ##################################################################

  def nagd(self, X, y, X_valid, y_valid) :
    """Train with Nesterov accelerated gradient descent.

    From the second step on, gradients are evaluated at the look-ahead
    point W - mom * u rather than at the current parameters.
    """
    def step(x_batch, y_batch, n_step) :
      if n_step == 1 :
        # No velocity yet, so the look-ahead point equals the current point.
        self.forward_propagation(x_batch)
        self.back_propagation(y_batch)
      else :
        for idx in range(1, self.L + 1) :
          self.W_look[idx] = self.W[idx] - (self.mom * self.u_W[idx])
          self.b_look[idx] = self.b[idx] - (self.mom * self.u_b[idx])
        self.nag_forward_propagation(x_batch)
        self.nag_back_propagation(y_batch)

      self._momentum_update(n_step)

    self._fit(X, y, X_valid, y_valid, step)

  ##############################################################

  def rmsprop(self, X, y, X_valid, y_valid) :
    """Train with RMSProp (step scaled by a running average of squared gradients)."""
    def step(x_batch, y_batch, n_step) :
      self.forward_propagation(x_batch)
      self.back_propagation(y_batch)

      for idx in range(1, self.L + 1) :
        for params, grads, second in ((self.W, self.d_W, self.v_W), (self.b, self.d_b, self.v_b)) :
          new_second = (1. - self.beta) * (grads[idx]**2)
          if n_step == 1 :
            second[idx] = new_second
          else :
            second[idx] = (self.beta * second[idx]) + new_second

          params[idx] = params[idx] - (self.lr / (np.sqrt(second[idx] + self.epsilon))) * grads[idx]

    self._fit(X, y, X_valid, y_valid, step)

  ##############################################################

  def adam(self, X, y, X_valid, y_valid) :
    """Train with Adam (bias-corrected first and second moment estimates)."""
    def step(x_batch, y_batch, n_step) :
      self.forward_propagation(x_batch)
      self.back_propagation(y_batch)
      self._update_moments(n_step)

      # bias-correction denominators for this step
      corr1 = 1. - self.beta1**n_step
      corr2 = 1. - self.beta2**n_step

      for idx in range(1, self.L + 1) :
        for params, first, second in ((self.W, self.m_W, self.v_W), (self.b, self.m_b, self.v_b)) :
          params[idx] = params[idx] - (self.lr / (np.sqrt(second[idx] / corr2 + self.epsilon))) * (first[idx] / corr1)

    self._fit(X, y, X_valid, y_valid, step)

  ##############################################################

  def nadam(self, X, y, X_valid, y_valid) :
    """Train with NAdam (Adam whose step mixes the corrected moment with the current gradient)."""
    def step(x_batch, y_batch, n_step) :
      self.forward_propagation(x_batch)
      self.back_propagation(y_batch)
      self._update_moments(n_step)

      # bias-correction denominators for this step
      corr1 = 1. - self.beta1**n_step
      corr2 = 1. - self.beta2**n_step

      for idx in range(1, self.L + 1) :
        for params, grads, first, second in ((self.W, self.d_W, self.m_W, self.v_W), (self.b, self.d_b, self.m_b, self.v_b)) :
          direction = (self.beta1 / corr1) * first[idx] + ((1. - self.beta1) / corr1) * grads[idx]
          params[idx] = params[idx] - (self.lr / (np.sqrt(second[idx] / corr2 + self.epsilon))) * direction

    self._fit(X, y, X_valid, y_valid, step)

  ##############################################################

  def train(self, X_train, y_train, X_valid, y_valid) :
    """Run the training routine selected by self.optimizer.

    Raises ValueError for an unknown optimizer name (previously the call
    returned without training anything).
    """
    routines = {
      "sgd" : self.sgd,
      "momentum" : self.mgd,
      "nag" : self.nagd,
      "rmsprop" : self.rmsprop,
      "adam" : self.adam,
      "nadam" : self.nadam,
    }
    if self.optimizer not in routines :
      raise ValueError(f"unsupported optimizer {self.optimizer!r}; expected one of {sorted(routines)}")

    routines[self.optimizer](X_train, y_train, X_valid, y_valid)
  

In [None]:
# Load the default dataset ("fashion_mnist") once; the resulting splits are
# module-level names that main() below reads when training.
x_train, y_train, x_valid, y_valid, x_test, y_test = load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz


In [None]:
# hyperparameters for the best model identified in Q4
# Grid sweep that keeps every hyperparameter fixed and varies only the loss,
# so the two resulting runs compare cross-entropy against mean squared error.
sweep_config = {
    'method': 'grid',
    'name' : 'Cross_entropy_vs_MSE',
    'metric': {
        # Must match the key passed to wandb.log by the training loop
        # ('val_accuracy'); the previous value 'val accuracy' matched no logged metric.
        'name': 'val_accuracy',
        'goal': 'maximize'
    },
    'parameters': {
        'epochs': {
            'values': [10]
        },
        'num_layers': {
            'values': [3]
        },
        'hidden_size': {
            'values': [128]
        },
        'weight_decay': {
            'values': [0.0]
        },
        'learning_rate': {
            'values': [0.001]
        },
        'optimizer': {
            'values': ['sgd']
        },
        'batch_size': {
            'values': [32]
        },
        'weight_init': {
            'values': ['Xavier']
        },
        'activation': {
            'values': ['ReLU']
        },
        'loss': {
            'values': ['cross_entropy', 'mean_squared_error']
        },
    }
}

# Register the sweep under the given W&B project; the returned id is what
# wandb.agent is given below to pull configurations from this sweep.
sweep_id = wandb.sweep(sweep = sweep_config, project = 'cs6910_dl_assgn_1_q_8')

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Create sweep with ID: 58ziki6a
Sweep URL: https://wandb.ai/cs22m059/cs6910_dl_assgn_1_q_8/sweeps/58ziki6a


In [None]:
def main():
  """Run one sweep trial.

  Opens a W&B run, names it after the sampled hyperparameters, builds a
  my_nn from wandb.config and trains it on the module-level
  x_train / y_train / x_valid / y_valid splits.
  """
  with wandb.init() as run:
    cfg = wandb.config

    # e.g. loss_<loss>_opt_<optimizer>_ac_<activation>_bs_<batch>_hl_<layers>_lr_<lr>
    name_parts = [
        "loss_", cfg.loss,
        "_opt_", cfg.optimizer,
        "_ac_", cfg.activation,
        "_bs_", str(cfg.batch_size),
        "_hl_", str(cfg.num_layers),
        "_lr_", str(cfg.learning_rate),
    ]
    wandb.run.name = "".join(name_parts)

    # Map sweep parameter names onto the constructor's keyword names.
    hyperparams = dict(
        loss = cfg.loss,
        epochs = cfg.epochs,
        nhl = cfg.num_layers,
        sz = cfg.hidden_size,
        w_d = cfg.weight_decay,
        lr = cfg.learning_rate,
        optimizer = cfg.optimizer,
        b_sz = cfg.batch_size,
        weight_init = cfg.weight_init,
        act_fun = cfg.activation,
    )

    nn_model = my_nn(**hyperparams)
    nn_model.train(x_train, y_train, x_valid, y_valid)
    
# Start an agent for the sweep created above; each trial calls main().
# count = 2 caps the agent at two trials, which matches the two 'loss' values
# in the grid (all other parameters have a single value).
wandb.agent(sweep_id, function = main, count = 2)
# NOTE(review): main() already runs inside `with wandb.init()`, so this looks
# like a safety net for a run left open - confirm it is still needed.
wandb.finish()

[34m[1mwandb[0m: Agent Starting Run: mbzx3un4 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier
[34m[1mwandb[0m: Currently logged in as: [33mcs22m059[0m. Use [1m`wandb login --relogin`[0m to force relogin


epoch 1 : train_loss = 0.43 valid_loss = 0.45, train accuracy = 84.79 valid_accuracy = 83.88
epoch 2 : train_loss = 0.37 valid_loss = 0.40, train accuracy = 86.63 valid_accuracy = 85.73
epoch 3 : train_loss = 0.34 valid_loss = 0.38, train accuracy = 87.65 valid_accuracy = 86.78
epoch 4 : train_loss = 0.32 valid_loss = 0.36, train accuracy = 88.51 valid_accuracy = 87.33
epoch 5 : train_loss = 0.30 valid_loss = 0.35, train accuracy = 89.08 valid_accuracy = 87.48
epoch 6 : train_loss = 0.28 valid_loss = 0.34, train accuracy = 89.44 valid_accuracy = 87.67
epoch 7 : train_loss = 0.27 valid_loss = 0.34, train accuracy = 89.81 valid_accuracy = 87.83
epoch 8 : train_loss = 0.26 valid_loss = 0.33, train accuracy = 90.34 valid_accuracy = 88.12
epoch 9 : train_loss = 0.26 valid_loss = 0.34, train accuracy = 90.40 valid_accuracy = 88.05
epoch 10 : train_loss = 0.25 valid_loss = 0.33, train accuracy = 90.73 valid_accuracy = 88.28


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
tr_accuracy,▁▃▄▅▆▆▇███
tr_loss,█▆▅▄▃▂▂▂▁▁
val_accuracy,▁▄▆▆▇▇▇███
val_loss,█▅▄▃▂▂▁▁▁▁

0,1
tr_accuracy,90.73148
tr_loss,0.2465
val_accuracy,88.28333
val_loss,0.33287


[34m[1mwandb[0m: Agent Starting Run: bbeci4yc with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: mean_squared_error
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


epoch 1 : train_loss = 0.29 valid_loss = 0.29, train accuracy = 80.34 valid_accuracy = 80.08
epoch 2 : train_loss = 0.25 valid_loss = 0.26, train accuracy = 82.34 valid_accuracy = 81.35
epoch 3 : train_loss = 0.23 valid_loss = 0.24, train accuracy = 83.61 valid_accuracy = 82.57
epoch 4 : train_loss = 0.22 valid_loss = 0.23, train accuracy = 84.56 valid_accuracy = 83.20
epoch 5 : train_loss = 0.21 valid_loss = 0.23, train accuracy = 85.17 valid_accuracy = 83.97
epoch 6 : train_loss = 0.20 valid_loss = 0.22, train accuracy = 85.82 valid_accuracy = 84.60
epoch 7 : train_loss = 0.19 valid_loss = 0.21, train accuracy = 86.36 valid_accuracy = 85.07
epoch 8 : train_loss = 0.19 valid_loss = 0.20, train accuracy = 86.95 valid_accuracy = 85.62
epoch 9 : train_loss = 0.18 valid_loss = 0.20, train accuracy = 87.27 valid_accuracy = 85.90
epoch 10 : train_loss = 0.18 valid_loss = 0.20, train accuracy = 87.63 valid_accuracy = 86.13


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
tr_accuracy,▁▃▄▅▆▆▇▇██
tr_loss,█▆▅▄▃▃▂▂▁▁
val_accuracy,▁▂▄▅▅▆▇▇██
val_loss,█▆▅▄▃▂▂▂▁▁

0,1
tr_accuracy,87.62963
tr_loss,0.17778
val_accuracy,86.13333
val_loss,0.19802
