## Installation

In [1]:
pip install wandb numpy pandas matplotlib

Note: you may need to restart the kernel to use updated packages.


## Q1: fashion-MNIST dataset

In [2]:
# import wandb
# from keras.datasets import fashion_mnist

# (x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()

# classes = {
#     0: "T-shirt/top",
#     1: "Trouser",
#     2: "Pullover",
#     3: "Dress",
#     4: "Coat",
#     5: "Sandal",
#     6: "Shirt",
#     7: "Sneaker",
#     8: "Bag",
#     9: "Ankle boot"
# }

# def logClassImages(project_name:str):
#   wandb.init(project=project_name)
#   wandb_image_indices = []

#   for classNumber in range(10):
#     for j in range(len(y_test)):
#       if y_test[j] == classNumber:
#         wandb_image_indices.append(x_test[j])
#         break

#   wandb_images = [wandb.Image(wandb_image_indices[i], caption = classes[i]) for i in range(10)]
#   wandb.log({"Sample images for each class": wandb_images})
#   wandb.finish()

# # logClassImages("da6401_assignment1")

## Feedforward neural network



### Libraries required

In [3]:
import wandb
from keras.datasets import fashion_mnist
import numpy as np
import copy

### Activation functions

In [4]:
"""
  ACTIVATION FUNCTIONS
"""
def identity(x):
    return x

def sigmoid(x):
    x = np.clip(x,-10,10)
    return 1 / (1 + np.exp(-x))

def tanh(x):
    return np.tanh(x)

def relu(x):
    # print(x)
    x = np.clip(x, -200,200)
    return np.maximum(0, x)

def softmax(x):
    exp_x = np.exp(x - np.max(x, axis=-1, keepdims=True))  # Numerical Stability
    return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

### Loss Functions

In [5]:
"""
  LOSS FUNCTIONS
"""
def mean_squared_error(y_true, y_pred):
    return np.mean((y_true - y_pred) ** 2)

import numpy as np

def cross_entropy_loss(y_true, y_pred):
    return -np.sum(y_true * np.log(y_pred + 1e-9), axis=-1)    # 1e-9 to avy_pred + 1e-9oid log 0

### Derivatives

In [6]:
"""
  DERIVATIVES OF ACTIVATION AND LOSS FUNCTIONS
"""
def identity_derivative(x):
    return np.ones_like(x)

def sigmoid_derivative(x):
    sig = sigmoid(x)
    return sig * (1 - sig)

def tanh_derivative(x):
    return 1 - np.tanh(x) ** 2

def relu_derivative(x):
    return np.where(x > 0, 1, 0)

def mean_squared_error_derivative(y_true, y_pred):
    return y_pred - y_true

def cross_entropy_loss_derivative(y_true, y_pred):
    return -y_true / (y_pred + 1e-9)

def softmax_derivative(inp:np.array):
    derivates = []
    if(len(inp.shape) == 1):
      S_vector = inp.reshape(-1, 1)
      derivates = np.diag(inp) - np.dot(S_vector, S_vector.T)
    elif(len(inp.shape) == 2):
      for i in range(inp.shape[0]):
        S_vector = inp[i].reshape(-1, 1)
        derivates.append(np.diag(inp[i]) - np.dot(S_vector, S_vector.T))

    return np.array(derivates)

### Optimizers

In [8]:
"""
  OPTIMIZERS UPDATE RULES
"""

# STOCHASTIC GRADIENT DESCENT
def sgd(optimizer_input_dict, wts_bias_history_dict, itr=None):
  # cant update weights in one single matrix op as dimensions of weights can be different in each layer
  for i in range(optimizer_input_dict["n_hiddenLayers"]+1):
    # weight decay term added additionally to the formula in slides
    wts_bias_history_dict["weights"][i] = wts_bias_history_dict["weights"][i] - optimizer_input_dict["learning_rate"] * (wts_bias_history_dict["dw"][i] + (optimizer_input_dict["weight_decay"] * wts_bias_history_dict["weights"][i]))
    wts_bias_history_dict["biases"][i] = wts_bias_history_dict["biases"][i] - optimizer_input_dict["learning_rate"] * (wts_bias_history_dict["db"][i])

# MOMENTUM BASED GRADIENT DESCENT
def momentumGradientDescent(optimizer_input_dict, wts_bias_history_dict, itr=None):
  for i in range(optimizer_input_dict["n_hiddenLayers"]+1):
    wts_bias_history_dict["history_weights"][i] = (optimizer_input_dict["momentum"] * wts_bias_history_dict["history_weights"][i]) + wts_bias_history_dict["dw"][i]
    wts_bias_history_dict["weights"][i] = wts_bias_history_dict["weights"][i] - optimizer_input_dict["learning_rate"] * (wts_bias_history_dict["history_weights"][i] + optimizer_input_dict["weight_decay"] * wts_bias_history_dict["weights"][i])

    wts_bias_history_dict["history_biases"][i] = (optimizer_input_dict["momentum"] * wts_bias_history_dict["history_biases"][i]) + wts_bias_history_dict["db"][i]
    wts_bias_history_dict["biases"][i] = wts_bias_history_dict["biases"][i] - (optimizer_input_dict["learning_rate"] * wts_bias_history_dict["history_biases"][i])

# NAG(NESTEROV ACCELERATED GRADIENT DESCENT)
def nag(optimizer_input_dict, wts_bias_history_dict, itr=None):
  for i in range(optimizer_input_dict["n_hiddenLayers"]+1):
    # dw,db will contain lookahead gradients only since forward and backward propagations are implemented accordingly
    wts_bias_history_dict["history_weights"][i] = (optimizer_input_dict["momentum"] * wts_bias_history_dict["history_weights"][i]) + wts_bias_history_dict["dw"][i]
    wts_bias_history_dict["weights"][i] = wts_bias_history_dict["weights"][i] - optimizer_input_dict["learning_rate"] * (wts_bias_history_dict["history_weights"][i] +  (optimizer_input_dict["weight_decay"] * wts_bias_history_dict["weights"][i]))

    wts_bias_history_dict["history_biases"][i] = (optimizer_input_dict["momentum"] * wts_bias_history_dict["history_biases"][i]) + wts_bias_history_dict["db"][i]
    wts_bias_history_dict["biases"][i] = wts_bias_history_dict["biases"][i] - optimizer_input_dict["learning_rate"] * wts_bias_history_dict["history_biases"][i]

# RMSPROP
def rmsProp(optimizer_input_dict, wts_bias_history_dict, itr=None):
  for i in range(optimizer_input_dict["n_hiddenLayers"]+1):
    wts_bias_history_dict["history_weights"][i] = (optimizer_input_dict["beta"] * wts_bias_history_dict["history_weights"][i]) + (1 - optimizer_input_dict["beta"]) * (wts_bias_history_dict["dw"][i] ** 2)
    wts_bias_history_dict["weights"][i] = wts_bias_history_dict["weights"][i] - optimizer_input_dict["learning_rate"] *((wts_bias_history_dict["dw"][i]/np.sqrt(wts_bias_history_dict["history_weights"][i] + optimizer_input_dict["epsilon"]))  + (optimizer_input_dict["weight_decay"] * wts_bias_history_dict["weights"][i]))
    # wts_bias_history_dict["weights"][i] = wts_bias_history_dict["weights"][i] - ((optimizer_input_dict["learning_rate"]/np.sqrt(wts_bias_history_dict["history_weights"][i] + optimizer_input_dict["epsilon"])) * (wts_bias_history_dict["dw"][i]))


    wts_bias_history_dict["history_biases"][i] = (optimizer_input_dict["beta"] * wts_bias_history_dict["history_biases"][i]) + (1 - optimizer_input_dict["beta"]) * (wts_bias_history_dict["db"][i] ** 2)
    wts_bias_history_dict["biases"][i] = wts_bias_history_dict["biases"][i] - ((optimizer_input_dict["learning_rate"]/np.sqrt(wts_bias_history_dict["history_biases"][i] + optimizer_input_dict["epsilon"])) * (wts_bias_history_dict["db"][i]))

# ADAM
def adam(optimizer_input_dict, wts_bias_history_dict, itr=None):
  for i in range(optimizer_input_dict["n_hiddenLayers"]+1):
    wts_bias_history_dict["history_weights"][i] = (optimizer_input_dict["beta1"] * wts_bias_history_dict["history_weights"][i]) + (1 - optimizer_input_dict["beta1"]) * (wts_bias_history_dict["dw"][i])
    wts_bias_history_dict["second_history_weights"][i] = (optimizer_input_dict["beta2"] * wts_bias_history_dict["second_history_weights"][i]) + (1 - optimizer_input_dict["beta2"]) * (wts_bias_history_dict["dw"][i] ** 2)

    history_weights_hat = wts_bias_history_dict["history_weights"][i] / (1 - (optimizer_input_dict["beta1"] ** (itr)))
    second_history_weights_hat = wts_bias_history_dict["second_history_weights"][i] / (1 - (optimizer_input_dict["beta2"] ** (itr)))

    wts_bias_history_dict["weights"][i] = wts_bias_history_dict["weights"][i] - optimizer_input_dict["learning_rate"]*((history_weights_hat/(np.sqrt(second_history_weights_hat) + optimizer_input_dict["epsilon"])) + (optimizer_input_dict["weight_decay"] * wts_bias_history_dict["weights"][i]))
    # wts_bias_history_dict["weights"][i] = wts_bias_history_dict["weights"][i] - ((optimizer_input_dict["learning_rate"]/(np.sqrt(second_history_weights_hat) + optimizer_input_dict["epsilon"])) * (history_weights_hat))

    wts_bias_history_dict["history_biases"][i] = (optimizer_input_dict["beta1"] * wts_bias_history_dict["history_biases"][i]) + (1 - optimizer_input_dict["beta1"]) * wts_bias_history_dict["db"][i]
    wts_bias_history_dict["second_history_biases"][i] = (optimizer_input_dict["beta2"] * wts_bias_history_dict["second_history_biases"][i]) + (1 - optimizer_input_dict["beta2"]) * (wts_bias_history_dict["db"][i] ** 2)

    history_biases_hat = wts_bias_history_dict["history_biases"][i] / (1 - (optimizer_input_dict["beta1"] ** (itr)))
    second_history_biases_hat = wts_bias_history_dict["second_history_biases"][i] / (1 - (optimizer_input_dict["beta2"] ** (itr)))

    wts_bias_history_dict["biases"][i] = wts_bias_history_dict["biases"][i] - ((optimizer_input_dict["learning_rate"]/(np.sqrt(second_history_biases_hat) + optimizer_input_dict["epsilon"])) * (history_biases_hat))

# NADAM
def nadam(optimizer_input_dict, wts_bias_history_dict, itr=None):
  for i in range(optimizer_input_dict["n_hiddenLayers"]+1):
    wts_bias_history_dict["history_weights"][i] = (optimizer_input_dict["beta1"] * wts_bias_history_dict["history_weights"][i]) + (1 - optimizer_input_dict["beta1"]) * (wts_bias_history_dict["dw"][i])
    wts_bias_history_dict["second_history_weights"][i] = (optimizer_input_dict["beta2"] * wts_bias_history_dict["second_history_weights"][i]) + (1 - optimizer_input_dict["beta2"]) * (wts_bias_history_dict["dw"][i] ** 2)

    history_weights_hat = wts_bias_history_dict["history_weights"][i] / (1 - (optimizer_input_dict["beta1"] ** itr))
    second_history_weights_hat = wts_bias_history_dict["second_history_weights"][i] / (1 - (optimizer_input_dict["beta2"] ** itr))

    lookahead_dw = optimizer_input_dict["beta1"] * history_weights_hat + (((1-optimizer_input_dict["beta1"])/(1-(optimizer_input_dict["beta1"] ** itr))) * wts_bias_history_dict["dw"][i])
    wts_bias_history_dict["weights"][i] = wts_bias_history_dict["weights"][i] - optimizer_input_dict["learning_rate"]*(lookahead_dw/(np.sqrt(second_history_weights_hat) + optimizer_input_dict["epsilon"])) * ((optimizer_input_dict["weight_decay"] * wts_bias_history_dict["weights"][i]))
    # wts_bias_history_dict["weights"][i] = wts_bias_history_dict["weights"][i] - ((optimizer_input_dict["learning_rate"]/(np.sqrt(second_history_weights_hat) + optimizer_input_dict["epsilon"])) * (lookahead_dw))


    wts_bias_history_dict["history_biases"][i] = (optimizer_input_dict["beta1"] * wts_bias_history_dict["history_biases"][i]) + (1 - optimizer_input_dict["beta1"]) * wts_bias_history_dict["db"][i]
    wts_bias_history_dict["second_history_biases"][i] = (optimizer_input_dict["beta2"] * wts_bias_history_dict["second_history_biases"][i]) + (1 - optimizer_input_dict["beta2"]) * (wts_bias_history_dict["db"][i] ** 2)

    history_biases_hat = wts_bias_history_dict["history_biases"][i] / (1 - (optimizer_input_dict["beta1"] ** itr))
    second_history_biases_hat = wts_bias_history_dict["second_history_biases"][i] / (1 - (optimizer_input_dict["beta2"] ** itr))

    lookahead_db = optimizer_input_dict["beta1"] * history_biases_hat + (((1-optimizer_input_dict["beta1"])/(1-(optimizer_input_dict["beta1"] ** itr))) * wts_bias_history_dict["db"][i])
    wts_bias_history_dict["biases"][i] = wts_bias_history_dict["biases"][i] - ((optimizer_input_dict["learning_rate"]/(np.sqrt(second_history_biases_hat) + optimizer_input_dict["epsilon"])) * (lookahead_db))


### Network (forward and back propagation)

In [9]:
class FeedForwardNeuralNetwork:
  # class variables
  optimizersMap = {"sgd": sgd, "momentum": momentumGradientDescent, "nag": nag, "rmsprop": rmsProp, "adam": adam, "nadam": nadam}
  lossFunctionsMap = {"mean_squared_error": mean_squared_error, "cross_entropy" : cross_entropy_loss}
  activationFunctionsMap = {"identity":identity, "sigmoid":sigmoid, "tanh":tanh, "ReLU":relu, "softmax": softmax}
  derivatesFuncMap = {"mean_squared_error": mean_squared_error_derivative, "cross_entropy_loss": cross_entropy_loss_derivative, "identity": identity_derivative,
                      "sigmoid": sigmoid_derivative, "tanh": tanh_derivative, "relu": relu_derivative, "softmax": softmax_derivative}

  def __init__(self,
               input_size=784, output_size=10,
               n_hiddenLayers=3, n_neuronsPerLayer=32,
               activationFun="sigmoid",
               weight_init="random",
               batch_size=64,
               lossFunc="cross_entropy",
               optimizer="adam",
               learning_rate=0.001,
               momentum=0.5,
               beta=0.9, beta1=0.9, beta2=0.99,
               epsilon=1e-8, weight_decay=0.01,
               epochs=10):

    # Inialtization parameters
    self.input_size = input_size  # no of features
    self.output_size = output_size
    self.n_hiddenLayers = n_hiddenLayers
    self.n_neuronsPerLayer = n_neuronsPerLayer
    self.weight_init = weight_init
    self.epochs = epochs

    self.activationFun = FeedForwardNeuralNetwork.activationFunctionsMap[activationFun]
    self.lossFunc = FeedForwardNeuralNetwork.lossFunctionsMap[lossFunc]
    self.optimizer = FeedForwardNeuralNetwork.optimizersMap[optimizer]

    # paramters required for optimizers
    self.batch_size = batch_size
    self.isLookAhead = False;

    if(optimizer == "nag"):
      self.isLookAhead = True;

    # add these parameters as dict
    self.optimizer_input_dict = { "learning_rate" : learning_rate,
                                  "momentum" : momentum,                  # used by momentumGD
                                  "beta" : beta,                          # used by rmsprop
                                  "beta1" : beta1,                        # used by adam & nadam
                                  "beta2" : beta2,                        # used by adam & nadam
                                  "epsilon" : epsilon,
                                  "weight_decay" : weight_decay,
                                  "n_hiddenLayers": n_hiddenLayers}

    # weights and biases matrices
    self.weights = []
    self.biases = []
    self.lookAheadWeights = []
    self.lookAheadBiases = []

    self.wts_bias_history_dict = {"weights": self.weights, "biases": self.biases,
                                  "history_weights": [np.zeros(1) for _ in range(self.n_hiddenLayers+1)],         # these will be modified before their first use (dimensions of each values will also be changed)
                                  "history_biases": [np.zeros(1) for _ in range(self.n_hiddenLayers+1)],
                                  "second_history_weights": [np.zeros(1) for _ in range(self.n_hiddenLayers+1)],
                                  "second_history_biases": [np.zeros(1) for _ in range(self.n_hiddenLayers+1)],
                                  "dw": [np.empty(1) for _ in range(self.n_hiddenLayers+1)],
                                  "dh": [np.empty(1) for _ in range(self.n_hiddenLayers+1)]}

    self.initializeWeightsAndBiases()
    self.wts_bias_history_dict["second_history_weights"] = copy.deepcopy(self.wts_bias_history_dict["history_weights"])
    self.wts_bias_history_dict["second_history_biases"] = copy.deepcopy(self.wts_bias_history_dict["history_biases"])

    # pre-activation(a) and post-activation(h) values
    self.a = []
    self.h = []

  '''
    Weights,Biases initialization based on weight_init parameter

    weights[0]: input layer to first hidden layer  : input_size x n_neuronsPerLayer
    weights[1]: first hidden layer to second hidden layer : n_neuronsPerLayer x n_neuronsPerLayer
    ...
    weights[n_hiddenLayers]: last hidden layer to output layer : n_neuronsPerLayer x output_size

    biases[i] : bias for ith layer : 1 x n_neuronsPerLayer   (i:0 to n_hiddenLayers-1)
    biases[n_hiddenLayers]: 1 x output_size
  '''
  def initializeWeightsAndBiases(self):
    # biases for both types
    for i in range(self.n_hiddenLayers):
      self.biases.append(np.zeros(self.n_neuronsPerLayer))
      self.wts_bias_history_dict["history_biases"][i] = np.zeros(self.n_neuronsPerLayer)

    self.biases.append(np.zeros(self.output_size))   # biases[n_hiddenLayers]
    self.wts_bias_history_dict["history_biases"][self.n_hiddenLayers] = np.zeros(self.output_size)

    if(self.weight_init == "random"):   # Random Normal
      # weights[0]
      self.weights.append(np.random.randn(self.input_size, self.n_neuronsPerLayer))
      self.wts_bias_history_dict["history_weights"][0] = np.zeros((self.input_size, self.n_neuronsPerLayer))

      # weights[1] -> weights[n_hiddenLayers-1]
      for i in range(self.n_hiddenLayers-1):
        self.weights.append(np.random.randn(self.n_neuronsPerLayer, self.n_neuronsPerLayer))
        self.wts_bias_history_dict["history_weights"][i+1] = np.zeros((self.n_neuronsPerLayer, self.n_neuronsPerLayer))

      # weights[n_hiddenLayers]
      self.weights.append(np.random.randn(self.n_neuronsPerLayer, self.output_size))
      self.wts_bias_history_dict["history_weights"][self.n_hiddenLayers] = np.zeros((self.n_neuronsPerLayer, self.output_size))

    elif(self.weight_init == "Xavier"):   # Xavier Normal: mean = 0, variance = 2/(n_input + n_output)
      # weights[0]
      self.weights.append(np.random.normal(loc=0.0, scale=np.sqrt(2/(self.input_size + self.n_neuronsPerLayer)), size=(self.input_size, self.n_neuronsPerLayer)))
      self.wts_bias_history_dict["history_weights"][0] = np.zeros((self.input_size, self.n_neuronsPerLayer))


      for i in range(self.n_hiddenLayers-1):
        self.weights.append(np.random.normal(loc=0.0, scale=np.sqrt(2/(self.n_neuronsPerLayer + self.n_neuronsPerLayer)), size=(self.n_neuronsPerLayer, self.n_neuronsPerLayer)))
        self.wts_bias_history_dict["history_weights"][i+1] = np.zeros((self.n_neuronsPerLayer, self.n_neuronsPerLayer))


      self.weights.append(np.random.normal(loc=0.0, scale=np.sqrt(2/(self.n_neuronsPerLayer + self.output_size)), size=(self.n_neuronsPerLayer, self.output_size)))
      self.wts_bias_history_dict["history_weights"][self.n_hiddenLayers] = np.zeros((self.n_neuronsPerLayer, self.output_size))

  '''
    Forward propagation through the neural network. (for batch)
    Instead of doing one input at a time, this function handles it for a batch using respective sized matrices

    x_batch: B x n where B - batch size, n- no of features = input_size
    x_batch is assumbed to be numpy array when given as input
  '''
  def forwardPropagation(self, x_batch, isValidation=False):
    a_pre_activation = []
    h_post_activation = []

    # considering a0,h0 as X values as a1: first layer  (it is calculated from x values)
    a_pre_activation.append(x_batch)
    h_post_activation.append(x_batch)

    wt = []
    b = []

    if(self.isLookAhead and not isValidation):
      for i in range(self.n_hiddenLayers+1):
        wt.append(self.weights[i] - (self.optimizer_input_dict["momentum"] * self.wts_bias_history_dict["history_weights"][i]))
        b.append(self.biases[i] - (self.optimizer_input_dict["momentum"] * self.wts_bias_history_dict["history_biases"][i]))

      self.lookAheadWeights = wt
      self.lookAheadBiases = b
    else:
      wt = copy.deepcopy(self.weights)
      b = copy.deepcopy(self.biases)

    # Except last layer since activation function could be different
    for i in range(self.n_hiddenLayers):
      # ai: B x n_neuronsPerLayer, biases[i]: 1 x n_neuronsPerLayer (it will be broadcasted while adding)
      ai = np.matmul(h_post_activation[-1], wt[i]) + b[i]
      hi = self.activationFun(ai)

      a_pre_activation.append(ai)
      h_post_activation.append(hi)

    # aL: last layer (activation function is softmax)
    aL = np.matmul(h_post_activation[-1], wt[self.n_hiddenLayers]) + b[self.n_hiddenLayers]
    hL = softmax(aL)   # y_batch

    a_pre_activation.append(aL)
    h_post_activation.append(hL)

    return a_pre_activation, h_post_activation

  '''
    Backward propagation through the neural network. (for batch)
  '''
  def backwardPropagation(self, a_pre_activation, h_post_activation, y_batch, y_pred_batch):
    grad_w = []
    grad_b = []
    grad_a = []
    grad_h = []

    # Output gradient (wrt aL)
    grad_hL = self.derivatesFuncMap[self.lossFunc.__name__](y_batch, y_pred_batch)
    grad_h.append(grad_hL)

    if(self.lossFunc.__name__ == "cross_entropy_loss"):
      grad_aL = y_pred_batch - y_batch    # just to reduce computation of jacobian matrix
      grad_a.append(grad_aL)
    else:
      grad_aL_list = []
      # softmax derivatives of each input is a matrix of size output_size x output_size, we need to perform matrix_mul for each input of batch
      for i in range(y_batch.shape[0]):   # self.batch_size = y_batch.shape[0] but better to take y_batch.shape[0] since last batch inputs can have less
        grad_aL_inp_i = grad_hL[i] @ softmax_derivative(y_pred_batch[i])
        grad_aL_list.append(grad_aL_inp_i)

      grad_aL = np.array(grad_aL_list)
      grad_a.append(grad_aL)                    # aL contains (aL) values of all inputs in the batch

    # Hidden layers
    for k in range(self.n_hiddenLayers, -1, -1):
      # gradients w.r.t parameters
      # wk
      grad_wk = np.zeros_like(self.weights[k])    # will be equal to sum across

      for inpNum in range(y_batch.shape[0]):
        grad_wk_inp_num = np.matmul(h_post_activation[k][inpNum].reshape(-1,1), grad_a[-1][inpNum].reshape(1,-1))
        grad_wk += grad_wk_inp_num
      grad_w.append(grad_wk)                   # contains sum across all batches

      # bk
      grad_bk = np.zeros_like(self.biases[k])
      for inpNum in range(y_batch.shape[0]):
        grad_bk += grad_a[-1][inpNum]
      grad_b.append(grad_bk)                     # contains sum across all batches

      if(k > 0):
        # gradients w.r.t layer below
        grad_hk_1 = grad_a[-1] @ self.weights[k].T
        grad_h.append(grad_hk_1)

        # gradients w.r.t layer below (pre-activation)
        grad_ak_1 = grad_hk_1 * self.derivatesFuncMap[self.activationFun.__name__](a_pre_activation[k])
        grad_a.append(grad_ak_1)

    grad_w = grad_w[::-1]
    grad_b = grad_b[::-1]

    return grad_w, grad_b

  def updateWeights(self, grad_w, grad_b, itr):
    grad_w = [np.clip(dw, -10,10) for dw in grad_w]
    grad_h = [np.clip(db, -10,10) for db in grad_b]
    self.wts_bias_history_dict["dw"] = grad_w
    self.wts_bias_history_dict["db"] = grad_b
    self.optimizer(self.optimizer_input_dict, self.wts_bias_history_dict, itr)


## Loading data

In [10]:
from keras.datasets import fashion_mnist, mnist
import numpy as np

datasets = {"fashion_mnist": fashion_mnist, "mnist": mnist}

def load_data(dataset_name):
  (x_train, y_train), (x_test, y_test) = datasets[dataset_name].load_data()
  num_classes = len(np.unique(y_train))

  y_train = np.eye(num_classes)[y_train]
  y_test = np.eye(num_classes)[y_test]

  x_train = x_train.reshape(x_train.shape[0], -1)
  x_test = x_test.reshape(x_test.shape[0], -1)

  x_train = np.array(x_train/255, dtype=np.float64)
  y_train = np.array(y_train, dtype=np.float64)
  x_test = np.array(x_test/255, dtype=np.float64)
  y_test = np.array(y_test, dtype=np.float64)

  return x_train, y_train, x_test, y_test, num_classes

## Sweep Configuration

In [11]:
sweep_configuration = {
    "method": "random",
    "metric": {"name": "validation_accuracy", "goal": "maximize"},
    "parameters": {
        "learning_rate": {"values": [1e-3, 1e-4]},
        "optimizer": {"values": ["sgd", "momentum", "nag", "rmsprop",  "adam", "nadam"]},
        "num_layers": {"values": [3, 4, 5]},
        "hidden_size": {"values": [32, 64, 128]},
        "batch_size": {"values": [16, 32, 64]},
        "activation": {"values": ["sigmoid", "tanh", "ReLU"]},
        "weight_decay": {"values": [0, 0.0005, 0.5]},
        "weight_init": {"values": ["random", "Xavier"]},
        "epochs": {"values": [10, 5]},
        "loss": {"values": ["cross_entropy"]},
        "momentum": {"values": [0.9]},
        "beta": {"values": [0.9]},
        "beta1": {"values":[0.9]},
        "beta2": {"values": [0.999]},
        "epsilon": {"values": [1e-8]},
        "dataset": {"values":["fashion_mnist"]}
    }
}

In [12]:
# sweep_configuration = {
#     "method": "random",
#     # "metric": {"name": "validation_accuracy", "goal": "maximize"},
#     "parameters": {
#         "learning_rate": {"values": [1e-3, 1e-4]},
#         "optimizer": {"values": ["momentum"]},
#         "num_layers": {"values": [4]},
#         "hidden_size": {"values": [64]},
#         "batch_size": {"values": [64]},
#         "activation": {"values": ["ReLU"]},
#         "weight_decay": {"values": [0, 0.0005, 0.5]},
#         "weight_init": {"values": ["random", "Xavier"]},
#         "epochs": {"values": [10]},
#         "loss": {"values": ["cross_entropy"]},
#         "momentum": {"values": [0.9]},
#         "beta": {"values": [0.9]},
#         "beta1": {"values":[0.9]},
#         "beta2": {"values": [0.999]},
#         "epsilon": {"values": [1e-8]},
#         "dataset": {"values":["fashion_mnist"]}
#     }
# }

## Training the Feed Forward Neural Network

In [13]:
# train.py file (suppose parser gives all args to train network)
from sklearn.model_selection import train_test_split
import math

def calculateAccuracy(y_true, y_pred):
  y_pred_labels = np.argmax(y_pred, axis=1)
  y_true_labels = np.argmax(y_true, axis=1)
  accuracy = np.mean(y_pred_labels == y_true_labels)
  return accuracy*100

def trainNeuralNetwork():
  wandb.init(mode="online")
  args = wandb.config
  x_train, y_train, x_test, y_test, num_classes = load_data(args["dataset"])
  input_size = len(x_train[0])
  output_size = num_classes
  n_hiddenLayers = args["num_layers"]
  n_neuronsPerLayer = args["hidden_size"]
  activationFun = args["activation"]
  weight_init = args["weight_init"]
  batch_size = args["batch_size"]
  lossFunc = args["loss"]
  optimizer = args["optimizer"]
  learning_rate = args["learning_rate"]
  momentum = args["momentum"]
  beta = args["beta"]
  beta1 = args["beta1"]
  beta2 = args["beta2"]
  epsilon = args["epsilon"]
  weight_decay = args["weight_decay"]
  epochs = args["epochs"]

  wandb.run.name = f"{optimizer}_{activationFun}_{n_hiddenLayers}_{n_neuronsPerLayer}_{epochs}_{weight_init}"

  # paste all above paramters as fun params
  fnn = FeedForwardNeuralNetwork(input_size, output_size, n_hiddenLayers, n_neuronsPerLayer,
                                 activationFun, weight_init, batch_size, lossFunc,
                                 optimizer, learning_rate, momentum,
                                 beta, beta1, beta2,
                                 epsilon, weight_decay, epochs)

  x_train, x_validation, y_train, y_validation = train_test_split(x_train, y_train, test_size=0.1, random_state=42)
  num_batches = math.ceil(len(x_train)/batch_size)

  for epochNum in range(epochs):
    for batchNum in range(num_batches):
      start_idx = batchNum * batch_size
      end_idx = start_idx + batch_size

      x_batch = x_train[start_idx:end_idx]
      y_batch = y_train[start_idx:end_idx]

      # Forward Propagation
      a_pre_activation, h_post_activation = fnn.forwardPropagation(x_batch)
      y_pred_batch = h_post_activation[-1]

      # Back Propagation
      grad_w, grad_b = fnn.backwardPropagation(a_pre_activation, h_post_activation, y_batch, y_pred_batch)

      # Update weights
      itr = epochNum * num_batches + batchNum + 1
      fnn.updateWeights(grad_w, grad_b, itr)

    # Validation accuracy
    _, h_validation = fnn.forwardPropagation(x_validation, isValidation=True)
    y_pred_validation = h_validation[-1]
    validation_accuracy = calculateAccuracy(y_validation, y_pred_validation)
    wandb.run.summary["metric_name"] = validation_accuracy


    # Train accuracy
    _, h_train = fnn.forwardPropagation(x_train, isValidation=True)
    y_pred_train = h_train[-1]
    train_accuracy = calculateAccuracy(y_train, y_pred_train)

    wandb.log({
        "epoch": epochNum + 1,
        "validation_loss": np.mean(fnn.lossFunc(y_validation, y_pred_validation)),
        "validation_accuracy": validation_accuracy,
        "train_loss": np.mean(fnn.lossFunc(y_train, y_pred_train)),
        "train_accuracy": train_accuracy
        },commit=True)

  # Test accuracy
  _,h_test = fnn.forwardPropagation(x_test, isValidation=True)
  y_pred_test = h_test[-1]
  test_accuracy = calculateAccuracy(y_test, y_pred_test)
  wandb.log({ "test_accuracy": test_accuracy,
             "test_loss": np.mean(fnn.lossFunc(y_test, y_pred_test))})

  wandb.finish()

In [14]:
# pip uninstall wandb -y
# pip install wandb
wandb.login(key="36c31601b44b627cac1224e51b177e99fdef3a5c")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mnikhithaa[0m ([33mnikhithaa-iit-madras[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
# wandb.init(project="DA6401_Assignment1")
wandb_id = wandb.sweep(sweep_configuration, project="DA6401_Assignment1")
wandb.agent(wandb_id, function=trainNeuralNetwork)

0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁█▇▇▇▇▇▇▇█
train_loss,▁▁▁▁▁▁▃▆█▆
validation_accuracy,▁█▆▇▇▆▆▆▆▇
validation_loss,▁▁▁▁▁▁▃▆█▆

0,1
epoch,10.0
metric_name,10.13333
test_accuracy,10.35
test_loss,3.37823
train_accuracy,10.48148
train_loss,3.37736
validation_accuracy,10.13333
validation_loss,3.3666


[34m[1mwandb[0m: Agent Starting Run: bsdo5xiq with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▁▁▄█
train_loss,▂▃▄█▁
validation_accuracy,███▆▁
validation_loss,▂▃▄█▁

0,1
epoch,5.0
metric_name,9.78333
test_accuracy,10.0
test_loss,2.26465
train_accuracy,10.02593
train_loss,2.26596
validation_accuracy,9.78333
validation_loss,2.24432


[34m[1mwandb[0m: Agent Starting Run: 17ae1cxa with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▂▄█▆▆█▇▇▆
train_loss,█▇▅▁▃▃▁▂▂▂
validation_accuracy,▁▂▄▇▆▆█▇▇▆
validation_loss,█▇▅▁▃▃▁▂▂▃

0,1
epoch,10.0
metric_name,82.7
test_accuracy,82.22
test_loss,0.50186
train_accuracy,83.54259
train_loss,0.44379
validation_accuracy,82.7
validation_loss,0.47342


[34m[1mwandb[0m: Agent Starting Run: os6fv6aj with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▁▁▁▁▁▄▄▇█
train_loss,██████▇▄▂▁
validation_accuracy,▁▁▁▁▁▁▄▄▇█
validation_loss,██████▇▄▂▁

0,1
epoch,10.0
metric_name,45.25
test_accuracy,45.05
test_loss,1.30044
train_accuracy,45.3037
train_loss,1.29005
validation_accuracy,45.25
validation_loss,1.29367


[34m[1mwandb[0m: Agent Starting Run: 0kpf74jq with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▆▇█
train_loss,█▅▃▂▁
validation_accuracy,▁▅▆▇█
validation_loss,█▄▃▂▁

0,1
epoch,5.0
metric_name,86.46667
test_accuracy,85.71
test_loss,0.39403
train_accuracy,87.91852
train_loss,0.32953
validation_accuracy,86.46667
validation_loss,0.35842


[34m[1mwandb[0m: Agent Starting Run: 8y0gdant with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,█▁▁▁▁
train_loss,█▁▁▁▁
validation_accuracy,▁████
validation_loss,█▁▁▁▁

0,1
epoch,5.0
metric_name,10.35
test_accuracy,10.0
test_loss,2.30261
train_accuracy,9.96296
train_loss,2.30258
validation_accuracy,10.35
validation_loss,2.30254


[34m[1mwandb[0m: Agent Starting Run: iy954rz2 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,█▁▄▅▃▂▃▂▂▂
train_loss,▁█▅▄▆▇▆▇▇▇
validation_accuracy,█▁▄▄▃▂▃▂▂▂
validation_loss,▁█▅▅▆▇▇▇▇▇

0,1
epoch,10.0
metric_name,9.83333
test_accuracy,10.0
test_loss,18.65094
train_accuracy,10.01852
train_loss,18.6471
validation_accuracy,9.83333
validation_loss,18.68548


[34m[1mwandb[0m: Agent Starting Run: h2xkmyf6 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▅▆▇▇▇████
train_loss,█▄▃▃▂▂▂▁▁▁
validation_accuracy,▁▅▆▇▇▇████
validation_loss,█▄▃▂▂▁▁▁▁▁

0,1
epoch,10.0
metric_name,86.73333
test_accuracy,85.65
test_loss,0.41562
train_accuracy,88.98519
train_loss,0.31921
validation_accuracy,86.73333
validation_loss,0.38777


[34m[1mwandb[0m: Agent Starting Run: olso6re4 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▆▇██
train_loss,█▁▁▃▁
validation_accuracy,▁▆███
validation_loss,█▁▁▃▂

0,1
epoch,5.0
metric_name,19.78333
test_accuracy,20.64
test_loss,1.88627
train_accuracy,20.61481
train_loss,1.89322
validation_accuracy,19.78333
validation_loss,1.9082


[34m[1mwandb[0m: Agent Starting Run: esgf8htu with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▂▃▄▆█▁▁▁▁▁
train_loss,▁▁▂▄▅▆▇▇██
validation_accuracy,▂▃▄▆█▁▁▁▁▁
validation_loss,▂▁▂▄▅▆▇▇██

0,1
epoch,10.0
metric_name,10.35
test_accuracy,10.0
test_loss,2.3026
train_accuracy,9.96111
train_loss,2.30261
validation_accuracy,10.35
validation_loss,2.30252


[34m[1mwandb[0m: Agent Starting Run: vcu4ptzz with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▆███
train_loss,█▇▆▄▁
validation_accuracy,▁▆███
validation_loss,█▇▆▄▁

0,1
epoch,5.0
metric_name,33.61667
test_accuracy,34.46
test_loss,2.2635
train_accuracy,33.81111
train_loss,2.26333
validation_accuracy,33.61667
validation_loss,2.26384


[34m[1mwandb[0m: Agent Starting Run: xpusqc6p with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▇▁▄▂▄▂▂███
train_loss,▃█▃▂▁▃▃▇▅▄
validation_accuracy,▂█▅▇▅▇▇▁▁▁
validation_loss,▃█▃▂▁▃▃▇▅▄

0,1
epoch,10.0
metric_name,9.78333
test_accuracy,10.0
test_loss,7.31248
train_accuracy,10.02407
train_loss,7.31543
validation_accuracy,9.78333
validation_loss,7.28598


[34m[1mwandb[0m: Agent Starting Run: azbonjlv with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▃▅▇█
train_loss,█▆▄▂▁
validation_accuracy,▁▃▅▇█
validation_loss,█▆▄▂▁

0,1
epoch,5.0
metric_name,39.56667
test_accuracy,37.85
test_loss,2.09883
train_accuracy,38.68889
train_loss,2.09683
validation_accuracy,39.56667
validation_loss,2.09664


[34m[1mwandb[0m: Agent Starting Run: wdz6zwp9 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▃▄▆▇▇▇███
train_loss,█▆▅▃▂▂▂▂▂▁
validation_accuracy,▁▃▅█▇▇████
validation_loss,█▆▄▂▂▁▁▁▂▁

0,1
epoch,10.0
metric_name,85.55
test_accuracy,85.38
test_loss,0.41778
train_accuracy,87.47963
train_loss,0.34119
validation_accuracy,85.55
validation_loss,0.39404


[34m[1mwandb[0m: Agent Starting Run: hb486l9m with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▅▆▇▇▇████
train_loss,█▅▄▃▂▂▁▁▁▁
validation_accuracy,▁▄▆▇▇▇████
validation_loss,█▅▄▃▂▂▂▁▁▁

0,1
epoch,10.0
metric_name,56.75
test_accuracy,57.12
test_loss,1.79363
train_accuracy,57.43148
train_loss,1.79047
validation_accuracy,56.75
validation_loss,1.79368


[34m[1mwandb[0m: Agent Starting Run: tmt00zbv with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,█▁▄▆▆
train_loss,█▁▁▁▁
validation_accuracy,█▁▅▆▆
validation_loss,█▂▁▁▁

0,1
epoch,5.0
metric_name,33.11667
test_accuracy,33.67
test_loss,1.75822
train_accuracy,33.27037
train_loss,1.74837
validation_accuracy,33.11667
validation_loss,1.7536


[34m[1mwandb[0m: Agent Starting Run: 8dubpizj with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▅█▇▇▁
train_loss,▂▁▁▄█
validation_accuracy,▄▇██▁
validation_loss,▂▁▁▃█

0,1
epoch,5.0
metric_name,40.68333
test_accuracy,38.29
test_loss,1.77847
train_accuracy,39.8463
train_loss,1.74348
validation_accuracy,40.68333
validation_loss,1.74563


[34m[1mwandb[0m: Agent Starting Run: 1fn3t9pw with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▆▇█
train_loss,█▅▃▂▁
validation_accuracy,▁▅▇▇█
validation_loss,█▄▂▂▁

0,1
epoch,5.0
metric_name,86.46667
test_accuracy,85.97
test_loss,0.38778
train_accuracy,88.25741
train_loss,0.32838
validation_accuracy,86.46667
validation_loss,0.36783


[34m[1mwandb[0m: Agent Starting Run: ro02otkx with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄█▃▃▃▃▃▃▃
train_loss,▁▃▁█▇▇▄▆▅▃
validation_accuracy,▁▄█▁▁▁▁▁▁▁
validation_loss,▁▃▁█▇▇▄▆▅▄

0,1
epoch,10.0
metric_name,9.81667
test_accuracy,10.03
test_loss,15.18198
train_accuracy,10.05185
train_loss,15.15554
validation_accuracy,9.81667
validation_loss,15.352


[34m[1mwandb[0m: Agent Starting Run: ftyj6g80 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▆▇█
train_loss,█▅▃▂▁
validation_accuracy,▁▅▆▇█
validation_loss,█▄▂▁▁

0,1
epoch,5.0
metric_name,87.91667
test_accuracy,86.98
test_loss,0.36549
train_accuracy,89.3963
train_loss,0.29094
validation_accuracy,87.91667
validation_loss,0.33687


[34m[1mwandb[0m: Agent Starting Run: fketfhrw with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄██▇
train_loss,█▆▄▂▁
validation_accuracy,▁▄█▆▇
validation_loss,█▆▄▂▁

0,1
epoch,5.0
metric_name,13.5
test_accuracy,13.55
test_loss,6.8858
train_accuracy,13.68519
train_loss,6.84209
validation_accuracy,13.5
validation_loss,6.84621


[34m[1mwandb[0m: Agent Starting Run: u7xfxjai with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▅▆▇▇▇▇███
train_loss,█▅▃▃▂▂▂▁▁▁
validation_accuracy,▁▆▆▇▇▇████
validation_loss,█▅▃▃▂▂▂▁▁▁

0,1
epoch,10.0
metric_name,83.8
test_accuracy,83.17
test_loss,0.50095
train_accuracy,84.52222
train_loss,0.4635
validation_accuracy,83.8
validation_loss,0.46845


[34m[1mwandb[0m: Agent Starting Run: dihqtifa with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▁█▂█
train_loss,▁▂▅▆█
validation_accuracy,██▁▇▁
validation_loss,▁▁▆▆█

0,1
epoch,5.0
metric_name,9.16667
test_accuracy,10.0
test_loss,2.31685
train_accuracy,10.09259
train_loss,2.31659
validation_accuracy,9.16667
validation_loss,2.3192


[34m[1mwandb[0m: Agent Starting Run: p63njj9i with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▅█▇▅▅▆▆▅▃▁
train_loss,▄▁▃▄▄▄▄▅▇█
validation_accuracy,▅██▅▅▇▆▅▃▁
validation_loss,▄▁▂▄▄▄▄▅▇█

0,1
epoch,10.0
metric_name,36.15
test_accuracy,36.2
test_loss,1.86677
train_accuracy,36.46111
train_loss,1.857
validation_accuracy,36.15
validation_loss,1.85574


[34m[1mwandb[0m: Agent Starting Run: o99eegc0 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▂▄▆█
train_loss,█▆▄▂▁
validation_accuracy,▁▁▄▆█
validation_loss,█▆▄▂▁

0,1
epoch,5.0
metric_name,12.13333
test_accuracy,12.88
test_loss,4.60705
train_accuracy,12.73333
train_loss,4.61317
validation_accuracy,12.13333
validation_loss,4.70475


[34m[1mwandb[0m: Agent Starting Run: cjs7v50f with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▆▇█
train_loss,█▅▃▂▁
validation_accuracy,▁▄▇▇█
validation_loss,█▄▃▂▁

0,1
epoch,5.0
metric_name,69.43333
test_accuracy,69.43
test_loss,0.84609
train_accuracy,70.36667
train_loss,0.81096
validation_accuracy,69.43333
validation_loss,0.83134


[34m[1mwandb[0m: Agent Starting Run: 7fj0nk23 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▆▇█
train_loss,█▅▃▂▁
validation_accuracy,▁▄▆▇█
validation_loss,█▅▃▂▁

0,1
epoch,5.0
metric_name,55.7
test_accuracy,55.53
test_loss,2.75085
train_accuracy,58.5463
train_loss,2.43192
validation_accuracy,55.7
validation_loss,2.71764


[34m[1mwandb[0m: Agent Starting Run: mwre4jvy with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▁▁▁▁▁▁▁▁▁
train_loss,▁▇████████
validation_accuracy,▁▁▁▁▁▁▁▁▁▁
validation_loss,█▁▁▁▁▁▁▁▁▁

0,1
epoch,10.0
metric_name,9.16667
test_accuracy,10.0
test_loss,2.30338
train_accuracy,10.09259
train_loss,2.30329
validation_accuracy,9.16667
validation_loss,2.30416


[34m[1mwandb[0m: Agent Starting Run: j2rsa2px with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▅▆▆▇▇███
train_loss,█▅▄▃▃▂▂▁▁▁
validation_accuracy,▁▄▆▆▇▇▇███
validation_loss,█▅▄▃▂▂▂▁▁▁

0,1
epoch,10.0
metric_name,80.91667
test_accuracy,80.78
test_loss,0.55201
train_accuracy,83.28889
train_loss,0.47174
validation_accuracy,80.91667
validation_loss,0.51683


[34m[1mwandb[0m: Agent Starting Run: xot5a2s5 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▅▇▇▇████
train_loss,█▅▃▂▂▂▁▁▁▁
validation_accuracy,▁▄▅▇▇▇████
validation_loss,█▅▃▃▂▂▁▁▁▁

0,1
epoch,10.0
metric_name,79.15
test_accuracy,78.52
test_loss,0.59033
train_accuracy,79.84259
train_loss,0.56176
validation_accuracy,79.15
validation_loss,0.56408


[34m[1mwandb[0m: Agent Starting Run: qmz8hwra with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,█▅▆▃▁
train_loss,█▃▂▁▁
validation_accuracy,█▅▅▃▁
validation_loss,█▃▂▁▁

0,1
epoch,5.0
metric_name,4.11667
test_accuracy,3.97
test_loss,2.30367
train_accuracy,4.04259
train_loss,2.3037
validation_accuracy,4.11667
validation_loss,2.30367


[34m[1mwandb[0m: Agent Starting Run: ht386ruq with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▁▁▁▁▁▁█▁▁
train_loss,█▄▂▂▁▁▁▁▁▁
validation_accuracy,▂▂▂▂▂▂▂█▁▁
validation_loss,█▅▃▂▁▁▁▁▁▁

0,1
epoch,10.0
metric_name,9.16667
test_accuracy,10.0
test_loss,2.30251
train_accuracy,10.09259
train_loss,2.30249
validation_accuracy,9.16667
validation_loss,2.30273


[34m[1mwandb[0m: Agent Starting Run: hbicbpx0 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,█▂▃▂▂▁▁▁▁▁
train_loss,█▁▁▁▁▁▁▁▁▁
validation_accuracy,█▃▃▃▂▁▁▁▁▁
validation_loss,█▁▁▁▁▁▁▁▁▁

0,1
epoch,10.0
metric_name,9.95
test_accuracy,10.0
test_loss,2.3026
train_accuracy,10.00556
train_loss,2.30259
validation_accuracy,9.95
validation_loss,2.30314


[34m[1mwandb[0m: Agent Starting Run: gbxtf2mu with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▆██▇▇▇▇▇█
train_loss,█▃▁▁▂▂▂▂▂▁
validation_accuracy,▁▅▇██▇▇▇▇▇
validation_loss,█▃▁▁▁▂▁▁▁▁

0,1
epoch,10.0
metric_name,80.13333
test_accuracy,79.19
test_loss,0.6385
train_accuracy,80.12222
train_loss,0.61284
validation_accuracy,80.13333
validation_loss,0.61246


[34m[1mwandb[0m: Agent Starting Run: k76s7dc8 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▅▇▇█
train_loss,█▄▂▁▁
validation_accuracy,▁▆▇██
validation_loss,█▄▂▁▁

0,1
epoch,5.0
metric_name,72.41667
test_accuracy,71.81
test_loss,0.76066
train_accuracy,73.19074
train_loss,0.72442
validation_accuracy,72.41667
validation_loss,0.73942


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 891wgsig with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▅▇▂▇█▆▄▄▃▁
train_loss,▅▂▇▁▁▃▅▅▆█
validation_accuracy,▅▇▂▇█▅▄▄▃▁
validation_loss,▅▃▇▁▁▄▆▅▇█

0,1
epoch,10.0
metric_name,9.6
test_accuracy,8.74
test_loss,18.78954
train_accuracy,9.02407
train_loss,18.71643
validation_accuracy,9.6
validation_loss,18.60374


[34m[1mwandb[0m: Agent Starting Run: l4tx59zp with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▇█▆▂▁▇▇▆▆▆
train_loss,█▆▄▃▂▂▂▁▁▁
validation_accuracy,▇█▅▂▁▆▆▇▇▇
validation_loss,█▆▄▃▂▂▂▁▁▁

0,1
epoch,10.0
metric_name,10.13333
test_accuracy,10.0
test_loss,2.46546
train_accuracy,9.98519
train_loss,2.4646
validation_accuracy,10.13333
validation_loss,2.47112


[34m[1mwandb[0m: Agent Starting Run: pzazo8a0 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▆▇███████
train_loss,█▃▂▂▁▁▁▁▁▁
validation_accuracy,▁▆▇███████
validation_loss,█▄▂▂▁▁▁▁▁▁

0,1
epoch,10.0
metric_name,83.25
test_accuracy,82.07
test_loss,0.52961
train_accuracy,83.56111
train_loss,0.49799
validation_accuracy,83.25
validation_loss,0.50139


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: tmp4wmeq with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▃▄▄▆▇▇███
train_loss,█▆▅▄▃▂▂▁▁▁
validation_accuracy,▁▂▃▄▆▇▇███
validation_loss,█▆▅▄▃▂▂▁▁▁

0,1
epoch,10.0
metric_name,59.88333
test_accuracy,59.66
test_loss,0.9818
train_accuracy,60.44074
train_loss,0.94034
validation_accuracy,59.88333
validation_loss,0.96743


[34m[1mwandb[0m: Agent Starting Run: 3hqulhkm with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▃▅▅▆▇▇▇██
train_loss,█▆▄▄▃▂▂▂▁▁
validation_accuracy,▁▄▆▆▇▇████
validation_loss,█▅▃▃▂▂▁▁▁▁

0,1
epoch,10.0
metric_name,88.15
test_accuracy,87.09
test_loss,0.36311
train_accuracy,90.09259
train_loss,0.26672
validation_accuracy,88.15
validation_loss,0.33535


[34m[1mwandb[0m: Agent Starting Run: eer4f7y7 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▃█▆▅
train_loss,█▆▁▃▅
validation_accuracy,▁▃█▇▅
validation_loss,█▆▁▃▅

0,1
epoch,5.0
metric_name,38.1
test_accuracy,38.36
test_loss,12.29878
train_accuracy,38.57222
train_loss,12.29224
validation_accuracy,38.1
validation_loss,12.39263


[34m[1mwandb[0m: Agent Starting Run: sxcbmwkl with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▅▆▆▇▇▇██
train_loss,█▅▄▃▃▂▂▁▁▁
validation_accuracy,▁▃▅▆▆▇▇███
validation_loss,█▅▄▃▂▂▂▁▁▁

0,1
epoch,10.0
metric_name,87.88333
test_accuracy,86.79
test_loss,0.37678
train_accuracy,88.55741
train_loss,0.31852
validation_accuracy,87.88333
validation_loss,0.34539


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 8njrjvb1 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▃▅▆▆▇▇▇██
train_loss,█▅▃▂▂▂▁▁▁▁
validation_accuracy,▁▄▅▆▇▇▇███
validation_loss,█▅▃▂▂▂▁▁▁▁

0,1
epoch,10.0
metric_name,65.83333
test_accuracy,65.45
test_loss,1.05554
train_accuracy,68.41852
train_loss,0.92117
validation_accuracy,65.83333
validation_loss,1.0459


[34m[1mwandb[0m: Agent Starting Run: yr24yudl with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▆▇█
train_loss,█▅▃▂▁
validation_accuracy,▁▄▆▇█
validation_loss,█▅▃▂▁

0,1
epoch,5.0
metric_name,42.56667
test_accuracy,41.16
test_loss,2.89138
train_accuracy,43.19074
train_loss,2.70603
validation_accuracy,42.56667
validation_loss,2.79897


[34m[1mwandb[0m: Agent Starting Run: 4t2obdby with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▅▃█▇
train_loss,█▃▆▁▂
validation_accuracy,▁▆▃█▆
validation_loss,█▃▆▁▂

0,1
epoch,5.0
metric_name,84.56667
test_accuracy,83.8
test_loss,0.45056
train_accuracy,85.92037
train_loss,0.38066
validation_accuracy,84.56667
validation_loss,0.42545


[34m[1mwandb[0m: Agent Starting Run: zw1atedr with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▅▆▆▇▇▇███
train_loss,█▄▃▂▂▂▁▁▁▁
validation_accuracy,▁▅▆▆▇▇▇███
validation_loss,█▄▃▂▂▂▁▁▁▁

0,1
epoch,10.0
metric_name,79.06667
test_accuracy,79.0
test_loss,0.59963
train_accuracy,80.00741
train_loss,0.57456
validation_accuracy,79.06667
validation_loss,0.58248


[34m[1mwandb[0m: Agent Starting Run: dbb0zx12 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▅▇▇▇▇████
train_loss,█▄▃▂▂▂▂▁▁▁
validation_accuracy,▁▅▇███████
validation_loss,█▄▂▂▂▁▁▁▁▁

0,1
epoch,10.0
metric_name,85.96667
test_accuracy,84.95
test_loss,0.42908
train_accuracy,86.84259
train_loss,0.37973
validation_accuracy,85.96667
validation_loss,0.39777


[34m[1mwandb[0m: Agent Starting Run: hlytsqzi with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▅▆▇█
train_loss,█▅▃▂▁
validation_accuracy,▁▅▆▇█
validation_loss,█▄▃▂▁

0,1
epoch,5.0
metric_name,86.31667
test_accuracy,85.51
test_loss,0.40621
train_accuracy,87.45556
train_loss,0.35963
validation_accuracy,86.31667
validation_loss,0.37469


[34m[1mwandb[0m: Agent Starting Run: wijkqwjo with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▁▁▁▁
train_loss,▁▁▁▁▁
validation_accuracy,▁▁▁▁▁
validation_loss,▁▁▁▁▁

0,1
epoch,5.0
metric_name,10.31667
test_accuracy,10.0
test_loss,2.30348
train_accuracy,9.96481
train_loss,2.3036
validation_accuracy,10.31667
validation_loss,2.30235


[34m[1mwandb[0m: Agent Starting Run: jekilo6c with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,█▇▁▁▁▅█▇▁▂
train_loss,▁▂▃▂▄▃▃▅█▂
validation_accuracy,██▂▂▂▆▇▇▁▄
validation_loss,▁▂▃▂▄▃▃▅█▂

0,1
epoch,10.0
metric_name,11.13333
test_accuracy,10.78
test_loss,9.54098
train_accuracy,10.66667
train_loss,9.55585
validation_accuracy,11.13333
validation_loss,9.5153


[34m[1mwandb[0m: Agent Starting Run: munctl08 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,█▄█▁▂
train_loss,▇█▇▂▁
validation_accuracy,▁▅▁█▇
validation_loss,███▂▁

0,1
epoch,5.0
metric_name,10.35
test_accuracy,10.0
test_loss,2.32253
train_accuracy,9.96111
train_loss,2.32238
validation_accuracy,10.35
validation_loss,2.32381


[34m[1mwandb[0m: Agent Starting Run: g4vy1zw4 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▅▆▇▇▇███
train_loss,█▅▃▃▂▂▁▁▁▁
validation_accuracy,▁▄▆▆▇▇████
validation_loss,█▅▃▃▂▂▁▁▁▁

0,1
epoch,10.0
metric_name,84.36667
test_accuracy,83.64
test_loss,0.4694
train_accuracy,85.16111
train_loss,0.43182
validation_accuracy,84.36667
validation_loss,0.44341


[34m[1mwandb[0m: Agent Starting Run: an21759z with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▆▇█
train_loss,█▄▂▂▁
validation_accuracy,▁▄▆▇█
validation_loss,█▄▂▂▁

0,1
epoch,5.0
metric_name,83.1
test_accuracy,82.34
test_loss,0.5062
train_accuracy,83.81296
train_loss,0.47402
validation_accuracy,83.1
validation_loss,0.48561


[34m[1mwandb[0m: Agent Starting Run: 4w1yxyw7 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▃▄▅▆▆▇▇██
train_loss,█▆▅▄▃▃▂▂▁▁
validation_accuracy,▁▄▅▆▆▇▇▇██
validation_loss,█▅▄▃▃▂▂▂▁▁

0,1
epoch,10.0
metric_name,87.98333
test_accuracy,86.99
test_loss,0.37085
train_accuracy,89.53519
train_loss,0.28903
validation_accuracy,87.98333
validation_loss,0.33785


[34m[1mwandb[0m: Agent Starting Run: 7e0ryp29 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▃▄▅▆▆▇▇██
train_loss,█▅▃▂▂▂▁▁▁▁
validation_accuracy,▁▃▅▅▆▆▇▇██
validation_loss,█▅▃▂▂▂▁▁▁▁

0,1
epoch,10.0
metric_name,77.73333
test_accuracy,76.39
test_loss,0.66311
train_accuracy,78.49259
train_loss,0.59384
validation_accuracy,77.73333
validation_loss,0.62301


[34m[1mwandb[0m: Agent Starting Run: 02soffe8 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,█▄▄▆██▇▁▁▁
train_loss,▂▁▃▃▅▂▂▂▅█
validation_accuracy,▁▅▅▃▁▁▂███
validation_loss,▂▁▃▃▄▂▂▂▄█

0,1
epoch,10.0
metric_name,10.45
test_accuracy,10.0
test_loss,3.3071
train_accuracy,9.95
train_loss,3.30745
validation_accuracy,10.45
validation_loss,3.30394


[34m[1mwandb[0m: Agent Starting Run: 22lyvts4 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▅▅█
train_loss,█▃▁▂▃
validation_accuracy,█▅▄▄▁
validation_loss,█▃▁▂▃

0,1
epoch,5.0
metric_name,9.16667
test_accuracy,10.0
test_loss,2.36006
train_accuracy,10.09259
train_loss,2.35915
validation_accuracy,9.16667
validation_loss,2.36824


[34m[1mwandb[0m: Agent Starting Run: 1sgsu0mi with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▅▆▇▇▇████
train_loss,█▄▃▂▂▂▂▁▁▁
validation_accuracy,▁▆▆▇▇▇████
validation_loss,█▄▃▂▂▂▂▁▁▁

0,1
epoch,10.0
metric_name,78.58333
test_accuracy,78.05
test_loss,0.62308
train_accuracy,78.90185
train_loss,0.60424
validation_accuracy,78.58333
validation_loss,0.60298


[34m[1mwandb[0m: Agent Starting Run: xngkvbcr with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▅▆▇█
train_loss,█▄▃▂▁
validation_accuracy,▁▅▇▇█
validation_loss,█▄▃▂▁

0,1
epoch,5.0
metric_name,44.93333
test_accuracy,44.48
test_loss,2.03887
train_accuracy,44.94074
train_loss,2.03729
validation_accuracy,44.93333
validation_loss,2.03967


[34m[1mwandb[0m: Agent Starting Run: 4irlp33l with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▃▆▆▇▇████
train_loss,█▄▂▂▂▁▁▁▁▁
validation_accuracy,▁▄▅▆▆▇▇██▇
validation_loss,█▄▃▂▂▂▁▁▁▁

0,1
epoch,10.0
metric_name,79.03333
test_accuracy,78.74
test_loss,0.63433
train_accuracy,80.88333
train_loss,0.573
validation_accuracy,79.03333
validation_loss,0.60926


[34m[1mwandb[0m: Agent Starting Run: huvhnygb with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,█▆▂▁▂▂▂▄▂▂
train_loss,▁▁▄▄▆▆▆▇▇█
validation_accuracy,█▆▂▁▂▂▂▄▂▂
validation_loss,▁▂▄▄▆▆▇▇▇█

0,1
epoch,10.0
metric_name,9.78333
test_accuracy,10.0
test_loss,2.38792
train_accuracy,10.02407
train_loss,2.38769
validation_accuracy,9.78333
validation_loss,2.3925


[34m[1mwandb[0m: Agent Starting Run: f106jlvz with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▇▇█
train_loss,█▅▂▂▁
validation_accuracy,▁▄▆▆█
validation_loss,█▅▂▂▁

0,1
epoch,5.0
metric_name,83.93333
test_accuracy,82.69
test_loss,0.49017
train_accuracy,84.32778
train_loss,0.43318
validation_accuracy,83.93333
validation_loss,0.45155


[34m[1mwandb[0m: Agent Starting Run: 5q27vzkm with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▆▇█
train_loss,█▄▃▂▁
validation_accuracy,▁▄▆▇█
validation_loss,█▄▃▂▁

0,1
epoch,5.0
metric_name,52.46667
test_accuracy,52.15
test_loss,1.43893
train_accuracy,52.82963
train_loss,1.41505
validation_accuracy,52.46667
validation_loss,1.42734


[34m[1mwandb[0m: Agent Starting Run: jwf2z4vt with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▅▆▇█
train_loss,█▄▂▂▁
validation_accuracy,▁▅▇▇█
validation_loss,█▄▂▂▁

0,1
epoch,5.0
metric_name,77.11667
test_accuracy,76.12
test_loss,0.68963
train_accuracy,78.22222
train_loss,0.62567
validation_accuracy,77.11667
validation_loss,0.66175


[34m[1mwandb[0m: Agent Starting Run: q2bw7bff with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▆▇▇█
train_loss,█▃▂▂▁
validation_accuracy,▁▆▇▇█
validation_loss,█▁▂▃▂

0,1
epoch,5.0
metric_name,84.91667
test_accuracy,83.69
test_loss,0.44986
train_accuracy,85.51481
train_loss,0.39945
validation_accuracy,84.91667
validation_loss,0.4203


[34m[1mwandb[0m: Agent Starting Run: hfuxktwo with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▅▅▅▅▃▂▁▆▇█
train_loss,█▇▆▅▄▃▃▂▂▁
validation_accuracy,██▅▄▄▂▄▂▁▃
validation_loss,█▇▆▅▄▃▃▂▂▁

0,1
epoch,10.0
metric_name,8.75
test_accuracy,9.28
test_loss,10.63776
train_accuracy,9.08148
train_loss,10.62791
validation_accuracy,8.75
validation_loss,10.74035


[34m[1mwandb[0m: Agent Starting Run: 3i8x22tr with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▁▁▁▁▆▆██▇
train_loss,▁▁▁▂▂▂▁▅█▄
validation_accuracy,▁▁▂▁▁▆▆███
validation_loss,▁▁▁▂▂▂▁▅█▄

0,1
epoch,10.0
metric_name,18.95
test_accuracy,19.16
test_loss,3.25021
train_accuracy,19.19074
train_loss,3.26912
validation_accuracy,18.95
validation_loss,3.26456


[34m[1mwandb[0m: Agent Starting Run: 423sakbf with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▁▄▄█
train_loss,████▁
validation_accuracy,▁▁▄▄█
validation_loss,████▁

0,1
epoch,5.0
metric_name,31.18333
test_accuracy,30.55
test_loss,1.72367
train_accuracy,30.38704
train_loss,1.72224
validation_accuracy,31.18333
validation_loss,1.72624


[34m[1mwandb[0m: Agent Starting Run: dhkwdrg6 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▆██
train_loss,█▅▃▂▁
validation_accuracy,▁▅██▇
validation_loss,█▅▂▂▁

0,1
epoch,5.0
metric_name,87.4
test_accuracy,86.58
test_loss,0.38161
train_accuracy,89.05741
train_loss,0.29807
validation_accuracy,87.4
validation_loss,0.35414


[34m[1mwandb[0m: Agent Starting Run: 1sc34box with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁█▆▆▆
train_loss,▆▁█▄▄
validation_accuracy,▁█▇▇▇
validation_loss,▆▁█▄▄

0,1
epoch,5.0
metric_name,10.45
test_accuracy,10.0
test_loss,12.75278
train_accuracy,9.95
train_loss,12.77088
validation_accuracy,10.45
validation_loss,12.58989


[34m[1mwandb[0m: Agent Starting Run: uogs5216 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▆▇█
train_loss,█▅▃▂▁
validation_accuracy,▁▅▆▇█
validation_loss,█▄▃▂▁

0,1
epoch,5.0
metric_name,83.46667
test_accuracy,82.14
test_loss,0.50584
train_accuracy,84.86296
train_loss,0.42408
validation_accuracy,83.46667
validation_loss,0.47719


[34m[1mwandb[0m: Agent Starting Run: nekd7ndw with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▃▄▄▅▆▆▇██
train_loss,█▅▄▃▂▁▁▁▁▁
validation_accuracy,▁▃▄▄▅▆▆▇██
validation_loss,█▅▄▃▂▁▁▁▁▁

0,1
epoch,10.0
metric_name,75.7
test_accuracy,75.2
test_loss,0.70485
train_accuracy,75.95926
train_loss,0.67278
validation_accuracy,75.7
validation_loss,0.68856


[34m[1mwandb[0m: Agent Starting Run: 0kayghuf with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁█████████
train_loss,█▁▁▁▁▁▁▁▁▁
validation_accuracy,█▁▁▁▁▁▁▁▁▁
validation_loss,█▁▁▁▁▁▁▁▁▁

0,1
epoch,10.0
metric_name,9.16667
test_accuracy,10.0
test_loss,2.30338
train_accuracy,10.09259
train_loss,2.30329
validation_accuracy,9.16667
validation_loss,2.30416


[34m[1mwandb[0m: Agent Starting Run: 8mf0ekqo with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▅▆▇█
train_loss,█▄▃▂▁
validation_accuracy,▁▅▇▇█
validation_loss,█▄▂▂▁

0,1
epoch,5.0
metric_name,64.98333
test_accuracy,65.47
test_loss,6.76326
train_accuracy,66.57778
train_loss,6.53718
validation_accuracy,64.98333
validation_loss,6.83912


[34m[1mwandb[0m: Agent Starting Run: apcus7vn with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,█▅▅▅▁
train_loss,▁▃▅▅█
validation_accuracy,█▅▅▅▁
validation_loss,▁▃▅▄█

0,1
epoch,5.0
metric_name,5.51667
test_accuracy,5.91
test_loss,2.33368
train_accuracy,5.69815
train_loss,2.33405
validation_accuracy,5.51667
validation_loss,2.3298


[34m[1mwandb[0m: Agent Starting Run: lqw1iqtt with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▅▆▆▇▇▇███
train_loss,█▅▃▃▂▂▂▁▁▁
validation_accuracy,▁▄▆▇▇▇████
validation_loss,█▄▃▃▂▂▂▁▁▁

0,1
epoch,10.0
metric_name,78.23333
test_accuracy,77.58
test_loss,0.65022
train_accuracy,79.86296
train_loss,0.58385
validation_accuracy,78.23333
validation_loss,0.62835


[34m[1mwandb[0m: Agent Starting Run: 4ca5bvbt with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▃▄▅▆▆▇▇██
train_loss,█▅▃▂▂▂▁▁▁▁
validation_accuracy,▁▃▄▅▆▆▇▇██
validation_loss,█▅▃▂▂▂▁▁▁▁

0,1
epoch,10.0
metric_name,59.06667
test_accuracy,59.58
test_loss,1.21349
train_accuracy,60.59444
train_loss,1.15843
validation_accuracy,59.06667
validation_loss,1.20293


[34m[1mwandb[0m: Agent Starting Run: 6i63ma3r with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


In [None]:
# !pkill -9 -f wandb  # Kill all WandB processes
# !wandb login --relogin
# !wandb login --cloud