## Installation

In [1]:
pip install wandb numpy pandas matplotlib

Note: you may need to restart the kernel to use updated packages.


## Q1: fashion-MNIST dataset

In [2]:
# import wandb
# from keras.datasets import fashion_mnist

# (x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()

# classes = {
#     0: "T-shirt/top",
#     1: "Trouser",
#     2: "Pullover",
#     3: "Dress",
#     4: "Coat",
#     5: "Sandal",
#     6: "Shirt",
#     7: "Sneaker",
#     8: "Bag",
#     9: "Ankle boot"
# }

# def logClassImages(project_name:str):
#   wandb.init(project=project_name)
#   wandb_image_indices = []

#   for classNumber in range(10):
#     for j in range(len(y_test)):
#       if y_test[j] == classNumber:
#         wandb_image_indices.append(x_test[j])
#         break

#   wandb_images = [wandb.Image(wandb_image_indices[i], caption = classes[i]) for i in range(10)]
#   wandb.log({"Sample images for each class": wandb_images})
#   wandb.finish()

# # logClassImages("da6401_assignment1")

## Feedforward neural network



### Libraries required

In [3]:
import wandb
from keras.datasets import fashion_mnist
import numpy as np
import copy

### Activation functions

In [4]:
"""
  ACTIVATION FUNCTIONS
"""
def identity(x):
    return x

def sigmoid(x):
    # x = np.clip(x,-10,10)
    return 1 / (1 + np.exp(-x))

def tanh(x):
    return np.tanh(x)

def relu(x):
    # print(x)
    # x = np.clip(x, -200,200)
    return np.maximum(0, x)

def softmax(x):
    exp_x = np.exp(x - np.max(x, axis=-1, keepdims=True))  # Numerical Stability
    return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

### Loss Functions

In [5]:
"""
  LOSS FUNCTIONS
"""
def mean_squared_error(y_true, y_pred):
    return np.mean((y_true - y_pred) ** 2)

import numpy as np

def cross_entropy_loss(y_true, y_pred):
    return -np.sum(y_true * np.log(y_pred + 1e-9), axis=-1)    # 1e-9 to avy_pred + 1e-9oid log 0

### Derivatives

In [6]:
"""
  DERIVATIVES OF ACTIVATION AND LOSS FUNCTIONS
"""
def identity_derivative(x):
    return np.ones_like(x)

def sigmoid_derivative(x):
    sig = sigmoid(x)
    return sig * (1 - sig)

def tanh_derivative(x):
    return 1 - np.tanh(x) ** 2

def relu_derivative(x):
    return np.where(x > 0, 1, 0)

def mean_squared_error_derivative(y_true, y_pred):
    return y_pred - y_true

def cross_entropy_loss_derivative(y_true, y_pred):
    return -y_true / (y_pred + 1e-9)

def softmax_derivative(inp:np.array):
    derivates = []
    if(len(inp.shape) == 1):
      S_vector = inp.reshape(-1, 1)
      derivates = np.diag(inp) - np.dot(S_vector, S_vector.T)
    elif(len(inp.shape) == 2):
      for i in range(inp.shape[0]):
        S_vector = inp[i].reshape(-1, 1)
        derivates.append(np.diag(inp[i]) - np.dot(S_vector, S_vector.T))

    return np.array(derivates)

### Optimizers

In [7]:
"""
  OPTIMIZERS UPDATE RULES
"""

# STOCHASTIC GRADIENT DESCENT
def sgd(optimizer_input_dict, wts_bias_history_dict, itr=None):
  # cant update weights in one single matrix op as dimensions of weights can be different in each layer
  for i in range(optimizer_input_dict["n_hiddenLayers"]+1):
    # weight decay term added additionally to the formula in slides
    wts_bias_history_dict["weights"][i] = wts_bias_history_dict["weights"][i] - optimizer_input_dict["learning_rate"] * (wts_bias_history_dict["dw"][i] + (optimizer_input_dict["weight_decay"] * wts_bias_history_dict["weights"][i]))
    wts_bias_history_dict["biases"][i] = wts_bias_history_dict["biases"][i] - optimizer_input_dict["learning_rate"] * (wts_bias_history_dict["db"][i])

# MOMENTUM BASED GRADIENT DESCENT
def momentumGradientDescent(optimizer_input_dict, wts_bias_history_dict, itr=None):
  for i in range(optimizer_input_dict["n_hiddenLayers"]+1):
    wts_bias_history_dict["history_weights"][i] = (optimizer_input_dict["momentum"] * wts_bias_history_dict["history_weights"][i]) + wts_bias_history_dict["dw"][i]
    wts_bias_history_dict["weights"][i] = wts_bias_history_dict["weights"][i] - optimizer_input_dict["learning_rate"] * (wts_bias_history_dict["history_weights"][i] + optimizer_input_dict["weight_decay"] * wts_bias_history_dict["weights"][i])

    wts_bias_history_dict["history_biases"][i] = (optimizer_input_dict["momentum"] * wts_bias_history_dict["history_biases"][i]) + wts_bias_history_dict["db"][i]
    wts_bias_history_dict["biases"][i] = wts_bias_history_dict["biases"][i] - (optimizer_input_dict["learning_rate"] * wts_bias_history_dict["history_biases"][i])

# NAG(NESTEROV ACCELERATED GRADIENT DESCENT)
def nag(optimizer_input_dict, wts_bias_history_dict, itr=None):
  for i in range(optimizer_input_dict["n_hiddenLayers"]+1):
    # dw,db will contain lookahead gradients only since forward and backward propagations are implemented accordingly
    wts_bias_history_dict["history_weights"][i] = (optimizer_input_dict["momentum"] * wts_bias_history_dict["history_weights"][i]) + wts_bias_history_dict["dw"][i]
    wts_bias_history_dict["weights"][i] = wts_bias_history_dict["weights"][i] - optimizer_input_dict["learning_rate"] * (wts_bias_history_dict["history_weights"][i] +  (optimizer_input_dict["weight_decay"] * wts_bias_history_dict["weights"][i]))

    wts_bias_history_dict["history_biases"][i] = (optimizer_input_dict["momentum"] * wts_bias_history_dict["history_biases"][i]) + wts_bias_history_dict["db"][i]
    wts_bias_history_dict["biases"][i] = wts_bias_history_dict["biases"][i] - optimizer_input_dict["learning_rate"] * wts_bias_history_dict["history_biases"][i]

# RMSPROP
def rmsProp(optimizer_input_dict, wts_bias_history_dict, itr=None):
  for i in range(optimizer_input_dict["n_hiddenLayers"]+1):
    wts_bias_history_dict["history_weights"][i] = (optimizer_input_dict["beta"] * wts_bias_history_dict["history_weights"][i]) + (1 - optimizer_input_dict["beta"]) * (wts_bias_history_dict["dw"][i] ** 2)
    wts_bias_history_dict["weights"][i] = wts_bias_history_dict["weights"][i] - optimizer_input_dict["learning_rate"] *((wts_bias_history_dict["dw"][i]/np.sqrt(wts_bias_history_dict["history_weights"][i] + optimizer_input_dict["epsilon"]))  + (optimizer_input_dict["weight_decay"] * wts_bias_history_dict["weights"][i]))
    # wts_bias_history_dict["weights"][i] = wts_bias_history_dict["weights"][i] - ((optimizer_input_dict["learning_rate"]/np.sqrt(wts_bias_history_dict["history_weights"][i] + optimizer_input_dict["epsilon"])) * (wts_bias_history_dict["dw"][i]))


    wts_bias_history_dict["history_biases"][i] = (optimizer_input_dict["beta"] * wts_bias_history_dict["history_biases"][i]) + (1 - optimizer_input_dict["beta"]) * (wts_bias_history_dict["db"][i] ** 2)
    wts_bias_history_dict["biases"][i] = wts_bias_history_dict["biases"][i] - ((optimizer_input_dict["learning_rate"]/np.sqrt(wts_bias_history_dict["history_biases"][i] + optimizer_input_dict["epsilon"])) * (wts_bias_history_dict["db"][i]))

# ADAM
def adam(optimizer_input_dict, wts_bias_history_dict, itr=None):
  for i in range(optimizer_input_dict["n_hiddenLayers"]+1):
    wts_bias_history_dict["history_weights"][i] = (optimizer_input_dict["beta1"] * wts_bias_history_dict["history_weights"][i]) + (1 - optimizer_input_dict["beta1"]) * (wts_bias_history_dict["dw"][i])
    wts_bias_history_dict["second_history_weights"][i] = (optimizer_input_dict["beta2"] * wts_bias_history_dict["second_history_weights"][i]) + (1 - optimizer_input_dict["beta2"]) * (wts_bias_history_dict["dw"][i] ** 2)

    history_weights_hat = wts_bias_history_dict["history_weights"][i] / (1 - (optimizer_input_dict["beta1"] ** (itr)))
    second_history_weights_hat = wts_bias_history_dict["second_history_weights"][i] / (1 - (optimizer_input_dict["beta2"] ** (itr)))

    wts_bias_history_dict["weights"][i] = wts_bias_history_dict["weights"][i] - (optimizer_input_dict["learning_rate"]*((history_weights_hat/(np.sqrt(second_history_weights_hat) + optimizer_input_dict["epsilon"])))) - ((optimizer_input_dict["weight_decay"] * wts_bias_history_dict["weights"][i]))
    # wts_bias_history_dict["weights"][i] = wts_bias_history_dict["weights"][i] - ((optimizer_input_dict["learning_rate"]/(np.sqrt(second_history_weights_hat) + optimizer_input_dict["epsilon"])) * (history_weights_hat))

    wts_bias_history_dict["history_biases"][i] = (optimizer_input_dict["beta1"] * wts_bias_history_dict["history_biases"][i]) + (1 - optimizer_input_dict["beta1"]) * wts_bias_history_dict["db"][i]
    wts_bias_history_dict["second_history_biases"][i] = (optimizer_input_dict["beta2"] * wts_bias_history_dict["second_history_biases"][i]) + (1 - optimizer_input_dict["beta2"]) * (wts_bias_history_dict["db"][i] ** 2)

    history_biases_hat = wts_bias_history_dict["history_biases"][i] / (1 - (optimizer_input_dict["beta1"] ** (itr)))
    second_history_biases_hat = wts_bias_history_dict["second_history_biases"][i] / (1 - (optimizer_input_dict["beta2"] ** (itr)))

    wts_bias_history_dict["biases"][i] = wts_bias_history_dict["biases"][i] - ((optimizer_input_dict["learning_rate"]/(np.sqrt(second_history_biases_hat) + optimizer_input_dict["epsilon"])) * (history_biases_hat))

# NADAM
def nadam(optimizer_input_dict, wts_bias_history_dict, itr=None):
  for i in range(optimizer_input_dict["n_hiddenLayers"]+1):
    wts_bias_history_dict["history_weights"][i] = (optimizer_input_dict["beta1"] * wts_bias_history_dict["history_weights"][i]) + (1 - optimizer_input_dict["beta1"]) * (wts_bias_history_dict["dw"][i])
    wts_bias_history_dict["second_history_weights"][i] = (optimizer_input_dict["beta2"] * wts_bias_history_dict["second_history_weights"][i]) + (1 - optimizer_input_dict["beta2"]) * (wts_bias_history_dict["dw"][i] ** 2)

    history_weights_hat = wts_bias_history_dict["history_weights"][i] / (1 - (optimizer_input_dict["beta1"] ** itr))
    second_history_weights_hat = wts_bias_history_dict["second_history_weights"][i] / (1 - (optimizer_input_dict["beta2"] ** itr))

    lookahead_dw = optimizer_input_dict["beta1"] * history_weights_hat + (((1-optimizer_input_dict["beta1"])/(1-(optimizer_input_dict["beta1"] ** itr))) * wts_bias_history_dict["dw"][i])
    wts_bias_history_dict["weights"][i] = wts_bias_history_dict["weights"][i] - (optimizer_input_dict["learning_rate"]*(lookahead_dw/(np.sqrt(second_history_weights_hat) + optimizer_input_dict["epsilon"]))) - ((optimizer_input_dict["weight_decay"] * wts_bias_history_dict["weights"][i]))
    # wts_bias_history_dict["weights"][i] = wts_bias_history_dict["weights"][i] - ((optimizer_input_dict["learning_rate"]/(np.sqrt(second_history_weights_hat) + optimizer_input_dict["epsilon"])) * (lookahead_dw))


    wts_bias_history_dict["history_biases"][i] = (optimizer_input_dict["beta1"] * wts_bias_history_dict["history_biases"][i]) + (1 - optimizer_input_dict["beta1"]) * wts_bias_history_dict["db"][i]
    wts_bias_history_dict["second_history_biases"][i] = (optimizer_input_dict["beta2"] * wts_bias_history_dict["second_history_biases"][i]) + (1 - optimizer_input_dict["beta2"]) * (wts_bias_history_dict["db"][i] ** 2)

    history_biases_hat = wts_bias_history_dict["history_biases"][i] / (1 - (optimizer_input_dict["beta1"] ** itr))
    second_history_biases_hat = wts_bias_history_dict["second_history_biases"][i] / (1 - (optimizer_input_dict["beta2"] ** itr))

    lookahead_db = optimizer_input_dict["beta1"] * history_biases_hat + (((1-optimizer_input_dict["beta1"])/(1-(optimizer_input_dict["beta1"] ** itr))) * wts_bias_history_dict["db"][i])
    wts_bias_history_dict["biases"][i] = wts_bias_history_dict["biases"][i] - ((optimizer_input_dict["learning_rate"]/(np.sqrt(second_history_biases_hat) + optimizer_input_dict["epsilon"])) * (lookahead_db))


### Network (forward and back propagation)

In [8]:
class FeedForwardNeuralNetwork:
  # class variables
  optimizersMap = {"sgd": sgd, "momentum": momentumGradientDescent, "nag": nag, "rmsprop": rmsProp, "adam": adam, "nadam": nadam}
  lossFunctionsMap = {"mean_squared_error": mean_squared_error, "cross_entropy" : cross_entropy_loss}
  activationFunctionsMap = {"identity":identity, "sigmoid":sigmoid, "tanh":tanh, "ReLU":relu, "softmax": softmax}
  derivatesFuncMap = {"mean_squared_error": mean_squared_error_derivative, "cross_entropy_loss": cross_entropy_loss_derivative, "identity": identity_derivative,
                      "sigmoid": sigmoid_derivative, "tanh": tanh_derivative, "relu": relu_derivative, "softmax": softmax_derivative}

  def __init__(self,
               input_size=784, output_size=10,
               n_hiddenLayers=3, n_neuronsPerLayer=32,
               activationFun="sigmoid",
               weight_init="random",
               batch_size=64,
               lossFunc="cross_entropy",
               optimizer="adam",
               learning_rate=0.001,
               momentum=0.5,
               beta=0.9, beta1=0.9, beta2=0.99,
               epsilon=1e-8, weight_decay=0.01,
               epochs=10):

    # Inialtization parameters
    self.input_size = input_size  # no of features
    self.output_size = output_size
    self.n_hiddenLayers = n_hiddenLayers
    self.n_neuronsPerLayer = n_neuronsPerLayer
    self.weight_init = weight_init
    self.epochs = epochs

    self.activationFun = FeedForwardNeuralNetwork.activationFunctionsMap[activationFun]
    self.lossFunc = FeedForwardNeuralNetwork.lossFunctionsMap[lossFunc]
    self.optimizer = FeedForwardNeuralNetwork.optimizersMap[optimizer]

    # paramters required for optimizers
    self.batch_size = batch_size
    self.isLookAhead = False;

    if(optimizer == "nag"):
      self.isLookAhead = True;

    # add these parameters as dict
    self.optimizer_input_dict = { "learning_rate" : learning_rate,
                                  "momentum" : momentum,                  # used by momentumGD
                                  "beta" : beta,                          # used by rmsprop
                                  "beta1" : beta1,                        # used by adam & nadam
                                  "beta2" : beta2,                        # used by adam & nadam
                                  "epsilon" : epsilon,
                                  "weight_decay" : weight_decay,
                                  "n_hiddenLayers": n_hiddenLayers}

    # weights and biases matrices
    self.weights = []
    self.biases = []
    self.lookAheadWeights = []
    self.lookAheadBiases = []

    self.wts_bias_history_dict = {"weights": self.weights, "biases": self.biases,
                                  "history_weights": [np.zeros(1) for _ in range(self.n_hiddenLayers+1)],         # these will be modified before their first use (dimensions of each values will also be changed)
                                  "history_biases": [np.zeros(1) for _ in range(self.n_hiddenLayers+1)],
                                  "second_history_weights": [np.zeros(1) for _ in range(self.n_hiddenLayers+1)],
                                  "second_history_biases": [np.zeros(1) for _ in range(self.n_hiddenLayers+1)],
                                  "dw": [np.empty(1) for _ in range(self.n_hiddenLayers+1)],
                                  "dh": [np.empty(1) for _ in range(self.n_hiddenLayers+1)]}

    self.initializeWeightsAndBiases()
    self.wts_bias_history_dict["second_history_weights"] = copy.deepcopy(self.wts_bias_history_dict["history_weights"])
    self.wts_bias_history_dict["second_history_biases"] = copy.deepcopy(self.wts_bias_history_dict["history_biases"])

    # pre-activation(a) and post-activation(h) values
    self.a = []
    self.h = []

  '''
    Weights,Biases initialization based on weight_init parameter

    weights[0]: input layer to first hidden layer  : input_size x n_neuronsPerLayer
    weights[1]: first hidden layer to second hidden layer : n_neuronsPerLayer x n_neuronsPerLayer
    ...
    weights[n_hiddenLayers]: last hidden layer to output layer : n_neuronsPerLayer x output_size

    biases[i] : bias for ith layer : 1 x n_neuronsPerLayer   (i:0 to n_hiddenLayers-1)
    biases[n_hiddenLayers]: 1 x output_size
  '''
  def initializeWeightsAndBiases(self):
    # biases for both types
    for i in range(self.n_hiddenLayers):
      self.biases.append(np.zeros(self.n_neuronsPerLayer))
      self.wts_bias_history_dict["history_biases"][i] = np.zeros(self.n_neuronsPerLayer)

    self.biases.append(np.zeros(self.output_size))   # biases[n_hiddenLayers]
    self.wts_bias_history_dict["history_biases"][self.n_hiddenLayers] = np.zeros(self.output_size)

    if(self.weight_init == "random"):   # Random Normal
      # weights[0]
      self.weights.append(np.random.randn(self.input_size, self.n_neuronsPerLayer))
      self.wts_bias_history_dict["history_weights"][0] = np.zeros((self.input_size, self.n_neuronsPerLayer))

      # weights[1] -> weights[n_hiddenLayers-1]
      for i in range(self.n_hiddenLayers-1):
        self.weights.append(np.random.randn(self.n_neuronsPerLayer, self.n_neuronsPerLayer))
        self.wts_bias_history_dict["history_weights"][i+1] = np.zeros((self.n_neuronsPerLayer, self.n_neuronsPerLayer))

      # weights[n_hiddenLayers]
      self.weights.append(np.random.randn(self.n_neuronsPerLayer, self.output_size))
      self.wts_bias_history_dict["history_weights"][self.n_hiddenLayers] = np.zeros((self.n_neuronsPerLayer, self.output_size))

    elif(self.weight_init == "Xavier"):   # Xavier Normal: mean = 0, variance = 2/(n_input + n_output)
      # weights[0]
      self.weights.append(np.random.normal(loc=0.0, scale=np.sqrt(2/(self.input_size + self.n_neuronsPerLayer)), size=(self.input_size, self.n_neuronsPerLayer)))
      self.wts_bias_history_dict["history_weights"][0] = np.zeros((self.input_size, self.n_neuronsPerLayer))


      for i in range(self.n_hiddenLayers-1):
        self.weights.append(np.random.normal(loc=0.0, scale=np.sqrt(2/(self.n_neuronsPerLayer + self.n_neuronsPerLayer)), size=(self.n_neuronsPerLayer, self.n_neuronsPerLayer)))
        self.wts_bias_history_dict["history_weights"][i+1] = np.zeros((self.n_neuronsPerLayer, self.n_neuronsPerLayer))


      self.weights.append(np.random.normal(loc=0.0, scale=np.sqrt(2/(self.n_neuronsPerLayer + self.output_size)), size=(self.n_neuronsPerLayer, self.output_size)))
      self.wts_bias_history_dict["history_weights"][self.n_hiddenLayers] = np.zeros((self.n_neuronsPerLayer, self.output_size))

  '''
    Forward propagation through the neural network. (for batch)
    Instead of doing one input at a time, this function handles it for a batch using respective sized matrices

    x_batch: B x n where B - batch size, n- no of features = input_size
    x_batch is assumbed to be numpy array when given as input
  '''
  def forwardPropagation(self, x_batch, isValidation=False):
    a_pre_activation = []
    h_post_activation = []

    # considering a0,h0 as X values as a1: first layer  (it is calculated from x values)
    a_pre_activation.append(x_batch)
    h_post_activation.append(x_batch)

    wt = []
    b = []

    if(self.isLookAhead and not isValidation):
      for i in range(self.n_hiddenLayers+1):
        wt.append(self.weights[i] - (self.optimizer_input_dict["momentum"] * self.wts_bias_history_dict["history_weights"][i]))
        b.append(self.biases[i] - (self.optimizer_input_dict["momentum"] * self.wts_bias_history_dict["history_biases"][i]))

      self.lookAheadWeights = wt
      self.lookAheadBiases = b
    else:
      wt = copy.deepcopy(self.weights)
      b = copy.deepcopy(self.biases)

    # Except last layer since activation function could be different
    for i in range(self.n_hiddenLayers):
      # ai: B x n_neuronsPerLayer, biases[i]: 1 x n_neuronsPerLayer (it will be broadcasted while adding)
      ai = np.matmul(h_post_activation[-1], wt[i]) + b[i]
      hi = self.activationFun(ai)

      a_pre_activation.append(ai)
      h_post_activation.append(hi)

    # aL: last layer (activation function is softmax)
    aL = np.matmul(h_post_activation[-1], wt[self.n_hiddenLayers]) + b[self.n_hiddenLayers]
    hL = softmax(aL)   # y_batch

    a_pre_activation.append(aL)
    h_post_activation.append(hL)

    return a_pre_activation, h_post_activation

  '''
    Backward propagation through the neural network. (for batch)
  '''
  def backwardPropagation(self, a_pre_activation, h_post_activation, y_batch, y_pred_batch, isValidation=False):
    grad_w = []
    grad_b = []
    grad_a = []
    grad_h = []

    wt = []
    b = []
    if(self.isLookAhead and not isValidation):
        wt = self.lookAheadWeights
        b = self.lookAheadBiases 
    else:
        wt = copy.deepcopy(self.weights)
        b = copy.deepcopy(self.biases)  

    # Output gradient (wrt aL)
    grad_hL = self.derivatesFuncMap[self.lossFunc.__name__](y_batch, y_pred_batch)
    grad_h.append(grad_hL)

    if(self.lossFunc.__name__ == "cross_entropy_loss"):
      grad_aL = y_pred_batch - y_batch    # just to reduce computation of jacobian matrix
      grad_a.append(grad_aL)
    else:
      grad_aL_list = []
      # softmax derivatives of each input is a matrix of size output_size x output_size, we need to perform matrix_mul for each input of batch
      for i in range(y_batch.shape[0]):   # self.batch_size = y_batch.shape[0] but better to take y_batch.shape[0] since last batch inputs can have less
        grad_aL_inp_i = grad_hL[i] @ softmax_derivative(y_pred_batch[i])
        grad_aL_list.append(grad_aL_inp_i)

      grad_aL = np.array(grad_aL_list)
      grad_a.append(grad_aL)                    # aL contains (aL) values of all inputs in the batch

    # Hidden layers
    for k in range(self.n_hiddenLayers, -1, -1):
      # gradients w.r.t parameters
      # wk
      grad_wk = np.zeros_like(wt[k])    # will be equal to sum across

      for inpNum in range(y_batch.shape[0]):
        grad_wk_inp_num = np.matmul(h_post_activation[k][inpNum].reshape(-1,1), grad_a[-1][inpNum].reshape(1,-1))
        grad_wk += grad_wk_inp_num
      grad_w.append(grad_wk)                   # contains sum across all batches

      # bk
      grad_bk = np.zeros_like(self.biases[k])
      for inpNum in range(y_batch.shape[0]):
        grad_bk += grad_a[-1][inpNum]
      grad_b.append(grad_bk)                     # contains sum across all batches

      if(k > 0):
        # gradients w.r.t layer below
        grad_hk_1 = grad_a[-1] @ wt[k].T
        grad_h.append(grad_hk_1)

        # gradients w.r.t layer below (pre-activation)
        grad_ak_1 = grad_hk_1 * self.derivatesFuncMap[self.activationFun.__name__](a_pre_activation[k])
        grad_a.append(grad_ak_1)

    grad_w = grad_w[::-1]
    grad_b = grad_b[::-1]

    return grad_w, grad_b

  def updateWeights(self, grad_w, grad_b, itr):
    grad_w = [np.clip(dw, -10,10) for dw in grad_w]
    grad_h = [np.clip(db, -10,10) for db in grad_b]
    self.wts_bias_history_dict["dw"] = grad_w
    self.wts_bias_history_dict["db"] = grad_b
    self.optimizer(self.optimizer_input_dict, self.wts_bias_history_dict, itr)

## Loading data

In [9]:
from keras.datasets import fashion_mnist, mnist
import numpy as np

datasets = {"fashion_mnist": fashion_mnist, "mnist": mnist}

def load_data(dataset_name):
  (x_train, y_train), (x_test, y_test) = datasets[dataset_name].load_data()
  num_classes = len(np.unique(y_train))

  y_train = np.eye(num_classes)[y_train]
  y_test = np.eye(num_classes)[y_test]

  x_train = x_train.reshape(x_train.shape[0], -1)
  x_test = x_test.reshape(x_test.shape[0], -1)

  x_train = np.array(x_train/255, dtype=np.float64)
  y_train = np.array(y_train, dtype=np.float64)
  x_test = np.array(x_test/255, dtype=np.float64)
  y_test = np.array(y_test, dtype=np.float64)

  return x_train, y_train, x_test, y_test, num_classes

## Sweep Configuration

In [10]:
sweep_configuration = {
    "method": "random",
    "metric": {"name": "validation_accuracy", "goal": "maximize"},
    "parameters": {
        "learning_rate": {"values": [1e-3, 1e-4]},
        "optimizer": {"values": ["sgd", "momentum", "nag", "rmsprop",  "adam", "nadam"]},
        "num_layers": {"values": [3, 4, 5]},
        "hidden_size": {"values": [32, 64, 128]},
        "batch_size": {"values": [16, 32, 64]},
        "activation": {"values": ["sigmoid", "tanh", "ReLU"]},
        "weight_decay": {"values": [0, 0.0005, 0.5]},
        "weight_init": {"values": ["random", "Xavier"]},
        "epochs": {"values": [10, 5]},
        "loss": {"values": ["cross_entropy"]},
        "momentum": {"values": [0.9]},
        "beta": {"values": [0.9]},
        "beta1": {"values":[0.9]},
        "beta2": {"values": [0.999]},
        "epsilon": {"values": [1e-8]},
        "dataset": {"values":["fashion_mnist"]}
    }
}

In [11]:
# sweep_configuration = {
#     "method": "random",
#     # "metric": {"name": "validation_accuracy", "goal": "maximize"},
#     "parameters": {
#         "learning_rate": {"values": [1e-3, 1e-4]},
#         "optimizer": {"values": ["momentum"]},
#         "num_layers": {"values": [4]},
#         "hidden_size": {"values": [64]},
#         "batch_size": {"values": [64]},
#         "activation": {"values": ["ReLU"]},
#         "weight_decay": {"values": [0, 0.0005, 0.5]},
#         "weight_init": {"values": ["random", "Xavier"]},
#         "epochs": {"values": [10]},
#         "loss": {"values": ["cross_entropy"]},
#         "momentum": {"values": [0.9]},
#         "beta": {"values": [0.9]},
#         "beta1": {"values":[0.9]},
#         "beta2": {"values": [0.999]},
#         "epsilon": {"values": [1e-8]},
#         "dataset": {"values":["fashion_mnist"]}
#     }
# }

## Training the Feed Forward Neural Network

In [12]:
# train.py file (suppose parser gives all args to train network)
from sklearn.model_selection import train_test_split
import math

def calculateAccuracy(y_true, y_pred):
  y_pred_labels = np.argmax(y_pred, axis=1)
  y_true_labels = np.argmax(y_true, axis=1)
  accuracy = np.mean(y_pred_labels == y_true_labels)
  return accuracy*100

def trainNeuralNetwork():
  wandb.init(mode="online")
  args = wandb.config
  x_train, y_train, x_test, y_test, num_classes = load_data(args["dataset"])
  input_size = len(x_train[0])
  output_size = num_classes
  n_hiddenLayers = args["num_layers"]
  n_neuronsPerLayer = args["hidden_size"]
  activationFun = args["activation"]
  weight_init = args["weight_init"]
  batch_size = args["batch_size"]
  lossFunc = args["loss"]
  optimizer = args["optimizer"]
  learning_rate = args["learning_rate"]
  momentum = args["momentum"]
  beta = args["beta"]
  beta1 = args["beta1"]
  beta2 = args["beta2"]
  epsilon = args["epsilon"]
  weight_decay = args["weight_decay"]
  epochs = args["epochs"]

  wandb.run.name = f"{optimizer}_{activationFun}_{n_hiddenLayers}_{n_neuronsPerLayer}_{epochs}_{weight_init}"

  # paste all above paramters as fun params
  fnn = FeedForwardNeuralNetwork(input_size, output_size, n_hiddenLayers, n_neuronsPerLayer,
                                 activationFun, weight_init, batch_size, lossFunc,
                                 optimizer, learning_rate, momentum,
                                 beta, beta1, beta2,
                                 epsilon, weight_decay, epochs)

  x_train, x_validation, y_train, y_validation = train_test_split(x_train, y_train, test_size=0.1, random_state=42)
  num_batches = math.ceil(len(x_train)/batch_size)

  for epochNum in range(epochs):
    for batchNum in range(num_batches):
      start_idx = batchNum * batch_size
      end_idx = start_idx + batch_size

      x_batch = x_train[start_idx:end_idx]
      y_batch = y_train[start_idx:end_idx]

      # Forward Propagation
      a_pre_activation, h_post_activation = fnn.forwardPropagation(x_batch)
      y_pred_batch = h_post_activation[-1]

      # Back Propagation
      grad_w, grad_b = fnn.backwardPropagation(a_pre_activation, h_post_activation, y_batch, y_pred_batch)

      # Update weights
      itr = epochNum * num_batches + batchNum + 1
      fnn.updateWeights(grad_w, grad_b, itr)

    # Validation accuracy
    _, h_validation = fnn.forwardPropagation(x_validation, isValidation=True)
    y_pred_validation = h_validation[-1]
    validation_accuracy = calculateAccuracy(y_validation, y_pred_validation)
    wandb.run.summary["metric_name"] = validation_accuracy


    # Train accuracy
    _, h_train = fnn.forwardPropagation(x_train, isValidation=True)
    y_pred_train = h_train[-1]
    train_accuracy = calculateAccuracy(y_train, y_pred_train)

    wandb.log({
        "epoch": epochNum + 1,
        "validation_loss": np.mean(fnn.lossFunc(y_validation, y_pred_validation)),
        "validation_accuracy": validation_accuracy,
        "train_loss": np.mean(fnn.lossFunc(y_train, y_pred_train)),
        "train_accuracy": train_accuracy
        },commit=True)

  # Test accuracy
  _,h_test = fnn.forwardPropagation(x_test, isValidation=True)
  y_pred_test = h_test[-1]
  test_accuracy = calculateAccuracy(y_test, y_pred_test)
  wandb.log({ "test_accuracy": test_accuracy,
             "test_loss": np.mean(fnn.lossFunc(y_test, y_pred_test))})

  wandb.finish()

In [13]:
# pip uninstall wandb -y
# pip install wandb
wandb.login(key="36c31601b44b627cac1224e51b177e99fdef3a5c")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mnikhithaa[0m ([33mnikhithaa-iit-madras[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
# wandb.init(project="DA6401_Assignment1")
wandb_id = wandb.sweep(sweep_configuration, project="DA6401_Assignment1")
wandb.agent(wandb_id, function=trainNeuralNetwork)

Create sweep with ID: 0lp2xf2w
Sweep URL: https://wandb.ai/nikhithaa-iit-madras/DA6401_Assignment1/sweeps/0lp2xf2w


[34m[1mwandb[0m: Agent Starting Run: 2uv2gcx4 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
[1m29515/29515[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
[1m26421880/26421880[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
[1m5148/5148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz
[1m4422102/4422102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


  return 1 / (1 + np.exp(-x))


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▁▆██
train_loss,▄█▁▄▃
validation_accuracy,▁▁▆▇█
validation_loss,▄█▁▄▃

0,1
epoch,5.0
metric_name,19.83333
test_accuracy,19.85
test_loss,7.38858
train_accuracy,19.80185
train_loss,7.3869
validation_accuracy,19.83333
validation_loss,7.38223


[34m[1mwandb[0m: Agent Starting Run: 601oa9eh with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▁▁▁▁▁▁▁▁▁
train_loss,█▅▄▃▃▃▂▂▁▁
validation_accuracy,▁▁▁▁▁▁▁▁▁▁
validation_loss,█▅▄▃▃▃▂▂▁▁

0,1
epoch,10.0
metric_name,10.31667
test_accuracy,10.0
test_loss,2.30842
train_accuracy,9.96481
train_loss,2.3084
validation_accuracy,10.31667
validation_loss,2.30864


[34m[1mwandb[0m: Agent Starting Run: rrlkdsw2 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▂▄▆█
train_loss,█▂▂▁▁
validation_accuracy,▁▂▄▇█
validation_loss,█▂▁▁▁

0,1
epoch,5.0
metric_name,60.98333
test_accuracy,61.37
test_loss,1.20534
train_accuracy,62.20926
train_loss,1.12025
validation_accuracy,60.98333
validation_loss,1.17462


[34m[1mwandb[0m: Agent Starting Run: 8pwusog2 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▅▇█▇
train_loss,█▅▄▂▁
validation_accuracy,▁▆▇█▇
validation_loss,█▅▄▂▁

0,1
epoch,5.0
metric_name,48.91667
test_accuracy,49.18
test_loss,6.01451
train_accuracy,49.37778
train_loss,5.98643
validation_accuracy,48.91667
validation_loss,6.13434


[34m[1mwandb[0m: Agent Starting Run: 22o298d3 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▃▄▆█
train_loss,█▅▃▂▁
validation_accuracy,▁▃▅▆█
validation_loss,█▅▃▂▁

0,1
epoch,5.0
metric_name,12.8
test_accuracy,12.68
test_loss,7.74278
train_accuracy,12.54444
train_loss,7.76373
validation_accuracy,12.8
validation_loss,7.62054


[34m[1mwandb[0m: Agent Starting Run: d6zjy1ni with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▆▇█
train_loss,█▅▃▂▁
validation_accuracy,▁▄▆▇█
validation_loss,█▅▃▂▁

0,1
epoch,5.0
metric_name,75.56667
test_accuracy,74.41
test_loss,0.70352
train_accuracy,76.11852
train_loss,0.66075
validation_accuracy,75.56667
validation_loss,0.67172


[34m[1mwandb[0m: Agent Starting Run: 5esuz9db with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,████▇▄▂▁▁▁
train_loss,█▆▅▄▃▃▂▂▁▁
validation_accuracy,████▇▄▂▁▁▁
validation_loss,█▆▅▄▃▃▂▂▁▁

0,1
epoch,10.0
metric_name,5.2
test_accuracy,5.24
test_loss,3.25215
train_accuracy,5.26296
train_loss,3.24496
validation_accuracy,5.2
validation_loss,3.26396


[34m[1mwandb[0m: Agent Starting Run: tgc3bvc7 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▅▁▁█▄
train_loss,▅▁▅█▅
validation_accuracy,▅▁▂█▄
validation_loss,▅▁▅█▅

0,1
epoch,5.0
metric_name,11.85
test_accuracy,11.12
test_loss,3.47481
train_accuracy,11.54074
train_loss,3.47352
validation_accuracy,11.85
validation_loss,3.45084


[34m[1mwandb[0m: Agent Starting Run: vy2ow7w7 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁█████████
train_loss,█▁▂▂▂▂▂▂▂▂
validation_accuracy,█▁▁▁▁▁▁▁▁▁
validation_loss,█▁▂▂▂▂▂▂▂▂

0,1
epoch,10.0
metric_name,9.16667
test_accuracy,10.0
test_loss,2.30338
train_accuracy,10.09259
train_loss,2.30329
validation_accuracy,9.16667
validation_loss,2.30416


[34m[1mwandb[0m: Agent Starting Run: 9uqzg1o1 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▃▅▆▇▇▇███
train_loss,█▅▄▃▂▂▂▁▁▁
validation_accuracy,▁▃▅▆▆▇▇███
validation_loss,█▅▄▃▂▂▂▁▁▁

0,1
epoch,10.0
metric_name,79.91667
test_accuracy,79.36
test_loss,0.5702
train_accuracy,80.50185
train_loss,0.54453
validation_accuracy,79.91667
validation_loss,0.5476


[34m[1mwandb[0m: Agent Starting Run: xus64ejy with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▆▇█
train_loss,█▆▄▂▁
validation_accuracy,▁▄▆▇█
validation_loss,█▆▄▂▁

0,1
epoch,5.0
metric_name,36.81667
test_accuracy,37.21
test_loss,2.13694
train_accuracy,37.08889
train_loss,2.13689
validation_accuracy,36.81667
validation_loss,2.13958


[34m[1mwandb[0m: Agent Starting Run: oqolztdt with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,█▁▁▁▁
train_loss,▁████
validation_accuracy,█▁▁▁▁
validation_loss,▁████

0,1
epoch,5.0
metric_name,10.31667
test_accuracy,10.0
test_loss,2.30352
train_accuracy,9.96481
train_loss,2.30357
validation_accuracy,10.31667
validation_loss,2.30304


[34m[1mwandb[0m: Agent Starting Run: jin06mwt with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▅▇██
train_loss,█▄▂▂▁
validation_accuracy,▁▅▇▇█
validation_loss,█▄▂▂▁

0,1
epoch,5.0
metric_name,69.9
test_accuracy,70.16
test_loss,0.82226
train_accuracy,70.36481
train_loss,0.80886
validation_accuracy,69.9
validation_loss,0.82336


[34m[1mwandb[0m: Agent Starting Run: 00j67nob with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▃▄▅▆▆▇▇██
train_loss,█▅▄▃▂▂▂▂▁▁
validation_accuracy,▁▃▅▅▆▇▇███
validation_loss,█▅▃▃▂▂▂▁▁▁

0,1
epoch,10.0
metric_name,74.53333
test_accuracy,74.69
test_loss,0.70977
train_accuracy,77.45741
train_loss,0.60542
validation_accuracy,74.53333
validation_loss,0.68763


[34m[1mwandb[0m: Agent Starting Run: 2wmo73cy with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▃▆▇█
train_loss,█▃▂▁▁
validation_accuracy,▁▃▆▇█
validation_loss,█▃▂▁▁

0,1
epoch,5.0
metric_name,51.21667
test_accuracy,50.57
test_loss,1.47963
train_accuracy,51.44815
train_loss,1.4664
validation_accuracy,51.21667
validation_loss,1.46855


[34m[1mwandb[0m: Agent Starting Run: dacfpnzz with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▃▅▆▆▇▇▇██
train_loss,█▄▃▂▂▂▁▁▁▁
validation_accuracy,▁▄▅▆▆▇▇███
validation_loss,█▄▃▂▂▂▁▁▁▁

0,1
epoch,10.0
metric_name,60.36667
test_accuracy,61.37
test_loss,1.15483
train_accuracy,63.60185
train_loss,1.03914
validation_accuracy,60.36667
validation_loss,1.15984


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: rd9ivjfd with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▅▆▇▇▇▇██
train_loss,█▅▃▃▂▂▂▁▁▁
validation_accuracy,▁▅▆▇▇▇▇███
validation_loss,█▄▂▂▂▁▁▁▁▁

0,1
epoch,10.0
metric_name,87.15
test_accuracy,86.37
test_loss,0.39363
train_accuracy,89.48889
train_loss,0.29484
validation_accuracy,87.15
validation_loss,0.36615


[34m[1mwandb[0m: Agent Starting Run: tg8q7dm0 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▅▆▇▇▇▇██
train_loss,█▆▅▄▃▂▂▁▁▁
validation_accuracy,▁▄▅▆▇▇▇▇██
validation_loss,█▆▅▄▃▂▂▁▁▁

0,1
epoch,10.0
metric_name,74.66667
test_accuracy,74.67
test_loss,0.72637
train_accuracy,75.5463
train_loss,0.70741
validation_accuracy,74.66667
validation_loss,0.71392


[34m[1mwandb[0m: Agent Starting Run: uaojil7r with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▃▁▂▃▄▅▅▇▇█
train_loss,█▄▃▃▂▂▂▁▁▁
validation_accuracy,▃▁▂▃▄▅▅▇▇█
validation_loss,█▄▃▃▂▂▂▁▁▁

0,1
epoch,10.0
metric_name,62.91667
test_accuracy,63.75
test_loss,1.07541
train_accuracy,64.44444
train_loss,0.98424
validation_accuracy,62.91667
validation_loss,1.07565


[34m[1mwandb[0m: Agent Starting Run: 3expj460 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▅▆▇▇▇███
train_loss,█▅▃▃▂▂▁▁▁▁
validation_accuracy,▁▄▅▆▇▇████
validation_loss,█▅▃▃▂▂▁▁▁▁

0,1
epoch,10.0
metric_name,81.48333
test_accuracy,80.97
test_loss,0.60601
train_accuracy,81.83333
train_loss,0.58464
validation_accuracy,81.48333
validation_loss,0.59008


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: ebeb0gy8 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▂▄▆█
train_loss,█▆▄▃▁
validation_accuracy,▁▂▄▆█
validation_loss,█▆▄▃▁

0,1
epoch,5.0
metric_name,12.55
test_accuracy,12.77
test_loss,2.28419
train_accuracy,13.33889
train_loss,2.27953
validation_accuracy,12.55
validation_loss,2.28276


[34m[1mwandb[0m: Agent Starting Run: rov2z93i with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▂▄▅▆▆█▇██
train_loss,█▅▄▃▂▂▂▁▁▁
validation_accuracy,▁▂▄▅▆▇████
validation_loss,█▅▄▃▂▂▂▁▁▁

0,1
epoch,10.0
metric_name,54.53333
test_accuracy,53.19
test_loss,1.22832
train_accuracy,54.22778
train_loss,1.2203
validation_accuracy,54.53333
validation_loss,1.22268


[34m[1mwandb[0m: Agent Starting Run: r6tukg49 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▆▇█
train_loss,█▅▃▂▁
validation_accuracy,▁▄▆▇█
validation_loss,█▅▃▂▁

0,1
epoch,5.0
metric_name,72.03333
test_accuracy,71.53
test_loss,0.78052
train_accuracy,73.79074
train_loss,0.71914
validation_accuracy,72.03333
validation_loss,0.76527


[34m[1mwandb[0m: Agent Starting Run: ija4pcse with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▁▃▄▅▆▇▇██
train_loss,█▆▅▄▃▃▂▂▁▁
validation_accuracy,▁▂▃▄▅▆▇▇██
validation_loss,█▆▅▄▃▂▂▂▁▁

0,1
epoch,10.0
metric_name,24.61667
test_accuracy,23.65
test_loss,2.29599
train_accuracy,23.55
train_loss,2.29597
validation_accuracy,24.61667
validation_loss,2.29573


[34m[1mwandb[0m: Agent Starting Run: aqexo4dz with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▃▆▆▇▇▇███
train_loss,█▄▂▂▂▂▁▁▁▁
validation_accuracy,▁▂▅▆▇▇▇▇██
validation_loss,█▄▂▂▂▂▁▁▁▁

0,1
epoch,10.0
metric_name,81.1
test_accuracy,80.55
test_loss,0.55474
train_accuracy,81.58889
train_loss,0.51799
validation_accuracy,81.1
validation_loss,0.54134


[34m[1mwandb[0m: Agent Starting Run: w71xjve8 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▅▆▇▇▇▇██
train_loss,█▅▄▃▃▂▂▂▁▁
validation_accuracy,▁▄▅▆▆▇▇███
validation_loss,█▅▄▃▃▂▂▁▁▁

0,1
epoch,10.0
metric_name,74.96667
test_accuracy,74.87
test_loss,4.94142
train_accuracy,76.28333
train_loss,4.59194
validation_accuracy,74.96667
validation_loss,4.88278


[34m[1mwandb[0m: Agent Starting Run: 7m0lau2q with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▆▅▅█
train_loss,█▆▄▂▁
validation_accuracy,▁▆▄▅█
validation_loss,█▆▄▂▁

0,1
epoch,5.0
metric_name,13.56667
test_accuracy,13.99
test_loss,5.00774
train_accuracy,13.49259
train_loss,5.03646
validation_accuracy,13.56667
validation_loss,5.09015


[34m[1mwandb[0m: Agent Starting Run: oma2yhnk with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▃▄▅▆▇▇▇██
train_loss,█▅▄▃▃▂▂▁▁▁
validation_accuracy,▁▃▅▅▆▇▇▇██
validation_loss,█▅▃▃▂▂▁▁▁▁

0,1
epoch,10.0
metric_name,76.88333
test_accuracy,75.89
test_loss,0.69511
train_accuracy,78.80556
train_loss,0.57535
validation_accuracy,76.88333
validation_loss,0.673


[34m[1mwandb[0m: Agent Starting Run: pwqn1vv4 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▅▆▇▇▇███
train_loss,█▅▄▃▃▂▂▁▁▁
validation_accuracy,▁▄▅▆▇▇████
validation_loss,█▅▄▃▂▂▁▁▁▁

0,1
epoch,10.0
metric_name,87.2
test_accuracy,86.36
test_loss,0.40187
train_accuracy,88.76296
train_loss,0.31704
validation_accuracy,87.2
validation_loss,0.36317


[34m[1mwandb[0m: Agent Starting Run: yu8m1mdg with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▅▆▇▇▇███
train_loss,█▅▃▂▂▂▁▁▁▁
validation_accuracy,▁▄▅▆▇▇▇███
validation_loss,█▄▃▂▂▂▁▁▁▁

0,1
epoch,10.0
metric_name,86.18333
test_accuracy,85.22
test_loss,0.42928
train_accuracy,86.77407
train_loss,0.38647
validation_accuracy,86.18333
validation_loss,0.39976


[34m[1mwandb[0m: Agent Starting Run: c0o8dbpl with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▆████████
train_loss,████▇▁▁▁▁▁
validation_accuracy,▁▄▇▇▇█████
validation_loss,████▆▁▁▁▁▁

0,1
epoch,10.0
metric_name,10.13333
test_accuracy,10.0
test_loss,2.31313
train_accuracy,9.98519
train_loss,2.31311
validation_accuracy,10.13333
validation_loss,2.31326


[34m[1mwandb[0m: Agent Starting Run: znplqhhv with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▅▆▆▇▇▇██
train_loss,█▅▄▃▃▂▂▂▁▁
validation_accuracy,▁▄▅▆▆▇▇▇██
validation_loss,█▅▄▃▃▂▂▁▁▁

0,1
epoch,10.0
metric_name,79.18333
test_accuracy,78.92
test_loss,0.5847
train_accuracy,80.53889
train_loss,0.53674
validation_accuracy,79.18333
validation_loss,0.56086


[34m[1mwandb[0m: Agent Starting Run: 96u3qu1p with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▆▇▇▇████
train_loss,█▅▃▃▂▂▂▁▁▁
validation_accuracy,▁▄▆▇▇▇▇███
validation_loss,█▅▃▃▂▂▂▁▁▁

0,1
epoch,10.0
metric_name,81.56667
test_accuracy,80.85
test_loss,0.57117
train_accuracy,81.94074
train_loss,0.54186
validation_accuracy,81.56667
validation_loss,0.54831


[34m[1mwandb[0m: Agent Starting Run: dhturwbv with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▆▇██
train_loss,█▃▁▁▁
validation_accuracy,▁▆▇██
validation_loss,█▃▁▁▁

0,1
epoch,5.0
metric_name,74.55
test_accuracy,74.46
test_loss,0.68479
train_accuracy,75.56481
train_loss,0.6337
validation_accuracy,74.55
validation_loss,0.66878


[34m[1mwandb[0m: Agent Starting Run: 4bq3135g with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▇████████
train_loss,█▂▁▁▁▁▁▁▁▁
validation_accuracy,▁▇▇▇▇█████
validation_loss,█▂▁▁▁▁▁▁▁▁

0,1
epoch,10.0
metric_name,83.46667
test_accuracy,82.5
test_loss,0.50193
train_accuracy,83.92778
train_loss,0.46512
validation_accuracy,83.46667
validation_loss,0.4726


[34m[1mwandb[0m: Agent Starting Run: ao1t8l31 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▅▆▆▇▇███
train_loss,█▅▃▂▂▂▁▁▁▁
validation_accuracy,▁▄▅▆▆▇▇▇██
validation_loss,█▅▃▂▂▂▁▁▁▁

0,1
epoch,10.0
metric_name,67.2
test_accuracy,66.44
test_loss,0.91596
train_accuracy,67.4
train_loss,0.8812
validation_accuracy,67.2
validation_loss,0.88693


[34m[1mwandb[0m: Agent Starting Run: c4luzi4a with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▇▇▇█
train_loss,▁▄█▇▇
validation_accuracy,▁▇▆▆█
validation_loss,▁▄█▇▇

0,1
epoch,5.0
metric_name,21.76667
test_accuracy,21.32
test_loss,12.3898
train_accuracy,21.25185
train_loss,12.39516
validation_accuracy,21.76667
validation_loss,12.49796


[34m[1mwandb[0m: Agent Starting Run: ft239vke with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▁▃▅▆▆▆▆██
train_loss,█▆▄▄▃▂▂▁▁▁
validation_accuracy,▁▁▃▅▆▆▆▆██
validation_loss,█▆▄▄▃▂▂▁▁▁

0,1
epoch,10.0
metric_name,47.75
test_accuracy,48.04
test_loss,1.23883
train_accuracy,48.04259
train_loss,1.22839
validation_accuracy,47.75
validation_loss,1.23359


[34m[1mwandb[0m: Agent Starting Run: co2zdqzr with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▅▇▇█
train_loss,█▄▂▂▁
validation_accuracy,▁▆▇▇█
validation_loss,█▄▂▂▁

0,1
epoch,5.0
metric_name,62.16667
test_accuracy,62.7
test_loss,7.44927
train_accuracy,63.12593
train_loss,7.32056
validation_accuracy,62.16667
validation_loss,7.46372


[34m[1mwandb[0m: Agent Starting Run: xag12v5d with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▆▇█
train_loss,█▅▃▂▁
validation_accuracy,▁▄▆▇█
validation_loss,█▄▃▂▁

0,1
epoch,5.0
metric_name,87.0
test_accuracy,85.97
test_loss,0.39973
train_accuracy,87.91111
train_loss,0.34086
validation_accuracy,87.0
validation_loss,0.36326


[34m[1mwandb[0m: Agent Starting Run: 9m12s9sq with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,█▁▁▂▂
train_loss,█▂▂▁▁
validation_accuracy,█▁▁▂▃
validation_loss,█▂▂▁▁

0,1
epoch,5.0
metric_name,14.78333
test_accuracy,14.89
test_loss,2.30757
train_accuracy,15.0963
train_loss,2.25608
validation_accuracy,14.78333
validation_loss,2.2651


[34m[1mwandb[0m: Agent Starting Run: gikx70vq with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▄▆▇▆████
train_loss,█▆▅▃▂▃▁▁▁▁
validation_accuracy,▁▄▄▅▇▆██▇▇
validation_loss,█▅▅▃▁▃▁▁▂▂

0,1
epoch,10.0
metric_name,87.56667
test_accuracy,86.51
test_loss,0.40269
train_accuracy,89.22778
train_loss,0.29472
validation_accuracy,87.56667
validation_loss,0.36862


[34m[1mwandb[0m: Agent Starting Run: uze4e8qp with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▂▄▅█
train_loss,█▇▄▃▁
validation_accuracy,▁▂▅▅█
validation_loss,█▇▄▃▁

0,1
epoch,5.0
metric_name,28.08333
test_accuracy,28.68
test_loss,1.67475
train_accuracy,29.02963
train_loss,1.66848
validation_accuracy,28.08333
validation_loss,1.67616


[34m[1mwandb[0m: Agent Starting Run: 1ddtkbeo with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▆▇█
train_loss,█▅▃▂▁
validation_accuracy,▁▅▆██
validation_loss,█▅▃▂▁

0,1
epoch,5.0
metric_name,82.46667
test_accuracy,82.02
test_loss,0.50671
train_accuracy,84.17037
train_loss,0.43839
validation_accuracy,82.46667
validation_loss,0.4678


[34m[1mwandb[0m: Agent Starting Run: rk2lygza with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▁██▁
train_loss,█▂▁▃▁
validation_accuracy,██▁▁█
validation_loss,█▂▁▃▁

0,1
epoch,5.0
metric_name,9.95
test_accuracy,10.0
test_loss,2.32541
train_accuracy,10.00556
train_loss,2.32546
validation_accuracy,9.95
validation_loss,2.32494


[34m[1mwandb[0m: Agent Starting Run: eq2tkb5t with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,█▄▃▁▅▂▄▂▂█
train_loss,▂▂▂▁▁▂█▁█▂
validation_accuracy,▁▅▆█▄▇▅▇▇▁
validation_loss,▂▂▂▁▁▂█▁█▂

0,1
epoch,10.0
metric_name,9.16667
test_accuracy,10.0
test_loss,3.32216
train_accuracy,10.09259
train_loss,3.31959
validation_accuracy,9.16667
validation_loss,3.34531


[34m[1mwandb[0m: Agent Starting Run: ozdhj3v2 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▆▇█
train_loss,█▅▃▂▁
validation_accuracy,▁▅▇██
validation_loss,█▄▂▁▁

0,1
epoch,5.0
metric_name,87.08333
test_accuracy,85.96
test_loss,0.40134
train_accuracy,88.69815
train_loss,0.31166
validation_accuracy,87.08333
validation_loss,0.36415


[34m[1mwandb[0m: Agent Starting Run: kphpnan7 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▆▇▇██████
train_loss,█▄▃▂▂▂▁▁▁▁
validation_accuracy,▁▆▇▇██████
validation_loss,█▄▃▂▂▂▁▁▁▁

0,1
epoch,10.0
metric_name,84.53333
test_accuracy,83.58
test_loss,0.50622
train_accuracy,84.77778
train_loss,0.47106
validation_accuracy,84.53333
validation_loss,0.47591


[34m[1mwandb[0m: Agent Starting Run: 2kkakn2l with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▆▇▇█▇██▇
train_loss,█▄▃▂▂▁▂▁▁▂
validation_accuracy,▁▅▆▇██▇██▇
validation_loss,█▄▃▂▂▁▂▁▁▂

0,1
epoch,10.0
metric_name,79.46667
test_accuracy,79.01
test_loss,0.59653
train_accuracy,79.79444
train_loss,0.56256
validation_accuracy,79.46667
validation_loss,0.56273


[34m[1mwandb[0m: Agent Starting Run: 5g05zvp2 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▅▆▇█
train_loss,█▆▄▂▁
validation_accuracy,▁▅▆▇█
validation_loss,█▆▄▂▁

0,1
epoch,5.0
metric_name,35.85
test_accuracy,34.92
test_loss,2.09116
train_accuracy,35.17963
train_loss,2.08861
validation_accuracy,35.85
validation_loss,2.08597


[34m[1mwandb[0m: Agent Starting Run: 1enys1ly with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▆▇█
train_loss,█▄▂▂▁
validation_accuracy,▁▅▆▇█
validation_loss,█▄▂▂▁

0,1
epoch,5.0
metric_name,83.95
test_accuracy,83.34
test_loss,0.46395
train_accuracy,84.79074
train_loss,0.42699
validation_accuracy,83.95
validation_loss,0.43034


[34m[1mwandb[0m: Agent Starting Run: m3unjaiq with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▂██▁
train_loss,█▄▁▁▂
validation_accuracy,█▇▁▁█
validation_loss,█▄▁▁▂

0,1
epoch,5.0
metric_name,10.45
test_accuracy,10.0
test_loss,3.10887
train_accuracy,9.95
train_loss,3.11061
validation_accuracy,10.45
validation_loss,3.09324


[34m[1mwandb[0m: Agent Starting Run: t3xpoubd with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▁▁██
train_loss,█▂▁▁▁
validation_accuracy,███▁▁
validation_loss,█▂▁▁▁

0,1
epoch,5.0
metric_name,9.83333
test_accuracy,10.0
test_loss,2.30258
train_accuracy,10.01852
train_loss,2.30259
validation_accuracy,9.83333
validation_loss,2.3025


[34m[1mwandb[0m: Agent Starting Run: 617wyjv5 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▅▆▆▇▇▇██
train_loss,█▅▄▃▃▂▂▂▁▁
validation_accuracy,▁▅▆▆▆▆▇███
validation_loss,█▅▄▃▃▂▂▂▁▁

0,1
epoch,10.0
metric_name,86.78333
test_accuracy,85.47
test_loss,0.40088
train_accuracy,87.43148
train_loss,0.34813
validation_accuracy,86.78333
validation_loss,0.37002


[34m[1mwandb[0m: Agent Starting Run: 4u9gvc9o with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▆▁▁▁█
train_loss,▅▄▃▁█
validation_accuracy,▃███▁
validation_loss,▅▄▃▁█

0,1
epoch,5.0
metric_name,9.78333
test_accuracy,10.0
test_loss,2.34294
train_accuracy,10.02407
train_loss,2.34301
validation_accuracy,9.78333
validation_loss,2.34229


[34m[1mwandb[0m: Agent Starting Run: keddr4rl with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,██▁▁▂
train_loss,█▁▂▁▁
validation_accuracy,▁▁██▇
validation_loss,█▁▂▁▁

0,1
epoch,5.0
metric_name,10.31667
test_accuracy,10.0
test_loss,2.30726
train_accuracy,9.96481
train_loss,2.30725
validation_accuracy,10.31667
validation_loss,2.30726


[34m[1mwandb[0m: Agent Starting Run: p84xgd3d with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁█████████
train_loss,█▁▁▁▁▁▁▁▁▁
validation_accuracy,█▁▁▁▁▁▁▁▁▁
validation_loss,█▁▁▁▁▁▁▁▁▁

0,1
epoch,10.0
metric_name,9.16667
test_accuracy,10.0
test_loss,2.30338
train_accuracy,10.09259
train_loss,2.30329
validation_accuracy,9.16667
validation_loss,2.30416


[34m[1mwandb[0m: Agent Starting Run: nxx6w4zn with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▆█▆▄▇▇█▇█
train_loss,█▂▁▃▄▁▃▁▂▁
validation_accuracy,▁▆█▆▄▇▇█▇█
validation_loss,█▂▁▃▅▁▃▁▂▁

0,1
epoch,10.0
metric_name,83.58333
test_accuracy,82.52
test_loss,0.49745
train_accuracy,84.06296
train_loss,0.44856
validation_accuracy,83.58333
validation_loss,0.46424


[34m[1mwandb[0m: Agent Starting Run: 38l5wfgi with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▅▆▇▇▇████
train_loss,█▅▃▃▂▂▁▁▁▁
validation_accuracy,▁▄▆▇▇▇████
validation_loss,█▅▃▃▂▂▂▁▁▁

0,1
epoch,10.0
metric_name,66.73333
test_accuracy,66.61
test_loss,6.29968
train_accuracy,67.62963
train_loss,6.0775
validation_accuracy,66.73333
validation_loss,6.26313


[34m[1mwandb[0m: Agent Starting Run: ztqpbxso with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▅▇▇█
train_loss,█▄▂▂▁
validation_accuracy,▁▅▆▇█
validation_loss,█▄▂▂▁

0,1
epoch,5.0
metric_name,82.96667
test_accuracy,81.81
test_loss,0.51616
train_accuracy,83.28519
train_loss,0.48148
validation_accuracy,82.96667
validation_loss,0.48922


[34m[1mwandb[0m: Agent Starting Run: seqebqkj with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▁▁▂▃▄▆▆▇█
train_loss,█▇▆▅▄▃▃▂▁▁
validation_accuracy,▂▁▂▂▄▅▆▆▇█
validation_loss,█▇▆▅▄▃▃▂▁▁

0,1
epoch,10.0
metric_name,17.11667
test_accuracy,18.78
test_loss,3.28534
train_accuracy,18.75556
train_loss,3.29374
validation_accuracy,17.11667
validation_loss,3.38637


[34m[1mwandb[0m: Agent Starting Run: t2nnmz1j with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,██▆▂▁
train_loss,█▆▄▂▁
validation_accuracy,█▇▅▂▁
validation_loss,█▆▄▂▁

0,1
epoch,5.0
metric_name,7.21667
test_accuracy,7.63
test_loss,3.63129
train_accuracy,7.28889
train_loss,3.64474
validation_accuracy,7.21667
validation_loss,3.65513


[34m[1mwandb[0m: Agent Starting Run: j5tb04o4 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▁▂▄█
train_loss,█▆▅▃▁
validation_accuracy,▁▁▃▅█
validation_loss,█▆▅▃▁

0,1
epoch,5.0
metric_name,11.16667
test_accuracy,11.02
test_loss,2.30082
train_accuracy,10.95556
train_loss,2.30083
validation_accuracy,11.16667
validation_loss,2.30058


[34m[1mwandb[0m: Agent Starting Run: se7zyhty with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▄▂▁▁▂▃▄▅▇█
train_loss,▅▇██▇▆▅▃▂▁
validation_accuracy,▇▅▂▃▄▁▂▄▇█
validation_loss,▃▄▇▆▆██▆▄▁

0,1
epoch,10.0
metric_name,5.53333
test_accuracy,5.61
test_loss,19.5545
train_accuracy,5.37778
train_loss,19.60508
validation_accuracy,5.53333
validation_loss,19.56245


[34m[1mwandb[0m: Agent Starting Run: j6co8j8h with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,█▄▁▄▄
train_loss,█▂▁▁▁
validation_accuracy,▁▅█▅▅
validation_loss,█▂▁▁▁

0,1
epoch,5.0
metric_name,9.95
test_accuracy,10.0
test_loss,2.31108
train_accuracy,10.00556
train_loss,2.311
validation_accuracy,9.95
validation_loss,2.31182


[34m[1mwandb[0m: Agent Starting Run: 0do1ipno with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▄▄▃▁▂▅▅█▆█
train_loss,██▇▇▅▄▃▂▂▁
validation_accuracy,▅▄▃▁▂▅▅▆▇█
validation_loss,███▇▅▄▃▂▂▁

0,1
epoch,10.0
metric_name,10.21667
test_accuracy,10.06
test_loss,4.9247
train_accuracy,10.2537
train_loss,4.88748
validation_accuracy,10.21667
validation_loss,4.90415


[34m[1mwandb[0m: Agent Starting Run: upt1r65e with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▃▅▆█
train_loss,█▆▄▃▁
validation_accuracy,▁▃▅▇█
validation_loss,█▆▄▂▁

0,1
epoch,5.0
metric_name,13.71667
test_accuracy,12.07
test_loss,18.21965
train_accuracy,13.01481
train_loss,18.02387
validation_accuracy,13.71667
validation_loss,17.87822


[34m[1mwandb[0m: Agent Starting Run: w1ev09kp with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▅▆▆▇▇███
train_loss,█▅▄▃▃▂▂▁▁▁
validation_accuracy,▁▄▅▆▇▇████
validation_loss,█▅▃▂▂▁▁▁▁▁

0,1
epoch,10.0
metric_name,87.81667
test_accuracy,86.83
test_loss,0.36871
train_accuracy,89.38148
train_loss,0.29135
validation_accuracy,87.81667
validation_loss,0.34322


[34m[1mwandb[0m: Agent Starting Run: 3c2zvpil with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▆▇██████
train_loss,█▅▃▂▂▁▁▁▁▁
validation_accuracy,▁▃▅▇▇█████
validation_loss,█▅▄▂▂▁▁▁▁▁

0,1
epoch,10.0
metric_name,85.21667
test_accuracy,84.05
test_loss,0.46989
train_accuracy,85.36111
train_loss,0.43185
validation_accuracy,85.21667
validation_loss,0.44047


[34m[1mwandb[0m: Agent Starting Run: yozyuo9o with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▃▆▇█
train_loss,█▅▄▂▁
validation_accuracy,▁▃▆▇█
validation_loss,█▅▃▂▁

0,1
epoch,5.0
metric_name,10.68333
test_accuracy,10.42
test_loss,7.71523
train_accuracy,10.77037
train_loss,7.70855
validation_accuracy,10.68333
validation_loss,7.76692


[34m[1mwandb[0m: Agent Starting Run: jd5v2f1f with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▆▇█
train_loss,█▅▃▂▁
validation_accuracy,▁▅▆▇█
validation_loss,█▄▂▂▁

0,1
epoch,5.0
metric_name,87.15
test_accuracy,86.24
test_loss,0.38627
train_accuracy,88.18519
train_loss,0.32868
validation_accuracy,87.15
validation_loss,0.35422


[34m[1mwandb[0m: Agent Starting Run: i57nu6ei with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▁▁▁▁▁████
train_loss,█▄▂▂▁▁▁▁▁▁
validation_accuracy,██████▁▁▁▁
validation_loss,█▄▂▁▁▁▁▁▁▁

0,1
epoch,10.0
metric_name,9.95
test_accuracy,10.0
test_loss,2.30286
train_accuracy,10.00556
train_loss,2.30282
validation_accuracy,9.95
validation_loss,2.30322


[34m[1mwandb[0m: Agent Starting Run: 9s1qrk7v with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▃▅▇█
train_loss,█▆▄▂▁
validation_accuracy,▁▃▅▇█
validation_loss,█▆▄▂▁

0,1
epoch,5.0
metric_name,26.9
test_accuracy,27.17
test_loss,15.08262
train_accuracy,27.32593
train_loss,15.05233
validation_accuracy,26.9
validation_loss,15.14255


[34m[1mwandb[0m: Agent Starting Run: 3ainotf5 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▁▁██
train_loss,▁▂▃█▃
validation_accuracy,███▁▁
validation_loss,▁▂▃█▃

0,1
epoch,5.0
metric_name,10.31667
test_accuracy,10.0
test_loss,3.12083
train_accuracy,9.96481
train_loss,3.12362
validation_accuracy,10.31667
validation_loss,3.09568


[34m[1mwandb[0m: Agent Starting Run: 3gmz6nf6 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▂████████
train_loss,█▁▁▁▁▁▁▁▁▁
validation_accuracy,█▇▁▁▁▁▁▁▁▁
validation_loss,█▁▁▁▁▁▁▁▁▁

0,1
epoch,10.0
metric_name,9.16667
test_accuracy,10.0
test_loss,2.30338
train_accuracy,10.09259
train_loss,2.30329
validation_accuracy,9.16667
validation_loss,2.30416


[34m[1mwandb[0m: Agent Starting Run: 6358t70j with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▂▃▃▄▅▅▆▇█
train_loss,█▇▆▅▅▄▃▂▂▁
validation_accuracy,▁▁▂▃▅▅▅▆▇█
validation_loss,█▇▆▅▅▄▃▂▂▁

0,1
epoch,10.0
metric_name,8.4
test_accuracy,8.05
test_loss,13.75194
train_accuracy,8.24444
train_loss,13.80361
validation_accuracy,8.4
validation_loss,13.70108


[34m[1mwandb[0m: Agent Starting Run: zypqdyn6 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▆▇█
train_loss,█▄▃▂▁
validation_accuracy,▁▄▆▇█
validation_loss,█▄▃▂▁

0,1
epoch,5.0
metric_name,75.13333
test_accuracy,74.99
test_loss,0.79121
train_accuracy,75.13333
train_loss,0.77873
validation_accuracy,75.13333
validation_loss,0.78534


[34m[1mwandb[0m: Agent Starting Run: m2c6y0hs with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▆▇█
train_loss,█▄▃▂▁
validation_accuracy,▁▄▆▇█
validation_loss,█▄▃▂▁

0,1
epoch,5.0
metric_name,73.75
test_accuracy,74.04
test_loss,0.73468
train_accuracy,74.70556
train_loss,0.70435
validation_accuracy,73.75
validation_loss,0.72913


[34m[1mwandb[0m: Agent Starting Run: 0sfrkpf2 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▅▆▇▇▇███
train_loss,█▅▄▃▂▂▁▁▁▁
validation_accuracy,▁▄▅▆▇▇▇███
validation_loss,█▅▄▃▂▂▂▁▁▁

0,1
epoch,10.0
metric_name,83.56667
test_accuracy,82.49
test_loss,0.51289
train_accuracy,84.1
train_loss,0.48072
validation_accuracy,83.56667
validation_loss,0.48531


[34m[1mwandb[0m: Agent Starting Run: bnohz5gh with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▅▆▇▇████
train_loss,█▅▄▃▂▂▁▁▁▁
validation_accuracy,▁▄▆▆▇▇▇███
validation_loss,█▅▃▂▂▁▁▁▁▂

0,1
epoch,10.0
metric_name,85.35
test_accuracy,84.4
test_loss,0.48323
train_accuracy,87.35185
train_loss,0.36165
validation_accuracy,85.35
validation_loss,0.44365


[34m[1mwandb[0m: Agent Starting Run: y2agd936 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▂▃▅▅▆▇███
train_loss,███▇▆▄▂▂▁▁
validation_accuracy,▁▂▃▅▅▆▇███
validation_loss,███▇▆▄▂▂▁▁

0,1
epoch,10.0
metric_name,70.16667
test_accuracy,69.07
test_loss,0.91778
train_accuracy,70.31852
train_loss,0.8916
validation_accuracy,70.16667
validation_loss,0.89768


[34m[1mwandb[0m: Agent Starting Run: bp013njn with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▇████████
train_loss,█▂▁▁▁▁▁▁▁▁
validation_accuracy,▁▆▇▇██████
validation_loss,█▂▁▁▁▁▁▁▁▁

0,1
epoch,10.0
metric_name,85.26667
test_accuracy,83.75
test_loss,0.4669
train_accuracy,85.61111
train_loss,0.42885
validation_accuracy,85.26667
validation_loss,0.43618


[34m[1mwandb[0m: Agent Starting Run: na5a50gb with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁████
train_loss,█▁▁▁▁
validation_accuracy,█▁▁▁▁
validation_loss,█▁▁▁▁

0,1
epoch,5.0
metric_name,9.16667
test_accuracy,10.0
test_loss,2.30338
train_accuracy,10.09259
train_loss,2.30329
validation_accuracy,9.16667
validation_loss,2.30416


[34m[1mwandb[0m: Agent Starting Run: 02zftbzw with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▃▅▇▇█████
train_loss,█▅▄▃▂▂▁▁▁▁
validation_accuracy,▁▃▅▇▇▇████
validation_loss,█▅▄▃▂▂▁▁▁▁

0,1
epoch,10.0
metric_name,81.38333
test_accuracy,80.79
test_loss,0.52415
train_accuracy,82.12037
train_loss,0.48751
validation_accuracy,81.38333
validation_loss,0.49894


[34m[1mwandb[0m: Agent Starting Run: cj7tl287 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▅▆▆▇▇▇██
train_loss,█▅▄▄▃▂▂▂▁▁
validation_accuracy,▁▄▆▆▆█▇▇██
validation_loss,█▄▂▂▃▁▂▂▁▁

0,1
epoch,10.0
metric_name,87.46667
test_accuracy,86.4
test_loss,0.40733
train_accuracy,90.04444
train_loss,0.27621
validation_accuracy,87.46667
validation_loss,0.37857


[34m[1mwandb[0m: Agent Starting Run: b8843iwv with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▅▇█
train_loss,█▄▃▂▁
validation_accuracy,▁▄▅▇█
validation_loss,█▄▃▂▁

0,1
epoch,5.0
metric_name,74.05
test_accuracy,73.69
test_loss,0.74504
train_accuracy,74.84444
train_loss,0.7294
validation_accuracy,74.05
validation_loss,0.73637


[34m[1mwandb[0m: Agent Starting Run: z0728n0d with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▂▁▂▃▄▅▆▆▇█
train_loss,█▆▄▃▂▂▂▁▁▁
validation_accuracy,▂▁▂▃▃▅▅▆▇█
validation_loss,█▆▄▃▂▂▂▁▁▁

0,1
epoch,10.0
metric_name,25.76667
test_accuracy,24.8
test_loss,2.04026
train_accuracy,25.15926
train_loss,2.0372
validation_accuracy,25.76667
validation_loss,2.03607


[34m[1mwandb[0m: Agent Starting Run: ok9gn6hj with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▁▁▁▁▂▃▅▇█
train_loss,█████▇▆▄▂▁
validation_accuracy,▂▂▁▁▁▂▃▅▆█
validation_loss,█▇███▇▆▄▃▁

0,1
epoch,10.0
metric_name,18.1
test_accuracy,17.3
test_loss,17.06555
train_accuracy,17.62407
train_loss,16.98272
validation_accuracy,18.1
validation_loss,16.9183


[34m[1mwandb[0m: Agent Starting Run: uotxlkq2 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▃▅▅▆▇▇███
train_loss,█▆▅▄▄▃▂▂▁▁
validation_accuracy,▁▃▅▅▆▇▇▇██
validation_loss,█▆▅▄▄▃▂▂▁▁

0,1
epoch,10.0
metric_name,50.4
test_accuracy,50.21
test_loss,1.98488
train_accuracy,50.52593
train_loss,1.98259
validation_accuracy,50.4
validation_loss,1.98688


[34m[1mwandb[0m: Agent Starting Run: nycrgzvc with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▄▄▄▂█▁▃▄▃▃
train_loss,▄▃█▃▄▅▄▆▁▄
validation_accuracy,▅▅▅▇▁█▆▅▆▆
validation_loss,▄▂█▃▄▄▂▆▁▅

0,1
epoch,10.0
metric_name,9.95
test_accuracy,10.0
test_loss,2.30922
train_accuracy,10.00556
train_loss,2.30907
validation_accuracy,9.95
validation_loss,2.31058


[34m[1mwandb[0m: Agent Starting Run: pp9gn22i with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▃▅▆█
train_loss,█▆▄▂▁
validation_accuracy,▁▃▅▇█
validation_loss,█▆▄▂▁

0,1
epoch,5.0
metric_name,21.1
test_accuracy,21.41
test_loss,2.22852
train_accuracy,21.3
train_loss,2.23065
validation_accuracy,21.1
validation_loss,2.22826


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: w70iqge8 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▇██
train_loss,█▅▂▁▁
validation_accuracy,▁▃▇█▆
validation_loss,█▆▂▁▂

0,1
epoch,5.0
metric_name,73.88333
test_accuracy,73.48
test_loss,5.48628
train_accuracy,75.2537
train_loss,5.11974
validation_accuracy,73.88333
validation_loss,5.4001


[34m[1mwandb[0m: Agent Starting Run: ai1wfj4j with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▅▆▆▇▇███
train_loss,█▆▅▄▃▃▂▂▁▁
validation_accuracy,▁▄▅▅▆▇▇███
validation_loss,█▆▅▄▃▃▂▂▁▁

0,1
epoch,10.0
metric_name,24.36667
test_accuracy,23.46
test_loss,2.2977
train_accuracy,23.53889
train_loss,2.2976
validation_accuracy,24.36667
validation_loss,2.29727


[34m[1mwandb[0m: Agent Starting Run: n9cphgk9 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▆▇██
train_loss,█▃▂▁▁
validation_accuracy,▁▆▇██
validation_loss,█▃▂▁▁

0,1
epoch,5.0
metric_name,74.48333
test_accuracy,74.18
test_loss,0.66512
train_accuracy,75.58704
train_loss,0.61789
validation_accuracy,74.48333
validation_loss,0.65145


[34m[1mwandb[0m: Agent Starting Run: csouehuy with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▆▆▇▇████
train_loss,█▅▃▃▂▂▂▁▁▁
validation_accuracy,▁▄▆▆▇▇▇███
validation_loss,█▅▃▃▂▂▂▁▁▁

0,1
epoch,10.0
metric_name,67.93333
test_accuracy,68.03
test_loss,6.60072
train_accuracy,69.67037
train_loss,6.26768
validation_accuracy,67.93333
validation_loss,6.63551


[34m[1mwandb[0m: Agent Starting Run: cjlmq3cw with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▆▇█
train_loss,█▅▃▂▁
validation_accuracy,▁▄▆▇█
validation_loss,█▄▃▂▁

0,1
epoch,5.0
metric_name,87.13333
test_accuracy,86.06
test_loss,0.38233
train_accuracy,88.31667
train_loss,0.32311
validation_accuracy,87.13333
validation_loss,0.34553


[34m[1mwandb[0m: Agent Starting Run: 5jdgcyt7 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,█████▇▇▄▂▁
train_loss,▆▆▆▆▆▇█▃▁▁
validation_accuracy,█████▇▇▄▂▁
validation_loss,▆▆▆▆▆▇█▂▁▁

0,1
epoch,10.0
metric_name,25.25
test_accuracy,24.68
test_loss,2.46086
train_accuracy,25.22037
train_loss,2.36825
validation_accuracy,25.25
validation_loss,2.4135


[34m[1mwandb[0m: Agent Starting Run: f4pf4s8j with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▁▁▁▁▁▁▁▁▁
train_loss,▁▁▁▁▁▁▁▁▁▁
validation_accuracy,▁▁▁▁▁▁▁▁▁▁
validation_loss,▁▁▁▁▁▁▁▁▁▁

0,1
epoch,10.0
metric_name,10.31667
test_accuracy,10.0
test_loss,2.30353
train_accuracy,9.96481
train_loss,2.30363
validation_accuracy,10.31667
validation_loss,2.30263


[34m[1mwandb[0m: Agent Starting Run: 56pottqo with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,█████▄▂▁▁▁
train_loss,█▆▇▇█▃▁▁▁▁
validation_accuracy,████▇▄▂▁▁▁
validation_loss,█▆▇▇█▃▁▁▁▁

0,1
epoch,10.0
metric_name,15.78333
test_accuracy,15.61
test_loss,2.20188
train_accuracy,15.5963
train_loss,2.20055
validation_accuracy,15.78333
validation_loss,2.20139


[34m[1mwandb[0m: Agent Starting Run: 4d2vnp7n with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▆▇█
train_loss,█▄▃▂▁
validation_accuracy,▁▄▆▇█
validation_loss,█▄▃▂▁

0,1
epoch,5.0
metric_name,73.23333
test_accuracy,72.18
test_loss,0.77544
train_accuracy,73.90185
train_loss,0.74027
validation_accuracy,73.23333
validation_loss,0.75013


[34m[1mwandb[0m: Agent Starting Run: 9unr53oj with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▅▆▆▇▇███
train_loss,█▅▄▃▂▂▂▁▁▁
validation_accuracy,▁▄▅▆▇▇▇███
validation_loss,█▅▄▃▂▂▁▁▁▁

0,1
epoch,10.0
metric_name,72.38333
test_accuracy,72.68
test_loss,0.71977
train_accuracy,74.28148
train_loss,0.66952
validation_accuracy,72.38333
validation_loss,0.71789


[34m[1mwandb[0m: Agent Starting Run: 81nzfv64 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▆▇█
train_loss,█▄▃▂▁
validation_accuracy,▁▅▆▇█
validation_loss,█▄▂▂▁

0,1
epoch,5.0
metric_name,85.63333
test_accuracy,84.8
test_loss,0.43551
train_accuracy,86.36481
train_loss,0.39329
validation_accuracy,85.63333
validation_loss,0.40643


[34m[1mwandb[0m: Agent Starting Run: ruqxa9ok with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▆▇▇▇█████
train_loss,█▃▂▂▂▁▁▁▁▁
validation_accuracy,▁▆▆▇▇█████
validation_loss,█▃▂▂▂▁▁▁▁▁

0,1
epoch,10.0
metric_name,85.26667
test_accuracy,84.31
test_loss,0.46382
train_accuracy,85.80741
train_loss,0.42703
validation_accuracy,85.26667
validation_loss,0.43357


[34m[1mwandb[0m: Agent Starting Run: ss9f8mrz with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,██▇▅▁
train_loss,█▄▃▃▁
validation_accuracy,██▇▄▁
validation_loss,█▄▃▃▁

0,1
epoch,5.0
metric_name,35.86667
test_accuracy,35.15
test_loss,2.37828
train_accuracy,35.36111
train_loss,2.37664
validation_accuracy,35.86667
validation_loss,2.36709


[34m[1mwandb[0m: Agent Starting Run: 33d3rh8o with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▅▆▆▇▇▇██
train_loss,█▅▄▃▂▂▂▁▁▁
validation_accuracy,▁▄▅▅▆▇▇▇██
validation_loss,█▅▃▃▂▂▂▁▁▁

0,1
epoch,10.0
metric_name,75.81667
test_accuracy,74.24
test_loss,0.67974
train_accuracy,76.08704
train_loss,0.63985
validation_accuracy,75.81667
validation_loss,0.66676


[34m[1mwandb[0m: Agent Starting Run: 43yveu7y with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▅▆▆▆▆█▆▇█
train_loss,█▅▃▃▃▂▁▂▁▁
validation_accuracy,▁▅▆▆▆▆█▆▇▇
validation_loss,█▅▃▄▃▂▁▂▁▁

0,1
epoch,10.0
metric_name,76.06667
test_accuracy,75.99
test_loss,0.65786
train_accuracy,77.2537
train_loss,0.61039
validation_accuracy,76.06667
validation_loss,0.63135


[34m[1mwandb[0m: Agent Starting Run: b5mlirrs with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,█▁▁▁▁▁▁▁▁▁
train_loss,▁▂▂▃▃▃▅▆▆█
validation_accuracy,█▁▁▁▁▁▁▁▁▁
validation_loss,▁▂▂▃▃▃▅▆▆█

0,1
epoch,10.0
metric_name,10.45
test_accuracy,10.0
test_loss,2.70333
train_accuracy,9.95
train_loss,2.70582
validation_accuracy,10.45
validation_loss,2.68175


[34m[1mwandb[0m: Agent Starting Run: ogvrhpvb with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▆▇▇█████
train_loss,█▅▃▂▂▂▁▁▁▁
validation_accuracy,▁▄▆▇▇▇████
validation_loss,█▅▃▂▂▂▁▁▁▁

0,1
epoch,10.0
metric_name,81.4
test_accuracy,80.97
test_loss,0.51468
train_accuracy,82.73333
train_loss,0.47665
validation_accuracy,81.4
validation_loss,0.49274


[34m[1mwandb[0m: Agent Starting Run: i5u2y4rs with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▂▃▅▆▆▇▇██
train_loss,██▅▄▃▃▂▂▁▁
validation_accuracy,▁▂▃▅▆▆▇▇██
validation_loss,██▅▄▃▃▂▂▁▁

0,1
epoch,10.0
metric_name,75.28333
test_accuracy,75.01
test_loss,0.6969
train_accuracy,75.72963
train_loss,0.67599
validation_accuracy,75.28333
validation_loss,0.68198


[34m[1mwandb[0m: Agent Starting Run: qgr75m52 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▂▆▆▇▇▇█▇█
train_loss,█▇▃▃▂▂▃▁▁▁
validation_accuracy,▁▂▇▇▇▇▇███
validation_loss,█▇▁▃▁▂▃▁▁▁

0,1
epoch,10.0
metric_name,87.61667
test_accuracy,86.67
test_loss,0.40799
train_accuracy,89.00926
train_loss,0.31467
validation_accuracy,87.61667
validation_loss,0.37797


[34m[1mwandb[0m: Agent Starting Run: xmlavy5x with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▅▆▇▇████
train_loss,█▆▅▄▃▂▂▂▁▁
validation_accuracy,▁▄▅▆▇▇▇███
validation_loss,█▆▅▄▃▂▂▂▁▁

0,1
epoch,10.0
metric_name,36.68333
test_accuracy,35.54
test_loss,2.05962
train_accuracy,35.68148
train_loss,2.0584
validation_accuracy,36.68333
validation_loss,2.05568


[34m[1mwandb[0m: Agent Starting Run: oto72soi with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▃▃▁▁█
train_loss,▇▆▁█▇
validation_accuracy,▁▁███
validation_loss,█▆▁█▆

0,1
epoch,5.0
metric_name,10.33333
test_accuracy,10.17
test_loss,3.65409
train_accuracy,10.12963
train_loss,3.65777
validation_accuracy,10.33333
validation_loss,3.60831


[34m[1mwandb[0m: Agent Starting Run: 0dd3v99p with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▆▇▇▇▆███
train_loss,█▅▃▂▃▂▃▂▂▁
validation_accuracy,▁▆▇█▇█▆▇▇▇
validation_loss,█▃▂▁▃▃▅▄▄▄

0,1
epoch,10.0
metric_name,86.41667
test_accuracy,85.35
test_loss,0.41622
train_accuracy,88.55185
train_loss,0.31169
validation_accuracy,86.41667
validation_loss,0.38782


[34m[1mwandb[0m: Agent Starting Run: vptl4p88 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▃▄▅▆▆▇▇██
train_loss,█▆▅▄▃▃▂▂▁▁
validation_accuracy,▁▃▄▅▆▇▇▇██
validation_loss,█▅▄▃▃▂▂▁▁▁

0,1
epoch,10.0
metric_name,88.25
test_accuracy,87.83
test_loss,0.34727
train_accuracy,90.3537
train_loss,0.26005
validation_accuracy,88.25
validation_loss,0.32321


[34m[1mwandb[0m: Agent Starting Run: csuuei22 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▅▇██
train_loss,█▄▂▁▁
validation_accuracy,▁▆▇██
validation_loss,█▄▂▁▁

0,1
epoch,5.0
metric_name,85.63333
test_accuracy,84.52
test_loss,0.43244
train_accuracy,86.44259
train_loss,0.37494
validation_accuracy,85.63333
validation_loss,0.40205


[34m[1mwandb[0m: Agent Starting Run: 1p5axogx with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▂▅▅▆▆▆▆▇█
train_loss,████▇▇▆▅▃▁
validation_accuracy,▁▂▅▅▆▆▆▆▇█
validation_loss,████▇▇▆▅▃▁

0,1
epoch,10.0
metric_name,46.03333
test_accuracy,45.63
test_loss,1.70441
train_accuracy,45.5463
train_loss,1.70222
validation_accuracy,46.03333
validation_loss,1.70711


[34m[1mwandb[0m: Agent Starting Run: rzwun232 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▆▇▇▇▇████
train_loss,█▄▃▃▂▂▁▁▁▁
validation_accuracy,▁▇▇▇▇▇████
validation_loss,█▅▃▃▂▂▁▁▁▁

0,1
epoch,10.0
metric_name,36.16667
test_accuracy,37.15
test_loss,2.09761
train_accuracy,36.53889
train_loss,2.09738
validation_accuracy,36.16667
validation_loss,2.09983


[34m[1mwandb[0m: Agent Starting Run: cgaisrek with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▅▅▇█
train_loss,█▅▃▂▁
validation_accuracy,▁▅▅▇█
validation_loss,█▅▃▂▁

0,1
epoch,5.0
metric_name,39.08333
test_accuracy,38.45
test_loss,1.33651
train_accuracy,38.55741
train_loss,1.33117
validation_accuracy,39.08333
validation_loss,1.33672


[34m[1mwandb[0m: Agent Starting Run: yznk91y1 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▅▆▇▇▇███
train_loss,█▆▄▃▃▂▂▁▁▁
validation_accuracy,▁▄▅▆▇▇▇███
validation_loss,█▆▄▃▂▂▂▁▁▁

0,1
epoch,10.0
metric_name,54.25
test_accuracy,53.46
test_loss,1.85606
train_accuracy,53.72037
train_loss,1.85346
validation_accuracy,54.25
validation_loss,1.85582


[34m[1mwandb[0m: Agent Starting Run: zo4jdswd with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▅▆▇█
train_loss,█▄▃▂▁
validation_accuracy,▁▅▆▇█
validation_loss,█▄▃▂▁

0,1
epoch,5.0
metric_name,71.55
test_accuracy,70.29
test_loss,6.15624
train_accuracy,72.47593
train_loss,5.70272
validation_accuracy,71.55
validation_loss,5.89577


[34m[1mwandb[0m: Agent Starting Run: jsw1bq1b with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▅▅▆▆▆▇██
train_loss,█▅▄▄▃▃▂▂▁▁
validation_accuracy,▁▄▅▅▆▆▆▇██
validation_loss,█▅▄▄▃▃▂▂▁▁

0,1
epoch,10.0
metric_name,78.6
test_accuracy,77.98
test_loss,0.64084
train_accuracy,79.44815
train_loss,0.61347
validation_accuracy,78.6
validation_loss,0.621


[34m[1mwandb[0m: Agent Starting Run: g82zt4p2 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,█▁▁▁▁▁▁▁▁▁
train_loss,█▇▆▅▅▄▃▂▂▁
validation_accuracy,█▁▁▁▁▁▁▁▁▁
validation_loss,█▇▆▅▄▄▃▂▂▁

0,1
epoch,10.0
metric_name,10.35
test_accuracy,10.0
test_loss,2.30255
train_accuracy,9.96111
train_loss,2.30256
validation_accuracy,10.35
validation_loss,2.30246


[34m[1mwandb[0m: Agent Starting Run: sff7pdy9 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▅▆▆▇▇▇▇█
train_loss,█▅▄▃▃▂▂▂▁▁
validation_accuracy,▁▄▆▆▇▇▇▇▇█
validation_loss,█▄▃▃▂▂▂▁▁▁

0,1
epoch,10.0
metric_name,86.95
test_accuracy,85.96
test_loss,0.39344
train_accuracy,88.03889
train_loss,0.32844
validation_accuracy,86.95
validation_loss,0.36445


[34m[1mwandb[0m: Agent Starting Run: 9ha9dew1 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▆▆▇▇▇▇██
train_loss,█▄▃▂▂▂▁▁▁▁
validation_accuracy,▁▄▆▆▇█████
validation_loss,█▄▃▂▂▂▁▁▁▁

0,1
epoch,10.0
metric_name,45.26667
test_accuracy,44.65
test_loss,1.88227
train_accuracy,45.17593
train_loss,1.87898
validation_accuracy,45.26667
validation_loss,1.88242


[34m[1mwandb[0m: Agent Starting Run: h2hh0v34 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▄▆▇█
train_loss,█▅▃▂▁
validation_accuracy,▁▄▇▇█
validation_loss,█▄▂▂▁

0,1
epoch,5.0
metric_name,86.76667
test_accuracy,85.56
test_loss,0.40492
train_accuracy,87.58704
train_loss,0.34335
validation_accuracy,86.76667
validation_loss,0.36973


[34m[1mwandb[0m: Agent Starting Run: if8x1rd2 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▆████▇▇▇█
train_loss,█▃▂▂▁▁▁▁▁▁
validation_accuracy,▁▆████▇▇▇▇
validation_loss,█▃▂▂▁▁▁▁▁▁

0,1
epoch,10.0
metric_name,47.98333
test_accuracy,47.23
test_loss,1.9388
train_accuracy,48.16667
train_loss,1.93465
validation_accuracy,47.98333
validation_loss,1.93743


[34m[1mwandb[0m: Agent Starting Run: r15l4m0h with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▂▂▅▁█
train_loss,█▅▃▇▁
validation_accuracy,▂▃▆▁█
validation_loss,█▅▄█▁

0,1
epoch,5.0
metric_name,85.75
test_accuracy,84.83
test_loss,0.42328
train_accuracy,86.73889
train_loss,0.36456
validation_accuracy,85.75
validation_loss,0.39365


[34m[1mwandb[0m: Agent Starting Run: z5p1xsg0 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▅▆▇▇█████
train_loss,█▄▂▂▁▁▁▁▁▁
validation_accuracy,▁▅▆▇██████
validation_loss,█▄▂▂▁▁▁▁▁▁

0,1
epoch,10.0
metric_name,72.1
test_accuracy,72.05
test_loss,0.94118
train_accuracy,72.85556
train_loss,0.92653
validation_accuracy,72.1
validation_loss,0.93299


[34m[1mwandb[0m: Agent Starting Run: by1qhcf5 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beta: 0.9
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	dataset: fashion_mnist
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


In [None]:
# !pkill -9 -f wandb  # Kill all WandB processes
# !wandb login --relogin
# !wandb login --cloud