In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from keras.datasets import fashion_mnist
import wandb

In [2]:
!wandb login

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33momkarmande[0m ([33momkarmande-iit-madras[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
(X_train, y_train), (X_test, y_test) = fashion_mnist.load_data()
X_train = X_train.reshape(X_train.shape[0], -1) / 255.0
X_test = X_test.reshape(X_test.shape[0], -1) / 255.0

num_classes = 10
y_train_onehot = np.eye(num_classes)[y_train]
y_test_onehot = np.eye(num_classes)[y_test]

split_index = int(0.9 * X_train.shape[0])
X, X_val = X_train[:split_index], X_train[split_index:]
y, y_val = y_train_onehot[:split_index], y_train_onehot[split_index:]

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
[1m29515/29515[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
[1m26421880/26421880[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
[1m5148/5148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz
[1m4422102/4422102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [4]:
def sigmoid(x):
    x = np.clip(x, -500, 500)
    return 1 / (1 + np.exp(-x))

def sigmoid_derivative(x):
    return sigmoid(x) * (1 - sigmoid(x))

def tanh(x):
    x = np.clip(x, -500, 500)
    return np.tanh(x)

def tanh_derivative(x):
    return 1 - np.tanh(x) ** 2

def identity(x):
    return x

def identity_derivative(x):
    return np.ones_like(x)

def relu(x):
    return np.maximum(0, x)

def relu_derivative(x):
    return (x > 0).astype(float)

def softmax(x):
    x = x - np.max(x, axis=1, keepdims=True)
    exp_x = np.exp(x)
    return exp_x / np.sum(exp_x, axis=1, keepdims=True)


In [5]:
def cross_entropy_loss(y_true, y_pred):
    return -np.sum(y_true * np.log(y_pred + 1e-9)) / y_true.shape[0]

def squared_error_loss(y_true, y_pred):
    return np.mean(np.sum((y_true - y_pred) ** 2, axis=1))

def accuracy(y_true, y_pred):
    true_labels = np.argmax(y_true, axis=1)
    pred_labels = np.argmax(y_pred, axis=1)
    return np.mean(true_labels == pred_labels)

In [6]:
def initialize_weights(shape, method="xavier"):
    if method == "random":
        return np.random.randn(*shape) * 0.01
    elif method == "xavier":
        return np.random.randn(*shape) * np.sqrt(2.0 / shape[0])
    else:
        raise ValueError("Unknown initialization method: Choose 'random' or 'xavier'")

def clip_gradients(grads, clip_value=5.0):
    return [np.clip(g, -clip_value, clip_value) for g in grads]

In [13]:
class Model:
  def get_activation_functions(self, activation_type):
        activations = {
            "sigmoid": (sigmoid, sigmoid_derivative),
            "tanh": (tanh, tanh_derivative),
            "ReLu": (relu, relu_derivative),
            "identity": (identity, identity_derivative)
        }
        return activations.get(activation_type, (sigmoid, sigmoid_derivative))

  def __init__(self, il_neuron, hl_neuron, hl_count, ol_neuron, opt="adam", lr=0.1, batch=4, init="xavier", act="tanh", loss="cross_entropy", decay=0):
    self.layers = [il_neuron] + [hl_neuron]*hl_count + [ol_neuron]
    self.weights = []
    self.biases = []
    self.opt = opt
    self.lr = lr
    self.batch = batch
    self.init = init
    self.act = act
    self.loss = loss
    self.decay = decay
    self.momentum = 0.9
    self.beta1 = 0.9
    self.beta2 = 0.999
    self.epsilon = 1e-6
    self.t = 0

    self.velocities = []
    self.velocities_b = []
    self.squared_grads = []
    self.squared_grads_b = []
    self.m_t_w = []
    self.m_t_b = []
    self.v_t_w = []
    self.v_t_b = []

    self.activation_func, self.activation_derivative = self.get_activation_functions(act)
    self.loss_func = cross_entropy_loss if loss == "cross_entropy" else squared_error_loss

    #initializing and giving shape
    for i in range(len(self.layers) - 1):
        weight_matrix = initialize_weights((self.layers[i], self.layers[i + 1]), method=self.init)
        bias_vector = np.zeros((1, self.layers[i + 1]))
        self.weights.append(weight_matrix)
        self.biases.append(bias_vector)
        self.velocities.append(np.zeros_like(weight_matrix))
        self.velocities_b.append(np.zeros_like(bias_vector))
        self.squared_grads.append(np.zeros_like(weight_matrix))
        self.squared_grads_b.append(np.zeros_like(bias_vector))
        self.m_t_w.append(np.zeros_like(weight_matrix))
        self.m_t_b.append(np.zeros_like(bias_vector))
        self.v_t_w.append(np.zeros_like(weight_matrix))
        self.v_t_b.append(np.zeros_like(bias_vector))

  def feedForward(self, X):
    activations = [X]
    for i in range(len(self.weights) - 1):
      z = np.dot(activations[-1], self.weights[i]) + self.biases[i]
      #print(z.shape)
      a = self.activation_func(z)
      activations.append(a)

    z_output = np.dot(activations[-1], self.weights[-1]) + self.biases[-1]
    a_output = softmax(z_output)
    activations.append(a_output)

    return activations

  def backProp(self, X_batch, y_batch, activations):
    batch_size = X_batch.shape[0]
    grads_w = [np.zeros_like(w) for w in self.weights]
    grads_b = [np.zeros_like(b) for b in self.biases]

    if self.loss == "cross_entropy":
        dz = activations[-1] - y_batch
    else:
        dz = (activations[-1] - y_batch) * 2 / batch_size
    grads_w[-1] = np.dot(activations[-2].T, dz) / batch_size + (self.decay * self.weights[-1])
    grads_b[-1] = np.sum(dz, axis=0, keepdims=True) / batch_size

    for i in range(len(self.weights) - 2, -1, -1):
        dz = np.dot(dz, self.weights[i + 1].T) * self.activation_derivative(activations[i + 1])
        grads_w[i] = np.dot(activations[i].T, dz) / batch_size + (self.decay * self.weights[i])
        grads_b[i] = np.sum(dz, axis=0, keepdims=True) / batch_size

    self.update_weights(grads_w, grads_b)

  def update_weights(self, grads_w, grads_b):
    grads_w = clip_gradients(grads_w)
    grads_b = clip_gradients(grads_b)

    if self.opt == "sgd":
        for i in range(len(self.weights)):
            self.weights[i] -= self.lr * (grads_w[i] + self.decay * self.weights[i])
            self.biases[i] -= self.lr * grads_b[i]

    elif self.opt == "momentum":
        for i in range(len(self.weights)):
            self.velocities[i] = self.momentum * self.velocities[i] - self.lr * (grads_w[i] + self.decay * self.weights[i])
            self.weights[i] += self.velocities[i]

            self.velocities_b[i] = self.momentum * self.velocities_b[i] - self.lr * grads_b[i]
            self.biases[i] += self.velocities_b[i]

    elif self.opt == "nesterov":
        for i in range(len(self.weights)):
            lookahead_w = self.weights[i] - self.momentum * self.velocities[i]
            lookahead_b = self.biases[i] - self.momentum * self.velocities_b[i]
            grad_w_lookahead = grads_w[i]
            grad_b_lookahead = grads_b[i]
            self.velocities[i] = self.momentum * self.velocities[i] - self.lr * (grad_w_lookahead + self.decay * lookahead_w)
            self.weights[i] += self.velocities[i]
            self.velocities_b[i] = self.momentum * self.velocities_b[i] - self.lr * grad_b_lookahead
            self.biases[i] += self.velocities_b[i]

    elif self.opt == "rmsprop":
        for i in range(len(self.weights)):
            self.squared_grads[i] = self.beta2 * self.squared_grads[i] + (1 - self.beta2) * grads_w[i]**2
            self.weights[i] -= self.lr * (grads_w[i] + self.decay * self.weights[i]) / (np.sqrt(self.squared_grads[i]) + self.epsilon)
            self.squared_grads_b[i] = self.beta2 * self.squared_grads_b[i] + (1 - self.beta2) * grads_b[i]**2
            self.biases[i] -= self.lr * grads_b[i] / (np.sqrt(self.squared_grads_b[i]) + self.epsilon)

    elif self.opt == "adam":
        self.t += 1
        for i in range(len(self.weights)):
            self.m_t_w[i] = self.beta1 * self.m_t_w[i] + (1 - self.beta1) * grads_w[i]
            self.v_t_w[i] = self.beta2 * self.v_t_w[i] + (1 - self.beta2) * (grads_w[i]**2)
            self.m_t_b[i] = self.beta1 * self.m_t_b[i] + (1 - self.beta1) * grads_b[i]
            self.v_t_b[i] = self.beta2 * self.v_t_b[i] + (1 - self.beta2) * (grads_b[i]**2)
            m_hat_w = self.m_t_w[i] / (1 - self.beta1**self.t)
            v_hat_w = self.v_t_w[i] / (1 - self.beta2**self.t)
            m_hat_b = self.m_t_b[i] / (1 - self.beta1**self.t)
            v_hat_b = self.v_t_b[i] / (1 - self.beta2**self.t)
            self.weights[i] -= self.lr * (m_hat_w / (np.sqrt(v_hat_w) + self.epsilon) + self.decay * self.weights[i])
            self.biases[i] -= self.lr * (m_hat_b / (np.sqrt(v_hat_b) + self.epsilon))

  def train(self, epochs=10):
    for epoch in range(epochs):
        for i in range(0, X.shape[0], self.batch):
            X_batch = X[i:i + self.batch]
            y_batch = y[i:i + self.batch]
            activations = self.feedForward(X_batch)
            self.backProp(X_batch, y_batch, activations)

        y_train_pred = self.feedForward(X)[-1]
        train_loss = self.loss_func(y, y_train_pred)
        train_accuracy = np.mean(np.argmax(y_train_pred, axis=1) == np.argmax(y, axis=1))

        y_val_pred = self.feedForward(X_val)[-1]
        val_loss = self.loss_func(y_val, y_val_pred)
        val_accuracy = np.mean(np.argmax(y_val_pred, axis=1) == np.argmax(y_val, axis=1))

        wandb.log({"Epoch": epoch+1, "Train Loss": train_loss, "Train Accuracy": train_accuracy*100,
                    "Validation Loss": val_loss, "Validation Accuracy": val_accuracy*100})

        print(f"Epoch {epoch+1}: Train Loss={train_loss:.4f}, Train Acc={train_accuracy*100:.4f}, Val Loss={val_loss:.4f}, Val Acc={val_accuracy*100:.4f}")



In [14]:
sweep_configuration = {
    'method': "bayes",
    'metric': {'name': 'Validation Accuracy', 'goal': 'maximize'},
    'parameters': {
        'epochs': {'values': [5, 10]},
        'num_hidden_layers': {'values': [3, 4, 5]},
        'hidden_layer_size': {'values': [32, 64, 128]},
        'learning_rate': {'values': [1e-3, 1e-4]},
        'weight_decay': {'values': [0, 0.0005, 0.5]},
        'optimizer_name': {'values': ['sgd', 'momentum', 'nesterov', 'rmsprop', 'adam']},
        'batch_size': {'values': [16, 32, 64]},
        'init_type': {'values': ['random', 'xavier']},
        'activation_type': {'values': ['sigmoid', 'tanh', 'ReLU']},
        'loss_type': {'values': ['cross_entropy']}
    }
}

sweep_id = wandb.sweep(sweep_configuration, project="assignment01-temp")

Create sweep with ID: of6uaseh
Sweep URL: https://wandb.ai/omkarmande-iit-madras/assignment01-temp/sweeps/of6uaseh


In [15]:
def train_sweep():
    wandb.init(project="assignment01-temp",config=sweep_configuration)
    run_name = f"hl_{wandb.config.num_hidden_layers}_bs_{wandb.config.batch_size}_{wandb.config.activation_type}_{wandb.config.optimizer_name}_lr_{wandb.config.learning_rate}"
    wandb.run.name = run_name

    config = wandb.config

    model = Model(
        il_neuron=784, hl_neuron=config.hidden_layer_size, hl_count=config.num_hidden_layers, ol_neuron=10,
        opt=config.optimizer_name, lr=config.learning_rate, batch=config.batch_size,
        init=config.init_type, act=config.activation_type, loss=config.loss_type, decay=config.weight_decay
    )

    model.train(config.epochs)

# Run the sweep
wandb.agent(sweep_id, train_sweep, count=100)

[34m[1mwandb[0m: Agent Starting Run: oqsxzkkp with config:
[34m[1mwandb[0m: 	activation_type: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_layer_size: 128
[34m[1mwandb[0m: 	init_type: random
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss_type: cross_entropy
[34m[1mwandb[0m: 	num_hidden_layers: 4
[34m[1mwandb[0m: 	optimizer_name: sgd
[34m[1mwandb[0m: 	weight_decay: 0.5


Epoch 1: Train Loss=2.3027, Train Acc=9.9444, Val Loss=2.3026, Val Acc=10.5000
Epoch 2: Train Loss=2.3026, Train Acc=10.0833, Val Loss=2.3027, Val Acc=9.2500
Epoch 3: Train Loss=2.3026, Train Acc=10.0833, Val Loss=2.3027, Val Acc=9.2500
Epoch 4: Train Loss=2.3026, Train Acc=10.0833, Val Loss=2.3027, Val Acc=9.2500
Epoch 5: Train Loss=2.3026, Train Acc=10.0833, Val Loss=2.3027, Val Acc=9.2500


0,1
Epoch,▁▃▅▆█
Train Accuracy,▁████
Train Loss,█▁▁▁▁
Validation Accuracy,█▁▁▁▁
Validation Loss,▁▆▇██

0,1
Epoch,5.0
Train Accuracy,10.08333
Train Loss,2.30258
Validation Accuracy,9.25
Validation Loss,2.30269


[34m[1mwandb[0m: Agent Starting Run: 9ukdo161 with config:
[34m[1mwandb[0m: 	activation_type: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_layer_size: 32
[34m[1mwandb[0m: 	init_type: random
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss_type: cross_entropy
[34m[1mwandb[0m: 	num_hidden_layers: 5
[34m[1mwandb[0m: 	optimizer_name: adam
[34m[1mwandb[0m: 	weight_decay: 0.5


Epoch 1: Train Loss=2.3026, Train Acc=9.9963, Val Loss=2.3028, Val Acc=10.0333
Epoch 2: Train Loss=2.3026, Train Acc=9.9963, Val Loss=2.3028, Val Acc=10.0333
Epoch 3: Train Loss=2.3026, Train Acc=9.9963, Val Loss=2.3028, Val Acc=10.0333
Epoch 4: Train Loss=2.3026, Train Acc=9.9963, Val Loss=2.3028, Val Acc=10.0333
Epoch 5: Train Loss=2.3026, Train Acc=9.9963, Val Loss=2.3029, Val Acc=10.0333


0,1
Epoch,▁▃▅▆█
Train Accuracy,▁▁▁▁▁
Train Loss,█▇▅▃▁
Validation Accuracy,▁▁▁▁▁
Validation Loss,▁▄▅▇█

0,1
Epoch,5.0
Train Accuracy,9.9963
Train Loss,2.30263
Validation Accuracy,10.03333
Validation Loss,2.30285


[34m[1mwandb[0m: Agent Starting Run: fbssaedj with config:
[34m[1mwandb[0m: 	activation_type: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_layer_size: 64
[34m[1mwandb[0m: 	init_type: random
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss_type: cross_entropy
[34m[1mwandb[0m: 	num_hidden_layers: 3
[34m[1mwandb[0m: 	optimizer_name: sgd
[34m[1mwandb[0m: 	weight_decay: 0


Epoch 1: Train Loss=2.3026, Train Acc=12.7130, Val Loss=2.3026, Val Acc=11.5000
Epoch 2: Train Loss=2.3026, Train Acc=11.9759, Val Loss=2.3026, Val Acc=10.8167
Epoch 3: Train Loss=2.3026, Train Acc=11.0370, Val Loss=2.3026, Val Acc=9.9000
Epoch 4: Train Loss=2.3026, Train Acc=10.4722, Val Loss=2.3026, Val Acc=9.4667
Epoch 5: Train Loss=2.3026, Train Acc=10.1963, Val Loss=2.3026, Val Acc=9.2833


0,1
Epoch,▁▃▅▆█
Train Accuracy,█▆▃▂▁
Train Loss,█▆▄▃▁
Validation Accuracy,█▆▃▂▁
Validation Loss,▁▃▅▆█

0,1
Epoch,5.0
Train Accuracy,10.1963
Train Loss,2.30258
Validation Accuracy,9.28333
Validation Loss,2.3026


[34m[1mwandb[0m: Agent Starting Run: w6jfdywh with config:
[34m[1mwandb[0m: 	activation_type: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_layer_size: 32
[34m[1mwandb[0m: 	init_type: random
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss_type: cross_entropy
[34m[1mwandb[0m: 	num_hidden_layers: 5
[34m[1mwandb[0m: 	optimizer_name: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.5


Epoch 1: Train Loss=2.3026, Train Acc=9.9963, Val Loss=2.3028, Val Acc=10.0333
Epoch 2: Train Loss=2.3026, Train Acc=9.9963, Val Loss=2.3028, Val Acc=10.0333
Epoch 3: Train Loss=2.3026, Train Acc=9.9963, Val Loss=2.3028, Val Acc=10.0333
Epoch 4: Train Loss=2.3026, Train Acc=9.9963, Val Loss=2.3028, Val Acc=10.0333
Epoch 5: Train Loss=2.3026, Train Acc=9.9963, Val Loss=2.3028, Val Acc=10.0333
Epoch 6: Train Loss=2.3026, Train Acc=9.9963, Val Loss=2.3028, Val Acc=10.0333
Epoch 7: Train Loss=2.3026, Train Acc=9.9963, Val Loss=2.3028, Val Acc=10.0333
Epoch 8: Train Loss=2.3026, Train Acc=9.9963, Val Loss=2.3028, Val Acc=10.0333
Epoch 9: Train Loss=2.3026, Train Acc=9.9963, Val Loss=2.3028, Val Acc=10.0333
Epoch 10: Train Loss=2.3026, Train Acc=9.9963, Val Loss=2.3028, Val Acc=10.0333


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▁▁▁▁▁▁▁▁▁
Train Loss,█▄▃▂▂▂▁▁▁▁
Validation Accuracy,▁▁▁▁▁▁▁▁▁▁
Validation Loss,█▂▁▁▁▂▃▃▄▄

0,1
Epoch,10.0
Train Accuracy,9.9963
Train Loss,2.3026
Validation Accuracy,10.03333
Validation Loss,2.3028


[34m[1mwandb[0m: Agent Starting Run: 6udeelgs with config:
[34m[1mwandb[0m: 	activation_type: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_layer_size: 32
[34m[1mwandb[0m: 	init_type: random
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss_type: cross_entropy
[34m[1mwandb[0m: 	num_hidden_layers: 5
[34m[1mwandb[0m: 	optimizer_name: adam
[34m[1mwandb[0m: 	weight_decay: 0.0005


Epoch 1: Train Loss=2.3026, Train Acc=9.9963, Val Loss=2.3027, Val Acc=10.0333
Epoch 2: Train Loss=2.3026, Train Acc=9.9963, Val Loss=2.3027, Val Acc=10.0333
Epoch 3: Train Loss=2.3026, Train Acc=9.9963, Val Loss=2.3027, Val Acc=10.0333
Epoch 4: Train Loss=2.3026, Train Acc=9.9963, Val Loss=2.3027, Val Acc=10.0333
Epoch 5: Train Loss=2.3026, Train Acc=9.9963, Val Loss=2.3027, Val Acc=10.0333
Epoch 6: Train Loss=2.3026, Train Acc=9.9963, Val Loss=2.3027, Val Acc=10.0333
Epoch 7: Train Loss=2.3026, Train Acc=9.9963, Val Loss=2.3027, Val Acc=10.0333
Epoch 8: Train Loss=2.3026, Train Acc=9.9963, Val Loss=2.3027, Val Acc=10.0333
Epoch 9: Train Loss=2.3026, Train Acc=9.9963, Val Loss=2.3027, Val Acc=10.0333
Epoch 10: Train Loss=2.3026, Train Acc=9.9963, Val Loss=2.3027, Val Acc=10.0333


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▁▁▁▁▁▁▁▁▁
Train Loss,▁█▇▆▅▅▅▄▄▄
Validation Accuracy,▁▁▁▁▁▁▁▁▁▁
Validation Loss,▁███▇▇▇▇▇▇

0,1
Epoch,10.0
Train Accuracy,9.9963
Train Loss,2.30263
Validation Accuracy,10.03333
Validation Loss,2.30273


In [None]:
#best configuration
sweep_configuration = {
    'method': "bayes",
    'metric': {'name': 'Validation Accuracy', 'goal': 'maximize'},
    'parameters': {
        'epochs': {'values': [10]},
        'num_hidden_layers': {'values': [4]},
        'hidden_layer_size': {'values': [128]},
        'learning_rate': {'values': [1e-4]},
        'weight_decay': {'values': [0]},
        'optimizer_name': {'values': ['adam']},
        'batch_size': {'values': [32]},
        'init_type': {'values': ['xavier']},
        'activation_type': {'values': ['tanh']},
        'loss_type': {'values': ['cross_entropy']}
    }
}

sweep_id = wandb.sweep(sweep_configuration, project="assignment01-temp")

In [None]:
def train_sweep():
    wandb.init(project="assignment01-temp",config=sweep_configuration)
    run_name = f"hl_{wandb.config.num_hidden_layers}_bs_{wandb.config.batch_size}_{wandb.config.activation_type}_{wandb.config.optimizer_name}_lr_{wandb.config.learning_rate}"
    wandb.run.name = run_name

    config = wandb.config

    model = Model(
        il_neuron=784, hl_neuron=config.hidden_layer_size, hl_count=config.num_hidden_layers, ol_neuron=10,
        opt=config.optimizer_name, lr=config.learning_rate, batch=config.batch_size,
        init=config.init_type, act=config.activation_type, loss=config.loss_type, decay=config.weight_decay
    )

    model.train(config.epochs)

# Run the sweep
wandb.agent(sweep_id, train_sweep, count=1)