In [18]:
# Authenticate this session with Weights & Biases.
# NOTE(review): this cell's execution count (18) is later than that of the cell below
# that imports `wandb` (2), so on a fresh kernel this line runs before the import.
wandb.login()



True

Q2


In [2]:
import numpy as np
import pandas as pd
import wandb
from tensorflow.keras.datasets import fashion_mnist

# Open the W&B run that the Q2 training loop logs into.
wandb.init(project="DLassignment1")

# Fashion-MNIST arrives as (images, integer labels) pairs for train and test.
(X_train, y_train), (X_test, y_test) = fashion_mnist.load_data()

# Scale pixels into [0, 1] and flatten every 28x28 image into a 784-long row.
X_train = (X_train / 255.0).reshape(-1, 784)
X_test = (X_test / 255.0).reshape(-1, 784)

# One-hot targets: row k of the identity matrix is the encoding of class k.
num_classes = 10
y_train_onehot, y_test_onehot = (
    np.eye(num_classes)[labels] for labels in (y_train, y_test)
)


In [3]:
class NeuralNetwork:
    """
    Fully connected classifier: ReLU hidden layers, softmax output, mini-batch
    gradient descent on the cross-entropy loss.

    Data is row-major: ``X`` is (samples, features) and ``Y`` is one-hot
    (samples, classes).
    """

    def __init__(self, input_size, hidden_layers, output_size):
        """
        :param input_size: Number of input features (784 for Fashion-MNIST)
        :param hidden_layers: Sequence of neurons per hidden layer [128, 64, ...]
        :param output_size: Number of classes (10 for Fashion-MNIST)
        """
        # list() lets callers pass a tuple of hidden sizes as well as a list.
        self.layers = [input_size] + list(hidden_layers) + [output_size]
        self.weights = []
        self.biases = []

        # Small Gaussian weights, zero biases; one (fan_in, fan_out) matrix per layer.
        for fan_in, fan_out in zip(self.layers[:-1], self.layers[1:]):
            self.weights.append(np.random.randn(fan_in, fan_out) * 0.01)
            self.biases.append(np.zeros((1, fan_out)))

    def relu(self, Z):
        """Element-wise max(0, Z)."""
        return np.maximum(0, Z)

    def relu_derivative(self, Z):
        """Derivative of ReLU w.r.t. its input: 1.0 where Z > 0, else 0.0."""
        return (Z > 0).astype(float)

    def softmax(self, Z):
        """Row-wise softmax; the row maximum is subtracted first to avoid overflow."""
        expZ = np.exp(Z - np.max(Z, axis=1, keepdims=True))
        return expZ / np.sum(expZ, axis=1, keepdims=True)

    def forward(self, X):
        """
        Forward propagation.

        :return: (activations, zs) where ``activations[0]`` is X, ``activations[-1]``
                 is the softmax output, and ``zs`` holds each layer's pre-activation.
        """
        A = X
        activations = [X]
        zs = []

        # Every layer except the last uses ReLU.
        for W, b in zip(self.weights[:-1], self.biases[:-1]):
            Z = np.dot(A, W) + b
            A = self.relu(Z)
            zs.append(Z)
            activations.append(A)

        # Output layer
        Z = np.dot(A, self.weights[-1]) + self.biases[-1]
        A = self.softmax(Z)
        zs.append(Z)
        activations.append(A)

        return activations, zs

    def compute_loss(self, Y_hat, Y):
        """
        Mean cross-entropy between predicted probabilities ``Y_hat`` and one-hot ``Y``.
        A small constant keeps log() finite when a probability is exactly 0.
        """
        m = Y.shape[0]
        return -np.sum(Y * np.log(Y_hat + 1e-8)) / m

    def backward(self, activations, zs, X, Y, learning_rate):
        """
        Backpropagation followed by an in-place gradient-descent step.

        :param activations: Layer outputs returned by ``forward``
        :param zs: Pre-activations returned by ``forward``
        :param X: Input batch (only its row count is used, to average gradients)
        :param Y: One-hot targets for the batch
        :param learning_rate: Step size
        """
        m = X.shape[0]
        gradients = [None] * len(self.weights)

        # Softmax + cross-entropy: the output-layer error is simply (prediction - target).
        dZ = activations[-1] - Y
        gradients[-1] = (np.dot(activations[-2].T, dZ) / m, np.sum(dZ, axis=0, keepdims=True) / m)

        # Hidden layers, last to first.
        for i in range(len(self.weights) - 2, -1, -1):
            dA = np.dot(dZ, self.weights[i + 1].T)
            dZ = dA * self.relu_derivative(zs[i])
            gradients[i] = (np.dot(activations[i].T, dZ) / m, np.sum(dZ, axis=0, keepdims=True) / m)

        # All gradients are computed before any parameter is touched.
        for i in range(len(self.weights)):
            self.weights[i] -= learning_rate * gradients[i][0]
            self.biases[i] -= learning_rate * gradients[i][1]

    def train(self, X, Y, epochs, batch_size, learning_rate):
        """
        Training loop. Logs loss and training accuracy to wandb after every epoch.

        Batches are taken in order (no shuffling). The final batch may be smaller
        than ``batch_size``; it is still used, and the epoch loss is the
        per-sample mean over all samples.

        :raises ValueError: if X has no samples or batch_size is not positive
        """
        num_samples = X.shape[0]
        if num_samples == 0:
            raise ValueError("X must contain at least one sample")
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")

        for epoch in range(epochs):
            total_loss = 0.0

            # Stepping by batch_size covers every sample, including a trailing
            # partial batch that integer division of the batch count would drop.
            for start in range(0, num_samples, batch_size):
                X_batch = X[start:start + batch_size]
                Y_batch = Y[start:start + batch_size]

                activations, zs = self.forward(X_batch)
                # Weight by batch size so a short final batch is not over-counted.
                total_loss += self.compute_loss(activations[-1], Y_batch) * X_batch.shape[0]
                self.backward(activations, zs, X_batch, Y_batch, learning_rate)

            loss = total_loss / num_samples
            accuracy = self.evaluate(X, Y)

            # Log metrics to wandb
            wandb.log({"epoch": epoch + 1, "loss": loss, "accuracy": accuracy})

            print(f"Epoch {epoch + 1}/{epochs} - Loss: {loss:.4f} - Accuracy: {accuracy:.4f}")

    def predict(self, X):
        """Return the index of the most probable class for every row of X."""
        activations, _ = self.forward(X)
        return np.argmax(activations[-1], axis=1)

    def evaluate(self, X, Y):
        """Return the fraction of rows whose predicted class matches one-hot Y."""
        predictions = self.predict(X)
        return np.mean(np.argmax(Y, axis=1) == predictions)


In [4]:
# Architecture: 784 inputs -> hidden layers -> 10 classes.
input_size, output_size = 784, 10
hidden_layers = [128, 64]  # Modify this list to change hidden layers

# Optimisation settings.
epochs, batch_size, learning_rate = 10, 64, 0.01

# Build the network and fit it on the training split (per-epoch metrics go to wandb).
nn = NeuralNetwork(input_size, hidden_layers, output_size)
nn.train(X_train, y_train_onehot, epochs, batch_size, learning_rate)

# Report accuracy on the held-out test split, both on screen and in wandb.
test_accuracy = nn.evaluate(X_test, y_test_onehot)
print(f"Test Accuracy: {test_accuracy:.4f}")
wandb.log({"test_accuracy": test_accuracy})


Epoch 1/10 - Loss: 2.3013 - Accuracy: 0.1863
Epoch 2/10 - Loss: 2.1585 - Accuracy: 0.3597
Epoch 3/10 - Loss: 1.2095 - Accuracy: 0.6232
Epoch 4/10 - Loss: 0.8978 - Accuracy: 0.6763
Epoch 5/10 - Loss: 0.8278 - Accuracy: 0.7028
Epoch 6/10 - Loss: 0.7827 - Accuracy: 0.7217
Epoch 7/10 - Loss: 0.7394 - Accuracy: 0.7382
Epoch 8/10 - Loss: 0.6937 - Accuracy: 0.7527
Epoch 9/10 - Loss: 0.6456 - Accuracy: 0.7668
Epoch 10/10 - Loss: 0.6057 - Accuracy: 0.7790
Test Accuracy: 0.7715


Q3


In [5]:
import numpy as np

class NeuralNetwork:
    """
    Fully connected network with a pluggable optimizer.

    Data is column-major: ``X`` is (features, samples) and ``y`` is
    (outputs, samples). Weight matrices have shape (fan_out, fan_in) and
    biases (fan_out, 1). The same activation is used on every layer,
    including the output layer.
    """

    def __init__(self, layers, activation='sigmoid', optimizer='sgd', learning_rate=0.01, **optimizer_params):
        """
        :param layers: Layer sizes, e.g. [2, 3, 1] (input, hidden..., output)
        :param activation: Name understood by ``get_activation``
        :param optimizer: Name understood by ``get_optimizer``
        :param learning_rate: Step size handed to the optimizer
        :param optimizer_params: Extra keyword arguments for the optimizer constructor
        """
        self.layers = layers
        self.activation = self.get_activation(activation)
        self.weights = [np.random.randn(y, x) for x, y in zip(layers[:-1], layers[1:])]
        self.biases = [np.random.randn(y, 1) for y in layers[1:]]
        self.optimizer = self.get_optimizer(optimizer, learning_rate, **optimizer_params)

    def get_activation(self, name):
        """
        Return ``(f, f_prime)`` for the named activation.

        Both callables take the PRE-activation ``z``; ``backward`` evaluates
        ``f_prime`` on the stored ``zs``.

        :raises ValueError: if the name is not supported
        """
        if name == 'sigmoid':
            def sigmoid(z):
                return 1 / (1 + np.exp(-z))

            def sigmoid_prime(z):
                # d/dz sigmoid(z) = s * (1 - s) with s = sigmoid(z). The formula must be
                # applied to s, not to z itself, because callers pass pre-activations.
                s = sigmoid(z)
                return s * (1 - s)

            return sigmoid, sigmoid_prime
        # Add more activation functions as needed
        raise ValueError(f"Unsupported activation: {name!r}")

    def get_optimizer(self, name, learning_rate, **params):
        """
        Instantiate the optimizer registered under ``name``.

        The optimizer classes are looked up as module-level names when this method runs.

        :raises KeyError: if the name is not registered
        """
        optimizers = {
            'sgd': SGD,
            'momentum': Momentum,
            'nag': NAG,
            'rmsprop': RMSprop,
            'adam': Adam,
            'nadam': Nadam
        }
        return optimizers[name](learning_rate, **params)

    def forward(self, X):
        """
        Run X through every layer, caching pre-activations in ``self.zs`` and
        layer outputs in ``self.activations`` for use by ``backward``.

        :return: Output of the last layer
        """
        self.activations = [X]
        self.zs = []
        act = self.activation[0]
        for w, b in zip(self.weights, self.biases):
            z = np.dot(w, self.activations[-1]) + b
            self.zs.append(z)
            self.activations.append(act(z))
        return self.activations[-1]

    def backward(self, X, y):
        """
        Compute batch-averaged gradients from the values cached by the last ``forward`` call.

        The output error is taken as ``activation - y``, which is the gradient of
        the cross-entropy loss w.r.t. the output pre-activation for a sigmoid output.

        :return: (nabla_w, nabla_b), lists aligned with ``self.weights`` / ``self.biases``
        """
        m = X.shape[1]
        act_prime = self.activation[1]
        delta = self.activations[-1] - y
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]

        # l counts layers from the end: l = 1 is the output layer.
        for l in range(1, len(self.layers)):
            nabla_b[-l] = np.sum(delta, axis=1, keepdims=True) / m
            nabla_w[-l] = np.dot(delta, self.activations[-l-1].T) / m
            if l < len(self.layers) - 1:
                # Push the error one layer back through the weights and the activation slope.
                delta = np.dot(self.weights[-l].T, delta) * act_prime(self.zs[-l-1])

        return nabla_w, nabla_b

    def train(self, X, y, epochs, batch_size):
        """Mini-batch training: batches are consecutive column slices, taken in order."""
        for _ in range(epochs):
            for i in range(0, X.shape[1], batch_size):
                X_batch = X[:, i:i+batch_size]
                y_batch = y[:, i:i+batch_size]
                self.forward(X_batch)
                nabla_w, nabla_b = self.backward(X_batch, y_batch)
                self.optimizer.update(self.weights, self.biases, nabla_w, nabla_b)

class Optimizer:
    """Base class for the list-based optimizers; it only records the step size."""

    def __init__(self, learning_rate):
        # Subclasses read this attribute in their update() methods.
        self.learning_rate = learning_rate

class SGD(Optimizer):
    """Plain gradient descent: every parameter moves against its gradient."""

    def update(self, weights, biases, nabla_w, nabla_b):
        """Apply one in-place step to each layer's weights and biases."""
        for layer in range(len(weights)):
            weights[layer] -= self.learning_rate * nabla_w[layer]
            biases[layer] -= self.learning_rate * nabla_b[layer]

class Momentum(Optimizer):
    """Gradient descent with a velocity term that accumulates past steps."""

    def __init__(self, learning_rate, momentum=0.9):
        super().__init__(learning_rate)
        self.momentum = momentum
        # Velocities are allocated on the first update, once parameter shapes are known.
        self.v_w = None
        self.v_b = None

    def update(self, weights, biases, nabla_w, nabla_b):
        """Refresh each layer's velocity, then add it to the parameters in place."""
        if self.v_w is None:
            self.v_w = [np.zeros_like(w) for w in weights]
            self.v_b = [np.zeros_like(b) for b in biases]

        mu, lr = self.momentum, self.learning_rate
        for layer in range(len(weights)):
            self.v_w[layer] = mu * self.v_w[layer] - lr * nabla_w[layer]
            self.v_b[layer] = mu * self.v_b[layer] - lr * nabla_b[layer]
            weights[layer] += self.v_w[layer]
            biases[layer] += self.v_b[layer]

class NAG(Optimizer):
    """Nesterov-style momentum expressed through the previous and the new velocity."""

    def __init__(self, learning_rate, momentum=0.9):
        super().__init__(learning_rate)
        self.momentum = momentum
        # Velocities are allocated on the first update, once parameter shapes are known.
        self.v_w = None
        self.v_b = None

    def update(self, weights, biases, nabla_w, nabla_b):
        """Update velocities, then move parameters by -mu*v_old + (1 + mu)*v_new."""
        if self.v_w is None:
            self.v_w = [np.zeros_like(w) for w in weights]
            self.v_b = [np.zeros_like(b) for b in biases]

        mu, lr = self.momentum, self.learning_rate
        for layer in range(len(weights)):
            # Keep the old velocities; the assignments below rebind the list slots.
            old_v_w = self.v_w[layer]
            old_v_b = self.v_b[layer]
            self.v_w[layer] = mu * self.v_w[layer] - lr * nabla_w[layer]
            self.v_b[layer] = mu * self.v_b[layer] - lr * nabla_b[layer]
            weights[layer] += -mu * old_v_w + (1 + mu) * self.v_w[layer]
            biases[layer] += -mu * old_v_b + (1 + mu) * self.v_b[layer]

class RMSprop(Optimizer):
    """Scales each step by a running root-mean-square of recent gradients."""

    def __init__(self, learning_rate, decay_rate=0.9, epsilon=1e-8):
        super().__init__(learning_rate)
        self.decay_rate = decay_rate
        self.epsilon = epsilon
        # Running averages of squared gradients, allocated on the first update.
        self.s_w = None
        self.s_b = None

    def update(self, weights, biases, nabla_w, nabla_b):
        """Update the squared-gradient averages, then take a normalised in-place step."""
        if self.s_w is None:
            self.s_w = [np.zeros_like(w) for w in weights]
            self.s_b = [np.zeros_like(b) for b in biases]

        rho, lr, eps = self.decay_rate, self.learning_rate, self.epsilon
        for layer in range(len(weights)):
            self.s_w[layer] = rho * self.s_w[layer] + (1 - rho) * np.square(nabla_w[layer])
            self.s_b[layer] = rho * self.s_b[layer] + (1 - rho) * np.square(nabla_b[layer])
            weights[layer] -= lr * nabla_w[layer] / (np.sqrt(self.s_w[layer]) + eps)
            biases[layer] -= lr * nabla_b[layer] / (np.sqrt(self.s_b[layer]) + eps)

class Adam(Optimizer):
    """Adam: bias-corrected running averages of the gradient and its square."""

    def __init__(self, learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-8):
        super().__init__(learning_rate)
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        # First (m) and second (v) moment estimates, allocated on the first update.
        self.m_w = None
        self.m_b = None
        self.v_w = None
        self.v_b = None
        # Number of update() calls so far; drives the bias correction.
        self.t = 0

    def update(self, weights, biases, nabla_w, nabla_b):
        """Advance the step counter and moments, then apply one in-place Adam step."""
        if self.m_w is None:
            self.m_w = [np.zeros_like(w) for w in weights]
            self.m_b = [np.zeros_like(b) for b in biases]
            self.v_w = [np.zeros_like(w) for w in weights]
            self.v_b = [np.zeros_like(b) for b in biases]

        self.t += 1
        b1, b2 = self.beta1, self.beta2
        # The correction factors depend only on t, so compute them once per call.
        fix1 = 1 - b1**self.t
        fix2 = 1 - b2**self.t

        for layer in range(len(weights)):
            self.m_w[layer] = b1 * self.m_w[layer] + (1 - b1) * nabla_w[layer]
            self.m_b[layer] = b1 * self.m_b[layer] + (1 - b1) * nabla_b[layer]
            self.v_w[layer] = b2 * self.v_w[layer] + (1 - b2) * np.square(nabla_w[layer])
            self.v_b[layer] = b2 * self.v_b[layer] + (1 - b2) * np.square(nabla_b[layer])

            m_w_hat = self.m_w[layer] / fix1
            m_b_hat = self.m_b[layer] / fix1
            v_w_hat = self.v_w[layer] / fix2
            v_b_hat = self.v_b[layer] / fix2

            weights[layer] -= self.learning_rate * m_w_hat / (np.sqrt(v_w_hat) + self.epsilon)
            biases[layer] -= self.learning_rate * m_b_hat / (np.sqrt(v_b_hat) + self.epsilon)

class Nadam(Optimizer):
    """Adam whose step mixes the corrected first moment with the current gradient."""

    def __init__(self, learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-8):
        super().__init__(learning_rate)
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        # First (m) and second (v) moment estimates, allocated on the first update.
        self.m_w = None
        self.m_b = None
        self.v_w = None
        self.v_b = None
        # Number of update() calls so far; drives the bias correction.
        self.t = 0

    def update(self, weights, biases, nabla_w, nabla_b):
        """Advance the step counter and moments, then apply one in-place Nadam step."""
        if self.m_w is None:
            self.m_w = [np.zeros_like(w) for w in weights]
            self.m_b = [np.zeros_like(b) for b in biases]
            self.v_w = [np.zeros_like(w) for w in weights]
            self.v_b = [np.zeros_like(b) for b in biases]

        self.t += 1
        b1, b2 = self.beta1, self.beta2
        # The correction factors depend only on t, so compute them once per call.
        fix1 = 1 - b1**self.t
        fix2 = 1 - b2**self.t

        for layer in range(len(weights)):
            self.m_w[layer] = b1 * self.m_w[layer] + (1 - b1) * nabla_w[layer]
            self.m_b[layer] = b1 * self.m_b[layer] + (1 - b1) * nabla_b[layer]
            self.v_w[layer] = b2 * self.v_w[layer] + (1 - b2) * np.square(nabla_w[layer])
            self.v_b[layer] = b2 * self.v_b[layer] + (1 - b2) * np.square(nabla_b[layer])

            m_w_hat = self.m_w[layer] / fix1
            m_b_hat = self.m_b[layer] / fix1
            v_w_hat = self.v_w[layer] / fix2
            v_b_hat = self.v_b[layer] / fix2

            # Look-ahead numerator: corrected momentum plus the corrected current gradient.
            m_w_hat_next = m_w_hat * b1 + (1 - b1) * nabla_w[layer] / fix1
            m_b_hat_next = m_b_hat * b1 + (1 - b1) * nabla_b[layer] / fix1

            weights[layer] -= self.learning_rate * m_w_hat_next / (np.sqrt(v_w_hat) + self.epsilon)
            biases[layer] -= self.learning_rate * m_b_hat_next / (np.sqrt(v_b_hat) + self.epsilon)

# Smoke test: a 2-3-1 network trained with Adam on random data.
# Samples are columns, so X is (2 features, 100 samples) and y is (1 output, 100 samples).
nn = NeuralNetwork(
    [2, 3, 1],
    optimizer='adam',
    learning_rate=0.01,
    beta1=0.9,
    beta2=0.999,
)
X = np.random.randn(2, 100)
y = np.random.randn(1, 100)
nn.train(X, y, epochs=10, batch_size=32)


Q4:


In [9]:
# Cell 2

def load_data(val_split=0.1):
    """
    Load Fashion-MNIST, flatten and normalise the images, and carve a validation
    set off the front of the training data.

    The first ``int(val_split * n_train)`` training rows become the validation
    split; the remaining rows are the training split.

    Returns: (X_train, y_train), (X_val, y_val), (X_test, y_test)
    """
    def flatten_and_scale(images):
        # One 784-long float32 row per image, pixel values divided by 255.
        return images.reshape(-1, 784).astype(np.float32) / 255.0

    (train_images, train_labels), (test_images, y_test) = fashion_mnist.load_data()
    X_full = flatten_and_scale(train_images)
    X_test = flatten_and_scale(test_images)

    n_val = int(val_split * len(X_full))
    validation = (X_full[:n_val], train_labels[:n_val])
    training = (X_full[n_val:], train_labels[n_val:])

    return training, validation, (X_test, y_test)


def one_hot_encode(labels, num_classes=10):
    """
    Turn a 1D array of integer class labels into a (len(labels), num_classes)
    float array with a single 1.0 per row.
    """
    n_rows = labels.shape[0]
    encoded = np.zeros((n_rows, num_classes))
    row_index = np.arange(n_rows)
    # Row r gets its 1.0 in the column named by labels[r].
    encoded[row_index, labels] = 1.0
    return encoded


In [10]:
# Cell 3

def relu(Z):
    """Rectified linear unit: element-wise maximum of 0 and Z."""
    rectified = np.maximum(0, Z)
    return rectified

def relu_derivative(Z):
    """Slope of ReLU at pre-activation Z: 1 where Z > 0, else 0, in Z's dtype."""
    is_active = np.greater(Z, 0)
    return is_active.astype(Z.dtype)

def sigmoid(Z):
    """Logistic function 1 / (1 + exp(-Z)), applied element-wise."""
    denominator = 1 + np.exp(-Z)
    return 1 / denominator

def sigmoid_derivative(A):
    """
    Slope of the sigmoid expressed through its OUTPUT: for A = sigmoid(Z),
    dA/dZ = A * (1 - A). Pass activations, not pre-activations.
    """
    complement = 1 - A
    return A * complement

def tanh(Z):
    """Hyperbolic tangent, applied element-wise (delegates to numpy)."""
    squashed = np.tanh(Z)
    return squashed

def tanh_derivative(A):
    """
    Slope of tanh expressed through its OUTPUT: for A = tanh(Z),
    dA/dZ = 1 - A^2. Pass activations, not pre-activations.
    """
    squared = A * A
    return 1 - squared

def softmax(Z):
    """
    Row-wise softmax. Each row's maximum is subtracted before exponentiating,
    so large logits do not overflow.
    """
    row_max = np.max(Z, axis=1, keepdims=True)
    exponentials = np.exp(Z - row_max)
    row_totals = np.sum(exponentials, axis=1, keepdims=True)
    return exponentials / row_totals

def cross_entropy_loss(probs, one_hot_labels):
    """
    Mean cross-entropy over the batch. ``probs`` and ``one_hot_labels`` are both
    (samples, classes); 1e-9 is added inside the log so a zero probability
    yields a large finite loss instead of infinity.
    """
    eps = 1e-9
    neg_log = -np.log(probs + eps)
    per_sample = np.sum(neg_log * one_hot_labels, axis=1)
    return np.mean(per_sample)

def accuracy(probs, one_hot_labels):
    """Fraction of rows whose highest-probability class equals the one-hot target's class."""
    predicted = np.argmax(probs, axis=1)
    expected = np.argmax(one_hot_labels, axis=1)
    hits = predicted == expected
    return np.mean(hits)


def xavier_init(in_dim, out_dim):
    """
    Xavier/Glorot uniform initialisation: an (in_dim, out_dim) matrix drawn from
    U(-limit, limit) with limit = sqrt(6 / (in_dim + out_dim)).
    """
    fan_sum = in_dim + out_dim
    limit = np.sqrt(6.0 / fan_sum)
    return np.random.uniform(low=-limit, high=limit, size=(in_dim, out_dim))

def random_init(in_dim, out_dim):
    """Small-Gaussian initialisation: standard-normal draws scaled by 0.01."""
    scale = 0.01
    return scale * np.random.randn(in_dim, out_dim)


In [11]:
# Cell 4

def initialize_parameters(input_dim, num_hidden_layers, hidden_size, output_dim,
                          weight_init="random"):
    """
    Build the parameter dict W1, b1, W2, b2, ..., WL, bL with
    L = num_hidden_layers + 1 (the last layer is the output layer).

    Weights use Xavier-uniform when weight_init == "xavier"; any other value
    selects the small random-normal scheme. Biases start at zero with shape (1, fan_out).
    """
    make_weights = xavier_init if weight_init == "xavier" else random_init
    params = {}

    def add_layer(index, fan_in, fan_out):
        # The weight is drawn before the bias is created, layer by layer.
        params[f"W{index}"] = make_weights(fan_in, fan_out)
        params[f"b{index}"] = np.zeros((1, fan_out))

    fan_in = input_dim
    for index in range(1, num_hidden_layers + 1):
        add_layer(index, fan_in, hidden_size)
        fan_in = hidden_size

    # Output layer
    add_layer(num_hidden_layers + 1, fan_in, output_dim)

    return params


def forward_pass(X, params, num_hidden_layers, activation):
    """
    Run X through the network.

    Hidden layers use ``activation`` ("relu" or "sigmoid"; any other value
    selects tanh). The final layer is a softmax.

    Returns (probs, cache), where cache maps "Z{i}" / "A{i}" to each layer's
    pre-activation and output for i = 1..L.
    """
    hidden_fn = {"relu": relu, "sigmoid": sigmoid}.get(activation, tanh)
    L = num_hidden_layers + 1  # index of the output layer
    cache = {}
    A = X

    # Hidden layers 1 .. L-1
    for i in range(1, L):
        Z = A @ params[f"W{i}"] + params[f"b{i}"]
        A = hidden_fn(Z)
        cache[f"Z{i}"] = Z
        cache[f"A{i}"] = A

    # Output layer => softmax
    Z = A @ params[f"W{L}"] + params[f"b{L}"]
    probs = softmax(Z)
    cache[f"Z{L}"] = Z
    cache[f"A{L}"] = probs

    return probs, cache


def backward_pass(X, y, params, cache, num_hidden_layers, activation):
    """
    Backpropagate through the network and return batch-averaged gradients.

      X: input batch, one sample per row
      y: one-hot labels
      cache: the "Z{i}" / "A{i}" values produced by the forward pass
      activation: "relu" or "sigmoid"; any other value is treated as tanh

    Returns a dict with "dW{i}" and "db{i}" for every layer i = 1..L.
    """
    L = num_hidden_layers + 1
    m = X.shape[0]  # batch size
    grads = {}

    def layer_input(i):
        # Layer 1 is fed by the raw batch; every later layer by the previous activation.
        return X if i == 1 else cache[f"A{i-1}"]

    def record(i, dZ):
        grads[f"dW{i}"] = (layer_input(i).T @ dZ) / m
        grads[f"db{i}"] = np.sum(dZ, axis=0, keepdims=True) / m

    # Output layer: the error w.r.t. Z is taken as (probs - y).
    dZ = cache[f"A{L}"] - y
    record(L, dZ)

    # Hidden layers, from the last one back to the first.
    for i in range(L - 1, 0, -1):
        dA = dZ @ params[f"W{i+1}"].T
        if activation == "relu":
            slope = relu_derivative(cache[f"Z{i}"])
        elif activation == "sigmoid":
            slope = sigmoid_derivative(cache[f"A{i}"])
        else:  # "tanh"
            slope = tanh_derivative(cache[f"A{i}"])
        dZ = dA * slope
        record(i, dZ)

    return grads


def update_parameters(params, grads, learning_rate, weight_decay=0.0):
    """
    One in-place gradient-descent step over every entry of ``params``.

    Keys starting with "W" also receive an L2 penalty (weight_decay * W) added
    to their gradient. Keys starting with "b" use the plain gradient. Other
    keys are left untouched. The same dict is returned.
    """
    for key in params:
        if not key.startswith(("W", "b")):
            continue
        step = grads[f"d{key}"]
        if key.startswith("W"):
            # L2 penalty applies to weights only.
            step = step + weight_decay * params[key]
        params[key] -= learning_rate * step
    return params


def train_one_epoch(X_train, y_train, params, num_hidden_layers, activation,
                    batch_size, learning_rate, weight_decay=0.0):
    """
    Run one epoch of mini-batch SGD over a freshly shuffled copy of the data.

    Loss and accuracy are accumulated batch by batch, i.e. measured with the
    parameters as they were before each batch's update.

    Returns (train_loss, train_acc), both averaged over all N samples.
    """
    N = X_train.shape[0]
    order = np.random.permutation(N)
    X_shuffled, y_shuffled = X_train[order], y_train[order]

    loss_sum = 0.0
    n_correct = 0

    for start in range(0, N, batch_size):
        stop = start + batch_size
        X_batch, y_batch = X_shuffled[start:stop], y_shuffled[start:stop]

        # Forward pass and running metrics. The batch loss is a mean, so weight it
        # by the batch length before summing.
        probs, cache = forward_pass(X_batch, params, num_hidden_layers, activation)
        loss_sum += cross_entropy_loss(probs, y_batch) * len(X_batch)
        n_correct += np.sum(np.argmax(probs, axis=1) == np.argmax(y_batch, axis=1))

        # Backward pass and parameter step.
        grads = backward_pass(X_batch, y_batch, params, cache, num_hidden_layers, activation)
        params = update_parameters(params, grads, learning_rate, weight_decay)

    return loss_sum / N, n_correct / N


In [12]:
# Cell 5

def train():
    """
    Sweep entry point: start a W&B run, train a network with the run's
    hyperparameters, log per-epoch train/validation metrics and the final test
    metrics, then finish the run.
    """
    # Start a new W&B run; the sweep's hyperparameters arrive through wandb.config.
    wandb.init()
    config = wandb.config

    # Data splits and their one-hot targets.
    (X_train, y_train), (X_val, y_val), (X_test, y_test) = load_data(val_split=0.1)
    y_train_oh, y_val_oh, y_test_oh = (
        one_hot_encode(labels) for labels in (y_train, y_val, y_test)
    )

    # A readable run name built from the main hyperparameters.
    wandb.run.name = f"hl_{config.num_hidden_layers}_bs_{config.batch_size}_ac_{config.activation}"

    # 784 input features, 10 output classes.
    params = initialize_parameters(
        784,
        num_hidden_layers=config.num_hidden_layers,
        hidden_size=config.hidden_layer_size,
        output_dim=10,
        weight_init=config.weight_init
    )

    def evaluate(X, y_oh):
        # Loss and accuracy of the current parameters on one split.
        probs, _ = forward_pass(X, params, config.num_hidden_layers, config.activation)
        return cross_entropy_loss(probs, y_oh), accuracy(probs, y_oh)

    for epoch in range(config.epochs):
        train_loss, train_acc = train_one_epoch(
            X_train, y_train_oh,
            params,
            num_hidden_layers=config.num_hidden_layers,
            activation=config.activation,
            batch_size=config.batch_size,
            learning_rate=config.learning_rate,
            weight_decay=config.weight_decay
        )
        val_loss, val_acc = evaluate(X_val, y_val_oh)

        wandb.log({
            "epoch": epoch+1,
            "train_loss": train_loss,
            "train_acc": train_acc,
            "val_loss": val_loss,
            "val_acc": val_acc
        })

    # The test split is evaluated once, after training.
    test_loss, test_acc = evaluate(X_test, y_test_oh)
    wandb.log({
        "test_loss": test_loss,
        "test_acc": test_acc
    })

    wandb.finish()


In [13]:
# Cell 6

# W&B sweep definition: random search that maximises validation accuracy.
# Every hyperparameter is a discrete choice, so each entry expands to {"values": [...]}.
sweep_config = {
    "name": "fashion_mnist_sweep_example",  # or any descriptive name
    "method": "random",  # can also be "grid" or "bayes"
    "metric": {"name": "val_acc", "goal": "maximize"},
    "parameters": {
        hyperparameter: {"values": choices}
        for hyperparameter, choices in (
            ("epochs", [5, 10]),
            ("num_hidden_layers", [3, 4, 5]),
            ("hidden_layer_size", [32, 64, 128]),
            ("weight_decay", [0.0, 0.0005, 0.5]),
            ("learning_rate", [1e-3, 1e-4]),
            ("batch_size", [16, 32, 64]),
            ("weight_init", ["random", "xavier"]),
            ("activation", ["sigmoid", "tanh", "relu"]),
            ("optimizer", ["sgd", "momentum", "rmsprop", "adam"]),
        )
    },
}


In [14]:
# New Code: Optimizer Classes

class SGD:
    """
    Dict-based vanilla gradient descent.

    ``params`` maps names such as "W1"/"b1" to arrays. The gradient for a
    parameter is read from ``grads[name]`` or, failing that, from
    ``grads["d" + name]`` (the naming used by this notebook's backward pass).
    Parameters with no matching gradient are left untouched.

    NOTE(review): an earlier cell defines a list-based ``SGD`` with a different
    ``update`` signature; running this cell replaces that name.
    """

    def __init__(self, lr):
        self.lr = lr

    def update(self, params, grads, t=None):
        """Apply one in-place step and return ``params``. ``t`` is accepted but unused."""
        for key in params:
            # Accept both same-name and "d"-prefixed gradient keys.
            grad = grads.get(key, grads.get(f"d{key}"))
            if grad is None:
                continue
            params[key] -= self.lr * grad
        return params

class Momentum:
    """
    Dict-based gradient descent with momentum.

    The gradient for a parameter is read from ``grads[name]`` or, failing that,
    from ``grads["d" + name]`` (the naming used by this notebook's backward
    pass). Parameters with no matching gradient are left untouched.

    NOTE(review): an earlier cell defines a list-based ``Momentum`` with a
    different ``update`` signature; running this cell replaces that name.
    """

    def __init__(self, lr, momentum=0.9):
        self.lr = lr
        self.momentum = momentum
        # Per-parameter velocity, created lazily on first use.
        self.v = {}

    def update(self, params, grads, t=None):
        """Apply one in-place step and return ``params``. ``t`` is accepted but unused."""
        for key in params:
            # Accept both same-name and "d"-prefixed gradient keys.
            grad = grads.get(key, grads.get(f"d{key}"))
            if grad is None:
                continue
            if key not in self.v:
                self.v[key] = np.zeros_like(params[key])
            # The velocity accumulates scaled gradients and is subtracted from the parameter.
            self.v[key] = self.momentum * self.v[key] + self.lr * grad
            params[key] -= self.v[key]
        return params

class RMSprop:
    """
    Dict-based RMSprop: each step is divided by a running RMS of recent gradients.

    The gradient for a parameter is read from ``grads[name]`` or, failing that,
    from ``grads["d" + name]`` (the naming used by this notebook's backward
    pass). Parameters with no matching gradient are left untouched.

    NOTE(review): an earlier cell defines a list-based ``RMSprop`` with a
    different ``update`` signature; running this cell replaces that name.
    """

    def __init__(self, lr, decay_rate=0.99, epsilon=1e-8):
        self.lr = lr
        self.decay_rate = decay_rate
        self.epsilon = epsilon
        # Per-parameter running average of squared gradients, created lazily.
        self.cache = {}

    def update(self, params, grads, t=None):
        """Apply one in-place step and return ``params``. ``t`` is accepted but unused."""
        for key in params:
            # Accept both same-name and "d"-prefixed gradient keys.
            grad = grads.get(key, grads.get(f"d{key}"))
            if grad is None:
                continue
            if key not in self.cache:
                self.cache[key] = np.zeros_like(params[key])
            self.cache[key] = self.decay_rate * self.cache[key] + (1 - self.decay_rate) * (grad ** 2)
            params[key] -= self.lr * grad / (np.sqrt(self.cache[key]) + self.epsilon)
        return params

class Adam:
    """
    Dict-based Adam.

    The gradient for a parameter is read from ``grads[name]`` or, failing that,
    from ``grads["d" + name]`` (the naming used by this notebook's backward
    pass). Parameters with no matching gradient are left untouched.

    NOTE(review): an earlier cell defines a list-based ``Adam`` with a
    different ``update`` signature; running this cell replaces that name.
    """

    def __init__(self, lr, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        # First (m) and second (v) moment estimates per parameter, created lazily.
        self.m = {}
        self.v = {}
        # Internal step counter, used when the caller does not pass ``t``.
        self.t = 0

    def update(self, params, grads, t=None):
        """
        Apply one in-place Adam step and return ``params``.

        :param t: 1-based step number for bias correction. When omitted, an
                  internal counter is advanced and used, which matches the
                  optional ``t`` of the sibling optimizers.
        :raises ValueError: if ``t`` is given and is less than 1 (the bias
                            correction would divide by zero)
        """
        if t is None:
            self.t += 1
            t = self.t
        elif t < 1:
            raise ValueError("t must be >= 1 for Adam bias correction")

        for key in params:
            # Accept both same-name and "d"-prefixed gradient keys.
            grad = grads.get(key, grads.get(f"d{key}"))
            if grad is None:
                continue
            if key not in self.m:
                self.m[key] = np.zeros_like(params[key])
            if key not in self.v:
                self.v[key] = np.zeros_like(params[key])
            self.m[key] = self.beta1 * self.m[key] + (1 - self.beta1) * grad
            self.v[key] = self.beta2 * self.v[key] + (1 - self.beta2) * (grad ** 2)
            m_hat = self.m[key] / (1 - self.beta1 ** t)
            v_hat = self.v[key] / (1 - self.beta2 ** t)
            params[key] -= self.lr * m_hat / (np.sqrt(v_hat) + self.epsilon)
        return params


In [15]:
# Updated train_one_epoch() function

def train_one_epoch(X_train, y_train, params, num_hidden_layers, activation,
                    batch_size, learning_rate, weight_decay=0.0, optimizer="sgd",
                    beta1=0.9, beta2=0.999, epsilon=1e-8, momentum=0.9, decay_rate=0.99,
                    opt_state=None):

    """
    Trains for 1 epoch using mini-batch gradient descent.
    Supports multiple optimizers: SGD, Momentum, Nesterov, RMSprop, Adam, Nadam.

    The selected optimizer is applied to weights ("W*") and biases ("b*") alike.
    L2 weight decay is added to the gradient of weights only, for every
    optimizer. Parameters are updated in place.

    Args:
        X_train, y_train: Training data & labels (one-hot encoded).
        params: Model parameters.
        num_hidden_layers: Number of hidden layers.
        activation: Activation function ("relu", "sigmoid", "tanh").
        batch_size: Mini-batch size.
        learning_rate: Learning rate for optimization.
        weight_decay: L2 regularization coefficient (applied to weights only).
        optimizer: String defining optimizer type ("sgd", "momentum", "nesterov", "rmsprop", "adam", "nadam").
        beta1, beta2: Adam/Nadam exponential decay rates.
        epsilon: Small constant for numerical stability (RMSprop/Adam/Nadam).
        momentum: Momentum factor for momentum-based optimizers.
        decay_rate: Decay factor for RMSprop.
        opt_state: Optional dict that carries optimizer state ("v", "s", "t")
            between calls. Pass the same dict on every epoch to keep momentum
            and Adam statistics alive across epochs. When omitted, the state
            starts from zero on every call.

    Returns:
        train_loss, train_acc: Loss & accuracy for this epoch.

    Raises:
        ValueError: if ``optimizer`` is not one of the supported names.
    """
    supported = ("sgd", "momentum", "nesterov", "rmsprop", "adam", "nadam")
    if optimizer not in supported:
        # Fail loudly: an unknown name would otherwise train nothing but the biases.
        raise ValueError(f"Unknown optimizer {optimizer!r}; expected one of {supported}")

    N = X_train.shape[0]
    permutation = np.random.permutation(N)
    X_train = X_train[permutation]
    y_train = y_train[permutation]

    total_loss = 0.0
    total_correct = 0

    # Optimizer state: v = velocity / first moment, s = second moment, t = step count.
    state = {} if opt_state is None else opt_state
    v = state.setdefault("v", {})
    s = state.setdefault("s", {})
    state.setdefault("t", 0)
    for key in params:
        if key not in v:
            v[key] = np.zeros_like(params[key])
        if key not in s:
            s[key] = np.zeros_like(params[key])

    for i in range(0, N, batch_size):
        X_batch = X_train[i:i+batch_size]
        y_batch = y_train[i:i+batch_size]

        # Forward Pass
        probs, cache = forward_pass(X_batch, params, num_hidden_layers, activation)
        loss = cross_entropy_loss(probs, y_batch)
        total_loss += loss * len(X_batch)

        # Accuracy
        pred_labels = np.argmax(probs, axis=1)
        true_labels = np.argmax(y_batch, axis=1)
        total_correct += np.sum(pred_labels == true_labels)

        # Backward Pass
        grads = backward_pass(X_batch, y_batch, params, cache, num_hidden_layers, activation)

        # One optimizer step; t drives the Adam/Nadam bias correction.
        state["t"] += 1
        t = state["t"]

        for key in params:
            is_weight = key.startswith("W")
            if not (is_weight or key.startswith("b")):
                continue

            grad = grads[f"d{key}"]
            if is_weight:
                # L2 penalty on weights only; biases are never regularized.
                grad = grad + weight_decay * params[key]

            if optimizer == "sgd":
                params[key] -= learning_rate * grad

            elif optimizer == "momentum":
                v[key] = momentum * v[key] - learning_rate * grad
                params[key] += v[key]

            elif optimizer == "nesterov":
                # Look-ahead form: combine the previous and the new velocity.
                v_prev = v[key]
                v[key] = momentum * v_prev - learning_rate * grad
                params[key] += -momentum * v_prev + (1 + momentum) * v[key]

            elif optimizer == "rmsprop":
                s[key] = decay_rate * s[key] + (1 - decay_rate) * (grad ** 2)
                params[key] -= learning_rate * grad / (np.sqrt(s[key]) + epsilon)

            else:  # "adam" or "nadam"
                v[key] = beta1 * v[key] + (1 - beta1) * grad
                s[key] = beta2 * s[key] + (1 - beta2) * (grad ** 2)
                v_corrected = v[key] / (1 - beta1 ** t)
                s_corrected = s[key] / (1 - beta2 ** t)
                if optimizer == "nadam":
                    # Nadam mixes the corrected momentum with the current gradient.
                    step = beta1 * v_corrected + (1 - beta1) * grad / (1 - beta1 ** t)
                else:
                    step = v_corrected
                params[key] -= learning_rate * step / (np.sqrt(s_corrected) + epsilon)

    # Compute average metrics
    avg_loss = total_loss / N
    avg_acc  = total_correct / N
    return avg_loss, avg_acc


In [16]:
# Cell 5

def train():
    """
    Sweep entry point: train one network with the hyperparameters of the current
    W&B run and log train/validation metrics per epoch plus final test metrics.

    The run is opened as a context manager, so it is closed when training
    completes and also when an exception escapes, instead of being left open.
    """
    # Start a new W&B run; leaving the with-block finishes it.
    with wandb.init() as run:
        config = wandb.config

        # Load data
        (X_train, y_train), (X_val, y_val), (X_test, y_test) = load_data(val_split=0.1)
        # One-hot
        y_train_oh = one_hot_encode(y_train)
        y_val_oh   = one_hot_encode(y_val)
        y_test_oh  = one_hot_encode(y_test)

        # Create run name for clarity
        run_name = f"hl_{config.num_hidden_layers}_bs_{config.batch_size}_ac_{config.activation}"
        run.name = run_name

        # Initialize parameters
        input_dim  = 784
        output_dim = 10
        params = initialize_parameters(
            input_dim,
            num_hidden_layers=config.num_hidden_layers,
            hidden_size=config.hidden_layer_size,
            output_dim=output_dim,
            weight_init=config.weight_init
        )

        # Training loop
        for epoch in range(config.epochs):
            train_loss, train_acc = train_one_epoch(
                X_train, y_train_oh,
                params,
                num_hidden_layers=config.num_hidden_layers,
                activation=config.activation,
                batch_size=config.batch_size,
                learning_rate=config.learning_rate,
                weight_decay=config.weight_decay,
                optimizer=config.optimizer,
                beta1=0.9, beta2=0.999, epsilon=1e-8, momentum=0.9, decay_rate=0.99
            )

            # Validation metrics
            probs_val, _ = forward_pass(X_val, params, config.num_hidden_layers, config.activation)
            val_loss = cross_entropy_loss(probs_val, y_val_oh)
            val_acc  = accuracy(probs_val, y_val_oh)

            # Log to W&B
            wandb.log({
                "epoch": epoch+1,
                "train_loss": train_loss,
                "train_acc": train_acc,
                "val_loss": val_loss,
                "val_acc": val_acc
            })

        # Evaluate on test set once
        probs_test, _ = forward_pass(X_test, params, config.num_hidden_layers, config.activation)
        test_loss = cross_entropy_loss(probs_test, y_test_oh)
        test_acc  = accuracy(probs_test, y_test_oh)
        wandb.log({
            "test_loss": test_loss,
            "test_acc": test_acc
        })


In [20]:
# Cell 7

# Authentication is assumed to have been done already in this session.

# Register the sweep definition under the W&B project and keep its id.
sweep_id = wandb.sweep(
    sweep_config,
    project="DLassignment1",  # set your W&B project name
)

# Launch an agent in this process: it calls train() for 10 sampled configurations.
wandb.agent(sweep_id, function=train, count=10)




Create sweep with ID: x945lubm
Sweep URL: https://wandb.ai/snehalma23m020-iit-madras/DLassignment1/sweeps/x945lubm


[34m[1mwandb[0m: Agent Starting Run: d3bw4wcy with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_layer_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_hidden_layers: 4
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: xavier
Exception in thread Thread-145 (_run_job):
Traceback (most recent call last):
  File "/home/snehal/Downloads/dl_ass1/dvenv/lib/python3.12/site-packages/wandb/agents/pyagent.py", line 306, in _run_job
    self._function()
  File "/tmp/ipykernel_212809/1493681935.py", line 5, in train
  File "/home/snehal/Downloads/dl_ass1/dvenv/lib/python3.12/site-packages/wandb/sdk/wandb_init.py", line 1485, in init
    wandb._sentry.reraise(e)
  File "/home/snehal/Downloads/dl_ass1/dvenv/lib/python3.12/site-packages/wandb/analytics/sentry.py", line 156, in reraise
    raise exc.with_traceback(sy