<a href="https://colab.research.google.com/github/manglesh001/DL-assigment1/blob/main/DL_Ass1_Q10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from tensorflow.keras.datasets import mnist
import wandb

In [5]:


# Initialize Wandb
wandb.init(project="mnist", config={})

# Activation functions and derivatives
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoid_derivative(x):
    return x * (1 - x)

def tanh(x):
    return np.tanh(x)

def tanh_derivative(x):
    return 1 - np.tanh(x) ** 2

# Activation functions and derivatives
def relu(x):
    return np.maximum(0, x)

def relu_derivative(x):
    return (x > 0).astype(float)

In [6]:

# Weight Initialization
def initialize_weights(layers, method="xavier"):
    weights = []
    biases = []
    for i in range(len(layers) - 1):
        if method == "xavier":
            weights.append(np.random.randn(layers[i], layers[i+1]) * np.sqrt(1 / layers[i]))
        else:  # random
            weights.append(np.random.randn(layers[i], layers[i+1]) * 0.01)
        biases.append(np.zeros((1, layers[i+1])))
    return weights, biases

In [7]:


# Forward Propagation
def forward_propagation(X, weights, biases, activation):
    activations = [X]
    for i in range(len(weights)):
        z = np.dot(activations[-1], weights[i]) + biases[i]
        if activation[i] == "sigmoid":
            activations.append(sigmoid(z))
        elif activation[i] == "tanh":
            activations.append(tanh(z))
        elif activation[i] == "relu":
            activations.append(relu(z))
    return activations


In [8]:


# Backpropagation
def backpropagation(y, activations, weights, activation):
    gradients_w = [None] * len(weights)
    gradients_b = [None] * len(weights)
    error = activations[-1] - y

    for i in reversed(range(len(weights))):
        if activation[i] == "sigmoid":
            delta = error * sigmoid_derivative(activations[i+1])
        elif activation[i] == "tanh":
            delta = error * tanh_derivative(activations[i+1])
        elif activation[i] == "relu":
            delta = error * relu_derivative(activations[i+1])
        gradients_w[i] = np.dot(activations[i].T, delta)
        gradients_b[i] = np.sum(delta, axis=0, keepdims=True)
        error = np.dot(delta, weights[i].T)

    return gradients_w, gradients_b

In [9]:
# RMSprop Optimizer
def rmsprop(weights, biases, gradients_w, gradients_b, lr, cache_w, cache_b, beta=0.99, epsilon=1e-8):
    for i in range(len(weights)):
        cache_w[i] = beta * cache_w[i] + (1 - beta) * (gradients_w[i] ** 2)
        weights[i] -= lr * gradients_w[i] / (np.sqrt(cache_w[i]) + epsilon)
        cache_b[i] = beta * cache_b[i] + (1 - beta) * (gradients_b[i] ** 2)
        biases[i] -= lr * gradients_b[i] / (np.sqrt(cache_b[i]) + epsilon)
    return weights, biases, cache_w, cache_b



In [10]:
# Adam Optimizer
def adam(weights, biases, gradients_w, gradients_b, lr, m_w, v_w, m_b, v_b, beta1=0.9, beta2=0.999, epsilon=1e-8, t=1):
    for i in range(len(weights)):
        # Update momentum and velocity for weights
        m_w[i] = beta1 * m_w[i] + (1 - beta1) * gradients_w[i]
        v_w[i] = beta2 * v_w[i] + (1 - beta2) * (gradients_w[i] ** 2)
        m_w_hat = m_w[i] / (1 - beta1 ** t)
        v_w_hat = v_w[i] / (1 - beta2 ** t)
        weights[i] -= lr * m_w_hat / (np.sqrt(v_w_hat) + epsilon)

        # Update momentum and velocity for biases
        m_b[i] = beta1 * m_b[i] + (1 - beta1) * gradients_b[i]
        v_b[i] = beta2 * v_b[i] + (1 - beta2) * (gradients_b[i] ** 2)
        m_b_hat = m_b[i] / (1 - beta1 ** t)
        v_b_hat = v_b[i] / (1 - beta2 ** t)
        biases[i] -= lr * m_b_hat / (np.sqrt(v_b_hat) + epsilon)

    return weights, biases, m_w, v_w, m_b, v_b

In [11]:


# Load MNIST dataset
(X_train, y_train), (X_test, y_test) = mnist.load_data()
X_train = X_train.reshape(X_train.shape[0], -1) / 255.0
X_test = X_test.reshape(X_test.shape[0], -1) / 255.0

# One-hot encode labels
y_train = np.eye(10)[y_train]
y_test_original = y_test  # Keep original labels for confusion matrix

In [12]:






# Define configurations
configurations = [
    {
        'hidden_layers': 3,
        'hidden_size': 128,
        'activation': 'relu',
        'weight_init': 'xavier',
        'optimizer': 'rmsprop',
        'batch_size': 16,
        'epochs': 5,
        'learning_rate': 0.001,
        'weight_decay': 0
    },
    {
        'hidden_layers': 5,
        'hidden_size': 128,
        'activation': 'relu',
        'weight_init': 'xavier',
        'optimizer': 'adam',
        'batch_size': 32,
        'epochs': 5,
        'learning_rate': 0.001,
        'weight_decay': 0.5
    },
    {
        'hidden_layers': 4,
        'hidden_size': 128,
        'activation': 'tanh',
        'weight_init': 'xavier',
        'optimizer': 'adam',
        'batch_size': 16,
        'epochs': 10,
        'learning_rate': 0.001,
        'weight_decay': 0.5
    }
]


In [2]:

def train_network(X_train, y_train, X_val, y_val, config):
    np.random.seed(42)
    layers = [X_train.shape[1]] + [config['hidden_size']] * config['hidden_layers'] + [10]
    activation = [config['activation']] * config['hidden_layers'] + ['sigmoid']

    weights, biases = initialize_weights(layers, config['weight_init'])
    cache_w = [np.zeros_like(w) for w in weights]
    cache_b = [np.zeros_like(b) for b in biases]
    m_w = [np.zeros_like(w) for w in weights]
    v_w = [np.zeros_like(w) for w in weights]
    m_b = [np.zeros_like(b) for b in biases]
    v_b = [np.zeros_like(b) for b in biases]

    batch_size = config['batch_size']
    epochs = config['epochs']
    lr = config['learning_rate']

    for epoch in range(epochs):
        indices = np.random.permutation(X_train.shape[0])
        X_train_shuffled, y_train_shuffled = X_train[indices], y_train[indices]

        train_loss = 0
        train_correct = 0
        train_total = 0

        for i in range(0, X_train_shuffled.shape[0], batch_size):
            X_batch = X_train_shuffled[i:i+batch_size]
            y_batch = y_train_shuffled[i:i+batch_size]

            # Forward propagation
            activations = forward_propagation(X_batch, weights, biases, activation)

            # Calculate training loss (cross-entropy loss)
            output = activations[-1]
            train_loss += -np.sum(y_batch * np.log(output + 1e-8)) / len(y_batch)

            # Calculate training accuracy
            train_preds = np.argmax(output, axis=1)
            train_true = np.argmax(y_batch, axis=1)
            train_correct += np.sum(train_preds == train_true)
            train_total += len(y_batch)

            # Backpropagation
            gradients_w, gradients_b = backpropagation(y_batch, activations, weights, activation)
            if config['optimizer'] == "rmsprop":
                weights, biases, cache_w, cache_b = rmsprop(weights, biases, gradients_w, gradients_b, lr, cache_w, cache_b)
            elif config['optimizer'] == "adam":
                weights, biases, m_w, v_w, m_b, v_b = adam(weights, biases, gradients_w, gradients_b, lr, m_w, v_w, m_b, v_b, t=epoch+1)

        # Calculate average training loss and accuracy for the epoch
        train_loss /= (X_train_shuffled.shape[0] // batch_size)
        train_accuracy = train_correct / train_total

        # Validation
        val_activations = forward_propagation(X_val, weights, biases, activation)
        val_loss = -np.sum(y_val * np.log(val_activations[-1] + 1e-8)) / len(y_val)
        val_preds = np.argmax(val_activations[-1], axis=1)
        val_true = np.argmax(y_val, axis=1)
        val_accuracy = np.mean(val_preds == val_true)

        # Log metrics to Wandb
        wandb.log({
            "epoch": epoch + 1,
            "train_loss": train_loss,
            "train_accuracy": train_accuracy,
            "val_loss": val_loss,
            "val_accuracy": val_accuracy
        })

        # Print metrics
        print(f"Epoch {epoch + 1}/{epochs}, "
              f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, "
              f"Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")

    return weights, biases

# Split training data into training and validation sets
X_train, X_val = X_train[:50000], X_train[50000:]
y_train, y_val = y_train[:50000], y_train[50000:]




In [3]:
# Train and evaluate each configuration
for i, config in enumerate(configurations):
    print(f"\nTraining Configuration {i + 1}: {config}")
    wandb.init(project="mnist", config=config, reinit=True)
    weights, biases = train_network(X_train, y_train, X_val, y_val, config)

    # Evaluate on the test set
    activations = forward_propagation(X_test, weights, biases, [config['activation']] * config['hidden_layers'] + ['sigmoid'])
    test_predictions = np.argmax(activations[-1], axis=1)

    # Calculate test loss (cross-entropy loss)
    test_loss = -np.sum(np.eye(10)[y_test_original] * np.log(activations[-1] + 1e-8)) / len(y_test_original)

    # Calculate test accuracy
    test_accuracy = np.mean(test_predictions == y_test_original)

    # Log test metrics to Wandb
    wandb.log({
        "test_loss": test_loss,
        "test_accuracy": test_accuracy
    })

    # Print test metrics
    print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")


Training Configuration 1: {'hidden_layers': 3, 'hidden_size': 128, 'activation': 'relu', 'weight_init': 'xavier', 'optimizer': 'rmsprop', 'batch_size': 16, 'epochs': 5, 'learning_rate': 0.001, 'weight_decay': 0}


Epoch 1/5, Train Loss: 0.2666, Train Accuracy: 0.9246, Val Loss: 0.1502, Val Accuracy: 0.9619
Epoch 2/5, Train Loss: 0.1378, Train Accuracy: 0.9635, Val Loss: 0.1341, Val Accuracy: 0.9688
Epoch 3/5, Train Loss: 0.1072, Train Accuracy: 0.9727, Val Loss: 0.1406, Val Accuracy: 0.9654
Epoch 4/5, Train Loss: 0.0973, Train Accuracy: 0.9777, Val Loss: 0.1302, Val Accuracy: 0.9724
Epoch 5/5, Train Loss: 0.0888, Train Accuracy: 0.9810, Val Loss: 0.1304, Val Accuracy: 0.9735
Test Loss: 0.1269, Test Accuracy: 0.9718

Training Configuration 2: {'hidden_layers': 5, 'hidden_size': 128, 'activation': 'relu', 'weight_init': 'xavier', 'optimizer': 'adam', 'batch_size': 32, 'epochs': 5, 'learning_rate': 0.001, 'weight_decay': 0.5}


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▆▇██
train_loss,█▃▂▁▁
val_accuracy,▁▅▃▇█
val_loss,█▂▅▁▁

0,1
epoch,5.0
test_accuracy,0.9718
test_loss,0.12689
train_accuracy,0.98104
train_loss,0.08877
val_accuracy,0.9735
val_loss,0.13038


Epoch 1/5, Train Loss: 18.2913, Train Accuracy: 0.1133, Val Loss: 18.3975, Val Accuracy: 0.1064
Epoch 2/5, Train Loss: 18.3576, Train Accuracy: 0.1136, Val Loss: 18.4087, Val Accuracy: 0.1064
Epoch 3/5, Train Loss: 18.4198, Train Accuracy: 0.1136, Val Loss: 18.4085, Val Accuracy: 0.1064
Epoch 4/5, Train Loss: 18.4196, Train Accuracy: 0.1136, Val Loss: 18.4084, Val Accuracy: 0.1064
Epoch 5/5, Train Loss: 18.4194, Train Accuracy: 0.1136, Val Loss: 18.4082, Val Accuracy: 0.1064
Test Loss: 18.4066, Test Accuracy: 0.1135

Training Configuration 3: {'hidden_layers': 4, 'hidden_size': 128, 'activation': 'tanh', 'weight_init': 'xavier', 'optimizer': 'adam', 'batch_size': 16, 'epochs': 10, 'learning_rate': 0.001, 'weight_decay': 0.5}


0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁████
train_loss,▁▅███
val_accuracy,▁▁▁▁▁
val_loss,▁████

0,1
epoch,5.0
test_accuracy,0.1135
test_loss,18.40665
train_accuracy,0.11356
train_loss,18.41942
val_accuracy,0.1064
val_loss,18.40823


Epoch 1/10, Train Loss: 0.3392, Train Accuracy: 0.9031, Val Loss: 0.2298, Val Accuracy: 0.9451
Epoch 2/10, Train Loss: 0.2008, Train Accuracy: 0.9460, Val Loss: 0.1644, Val Accuracy: 0.9551
Epoch 3/10, Train Loss: 0.1661, Train Accuracy: 0.9562, Val Loss: 0.1488, Val Accuracy: 0.9598
Epoch 4/10, Train Loss: 0.1431, Train Accuracy: 0.9630, Val Loss: 0.1349, Val Accuracy: 0.9625
Epoch 5/10, Train Loss: 0.1282, Train Accuracy: 0.9674, Val Loss: 0.1314, Val Accuracy: 0.9642
Epoch 6/10, Train Loss: 0.1159, Train Accuracy: 0.9713, Val Loss: 0.1370, Val Accuracy: 0.9621
Epoch 7/10, Train Loss: 0.1060, Train Accuracy: 0.9740, Val Loss: 0.1160, Val Accuracy: 0.9687
Epoch 8/10, Train Loss: 0.0991, Train Accuracy: 0.9753, Val Loss: 0.1097, Val Accuracy: 0.9696
Epoch 9/10, Train Loss: 0.0924, Train Accuracy: 0.9779, Val Loss: 0.1132, Val Accuracy: 0.9694
Epoch 10/10, Train Loss: 0.0867, Train Accuracy: 0.9793, Val Loss: 0.1164, Val Accuracy: 0.9691
Test Loss: 0.1225, Test Accuracy: 0.9650
