<a href="https://colab.research.google.com/github/manglesh001/DL-assigment1/blob/main/DL_Assi1_Q7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from tensorflow.keras.datasets import fashion_mnist
import wandb

In [23]:

# Hyperparameters of the best model; handed to wandb.init and train_network below
best_config = dict(
    hidden_layers=3,       # number of hidden layers
    hidden_size=128,       # units in each hidden layer
    activation='relu',     # activation name used for every hidden layer
    weight_init='xavier',  # initialisation method name passed to initialize_weights
    optimizer='rmsprop',
    batch_size=16,
    epochs=5,
    learning_rate=0.001,
    weight_decay=0,
)

In [24]:

# Start the Weights & Biases run, recording the hyperparameters as the run config
wandb.init(
    project="fashion-mnist",
    config=best_config,
)

# Activation functions and derivatives
def relu(x):
    """Rectified linear unit: the element-wise maximum of 0 and ``x``."""
    floor = 0
    return np.maximum(floor, x)

def relu_derivative(x):
    """Return a float array holding 1.0 where ``x`` is strictly positive and 0.0 elsewhere."""
    positive_mask = x > 0
    return positive_mask.astype(float)

def sigmoid(x):
    """Logistic sigmoid 1 / (1 + exp(-x)), evaluated without floating-point overflow.

    Only exp(-|x|) is ever computed, so the exponential stays in (0, 1] and
    large-magnitude negative inputs no longer overflow (the direct formula
    evaluates exp(-x), which overflows to inf and emits a RuntimeWarning).

    Args:
        x: scalar or array-like of real values.

    Returns:
        The sigmoid of ``x``: an array of the same shape for array input, a
        NumPy scalar for scalar input.
    """
    x = np.asarray(x)
    z = np.exp(-np.abs(x))  # always in (0, 1]
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) -- the same function, rearranged.
    out = np.where(x >= 0, 1.0 / (1.0 + z), z / (1.0 + z))
    # Indexing with () turns a 0-d result back into a scalar and leaves arrays untouched.
    return out[()]

def sigmoid_derivative(x):
    """Sigmoid gradient written in terms of the sigmoid's own output.

    ``x`` is treated as an already-computed sigmoid value s (backpropagation
    passes the layer activation here), and the result is s * (1 - s).
    """
    complement = 1 - x
    return x * complement

In [25]:
# Weight Initialization
def initialize_weights(layers, method="xavier"):
    """Build one weight matrix and one zero bias row per pair of consecutive layers.

    Args:
        layers: sequence of layer widths, input width first and output width last.
        method: "xavier" scales standard-normal samples by sqrt(1 / fan_in);
            any other value scales them by 0.01.

    Returns:
        (weights, biases): ``weights[i]`` has shape (layers[i], layers[i+1]) and
        ``biases[i]`` is a zero array of shape (1, layers[i+1]).
    """
    weights, biases = [], []
    for fan_in, fan_out in zip(layers[:-1], layers[1:]):
        scale = np.sqrt(1 / fan_in) if method == "xavier" else 0.01
        # Samples come from NumPy's global generator, one draw per layer in order.
        weights.append(np.random.randn(fan_in, fan_out) * scale)
        biases.append(np.zeros((1, fan_out)))
    return weights, biases




In [26]:
# Forward Propagation
def forward_propagation(X, weights, biases, activation):
    """Run a forward pass and return the activations of every layer.

    Args:
        X: input batch; it is multiplied by ``weights[0]`` with ``np.dot``.
        weights: list of weight matrices, one per layer.
        biases: list of bias arrays, one per layer, added to each pre-activation.
        activation: per-layer activation names, each "relu" or "sigmoid".

    Returns:
        A list whose first element is ``X`` followed by the output of each
        layer, so it has ``len(weights) + 1`` entries.

    Raises:
        ValueError: if a layer's activation name is not supported. Previously
            such a layer appended nothing, so its output was silently dropped
            and the following layers ran on the wrong input.
    """
    activations = [X]
    for i in range(len(weights)):
        # Pre-activation of layer i, computed from the previous layer's output.
        z = np.dot(activations[-1], weights[i]) + biases[i]
        if activation[i] == "relu":
            activations.append(relu(z))
        elif activation[i] == "sigmoid":
            activations.append(sigmoid(z))
        else:
            raise ValueError(
                f"Unsupported activation {activation[i]!r} for layer {i}; "
                "expected 'relu' or 'sigmoid'"
            )
    return activations


In [27]:
# Backpropagation
def backpropagation(y, activations, weights, activation):
    """Compute weight and bias gradients for every layer, walking from output to input.

    Args:
        y: target array with the same shape as the final activation.
        activations: list returned by forward_propagation (input first, output last).
        weights: list of weight matrices, one per layer.
        activation: per-layer activation names, "relu" or "sigmoid".

    Returns:
        (gradients_w, gradients_b): lists aligned with ``weights``. Gradients are
        summed over the batch rows, not averaged.
    """
    n_layers = len(weights)
    gradients_w = [None] * n_layers
    gradients_b = [None] * n_layers

    # Signal arriving at the current layer's output; starts as prediction minus target.
    # NOTE(review): multiplying this by the output activation's derivative matches a
    # squared-error objective rather than the cross-entropy value that
    # train_network reports -- confirm this is intended.
    upstream = activations[-1] - y

    for layer in range(n_layers - 1, -1, -1):
        layer_output = activations[layer + 1]
        kind = activation[layer]
        # Both derivative helpers are evaluated on the layer's *output* value.
        if kind == "relu":
            delta = upstream * relu_derivative(layer_output)
        elif kind == "sigmoid":
            delta = upstream * sigmoid_derivative(layer_output)
        gradients_w[layer] = np.dot(activations[layer].T, delta)
        gradients_b[layer] = np.sum(delta, axis=0, keepdims=True)
        # Push the delta back through this layer's weights for the next iteration.
        upstream = np.dot(delta, weights[layer].T)

    return gradients_w, gradients_b

In [28]:
# RMSprop Optimizer
def rmsprop(weights, biases, gradients_w, gradients_b, lr, cache_w, cache_b, beta=0.99, epsilon=1e-8):
    """Apply one RMSprop step to every layer's weights and biases.

    For each parameter array the running average of squared gradients is updated as
    ``cache = beta * cache + (1 - beta) * grad**2`` and the parameter is then
    reduced by ``lr * grad / (sqrt(cache) + epsilon)``.

    The parameter arrays are updated in place and the cache lists are
    overwritten entry by entry; the same four lists are returned.
    """
    def _step(params, grads, caches, idx):
        # Update one cache entry, then move the matching parameter array in place.
        grad = grads[idx]
        caches[idx] = beta * caches[idx] + (1 - beta) * (grad ** 2)
        params[idx] -= lr * grad / (np.sqrt(caches[idx]) + epsilon)

    for idx in range(len(weights)):
        _step(weights, gradients_w, cache_w, idx)
        _step(biases, gradients_b, cache_b, idx)
    return weights, biases, cache_w, cache_b

# Load the Fashion-MNIST train/test splits (images plus integer class labels)
(X_train, y_train), (X_test, y_test) = fashion_mnist.load_data()

# Flatten each image into a single row and divide by 255.0
# (assumes 8-bit pixel values, giving inputs in [0, 1] -- TODO confirm)
X_train, X_test = (
    images.reshape(len(images), -1) / 255.0 for images in (X_train, X_test)
)

# Keep the integer test labels for the accuracy and confusion-matrix computations
y_test_original = y_test

# One-hot encode the training labels by picking rows of a 10x10 identity matrix
y_train = np.eye(10)[y_train]

In [29]:










def train_network(X_train, y_train, X_val, y_val, config):
    """Train a fully connected network with mini-batch RMSprop, logging metrics per epoch.

    Args:
        X_train: 2-D array of training inputs, one sample per row.
        y_train: one-hot training targets with 10 columns (the output layer has 10 units).
        X_val, y_val: accepted for interface compatibility; not used by this function.
        config: mapping providing 'hidden_size', 'hidden_layers', 'activation',
            'weight_init', 'batch_size', 'epochs' and 'learning_rate'. An optional
            'seed' entry (default 42) seeds NumPy's global random generator.
            Other keys (e.g. 'optimizer', 'weight_decay') are not read here.

    Returns:
        (weights, biases): the trained parameter lists, one entry per layer.
    """
    # Seed the global generator so initialisation and shuffling are reproducible.
    np.random.seed(config.get('seed', 42))
    layers = [X_train.shape[1]] + [config['hidden_size']] * config['hidden_layers'] + [10]
    # Hidden layers share the configured activation; the output layer is always sigmoid.
    activation = [config['activation']] * config['hidden_layers'] + ['sigmoid']

    weights, biases = initialize_weights(layers, config['weight_init'])
    cache_w = [np.zeros_like(w) for w in weights]
    cache_b = [np.zeros_like(b) for b in biases]

    batch_size = config['batch_size']
    epochs = config['epochs']
    lr = config['learning_rate']

    for epoch in range(epochs):
        # Reshuffle the samples at the start of every epoch.
        indices = np.random.permutation(X_train.shape[0])
        X_train_shuffled, y_train_shuffled = X_train[indices], y_train[indices]

        train_loss = 0
        train_correct = 0
        train_total = 0
        num_batches = 0

        for i in range(0, X_train_shuffled.shape[0], batch_size):
            X_batch = X_train_shuffled[i:i+batch_size]
            y_batch = y_train_shuffled[i:i+batch_size]

            # Forward propagation
            activations = forward_propagation(X_batch, weights, biases, activation)

            # Mean cross-entropy of this batch; 1e-8 keeps log() away from zero.
            output = activations[-1]
            train_loss += -np.sum(y_batch * np.log(output + 1e-8)) / len(y_batch)
            num_batches += 1

            # Calculate training accuracy (metrics use the pre-update parameters)
            train_preds = np.argmax(output, axis=1)
            train_true = np.argmax(y_batch, axis=1)
            train_correct += np.sum(train_preds == train_true)
            train_total += len(y_batch)

            # Backpropagation followed by one RMSprop step
            gradients_w, gradients_b = backpropagation(y_batch, activations, weights, activation)
            weights, biases, cache_w, cache_b = rmsprop(weights, biases, gradients_w, gradients_b, lr, cache_w, cache_b)

        # Average over the batches actually processed. Dividing by
        # `n_samples // batch_size` ignored a trailing partial batch and was zero
        # whenever the dataset was smaller than one batch.
        train_loss /= num_batches
        train_accuracy = train_correct / train_total

        # Log training metrics to Wandb
        wandb.log({
            "epoch": epoch + 1,
            "train_loss": train_loss,
            "train_accuracy": train_accuracy
        })

        # Print training metrics
        print(f"Epoch {epoch + 1}/{epochs}, "
              f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")

    return weights, biases




In [30]:

# Train the best model
# Hold out everything after the first 54000 rows for validation. The split is bound
# to new names so X_train / y_train keep the full data and re-running this cell
# yields the same split (slicing X_train in place left X_val empty on a second run).
TRAIN_SPLIT_SIZE = 54000
X_train_split, X_val = X_train[:TRAIN_SPLIT_SIZE], X_train[TRAIN_SPLIT_SIZE:]
y_train_split, y_val = y_train[:TRAIN_SPLIT_SIZE], y_train[TRAIN_SPLIT_SIZE:]
weights, biases = train_network(X_train_split, y_train_split, X_val, y_val, best_config)

# Evaluate on the test set
# Per-layer activation names: the configured activation for each hidden layer,
# sigmoid for the output layer (the same list train_network builds).
layer_activations = [best_config['activation']] * best_config['hidden_layers'] + ['sigmoid']
activations = forward_propagation(X_test, weights, biases, layer_activations)
test_output = activations[-1]
test_predictions = np.argmax(test_output, axis=1)

# One-hot encode y_test_original for loss calculation
y_test_one_hot = np.eye(10)[y_test_original]

# Calculate test loss (cross-entropy loss); 1e-8 keeps log() away from zero
test_loss = -np.sum(y_test_one_hot * np.log(test_output + 1e-8)) / len(y_test_original)

# Calculate test accuracy
test_accuracy = np.mean(test_predictions == y_test_original)

# Log test metrics to Wandb
wandb.log({
    "test_loss": test_loss,
    "test_accuracy": test_accuracy
})

# Print test metrics
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")

Epoch 1/5, Train Loss: 0.6836, Train Accuracy: 0.7995
Epoch 2/5, Train Loss: 0.4161, Train Accuracy: 0.8607
Epoch 3/5, Train Loss: 0.3894, Train Accuracy: 0.8736
Epoch 4/5, Train Loss: 0.3744, Train Accuracy: 0.8807
Epoch 5/5, Train Loss: 0.3677, Train Accuracy: 0.8884
Test Loss: 0.4300, Test Accuracy: 0.8700


In [31]:
# Confusion matrix of true test labels against predicted labels
cm = confusion_matrix(y_test_original, test_predictions)

# Display names for the ten class indices, in index order
class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
               'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']

# Draw the annotated heatmap on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='viridis',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title('Confusion Matrix for Fashion-MNIST Test Set', fontsize=16)
ax.set_xlabel('Predicted Labels', fontsize=14)
ax.set_ylabel('True Labels', fontsize=14)
plt.setp(ax.get_xticklabels(), rotation=45)
plt.setp(ax.get_yticklabels(), rotation=0)

# Write the figure to disk, then close it so it is not rendered inline
confusion_matrix_path = "confusion_matrix.png"
fig.savefig(confusion_matrix_path)
plt.close(fig)

# Upload the saved image to Wandb and end the run
wandb.log({"confusion_matrix": wandb.Image(confusion_matrix_path)})
wandb.finish()

0,1
epoch,▁▃▅▆█
test_accuracy,▁
test_loss,▁
train_accuracy,▁▆▇▇█
train_loss,█▂▁▁▁

0,1
epoch,5.0
test_accuracy,0.87
test_loss,0.42999
train_accuracy,0.88844
train_loss,0.36766
