### Milestone 1

</br>Author : Nadine Mohamed (20162200)
</br>Date : 12/12/2024

In [17]:
import pickle
import numpy as np
import csv
from sklearn.model_selection import train_test_split

# modify the following
path_train_data = './data/train_data.pkl'
path_test_data =  './data/test_data.pkl'
subset = 50000
h_shape = 128
lr = 0.01
epochs = 100
batch_size = 256

In [None]:
class simpleNN:
    """
    A simple fully connected neural network with one hidden layer using ReLU activation
    and softmax output for multi-class classification.
    """

    def __init__(self, X_shape, h_shape, y_shape, lr=0.01, seed=42):
        """
        args:
            X_shape : int
                The dimensionality of the input features.
            h_shape : int
                The number of hidden units.
            y_shape : int
                The number of output classes.
            lr : float
                Learning rate for gradient descent.
            seed : int
                Random seed for reproducibility.
        """

        np.random.seed(seed)
        self.W1 = np.random.randn(X_shape, h_shape) / np.sqrt(X_shape)
        self.b1 = np.zeros((1, h_shape))
        self.W2 = np.random.randn(h_shape, y_shape) / np.sqrt(h_shape)
        self.b2 = np.zeros((1, y_shape))
        self.lr = lr

    def relu(self, x):
        """
        Applies the ReLU activation function element-wise.

        args :
            x : np.ndarray
                Input array.

        output : np.ndarray
                Result after applying ReLU.
        """
        return np.maximum(0, x)

    def d_relu(self, x):
        """
        Computes the derivative of the ReLU function.

        args
        x : np.ndarray
            Input array (pre-activation).

        output : np.ndarray
            Derivative of ReLU.
        """
        return (x > 0).astype(float)

    def softmax(self, x):
        """
        Applies the softmax function row-wise for classification outputs.

        args
            x : np.ndarray
                Logits before softmax.

        output :
            n.ndarray
                Probabilities after softmax.
        """
        shifted = x - np.max(x, axis=1, keepdims=True)
        exp_shifted = np.exp(shifted)
        return exp_shifted / np.sum(exp_shifted, axis=1, keepdims=True)

    def cross_entropy(self, p, y_c):
        """
        Computes the cross-entropy loss.

        args :
            p : np.ndarray
                Predicted probabilities.
            y_c : np.ndarray
                One-hot encoded true labels.

        output :
            float
                The cross-entropy loss.
        """
        p_capped = np.clip(p, 1e-8, 1 - 1e-8)
        N = y_c.shape[0]
        loss = -np.sum(y_c * np.log(p_capped)) / N
        return loss
    
    def evaluate(self, X, Y_onehot):
        """
            Evaluates the model on given data.
            
            args :
            X : np.ndarray
                Input data.
            Y_onehot : np.ndarray
                One-hot encoded true labels.
            
            ouput :
                tuple
                    (loss, accuracy)
            """
        p, _ = self.fprop(X)
        loss = self.cross_entropy(p, Y_onehot)
        preds = np.argmax(p, axis=1)
        y_test = np.argmax(Y_onehot, axis=1)
        accuracy = np.mean(preds == y_test)
        return loss, accuracy

    def fprop(self, X):
        """
        Performs the forward pass.

        args :
            X : np.ndarray
                Input data.

        output :
            tuple
                p: predicted probabilities
                cache: intermediate values needed for backward pass
        """

        h1 = X.dot(self.W1) + self.b1
        a1 = self.relu(h1)
        h2 = a1.dot(self.W2) + self.b2
        p = self.softmax(h2)
        cache = (X, a1, h1, h2, p)
        return p, cache

    def bprop(self, cache, y_c):
        """
        Performs the backward pass and updates the parameters.

        args :
            cache : tuple
                Values stored during the forward pass.
            y_c : np.ndarray
                One-hot encoded true labels.
        """
        X, h1, a1, h2, p = cache
        N = X.shape[0]

        d_h2 = (p - y_c) / N
        d_W2 = a1.T.dot(d_h2)
        d_b2 = np.sum(d_h2, axis=0, keepdims=True)

        d_a1 = d_h2.dot(self.W2.T)
        d_h1 = d_a1 * self.d_relu(h1)
        d_W1 = X.T.dot(d_h1)
        d_b1 = np.sum(d_h1, axis=0, keepdims=True)

        # Parameter update
        self.W2 -= self.lr * d_W2
        self.b2 -= self.lr * d_b2
        self.W1 -= self.lr * d_W1
        self.b1 -= self.lr * d_b1

    def predict(self, X):
        """
        Predicts class labels for input data.

        args :
            X : np.ndarray
                Input data.

        output :
            np.ndarray
                Predicted class labels.
        """
        p, _ = self.fprop(X)
        return np.argmax(p, axis=1)

    def fit(
        self, X_train, Y_train_onehot, X_val, Y_val_onehot, epochs=50, batch_size=256
    ):
        """
        Trains the neural network using mini-batch gradient descent.

        args :
            X_train : np.ndarray
                Training input data.
            Y_train_onehot : np.ndarray
                One-hot encoded training labels.
            X_val : np.ndarray
                Validation input data.
            Y_val_onehot : np.ndarray
                One-hot encoded validation labels.
            epochs : int
                Number of training epochs.
            batch_size : int
                Size of each mini-batch.
        """

        # Training with mini-batch gradient descent
        num_samples = X_train.shape[0]
        for epoch in range(epochs):
            idx = np.random.permutation(num_samples)
            X_train_shuffled = X_train[idx]
            Y_train_onehot_shuffled = Y_train_onehot[idx]

            for i in range(0, num_samples, batch_size):
                X_batch = X_train_shuffled[i : i + batch_size]
                Y_batch = Y_train_onehot_shuffled[i : i + batch_size]

                p, cache = self.fprop(X_batch)
                self.bprop(cache, Y_batch)

            train_loss, train_acc = self.evaluate(X_train, Y_train_onehot)
            val_loss, val_acc = self.evaluate(X_val, Y_val_onehot)
            if (epoch + 1) % 5 == 0 or epoch == 0:
                print(
                    f"Epoch {epoch+1}/{epochs} "
                    f"- Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, "
                    f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}"
                )


def one_hot_encode(y, n_class):
    """
    One-hot encodes integer labels.
    
    args :
        y : np.ndarray
            Array of integer labels.
        num_classes : int
            Number of classes.
    
    output :
        np.ndarray
            One-hot encoded labels.
    """
    encoded = np.zeros((len(y), n_class))
    encoded[np.arange(len(y)), y] = 1
    return encoded

In [19]:
# Train 
with open(path_train_data, 'rb') as f:
    train_data = pickle.load(f)

X = np.array(train_data['images'])
Y = np.array(train_data['labels'])

X = X.reshape(X.shape[0], -1).astype(float)
X_mean = np.mean(X, axis=0)
X_std = np.std(X, axis=0) + 1e-8
X_normalized = (X - X_mean) / X_std


indices = np.random.permutation(len(X_normalized))[:subset]
X_subset = X_normalized[indices]
Y_subset = Y[indices]

X_train, X_val, Y_train, Y_val = train_test_split(X_subset, Y_subset, test_size=0.2, random_state=42)

num_classes = 4
Y_train_onehot = one_hot_encode(Y_train, num_classes)
Y_val_onehot = one_hot_encode(Y_val, num_classes)

X_shape = X_train.shape[1]
y_shape = num_classes

model = simpleNN(X_shape, h_shape, y_shape, lr, seed=42)
model.fit(X_train, Y_train_onehot, X_val, Y_val_onehot, epochs, batch_size)

Epoch 1/100 - Train Loss: 1.0571, Train Acc: 0.5709, Val Loss: 1.0657, Val Acc: 0.5723
Epoch 5/100 - Train Loss: 0.9307, Train Acc: 0.6475, Val Loss: 0.9465, Val Acc: 0.6383
Epoch 10/100 - Train Loss: 0.8729, Train Acc: 0.6759, Val Loss: 0.8954, Val Acc: 0.6658
Epoch 15/100 - Train Loss: 0.8343, Train Acc: 0.6939, Val Loss: 0.8628, Val Acc: 0.6843
Epoch 20/100 - Train Loss: 0.8055, Train Acc: 0.7065, Val Loss: 0.8393, Val Acc: 0.6960
Epoch 25/100 - Train Loss: 0.7786, Train Acc: 0.7179, Val Loss: 0.8180, Val Acc: 0.7027
Epoch 30/100 - Train Loss: 0.7558, Train Acc: 0.7290, Val Loss: 0.7984, Val Acc: 0.7147
Epoch 35/100 - Train Loss: 0.7379, Train Acc: 0.7360, Val Loss: 0.7851, Val Acc: 0.7203
Epoch 40/100 - Train Loss: 0.7200, Train Acc: 0.7433, Val Loss: 0.7711, Val Acc: 0.7254
Epoch 45/100 - Train Loss: 0.7048, Train Acc: 0.7486, Val Loss: 0.7592, Val Acc: 0.7285
Epoch 50/100 - Train Loss: 0.6907, Train Acc: 0.7550, Val Loss: 0.7489, Val Acc: 0.7373
Epoch 55/100 - Train Loss: 0.6798,

In [20]:
# Predict
with open('./data/test_data.pkl', 'rb') as f:
    test_data = pickle.load(f)

X_test_raw = np.array(test_data['images'])
X_test_flat = X_test_raw.reshape(X_test_raw.shape[0], -1).astype(float)
X_test_normalized = (X_test_flat - X_mean) / X_std

test_preds = model.predict(X_test_normalized)

with open('submission.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["ID", "Class"])
    for i, pred in enumerate(test_preds, start=1):
        writer.writerow([i, pred])

print("Submission saved to 'submission.csv'.")

Submission saved to 'submission.csv'.


---

#### Sources/references:

1. IFT6390 Course material 
2. 'Implementation of neural network from scratch using NumPy' : https://www.geeksforgeeks.org/implementation-of-neural-network-from-scratch-using-numpy/
2. 'Creating a Neural Network from Scratch Using Python and NumPy' : https://lumos.blog/creating-a-neural-network-from-scratch-using-python-and-numpy/
3. The help of AI tools (Co-pilot, ChatGPT, Gemini) 
AI tools, including GitHub Co-pilot and ChatGPT, were utilized during the coding process. These tools primarily contributed to generating docstrings, refining code structure, and offering suggestions inline within the IDE.