## Data Preprocessing ##

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelBinarizer
#%matplotlib notebook
%matplotlib inline
import matplotlib.pyplot as plt
from IPython.core.debugger import set_trace
import warnings
warnings.filterwarnings('ignore')
from typing import List
from tqdm import tqdm
lb = LabelBinarizer()

# Load the training and test data
train_df = pd.read_csv('archive/sign_mnist_train.csv')
test_df = pd.read_csv('archive/sign_mnist_test.csv')

# Separate features (every column except 'label') and integer labels
x_train = train_df.drop('label', axis=1).values
y_train = train_df['label'].values
x_test = test_df.drop('label', axis=1).values
y_test = test_df['label'].values

# Standardize BOTH splits with statistics estimated on the training split
# only. Using the test set's own mean/std would put the two splits on
# different scales and leak test-set information into the evaluation.
train_mean = np.mean(x_train, axis=0)   # per-pixel mean
x_train = x_train - train_mean
x_test = x_test - train_mean

train_std = np.std(x_train)             # one global scale for all pixels
x_train = x_train / train_std
x_test = x_test / train_std

# Image layout (N, 28, 28, 1). Dense layers need this flattened back to
# (N, 784) before use.
x_train = x_train.reshape(-1, 28, 28, 1)
x_test = x_test.reshape(-1, 28, 28, 1)

# One-hot encode the labels by indexing rows of an identity matrix; the
# width is fixed so both splits always share the same column layout.
num_classes = 26
y_train = np.eye(num_classes)[y_train]
y_test = np.eye(num_classes)[y_test]

# Confirm preprocessing
print("Training data shape:", x_train.shape)
print("Test data shape:", x_test.shape)
print("Training labels shape:", y_train.shape)
print("Test labels shape:", y_test.shape)

In [None]:
class NeuralNetLayer:
    """Abstract interface shared by every layer of the network.

    Attributes:
        gradient: Whatever a concrete layer caches for/after its backward
            pass; ``None`` until the layer sets it.
        parameters: The layer's trainable arrays, or ``None`` when the
            layer has nothing to train.
    """

    def __init__(self):
        self.gradient = self.parameters = None

    def forward(self, x):
        """Map the input ``x`` to the layer output (subclass hook)."""
        raise NotImplementedError()

    def backward(self, gradient):
        """Propagate ``gradient`` back through the layer (subclass hook)."""
        raise NotImplementedError()
    
class LinearLayer(NeuralNetLayer):
    """Fully connected (affine) layer: ``output = x @ w.T + b``.

    ``w`` has shape ``(output_size, input_size)`` and ``b`` has shape
    ``(output_size,)``; both are exposed through ``parameters``.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        self.ni, self.no = input_size, output_size
        # He initialization: scale unit-normal draws by sqrt(2 / fan_in).
        he_scale = np.sqrt(2. / input_size)
        self.w = np.random.randn(output_size, input_size) * he_scale
        self.b = np.random.randn(output_size)
        # Input of the most recent forward pass, needed to compute dw.
        self.cur_input = None
        self.parameters = [self.w, self.b]

    def forward(self, x):
        """Cache ``x`` and return its affine transform."""
        self.cur_input = x
        projected = x @ self.w.T
        return projected + self.b

    def backward(self, gradient):
        """Store ``[dw, db]`` in ``self.gradient`` and return the gradient w.r.t. the input.

        ``gradient`` is the loss gradient w.r.t. this layer's output. Both
        parameter gradients are summed over the batch axis (axis 0).
        """
        assert self.cur_input is not None, "Must call forward before backward"
        weight_grad = gradient.T @ self.cur_input
        bias_grad = gradient.sum(axis=0)
        self.gradient = [weight_grad, bias_grad]
        input_grad = gradient @ self.w
        return input_grad

class ReLULayer(NeuralNetLayer):
    """Element-wise rectifier, ``max(0, x)``."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Return ``max(0, x)`` and cache the 0/1 derivative mask in ``self.gradient``."""
        is_positive = x > 0
        # The local derivative is 1 where the unit is active and 0 elsewhere.
        self.gradient = np.where(is_positive, 1.0, 0.0)
        rectified = np.maximum(0, x)
        return rectified

    def backward(self, gradient):
        """Gate the incoming gradient with the mask cached by ``forward``."""
        assert self.gradient is not None, "Must call forward before backward"
        gated = gradient * self.gradient
        return gated

class SoftmaxOutputLayer(NeuralNetLayer):
    """Softmax output layer, intended to be the last layer of the network.

    ``backward`` takes the one-hot *targets* (not an upstream gradient) and
    returns ``probs - target``, i.e. the gradient of the summed
    cross-entropy loss w.r.t. the logits.
    """

    def __init__(self):
        super().__init__()
        # Probabilities produced by the most recent forward pass.
        self.cur_probs = None

    def forward(self, x):
        """Return softmax probabilities along the last axis of ``x``.

        The row-wise maximum is subtracted before exponentiating. This
        leaves the result mathematically unchanged but keeps ``np.exp``
        from overflowing to ``inf`` (and the division from producing NaN)
        when logits are large.
        """
        shifted = x - np.max(x, axis=-1, keepdims=True)
        exps = np.exp(shifted)
        probs = exps / np.sum(exps, axis=-1, keepdims=True)
        self.cur_probs = probs
        return probs

    def backward(self, target):
        """Return ``probs - target`` for the batch seen by the last ``forward``."""
        assert self.cur_probs is not None, "Must call forward before backward"
        return self.cur_probs - target
    
class MLP:
    """Sequential stack of layers trained with mini-batch gradient descent.

    Layers are applied in the order given. The last layer's ``backward``
    receives the one-hot targets; every earlier layer receives the gradient
    returned by the layer after it. A layer is treated as trainable when it
    has a non-empty ``parameters`` list; after ``backward`` its ``gradient``
    attribute must hold one gradient array per parameter, in the same
    order.
    """

    def __init__(self, *args: "NeuralNetLayer"):
        self.layers = args

    def forward(self, x):
        """Run ``x`` through every layer in order and return the final output."""
        for layer in self.layers:
            x = layer.forward(x)
        return x

    def backward(self, target):
        """Back-propagate from the output layer to the first layer.

        Each layer stores its own parameter gradients as a side effect;
        nothing is returned.
        """
        for layer in reversed(self.layers):
            target = layer.backward(target)

    def fit(self, x, y, iterations=10, learning_rate=0.1, batch_size=26, lambda_reg=0.0):
        """Train the network and return the per-batch training history.

        Args:
            x: Input array whose first axis indexes samples.
            y: One-hot target array aligned with ``x``.
            iterations: Number of passes (epochs) over the data.
            learning_rate: Gradient-descent step size.
            batch_size: Samples per mini-batch; the last batch may be smaller.
            lambda_reg: L2 penalty strength applied to weight matrices.

        Returns:
            ``{"loss": [...], "accuracy": [...]}`` with one entry per
            mini-batch, both measured before that batch's parameter update.

        Raises:
            ValueError: If the data is empty, ``x`` and ``y`` differ in
                length, or ``batch_size`` is smaller than 1.
        """
        x = np.asarray(x)
        y = np.asarray(y)
        num_samples = len(x)
        if num_samples == 0:
            raise ValueError("cannot fit on an empty dataset")
        if num_samples != len(y):
            raise ValueError("x and y must contain the same number of samples")
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        history = {
            "loss": [],
            "accuracy": []
        }
        # Ceiling division: how many history entries one epoch contributes.
        batches_per_epoch = -(-num_samples // batch_size)

        for epoch in range(iterations):
            # Visit the samples in a fresh random order every epoch.
            order = np.random.permutation(num_samples)

            for start_idx in range(0, num_samples, batch_size):
                batch = order[start_idx:start_idx + batch_size]
                x_batch = x[batch]
                y_batch = y[batch]

                # Forward pass
                output = self.forward(x_batch)

                # Mean cross-entropy plus 0.5 * lambda * ||W||^2, whose
                # gradient is the lambda * W term used in the update.
                data_loss = self.compute_loss(output, y_batch)
                reg_loss = 0.5 * lambda_reg * sum(
                    np.sum(weights ** 2) for weights in self._weight_matrices()
                )
                history["loss"].append(data_loss + reg_loss)
                history["accuracy"].append(self.evaluate_acc(y_batch, output))

                # Backward pass and parameter update
                self.backward(y_batch)
                gradients = self._collect_gradients(len(batch))
                self.update_parameters(gradients, learning_rate, lambda_reg)

            # Verbose output for tracking progress
            if (epoch + 1) % 10 == 0 or epoch == iterations - 1:
                epoch_loss = np.mean(history["loss"][-batches_per_epoch:])
                epoch_acc = np.mean(history["accuracy"][-batches_per_epoch:])
                print(f'Epoch {epoch + 1}/{iterations} - Loss: {epoch_loss}, '
                      f'Accuracy: {epoch_acc}')

        return history

    def _weight_matrices(self):
        """Yield every trainable array with more than one dimension (weights, not biases)."""
        for layer in self.layers:
            for param in getattr(layer, "parameters", None) or ():
                if np.ndim(param) > 1:
                    yield param

    def _collect_gradients(self, batch_len):
        """Return one entry per layer: batch-averaged gradients, or ``None`` if not trainable.

        The layers accumulate gradients summed over the batch, while the
        reported loss is a batch mean; dividing by ``batch_len`` makes the
        update the gradient of the reported loss.
        """
        collected = []
        for layer in self.layers:
            if getattr(layer, "parameters", None):
                collected.append([grad / batch_len for grad in layer.gradient])
            else:
                collected.append(None)
        return collected

    def update_parameters(self, gradients, learning_rate, lambda_reg=0.0):
        """Apply one gradient-descent step in place.

        Args:
            gradients: Sequence aligned with ``self.layers``; each entry is
                the list of gradients for that layer's ``parameters`` or
                ``None`` to skip the layer.
            learning_rate: Step size.
            lambda_reg: L2 strength; adds ``lambda_reg * W`` to the gradient
                of arrays with more than one dimension. Biases are not
                decayed.
        """
        for layer, layer_grads in zip(self.layers, gradients):
            params = getattr(layer, "parameters", None)
            if not params or layer_grads is None:
                continue
            for param, grad in zip(params, layer_grads):
                step = grad
                if lambda_reg and np.ndim(param) > 1:
                    step = grad + lambda_reg * param
                # In-place so the arrays the layer uses in forward() change too.
                param -= learning_rate * step

    def compute_loss(self, output, y_batch):
        """Return the mean cross-entropy between probabilities and one-hot targets."""
        m = y_batch.shape[0]  # Number of examples
        # Clip so that log() never sees an exact 0.
        output_clipped = np.clip(output, 1e-7, 1 - 1e-7)
        loss = -np.sum(y_batch * np.log(output_clipped)) / m
        return loss

    def predict(self, x):
        """Return the index of the highest-scoring class for every row of ``x``."""
        predicts = self.forward(x)
        return np.argmax(predicts, axis=1)

    def evaluate_acc(self, y_true, y_pred):
        """Return the fraction of matching labels.

        Either argument may be a 1-D array of class indices or a 2-D array
        of one-hot rows / class scores; 2-D inputs are reduced with argmax
        over the last axis before comparing.
        """
        y_true = np.asarray(y_true)
        y_pred = np.asarray(y_pred)
        if y_true.ndim > 1:
            y_true = np.argmax(y_true, axis=-1)
        if y_pred.ndim > 1:
            y_pred = np.argmax(y_pred, axis=-1)
        return np.mean(y_true == y_pred)

## Experiments ##


### Experiment 1 ###

In [None]:
from sklearn.metrics import accuracy_score

def build_and_train_mlp(x_train, y_train, x_test, y_test, hidden_layers, iterations=50, learning_rate=0.01):
    """Build a ReLU MLP with the given hidden widths, train it and return its test accuracy.

    Args:
        x_train, x_test: Input arrays whose first axis indexes samples.
            Any trailing axes (e.g. image height/width/channel) are
            flattened, because the dense layers work on 2-D input.
        y_train: One-hot training targets; its width sets the number of
            output units.
        y_test: Test targets, either one-hot rows or 1-D class indices.
        hidden_layers: Width of each hidden layer, in order. An empty
            sequence yields a single linear layer feeding the softmax.
        iterations: Number of training epochs passed to ``MLP.fit``.
        learning_rate: Step size passed to ``MLP.fit``.

    Returns:
        Fraction of test samples classified correctly.
    """
    x_train = np.asarray(x_train).reshape(len(x_train), -1)
    x_test = np.asarray(x_test).reshape(len(x_test), -1)
    y_train = np.asarray(y_train)
    y_test = np.asarray(y_test)

    input_size = x_train.shape[1]
    # Take the class count from the targets instead of hard-coding it, so
    # the softmax output always lines up with the one-hot labels.
    output_size = y_train.shape[1]

    # widths[i] -> widths[i + 1] describes the i-th hidden dense layer.
    widths = [input_size, *hidden_layers]
    layers = []
    for fan_in, fan_out in zip(widths, widths[1:]):
        layers.append(LinearLayer(fan_in, fan_out))
        layers.append(ReLULayer())  # ReLU activation after each hidden layer
    layers.append(LinearLayer(widths[-1], output_size))  # output layer
    layers.append(SoftmaxOutputLayer())  # Softmax layer for classification

    # Instantiate and train the MLP model
    mlp = MLP(*layers)
    mlp.fit(x_train, y_train, iterations=iterations, learning_rate=learning_rate, batch_size=26)

    # predict() returns class indices, so one-hot targets are reduced to match.
    y_pred = mlp.predict(x_test)
    y_true = np.argmax(y_test, axis=1) if y_test.ndim > 1 else y_test
    accuracy = accuracy_score(y_true, y_pred)
    return accuracy

# Experiment parameters
hidden_units_options = [32, 64, 128, 256]
# Only the LENGTH of each entry matters (the network depth); the widths are
# replaced by each value from hidden_units_options below.
models_architecture = [
    [],  # Model with no hidden layer, directly mapping inputs to outputs
    [64],  # Model with one hidden layer
    [64, 64]  # Model with two hidden layers
]

# Results dictionary: str(architecture) -> accuracies, one per width option
results = {}

# Run experiments
for architecture in models_architecture:
    arch_results = []
    for units in hidden_units_options:
        # Same depth as the template, every hidden layer `units` wide.
        # With no hidden layers this is [] for every width option.
        adjusted_architecture = [units] * len(architecture)
        accuracy = build_and_train_mlp(
            x_train, y_train, 
            x_test, y_test,
            adjusted_architecture
        )
        arch_results.append(accuracy)
        print(f"Architecture {adjusted_architecture}: Test Accuracy = {accuracy}")
    results[str(architecture)] = arch_results

# Now you can compare the results stored in `results`

### Experiment 2 ###

In [None]:
class SigmoidLayer(NeuralNetLayer):
    """Element-wise logistic activation, ``1 / (1 + exp(-x))``."""

    def forward(self, x):
        """Return the sigmoid of ``x`` and cache it in ``self.output`` for ``backward``."""
        activated = 1 / (1 + np.exp(-x))
        self.output = activated
        return activated

    def backward(self, dout):
        """Scale ``dout`` by the sigmoid derivative ``s * (1 - s)`` of the cached output."""
        local_slope = self.output * (1 - self.output)
        return dout * local_slope

class LeakyReLULayer(NeuralNetLayer):
    """Leaky rectifier: ``x`` where ``x > 0``, ``alpha * x`` elsewhere.

    Args:
        alpha: Slope applied to non-positive inputs.
    """

    def __init__(self, alpha=0.01):
        # Initialise the base class so `gradient` / `parameters` exist like
        # on every other layer.
        super().__init__()
        self.alpha = alpha
        # Input of the most recent forward pass.
        self.x = None

    def forward(self, x):
        """Cache ``x`` and return the leaky-rectified values."""
        self.x = x
        return np.where(x > 0, x, self.alpha * x)

    def backward(self, dout):
        """Scale ``dout`` by the local slope: 1 where ``x > 0``, ``alpha`` elsewhere.

        The slope is built with the same ``x > 0`` test as ``forward``, so
        both passes agree at ``x == 0``. It is always floating point, so
        ``alpha`` is not truncated when the cached input has an integer
        dtype.
        """
        assert self.x is not None, "Must call forward before backward"
        slope = np.where(self.x > 0, 1.0, self.alpha)
        return dout * slope

def train_model_with_activation(x_train, y_train, x_test, y_test, activation_layer, hidden_units=(64, 64)):
    """Train an MLP using the given hidden activation and return its test accuracy.

    Args:
        x_train, x_test: Input arrays whose first axis indexes samples;
            trailing axes are flattened for the dense layers.
        y_train: One-hot training targets; its width sets the number of
            output units.
        y_test: Test targets, either one-hot rows or 1-D class indices.
        activation_layer: Zero-argument callable returning a new activation
            layer; called once per hidden layer.
        hidden_units: Width of each hidden layer, in order.

    Returns:
        Fraction of test samples classified correctly.
    """
    x_train = np.asarray(x_train).reshape(len(x_train), -1)
    x_test = np.asarray(x_test).reshape(len(x_test), -1)
    y_train = np.asarray(y_train)
    y_test = np.asarray(y_test)
    num_classes = y_train.shape[1]

    # Chain the widths so each dense layer maps the PREVIOUS width to the
    # current one; this also works when the hidden widths differ.
    widths = [x_train.shape[1], *hidden_units]
    layers = []
    for fan_in, fan_out in zip(widths, widths[1:]):
        layers.append(LinearLayer(fan_in, fan_out))
        layers.append(activation_layer())
    layers.append(LinearLayer(widths[-1], num_classes))
    layers.append(SoftmaxOutputLayer())

    # Create and train the MLP model
    mlp = MLP(*layers)
    mlp.fit(x_train, y_train, iterations=50, learning_rate=0.01)

    # predict() returns class indices, so one-hot targets are reduced to match.
    y_pred = mlp.predict(x_test)
    y_true = np.argmax(y_test, axis=1) if y_test.ndim > 1 else y_test
    accuracy = accuracy_score(y_true, y_pred)
    return accuracy

# Hidden-layer activations under comparison, keyed by the label used in the
# report. Every value is a zero-argument factory so that each hidden layer
# receives its own layer object.
activations = {
    "Sigmoid": SigmoidLayer,
    "Leaky ReLU": lambda: LeakyReLULayer(alpha=0.01),
    "ReLU": ReLULayer,
}

# Fit one two-hidden-layer network per activation and record its accuracy.
results = {}
for name, activation_layer in activations.items():
    accuracy = train_model_with_activation(
        x_train, y_train, x_test, y_test, activation_layer, hidden_units=[64, 64],
    )
    results[name] = accuracy
    print(f"{name} activation: Test Accuracy = {accuracy}")

### Experiment 3 ###

In [None]:
lambdas = [0, 0.001, 0.01, 0.1, 1]  # Different values of lambda for L2 regularization
batch_size = 64  # Or any other batch size that you want to use

# Dense layers need (n_samples, n_features), so flatten any image axes into
# local copies and leave the module-level arrays untouched.
x_train_flat = x_train.reshape(len(x_train), -1)
x_test_flat = x_test.reshape(len(x_test), -1)
# Width of the one-hot targets = number of output units.
n_outputs = y_train.shape[1]
# predict() returns class indices, so reduce one-hot test targets to match.
y_test_labels = np.argmax(y_test, axis=1) if y_test.ndim > 1 else y_test

for lambda_reg in lambdas:
    print(f"Training with lambda = {lambda_reg}")
    # A fresh network per lambda so runs do not share trained weights.
    mlp = MLP(
        LinearLayer(x_train_flat.shape[1], 64),
        ReLULayer(),
        LinearLayer(64, 64),
        ReLULayer(),
        LinearLayer(64, n_outputs),
        SoftmaxOutputLayer()
    )
    mlp.fit(x_train_flat, y_train, iterations=50, learning_rate=0.01, batch_size=batch_size, lambda_reg=lambda_reg)
    y_pred = mlp.predict(x_test_flat)
    accuracy = accuracy_score(y_test_labels, y_pred)
    print(f"Accuracy with L2 regularization (lambda={lambda_reg}): {accuracy}")

### Experiment 4 ###

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score

# Define the ConvNet
class ConvNet(nn.Module):
    """Three conv/max-pool stages followed by two fully connected layers.

    Args:
        fc_units: Width of the hidden fully connected layer.
        num_classes: Number of output classes (size of the final layer).

    ``fc1`` is sized for 1-channel 28x28 inputs: the three 2x2 poolings
    reduce 28 -> 14 -> 7 -> 3, leaving 128 feature maps of 3x3.
    """

    def __init__(self, fc_units, num_classes=10):
        super(ConvNet, self).__init__()
        # kernel 5 with padding 2 and stride 1 keeps the spatial size, so
        # only the pooling in forward() shrinks the feature maps.
        self.conv1 = nn.Conv2d(1, 32, kernel_size=5, stride=1, padding=2)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=5, stride=1, padding=2)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=5, stride=1, padding=2)
        self.fc1 = nn.Linear(128 * 3 * 3, fc_units)
        self.fc2 = nn.Linear(fc_units, num_classes)

    def forward(self, x):
        """Return log-probabilities of shape ``(batch, num_classes)``."""
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2(x), 2))
        x = F.relu(F.max_pool2d(self.conv3(x), 2))
        x = x.view(x.size(0), -1)  # Flatten everything except the batch axis
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)

# Set device: use the GPU when CUDA is available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define data preprocessing, applied to every image as it is loaded
transform = transforms.Compose([
    transforms.Grayscale(num_output_channels=1), # Ensure image is grayscale
    transforms.ToTensor(),
    # Subtract 0.5 and divide by 0.5 on the single channel
    # (presumably maps [0, 1] pixel values to [-1, 1] - TODO confirm).
    transforms.Normalize((0.5,), (0.5,))
])

# Load the dataset (adjust paths as necessary)
# NOTE(review): both roots are placeholders and must point at real
# directories; ImageFolder presumably expects one sub-folder per class
# - confirm against the dataset layout.
train_dataset = datasets.ImageFolder(root='path_to_train_dataset', transform=transform)
test_dataset = datasets.ImageFolder(root='path_to_test_dataset', transform=transform)

# Training batches are reshuffled; test batches keep a fixed order.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1000, shuffle=False)

# Train and evaluate the model
def train_and_evaluate(hidden_units, epochs=10):
    """Train a ConvNet on ``train_loader`` and report its loss/accuracy on ``test_loader``.

    Args:
        hidden_units: Width of the ConvNet's hidden fully connected layer.
        epochs: Number of passes over the training loader.

    Returns:
        Test accuracy as a fraction in [0, 1].
    """
    model = ConvNet(hidden_units)
    # If the training dataset exposes its class list and the count differs
    # from the model's output size, swap in a matching output layer before
    # training; otherwise the loss would fail on out-of-range targets.
    dataset_classes = getattr(train_loader.dataset, "classes", None)
    if dataset_classes is not None and len(dataset_classes) != model.fc2.out_features:
        model.fc2 = nn.Linear(model.fc2.in_features, len(dataset_classes))
    model = model.to(device)

    optimizer = optim.Adam(model.parameters())
    # NOTE: the model already returns log-probabilities. CrossEntropyLoss
    # applies log-softmax again, which leaves log-probabilities unchanged,
    # so the loss value is still the usual cross-entropy.
    criterion = nn.CrossEntropyLoss()

    # Training loop
    for epoch in range(epochs):
        model.train()  # evaluation below switches to eval mode
        running_loss = 0.0
        seen = 0
        for data, target in train_loader:
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            # criterion returns a batch mean; weight by batch size so the
            # epoch figure is a true per-sample average.
            running_loss += loss.item() * data.size(0)
            seen += data.size(0)
        epoch_loss = running_loss / seen if seen else float("nan")
        print(f"Epoch {epoch}, Loss: {epoch_loss}")

    # Evaluation
    model.eval()
    test_loss = 0.0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            # Turn the batch mean back into a batch sum before dividing by
            # the number of samples below.
            test_loss += criterion(output, target).item() * data.size(0)
            pred = output.argmax(dim=1)
            correct += (pred == target).sum().item()
    num_test = len(test_loader.dataset)
    test_loss /= num_test
    print(f"Test set: Average loss: {test_loss:.4f}, Accuracy: {correct}/{num_test} ({100. * correct / num_test:.2f}%)")
    return correct / num_test

# Sweep the width of the ConvNet's hidden fully connected layer; every
# value trains and evaluates a separate model.
hidden_units_options = [32, 64, 128, 256]
for hidden_units in hidden_units_options:
    print(f"Training with {hidden_units} hidden units")
    train_and_evaluate(hidden_units)

### Experiment 5 ### 