# Combined Linear and Neural Network Classification Notebook
This notebook consolidates code for Binary Classification, Multiclass Softmax Regression (Manual/PyTorch), and Multilayer Neural Network analysis on the MNIST dataset.

## 1. Setup and Configuration

In [None]:
import torch
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torchvision import datasets, transforms
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
import torch.nn as nn
import torch.optim as optim
import struct
from array import array
from os.path import join
import random
import seaborn as sns
import time
import warnings
warnings.filterwarnings('ignore')

In [None]:
# --- Global configuration ---------------------------------------------------
# One seed shared by every random number generator seeded below.
SEED = 42

# Hyperparameters shared by the experiments in this notebook.
INPUT_SIZE = 28 * 28  # pixels per flattened image
BATCH_SIZE = 64
LEARNING_RATE = 0.01
NUM_EPOCHS = 50

# Directory that holds the raw MNIST IDX files.
input_path = 'MNIST_Dataset'

# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Seed NumPy and PyTorch (and CUDA when it is the active device).
np.random.seed(SEED)
torch.manual_seed(SEED)
if DEVICE.type == "cuda":
    torch.cuda.manual_seed(SEED)

## 2. Data Preparation Utilities (Multiclass and Binary)

In [None]:
class MnistDataloader(object):
    """Reader for MNIST-style IDX image and label files.

    The four file paths given to the constructor are the ones that
    ``load_data`` reads.
    """

    def __init__(self, training_images_filepath, training_labels_filepath,
                 test_images_filepath, test_labels_filepath):
        self.training_images_filepath = training_images_filepath
        self.training_labels_filepath = training_labels_filepath
        self.test_images_filepath = test_images_filepath
        self.test_labels_filepath = test_labels_filepath

    def read_images_labels(self, images_filepath, labels_filepath):
        """Read one IDX image file and its matching IDX label file.

        Args:
            images_filepath: Path to an IDX3 image file (magic number 2051).
            labels_filepath: Path to an IDX1 label file (magic number 2049).

        Returns:
            ``(images, labels)`` where ``images`` is a list with one entry per
            image (each entry a list of ``rows`` NumPy row arrays of length
            ``cols``) and ``labels`` is an ``array('B')`` of label bytes.

        Raises:
            ValueError: If either file's magic number is not the expected one.
        """
        with open(labels_filepath, 'rb') as file:
            magic, size = struct.unpack(">II", file.read(8))
            if magic != 2049:
                raise ValueError('Magic number mismatch, expected 2049, got {}'.format(magic))
            labels = array("B", file.read())

        with open(images_filepath, 'rb') as file:
            magic, size, rows, cols = struct.unpack(">IIII", file.read(16))
            if magic != 2051:
                raise ValueError('Magic number mismatch, expected 2051, got {}'.format(magic))
            image_data = array("B", file.read())

        # Use the dimensions from the file header rather than assuming 28x28,
        # so any IDX3 image file can be read.
        pixels_per_image = rows * cols
        images = []
        for i in range(size):
            img = np.array(image_data[i * pixels_per_image:(i + 1) * pixels_per_image])
            # Stored as a list of row arrays, the same structure callers got before.
            images.append(list(img.reshape(rows, cols)))

        return images, labels

    def load_data(self):
        """Load the training and test sets from the paths given at construction.

        Returns:
            ``((x_train, y_train), (x_test, y_test))``, each pair as returned
            by ``read_images_labels``.
        """
        x_train, y_train = self.read_images_labels(
            self.training_images_filepath, self.training_labels_filepath)
        x_test, y_test = self.read_images_labels(
            self.test_images_filepath, self.test_labels_filepath)
        return (x_train, y_train), (x_test, y_test)

In [None]:
def load_and_prepare_multiclass_data(random_state=SEED):
    """Load the MNIST IDX files and build train/validation/test splits.

    The official train and test files are concatenated, then split with
    stratification: 40% is held out first and that hold-out is halved into
    validation and test. Images are flattened to ``INPUT_SIZE`` features and
    divided by 255.

    Args:
        random_state: Seed passed to both ``train_test_split`` calls.

    Returns:
        A 10-tuple: the train/val/test feature tensors, the train/val/test
        label tensors, the train/val/test flattened NumPy feature arrays, and
        the NumPy test labels.
    """
    def _idx_file(filename):
        return join(input_path, filename)

    reader = MnistDataloader(
        training_images_filepath=_idx_file('train-images.idx3-ubyte'),
        training_labels_filepath=_idx_file('train-labels.idx1-ubyte'),
        test_images_filepath=_idx_file('t10k-images.idx3-ubyte'),
        test_labels_filepath=_idx_file('t10k-labels.idx1-ubyte'),
    )
    (raw_train_x, raw_train_y), (raw_test_x, raw_test_y) = reader.load_data()

    # Merge the official splits so the notebook can define its own partition.
    X = np.concatenate(
        [np.array(raw_train_x, dtype=np.float32), np.array(raw_test_x, dtype=np.float32)],
        axis=0,
    )
    y = np.concatenate(
        [np.array(raw_train_y, dtype=np.int64), np.array(raw_test_y, dtype=np.int64)],
        axis=0,
    )

    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        X, y, test_size=0.4, stratify=y, random_state=random_state
    )
    X_val, X_eval, y_val, y_eval = train_test_split(
        X_holdout, y_holdout, test_size=0.5, stratify=y_holdout, random_state=random_state
    )

    def _flatten_and_scale(images):
        # One row per image, pixel values divided by 255.
        return images.reshape(-1, INPUT_SIZE) / 255.0

    X_train_flat = _flatten_and_scale(X_fit)
    X_val_flat = _flatten_and_scale(X_val)
    X_test_flat = _flatten_and_scale(X_eval)

    def _features(values):
        return torch.tensor(values, dtype=torch.float32)

    def _labels(values):
        return torch.tensor(values, dtype=torch.long)

    return (
        _features(X_train_flat), _features(X_val_flat), _features(X_test_flat),
        _labels(y_fit), _labels(y_val), _labels(y_eval),
        X_train_flat, X_val_flat, X_test_flat,
        y_eval,
    )

In [None]:
class BinaryMNISTDataset(Dataset):
    """Dataset of flattened images paired with float targets of shape (1,)."""

    def __init__(self, data, targets):
        # Flatten every image to INPUT_SIZE features.
        flat_images = data.float().reshape(-1, INPUT_SIZE)
        # Targets become a float column so each item has shape (1,).
        column_targets = targets.float().unsqueeze(1)
        self.data = flat_images
        self.targets = column_targets

    def __len__(self):
        # The data is always 2-D after the reshape, so dim 0 is the sample count.
        return self.data.size(0)

    def __getitem__(self, idx):
        sample = self.data[idx]
        target = self.targets[idx]
        return sample, target

def get_binary_dataloaders(batch_size=BATCH_SIZE, random_state=SEED):
    """Build train/val/test loaders for the digits 0 and 1.

    Loads the torchvision MNIST training split (downloading to ``./data`` if
    needed), keeps only samples labelled 0 or 1, scales pixels by 1/255 and
    standardizes them with the constants 0.1307 / 0.3081. The data is split
    with stratification: 20% test, then 25% of the remainder as validation.

    Args:
        batch_size: Batch size used by all three loaders.
        random_state: Seed passed to both ``train_test_split`` calls.

    Returns:
        ``(train_loader, val_loader, test_loader)``; only the training loader
        shuffles.
    """
    mnist_train = datasets.MNIST(root='./data', train=True, download=True)

    all_labels = mnist_train.targets
    keep = (all_labels == 0) | (all_labels == 1)
    Y_full = all_labels[keep]

    mean, std = 0.1307, 0.3081
    X_full = mnist_train.data[keep].float() / 255.0
    X_full = (X_full - mean) / std

    # First carve out the test set...
    X_rest, X_test, Y_rest, Y_test = train_test_split(
        X_full, Y_full, test_size=0.2, random_state=random_state, stratify=Y_full
    )
    # ...then split the remainder into training and validation parts.
    X_train, X_val, Y_train, Y_val = train_test_split(
        X_rest, Y_rest, test_size=(0.2/0.8), random_state=random_state, stratify=Y_rest
    )

    train_loader = DataLoader(
        BinaryMNISTDataset(X_train, Y_train), batch_size=batch_size, shuffle=True
    )
    val_loader = DataLoader(
        BinaryMNISTDataset(X_val, Y_val), batch_size=batch_size, shuffle=False
    )
    test_loader = DataLoader(
        BinaryMNISTDataset(X_test, Y_test), batch_size=batch_size, shuffle=False
    )
    return train_loader, val_loader, test_loader

In [None]:
def create_multiclass_dataloaders(batch_size=BATCH_SIZE):
    """Wrap the module-level multiclass tensors in DataLoaders.

    Reads the globals ``X_train_t``/``y_train_t``, ``X_val_t``/``y_val_t`` and
    ``X_test_t``/``y_test_t``, so those must be defined before calling.

    Args:
        batch_size: Batch size used by all three loaders.

    Returns:
        ``(train_loader, val_loader, test_loader)``; only the training loader
        shuffles.
    """
    # (features, labels, shuffle) for each split, in return order.
    split_specs = (
        (X_train_t, y_train_t, True),
        (X_val_t, y_val_t, False),
        (X_test_t, y_test_t, False),
    )
    loaders = [
        DataLoader(TensorDataset(features, labels), batch_size=batch_size, shuffle=shuffle)
        for features, labels, shuffle in split_specs
    ]
    return tuple(loaders)

## 3. Binary Classification (Scratch Implementation)

In [None]:
class BinaryLogisticRegression:
    """Logistic regression built from raw tensors on ``DEVICE``."""

    def __init__(self, input_size):
        # Small random weights and a zero bias; gradient tracking is switched
        # on after scaling so both remain leaf tensors.
        weights = torch.randn(input_size, 1, device=DEVICE) * 0.01
        bias = torch.zeros(1, device=DEVICE)
        self.W = weights.requires_grad_(True)
        self.b = bias.requires_grad_(True)

    def forward(self, x):
        """Return sigmoid(x @ W + b)."""
        return torch.sigmoid(torch.matmul(x, self.W) + self.b)

    def parameters(self):
        """Return the trainable tensors as ``[W, b]``."""
        return [self.W, self.b]

def custom_binary_cross_entropy(y_pred, y_true):
    """Mean binary cross-entropy between probabilities and 0/1 targets.

    Args:
        y_pred: Floating-point tensor of predicted probabilities.
        y_true: Tensor of targets that broadcasts against ``y_pred``.

    Returns:
        Scalar tensor holding the mean loss.
    """
    # The clamp margin must be representable in y_pred's dtype. In float32,
    # 1.0 - 1e-15 rounds back to 1.0, which leaves log(1 - y_pred) = -inf and
    # turns 0 * -inf into NaN for saturated predictions. Use at least one
    # machine epsilon; float64 inputs keep the original 1e-15 margin.
    epsilon = max(1e-15, torch.finfo(y_pred.dtype).eps)
    y_pred = torch.clamp(y_pred, epsilon, 1.0 - epsilon)
    loss = -(y_true * torch.log(y_pred) + (1 - y_true) * torch.log(1 - y_pred))
    return loss.mean()

def calculate_binary_accuracy(y_pred, y_true):
    """Return the fraction of predictions that match ``y_true`` at a 0.5 threshold.

    Args:
        y_pred: Tensor of predicted probabilities.
        y_true: Tensor of 0/1 targets with the same shape.

    Returns:
        Accuracy as a Python float.
    """
    hard_labels = y_pred.ge(0.5).float()
    n_correct = hard_labels.eq(y_true).float().sum()
    accuracy = n_correct / len(y_true)
    return accuracy.item()

def train_model_binary(model, optimizer, train_loader, val_loader, loss_fn, epochs):
    """Train a binary classifier and record per-epoch loss and accuracy.

    Args:
        model: Object exposing ``forward(x)`` that returns probabilities.
        optimizer: Optimizer over the model's parameters.
        train_loader: Loader yielding ``(features, targets)`` training batches.
        val_loader: Loader yielding ``(features, targets)`` validation batches.
        loss_fn: Callable ``loss_fn(y_pred, y_true)`` returning a scalar tensor.
        epochs: Number of passes over the training data.

    Returns:
        ``(train_losses, val_losses, train_accuracies, val_accuracies)``, one
        value per epoch, each averaged over the samples in its dataset.
    """
    train_losses, val_losses = [], []
    train_accuracies, val_accuracies = [], []

    print(f"Starting binary training on {DEVICE}...")

    for epoch_idx in range(epochs):
        # ---- training pass ----
        loss_sum = 0.0
        acc_sum = 0.0
        for features, targets in train_loader:
            features = features.to(DEVICE)
            targets = targets.to(DEVICE)

            probs = model.forward(features)
            batch_loss = loss_fn(probs, targets)

            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

            # Weight each batch by its size so the epoch mean is per-sample.
            n_in_batch = len(features)
            loss_sum += batch_loss.item() * n_in_batch
            acc_sum += calculate_binary_accuracy(probs, targets) * n_in_batch

        n_train = len(train_loader.dataset)
        mean_train_loss = loss_sum / n_train
        mean_train_acc = acc_sum / n_train

        # ---- validation pass (no gradient tracking) ----
        val_loss_sum = 0.0
        val_acc_sum = 0.0
        with torch.no_grad():
            for features, targets in val_loader:
                features = features.to(DEVICE)
                targets = targets.to(DEVICE)

                probs = model.forward(features)
                batch_loss = loss_fn(probs, targets)

                n_in_batch = len(features)
                val_loss_sum += batch_loss.item() * n_in_batch
                val_acc_sum += calculate_binary_accuracy(probs, targets) * n_in_batch

        n_val = len(val_loader.dataset)
        mean_val_loss = val_loss_sum / n_val
        mean_val_acc = val_acc_sum / n_val

        train_losses.append(mean_train_loss)
        val_losses.append(mean_val_loss)
        train_accuracies.append(mean_train_acc)
        val_accuracies.append(mean_val_acc)

        print(f'Epoch {epoch_idx+1:2d}/{epochs} | Train Loss: {mean_train_loss:.4f}, Acc: {mean_train_acc:.4f} | Val Loss: {mean_val_loss:.4f}, Acc: {mean_val_acc:.4f}')

    return train_losses, val_losses, train_accuracies, val_accuracies

def evaluate_binary_model(model, test_loader, loss_fn):
    """Evaluate a binary classifier on a data loader.

    Args:
        model: Object exposing ``forward(x)`` that returns probabilities.
        test_loader: Loader yielding ``(features, targets)`` batches.
        loss_fn: Callable ``loss_fn(y_pred, y_true)`` returning a scalar tensor.

    Returns:
        ``(final_acc, conf_matrix, avg_loss)``: accuracy as a fraction, the
        scikit-learn confusion matrix, and the per-sample mean loss.
    """
    all_preds, all_targets = [], []
    total_loss = 0.0

    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(DEVICE), y_batch.to(DEVICE)

            y_pred = model.forward(X_batch)
            loss = loss_fn(y_pred, y_batch)

            # Weight by batch size so the final mean is per-sample.
            total_loss += loss.item() * len(X_batch)
            y_pred_class = (y_pred >= 0.5).float()

            all_preds.extend(y_pred_class.cpu().numpy().flatten())
            all_targets.extend(y_batch.cpu().numpy().flatten())

    avg_loss = total_loss / len(test_loader.dataset)
    conf_matrix = confusion_matrix(all_targets, all_preds)
    # Correct predictions sit on the diagonal. Using the trace instead of
    # indexing [0, 0] and [1, 1] also works when only one class is present
    # and the matrix is therefore 1x1.
    final_acc = np.trace(conf_matrix) / len(all_targets)

    return final_acc, conf_matrix, avg_loss

## 4. Multiclass Softmax Regression (Scratch/NumPy Implementation)

In [None]:
def initialize_weights(input_dim=INPUT_SIZE, num_classes=10):
    """Create softmax-regression parameters.

    Args:
        input_dim: Number of input features.
        num_classes: Number of output classes.

    Returns:
        ``(W, b)``: ``W`` of shape ``(input_dim, num_classes)`` drawn from a
        standard normal scaled by 0.01, and ``b`` of shape
        ``(1, num_classes)`` filled with zeros.
    """
    weights = 0.01 * np.random.randn(input_dim, num_classes)
    bias = np.zeros((1, num_classes))
    return weights, bias

def softmax_forward(X, W, b):
    """Compute logits and row-wise softmax probabilities.

    Args:
        X: Feature matrix, one sample per row.
        W: Weight matrix.
        b: Bias row that broadcasts over the samples.

    Returns:
        ``(logits, probs)``, where each row of ``probs`` sums to 1.
    """
    logits = np.matmul(X, W) + b
    # Subtract each row's maximum before exponentiating so exp() cannot overflow.
    shifted = logits - logits.max(axis=1, keepdims=True)
    unnormalized = np.exp(shifted)
    probs = unnormalized / unnormalized.sum(axis=1, keepdims=True)
    return logits, probs

def calc_loss_and_accuracy(X, y, W, b):
    """Score softmax-regression parameters on a labelled set.

    Args:
        X: Feature matrix, one sample per row.
        y: Integer class labels, one per row of ``X``.
        W: Weight matrix.
        b: Bias row.

    Returns:
        ``(loss, accuracy, predicted_classes)``: mean cross-entropy, accuracy
        in percent, and the argmax class of every sample.
    """
    probs = softmax_forward(X, W, b)[1]

    predicted_classes = probs.argmax(axis=1)
    accuracy = 100 * np.mean(predicted_classes == y)

    # Probability assigned to each sample's true class; 1e-8 guards log(0).
    row_ids = np.arange(len(y))
    loss = -np.mean(np.log(probs[row_ids, y] + 1e-8))

    return loss, accuracy, predicted_classes

def train_epoch_sgd(X, y, W, b, learning_rate, batch_size):
    """Run one epoch of mini-batch SGD for softmax regression.

    The data is reshuffled on every call and only full batches are used, so
    any samples left over after the last full batch are skipped for that
    epoch. ``W`` and ``b`` are updated in place and also returned.

    Args:
        X: Training features, one sample per row.
        y: Integer class labels.
        W: Weight matrix (modified in place).
        b: Bias row (modified in place).
        learning_rate: SGD step size.
        batch_size: Number of samples per mini-batch.

    Returns:
        ``(W, b, epoch_loss)``, where ``epoch_loss`` is the sum of the
        per-batch mean losses.
    """
    n_samples = len(X)
    n_full_batches = n_samples // batch_size

    order = np.random.permutation(n_samples)
    X_shuffled, y_shuffled = X[order], y[order]

    epoch_loss = 0

    for start in range(0, n_full_batches * batch_size, batch_size):
        stop = start + batch_size
        X_batch = X_shuffled[start:stop]
        y_batch = y_shuffled[start:stop]

        _, probs = softmax_forward(X_batch, W, b)

        n_in_batch = len(y_batch)
        row_ids = np.arange(n_in_batch)
        epoch_loss += -np.mean(np.log(probs[row_ids, y_batch] + 1e-8))

        # Cross-entropy gradient w.r.t. the logits: (probs - one_hot) / n.
        dlogits = probs.copy()
        dlogits[row_ids, y_batch] -= 1
        dlogits = dlogits / n_in_batch

        dW = X_batch.T @ dlogits
        db = dlogits.sum(axis=0, keepdims=True)

        W -= learning_rate * dW
        b -= learning_rate * db

    return W, b, epoch_loss

def train_manual_softmax(X_train, y_train, X_val, y_val, learning_rate=LEARNING_RATE, num_epochs=NUM_EPOCHS, batch_size=BATCH_SIZE):
    """Train NumPy softmax regression with mini-batch SGD.

    Args:
        X_train: Training features, a 2-D array with one sample per row.
        y_train: Integer training labels.
        X_val: Validation features.
        y_val: Integer validation labels.
        learning_rate: SGD step size.
        num_epochs: Number of passes over the training data.
        batch_size: Number of samples per mini-batch.

    Returns:
        ``(W, b, train_losses, train_accuracies, val_losses, val_accuracies)``
        with one history entry per epoch; accuracies are in percent.
    """
    # Size the weights from the data rather than assuming INPUT_SIZE features.
    W, b = initialize_weights(input_dim=X_train.shape[1])

    train_losses = []
    train_accuracies = []
    val_losses = []
    val_accuracies = []

    for _ in range(num_epochs):
        # The summed batch loss is not needed: both metrics are recomputed on
        # the full sets with the end-of-epoch weights.
        W, b, _epoch_loss = train_epoch_sgd(X_train, y_train, W, b, learning_rate, batch_size)

        train_loss, train_acc, _ = calc_loss_and_accuracy(X_train, y_train, W, b)
        val_loss, val_acc, _ = calc_loss_and_accuracy(X_val, y_val, W, b)

        train_losses.append(train_loss)
        train_accuracies.append(train_acc)
        val_losses.append(val_loss)
        val_accuracies.append(val_acc)

    return W, b, train_losses, train_accuracies, val_losses, val_accuracies

## 5. General PyTorch Utilities and Model Definitions

In [None]:
def train_pytorch_model_comp(model, train_loader, val_loader, criterion, optimizer, num_epochs, verbose=False):
    """Train a PyTorch classifier and record per-epoch metrics.

    Args:
        model: ``nn.Module`` that produces class logits.
        train_loader: Loader of ``(features, labels)`` training batches.
        val_loader: Loader of ``(features, labels)`` validation batches.
        criterion: Loss applied to ``(logits, labels)``.
        optimizer: Optimizer over ``model``'s parameters.
        num_epochs: Number of training epochs.
        verbose: If true, print a progress line every 10th epoch.

    Returns:
        ``(train_losses, train_accs, val_losses, val_accs, training_time)``.
        Losses are the mean of the per-batch losses, accuracies are in
        percent, and ``training_time`` is the elapsed wall-clock seconds.
    """
    train_losses, train_accs = [], []
    val_losses, val_accs = [], []
    started_at = time.time()

    for epoch_idx in range(num_epochs):
        model.train()
        running_loss = 0
        n_correct = 0
        n_seen = 0

        for features, labels in train_loader:
            features = features.to(DEVICE)
            labels = labels.to(DEVICE)

            logits = model(features)
            batch_loss = criterion(logits, labels)

            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

            running_loss += batch_loss.item()
            predicted = logits.max(dim=1).indices
            n_seen += labels.size(0)
            n_correct += (predicted == labels).sum().item()

        train_loss = running_loss / len(train_loader)
        train_acc = 100 * n_correct / n_seen

        val_loss, val_acc, _ = evaluate_pytorch_model_comp(model, val_loader, criterion)

        train_losses.append(train_loss)
        train_accs.append(train_acc)
        val_losses.append(val_loss)
        val_accs.append(val_acc)

        is_report_epoch = (epoch_idx + 1) % 10 == 0
        if verbose and is_report_epoch:
            print(f"Epoch [{epoch_idx+1}/{num_epochs}] - Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%, Val Acc: {val_acc:.2f}%")

    training_time = time.time() - started_at
    return train_losses, train_accs, val_losses, val_accs, training_time

def evaluate_pytorch_model_comp(model, data_loader, criterion):
    """Evaluate a PyTorch classifier on a data loader.

    Puts the model in eval mode and runs without gradient tracking.

    Args:
        model: ``nn.Module`` that produces class logits.
        data_loader: Loader of ``(features, labels)`` batches.
        criterion: Loss applied to ``(logits, labels)``.

    Returns:
        ``(avg_loss, accuracy, predictions)``: the mean of the per-batch
        losses, accuracy in percent, and a NumPy array of predicted classes
        in loader order.
    """
    model.eval()
    running_loss = 0
    n_correct = 0
    n_seen = 0
    predictions = []

    with torch.no_grad():
        for features, labels in data_loader:
            features = features.to(DEVICE)
            labels = labels.to(DEVICE)

            logits = model(features)
            running_loss += criterion(logits, labels).item()

            predicted = logits.max(dim=1).indices
            n_seen += labels.size(0)
            n_correct += (predicted == labels).sum().item()
            predictions.extend(predicted.cpu().numpy())

    avg_loss = running_loss / len(data_loader)
    accuracy = 100 * n_correct / n_seen

    return avg_loss, accuracy, np.array(predictions)

def count_parameters(model):
    """Return the total number of elements in ``model``'s trainable parameters."""
    n_trainable = 0
    for param in model.parameters():
        if param.requires_grad:
            n_trainable += param.numel()
    return n_trainable

class SoftmaxRegressionPyTorch(nn.Module):
    """Single linear layer that maps flattened inputs to class logits."""

    def __init__(self, input_dim=INPUT_SIZE, num_classes=10):
        super().__init__()
        self.linear = nn.Linear(input_dim, num_classes)

    def forward(self, x):
        # Returns raw logits; no softmax is applied in this module.
        logits = self.linear(x)
        return logits

class NeuralNetwork(nn.Module):
    """Fully connected network with ReLU activations that outputs logits.

    Args:
        input_dim: Number of input features.
        hidden_layers: Iterable of hidden-layer widths, in order.
        num_classes: Number of output logits.
    """

    # The default is a tuple rather than a list so it is not a mutable default
    # argument; any iterable of ints is still accepted.
    def __init__(self, input_dim=INPUT_SIZE, hidden_layers=(256, 128), num_classes=10):
        super().__init__()

        layers = []
        prev_dim = input_dim

        # One Linear + ReLU pair per hidden width.
        for hidden_dim in hidden_layers:
            layers.append(nn.Linear(prev_dim, hidden_dim))
            layers.append(nn.ReLU())
            prev_dim = hidden_dim

        # Final projection to class logits, with no activation.
        layers.append(nn.Linear(prev_dim, num_classes))
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Return the logits for a batch of flattened inputs."""
        return self.network(x)

## 6. Execution: Multiclass Data Loading and Preprocessing

In [None]:
# Load MNIST once; the tensors feed the PyTorch models and the NumPy arrays
# feed the manual softmax implementation.
(
    X_train_t, X_val_t, X_test_t,
    y_train_t, y_val_t, y_test_t,
    X_train_flat, X_val_flat, X_test_flat,
    y_test_raw_np,
) = load_and_prepare_multiclass_data()

print("Loaded Data Shapes:")
print("  Train (tensors): {}, {}".format(X_train_t.shape, y_train_t.shape))
print("  Validation (tensors): {}, {}".format(X_val_t.shape, y_val_t.shape))
print("  Test (tensors): {}, {}".format(X_test_t.shape, y_test_t.shape))

# NumPy aliases used by the manual softmax code.
X_train_np, X_val_np, X_test_np = X_train_flat, X_val_flat, X_test_flat
y_train_np, y_val_np = y_train_t.numpy(), y_val_t.numpy()
y_test_np = y_test_raw_np

## 7. Execution: Binary Classification (0s and 1s)

In [None]:
# Build the 0-vs-1 loaders. get_binary_dataloaders only accepts batch_size and
# random_state, so the epoch count is passed to train_model_binary instead.
train_loader_binary, val_loader_binary, test_loader_binary = get_binary_dataloaders()

model_binary = BinaryLogisticRegression(INPUT_SIZE)
optimizer_binary = torch.optim.SGD(model_binary.parameters(), lr=0.01)

train_losses, val_losses, train_accuracies, val_accuracies = train_model_binary(
    model=model_binary,
    optimizer=optimizer_binary,
    train_loader=train_loader_binary,
    val_loader=val_loader_binary,
    loss_fn=custom_binary_cross_entropy,
    epochs=20
)

# Score the trained model on the held-out test split.
test_acc, conf_matrix, test_loss = evaluate_binary_model(
    model=model_binary,
    test_loader=test_loader_binary,
    loss_fn=custom_binary_cross_entropy
)

print("\nFinal Test Set Results (Binary Classification)")
print(f"Test Loss: {test_loss:.4f}")
# test_acc is a fraction, so scale it to a percentage for display.
print(f"Test Accuracy: {test_acc*100:.2f}%")

## 8. Execution: Multiclass Softmax Regression (Manual)

In [None]:
# Baseline: score freshly initialized weights before any training.
W, b = initialize_weights()
initial_loss, initial_accuracy, _ = calc_loss_and_accuracy(X_train_np, y_train_np, W, b)

print("Initial accuracy with random weights: {:.2f}%".format(initial_accuracy))
print("Initial loss: {:.4f}".format(initial_loss))

# Train the NumPy softmax model and keep its per-epoch history.
(
    W_manual, b_manual,
    train_losses_manual, train_accuracies_manual,
    val_losses_manual, val_accuracies_manual,
) = train_manual_softmax(
    X_train_np, y_train_np, X_val_np, y_val_np,
    learning_rate=0.01,
    num_epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
)

# Final score on the held-out test split.
test_loss_manual, test_acc_manual, test_pred_manual = calc_loss_and_accuracy(
    X_test_np, y_test_np, W_manual, b_manual
)

print("\nFinal Manual Softmax Test Results:")
print("Test Loss: {:.4f}".format(test_loss_manual))
print("Test Accuracy: {:.2f}%".format(test_acc_manual))

## 9. Hyperparameter Analysis (Using Neural Network Model)

In [None]:
def plot_results(results_dict, title_prefix=''):
    """Plot training loss and validation accuracy for several runs.

    Args:
        results_dict: Mapping from run name to a dict containing the
            ``'train_losses'`` and ``'val_accs'`` sequences.
        title_prefix: Text placed in front of both subplot titles.
    """
    palette = ['blue', 'green', 'orange', 'red', 'purple']
    fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(15, 5))

    for idx, name in enumerate(results_dict):
        run = results_dict[name]
        # Cycle through the palette when there are more runs than colors.
        color = palette[idx % len(palette)]
        loss_ax.plot(run['train_losses'], label=name, color=color, linewidth=2)
        acc_ax.plot(run['val_accs'], label=name, color=color, linewidth=2)

    # Both panels share the same decoration apart from y-label and title.
    panels = (
        (loss_ax, 'Training Loss', f'{title_prefix} Training Loss'),
        (acc_ax, 'Validation Accuracy (%)', f'{title_prefix} Validation Accuracy'),
    )
    for ax, y_label, title in panels:
        ax.set_xlabel('Epoch', fontsize=12)
        ax.set_ylabel(y_label, fontsize=12)
        ax.set_title(title, fontsize=14, fontweight='bold')
        ax.legend()
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

### 9.1: Learning Rate Analysis

In [None]:
# Learning-rate sweep: train the same [256, 128] network with several SGD
# learning rates and compare the training curves and final accuracy.
print("="*60)
print("LEARNING RATE ANALYSIS")
print("="*60)

learning_rates = [0.001, 0.01, 0.1, 1.0]
results_lr = {}
train_loader, val_loader, test_loader = create_multiclass_dataloaders(batch_size=64)

for lr in learning_rates:
    print(f"\nTraining with learning rate = {lr}")
    # The training and evaluation helpers move every batch to DEVICE, so the
    # model has to be on DEVICE too. Move it before building the optimizer.
    model = NeuralNetwork(hidden_layers=[256, 128]).to(DEVICE)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=lr)

    train_losses, train_accs, val_losses, val_accs, train_time = train_pytorch_model_comp(
        model, train_loader, val_loader, criterion, optimizer, num_epochs=NUM_EPOCHS, verbose=False
    )
    test_loss, test_acc, _ = evaluate_pytorch_model_comp(model, test_loader, criterion)

    results_lr[lr] = {
        'train_losses': train_losses, 'train_accs': train_accs,
        'val_losses': val_losses, 'val_accs': val_accs,
        'test_acc': test_acc, 'train_time': train_time
    }
    print(f"  Final Val Acc: {val_accs[-1]:.2f}%, Test Acc: {test_acc:.2f}%, Time: {train_time:.1f}s")

plot_results(results_lr, title_prefix='Learning Rate Effect on')

print("\nLearning Rate Summary:")
print(f"{'LR':<10} {'Final Val Acc':<15} {'Test Acc':<12} {'Time (s)':<10}")
print("-" * 50)
for lr, data in results_lr.items():
    # Attach the percent sign before padding so it stays next to the number
    # and the columns line up with the header.
    val_txt = f"{data['val_accs'][-1]:.2f}%"
    test_txt = f"{data['test_acc']:.2f}%"
    print(f"{lr:<10} {val_txt:<15} {test_txt:<12} {data['train_time']:<10.1f}")

### 9.2: Batch Size Analysis

In [None]:
# Batch-size sweep: train the same [256, 128] network at a fixed learning rate
# while varying the mini-batch size.
print("="*60)
print("BATCH SIZE ANALYSIS")
print("="*60)

batch_sizes = [16, 32, 64, 128]
results_bs = {}
best_lr = 0.01

for bs in batch_sizes:
    print(f"\nTraining with batch size = {bs}")
    # The loaders are rebuilt on every iteration because the batch size changes.
    train_loader, val_loader, test_loader = create_multiclass_dataloaders(batch_size=bs)
    # The training and evaluation helpers move every batch to DEVICE, so the
    # model has to be on DEVICE too. Move it before building the optimizer.
    model = NeuralNetwork(hidden_layers=[256, 128]).to(DEVICE)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=best_lr)

    train_losses, train_accs, val_losses, val_accs, train_time = train_pytorch_model_comp(
        model, train_loader, val_loader, criterion, optimizer, num_epochs=NUM_EPOCHS, verbose=False
    )
    test_loss, test_acc, _ = evaluate_pytorch_model_comp(model, test_loader, criterion)

    results_bs[bs] = {
        'train_losses': train_losses, 'train_accs': train_accs,
        'val_losses': val_losses, 'val_accs': val_accs,
        'test_acc': test_acc, 'train_time': train_time
    }
    print(f"  Final Val Acc: {val_accs[-1]:.2f}%, Test Acc: {test_acc:.2f}%, Time: {train_time:.1f}s")

plot_results(results_bs, title_prefix='Batch Size Effect on')

print("\nBatch Size Summary:")
print(f"{'BS':<10} {'Final Val Acc':<15} {'Test Acc':<12} {'Time (s)':<10}")
print("-" * 50)
for bs, data in results_bs.items():
    # Attach the percent sign before padding so it stays next to the number
    # and the columns line up with the header.
    val_txt = f"{data['val_accs'][-1]:.2f}%"
    test_txt = f"{data['test_acc']:.2f}%"
    print(f"{bs:<10} {val_txt:<15} {test_txt:<12} {data['train_time']:<10.1f}")

### 9.3: Network Depth Analysis

In [None]:
# Depth sweep: compare networks with two to five hidden layers, trained with
# the same optimizer settings.
print("="*60)
print("NETWORK DEPTH ANALYSIS")
print("="*60)

architectures = {
    '2_layers': [256, 128],
    '3_layers': [256, 128, 64],
    '4_layers': [256, 128, 64, 32],
    '5_layers': [256, 128, 64, 32, 16]
}
results_arch = {}
train_loader, val_loader, test_loader = create_multiclass_dataloaders(batch_size=64)

for name, layers in architectures.items():
    print(f"\nTraining architecture: {name} - {layers}")
    # The training and evaluation helpers move every batch to DEVICE, so the
    # model has to be on DEVICE too. Move it before building the optimizer.
    model = NeuralNetwork(hidden_layers=layers).to(DEVICE)
    n_params = count_parameters(model)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.01)

    train_losses, train_accs, val_losses, val_accs, train_time = train_pytorch_model_comp(
        model, train_loader, val_loader, criterion, optimizer, num_epochs=NUM_EPOCHS, verbose=False
    )
    test_loss, test_acc, _ = evaluate_pytorch_model_comp(model, test_loader, criterion)

    results_arch[name] = {
        'layers': layers, 'n_params': n_params,
        'train_losses': train_losses, 'train_accs': train_accs,
        'val_losses': val_losses, 'val_accs': val_accs,
        'test_acc': test_acc, 'train_time': train_time
    }
    print(f"  Parameters: {n_params:,}, Final Val Acc: {val_accs[-1]:.2f}%, Test Acc: {test_acc:.2f}%")

plot_results(results_arch, title_prefix='Network Depth Effect on')

print("\nNetwork Depth Summary:")
print(f"{'Depth':<10} {'Parameters':<12} {'Final Val Acc':<15} {'Test Acc':<12}")
print("-" * 50)
for name, data in results_arch.items():
    # The spec '<12,' means left-align, width 12, thousands separator. The
    # reversed form ',<12' would use ',' as the fill character and pad the
    # number with commas.
    # Attach the percent sign before padding so it stays next to the number.
    val_txt = f"{data['val_accs'][-1]:.2f}%"
    print(f"{name:<10} {data['n_params']:<12,} {val_txt:<15} {data['test_acc']:.2f}%")

### 9.4: Network Width Analysis

In [None]:
# Width sweep: compare two-hidden-layer networks of increasing width, trained
# with the same optimizer settings.
print("="*60)
print("NETWORK WIDTH ANALYSIS")
print("="*60)

architectures_width = {
    'small': [64, 32],
    'medium': [128, 64],
    'large': [256, 128],
    'xlarge': [512, 256]
}
results_neurons = {}
train_loader, val_loader, test_loader = create_multiclass_dataloaders(batch_size=64)

for name, layers in architectures_width.items():
    print(f"\nTraining architecture: {name} - {layers}")
    # The training and evaluation helpers move every batch to DEVICE, so the
    # model has to be on DEVICE too. Move it before building the optimizer.
    model = NeuralNetwork(hidden_layers=layers).to(DEVICE)
    n_params = count_parameters(model)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.01)

    train_losses, train_accs, val_losses, val_accs, train_time = train_pytorch_model_comp(
        model, train_loader, val_loader, criterion, optimizer, num_epochs=NUM_EPOCHS, verbose=False
    )
    test_loss, test_acc, _ = evaluate_pytorch_model_comp(model, test_loader, criterion)

    results_neurons[name] = {
        'config': layers, 'n_params': n_params,
        'train_losses': train_losses, 'train_accs': train_accs,
        'val_losses': val_losses, 'val_accs': val_accs,
        'test_acc': test_acc, 'train_time': train_time
    }
    print(f"  Parameters: {n_params:,}, Final Val Acc: {val_accs[-1]:.2f}%, Test Acc: {test_acc:.2f}%")

plot_results(results_neurons, title_prefix='Network Width Effect on')

print("\nNetwork Width Summary:")
print(f"{'Width':<10} {'Parameters':<12} {'Final Val Acc':<15} {'Test Acc':<12}")
print("-" * 50)
for name, data in results_neurons.items():
    # The spec '<12,' means left-align, width 12, thousands separator. The
    # reversed form ',<12' would use ',' as the fill character and pad the
    # number with commas.
    # Attach the percent sign before padding so it stays next to the number.
    val_txt = f"{data['val_accs'][-1]:.2f}%"
    print(f"{str(data['config']):<10} {data['n_params']:<12,} {val_txt:<15} {data['test_acc']:.2f}%")

## 10. Final Model Comparison

In [None]:
def plot_confusion_matrices(y_true, y_pred_manual, y_pred_pytorch, y_pred_nn):
    """Draw the confusion matrices of three models side by side.

    Args:
        y_true: Ground-truth labels.
        y_pred_manual: Predictions of the manual softmax model.
        y_pred_pytorch: Predictions of the PyTorch softmax model.
        y_pred_nn: Predictions of the neural network.

    Returns:
        ``(cm_manual, cm_pytorch, cm_nn)``, the three confusion matrices.
    """
    cm_manual = confusion_matrix(y_true, y_pred_manual)
    cm_pytorch = confusion_matrix(y_true, y_pred_pytorch)
    cm_nn = confusion_matrix(y_true, y_pred_nn)

    fig, axs = plt.subplots(1, 3, figsize=(24, 7))

    # (matrix, colormap, panel title) for each subplot, left to right.
    panels = (
        (cm_manual, 'Blues', 'Softmax Regression (Manual)'),
        (cm_pytorch, 'Greens', 'Softmax Regression (PyTorch)'),
        (cm_nn, 'Reds', 'Neural Network'),
    )
    for ax, (matrix, cmap, title) in zip(axs, panels):
        display = ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=np.arange(10))
        display.plot(cmap=cmap, ax=ax, values_format='d')
        ax.set_title(title)

    plt.tight_layout()
    plt.show()

    return cm_manual, cm_pytorch, cm_nn

def calc_per_class_accuracy(y_true, y_pred, num_classes=10):
    """Compute the accuracy of each class separately, in percent.

    Args:
        y_true: Array of ground-truth labels.
        y_pred: Array of predicted labels, aligned with ``y_true``.
        num_classes: Classes ``0 .. num_classes - 1`` are evaluated.

    Returns:
        A list with one accuracy per class; a class that does not occur in
        ``y_true`` gets 0.
    """
    def _accuracy_for(label):
        is_label = (y_true == label)
        n_hits = np.sum((y_pred == label) & is_label)
        n_total = np.sum(is_label)
        if n_total > 0:
            return (n_hits / n_total) * 100
        return 0

    return [_accuracy_for(label) for label in range(num_classes)]

def plot_per_class_accuracy_comp(acc_softmax, acc_nn, y_true):
    """Print and plot per-digit accuracy for softmax regression and the network.

    Args:
        acc_softmax: Ten per-class accuracies (percent) of the softmax model.
        acc_nn: Ten per-class accuracies (percent) of the neural network.
        y_true: Array of ground-truth test labels, used for per-class counts.

    Returns:
        ``(best_softmax, worst_softmax, best_nn, worst_nn)``, the class
        indices with the highest and lowest accuracy for each model.
    """
    print("PER-CLASS ACCURACY ANALYSIS\n")

    print("Class | Softmax Acc | NN Acc | Test Samples")
    print("-" * 55)
    for digit in range(10):
        n_samples = np.sum(y_true == digit)
        print(f"  {digit}   |   {acc_softmax[digit]:5.2f}%  |   {acc_nn[digit]:5.2f}%   |     {n_samples:4d}")

    fig, ax = plt.subplots(figsize=(12, 6))
    positions = np.arange(10)
    bar_width = 0.35

    # Two bars per digit, placed either side of the tick.
    softmax_bars = ax.bar(positions - bar_width/2, acc_softmax, bar_width,
                          label='Softmax', color='steelblue', edgecolor='black')
    nn_bars = ax.bar(positions + bar_width/2, acc_nn, bar_width,
                     label='Neural Network', color='seagreen', edgecolor='black')

    ax.set_xlabel('Digit Class', fontsize=12)
    ax.set_ylabel('Accuracy (%)', fontsize=12)
    ax.set_title('Per-Class Accuracy Comparison', fontsize=14, fontweight='bold')
    ax.set_xticks(positions)
    ax.set_ylim([0, 105])
    ax.legend(fontsize=11)
    ax.grid(axis='y', alpha=0.3)

    # Write each bar's value just above it.
    for bar_group in (softmax_bars, nn_bars):
        for bar in bar_group:
            bar_height = bar.get_height()
            x_center = bar.get_x() + bar.get_width()/2.
            ax.text(x_center, bar_height, f'{bar_height:.1f}', ha='center', va='bottom', fontsize=9)

    plt.tight_layout()
    plt.show()

    best_softmax, worst_softmax = np.argmax(acc_softmax), np.argmin(acc_softmax)
    best_nn, worst_nn = np.argmax(acc_nn), np.argmin(acc_nn)
    return best_softmax, worst_softmax, best_nn, worst_nn

In [None]:
# Final comparison: train three models on the same loaders, then compare their
# test predictions with confusion matrices and per-class accuracy.
print("=" * 60)
print("FINAL MODEL COMPARISON")
print("=" * 60)

final_models = {}
train_loader, val_loader, test_loader = create_multiclass_dataloaders(batch_size=64)

# 1. Plain softmax regression.
print("\n1. Training Softmax Regression...")
model1 = SoftmaxRegressionPyTorch(input_dim=INPUT_SIZE, num_classes=10).to(DEVICE)
criterion = nn.CrossEntropyLoss().to(DEVICE)
optimizer = optim.SGD(model1.parameters(), lr=0.01)
train_losses1, train_accs1, val_losses1, val_accs1, time1 = train_pytorch_model_comp(
    model1, train_loader, val_loader, criterion, optimizer, num_epochs=50
)
test_loss1, test_acc1, preds1 = evaluate_pytorch_model_comp(model1, test_loader, criterion)
final_models['Softmax Regression'] = dict(
    model=model1,
    n_params=count_parameters(model1),
    test_acc=test_acc1,
    predictions=preds1,
)
print(f" Test Accuracy: {test_acc1:.2f}%, Parameters: {count_parameters(model1):,}")

# 2. Softmax regression with L2 regularization (SGD weight decay).
print("\n2. Training Softmax with L2 Regularization...")
model2 = SoftmaxRegressionPyTorch(input_dim=INPUT_SIZE, num_classes=10).to(DEVICE)
criterion = nn.CrossEntropyLoss().to(DEVICE)
optimizer = optim.SGD(model2.parameters(), lr=0.01, weight_decay=0.001)
train_losses2, train_accs2, val_losses2, val_accs2, time2 = train_pytorch_model_comp(
    model2, train_loader, val_loader, criterion, optimizer, num_epochs=50
)
test_loss2, test_acc2, preds2 = evaluate_pytorch_model_comp(model2, test_loader, criterion)
final_models['Softmax + L2 Reg'] = dict(
    model=model2,
    n_params=count_parameters(model2),
    test_acc=test_acc2,
    predictions=preds2,
)
print(f" Test Accuracy: {test_acc2:.2f}%, Parameters: {count_parameters(model2):,}")

# 3. Neural network with hidden layers [256, 128, 64].
print("\n3. Training Best Neural Network...")
model3 = NeuralNetwork(input_dim=INPUT_SIZE, hidden_layers=[256, 128, 64], num_classes=10).to(DEVICE)
criterion = nn.CrossEntropyLoss().to(DEVICE)
optimizer = optim.SGD(model3.parameters(), lr=0.01)
train_losses3, train_accs3, val_losses3, val_accs3, time3 = train_pytorch_model_comp(
    model3, train_loader, val_loader, criterion, optimizer, num_epochs=50, verbose=True
)
test_loss3, test_acc3, preds3 = evaluate_pytorch_model_comp(model3, test_loader, criterion)
final_models['Neural Network'] = dict(
    model=model3,
    n_params=count_parameters(model3),
    test_acc=test_acc3,
    predictions=preds3,
)
print(f" Test Accuracy: {test_acc3:.2f}%, Parameters: {count_parameters(model3):,}")

# Confusion matrices: manual softmax vs. PyTorch softmax vs. neural network.
cm_manual_softmax, cm_pytorch_softmax, cm_nn = plot_confusion_matrices(
    y_test_np,
    test_pred_manual,
    final_models['Softmax Regression']['predictions'],
    final_models['Neural Network']['predictions'],
)

# Per-class accuracy of the PyTorch softmax model and the neural network.
acc_softmax = calc_per_class_accuracy(y_test_np, final_models['Softmax Regression']['predictions'])
acc_nn = calc_per_class_accuracy(y_test_np, final_models['Neural Network']['predictions'])
plot_per_class_accuracy_comp(acc_softmax, acc_nn, y_test_np)