In [1]:
# Import necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns

# For evaluation metrics
from sklearn.metrics import confusion_matrix, classification_report
import pandas as pd

# Seed PyTorch, NumPy and Python's built-in `random` with one shared value
# so that repeated runs draw the same random numbers.
seed = 42
for _apply_seed in (torch.manual_seed, np.random.seed, random.seed):
    _apply_seed(seed)

In [2]:
# Pick the compute backend in order of preference: CUDA, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

Using device: mps


In [3]:
# Step 1: Measure pixel statistics on the training images with ToTensor() as the only transform
temp_transform = transforms.ToTensor()
temp_train_dataset = datasets.MNIST(
    root='./data', train=True, download=True, transform=temp_transform
)

# A single batch as large as the dataset returns every training image at once
temp_loader = DataLoader(
    temp_train_dataset, batch_size=len(temp_train_dataset), shuffle=False
)
data, _ = next(iter(temp_loader))

# Scalar mean / standard deviation taken over every pixel of that batch
mean, std = data.mean().item(), data.std().item()
print("Computed Mean:", mean)
print("Computed Std:", std)

# Step 2: The real pipeline converts to a tensor, then normalizes with the measured statistics
transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize((mean,), (std,))]
)

# Step 3: Re-create the training and test splits so both use the normalizing transform
train_dataset = datasets.MNIST(
    root='./data', train=True, download=True, transform=transform
)
test_dataset = datasets.MNIST(
    root='./data', train=False, download=True, transform=transform
)

# Step 4: Batched loaders; only the training loader shuffles, and each uses 2 worker processes
batch_size = 64

train_loader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, num_workers=2
)
test_loader = DataLoader(
    test_dataset, batch_size=batch_size, shuffle=False, num_workers=2
)


Computed Mean: 0.13066047430038452
Computed Std: 0.30810782313346863


In [17]:
class Model1(nn.Module):
    """Single linear layer mapping a flattened 28x28 input to 10 class scores."""

    def __init__(self):
        super().__init__()
        # Collapse every dimension after the batch dimension into one vector.
        self.flatten = nn.Flatten()
        # 784 input features -> 10 output scores, no hidden layer.
        self.fc = nn.Linear(in_features=28 * 28, out_features=10)

    def forward(self, x):
        """Return the 10 raw (un-normalized) scores for each sample in ``x``."""
        return self.fc(self.flatten(x))

In [18]:
class Model2(nn.Module):
    """One-hidden-layer network: 784 -> 128 -> ReLU -> 10, no regularization layers."""

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(in_features=28 * 28, out_features=128)  # hidden layer
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(in_features=128, out_features=10)       # output layer

    def forward(self, x):
        """Return the 10 raw class scores for each sample in ``x``."""
        hidden = self.relu(self.fc1(self.flatten(x)))
        return self.fc2(hidden)

In [19]:
class Model3(nn.Module):
    """Two-hidden-layer network (784 -> 128 -> 64 -> 10) with batch norm and dropout.

    Each hidden stage applies, in order: Linear -> BatchNorm1d -> ReLU -> Dropout(0.5).
    """

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        # First hidden stage.
        self.fc1 = nn.Linear(in_features=28 * 28, out_features=128)
        self.bn1 = nn.BatchNorm1d(num_features=128)
        self.relu = nn.ReLU()  # shared by both hidden stages
        self.dropout1 = nn.Dropout(p=0.5)
        # Second hidden stage.
        self.fc2 = nn.Linear(in_features=128, out_features=64)
        self.bn2 = nn.BatchNorm1d(num_features=64)
        self.dropout2 = nn.Dropout(p=0.5)
        # Output layer.
        self.fc3 = nn.Linear(in_features=64, out_features=10)

    def forward(self, x):
        """Return the 10 raw class scores for each sample in ``x``."""
        features = self.flatten(x)
        hidden_stages = (
            (self.fc1, self.bn1, self.dropout1),
            (self.fc2, self.bn2, self.dropout2),
        )
        for linear, batch_norm, dropout in hidden_stages:
            features = dropout(self.relu(batch_norm(linear(features))))
        return self.fc3(features)

In [20]:
def train_model(model, train_loader, criterion, optimizer, device, epochs=5):
    """Train ``model`` for ``epochs`` passes over ``train_loader``.

    Args:
        model: Module to optimize. Only the batches are moved to ``device``
            here; the model itself is used as passed in.
        train_loader: Iterable yielding ``(data, target)`` batches.
        criterion: Callable ``criterion(outputs, target)`` returning a scalar
            loss tensor. The epoch average assumes it is the mean over the
            batch (the default reduction of ``nn.CrossEntropyLoss``).
        optimizer: Optimizer that updates ``model``'s parameters.
        device: Device each batch is moved to before the forward pass.
        epochs: Number of passes over ``train_loader``.

    Returns:
        A list with one entry per epoch: the loss averaged over the samples
        actually processed in that epoch (``nan`` if the loader yielded none).
    """
    model.train()  # Enable training-mode behavior (dropout, batch-norm updates).
    train_losses = []
    for epoch in range(epochs):
        running_loss = 0.0
        samples_seen = 0
        for data, target in train_loader:
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()  # Clear gradients from the previous step.
            outputs = model(data)
            loss = criterion(outputs, target)
            loss.backward()
            optimizer.step()
            # Weight the batch loss by batch size so a short final batch
            # does not skew the epoch average.
            batch_len = data.size(0)
            running_loss += loss.item() * batch_len
            samples_seen += batch_len
        # Divide by the samples really seen, not len(train_loader.dataset):
        # the two differ with drop_last=True or a sampler, and a plain
        # iterable of batches has no .dataset attribute at all.
        epoch_loss = running_loss / samples_seen if samples_seen else float('nan')
        train_losses.append(epoch_loss)
        print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.4f}")
    return train_losses

In [21]:
def evaluate_model(model, test_loader, device):
    """Run ``model`` over ``test_loader`` and summarize its predictions.

    Returns:
        A ``(cm, report)`` tuple: the confusion matrix of true vs. predicted
        labels, and the classification report as a dictionary.
    """
    model.eval()  # Switch to evaluation-mode behavior.
    predicted_labels, true_labels = [], []
    with torch.no_grad():
        for batch, labels in test_loader:
            logits = model(batch.to(device))
            # The predicted class is the index of the largest score in each row.
            predicted_labels.extend(logits.argmax(dim=1).cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    cm = confusion_matrix(true_labels, predicted_labels)
    report = classification_report(true_labels, predicted_labels, output_dict=True)
    return cm, report

In [None]:
def plot_misclassified(model, test_loader, device, n=36, title="Misclassified Examples"):
    """Plot up to ``n`` samples from ``test_loader`` that ``model`` gets wrong.

    Samples are collected in loader order until ``n`` mistakes are found. If
    the loader holds fewer than ``n`` mistakes, the remaining tiles are filled
    with all-zero images titled "Blank". The tiles are arranged in a
    near-square grid sized from ``n`` (6x6 for the default ``n=36``).

    Args:
        model: Trained module; switched to eval mode here.
        test_loader: Iterable yielding ``(data, target)`` batches.
        device: Device each batch is moved to before the forward pass.
        n: Number of tiles to draw; must be at least 1.
        title: Figure-level title.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n}")

    model.eval()
    misclassified_images = []
    misclassified_preds = []
    misclassified_targets = []
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            preds = model(data).argmax(dim=1)
            # One vectorized comparison per batch instead of a per-sample
            # Python loop; keep only as many mistakes as are still needed.
            wrong = (preds != target).nonzero(as_tuple=True)[0]
            wrong = wrong[: n - len(misclassified_images)]
            misclassified_images.extend(
                image.squeeze() for image in data[wrong].cpu().numpy()
            )
            misclassified_preds.extend(preds[wrong].cpu().tolist())
            misclassified_targets.extend(target[wrong].cpu().tolist())
            if len(misclassified_images) >= n:
                break

    # Pad with blank tiles when fewer than n mistakes were found. The blank
    # takes the shape of the collected images, falling back to 28x28.
    blank_shape = misclassified_images[0].shape if misclassified_images else (28, 28)
    while len(misclassified_images) < n:
        misclassified_images.append(np.zeros(blank_shape))
        misclassified_preds.append(None)
        misclassified_targets.append(None)

    # Derive a near-square grid from n so every requested tile has an axes
    # (the previous fixed 6x6 grid broke for n < 36 and truncated n > 36).
    n_cols = int(np.ceil(np.sqrt(n)))
    n_rows = int(np.ceil(n / n_cols))
    # 8/6 inches per tile reproduces the original 8x8 figure for a 6x6 grid.
    tile_inches = 8 / 6
    fig, axes = plt.subplots(
        n_rows,
        n_cols,
        figsize=(max(3.0, n_cols * tile_inches), max(3.0, n_rows * tile_inches)),
        squeeze=False,  # always a 2-D array of axes, even for a 1x1 grid
    )
    fig.suptitle(title)
    for idx, ax in enumerate(axes.flat):
        ax.axis('off')
        if idx >= n:
            continue  # spare cell in a last row that n does not fill
        ax.imshow(misclassified_images[idx], cmap='gray')
        if misclassified_preds[idx] is not None:
            ax.set_title(f"P:{misclassified_preds[idx]}\nT:{misclassified_targets[idx]}", fontsize=8)
        else:
            ax.set_title("Blank", fontsize=8)
    plt.tight_layout()
    plt.show()

: 

In [None]:
# Build Model 1 on the selected device, plus the loss function and an Adam optimizer for it
model1 = Model1().to(device)
criterion = nn.CrossEntropyLoss()
optimizer1 = optim.Adam(model1.parameters(), lr=0.001)

print("Training Model 1 (One-Layer, No Regularization)")
losses1 = train_model(
    model1, train_loader, criterion, optimizer1, device, epochs=5
)

# Score the trained model on the test loader
cm1, report1 = evaluate_model(model1, test_loader, device)
print("Classification Report for Model 1:")
print(report1)

# Confusion matrix as an annotated heatmap (rows: true label, columns: predicted label)
plt.figure(figsize=(8, 6))
sns.heatmap(cm1, annot=True, fmt='d', cmap='Blues').set(
    title='Confusion Matrix for Model 1',
    xlabel='Predicted Label',
    ylabel='True Label',
)
plt.show()

Training Model 1 (One-Layer, No Regularization)


In [None]:
# Build Model 2 on the selected device with its own Adam optimizer
# (the loss function is the `criterion` created in the Model 1 cell)
model2 = Model2().to(device)
optimizer2 = optim.Adam(model2.parameters(), lr=0.001)

print("Training Model 2 (Two-Layer, No Regularization)")
losses2 = train_model(
    model2, train_loader, criterion, optimizer2, device, epochs=5
)

# Score the trained model on the test loader
cm2, report2 = evaluate_model(model2, test_loader, device)
print("Classification Report for Model 2:")
print(report2)

# Confusion matrix as an annotated heatmap (rows: true label, columns: predicted label)
plt.figure(figsize=(8, 6))
sns.heatmap(cm2, annot=True, fmt='d', cmap='Blues').set(
    title='Confusion Matrix for Model 2',
    xlabel='Predicted Label',
    ylabel='True Label',
)
plt.show()

In [None]:
# Build Model 3 on the selected device with its own Adam optimizer
# (the loss function is the `criterion` created in the Model 1 cell)
model3 = Model3().to(device)
optimizer3 = optim.Adam(model3.parameters(), lr=0.001)

print("Training Model 3 (Two Hidden Layers with Batch Norm and Dropout)")
losses3 = train_model(
    model3, train_loader, criterion, optimizer3, device, epochs=5
)

# Score the trained model on the test loader
cm3, report3 = evaluate_model(model3, test_loader, device)
print("Classification Report for Model 3:")
print(report3)

# Confusion matrix as an annotated heatmap (rows: true label, columns: predicted label)
plt.figure(figsize=(8, 6))
sns.heatmap(cm3, annot=True, fmt='d', cmap='Blues').set(
    title='Confusion Matrix for Model 3',
    xlabel='Predicted Label',
    ylabel='True Label',
)
plt.show()

In [None]:
# Compare the per-epoch training loss of the three models on one set of axes.
# Epochs are plotted as 1..N so the x-axis matches the "Epoch k/N" lines
# printed during training (the implicit list index would start at 0).
fig, ax = plt.subplots(figsize=(10, 6))
for _curve, _label in (
    (losses1, 'Model 1: 1 Layer'),
    (losses2, 'Model 2: 2 Layers'),
    (losses3, 'Model 3: 2 Hidden Layers with Reg.'),
):
    ax.plot(range(1, len(_curve) + 1), _curve, label=_label)
# Epoch numbers are whole numbers; keep the automatic ticks off fractions.
ax.xaxis.get_major_locator().set_params(integer=True)
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Training Loss per Epoch for Different Models')
ax.legend()
plt.show()

In [None]:
def extract_metrics(report):
    """Flatten a classification-report dictionary into seven headline metrics.

    Reads ``report['accuracy']`` plus the ``precision`` / ``recall`` /
    ``f1-score`` entries of ``report['macro avg']`` and
    ``report['weighted avg']``, and returns them in a flat dictionary keyed
    by display name.
    """
    # (display name, report section, field inside that section)
    averaged_fields = (
        ("Macro Precision", 'macro avg', 'precision'),
        ("Macro Recall", 'macro avg', 'recall'),
        ("Macro F1-Score", 'macro avg', 'f1-score'),
        ("Weighted Precision", 'weighted avg', 'precision'),
        ("Weighted Recall", 'weighted avg', 'recall'),
        ("Weighted F1-Score", 'weighted avg', 'f1-score'),
    )
    metrics = {"Accuracy": report['accuracy']}
    for display_name, section, field in averaged_fields:
        metrics[display_name] = report[section][field]
    return metrics

# Pull the headline numbers out of each model's classification report.
metrics1, metrics2, metrics3 = map(extract_metrics, (report1, report2, report3))

# Built with one column per model, then transposed: one row per model, one column per metric.
metrics_df = pd.DataFrame(
    {"Model 1": metrics1, "Model 2": metrics2, "Model 3": metrics3}
).T
print("Evaluation Metrics Summary:")
metrics_df

In [None]:
# Draw a grid of 36 misclassified test images for each trained model in turn.
for _model, _title in (
    (model1, "Misclassified Examples - Model 1"),
    (model2, "Misclassified Examples - Model 2"),
    (model3, "Misclassified Examples - Model 3"),
):
    plot_misclassified(_model, test_loader, device, n=36, title=_title)