In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Subset
import numpy as np

# Load MNIST dataset (subset for faster training)
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])

trainset = torchvision.datasets.MNIST(root="./data", train=True, download=True, transform=transform)
testset = torchvision.datasets.MNIST(root="./data", train=False, download=True, transform=transform)

# Use only a small subset to speed up training
train_subset = Subset(trainset, np.random.choice(len(trainset), 5000, replace=False))
test_subset = Subset(testset, np.random.choice(len(testset), 1000, replace=False))

train_loader = DataLoader(train_subset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_subset, batch_size=64, shuffle=False)

# Define different models
class ModelA(nn.Module):  # No hidden layer, Softmax output
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(28*28, 10)

    def forward(self, x):
        x = x.view(x.size(0), -1)
        return torch.softmax(self.linear(x), dim=1)

class ModelB(nn.Module):  # 1 hidden layer (2 neurons), Softmax hidden, Linear output
    def __init__(self):
        super().__init__()
        self.hidden = nn.Linear(28*28, 2)
        self.output = nn.Linear(2, 10)

    def forward(self, x):
        x = x.view(x.size(0), -1)
        x = torch.softmax(self.hidden(x), dim=1)
        return self.output(x)

class ModelC(nn.Module):  # 1 hidden layer (20 neurons), Linear hidden, Softmax output
    def __init__(self):
        super().__init__()
        self.hidden = nn.Linear(28*28, 20)
        self.output = nn.Linear(20, 10)

    def forward(self, x):
        x = x.view(x.size(0), -1)
        x = self.hidden(x)  # Linear activation
        return torch.softmax(self.output(x), dim=1)

class ModelB_ReLU(nn.Module):  # Model B with ReLU
    def __init__(self):
        super().__init__()
        self.hidden = nn.Linear(28*28, 2)
        self.output = nn.Linear(2, 10)

    def forward(self, x):
        x = x.view(x.size(0), -1)
        x = torch.relu(self.hidden(x))
        return self.output(x)

class ModelC_ReLU(nn.Module):  # Model C with ReLU
    def __init__(self):
        super().__init__()
        self.hidden = nn.Linear(28*28, 20)
        self.output = nn.Linear(20, 10)

    def forward(self, x):
        x = x.view(x.size(0), -1)
        x = torch.relu(self.hidden(x))
        return self.output(x)

# Training function
def train_model(model, train_loader, test_loader, epochs=100, learning_rate=0.01):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)

    for epoch in range(epochs):
        model.train()
        for images, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

        if (epoch+1) % 20 == 0:
            train_acc = evaluate_model(model, train_loader)
            test_acc = evaluate_model(model, test_loader)
            print(f"Epoch {epoch+1}/{epochs}: Train Acc = {train_acc:.4f}, Test Acc = {test_acc:.4f}")

# Evaluation function
def evaluate_model(model, dataloader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in dataloader:
            outputs = model(images)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    return correct / total

# Train and compare models
models = {"ModelA": ModelA(), "ModelB": ModelB(), "ModelC": ModelC(),
          "ModelB_ReLU": ModelB_ReLU(), "ModelC_ReLU": ModelC_ReLU()}

results = {}
for name, model in models.items():
    print(f"Training {name}...")
    train_model(model, train_loader, test_loader, epochs=100)
    train_acc = evaluate_model(model, train_loader)
    test_acc = evaluate_model(model, test_loader)
    results[name] = (train_acc, test_acc)
    print(f"Final Accuracy: Train = {train_acc:.4f}, Test = {test_acc:.4f}\n")

# Print results summary
for name, (train_acc, test_acc) in results.items():
    print(f"{name}: Train Acc = {train_acc:.4f}, Test Acc = {test_acc:.4f}")


100%|██████████| 9.91M/9.91M [00:00<00:00, 60.9MB/s]
100%|██████████| 28.9k/28.9k [00:00<00:00, 1.72MB/s]
100%|██████████| 1.65M/1.65M [00:00<00:00, 14.9MB/s]
100%|██████████| 4.54k/4.54k [00:00<00:00, 5.54MB/s]


Training ModelA...
Epoch 20/100: Train Acc = 0.7876, Test Acc = 0.7730
Epoch 40/100: Train Acc = 0.8080, Test Acc = 0.7900
Epoch 60/100: Train Acc = 0.8156, Test Acc = 0.8000
Epoch 80/100: Train Acc = 0.8222, Test Acc = 0.8000
Epoch 100/100: Train Acc = 0.8242, Test Acc = 0.8040
Final Accuracy: Train = 0.8242, Test = 0.8040

Training ModelB...
Epoch 20/100: Train Acc = 0.2118, Test Acc = 0.2040
Epoch 40/100: Train Acc = 0.2192, Test Acc = 0.2120
Epoch 60/100: Train Acc = 0.2270, Test Acc = 0.2280
Epoch 80/100: Train Acc = 0.2194, Test Acc = 0.2340
Epoch 100/100: Train Acc = 0.2152, Test Acc = 0.2370
Final Accuracy: Train = 0.2152, Test = 0.2370

Training ModelC...
Epoch 20/100: Train Acc = 0.6978, Test Acc = 0.6680
Epoch 40/100: Train Acc = 0.7334, Test Acc = 0.7140
Epoch 60/100: Train Acc = 0.7506, Test Acc = 0.7230
Epoch 80/100: Train Acc = 0.8288, Test Acc = 0.8090
Epoch 100/100: Train Acc = 0.8414, Test Acc = 0.8180
Final Accuracy: Train = 0.8414, Test = 0.8180

Training ModelB_ReL