In [None]:
# 🚀 **Importing Helper Modules**

import torch  # üß† Core PyTorch library for tensor operations and neural networks
import torch.nn as nn  # üèóÔ∏è Neural network components (layers, loss functions)
import torch.optim as optim  # ‚öôÔ∏è Optimization algorithms (SGD, Adam, etc.)
import torchvision  # üé® Computer vision utilities and datasets
import torchvision.transforms as transforms  # üñºÔ∏è Data transformations (normalization, augmentation)
from torch.utils.data import DataLoader  # üöö For loading and batching data
import matplotlib.pyplot as plt  # üìä Visualization for losses and accuracies
import torch.nn.functional as F
from tqdm import tqdm

In [None]:
import os
# Configure PyTorch's CUDA caching allocator through its environment variable.
# `expandable_segments:True` is the allocator option PyTorch documents for
# reducing fragmentation-related out-of-memory errors.
# NOTE(review): this runs after `import torch` in the previous cell; presumably the
# variable is only read when CUDA memory is first allocated - confirm it takes effect here.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

In [None]:
# 🚀 **Part 1: Data Loading and Preprocessing**

# 🛠️ TODO: Complete the data loading code 🧩
def load_mnist_data(batch_size=64):
    """
    Build the MNIST training and test DataLoaders.

    Every image is resized to 224x224 (still a single channel), converted to a
    tensor and normalized with mean 0.1307 / std 0.3081. The training split
    additionally receives a random affine augmentation (rotation of up to
    +/-15 degrees and a shift of up to 10% along each axis).

    Args:
        batch_size: Number of samples per batch for both loaders.

    Returns:
        (train_loader, test_loader): the training loader shuffles its samples,
        the test loader iterates in a fixed order.
    """
    normalization = {'mean': (0.1307,), 'std': (0.3081,)}
    target_size = (224, 224)  # input resolution the VGG16 models in this notebook are built for

    def build_pipeline(*augmentations):
        # Resize first, apply any PIL-level augmentations, then tensorize and normalize.
        return transforms.Compose([
            transforms.Resize(target_size),
            *augmentations,
            transforms.ToTensor(),
            transforms.Normalize(**normalization),
        ])

    train_transform = build_pipeline(
        transforms.RandomAffine(degrees=15, translate=(0.1, 0.1))
    )
    test_transform = build_pipeline()  # no augmentation at evaluation time

    # The dataset is downloaded into ./data on first use.
    train_dataset = torchvision.datasets.MNIST(
        root='./data', train=True, transform=train_transform, download=True
    )
    test_dataset = torchvision.datasets.MNIST(
        root='./data', train=False, transform=test_transform, download=True
    )

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    return train_loader, test_loader


In [None]:
# 🚀 **Part 2: Custom Dropout Implementation**

class CustomDropout(nn.Module):
    """
    Inverted dropout implemented with plain tensor operations.

    In training mode each element is zeroed independently with probability
    ``p`` and the surviving elements are scaled by ``1 / (1 - p)`` so that the
    expected activation is unchanged. In evaluation mode the input is returned
    untouched.

    Args:
        p: Probability of dropping an element; must lie in [0, 1].

    Raises:
        ValueError: If ``p`` is outside [0, 1].
    """

    def __init__(self, p=0.5):
        super(CustomDropout, self).__init__()
        # Reject invalid probabilities up front instead of failing (or silently
        # producing NaNs) in the middle of a forward pass.
        if not 0.0 <= p <= 1.0:
            raise ValueError(f"dropout probability has to be between 0 and 1, but got {p}")
        self.p = p

    def forward(self, x):
        """Apply dropout to ``x`` when training; return ``x`` unchanged otherwise."""
        if not self.training:
            return x

        if self.p == 1.0:
            # Everything is dropped. The general formula would evaluate 0 / 0 and
            # yield NaN, so multiply by zero instead (this keeps the result attached
            # to the autograd graph with zero gradients).
            return x * 0.0

        keep_prob = 1 - self.p
        # Bernoulli(keep_prob) mask: 1 keeps the element, 0 drops it.
        mask = torch.bernoulli(torch.ones_like(x) * keep_prob)
        # Dividing by keep_prob keeps the expected value of the activations unchanged.
        return x * mask / keep_prob


In [None]:
# 🚀 **Part 3: Custom BatchNorm2d Implementation**

class CustomBatchNorm2d(nn.Module):
    """
    Batch normalization over the channel dimension of a 4D (N, C, H, W) input.

    In training mode the per-channel mean and (biased) variance of the current
    batch are used for normalization, and exponential moving averages of the
    statistics are stored in the ``running_mean`` / ``running_var`` buffers.
    In evaluation mode the stored running statistics are used instead.

    Args:
        num_features: Number of channels C of the expected input.
        eps: Small constant added to the variance for numerical stability.
        momentum: Weight of the newest batch statistic in the running average
            (``running = (1 - momentum) * running + momentum * batch``).

    Attributes:
        gamma: Learnable per-channel scale, initialized to 1.
        beta: Learnable per-channel shift, initialized to 0.
    """

    def __init__(self, num_features, eps=1e-5, momentum=0.1):
        super(CustomBatchNorm2d, self).__init__()
        self.num_features = num_features  # number of channels
        self.eps = eps
        self.momentum = momentum

        self.gamma = nn.Parameter(torch.ones(num_features))  # scale
        self.beta = nn.Parameter(torch.zeros(num_features))  # shift
        # Buffers are saved with the module state but are not trained.
        self.register_buffer('running_mean', torch.zeros(num_features))
        self.register_buffer('running_var', torch.ones(num_features))

    def forward(self, x):
        """
        Normalize ``x`` per channel and apply the learnable affine transform.

        Args:
            x: Tensor of shape (N, C, H, W) with C == ``num_features``.

        Returns:
            Tensor of the same shape as ``x``.

        Raises:
            ValueError: If ``x`` is not 4-dimensional.
        """
        if x.dim() != 4:
            raise ValueError(f"expected 4D input (got {x.dim()}D input)")

        if self.training:
            # Statistics are per channel: reduce over batch, height and width.
            batch_mean = x.mean([0, 2, 3])
            batch_var = x.var([0, 2, 3], unbiased=False)

            # Update the running statistics outside of autograd. Without this the
            # buffers would be modified in place with grad-tracking tensors, start
            # requiring grad themselves and accumulate history on every step.
            with torch.no_grad():
                n = x.size(0) * x.size(2) * x.size(3)  # values per channel
                # As in nn.BatchNorm2d, the running variance tracks the unbiased
                # estimate while the batch itself is normalized with the biased one.
                unbiased_var = batch_var * (n / max(n - 1, 1))
                self.running_mean.mul_(1 - self.momentum).add_(self.momentum * batch_mean)
                self.running_var.mul_(1 - self.momentum).add_(self.momentum * unbiased_var)

            mean = batch_mean.view(1, -1, 1, 1)
            var = batch_var.view(1, -1, 1, 1)
        else:
            mean = self.running_mean.view(1, -1, 1, 1)
            var = self.running_var.view(1, -1, 1, 1)

        x_normalized = (x - mean) / torch.sqrt(var + self.eps)
        gamma = self.gamma.view(1, -1, 1, 1)
        beta = self.beta.view(1, -1, 1, 1)
        return gamma * x_normalized + beta


In [None]:
class CustomReLU(nn.Module):
    """
    ReLU activation built from basic tensor operations (no ``F.relu``).

    Computes ``max(0, x)`` element-wise, i.e. negative entries become 0 and
    non-negative entries pass through unchanged.
    """

    def forward(self, x):
        """Return a tensor shaped like ``x`` with every negative value replaced by 0."""
        floor = torch.zeros_like(x)  # zero tensor with the same shape, dtype and device as x
        return torch.maximum(x, floor)


In [None]:
class CustomMaxPooling2d(nn.Module):
    """
    2D max pooling built on ``F.unfold``.

    Slides a square ``kernel_size`` x ``kernel_size`` window over the height and
    width of a 4D (N, C, H, W) input with the given ``stride`` (no padding) and
    keeps the maximum of each window. Behaves the same in training and
    evaluation mode.

    Args:
        kernel_size: Side length of the pooling window.
        stride: Step between consecutive windows.
    """

    def __init__(self, kernel_size=2, stride=2):
        super(CustomMaxPooling2d, self).__init__()
        self.kernel_size = kernel_size
        self.stride = stride

    def forward(self, x):
        """
        Pool ``x`` of shape (N, C, H, W).

        Returns:
            Tensor of shape (N, C, out_H, out_W) with
            ``out = (size - kernel_size) // stride + 1`` along each spatial axis.
        """
        batch, channels, height, width = x.shape
        k, step = self.kernel_size, self.stride

        # unfold yields (N, C * k * k, L): one column per window position.
        windows = F.unfold(x, kernel_size=k, stride=step)
        # Separate the channel axis from the k * k values inside each window.
        windows = windows.view(batch, channels, k * k, -1)
        pooled = windows.max(dim=2).values

        rows = (height - k) // step + 1
        cols = (width - k) // step + 1
        return pooled.view(batch, channels, rows, cols)


In [None]:
# 🚀 **Part 4: Custom VGG16 Model Implementation**
class CustomVGG16(nn.Module):
    """
    VGG16-style classifier assembled from the custom layers in this notebook.

    The feature extractor has five stages; each stage is a run of
    3x3 convolution -> CustomBatchNorm2d -> CustomReLU units followed by one
    CustomMaxPooling2d. The classifier head flattens the features and applies
    three fully connected layers, with CustomReLU and CustomDropout after the
    first two.

    The first convolution takes 1 input channel and the head expects
    512 * 7 * 7 features, which corresponds to 224x224 inputs after the five
    2x2 poolings.

    Args:
        num_classes: Size of the output layer (10 for MNIST).
    """

    def __init__(self, num_classes=10):  # num_classes = 10 for MNIST
        super().__init__()

        # (output channels, number of conv units) for each of the five stages.
        stage_plan = ((64, 2), (128, 2), (256, 3), (512, 3), (512, 3))

        layers = []
        in_channels = 1  # single-channel input
        for out_channels, num_convs in stage_plan:
            for _ in range(num_convs):
                layers.append(nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1))
                layers.append(CustomBatchNorm2d(out_channels))
                layers.append(CustomReLU())
                in_channels = out_channels
            # Each stage ends by halving the spatial resolution.
            layers.append(CustomMaxPooling2d())
        self.features = nn.Sequential(*layers)

        # Fully connected head.
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(512 * 7 * 7, 4096),
            CustomReLU(),
            CustomDropout(),
            nn.Linear(4096, 4096),
            CustomReLU(),
            CustomDropout(),
            nn.Linear(4096, num_classes),
        )

    def forward(self, x):
        """Run the feature extractor and the classifier head; returns the class logits."""
        return self.classifier(self.features(x))


In [None]:
# 🚀 **Part 5: Training Functions**

def train_epoch(model, train_loader, criterion, optimizer, device):
    """
    Train ``model`` for one pass over ``train_loader``, showing a tqdm progress bar.

    Args:
        model: Network to optimize; it is switched to training mode.
        train_loader: Sized iterable yielding ``(data, target)`` batches.
        criterion: Loss function called as ``criterion(output, target)``.
        optimizer: Optimizer updating the parameters of ``model``.
        device: Device the batches are moved to before the forward pass.

    Returns:
        (avg_loss, accuracy): mean of the per-batch loss values and the
        percentage of correctly classified samples. Both are 0.0 when the
        loader yields no batches.
    """
    model.train()
    running_loss = 0.0  # sum of the per-batch losses
    correct = 0
    total = 0
    num_batches = len(train_loader)
    pbar = tqdm(enumerate(train_loader), total=num_batches, desc="Training")

    for batch_idx, (data, target) in pbar:
        data, target = data.to(device), target.to(device)

        optimizer.zero_grad()  # clear gradients left over from the previous batch
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        # Read the scalar once and reuse it for the running sum and the progress bar.
        batch_loss = loss.item()
        running_loss += batch_loss

        predicted = output.argmax(dim=1)
        total += target.size(0)
        correct += predicted.eq(target).sum().item()

        pbar.set_postfix({
            'loss': batch_loss,
            'acc': 100. * correct / total if total > 0 else 0.0
        })

    # Guard both averages so an empty loader returns zeros instead of raising
    # ZeroDivisionError.
    avg_loss = running_loss / num_batches if num_batches > 0 else 0.0
    accuracy = 100. * correct / total if total > 0 else 0.0
    return avg_loss, accuracy

def evaluate(model, test_loader, criterion, device):
    """
    Evaluate ``model`` on ``test_loader`` without computing gradients, showing a
    tqdm progress bar.

    Args:
        model: Network to evaluate; it is switched to evaluation mode.
        test_loader: Sized iterable yielding ``(data, target)`` batches.
        criterion: Loss function called as ``criterion(output, target)``.
        device: Device the batches are moved to before the forward pass.

    Returns:
        (avg_loss, accuracy): mean of the per-batch loss values and the
        percentage of correctly classified samples. Both are 0.0 when the
        loader yields no batches.
    """
    model.eval()
    test_loss = 0.0  # sum of the per-batch losses
    correct = 0
    total = 0
    num_batches = len(test_loader)
    pbar = tqdm(enumerate(test_loader), total=num_batches, desc="Evaluating")

    with torch.no_grad():  # no autograd bookkeeping needed for evaluation
        for batch_idx, (data, target) in pbar:
            data, target = data.to(device), target.to(device)
            output = model(data)

            # Read the scalar once and reuse it for the running sum and the progress bar.
            batch_loss = criterion(output, target).item()
            test_loss += batch_loss

            predicted = output.argmax(dim=1)
            total += target.size(0)
            correct += predicted.eq(target).sum().item()

            pbar.set_postfix({
                'loss': batch_loss,
                'acc': 100. * correct / total if total > 0 else 0
            })

    # Guard both averages so an empty loader returns zeros instead of raising
    # ZeroDivisionError.
    avg_loss = test_loss / num_batches if num_batches > 0 else 0.0
    accuracy = 100. * correct / total if total > 0 else 0.0
    return avg_loss, accuracy
    

In [None]:
# 🚀 **Part 6: Main Training Loop**

def main():
    """
    Train CustomVGG16 on MNIST and plot the resulting learning curves.

    Loads the data, trains for EPOCHS epochs with Adam, evaluates on the test
    split after every epoch, halves the learning rate via ReduceLROnPlateau when
    the test loss stops improving, prints per-epoch metrics, and finally shows a
    two-panel figure (loss on the left, accuracy on the right).
    """
    # Hyperparameters
    BATCH_SIZE = 16
    EPOCHS = 2
    LEARNING_RATE = 0.001
    DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # Data
    train_loader, test_loader = load_mnist_data(BATCH_SIZE)

    # Model, loss, optimizer and learning-rate schedule
    model = CustomVGG16().to(DEVICE)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.5, patience=1
    )

    # One list per tracked metric, appended to once per epoch.
    history = {'train_loss': [], 'test_loss': [], 'train_acc': [], 'test_acc': []}

    for epoch in range(EPOCHS):
        print(f"üåü Epoch {epoch+1}/{EPOCHS}")

        train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, DEVICE)
        test_loss, test_acc = evaluate(model, test_loader, criterion, DEVICE)

        # The scheduler monitors the test loss; the current rate is printed every
        # epoch whether or not it actually changed.
        scheduler.step(test_loss)
        print(f"üîÑ Learning rate changed to: {optimizer.param_groups[0]['lr']:.6f}")

        history['train_loss'].append(train_loss)
        history['test_loss'].append(test_loss)
        history['train_acc'].append(train_acc)
        history['test_acc'].append(test_acc)

        print(f'Epoch {epoch+1}/{EPOCHS} finished:')
        print(f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%')
        print(f'Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.2f}%')

    # Plots: (train curve, train label, test curve, test label, x label, y label)
    panels = (
        (history['train_loss'], 'Train Loss', history['test_loss'], 'Test Loss', 'Epochs', 'Loss'),
        (history['train_acc'], 'Train Accuracy', history['test_acc'], 'Test Accuracy', 'Epoch', 'Accuracy (%)'),
    )
    plt.figure(figsize=(12, 4))
    for slot, (train_curve, train_label, test_curve, test_label, x_label, y_label) in enumerate(panels, start=1):
        plt.subplot(1, 2, slot)
        plt.plot(train_curve, label=train_label)
        plt.plot(test_curve, label=test_label)
        plt.xlabel(x_label)
        plt.ylabel(y_label)
        plt.legend()

    plt.tight_layout()
    plt.show()

# Entry point: start the CustomVGG16 training run when this code executes as the
# main module (true for a script run directly and for cells run in a notebook kernel).
if __name__ == '__main__':
    main()

In [None]:
# For comparison with PyTorch
class PyTorchVGG16(nn.Module):
    """
    VGG16-style classifier built only from PyTorch's built-in layers.

    The feature extractor has five stages; each stage is a run of
    3x3 convolution -> BatchNorm2d -> in-place ReLU units followed by a
    2x2 max pooling with stride 2. The classifier head flattens the features
    and applies three fully connected layers, with in-place ReLU and
    Dropout(0.5) after the first two.

    The first convolution takes 1 input channel and the head expects
    512 * 7 * 7 features, which corresponds to 224x224 inputs after the five
    poolings.

    Args:
        num_classes: Size of the output layer.
    """

    def __init__(self, num_classes=10):
        super().__init__()

        # (output channels, number of conv units) for each of the five stages.
        stage_plan = ((64, 2), (128, 2), (256, 3), (512, 3), (512, 3))

        conv_stack = []
        channels_in = 1  # single-channel input
        for channels_out, conv_count in stage_plan:
            for _ in range(conv_count):
                conv_stack += [
                    nn.Conv2d(channels_in, channels_out, kernel_size=3, padding=1),
                    nn.BatchNorm2d(channels_out),
                    nn.ReLU(inplace=True),
                ]
                channels_in = channels_out
            # Each stage ends by halving the spatial resolution.
            conv_stack.append(nn.MaxPool2d(kernel_size=2, stride=2))
        self.features = nn.Sequential(*conv_stack)

        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(512 * 7 * 7, 4096),
            nn.ReLU(True),
            nn.Dropout(0.5),
            nn.Linear(4096, 4096),
            nn.ReLU(True),
            nn.Dropout(0.5),
            nn.Linear(4096, num_classes),
        )

    def forward(self, x):
        """Run the feature extractor and the classifier head; returns the class logits."""
        return self.classifier(self.features(x))

def train_epoch(model, train_loader, criterion, optimizer, device):
    """
    Train ``model`` for one pass over ``train_loader``.

    Progress (batch index, latest batch loss, running accuracy) is printed
    every 100 batches.

    Args:
        model: Network to optimize; it is switched to training mode.
        train_loader: Sized iterable yielding ``(data, target)`` batches.
        criterion: Loss function called as ``criterion(output, target)``.
        optimizer: Optimizer updating the parameters of ``model``.
        device: Device the batches are moved to before the forward pass.

    Returns:
        (avg_loss, accuracy): mean of the per-batch loss values and the
        percentage of correctly classified samples. Both are 0.0 when the
        loader yields no batches.
    """
    model.train()
    running_loss = 0.0  # sum of the per-batch losses
    correct = 0
    total = 0
    num_batches = len(train_loader)

    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()

        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        # Read the scalar once and reuse it for the running sum and the log line.
        batch_loss = loss.item()
        running_loss += batch_loss
        _, predicted = output.max(1)
        total += target.size(0)
        correct += predicted.eq(target).sum().item()

        if (batch_idx + 1) % 100 == 0:
            print(f"Batch {batch_idx + 1}/{num_batches} "
                  f"Loss: {batch_loss} "
                  f"Accuracy: {100. * correct / total if total > 0 else 0.0}")

    # Guard both averages so an empty loader returns zeros instead of raising
    # ZeroDivisionError.
    avg_loss = running_loss / num_batches if num_batches > 0 else 0.0
    accuracy = 100. * correct / total if total > 0 else 0.0
    return avg_loss, accuracy

def evaluate(model, test_loader, criterion, device):
    """
    Evaluate ``model`` on ``test_loader`` without computing gradients.

    Args:
        model: Network to evaluate; it is switched to evaluation mode.
        test_loader: Sized iterable yielding ``(data, target)`` batches.
        criterion: Loss function called as ``criterion(output, target)``.
        device: Device the batches are moved to before the forward pass.

    Returns:
        (avg_loss, accuracy): mean of the per-batch loss values and the
        percentage of correctly classified samples. Both are 0.0 when the
        loader yields no batches.
    """
    model.eval()
    test_loss = 0.0  # sum of the per-batch losses
    correct = 0
    total = 0
    num_batches = len(test_loader)

    with torch.no_grad():  # no autograd bookkeeping needed for evaluation
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += criterion(output, target).item()
            _, predicted = output.max(1)
            total += target.size(0)
            correct += predicted.eq(target).sum().item()

    # Guard both averages so an empty loader returns zeros instead of raising
    # ZeroDivisionError.
    avg_loss = test_loss / num_batches if num_batches > 0 else 0.0
    accuracy = 100. * correct / total if total > 0 else 0.0
    return avg_loss, accuracy

def main():
    """
    Train the built-in-layer PyTorchVGG16 on MNIST and plot the learning curves.

    Loads the data, trains for EPOCHS epochs with Adam, evaluates on the test
    split after every epoch, halves the learning rate via ReduceLROnPlateau when
    the test loss stops improving, prints per-epoch metrics, and finally shows a
    two-panel figure (loss on the left, accuracy on the right).
    """
    # Hyperparameters
    BATCH_SIZE = 16
    EPOCHS = 2
    LEARNING_RATE = 0.001
    DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # Data
    train_loader, test_loader = load_mnist_data(BATCH_SIZE)

    # Model, loss, optimizer and learning-rate schedule
    model = PyTorchVGG16().to(DEVICE)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=1)

    # One list per tracked metric, appended to once per epoch.
    history = {'train_loss': [], 'test_loss': [], 'train_acc': [], 'test_acc': []}

    for epoch in range(EPOCHS):
        print(f"Epoch {epoch+1}/{EPOCHS}")

        train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, DEVICE)
        test_loss, test_acc = evaluate(model, test_loader, criterion, DEVICE)

        # The scheduler monitors the test loss.
        scheduler.step(test_loss)
        print(f"Learning rate: {optimizer.param_groups[0]['lr']:.6f}")

        history['train_loss'].append(train_loss)
        history['test_loss'].append(test_loss)
        history['train_acc'].append(train_acc)
        history['test_acc'].append(test_acc)

        print(f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%')
        print(f'Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.2f}%')

    # Plots: (train curve, train label, test curve, test label, x label, y label)
    panels = (
        (history['train_loss'], 'Train Loss', history['test_loss'], 'Test Loss', 'Epochs', 'Loss'),
        (history['train_acc'], 'Train Accuracy', history['test_acc'], 'Test Accuracy', 'Epoch', 'Accuracy (%)'),
    )
    plt.figure(figsize=(12, 4))
    for slot, (train_curve, train_label, test_curve, test_label, x_label, y_label) in enumerate(panels, start=1):
        plt.subplot(1, 2, slot)
        plt.plot(train_curve, label=train_label)
        plt.plot(test_curve, label=test_label)
        plt.xlabel(x_label)
        plt.ylabel(y_label)
        plt.legend()

    plt.tight_layout()
    plt.show()

# Entry point for the built-in-layer comparison run. The `train_epoch`, `evaluate`
# and `main` defined in this cell rebind the names defined earlier in the notebook,
# so this call runs the PyTorchVGG16 version of `main`.
if __name__ == '__main__':
    main()