In [28]:
import sys
import torch

# Environment report: interpreter and PyTorch versions, followed by the
# CUDA status that PyTorch detects on this machine.
banner = "=" * 60
cuda_ok = torch.cuda.is_available()

print(banner)
print("CUDA Diagnostic")
print(banner)
print(f"Python version: {sys.version}")
print(f"PyTorch version: {torch.__version__}")
print(f"PyTorch CUDA compiled version: {torch.version.cuda}")
print(f"CUDA available: {cuda_ok}")
print(f"CUDA device count: {torch.cuda.device_count()}")

if not cuda_ok:
    print("CUDA is NOT working")
else:
    # Device details are only queried when CUDA is usable.
    print(f"Current CUDA device: {torch.cuda.current_device()}")
    print(f"Device name: {torch.cuda.get_device_name(0)}")
    print("CUDA is working!")
print(banner)

CUDA Diagnostic
Python version: 3.14.0 (tags/v3.14.0:ebf955d, Oct  7 2025, 10:15:03) [MSC v.1944 64 bit (AMD64)]
PyTorch version: 2.9.1+cu126
PyTorch CUDA compiled version: 12.6
CUDA available: True
CUDA device count: 1
Current CUDA device: 0
Device name: NVIDIA GeForce RTX 4070
CUDA is working!


In [29]:
import itertools
import os
import random

import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision

# Pick the compute device once; later cells move models and batches to it.
_has_cuda = torch.cuda.is_available()
device = torch.device("cuda") if _has_cuda else torch.device("cpu")

_rule = "=" * 60
print(_rule)
print(f"Using device: {device}")
if not _has_cuda:
    print("WARNING: Running on CPU")
else:
    # Report the name and total memory (in units of 1e9 bytes) of GPU 0.
    _gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {_gpu_props.total_memory / 1e9:.1f} GB")
print(_rule)

Using device: cuda
GPU: NVIDIA GeForce RTX 4070
GPU Memory: 12.9 GB


In [30]:
# Dataloaders
def create_dataloaders(batch_size_train, batch_size_test=1000, data_dir='/files/'):
  """Build the FashionMNIST training and test DataLoaders.

  Args:
    batch_size_train: Mini-batch size of the training loader.
    batch_size_test: Mini-batch size of the test loader (default 1000).
    data_dir: Root directory the dataset is downloaded to / read from.

  Returns:
    Tuple ``(train_loader, test_loader)``.
  """
  # Both splits share one preprocessing pipeline: tensor conversion followed
  # by normalisation with mean 0.2860 and std 0.3530.
  transform = torchvision.transforms.Compose([
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize((0.2860,), (0.3530,)),
  ])

  train_set = torchvision.datasets.FashionMNIST(
    data_dir, train=True, download=True, transform=transform)
  test_set = torchvision.datasets.FashionMNIST(
    data_dir, train=False, download=True, transform=transform)

  train_loader = torch.utils.data.DataLoader(
    train_set, batch_size=batch_size_train, shuffle=True)
  # The evaluation split is not shuffled: a random order adds nothing when the
  # whole split is scored.
  test_loader = torch.utils.data.DataLoader(
    test_set, batch_size=batch_size_test, shuffle=False)

  return train_loader, test_loader

In [31]:
# Neural Network Class
from math import e
class Net(nn.Module):
    """Small CNN: two 5x5 convolutions followed by two linear layers.

    The layer sizes (LayerNorm shapes, 320 flattened features) correspond to
    single-channel 28x28 inputs. ``forward`` returns log-probabilities over
    10 classes.

    Args:
        dropout: Drop probability of the channel-wise ``Dropout2d`` applied
            after the second convolution.
        normalization: ``"batchnorm"`` or ``"layernorm"`` to normalise the
            output of each convolution; any other value (including ``None``)
            disables normalisation.
        fc_dropout: Drop probability of the dropout applied after the first
            linear layer. Defaults to 0.5, the value that used to be
            hard-coded, so existing callers behave as before.
    """

    def __init__(self, dropout=0.5, normalization=None, fc_dropout=0.5):
        super().__init__()

        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)

        self.dropout = nn.Dropout2d(p=dropout)
        self.fc_dropout = fc_dropout

        if normalization == "batchnorm":
            self.norm1 = nn.BatchNorm2d(10)
            self.norm2 = nn.BatchNorm2d(20)
        elif normalization == "layernorm":
            # LayerNorm needs the full activation shape after each conv:
            # 10x24x24 after conv1 and 20x8x8 after conv2.
            self.norm1 = nn.LayerNorm([10, 24, 24])
            self.norm2 = nn.LayerNorm([20, 8, 8])
        else:
            self.norm1 = nn.Identity()
            self.norm2 = nn.Identity()

        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        """Return per-class log-probabilities for a batch of images."""
        # Block 1: conv -> norm -> ReLU -> 2x2 max-pool
        x = self.conv1(x)
        x = self.norm1(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)

        # Block 2: conv -> norm -> channel dropout -> ReLU -> 2x2 max-pool
        x = self.conv2(x)
        x = self.norm2(x)
        x = self.dropout(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)

        # Flatten per sample. Unlike view(-1, 320), this keeps the batch
        # dimension fixed, so an unexpected input size fails in fc1 instead of
        # silently changing the batch size.
        x = x.flatten(1)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, p=self.fc_dropout, training=self.training)
        x = self.fc2(x)

        return F.log_softmax(x, dim=1)

In [32]:
# Deep Neural Network Class
from math import e
class DeepNet(nn.Module):
    """Deeper CNN: two blocks of two padded 3x3 convolutions, then two linear layers.

    The layer sizes (LayerNorm shapes, 64*7*7 flattened features) correspond
    to single-channel 28x28 inputs. ``forward`` returns log-probabilities over
    10 classes.

    Args:
        dropout: Drop probability of the channel-wise ``Dropout2d`` applied
            after each max-pool.
        normalization: ``"batchnorm"`` or ``"layernorm"`` to normalise the
            output of each convolution; any other value (including ``None``)
            disables normalisation.
        fc_dropout: Drop probability of the dropout applied after the first
            linear layer. Defaults to 0.5, the value that used to be
            hard-coded, so existing callers behave as before.
    """

    def __init__(self, dropout=0.5, normalization=None, fc_dropout=0.5):
        super().__init__()

        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 32, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.conv4 = nn.Conv2d(64, 64, kernel_size=3, padding=1)

        self.dropout = nn.Dropout2d(p=dropout)
        self.fc_dropout = fc_dropout

        if normalization == "batchnorm":
            self.norm1 = nn.BatchNorm2d(32)
            self.norm2 = nn.BatchNorm2d(32)
            self.norm3 = nn.BatchNorm2d(64)
            self.norm4 = nn.BatchNorm2d(64)
        elif normalization == "layernorm":
            # LayerNorm needs the full activation shape: the padded 3x3 convs
            # keep 28x28 in the first block and 14x14 after the first pool.
            self.norm1 = nn.LayerNorm([32, 28, 28])
            self.norm2 = nn.LayerNorm([32, 28, 28])
            self.norm3 = nn.LayerNorm([64, 14, 14])
            self.norm4 = nn.LayerNorm([64, 14, 14])
        else:
            self.norm1 = nn.Identity()
            self.norm2 = nn.Identity()
            self.norm3 = nn.Identity()
            self.norm4 = nn.Identity()

        self.fc1 = nn.Linear(64 * 7 * 7, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        """Return per-class log-probabilities for a batch of images."""
        # First conv block: (conv -> norm -> ReLU) x2 -> max-pool -> dropout
        x = F.relu(self.norm1(self.conv1(x)))
        x = F.relu(self.norm2(self.conv2(x)))
        x = F.max_pool2d(x, 2)
        x = self.dropout(x)

        # Second conv block, same layout with 64 channels
        x = F.relu(self.norm3(self.conv3(x)))
        x = F.relu(self.norm4(self.conv4(x)))
        x = F.max_pool2d(x, 2)
        x = self.dropout(x)

        # Flatten per sample. Unlike view(-1, 64 * 7 * 7), this keeps the
        # batch dimension fixed, so an unexpected input size fails in fc1
        # instead of silently changing the batch size.
        x = x.flatten(1)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, p=self.fc_dropout, training=self.training)
        x = self.fc2(x)

        return F.log_softmax(x, dim=1)

In [33]:
# He initialization of weights
def weights_init(layer_in):
  """Apply He (Kaiming) uniform initialisation to a fully connected layer.

  Intended for use with ``nn.Module.apply``. Only ``nn.Linear`` layers are
  modified: the weight is drawn with ``kaiming_uniform_`` and the bias, when
  the layer has one, is zeroed. Every other module type is left untouched.

  NOTE(review): convolution layers therefore keep PyTorch's default
  initialisation -- confirm that this is intended.

  Args:
    layer_in: Any ``nn.Module``.
  """
  if not isinstance(layer_in, nn.Linear):
    return

  nn.init.kaiming_uniform_(layer_in.weight)
  # bias is None for layers built with bias=False
  if layer_in.bias is not None:
    nn.init.zeros_(layer_in.bias)

In [34]:
# Model
def create_model(dropout, normalization, architecture):
  """Build a network and apply ``weights_init`` to all of its sub-modules.

  Args:
    dropout: Dropout probability passed to the network constructor.
    normalization: Normalisation option passed to the network constructor.
    architecture: ``"shallow"`` (builds ``Net``) or ``"deep"`` (builds ``DeepNet``).

  Returns:
    The initialised model (not yet moved to any device).

  Raises:
    ValueError: If ``architecture`` is not one of the supported names.
  """
  if architecture == "shallow":
    model = Net(dropout, normalization)
  elif architecture == "deep":
    model = DeepNet(dropout, normalization)
  else:
    # Without this branch `model` would be unbound below and the failure
    # would surface as an UnboundLocalError.
    raise ValueError(
      "Unknown architecture {!r}; expected 'shallow' or 'deep'".format(architecture))

  model.apply(weights_init)
  return model

In [35]:
# Optimizer
def create_optimizer(optimizer_name, model, learning_rate, weight_decay, dropout):
  """Create the optimizer for ``model``.

  Args:
    optimizer_name: ``"SGD"`` (momentum 0.5) or ``"Adam"``.
    model: Module whose parameters are optimised.
    learning_rate: Learning rate.
    weight_decay: Weight decay passed to the optimizer.
    dropout: Unused; kept so existing call sites keep working.

  Returns:
    The configured ``torch.optim`` optimizer.

  Raises:
    ValueError: If ``optimizer_name`` is not supported (previously the
      function returned ``None`` and the failure surfaced later).
  """
  if optimizer_name == "SGD":
    return optim.SGD(model.parameters(), lr=learning_rate, momentum=0.5, weight_decay=weight_decay)
  if optimizer_name == "Adam":
    return optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
  raise ValueError(
    "Unknown optimizer {!r}; expected 'SGD' or 'Adam'".format(optimizer_name))


In [36]:
# Logger
def log_experiment(result_dict, save_path="logs_deep.csv"):
    """Write one experiment record as a single row of a CSV log.

    A new file gets a header row taken from the dict keys; an existing file
    is appended to without repeating the header.

    Args:
        result_dict: Mapping of column name to value for this experiment.
        save_path: Path of the CSV log file.
    """
    already_logged = os.path.exists(save_path)
    row = pd.DataFrame([result_dict])

    if already_logged:
        row.to_csv(save_path, mode='a', header=False, index=False)
        return

    row.to_csv(save_path, index=False)


In [37]:
# test_accuracies = []

# Main training routine
def train(epoch, batch_train_losses):
  """Run one training epoch over ``train_loader``.

  Uses the notebook-level globals ``model``, ``optimizer``, ``train_loader``
  and ``device``.

  Args:
    epoch: Epoch number; only used in the progress message.
    batch_train_losses: List that receives the loss of every mini-batch
      (extended in place).

  Returns:
    The mean of the per-batch losses of this epoch.
  """
  model.train()
  running_loss = 0.0

  for batch_idx, (data, target) in enumerate(train_loader):
    # Move the batch to the same device as the model
    data, target = data.to(device), target.to(device)

    optimizer.zero_grad()
    output = model(data)
    loss = F.nll_loss(output, target)
    loss.backward()
    optimizer.step()

    # .item() copies the scalar back to the host; read it once per batch and
    # reuse the value instead of calling it for every use.
    batch_loss = loss.item()
    batch_train_losses.append(batch_loss)
    running_loss += batch_loss

    # Progress report every 100 batches
    if batch_idx % 100 == 0:
      print('Train Epoch: {} [{}/{}]\tLoss: {:.6f}'.format(
        epoch, batch_idx * len(data), len(train_loader.dataset), batch_loss))

  avg_loss = running_loss / len(train_loader)
  return avg_loss

In [38]:
# Run on test data
def test():
  """Evaluate ``model`` on ``test_loader`` and print a summary line.

  Uses the notebook-level globals ``model``, ``test_loader`` and ``device``.

  Returns:
    Tuple ``(test_loss, test_accuracy)``: the mean negative log-likelihood
    per sample and the fraction of correctly classified samples.
  """
  model.eval()
  test_loss = 0.0
  correct = 0

  with torch.no_grad():
    for data, target in test_loader:
      # Move the batch to the same device as the model
      data, target = data.to(device), target.to(device)

      output = model(data)
      # Sum (not mean) per batch so that dividing by the dataset size below
      # gives the per-sample average. reduction='sum' replaces the deprecated
      # size_average=False.
      test_loss += F.nll_loss(output, target, reduction='sum').item()
      pred = output.argmax(dim=1)
      # Accumulate as a plain int rather than a device tensor
      correct += pred.eq(target).sum().item()

  n_samples = len(test_loader.dataset)
  test_loss /= n_samples
  test_accuracy = correct / n_samples

  print('Test set: Avg. loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format(
        test_loss, correct, n_samples,
        100. * test_accuracy))

  return test_loss, test_accuracy

In [39]:
# # Get initial performance
# test()
# # Train for three epochs
# # n_epochs = 3
# n_epochs = 3
# for epoch in range(1, n_epochs + 1):
#   train_loss = train(epoch)
#   test_loss, test_accuracy = test() # test() returns test_loss and test_accuracy


# log_experiment({

#     "dataset": "Fashion MNIST",
#     "optimizer": "Adam",
#     "lr": 0.001,
#     "batch_size": 64,
#     "epochs": n_epochs,
#     "final_train_loss": train_loss,
#     "final_test_loss": test_loss,
#     "final_test_accuracy": test_accuracy
#   }) # saves to "logs.csv"

In [40]:
from torch.nn.modules import normalization

# Hyperparameter grid
# architectures = ["shallow", "deep"]
architectures = ["deep"]
optimizers = ["SGD", "Adam"]
learning_rates = [0.01, 0.001]
dropouts = [0.0, 0.3]
weight_decays = [0.0, 1e-4]
normalization = [None, "batchnorm", "layernorm"]

batch_sizes_train = [64, 128, 256]
# batch_size_test = constant 1000

# Number of training epochs per experiment (same for every configuration)
n_epochs = 10

# Every combination of the axes above. itertools.product varies the last axis
# fastest, which is the order the equivalent nested for-loops would produce.
experiment_grid = list(itertools.product(
  architectures, optimizers, learning_rates, dropouts,
  weight_decays, normalization, batch_sizes_train))

experiment_counter = 0
tot_experiments = len(experiment_grid)

# CNN = FashionMNIST
for (architecture, optimizer_name, lr, dropout,
     weight_decay, norm, batch_size_train) in experiment_grid:
  print("Architecture: {}, Optimizer: {}, Learning Rate: {}, Dropout: {}, Weight Decay: {}, Normalization: {}, Batch Size: {}"
    .format(architecture, optimizer_name, lr, dropout, weight_decay, norm, batch_size_train))

  # Fresh model (moved to the selected device), optimizer and loaders for
  # every configuration. train() and test() read these as globals.
  model = create_model(dropout, norm, architecture)
  model = model.to(device)

  optimizer = create_optimizer(optimizer_name, model, lr, weight_decay, dropout)
  train_loader, test_loader = create_dataloaders(batch_size_train)

  # Metrics tracked per epoch, plus the per-batch training losses
  epoch_train_losses = []
  epoch_test_losses = []
  epoch_test_accuracies = []
  batch_train_losses = []

  for epoch in range(1, n_epochs + 1):
    # train() appends every batch loss to batch_train_losses itself; nothing
    # else must be appended to that list here.
    train_loss = train(epoch, batch_train_losses)
    test_loss, test_accuracy = test() # test() returns test_loss and test_accuracy

    epoch_train_losses.append(train_loss)
    epoch_test_losses.append(test_loss)
    epoch_test_accuracies.append(test_accuracy)

  # Best epoch, reported with the same 1-based numbering as the training loop
  best_test_accuracy = max(epoch_test_accuracies)
  best_epoch_num = epoch_test_accuracies.index(best_test_accuracy) + 1

  # NOTE(review): the row does not record `architecture`. Adding a column
  # would misalign appends to an existing CSV, so it is left out here --
  # confirm how runs of different architectures are meant to be told apart.
  log_experiment({
    "optimizer": optimizer_name,
    "lr": lr,
    "dropout": dropout,
    "weight_decay": weight_decay,
    "normalization": str(norm),
    "batch_size": batch_size_train,
    "epochs": n_epochs,
    "final_train_loss": float(train_loss),
    "final_test_loss": float(test_loss),
    "final_test_accuracy": float(test_accuracy),
    "best_test_accuracy": float(best_test_accuracy),
    "best_epoch_num": best_epoch_num,
    "epoch_train_losses": str(epoch_train_losses),
    "epoch_test_losses": str(epoch_test_losses),
    "epoch_test_accuracies": str(epoch_test_accuracies)
    })

  # Drop the model before the next configuration and release cached GPU memory
  del model
  if torch.cuda.is_available():
    torch.cuda.empty_cache()

  experiment_counter += 1
  print(f"Completed Experiments Count: {experiment_counter} / {tot_experiments}")






Architecture: deep, Optimizer: SGD, Learning Rate: 0.01, Dropout: 0.0, Weight Decay: 0.0, Normalization: None, Batch Size: 64
Train Epoch: 1 [0/60000]	Loss: 2.295907
Train Epoch: 1 [6400/60000]	Loss: 1.237410
Train Epoch: 1 [12800/60000]	Loss: 1.118725
Train Epoch: 1 [19200/60000]	Loss: 0.765961
Train Epoch: 1 [25600/60000]	Loss: 0.979982
Train Epoch: 1 [32000/60000]	Loss: 0.748937
Train Epoch: 1 [38400/60000]	Loss: 0.615618
Train Epoch: 1 [44800/60000]	Loss: 0.807837
Train Epoch: 1 [51200/60000]	Loss: 0.599785
Train Epoch: 1 [57600/60000]	Loss: 0.565243




Test set: Avg. loss: 0.4944, Accuracy: 8121/10000 (81.21%)

Train Epoch: 2 [0/60000]	Loss: 0.695026
Train Epoch: 2 [6400/60000]	Loss: 0.487933
Train Epoch: 2 [12800/60000]	Loss: 0.459877
Train Epoch: 2 [19200/60000]	Loss: 0.615873
Train Epoch: 2 [25600/60000]	Loss: 0.507728
Train Epoch: 2 [32000/60000]	Loss: 0.585532
Train Epoch: 2 [38400/60000]	Loss: 0.467443
Train Epoch: 2 [44800/60000]	Loss: 0.558254
Train Epoch: 2 [51200/60000]	Loss: 0.234831
Train Epoch: 2 [57600/60000]	Loss: 0.418589
Test set: Avg. loss: 0.4111, Accuracy: 8481/10000 (84.81%)

Train Epoch: 3 [0/60000]	Loss: 0.587116
Train Epoch: 3 [6400/60000]	Loss: 0.438131
Train Epoch: 3 [12800/60000]	Loss: 0.551494
Train Epoch: 3 [19200/60000]	Loss: 0.487149
Train Epoch: 3 [25600/60000]	Loss: 0.449212
Train Epoch: 3 [32000/60000]	Loss: 0.366183
Train Epoch: 3 [38400/60000]	Loss: 0.401748
Train Epoch: 3 [44800/60000]	Loss: 0.307020
Train Epoch: 3 [51200/60000]	Loss: 0.385848
Train Epoch: 3 [57600/60000]	Loss: 0.330814
Test set: 