# HW3 Image Classification
## We strongly recommend that you run with [Kaggle](https://www.kaggle.com/t/86ca241732c04da99aca6490080bae73) for this homework

If you have any questions, please contact the TAs via TA hours, NTU COOL, or email to mlta-2023-spring@googlegroups.com

# Check GPU Type

In [1]:
!nvidia-smi

Sat Jan  3 13:21:55 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.14              Driver Version: 550.54.14      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        On  |   00000000:61:00.0 Off |                  Off |
|  0%   24C    P8              2W /  450W |       1MiB /  24564MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

# Import Packages

In [2]:
_exp_name = "sample"

In [3]:
# Import necessary packages.
import numpy as np
import pandas as pd
import torch
import os
import torch.nn as nn
import torchvision.transforms as transforms
from PIL import Image
# "ConcatDataset" and "Subset" are possibly useful when doing semi-supervised learning.
from torch.utils.data import ConcatDataset, DataLoader, Subset, Dataset
from torchvision.datasets import DatasetFolder, VisionDataset
# This is for the progress bar.
from tqdm.auto import tqdm
import random

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
myseed = 6666  # set a random seed for reproducibility
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(myseed)
torch.manual_seed(myseed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(myseed)

# Transforms
Torchvision provides lots of useful utilities for image preprocessing, data *wrapping* as well as data augmentation.

Please refer to PyTorch official website for details about different transforms.

In [5]:
# Normally, We don't need augmentations in testing and validation.
# All we need here is to resize the PIL image and transform it into Tensor.
test_tfm = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor(),
])

# However, it is also possible to use augmentation in the testing phase.
# You may use train_tfm to produce a variety of images and then test using ensemble methods
train_tfm = transforms.Compose([
    transforms.RandomChoice([
        # 弱增强（保留语义）
        transforms.Compose([
            transforms.Resize((128, 128)),
            transforms.RandomHorizontalFlip(0.5),
        ]),

        # 强增强（破坏捷径）
        transforms.Compose([
            transforms.RandomResizedCrop(128, scale=(0.7, 1.0)),
            transforms.RandomRotation(15),
            transforms.ColorJitter(0.2, 0.2, 0.2),
        ]),
    ], p=[0.4, 0.6]),

    transforms.ToTensor(),

    transforms.RandomErasing(
        p=0.25,
        scale=(0.02, 0.15),
        ratio=(0.3, 3.3)
    ),
])

# Datasets
The data is labelled by the name, so we load images and label while calling '__getitem__'

In [6]:
class FoodDataset(Dataset):

    def __init__(self,path,tfm=test_tfm,files = None):
        super(FoodDataset).__init__()
        self.path = path
        self.files = sorted([os.path.join(path,x) for x in os.listdir(path) if x.endswith(".jpg")])
        if files != None:
            self.files = files

        self.transform = tfm

    def __len__(self):
        return len(self.files)

    def __getitem__(self,idx):
        fname = self.files[idx]
        im = Image.open(fname)
        im = self.transform(im)

        try:
            label = int(fname.split("/")[-1].split("_")[0])
        except:
            label = -1 # test has no label

        return im,label

# Model

In [13]:
class Classifier(nn.Module):
    def __init__(self):
        super(Classifier, self).__init__()
        # torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding)
        # torch.nn.MaxPool2d(kernel_size, stride, padding)
        # input 維度 [3, 128, 128]
        self.cnn = nn.Sequential(
            nn.Conv2d(3, 64, 3, 1, 1),  # [64, 128, 128]
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2, 2, 0),      # [64, 64, 64]

            nn.Conv2d(64, 128, 3, 1, 1), # [128, 64, 64]
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(2, 2, 0),      # [128, 32, 32]

            nn.Conv2d(128, 256, 3, 1, 1), # [256, 32, 32]
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(2, 2, 0),      # [256, 16, 16]

            
            nn.Conv2d(256, 512, 3, 1, 1), # [512, 16, 16]
            nn.BatchNorm2d(512),
            nn.ReLU(),
            nn.MaxPool2d(2, 2, 0),       # [512, 8, 8]

            nn.Conv2d(512, 512, 3, 1, 1), # [512, 8, 8]
            nn.BatchNorm2d(512),
            nn.ReLU(),
        
            nn.AdaptiveMaxPool2d((4, 4)) # [512, 4, 4]
        )
        self.fc = nn.Sequential(
            nn.Dropout(0.3),
            nn.Linear(512*4*4, 1024),
            nn.BatchNorm1d(1024),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(1024, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 11)
        )

    def forward(self, x):
        out = self.cnn(x)
        out = out.view(out.size()[0], -1)
        return self.fc(out)

# Configurations

In [14]:
# "cuda" only when GPUs are available.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Initialize a model, and put it on the device specified.
model = Classifier().to(device)

# The number of batch size.
batch_size = 64

# The number of training epochs.
n_epochs = 100

# If no improvement in 'patience' epochs, early stop.
patience = 4

# For the classification task, we use cross-entropy as the measurement of performance.
criterion = nn.CrossEntropyLoss()

# Initialize optimizer, you may fine-tune some hyperparameters such as learning rate on your own.
optimizer = torch.optim.Adam(model.parameters(), lr=0.0003, weight_decay=1e-5)

# Dataloader

In [15]:
# Construct train and valid datasets.
# The argument "loader" tells how torchvision reads the data.
train_set = FoodDataset("./train", tfm=train_tfm)
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=0, pin_memory=True)
valid_set = FoodDataset("./valid", tfm=test_tfm)
valid_loader = DataLoader(valid_set, batch_size=batch_size, shuffle=True, num_workers=0, pin_memory=True)

# Start Training

In [16]:
# Initialize trackers, these are not parameters and should not be changed
stale = 0
best_acc = 0

for epoch in range(n_epochs):

    # ---------- Training ----------
    # Make sure the model is in train mode before training.
    model.train()

    # These are used to record information in training.
    train_loss = []
    train_accs = []

    for batch in tqdm(train_loader):

        # A batch consists of image data and corresponding labels.
        imgs, labels = batch
        #imgs = imgs.half()
        #print(imgs.shape,labels.shape)

        # Forward the data. (Make sure data and model are on the same device.)
        logits = model(imgs.to(device))

        # Calculate the cross-entropy loss.
        # We don't need to apply softmax before computing cross-entropy as it is done automatically.
        loss = criterion(logits, labels.to(device))

        # Gradients stored in the parameters in the previous step should be cleared out first.
        optimizer.zero_grad()

        # Compute the gradients for parameters.
        loss.backward()

        # Clip the gradient norms for stable training.
        grad_norm = nn.utils.clip_grad_norm_(model.parameters(), max_norm=10)

        # Update the parameters with computed gradients.
        optimizer.step()

        # Compute the accuracy for current batch.
        acc = (logits.argmax(dim=-1) == labels.to(device)).float().mean()

        # Record the loss and accuracy.
        train_loss.append(loss.item())
        train_accs.append(acc)

    train_loss = sum(train_loss) / len(train_loss)
    train_acc = sum(train_accs) / len(train_accs)

    # Print the information.
    print(f"[ Train | {epoch + 1:03d}/{n_epochs:03d} ] loss = {train_loss:.5f}, acc = {train_acc:.5f}")

    # ---------- Validation ----------
    # Make sure the model is in eval mode so that some modules like dropout are disabled and work normally.
    model.eval()

    # These are used to record information in validation.
    valid_loss = []
    valid_accs = []

    # Iterate the validation set by batches.
    for batch in tqdm(valid_loader):

        # A batch consists of image data and corresponding labels.
        imgs, labels = batch
        #imgs = imgs.half()

        # We don't need gradient in validation.
        # Using torch.no_grad() accelerates the forward process.
        with torch.no_grad():
            logits = model(imgs.to(device))

        # We can still compute the loss (but not the gradient).
        loss = criterion(logits, labels.to(device))

        # Compute the accuracy for current batch.
        acc = (logits.argmax(dim=-1) == labels.to(device)).float().mean()

        # Record the loss and accuracy.
        valid_loss.append(loss.item())
        valid_accs.append(acc)
        #break

    # The average loss and accuracy for entire validation set is the average of the recorded values.
    valid_loss = sum(valid_loss) / len(valid_loss)
    valid_acc = sum(valid_accs) / len(valid_accs)

    # Print the information.
    print(f"[ Valid | {epoch + 1:03d}/{n_epochs:03d} ] loss = {valid_loss:.5f}, acc = {valid_acc:.5f}")


    # update logs
    if valid_acc > best_acc:
        with open(f"./{_exp_name}_log.txt","a"):
            print(f"[ Valid | {epoch + 1:03d}/{n_epochs:03d} ] loss = {valid_loss:.5f}, acc = {valid_acc:.5f} -> best")
    else:
        with open(f"./{_exp_name}_log.txt","a"):
            print(f"[ Valid | {epoch + 1:03d}/{n_epochs:03d} ] loss = {valid_loss:.5f}, acc = {valid_acc:.5f}")


    # save models
    if valid_acc > best_acc:
        print(f"Best model found at epoch {epoch}, saving model")
        torch.save(model.state_dict(), f"{_exp_name}_best.ckpt") # only save best to prevent output memory exceed error
        best_acc = valid_acc
        stale = 0
    else:
        stale += 1
        if stale > patience:
            print(f"No improvment {patience} consecutive epochs, early stopping")
            break

100%|██████████| 157/157 [07:07<00:00,  2.72s/it]


[ Train | 001/100 ] loss = 1.89546, acc = 0.33609


100%|██████████| 57/57 [02:16<00:00,  2.40s/it]


[ Valid | 001/100 ] loss = 1.73554, acc = 0.40667
[ Valid | 001/100 ] loss = 1.73554, acc = 0.40667 -> best
Best model found at epoch 0, saving model


100%|██████████| 157/157 [05:18<00:00,  2.03s/it]


[ Train | 002/100 ] loss = 1.58237, acc = 0.45850


100%|██████████| 57/57 [01:41<00:00,  1.77s/it]


[ Valid | 002/100 ] loss = 1.60280, acc = 0.45012
[ Valid | 002/100 ] loss = 1.60280, acc = 0.45012 -> best
Best model found at epoch 1, saving model


100%|██████████| 157/157 [05:16<00:00,  2.02s/it]


[ Train | 003/100 ] loss = 1.40651, acc = 0.51742


100%|██████████| 57/57 [01:42<00:00,  1.79s/it]


[ Valid | 003/100 ] loss = 1.42156, acc = 0.50931
[ Valid | 003/100 ] loss = 1.42156, acc = 0.50931 -> best
Best model found at epoch 2, saving model


100%|██████████| 157/157 [05:01<00:00,  1.92s/it]


[ Train | 004/100 ] loss = 1.28417, acc = 0.55822


100%|██████████| 57/57 [00:55<00:00,  1.03it/s]


[ Valid | 004/100 ] loss = 1.39687, acc = 0.52039
[ Valid | 004/100 ] loss = 1.39687, acc = 0.52039 -> best
Best model found at epoch 3, saving model


100%|██████████| 157/157 [04:54<00:00,  1.88s/it]


[ Train | 005/100 ] loss = 1.17607, acc = 0.60420


100%|██████████| 57/57 [01:47<00:00,  1.88s/it]


[ Valid | 005/100 ] loss = 1.25354, acc = 0.58239
[ Valid | 005/100 ] loss = 1.25354, acc = 0.58239 -> best
Best model found at epoch 4, saving model


100%|██████████| 157/157 [05:39<00:00,  2.16s/it]


[ Train | 006/100 ] loss = 1.10415, acc = 0.62500


100%|██████████| 57/57 [01:39<00:00,  1.75s/it]


[ Valid | 006/100 ] loss = 1.31176, acc = 0.56297
[ Valid | 006/100 ] loss = 1.31176, acc = 0.56297


100%|██████████| 157/157 [05:44<00:00,  2.19s/it]


[ Train | 007/100 ] loss = 1.02589, acc = 0.64819


100%|██████████| 57/57 [01:44<00:00,  1.84s/it]


[ Valid | 007/100 ] loss = 1.22496, acc = 0.58942
[ Valid | 007/100 ] loss = 1.22496, acc = 0.58942 -> best
Best model found at epoch 6, saving model


100%|██████████| 157/157 [05:35<00:00,  2.13s/it]


[ Train | 008/100 ] loss = 0.95761, acc = 0.67138


100%|██████████| 57/57 [01:43<00:00,  1.82s/it]


[ Valid | 008/100 ] loss = 1.06624, acc = 0.64078
[ Valid | 008/100 ] loss = 1.06624, acc = 0.64078 -> best
Best model found at epoch 7, saving model


100%|██████████| 157/157 [05:31<00:00,  2.11s/it]


[ Train | 009/100 ] loss = 0.90112, acc = 0.69078


100%|██████████| 57/57 [01:38<00:00,  1.73s/it]


[ Valid | 009/100 ] loss = 1.08532, acc = 0.63356
[ Valid | 009/100 ] loss = 1.08532, acc = 0.63356


100%|██████████| 157/157 [05:27<00:00,  2.09s/it]


[ Train | 010/100 ] loss = 0.84514, acc = 0.71139


100%|██████████| 57/57 [01:48<00:00,  1.90s/it]


[ Valid | 010/100 ] loss = 1.03841, acc = 0.65507
[ Valid | 010/100 ] loss = 1.03841, acc = 0.65507 -> best
Best model found at epoch 9, saving model


100%|██████████| 157/157 [03:50<00:00,  1.47s/it]


[ Train | 011/100 ] loss = 0.81668, acc = 0.72084


100%|██████████| 57/57 [01:38<00:00,  1.74s/it]


[ Valid | 011/100 ] loss = 1.36280, acc = 0.57318
[ Valid | 011/100 ] loss = 1.36280, acc = 0.57318


100%|██████████| 157/157 [05:16<00:00,  2.02s/it]


[ Train | 012/100 ] loss = 0.78222, acc = 0.73656


100%|██████████| 57/57 [01:44<00:00,  1.83s/it]


[ Valid | 012/100 ] loss = 1.20257, acc = 0.59970
[ Valid | 012/100 ] loss = 1.20257, acc = 0.59970


100%|██████████| 157/157 [05:31<00:00,  2.11s/it]


[ Train | 013/100 ] loss = 0.73637, acc = 0.74891


100%|██████████| 57/57 [01:47<00:00,  1.88s/it]


[ Valid | 013/100 ] loss = 1.16024, acc = 0.62750
[ Valid | 013/100 ] loss = 1.16024, acc = 0.62750


100%|██████████| 157/157 [05:46<00:00,  2.21s/it]


[ Train | 014/100 ] loss = 0.68649, acc = 0.76592


100%|██████████| 57/57 [01:39<00:00,  1.74s/it]


[ Valid | 014/100 ] loss = 0.96721, acc = 0.68270
[ Valid | 014/100 ] loss = 0.96721, acc = 0.68270 -> best
Best model found at epoch 13, saving model


100%|██████████| 157/157 [05:47<00:00,  2.21s/it]


[ Train | 015/100 ] loss = 0.66187, acc = 0.77339


100%|██████████| 57/57 [01:41<00:00,  1.79s/it]


[ Valid | 015/100 ] loss = 1.01348, acc = 0.66415
[ Valid | 015/100 ] loss = 1.01348, acc = 0.66415


100%|██████████| 157/157 [05:38<00:00,  2.15s/it]


[ Train | 016/100 ] loss = 0.61741, acc = 0.78911


100%|██████████| 57/57 [01:44<00:00,  1.82s/it]


[ Valid | 016/100 ] loss = 1.07196, acc = 0.66360
[ Valid | 016/100 ] loss = 1.07196, acc = 0.66360


100%|██████████| 157/157 [05:32<00:00,  2.12s/it]


[ Train | 017/100 ] loss = 0.61609, acc = 0.78752


100%|██████████| 57/57 [01:18<00:00,  1.39s/it]


[ Valid | 017/100 ] loss = 1.01055, acc = 0.68377
[ Valid | 017/100 ] loss = 1.01055, acc = 0.68377 -> best
Best model found at epoch 16, saving model


100%|██████████| 157/157 [04:22<00:00,  1.67s/it]


[ Train | 018/100 ] loss = 0.55188, acc = 0.81459


100%|██████████| 57/57 [01:40<00:00,  1.77s/it]


[ Valid | 018/100 ] loss = 0.98782, acc = 0.67881
[ Valid | 018/100 ] loss = 0.98782, acc = 0.67881


100%|██████████| 157/157 [05:27<00:00,  2.09s/it]


[ Train | 019/100 ] loss = 0.55081, acc = 0.80693


100%|██████████| 57/57 [01:40<00:00,  1.76s/it]


[ Valid | 019/100 ] loss = 1.08206, acc = 0.66874
[ Valid | 019/100 ] loss = 1.08206, acc = 0.66874


100%|██████████| 157/157 [05:35<00:00,  2.14s/it]


[ Train | 020/100 ] loss = 0.52640, acc = 0.81648


100%|██████████| 57/57 [01:38<00:00,  1.73s/it]


[ Valid | 020/100 ] loss = 0.96641, acc = 0.71011
[ Valid | 020/100 ] loss = 0.96641, acc = 0.71011 -> best
Best model found at epoch 19, saving model


100%|██████████| 157/157 [05:57<00:00,  2.27s/it]


[ Train | 021/100 ] loss = 0.49906, acc = 0.82793


100%|██████████| 57/57 [01:45<00:00,  1.84s/it]


[ Valid | 021/100 ] loss = 0.90843, acc = 0.72117
[ Valid | 021/100 ] loss = 0.90843, acc = 0.72117 -> best
Best model found at epoch 20, saving model


100%|██████████| 157/157 [05:40<00:00,  2.17s/it]


[ Train | 022/100 ] loss = 0.46573, acc = 0.83539


100%|██████████| 57/57 [01:43<00:00,  1.82s/it]


[ Valid | 022/100 ] loss = 0.89871, acc = 0.71833
[ Valid | 022/100 ] loss = 0.89871, acc = 0.71833


100%|██████████| 157/157 [05:41<00:00,  2.18s/it]


[ Train | 023/100 ] loss = 0.45667, acc = 0.84544


100%|██████████| 57/57 [01:44<00:00,  1.84s/it]


[ Valid | 023/100 ] loss = 0.90542, acc = 0.71902
[ Valid | 023/100 ] loss = 0.90542, acc = 0.71902


100%|██████████| 157/157 [04:20<00:00,  1.66s/it]


[ Train | 024/100 ] loss = 0.43943, acc = 0.85320


100%|██████████| 57/57 [01:11<00:00,  1.26s/it]


[ Valid | 024/100 ] loss = 1.10024, acc = 0.67011
[ Valid | 024/100 ] loss = 1.10024, acc = 0.67011


100%|██████████| 157/157 [05:37<00:00,  2.15s/it]


[ Train | 025/100 ] loss = 0.41966, acc = 0.85430


100%|██████████| 57/57 [01:44<00:00,  1.83s/it]


[ Valid | 025/100 ] loss = 0.92879, acc = 0.71861
[ Valid | 025/100 ] loss = 0.92879, acc = 0.71861


100%|██████████| 157/157 [05:51<00:00,  2.24s/it]


[ Train | 026/100 ] loss = 0.40034, acc = 0.86624


100%|██████████| 57/57 [01:43<00:00,  1.81s/it]

[ Valid | 026/100 ] loss = 1.03209, acc = 0.69681
[ Valid | 026/100 ] loss = 1.03209, acc = 0.69681
No improvment 4 consecutive epochs, early stopping





# Dataloader for test

In [17]:
# Construct test datasets.
# The argument "loader" tells how torchvision reads the data.
test_set = FoodDataset("./test", tfm=test_tfm)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=True)

# Testing and generate prediction CSV

In [18]:
model_best = Classifier().to(device)
model_best.load_state_dict(torch.load(f"{_exp_name}_best.ckpt"))
model_best.eval()
prediction = []
with torch.no_grad():
    for data,_ in tqdm(test_loader):
        test_pred = model_best(data.to(device))
        test_label = np.argmax(test_pred.cpu().data.numpy(), axis=1)
        prediction += test_label.squeeze().tolist()

100%|██████████| 47/47 [02:31<00:00,  3.21s/it]


In [19]:
# create test csv
def pad4(i):
    return "0"*(4-len(str(i)))+str(i)
df = pd.DataFrame()
df["Id"] = [pad4(i) for i in range(len(test_set))]
df["Category"] = prediction
df.to_csv("submission.csv",index = False)

# Q1. Augmentation Implementation
## Implement augmentation by finishing train_tfm in the code with image size of your choice.
## Directly copy the following block and paste it on GradeScope after you finish the code
### Your train_tfm must be capable of producing 5+ different results when given an identical image multiple times.
### Your  train_tfm in the report can be different from train_tfm in your training code.


In [None]:
train_tfm = transforms.Compose([
    # Resize the image into a fixed shape (height = width = 128)
    transforms.Resize((128, 128)),
    # You can add some transforms here.
    transforms.ToTensor(),
])

# Q2. Visual Representations Implementation
## Visualize the learned visual representations of the CNN model on the validation set by implementing t-SNE (t-distributed Stochastic Neighbor Embedding) on the output of both top & mid layers (You need to submit 2 images).


In [None]:
import torch
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from tqdm import tqdm
import matplotlib.cm as cm
import torch.nn as nn

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the trained model
model = Classifier().to(device)
state_dict = torch.load(f"{_exp_name}_best.ckpt")
model.load_state_dict(state_dict)
model.eval()

print(model)

In [None]:
# Load the vaildation set defined by TA
valid_set = FoodDataset("./valid", tfm=test_tfm)
valid_loader = DataLoader(valid_set, batch_size=64, shuffle=False, num_workers=0, pin_memory=True)

# Extract the representations for the specific layer of model
index = ... # You should find out the index of layer which is defined as "top" or 'mid' layer of your model.
features = []
labels = []
for batch in tqdm(valid_loader):
    imgs, lbls = batch
    with torch.no_grad():
        logits = model.cnn[:index](imgs.to(device))
        logits = logits.view(logits.size()[0], -1)
    labels.extend(lbls.cpu().numpy())
    logits = np.squeeze(logits.cpu().numpy())
    features.extend(logits)

features = np.array(features)
colors_per_class = cm.rainbow(np.linspace(0, 1, 11))

# Apply t-SNE to the features
features_tsne = TSNE(n_components=2, init='pca', random_state=42).fit_transform(features)

# Plot the t-SNE visualization
plt.figure(figsize=(10, 8))
for label in np.unique(labels):
    plt.scatter(features_tsne[labels == label, 0], features_tsne[labels == label, 1], label=label, s=5)
plt.legend()
plt.show()

plt.figure(figsize=(10, 8))
labels = [0]
for label in np.unique(labels):
    plt.scatter(features_tsne[labels == label, 0], features_tsne[labels == label, 1], label=label, s=5)
plt.legend()
plt.show()