In [1]:
import numpy as np
import pandas as pd
import os.path as osp
from PIL import Image
import matplotlib.pyplot as plt
import os
import csv
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torchvision import transforms
from torch.utils.data import Dataset
from torch.utils.data import DataLoader, random_split, SubsetRandomSampler, WeightedRandomSampler, Subset
from torch.optim.lr_scheduler import OneCycleLR
from torchvision.transforms import RandAugment, RandomErasing
from collections import Counter

# For reproducibility
def set_seed(seed, use_gpu=True):
    """Seed every random number generator used by the notebook.

    Args:
        seed: Integer seed handed to ``random``, NumPy and PyTorch.
        use_gpu: When true, the CUDA generators are seeded as well and cuDNN
            is switched to deterministic, non-benchmarking mode.
    """
    # CPU-side generators: Python's `random`, NumPy's global state, PyTorch.
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    if not use_gpu:
        return

    # GPU-side generators, followed by the cuDNN determinism switches.
    torch.cuda.manual_seed_all(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Global seeding switch: set USE_SEED to False to skip seeding entirely.
SEED = 123
USE_SEED = True
if USE_SEED:
    # The CUDA generators are only seeded when a GPU is actually present.
    set_seed(SEED, use_gpu=torch.cuda.is_available())

**Data Loading**

In [2]:
class ImageDataset(Dataset):
    """Dataset of JPEG images stored as ``<root>/images/<id>.jpeg``.

    In training mode (``test=False``) the ids and integer labels are read from
    ``<root>/labels.csv`` (columns ``id`` and ``label``) and items are
    ``(image, label)`` pairs. In test mode (``test=True``) the ids come from
    the ``.jpeg`` file names found in ``<root>/images`` (sorted by name) and
    items are ``(image, id)`` pairs.

    Attributes are documented inline in ``__init__``.
    """

    # File extension used both for discovery (test mode) and for loading.
    _IMAGE_SUFFIX = '.jpeg'

    def __init__(self, root: str, test: bool = False, transform=None):
        """Index the dataset without loading any image.

        Args:
            root: Directory holding ``images/`` and, in training mode,
                ``labels.csv``.
            test: When true, no labels are read and ids come from file names.
            transform: Callable applied to each RGB PIL image; falls back to
                ``transforms.ToTensor()`` when omitted (or falsy).
        """
        super().__init__()
        self.root = root
        self.transform = transform or transforms.Compose([
            transforms.ToTensor(),
        ])
        self.test = test

        self.img_path = osp.join(root, 'images')
        self.targets = []  # integer labels, index-aligned with `ids` (empty in test mode)
        self.ids = []      # image ids, zero-padded to at least 5 characters

        if not test:
            labels_path = osp.join(root, 'labels.csv')
            # newline='' lets the csv module do its own newline handling, as
            # its documentation requires for files passed to a reader.
            with open(labels_path, 'r', newline='') as csvfile:
                for row in csv.DictReader(csvfile):
                    self.ids.append(row['id'].zfill(5))
                    self.targets.append(int(row['label']))
        else:
            # Test mode: no labels.csv, ids are derived from the file names.
            suffix = self._IMAGE_SUFFIX
            for fname in sorted(os.listdir(self.img_path)):
                if fname.endswith(suffix):
                    # NOTE(review): zfill is a no-op for stems of 5+ characters;
                    # a shorter stem would be padded and then no longer match
                    # the file name rebuilt in __getitem__ — confirm naming.
                    self.ids.append(fname[:-len(suffix)].zfill(5))

    def __getitem__(self, index: int):
        """Load, convert to RGB and transform the image at ``index``.

        Returns:
            ``(image, id)`` in test mode, ``(image, label)`` otherwise.
        """
        img_id = self.ids[index]
        img_file = osp.join(self.img_path, f'{img_id}{self._IMAGE_SUFFIX}')
        # The context manager closes the underlying file deterministically;
        # `convert` returns an independent in-memory copy.
        with Image.open(img_file) as raw_image:
            img = raw_image.convert('RGB')

        # `transform` may have been reassigned to None after construction.
        if self.transform is not None:
            img = self.transform(img)

        if self.test:
            return img, img_id
        return img, self.targets[index]

    def __len__(self) -> int:
        """Number of indexed images."""
        return len(self.ids)

In [3]:
datasets_dir = '/kaggle/input/unipd-deep-learning-2025-challenge-1/'
train_root = osp.join(datasets_dir, 'train_dataset')
test_root = osp.join(datasets_dir, 'test_dataset')

# Two views of the labelled data (one for training, one for validation) so
# that each can later carry its own transform; plus the unlabelled test set.
train_dataset = ImageDataset(train_root, test=False)
test_dataset = ImageDataset(test_root, test=True)
train_dataset_for_val = ImageDataset(train_root, test=False)

print(f"Train dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

# Class distribution, counted straight from the label file.
labels_path = osp.join(train_root, 'labels.csv')
with open(labels_path, 'r') as csvfile:
    label_counter = Counter(int(row['label']) for row in csv.DictReader(csvfile))

print("Class distribution:")
for label in sorted(label_counter):
    print(f"Class {label:2d}: {label_counter[label]} samples")

Train dataset size: 22430
Test dataset size: 4000
Class distribution:
Class  0: 1300 samples
Class  1: 1300 samples
Class  2: 1300 samples
Class  3: 1300 samples
Class  4: 1300 samples
Class  5: 755 samples
Class  6: 1300 samples
Class  7: 658 samples
Class  8: 1300 samples
Class  9: 1300 samples
Class 10: 1300 samples
Class 11: 1300 samples
Class 12: 1300 samples
Class 13: 756 samples
Class 14: 1300 samples
Class 15: 550 samples
Class 16: 751 samples
Class 17: 1300 samples
Class 18: 1300 samples
Class 19: 760 samples


**Data Split and Transformations**

In [4]:
# Transformations and data augmentation
INPUT_SIZE = (128, 128)
# Per-channel normalisation statistics (presumably the usual ImageNet values — confirm).
NORM_MEAN = [0.485, 0.456, 0.406]
NORM_STD = [0.229, 0.224, 0.225]


def _make_eval_transforms():
    """Deterministic pipeline: resize, convert to tensor, normalise."""
    return transforms.Compose([
        transforms.Resize(INPUT_SIZE),
        transforms.ToTensor(),
        transforms.Normalize(mean=NORM_MEAN, std=NORM_STD),
    ])


# Training pipeline: geometric augmentations and RandAugment act on the PIL
# image; random erasing acts on the tensor, before normalisation.
# (ColorJitter and GaussianBlur were present but disabled in this pipeline.)
train_transforms = transforms.Compose([
    transforms.Resize(INPUT_SIZE),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(degrees=20),
    transforms.RandomAffine(degrees=0, translate=(0.1, 0.1), scale=(0.9, 1.1)),
    transforms.RandomPerspective(distortion_scale=0.2, p=0.5),
    RandAugment(num_ops=2, magnitude=9),
    transforms.ToTensor(),
    RandomErasing(p=0.5, scale=(0.02, 0.33), ratio=(0.3, 3.3), value='random'),
    transforms.Normalize(mean=NORM_MEAN, std=NORM_STD),
])

val_transforms = _make_eval_transforms()
test_transforms = _make_eval_transforms()

# Attach each pipeline to its dataset (they were built with the default transform).
train_dataset.transform = train_transforms
train_dataset_for_val.transform = val_transforms
test_dataset.transform = test_transforms

In [5]:
# Divide the labelled data into a training set (75%) and a validation set (25%).
total_size = len(train_dataset)
train_size = int(total_size * 0.75)
val_size = total_size - train_size
generator = torch.Generator().manual_seed(42)

# random_split over range(total_size) is used only to draw a seeded
# permutation. Keep the result as plain integer lists, so that
# `train_set.indices` / `val_set.indices` are ordinary sequences rather than
# nested Subset objects that can only be iterated via the __getitem__ fallback.
train_split, val_split = random_split(range(total_size), [train_size, val_size], generator=generator)
train_indices = list(train_split.indices)
val_indices = list(val_split.indices)
assert len(train_indices) + len(val_indices) == total_size

# Both subsets index the same files; they differ only in the dataset object
# they wrap, and therefore in the transform that is applied.
train_set = Subset(train_dataset, train_indices)
val_set   = Subset(train_dataset_for_val, val_indices)

print(f"Size training set: {len(train_set)}")
print(f"Size validation set: {len(val_set)}")

Size training set: 16822
Size validation set: 5608


In [6]:
# Labels of the samples that ended up in the training subset.
subset_targets = [train_dataset.targets[sample_idx] for sample_idx in train_set.indices]
class_counts = np.bincount(np.asarray(subset_targets))
print("Class counts:", class_counts)

# Inverse class frequency: rarer classes receive larger weights.
class_weights = 1. / class_counts
print("Class weights:", class_weights)

# One weight per training sample (looked up by its label); the sampler draws
# len(train_set) samples per epoch, with replacement.
sample_weights = class_weights[np.asarray(subset_targets)].tolist()
train_sampler = WeightedRandomSampler(
    weights=sample_weights,
    num_samples=len(sample_weights),
    replacement=True,
)

Class counts: [ 982  964  972  958  977  558  984  490  997  996 1006  959  949  575
  961  417  570  958  987  562]
Class weights: [0.00101833 0.00103734 0.00102881 0.00104384 0.00102354 0.00179211
 0.00101626 0.00204082 0.00100301 0.00100402 0.00099404 0.00104275
 0.00105374 0.00173913 0.00104058 0.00239808 0.00175439 0.00104384
 0.00101317 0.00177936]


In [7]:
# Create data loaders
batch_size = 64
_loader_options = dict(batch_size=batch_size, num_workers=2)

# Training batches are drawn by the weighted sampler; validation and test
# batches keep the dataset order.
train_loader = DataLoader(train_set, sampler=train_sampler, **_loader_options)
val_loader = DataLoader(val_set, shuffle=False, **_loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **_loader_options)

# Check the shape of a sample
sample_image, sample_label = next(iter(train_loader))
print(f"Batch image shape: {sample_image.shape}")
print(f"Batch label shape: {sample_label.shape}")

Batch image shape: torch.Size([64, 3, 128, 128])
Batch label shape: torch.Size([64])


**Model Definition**

In [9]:
class ImageClassifier(nn.Module):
    """Convolutional image classifier with five double-convolution stages.

    Each stage is Conv3x3 -> BN -> ReLU -> Dropout2d(0.2) -> Conv3x3 -> BN ->
    ReLU -> MaxPool(2); the 3x3 convolutions use padding 1, so only the pooling
    halves the spatial size. For the 128x128 inputs produced by the notebook's
    Resize transform the feature maps go
    128 -> 64 -> 32 -> 16 -> 8 -> 4 with 64, 128, 256, 512, 512 channels.
    A 1x1 bottleneck then reduces 512 -> 256 channels, global average pooling
    collapses the spatial dimensions, and a small MLP head emits the logits.

    Args:
        num_classes: Size of the output logit vector.
    """

    # (in_channels, out_channels) of the five feature stages, in order.
    _STAGE_CHANNELS = ((3, 64), (64, 128), (128, 256), (256, 512), (512, 512))

    @staticmethod
    def _double_conv_stage(in_channels, out_channels, drop_p=0.2):
        """Return the eight layers of one feature stage as a flat list."""
        return [
            nn.Conv2d(in_channels, out_channels, 3, padding=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.Dropout2d(drop_p),
            nn.Conv2d(out_channels, out_channels, 3, padding=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),
        ]

    def __init__(self, num_classes=20):
        super().__init__()

        # The stages are flattened into one Sequential so the layers keep the
        # indices features.0 ... features.39.
        stage_layers = []
        for c_in, c_out in self._STAGE_CHANNELS:
            stage_layers.extend(self._double_conv_stage(c_in, c_out))
        self.features = nn.Sequential(*stage_layers)

        # Bottleneck: 1x1 convolution reducing 512 -> 256 channels.
        self.bottleneck = nn.Sequential(
            nn.Conv2d(512, 256, 1, bias=False),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
        )
        # Global average pooling: 256 x H x W -> 256 x 1 x 1.
        self.pool = nn.AdaptiveAvgPool2d((1, 1))

        # Classification head: 256 -> 128 -> num_classes.
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Dropout(0.4),
            nn.Linear(256, 128, bias=False),
            nn.BatchNorm1d(128),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(128, num_classes),
        )

    def forward(self, x):
        """Map a batch of images to a batch of class logits."""
        feature_maps = self.features(x)
        reduced = self.bottleneck(feature_maps)
        pooled = self.pool(reduced)
        return self.classifier(pooled)

In [10]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model = ImageClassifier(num_classes=20)
model = model.to(device)

# Loss: class-weighted cross-entropy with label smoothing.
# NOTE(review): `class_weights` also drives the WeightedRandomSampler used by
# the training loader, so class imbalance is compensated twice during
# training — confirm this is intended.
class_weights_tensor = torch.as_tensor(class_weights, dtype=torch.float32).to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights_tensor, label_smoothing=0.08)

optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.0001)

# One-cycle schedule stepped once per batch: 10% warm-up, cosine annealing.
# NOTE(review): `epochs` here must stay equal to `num_epochs` in the training
# loop cell; the two values are maintained separately.
scheduler = OneCycleLR(
    optimizer,
    max_lr=0.003,
    epochs=150,
    steps_per_epoch=len(train_loader),
    pct_start=0.1,
    anneal_strategy='cos',
    cycle_momentum=False,
)

Using device: cuda


**Training**

In [11]:
# Accuracy
def calculate_accuracy(y_pred, y_true):
    """Return the fraction of rows of ``y_pred`` whose arg-max equals ``y_true``.

    Args:
        y_pred: 2-D tensor of per-class scores, one row per sample.
        y_true: 1-D tensor of integer class labels.

    Returns:
        Accuracy as a Python float (number correct divided by ``len(y_true)``).
    """
    matches = torch.eq(y_pred.argmax(dim=1), y_true)
    return matches.sum().item() / y_true.shape[0]

In [12]:
#train function

def train(model, dataloader, criterion, optimizer, scheduler, device):
    """Run one training epoch and report the epoch loss and accuracy.

    For every batch: zero the gradients, run the forward pass, back-propagate,
    then step the optimizer and the scheduler (the scheduler is therefore
    advanced once per batch, not once per epoch).

    Args:
        model: Network to optimise; switched to training mode.
        dataloader: Iterable of ``(inputs, labels)`` batches.
        criterion: Loss taking ``(outputs, labels)``.
        optimizer: Optimizer updating ``model``'s parameters.
        scheduler: Learning-rate scheduler stepped after every batch.
        device: Device the batches are moved to.

    Returns:
        ``(epoch_loss, epoch_acc)``: the sample-weighted mean loss as a float
        and the accuracy as a 0-dim float64 tensor.
    """
    model.train()
    loss_sum = 0.0
    hits = 0
    seen = 0

    for batch_inputs, batch_labels in dataloader:
        batch_inputs = batch_inputs.to(device)
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()

        logits = model(batch_inputs)
        batch_loss = criterion(logits, batch_labels)

        batch_loss.backward()
        optimizer.step()
        scheduler.step()

        # The criterion result is scaled by the batch size so that a smaller
        # last batch does not skew the epoch average.
        loss_sum += batch_loss.item() * batch_inputs.size(0)
        predicted = logits.max(dim=1).indices
        hits += (predicted == batch_labels).sum()
        seen += batch_labels.size(0)

    # Compute loss and accuracy over the whole epoch.
    return loss_sum / seen, hits.double() / seen

In [13]:
#Evaluate function
def evaluate(model, dataloader, criterion, device):
    """Evaluate ``model`` on ``dataloader`` without tracking gradients.

    Args:
        model: Network to evaluate; switched to evaluation mode.
        dataloader: Iterable of ``(inputs, labels)`` batches.
        criterion: Loss taking ``(outputs, labels)``.
        device: Device the batches are moved to.

    Returns:
        ``(epoch_loss, epoch_acc)``: the sample-weighted mean loss as a float
        and the accuracy as a 0-dim float64 tensor.
    """
    model.eval()
    loss_sum = 0.0
    hits = 0
    seen = 0

    with torch.no_grad():
        for batch_inputs, batch_labels in dataloader:
            batch_inputs = batch_inputs.to(device)
            batch_labels = batch_labels.to(device)

            logits = model(batch_inputs)
            batch_loss = criterion(logits, batch_labels)

            # Scale by the batch size so the epoch mean is per sample.
            loss_sum += batch_loss.item() * batch_inputs.size(0)
            predicted = logits.max(dim=1).indices
            hits += (predicted == batch_labels).sum()
            seen += batch_labels.size(0)

    return loss_sum / seen, hits.double() / seen

In [14]:
# Training budget and early-stopping state.
num_epochs = 150
patience = 10                  # epochs without validation-loss improvement tolerated
best_val_loss = float('inf')
epochs_no_improve = 0

for epoch in range(num_epochs):
    # One pass over the training data, then a pass over the validation data.
    train_loss, train_acc = train(model, train_loader, criterion, optimizer, scheduler, device)
    val_loss, val_acc = evaluate(model, val_loader, criterion, device)

    # Print results by epoch
    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"Train Loss: {train_loss:.4f}  Train Acc: {train_acc:.4f}")
    print(f"Val Loss:   {val_loss:.4f}  Val Acc:   {val_acc:.4f}")

    # Checkpoint whenever the validation loss reaches a new minimum;
    # otherwise count the epoch towards early stopping.
    improved = val_loss < best_val_loss
    if improved:
        best_val_loss = val_loss
        torch.save(model.state_dict(), "best_model.pth")
        print(f"Best model saved at epoch {epoch + 1}\n")
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1

    if epochs_no_improve >= patience:
        print(f"Early stopping at epoch {epoch + 1}")
        break

Epoch 1/150
Train Loss: 3.0017  Train Acc: 0.0735
Val Loss:   2.8977  Val Acc:   0.0774
Best model saved at epoch 1

Epoch 2/150
Train Loss: 2.8441  Train Acc: 0.1049
Val Loss:   2.7380  Val Acc:   0.1123
Best model saved at epoch 2

Epoch 3/150
Train Loss: 2.7538  Train Acc: 0.1203
Val Loss:   2.6889  Val Acc:   0.1261
Best model saved at epoch 3

Epoch 4/150
Train Loss: 2.6788  Train Acc: 0.1367
Val Loss:   2.5703  Val Acc:   0.1862
Best model saved at epoch 4

Epoch 5/150
Train Loss: 2.5447  Train Acc: 0.1859
Val Loss:   2.2984  Val Acc:   0.2564
Best model saved at epoch 5

Epoch 6/150
Train Loss: 2.4091  Train Acc: 0.2157
Val Loss:   2.2554  Val Acc:   0.2316
Best model saved at epoch 6

Epoch 7/150
Train Loss: 2.3276  Train Acc: 0.2429
Val Loss:   2.2172  Val Acc:   0.2853
Best model saved at epoch 7

Epoch 8/150
Train Loss: 2.2765  Train Acc: 0.2560
Val Loss:   2.1500  Val Acc:   0.3047
Best model saved at epoch 8

Epoch 9/150
Train Loss: 2.2581  Train Acc: 0.2644
Val Loss:   2.

**Test Predictions**

In [15]:
# Load the best saved model once, before predicting.
# map_location keeps the load working even if the checkpoint was written on a
# different device than the one in use now.
model.load_state_dict(torch.load("best_model.pth", map_location=device))

model.eval()
results = []  # (image id, predicted class index) pairs, in test-loader order

with torch.no_grad():
    for images, img_ids in test_loader:
        images = images.to(device)
        preds = model(images)
        # The arg-max over the class dimension is the predicted label.
        predicted_labels = preds.argmax(dim=1).cpu().tolist()
        results.extend(zip(img_ids, predicted_labels))


In [16]:
# Write the (id, label) predictions to the submission file.
# pandas is already imported as `pd` in the notebook's first cell.
submission_df = pd.DataFrame(results, columns=['id', 'label'])
submission_df.to_csv('submission.csv', index=False)