Import Libraries

In [1]:
import os
from typing import Callable, Optional

import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from PIL import Image
from sklearn.metrics import precision_score, recall_score, f1_score
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import models
from torchvision import transforms
from torchvision.models import vit_l_16, ViT_L_16_Weights
from torchvision.transforms.functional import InterpolationMode
from tqdm.notebook import tqdm

Define Dataset

In [2]:
class StreetViewDataset(Dataset):
    """Image dataset driven by a CSV whose first column is a filename and second a city label.

    Samples are loaded lazily: the image file is opened, converted to RGB and
    passed through ``data_transforms`` on every ``__getitem__`` call. When
    ``mode == 'train'`` the optional ``augmentations`` are applied afterwards.
    """

    def __init__(self, csv_file: str, root_dir: str, mode: str,
                 data_transforms: Callable, augmentations: Optional[Callable] = None):
        """
        Args:
            csv_file (str): Path to the CSV file with annotations (filename and label).
            root_dir (str): Directory with all the images.
            mode (str): 'train' or 'valid' yield ``(image, label)`` pairs; any
                other value yields the image only (no label column is read).
            data_transforms (Callable): Applied to the PIL image in every mode.
            augmentations (Callable, optional): Applied after ``data_transforms``
                in 'train' mode only. ``None`` disables augmentation.
        """
        self.data_frame = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.mode = mode

        # Define a fixed label-to-index mapping
        self.label_to_index = {'Istanbul': 0, 'Ankara': 1, 'Izmir': 2}

        # Define transformations
        self.transforms = data_transforms
        self.augmentations = augmentations

    def __len__(self):
        """Return the number of rows in the annotation CSV."""
        return len(self.data_frame)

    def __getitem__(self, idx):
        """Load the sample at row ``idx``.

        Returns:
            ``(image, label)`` when mode is 'train' or 'valid', where ``label``
            is a 0-dim ``torch.long`` tensor; otherwise just ``image``.

        Raises:
            ValueError: If the row's label is not a key of ``label_to_index``.
        """
        # Load image (column 0 holds the filename relative to root_dir)
        img_name = os.path.join(self.root_dir, self.data_frame.iloc[idx, 0])
        image = Image.open(img_name).convert('RGB')

        # Base transforms always run; augmentations are train-only and optional
        image = self.transforms(image)
        if self.mode == 'train' and self.augmentations is not None:
            image = self.augmentations(image)

        if self.mode in ['train', 'valid']:
            # Get label (column 1) and convert to index
            label_str = self.data_frame.iloc[idx, 1]
            if label_str not in self.label_to_index:
                raise ValueError(f"Label '{label_str}' is not defined in label_to_index mapping.")
            label = self.label_to_index[label_str]  # Convert string label to numeric index
            label = torch.tensor(label, dtype=torch.long)
            return image, label

        return image

    @staticmethod
    def unnormalize_image(tensor, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)):
        """
        Unnormalize a tensor image for visualization purposes.

        The input is left untouched; a new tensor is returned. Statistics are
        broadcast over the channel axis, so both a single ``(C, H, W)`` image
        and a ``(N, C, H, W)`` batch are supported.

        Args:
            tensor (torch.Tensor): Normalized floating-point image tensor.
            mean (sequence of float): Per-channel mean used for normalization.
            std (sequence of float): Per-channel standard deviation used for normalization.

        Returns:
            torch.Tensor: Unnormalized image tensor (``tensor * std + mean``).
        """
        # Shape (C, 1, 1) broadcasts against (C, H, W) and (N, C, H, W) alike
        mean_t = torch.as_tensor(mean, dtype=tensor.dtype, device=tensor.device).view(-1, 1, 1)
        std_t = torch.as_tensor(std, dtype=tensor.dtype, device=tensor.device).view(-1, 1, 1)
        # Out-of-place arithmetic so the caller's tensor is not modified
        return tensor * std_t + mean_t

# # Example usage
# train_dataset = StreetViewDataset(csv_file="data/train_data.csv", root_dir="data/train/train", mode='train', data_transforms=transform, augmentations=augmentations)
# valid_dataset = StreetViewDataset(csv_file="data/train_data.csv", root_dir="data/train/train", mode='valid', data_transforms=transform, augmentations=augmentations)
# image, label = valid_dataset[0]  # 'train'/'valid' modes return (image, label); any other mode returns only the image
# unnormalized_image = StreetViewDataset.unnormalize_image(image)

# # Visualize the image and label
# import matplotlib.pyplot as plt
# plt.imshow(unnormalized_image.permute(1, 2, 0))
# plt.title(label.item())
# plt.show()

Train Functions

In [3]:
def train_one_epoch(loader, model, criterion, optimizer, device, phase="train"):
    """Run one pass over ``loader`` and report loss, accuracy, precision, recall and F1.

    Despite the name, this also serves as the evaluation loop: weights are
    only updated when ``phase == "train"``.

    Args:
        loader: Iterable yielding ``(inputs, labels)`` batches of tensors.
        model: Module mapping ``inputs`` to per-class scores of shape (batch, classes).
        criterion: Loss callable taking ``(outputs, labels)``; the per-batch value is
            weighted by batch size, so it is assumed to be a batch mean.
        optimizer: Optimizer stepped after each batch in the "train" phase; unused otherwise.
        device: Device the batches are moved to before the forward pass.
        phase (str): "train" enables ``model.train()``, gradients and optimizer
            steps; any other value puts the model in ``eval()`` mode without gradients.

    Returns:
        tuple: ``(epoch_loss, epoch_acc, precision, recall, f1)`` where
        ``epoch_loss`` is a float, ``epoch_acc`` is a 0-dim float64 tensor,
        precision and recall are weighted averages, and F1 is macro-averaged.

    Raises:
        ValueError: If ``loader`` yields no samples.
    """
    is_training = phase == "train"
    if is_training:
        model.train()
    else:
        model.eval()

    running_loss = 0.0
    running_corrects = 0
    samples_seen = 0
    all_preds = []
    all_labels = []

    for inputs, labels in tqdm(loader):
        inputs = inputs.to(device)
        labels = labels.to(device)

        if is_training:
            optimizer.zero_grad()

        with torch.set_grad_enabled(is_training):
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            loss = criterion(outputs, labels)

            if is_training:
                loss.backward()
                optimizer.step()

        batch_count = inputs.size(0)
        samples_seen += batch_count
        # Re-weight the batch loss by batch size so a smaller final batch is not over-counted
        running_loss += loss.item() * batch_count
        running_corrects += int((preds == labels).sum().item())

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

    if samples_seen == 0:
        raise ValueError("train_one_epoch received no samples; cannot compute epoch metrics.")

    # Normalize by the samples actually iterated; this can differ from
    # len(loader.dataset) when the loader drops batches or uses a sampler.
    epoch_loss = running_loss / samples_seen
    epoch_acc = torch.tensor(running_corrects / samples_seen, dtype=torch.float64)

    precision = precision_score(all_labels, all_preds, average='weighted')
    recall = recall_score(all_labels, all_preds, average='weighted')
    f1 = f1_score(all_labels, all_preds, average='macro')

    return epoch_loss, epoch_acc, precision, recall, f1

def train(train_loader, valid_loader, model, criterion, optimizer, num_epochs, device,
          best_f1=0.0, checkpoint_path="best_model.pth"):
    """Train for ``num_epochs`` epochs, checkpointing whenever validation macro F1 improves.

    Each epoch runs ``train_one_epoch`` once in "train" phase and once in
    "valid" phase, printing the metrics of both.

    Args:
        train_loader: Loader for the training phase.
        valid_loader: Loader for the validation phase.
        model: Model to optimize; its ``state_dict`` is what gets saved.
        criterion: Loss callable forwarded to ``train_one_epoch``.
        optimizer: Optimizer forwarded to ``train_one_epoch``.
        num_epochs (int): Number of epochs to run.
        device: Device forwarded to ``train_one_epoch``.
        best_f1 (float): Score a validation F1 must exceed before a checkpoint
            is written. Pass the value returned by a previous call when training
            in several stages, otherwise the first epoch of the new stage
            overwrites the earlier checkpoint even if it scores lower.
        checkpoint_path (str): Where the best ``state_dict`` is saved.

    Returns:
        float: The best validation macro F1 seen, including the incoming ``best_f1``.
    """
    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")
        print("-" * 30)

        # Training phase
        train_loss, train_acc, train_precision, train_recall, train_f1 = train_one_epoch(train_loader, model, criterion, optimizer, device, phase="train")
        print(f"Train Loss: {train_loss:.4f} Acc: {train_acc:.4f} Precision: {train_precision:.4f} Recall: {train_recall:.4f} F1: {train_f1:.4f}")

        # Validation phase
        valid_loss, valid_acc, valid_precision, valid_recall, valid_f1 = train_one_epoch(valid_loader, model, criterion, optimizer, device, phase="valid")
        print(f"Validation Loss: {valid_loss:.4f} Acc: {valid_acc:.4f} Precision: {valid_precision:.4f} Recall: {valid_recall:.4f} F1: {valid_f1:.4f}")

        # Save the best model based on F1 score
        if valid_f1 > best_f1:
            best_f1 = valid_f1
            torch.save(model.state_dict(), checkpoint_path)
            print(f"Saved Best Model with F1: {best_f1:.4f}")

    print(f"Best Validation Macro F1: {best_f1:.4f}")
    return best_f1

Configurations

In [4]:
# Input locations: the annotation CSV and the folder holding the images it lists
train_dir = "data/train"
train_csv = "data/train_data.csv"

# Epoch budget for each of the two training stages
num_epochs_fine_tuning = 13
num_epochs_feature_extraction = 7

# Batch size for both loaders, and the learning rate of the head-only stage
learning_rate = 0.001
batch_size = 32

# Run on the GPU when one is visible, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

Train Code

In [5]:
# Seed the global RNG so the new head's initialization and loader shuffling
# start from a known state on every Restart & Run All.
seed = 42
torch.manual_seed(seed)

# Load the pretrained model
weights = ViT_L_16_Weights.IMAGENET1K_SWAG_E2E_V1
model = vit_l_16(weights=weights)

# Replace the classifier head
num_classes = 3
model.heads.head = nn.Linear(model.heads.head.in_features, num_classes)

# Freeze all parameters
for param in model.parameters():
    param.requires_grad = False

# Unfreeze only the classifier head
for param in model.heads.head.parameters():
    param.requires_grad = True

transform = weights.transforms()  # Predefined transforms that match the pretrained model

# The augmentations below were tried and disabled because they did not improve
# validation performance; the empty Compose is a no-op.
augmentations = transforms.Compose([
    #transforms.RandomHorizontalFlip(),
    #transforms.RandomRotation(15),
    #transforms.RandomResizedCrop(224, scale=(0.5, 1.0)),
])

# Datasets
# Two views over the same CSV: the 'train' view applies augmentations, the
# 'valid' view never does. Splitting a single 'train' dataset would leak
# augmentations into validation as soon as any are enabled.
full_dataset = StreetViewDataset(csv_file=train_csv, root_dir=train_dir, mode='train', data_transforms=transform, augmentations=augmentations)
valid_view = StreetViewDataset(csv_file=train_csv, root_dir=train_dir, mode='valid', data_transforms=transform, augmentations=augmentations)
train_size = int(0.8 * len(full_dataset))
valid_size = len(full_dataset) - train_size
# A dedicated generator keeps the 80/20 split identical across runs
split_generator = torch.Generator().manual_seed(seed)
train_dataset, valid_split = random_split(full_dataset, [train_size, valid_size], generator=split_generator)
# Re-use the held-out indices on the augmentation-free view
valid_dataset = torch.utils.data.Subset(valid_view, valid_split.indices)

# DataLoaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
# Since only the classifier head is trainable, we pass only its parameters to the optimizer
optimizer = optim.AdamW(model.heads.head.parameters(), lr=learning_rate, weight_decay=1e-4)

model = model.to(device)

# Stage 1: train the new head on top of the frozen backbone
print("Starting Feature Extraction Training...")
train(train_loader, valid_loader, model, criterion, optimizer, num_epochs_feature_extraction, device)

# Model Summary
print("Model Architecture:\n")
print(model)

# Stage 2: unfreeze the last two layers of the transformer encoder
# (model.encoder.layers is an nn.Sequential of encoder blocks, so slicing works)
for layer in model.encoder.layers[-2:]:
    for param in layer.parameters():
        param.requires_grad = True

# Now that we have more parameters to train, we set a new optimizer over all trainable parameters
# (head + last two encoder blocks) with a smaller learning rate than stage 1.
trainable_parameters = [p for p in model.parameters() if p.requires_grad]
optimizer = optim.AdamW(trainable_parameters, lr=5e-5, weight_decay=1e-4)
train(train_loader, valid_loader, model, criterion, optimizer, num_epochs_fine_tuning, device)

Starting Feature Extraction Training...
Epoch 1/7
------------------------------


  0%|          | 0/175 [00:00<?, ?it/s]

Train Loss: 0.5272 Acc: 0.7945 Precision: 0.7951 Recall: 0.7945 F1: 0.7946


  0%|          | 0/44 [00:00<?, ?it/s]

Validation Loss: 0.4088 Acc: 0.8493 Precision: 0.8565 Recall: 0.8493 F1: 0.8479
Saved Best Model with F1: 0.8479
Epoch 2/7
------------------------------


  0%|          | 0/175 [00:00<?, ?it/s]

Train Loss: 0.3407 Acc: 0.8721 Precision: 0.8722 Recall: 0.8721 F1: 0.8722


  0%|          | 0/44 [00:00<?, ?it/s]

Validation Loss: 0.3839 Acc: 0.8514 Precision: 0.8593 Recall: 0.8514 F1: 0.8511
Saved Best Model with F1: 0.8511
Epoch 3/7
------------------------------


  0%|          | 0/175 [00:00<?, ?it/s]

Train Loss: 0.2952 Acc: 0.8918 Precision: 0.8918 Recall: 0.8918 F1: 0.8918


  0%|          | 0/44 [00:00<?, ?it/s]

Validation Loss: 0.3291 Acc: 0.8664 Precision: 0.8705 Recall: 0.8664 F1: 0.8672
Saved Best Model with F1: 0.8672
Epoch 4/7
------------------------------


  0%|          | 0/175 [00:00<?, ?it/s]

Train Loss: 0.2661 Acc: 0.8993 Precision: 0.8993 Recall: 0.8993 F1: 0.8993


  0%|          | 0/44 [00:00<?, ?it/s]

Validation Loss: 0.3045 Acc: 0.8850 Precision: 0.8850 Recall: 0.8850 F1: 0.8852
Saved Best Model with F1: 0.8852
Epoch 5/7
------------------------------


  0%|          | 0/175 [00:00<?, ?it/s]

Train Loss: 0.2416 Acc: 0.9104 Precision: 0.9104 Recall: 0.9104 F1: 0.9103


  0%|          | 0/44 [00:00<?, ?it/s]

Validation Loss: 0.3077 Acc: 0.8829 Precision: 0.8845 Recall: 0.8829 F1: 0.8830
Epoch 6/7
------------------------------


  0%|          | 0/175 [00:00<?, ?it/s]

Train Loss: 0.2257 Acc: 0.9202 Precision: 0.9202 Recall: 0.9202 F1: 0.9202


  0%|          | 0/44 [00:00<?, ?it/s]

Validation Loss: 0.2972 Acc: 0.8871 Precision: 0.8882 Recall: 0.8871 F1: 0.8873
Saved Best Model with F1: 0.8873
Epoch 7/7
------------------------------


  0%|          | 0/175 [00:00<?, ?it/s]

Train Loss: 0.2148 Acc: 0.9223 Precision: 0.9223 Recall: 0.9223 F1: 0.9223


  0%|          | 0/44 [00:00<?, ?it/s]

Validation Loss: 0.2927 Acc: 0.8921 Precision: 0.8932 Recall: 0.8921 F1: 0.8919
Saved Best Model with F1: 0.8919
Best Validation Macro F1: 0.8919
Model Architecture:

VisionTransformer(
  (conv_proj): Conv2d(3, 1024, kernel_size=(16, 16), stride=(16, 16))
  (encoder): Encoder(
    (dropout): Dropout(p=0.0, inplace=False)
    (layers): Sequential(
      (encoder_layer_0): EncoderBlock(
        (ln_1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
        (self_attention): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
        )
        (dropout): Dropout(p=0.0, inplace=False)
        (ln_2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
        (mlp): MLPBlock(
          (0): Linear(in_features=1024, out_features=4096, bias=True)
          (1): GELU(approximate='none')
          (2): Dropout(p=0.0, inplace=False)
          (3): Linear(in_features=4096, out_features=1024, bias=True)
          (4)

  0%|          | 0/175 [00:00<?, ?it/s]

Train Loss: 0.3789 Acc: 0.8821 Precision: 0.8822 Recall: 0.8821 F1: 0.8821


  0%|          | 0/44 [00:00<?, ?it/s]

Validation Loss: 0.2401 Acc: 0.9214 Precision: 0.9216 Recall: 0.9214 F1: 0.9217
Saved Best Model with F1: 0.9217
Epoch 2/13
------------------------------


  0%|          | 0/175 [00:00<?, ?it/s]

Train Loss: 0.1648 Acc: 0.9384 Precision: 0.9384 Recall: 0.9384 F1: 0.9384


  0%|          | 0/44 [00:00<?, ?it/s]

Validation Loss: 0.2222 Acc: 0.9300 Precision: 0.9312 Recall: 0.9300 F1: 0.9301
Saved Best Model with F1: 0.9301
Epoch 3/13
------------------------------


  0%|          | 0/175 [00:00<?, ?it/s]

Train Loss: 0.1122 Acc: 0.9582 Precision: 0.9582 Recall: 0.9582 F1: 0.9582


  0%|          | 0/44 [00:00<?, ?it/s]

Validation Loss: 0.2832 Acc: 0.9129 Precision: 0.9191 Recall: 0.9129 F1: 0.9133
Epoch 4/13
------------------------------


  0%|          | 0/175 [00:00<?, ?it/s]

Train Loss: 0.0527 Acc: 0.9809 Precision: 0.9809 Recall: 0.9809 F1: 0.9809


  0%|          | 0/44 [00:00<?, ?it/s]

Validation Loss: 0.2410 Acc: 0.9371 Precision: 0.9377 Recall: 0.9371 F1: 0.9373
Saved Best Model with F1: 0.9373
Epoch 5/13
------------------------------


  0%|          | 0/175 [00:00<?, ?it/s]

Train Loss: 0.0524 Acc: 0.9805 Precision: 0.9805 Recall: 0.9805 F1: 0.9805


  0%|          | 0/44 [00:00<?, ?it/s]

Validation Loss: 0.4377 Acc: 0.9171 Precision: 0.9250 Recall: 0.9171 F1: 0.9171
Epoch 6/13
------------------------------


  0%|          | 0/175 [00:00<?, ?it/s]

Train Loss: 0.0461 Acc: 0.9825 Precision: 0.9825 Recall: 0.9825 F1: 0.9825


  0%|          | 0/44 [00:00<?, ?it/s]

Validation Loss: 0.2309 Acc: 0.9464 Precision: 0.9473 Recall: 0.9464 F1: 0.9464
Saved Best Model with F1: 0.9464
Epoch 7/13
------------------------------


  0%|          | 0/175 [00:00<?, ?it/s]

Train Loss: 0.0211 Acc: 0.9929 Precision: 0.9929 Recall: 0.9929 F1: 0.9929


  0%|          | 0/44 [00:00<?, ?it/s]

Validation Loss: 0.3194 Acc: 0.9386 Precision: 0.9398 Recall: 0.9386 F1: 0.9386
Epoch 8/13
------------------------------


  0%|          | 0/175 [00:00<?, ?it/s]

Train Loss: 0.0327 Acc: 0.9882 Precision: 0.9882 Recall: 0.9882 F1: 0.9882


  0%|          | 0/44 [00:00<?, ?it/s]

Validation Loss: 0.3170 Acc: 0.9343 Precision: 0.9374 Recall: 0.9343 F1: 0.9347
Epoch 9/13
------------------------------


  0%|          | 0/175 [00:00<?, ?it/s]

Train Loss: 0.0130 Acc: 0.9961 Precision: 0.9961 Recall: 0.9961 F1: 0.9961


  0%|          | 0/44 [00:00<?, ?it/s]

Validation Loss: 0.2776 Acc: 0.9521 Precision: 0.9528 Recall: 0.9521 F1: 0.9523
Saved Best Model with F1: 0.9523
Epoch 10/13
------------------------------


  0%|          | 0/175 [00:00<?, ?it/s]

Train Loss: 0.0237 Acc: 0.9909 Precision: 0.9909 Recall: 0.9909 F1: 0.9909


  0%|          | 0/44 [00:00<?, ?it/s]

Validation Loss: 0.4512 Acc: 0.9107 Precision: 0.9197 Recall: 0.9107 F1: 0.9116
Epoch 11/13
------------------------------


  0%|          | 0/175 [00:00<?, ?it/s]

Train Loss: 0.0238 Acc: 0.9927 Precision: 0.9927 Recall: 0.9927 F1: 0.9927


  0%|          | 0/44 [00:00<?, ?it/s]

Validation Loss: 0.4676 Acc: 0.9200 Precision: 0.9271 Recall: 0.9200 F1: 0.9203
Epoch 12/13
------------------------------


  0%|          | 0/175 [00:00<?, ?it/s]

Train Loss: 0.0154 Acc: 0.9945 Precision: 0.9945 Recall: 0.9945 F1: 0.9945


  0%|          | 0/44 [00:00<?, ?it/s]

Validation Loss: 0.3150 Acc: 0.9436 Precision: 0.9441 Recall: 0.9436 F1: 0.9437
Epoch 13/13
------------------------------


  0%|          | 0/175 [00:00<?, ?it/s]

Train Loss: 0.0050 Acc: 0.9982 Precision: 0.9982 Recall: 0.9982 F1: 0.9982


  0%|          | 0/44 [00:00<?, ?it/s]

Validation Loss: 0.3773 Acc: 0.9457 Precision: 0.9464 Recall: 0.9457 F1: 0.9459
Best Validation Macro F1: 0.9523
