In [31]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split, Subset
from torchvision import transforms, models
from torch import nn, optim
from PIL import Image
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import os
from torchvision.transforms import InterpolationMode
from sklearn.model_selection import KFold

In [32]:
"""## Configurations"""

# Hyperparameter configuration class
class Config:
    BATCH_SIZE = 32  # Optimized batch size
    NUM_WORKERS = os.cpu_count() // 2
    IMG_SIZE = (512, 512)
    LEARNING_RATE = 1e-2  # Adjusted learning rate for CosineAnnealingLR
    DROPOUT = 0.3  # Adjusted dropout for better generalization
    NUM_NEURONS = 512  # Increased neurons in dense layer
    EPOCHS_FEATURE_EXTRACTION = 8
    EPOCHS_FINE_TUNING = 22
    FINE_TUNE_LR = 1e-3  # Adjusted fine-tuning learning rate
    WEIGHT_DECAY = 1e-2
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    MODEL_SAVE_PATH = "best_model_vit_l.pth"
    NUM_FOLDS = 3  # Cross-validation folds

In [33]:
"""# Create Data Loader and Preprocessing"""

class CampusDataset(Dataset):
    def __init__(self, data, img_dir, transform=None):
        self.data = data.reset_index(drop=True)  # Ensure indices match the Subset
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_path = f"{self.img_dir}/{self.data.iloc[idx]['filename']}"
        image = Image.open(img_path).convert("RGB")
        latitude = self.data.iloc[idx]["latitude"]
        longitude = self.data.iloc[idx]["longitude"]

        if self.transform:
            image = self.transform(image)

        target = torch.tensor([latitude, longitude], dtype=torch.float32)
        return image, target

In [34]:
# Data augmentations
train_transform = transforms.Compose([
    transforms.RandomRotation(degrees=2),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

val_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

csv_file = "workspace/data_512/train.csv"
img_dir = "workspace/data_512/train"
full_data = pd.read_csv(csv_file, sep=";")

lat_min = full_data["latitude"].min()
lat_max = full_data["latitude"].max()
lon_min = full_data["longitude"].min()
lon_max = full_data["longitude"].max()

In [35]:
"""# Model arch"""

class LatitudeLongitudeModel(nn.Module):
    def __init__(self):
        super(LatitudeLongitudeModel, self).__init__()
        self.base_model = models.vit_l_16(weights=models.ViT_L_16_Weights.IMAGENET1K_SWAG_E2E_V1)

        self.base_model.heads = nn.Sequential(
            nn.Linear(self.base_model.heads.head.in_features, Config.NUM_NEURONS),
            nn.LeakyReLU(),
            nn.Dropout(Config.DROPOUT),
            nn.Linear(Config.NUM_NEURONS, 2)
        )

    def forward(self, x):
        return self.base_model(x)

In [36]:
"""## RMSEHaversineLoss Loss"""

class RMSEHaversineLoss(nn.Module):
    def __init__(self):
        super(RMSEHaversineLoss, self).__init__()
        self.earth_radius = 6371

    def forward(self, preds, targets):
        lat1, lon1 = preds[:, 0] * (torch.pi / 180), preds[:, 1] * (torch.pi / 180)
        lat2, lon2 = targets[:, 0] * (torch.pi / 180), targets[:, 1] * (torch.pi / 180)

        dlat = lat2 - lat1
        dlon = lon2 - lon1
        a = torch.sin(dlat / 2) ** 2 + torch.cos(lat1) * torch.cos(lat2) * torch.sin(dlon / 2) ** 2
        c = 2 * torch.asin(torch.sqrt(torch.clamp(a, 0, 1)))

        distances = self.earth_radius * c
        rmse = torch.sqrt(torch.mean(distances ** 2))
        return rmse

In [37]:
"""## Training and Cross-Validation"""

def denormalize_outputs(normalized_outputs, lat_min, lat_max, lon_min, lon_max):
    denormalized_outputs = torch.empty_like(normalized_outputs)
    denormalized_outputs[:, 0] = normalized_outputs[:, 0] * (lat_max - lat_min) + lat_min
    denormalized_outputs[:, 1] = normalized_outputs[:, 1] * (lon_max - lon_min) + lon_min
    return denormalized_outputs

def train_one_epoch(model, dataloader, criterion, optimizer, lat_min, lat_max, lon_min, lon_max):
    model.train()
    epoch_loss = 0
    for images, targets in tqdm(dataloader, desc="Training Epoch"):
        images, targets = images.to(Config.DEVICE), targets.to(Config.DEVICE)
        optimizer.zero_grad()
        normalized_outputs = model(images)
        denormalized_outputs = denormalize_outputs(normalized_outputs, lat_min, lat_max, lon_min, lon_max)
        loss = criterion(denormalized_outputs, targets)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(dataloader)

def validate(model, dataloader, criterion, lat_min, lat_max, lon_min, lon_max):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for images, targets in tqdm(dataloader, desc="Validation Epoch"):
            images, targets = images.to(Config.DEVICE), targets.to(Config.DEVICE)
            normalized_outputs = model(images)
            denormalized_outputs = denormalize_outputs(normalized_outputs, lat_min, lat_max, lon_min, lon_max)
            loss = criterion(denormalized_outputs, targets)
            epoch_loss += loss.item()
    return epoch_loss / len(dataloader)

In [38]:
"""## Cross-Validation Loop"""

kf = KFold(n_splits=Config.NUM_FOLDS, shuffle=True, random_state=42)
fold_results = []

for fold, (train_idx, val_idx) in enumerate(kf.split(full_data)):
    print(f"Starting Fold {fold+1}/{Config.NUM_FOLDS}")
    train_data = full_data.iloc[train_idx].reset_index(drop=True)
    val_data = full_data.iloc[val_idx].reset_index(drop=True)

    train_dataset = CampusDataset(train_data, img_dir, transform=train_transform)
    val_dataset = CampusDataset(val_data, img_dir, transform=val_transform)

    train_loader = DataLoader(train_dataset, batch_size=Config.BATCH_SIZE, shuffle=True, num_workers=Config.NUM_WORKERS)
    val_loader = DataLoader(val_dataset, batch_size=Config.BATCH_SIZE, shuffle=False, num_workers=Config.NUM_WORKERS)

    model = LatitudeLongitudeModel().to(Config.DEVICE)
    
    # Freeze all layers except the classifier and the last 20% of layers
    for param in model.base_model.parameters():
        param.requires_grad = False

    total_layers = len(list(model.base_model.encoder.layers))
    unfrozen_layers = total_layers // 5  # Unfreeze last 20% of layers

    for name, param in model.base_model.named_parameters():
        if "heads" in name or any(f"encoder.layers.{i}" in name for i in range(total_layers - unfrozen_layers, total_layers)):
            param.requires_grad = True

    model.load_state_dict(torch.load("workspace/best_model_fold_1.pth"))  # Load the best model

    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=Config.LEARNING_RATE)
    criterion = RMSEHaversineLoss().to(Config.DEVICE)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=Config.EPOCHS_FEATURE_EXTRACTION + Config.EPOCHS_FINE_TUNING)

    best_val_loss = float('inf')

    for epoch in range(Config.EPOCHS_FEATURE_EXTRACTION + Config.EPOCHS_FINE_TUNING):
        train_loss = train_one_epoch(model, train_loader, criterion, optimizer, lat_min, lat_max, lon_min, lon_max)
        val_loss = validate(model, val_loader, criterion, lat_min, lat_max, lon_min, lon_max)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), f"best_model_fold_{fold+1}.pth")
            print(f"Model saved for fold {fold+1} at epoch {epoch+1} with validation loss: {val_loss:.4f}")

        print(f"Fold {fold+1} - Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")
        scheduler.step()

    fold_results.append(val_loss)

print(f"Cross-Validation Results: {fold_results}")
print(f"Average Validation Loss: {np.mean(fold_results):.4f}")

Starting Fold 1/3


  model.load_state_dict(torch.load("workspace/best_model_fold_1.pth"))  # Load the best model


Training Epoch:   0%|          | 0/92 [00:00<?, ?it/s]

Validation Epoch:   0%|          | 0/46 [00:00<?, ?it/s]

Model saved for fold 1 at epoch 1 with validation loss: 0.4757
Fold 1 - Epoch 1, Train Loss: 1.4128, Validation Loss: 0.4757


Training Epoch:   0%|          | 0/92 [00:00<?, ?it/s]

Validation Epoch:   0%|          | 0/46 [00:00<?, ?it/s]

Fold 1 - Epoch 2, Train Loss: 0.5252, Validation Loss: 0.5042


Training Epoch:   0%|          | 0/92 [00:00<?, ?it/s]

Validation Epoch:   0%|          | 0/46 [00:00<?, ?it/s]

Fold 1 - Epoch 3, Train Loss: 0.5763, Validation Loss: 0.4866


Training Epoch:   0%|          | 0/92 [00:00<?, ?it/s]

KeyboardInterrupt: 