In [1]:
import numpy as np

import torch
from torch.utils.data import Dataset, DataLoader
from torchvision.datasets import ImageFolder

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.transforms.functional as F
import torch.nn.functional as F_nn
import random

from PIL import Image
from tqdm import tqdm

import time
import os

import matplotlib.pyplot as plt
import pandas as pd
import timm


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

def paired_transform(color_img, depth_img, output_size=224, max_depth=10.0, apply_color_jitter=True):
    if random.random() > 0.5:
        color_img = F.hflip(color_img)
        depth_img = F.hflip(depth_img)

    angle = random.uniform(-15, 15)
    color_img = F.rotate(color_img, angle, interpolation=Image.BILINEAR)
    depth_img = F.rotate(depth_img, angle, interpolation=Image.NEAREST)

    i, j, h, w = transforms.RandomResizedCrop.get_params(
        color_img, scale=(0.8, 1.0), ratio=(1.0, 1.0)
    )
    color_img = F.resized_crop(color_img, i, j, h, w, size=(output_size, output_size))
    depth_img = F.resized_crop(depth_img, i, j, h, w, size=(output_size, output_size))

    if apply_color_jitter:
        color_jitter = transforms.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3)
        color_img = color_jitter(color_img)

    # RGB → tensor
    color_tensor = F.to_tensor(color_img)
    color_tensor = F.normalize(color_tensor, mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225])

    # Depth → tensor (meters)
    depth_np = np.array(depth_img).astype(np.float32) / 1000.0
    depth_tensor = torch.from_numpy(depth_np).unsqueeze(0)
    depth_tensor = torch.clamp(depth_tensor, 0, max_depth)

    return color_tensor, depth_tensor
    

class DepthDataset(Dataset):
    def __init__(self, data_dir, paired_transform=None, max_depth=10.0):
        """
        Args:
            data_dir (str): Path to folder containing 'colors/' and 'depths/' subfolders.
            paired_transform (callable, optional): Function to apply same geometric transform to both RGB and depth.
            max_depth (float): Maximum depth value to scale depth maps (in meters).
        """
        self.data_dir = data_dir
        self.paired_transform = paired_transform
        self.max_depth = max_depth

        # Paths to color and depth folders
        self.color_dir = os.path.join(data_dir, "colors")
        self.depth_dir = os.path.join(data_dir, "depths")

        # List all RGB color images
        self.color_files = sorted([f for f in os.listdir(self.color_dir) if f.endswith("_colors.png")])

    def __len__(self):
        return len(self.color_files)

    def __getitem__(self, idx):
        # Load RGB image
        color_path = os.path.join(self.color_dir, self.color_files[idx])
        color_img = Image.open(color_path).convert("RGB")

        # Load corresponding depth image
        depth_file = self.color_files[idx].replace("_colors.png", "_depth.png")
        depth_path = os.path.join(self.depth_dir, depth_file)
        depth_img = Image.open(depth_path)

        # Apply paired transform if provided
        if self.paired_transform:
            color_tensor, depth_tensor = self.paired_transform(color_img, depth_img)
        else:
            # Convert RGB to tensor and normalize
            color_tensor = T.ToTensor()(color_img)
            color_tensor = T.Normalize(mean=[0.485, 0.456, 0.406],
                                       std=[0.229, 0.224, 0.225])(color_tensor)
            # Convert depth to tensor and scale to meters
            depth_np = np.array(depth_img).astype(np.float32) / 1000.0  # mm → meters
            depth_tensor = torch.from_numpy(depth_np).unsqueeze(0)
            depth_tensor = torch.clamp(depth_tensor, 0, self.max_depth)

        return color_tensor, depth_tensor


class DepthModel(nn.Module):
    def __init__(self, backbone_name='efficientnet_b3', pretrained=True):
        super().__init__()
        
        # --------------------------
        # Encoder: pretrained CNN
        # --------------------------
        # Use features_only=True to get intermediate feature maps for decoder
        self.encoder = timm.create_model(backbone_name, pretrained=pretrained, features_only=True)
        
        # Channels of encoder feature maps at each stage
        encoder_channels = self.encoder.feature_info.channels()  # e.g., [40, 48, 136, 384]
        last_ch = encoder_channels[-1]

        # --------------------------
        # Simple decoder: upsample to original resolution
        # --------------------------
        self.decoder = nn.Sequential(
            nn.Conv2d(last_ch, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False),

            nn.Conv2d(256, 128, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False),

            nn.Conv2d(128, 64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False),

            nn.Conv2d(64, 1, kernel_size=3, padding=1)  # Output: single-channel depth map
        )

    def forward(self, x):
        # Encoder forward: returns list of feature maps at different stages
        features = self.encoder(x)
        x = features[-1]  # Use last-stage feature map for decoder

        # Decode to depth map
        depth = self.decoder(x)
        # Unsample to match input size
        depth = F_nn.interpolate(depth, size=(224, 224), mode='bilinear', align_corners=False)
        return depth


# --------------------------
# Training Loop
# --------------------------
def train_depth_model(model, train_loader, val_loader, epochs=10, freeze_encoder=True, lr_head=1e-3, lr_finetune=1e-4, model_path="depth_model.pth", best_val_loss=float("inf")):
    criterion = nn.MSELoss()
    
    if freeze_encoder:
        for param in model.encoder.parameters():
            param.requires_grad = False
        optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=lr_head)
    else:
        optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=lr_finetune)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.1, patience=3)


    for epoch in range(epochs):
        print(f"\nEpoch {epoch+1}/{epochs} {'(Frozen Encoder)' if freeze_encoder else '(Fine-tune)'}")

        # Training
        model.train()
        train_loss = 0.0
        for images, depths in tqdm(train_loader):
            images, depths = images.to(device), depths.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, depths)
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * images.size(0)
        train_loss /= len(train_loader.dataset)

        # Validation
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for images, depths in val_loader:
                images, depths = images.to(device), depths.to(device)
                outputs = model(images)
                loss = criterion(outputs, depths)
                val_loss += loss.item() * images.size(0)
                
        val_loss /= len(val_loader.dataset)
        rmse = np.sqrt(val_loss)
        print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Val RMSE: {rmse:.4f} m")

        # Save best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), model_path)
            print("✅ Best model saved.")

        # Scheduler step
        scheduler.step(val_loss)

    print("Training complete")
    return best_val_loss



# --------------------------
# Run training
# --------------------------


# --------------------------
# Device
# --------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --------------------------
# Model
# --------------------------
model = DepthModel().to(device)


# --------------------------
# Datasets & Loaders
# --------------------------
train_folder = "/kaggle/input/nyu-depth-split-dataset/nyu_split/train"
valid_folder = "/kaggle/input/nyu-depth-split-dataset/nyu_split/val"
max_depth = 10

train_dataset = DepthDataset(train_folder, paired_transform, max_depth)
val_dataset = DepthDataset(valid_folder, paired_transform, max_depth)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=2)

# --------------------------
# Freeze backbone initially
# --------------------------
for param in model.encoder.parameters():
    param.requires_grad = False

# --------------------------
# Training Parameters
# --------------------------
num_epoch_head = 5       # Train classifier head first
num_epoch_finetune = 25  # Fine-tune full model
MODEL_PATH = "/kaggle/working/depth_classifier.pth"

model = DepthModel().to(device)

# 1. Train decoder head first
best_val_loss = train_depth_model(model, train_loader, val_loader, epochs=num_epoch_head, freeze_encoder=True, 
                  model_path= MODEL_PATH)

# 2. Fine-tune full model
for param in model.encoder.parameters():
    param.requires_grad = True
train_depth_model(model, train_loader, val_loader, epochs=num_epoch_finetune, freeze_encoder=False, 
                  model_path="depth_model.pth", best_val_loss = best_val_loss)

Using device: cpu


model.safetensors:   0%|          | 0.00/49.3M [00:00<?, ?B/s]


Epoch 1/5 (Frozen Encoder)


100%|██████████| 19/19 [01:21<00:00,  4.29s/it]


Train Loss: 2.4380 | Val Loss: 1.4995 | Val RMSE: 1.2245 m
✅ Best model saved.

Epoch 2/5 (Frozen Encoder)


100%|██████████| 19/19 [01:17<00:00,  4.09s/it]


Train Loss: 1.3853 | Val Loss: 1.3745 | Val RMSE: 1.1724 m
✅ Best model saved.

Epoch 3/5 (Frozen Encoder)


100%|██████████| 19/19 [01:18<00:00,  4.13s/it]


Train Loss: 1.2904 | Val Loss: 1.1578 | Val RMSE: 1.0760 m
✅ Best model saved.

Epoch 4/5 (Frozen Encoder)


100%|██████████| 19/19 [01:17<00:00,  4.06s/it]


Train Loss: 1.0122 | Val Loss: 1.1405 | Val RMSE: 1.0680 m
✅ Best model saved.

Epoch 5/5 (Frozen Encoder)


100%|██████████| 19/19 [01:17<00:00,  4.10s/it]


Train Loss: 1.0188 | Val Loss: 1.1783 | Val RMSE: 1.0855 m
Training complete

Epoch 1/25 (Fine-tune)


100%|██████████| 19/19 [02:34<00:00,  8.14s/it]


Train Loss: 0.8768 | Val Loss: 0.9554 | Val RMSE: 0.9774 m
✅ Best model saved.

Epoch 2/25 (Fine-tune)


100%|██████████| 19/19 [02:43<00:00,  8.62s/it]


Train Loss: 0.7352 | Val Loss: 0.8694 | Val RMSE: 0.9324 m
✅ Best model saved.

Epoch 3/25 (Fine-tune)


100%|██████████| 19/19 [02:33<00:00,  8.08s/it]


Train Loss: 0.6385 | Val Loss: 0.8446 | Val RMSE: 0.9190 m
✅ Best model saved.

Epoch 4/25 (Fine-tune)


100%|██████████| 19/19 [02:25<00:00,  7.68s/it]


Train Loss: 0.6054 | Val Loss: 0.8500 | Val RMSE: 0.9219 m

Epoch 5/25 (Fine-tune)


100%|██████████| 19/19 [02:34<00:00,  8.15s/it]


Train Loss: 0.5508 | Val Loss: 0.8362 | Val RMSE: 0.9144 m
✅ Best model saved.

Epoch 6/25 (Fine-tune)


100%|██████████| 19/19 [02:29<00:00,  7.86s/it]


Train Loss: 0.5450 | Val Loss: 0.7529 | Val RMSE: 0.8677 m
✅ Best model saved.

Epoch 7/25 (Fine-tune)


100%|██████████| 19/19 [02:27<00:00,  7.78s/it]


Train Loss: 0.4894 | Val Loss: 0.7629 | Val RMSE: 0.8734 m

Epoch 8/25 (Fine-tune)


100%|██████████| 19/19 [02:36<00:00,  8.25s/it]


Train Loss: 0.4867 | Val Loss: 0.7772 | Val RMSE: 0.8816 m

Epoch 9/25 (Fine-tune)


100%|██████████| 19/19 [02:29<00:00,  7.86s/it]


Train Loss: 0.4498 | Val Loss: 0.6770 | Val RMSE: 0.8228 m
✅ Best model saved.

Epoch 10/25 (Fine-tune)


100%|██████████| 19/19 [02:27<00:00,  7.75s/it]


Train Loss: 0.4432 | Val Loss: 0.7468 | Val RMSE: 0.8642 m

Epoch 11/25 (Fine-tune)


100%|██████████| 19/19 [02:29<00:00,  7.87s/it]


Train Loss: 0.4280 | Val Loss: 0.6970 | Val RMSE: 0.8349 m

Epoch 12/25 (Fine-tune)


100%|██████████| 19/19 [02:29<00:00,  7.85s/it]


Train Loss: 0.3982 | Val Loss: 0.7267 | Val RMSE: 0.8525 m

Epoch 13/25 (Fine-tune)


100%|██████████| 19/19 [02:38<00:00,  8.36s/it]


Train Loss: 0.3864 | Val Loss: 0.6881 | Val RMSE: 0.8295 m

Epoch 14/25 (Fine-tune)


100%|██████████| 19/19 [02:28<00:00,  7.80s/it]


Train Loss: 0.3794 | Val Loss: 0.7001 | Val RMSE: 0.8367 m

Epoch 15/25 (Fine-tune)


100%|██████████| 19/19 [02:30<00:00,  7.92s/it]


Train Loss: 0.3635 | Val Loss: 0.6803 | Val RMSE: 0.8248 m

Epoch 16/25 (Fine-tune)


100%|██████████| 19/19 [02:30<00:00,  7.91s/it]


Train Loss: 0.3668 | Val Loss: 0.6995 | Val RMSE: 0.8364 m

Epoch 17/25 (Fine-tune)


100%|██████████| 19/19 [02:34<00:00,  8.11s/it]


Train Loss: 0.3735 | Val Loss: 0.6753 | Val RMSE: 0.8218 m
✅ Best model saved.

Epoch 18/25 (Fine-tune)


100%|██████████| 19/19 [02:32<00:00,  8.04s/it]


Train Loss: 0.3655 | Val Loss: 0.7089 | Val RMSE: 0.8420 m

Epoch 19/25 (Fine-tune)


100%|██████████| 19/19 [02:44<00:00,  8.68s/it]


Train Loss: 0.3627 | Val Loss: 0.6773 | Val RMSE: 0.8230 m

Epoch 20/25 (Fine-tune)


100%|██████████| 19/19 [02:32<00:00,  8.04s/it]


Train Loss: 0.3538 | Val Loss: 0.6586 | Val RMSE: 0.8115 m
✅ Best model saved.

Epoch 21/25 (Fine-tune)


100%|██████████| 19/19 [02:32<00:00,  8.03s/it]


Train Loss: 0.3606 | Val Loss: 0.6617 | Val RMSE: 0.8134 m

Epoch 22/25 (Fine-tune)


100%|██████████| 19/19 [02:29<00:00,  7.84s/it]


Train Loss: 0.3549 | Val Loss: 0.6337 | Val RMSE: 0.7961 m
✅ Best model saved.

Epoch 23/25 (Fine-tune)


100%|██████████| 19/19 [02:30<00:00,  7.90s/it]


Train Loss: 0.3525 | Val Loss: 0.6338 | Val RMSE: 0.7961 m

Epoch 24/25 (Fine-tune)


100%|██████████| 19/19 [02:33<00:00,  8.10s/it]


Train Loss: 0.3515 | Val Loss: 0.6715 | Val RMSE: 0.8195 m

Epoch 25/25 (Fine-tune)


100%|██████████| 19/19 [02:31<00:00,  7.98s/it]


Train Loss: 0.3506 | Val Loss: 0.6248 | Val RMSE: 0.7905 m
✅ Best model saved.
Training complete


0.6248449270541852