In [1]:
import numpy as np

import torch
from torch.utils.data import Dataset, DataLoader
from torchvision.datasets import ImageFolder

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.transforms.functional as F
import torch.nn.functional as F_nn
import random

from PIL import Image
from tqdm import tqdm

import time
import os

import matplotlib.pyplot as plt
import pandas as pd
import timm


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

def paired_transform(color_img, depth_img, output_size=224, max_depth=10.0, apply_color_jitter=True):
    if random.random() > 0.5:
        color_img = F.hflip(color_img)
        depth_img = F.hflip(depth_img)

    angle = random.uniform(-15, 15)
    color_img = F.rotate(color_img, angle, interpolation=Image.BILINEAR)
    depth_img = F.rotate(depth_img, angle, interpolation=Image.NEAREST)

    i, j, h, w = transforms.RandomResizedCrop.get_params(
        color_img, scale=(0.8, 1.0), ratio=(1.0, 1.0)
    )
    color_img = F.resized_crop(color_img, i, j, h, w, size=(output_size, output_size))
    depth_img = F.resized_crop(depth_img, i, j, h, w, size=(output_size, output_size))

    if apply_color_jitter:
        color_jitter = transforms.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3)
        color_img = color_jitter(color_img)

    # RGB → tensor
    color_tensor = F.to_tensor(color_img)
    color_tensor = F.normalize(color_tensor, mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225])

    # Depth → tensor (meters)
    depth_np = np.array(depth_img).astype(np.float32) / 1000.0
    
    # Augment depth (after cropping & before clamping)
    if random.random() < 0.3:
        depth_np *= random.uniform(0.9, 1.1)   # brightness-like scaling
    
    if random.random() < 0.3:
        depth_np = depth_np ** random.uniform(0.7, 1.3)   # gamma
    
    if random.random() < 0.3:
        depth_np += np.random.normal(0, 0.01, depth_np.shape)  # Gaussian noise

    
    depth_tensor = torch.from_numpy(depth_np).unsqueeze(0)
    depth_tensor = torch.clamp(depth_tensor, 0, max_depth)

    return color_tensor, depth_tensor
    

class DepthDataset(Dataset):
    def __init__(self, data_dir, paired_transform=None, max_depth=10.0):
        """
        Args:
            data_dir (str): Path to folder containing 'colors/' and 'depths/' subfolders.
            paired_transform (callable, optional): Function to apply same geometric transform to both RGB and depth.
            max_depth (float): Maximum depth value to scale depth maps (in meters).
        """
        self.data_dir = data_dir
        self.paired_transform = paired_transform
        self.max_depth = max_depth

        # Paths to color and depth folders
        self.color_dir = os.path.join(data_dir, "colors")
        self.depth_dir = os.path.join(data_dir, "depths")

        # List all RGB color images
        self.color_files = sorted([f for f in os.listdir(self.color_dir) if f.endswith("_colors.png")])

    def __len__(self):
        return len(self.color_files)

    def __getitem__(self, idx):
        # Load RGB image
        color_path = os.path.join(self.color_dir, self.color_files[idx])
        color_img = Image.open(color_path).convert("RGB")

        # Load corresponding depth image
        depth_file = self.color_files[idx].replace("_colors.png", "_depth.png")
        depth_path = os.path.join(self.depth_dir, depth_file)
        depth_img = Image.open(depth_path)

        # Apply paired transform if provided
        if self.paired_transform:
            color_tensor, depth_tensor = self.paired_transform(color_img, depth_img)
        else:
            # Convert RGB to tensor and normalize
            color_tensor = T.ToTensor()(color_img)
            color_tensor = T.Normalize(mean=[0.485, 0.456, 0.406],
                                       std=[0.229, 0.224, 0.225])(color_tensor)
            # Convert depth to tensor and scale to meters
            depth_np = np.array(depth_img).astype(np.float32) / 1000.0  # mm → meters
            depth_tensor = torch.from_numpy(depth_np).unsqueeze(0)
            depth_tensor = torch.clamp(depth_tensor, 0, self.max_depth)

        return color_tensor, depth_tensor


class DepthModel(nn.Module):
    def __init__(self, backbone_name='efficientnet_b3', pretrained=True):
        super().__init__()
        
        # --------------------------
        # Encoder: pretrained CNN
        # --------------------------
        # Use features_only=True to get intermediate feature maps for decoder
        self.encoder = timm.create_model(backbone_name, pretrained=pretrained, features_only=True)
        
        # Channels of encoder feature maps at each stage
        encoder_channels = self.encoder.feature_info.channels()  # e.g., [40, 48, 136, 384]
        last_ch = encoder_channels[-1]

        # --------------------------
        # Simple decoder: upsample to original resolution
        # --------------------------
        self.decoder = nn.Sequential(
            nn.Conv2d(last_ch, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False),

            nn.Conv2d(256, 128, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False),

            nn.Conv2d(128, 64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False),

            nn.Conv2d(64, 1, kernel_size=3, padding=1)  # Output: single-channel depth map
        )

    def forward(self, x):
        # Encoder forward: returns list of feature maps at different stages
        features = self.encoder(x)
        x = features[-1]  # Use last-stage feature map for decoder

        # Decode to depth map
        depth = self.decoder(x)
        # Unsample to match input size
        depth = F_nn.interpolate(depth, size=(224, 224), mode='bilinear', align_corners=False)
        return depth

Using device: cpu


In [2]:



# --------------------------
# Training Loop
# --------------------------
def train_depth_model(model, train_loader, val_loader, epochs=10, freeze_encoder=True, lr_head=1e-3, lr_finetune=1e-4, model_path="depth_model.pth", best_val_loss=float("inf")):
    criterion = nn.MSELoss()
    
    if freeze_encoder:
        for param in model.encoder.parameters():
            param.requires_grad = False
        optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=lr_head)
    else:
        optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=lr_finetune)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.1, patience=3)


    for epoch in range(epochs):
        print(f"\nEpoch {epoch+1}/{epochs} {'(Frozen Encoder)' if freeze_encoder else '(Fine-tune)'}")

        # Training
        model.train()
        train_loss = 0.0
        for images, depths in tqdm(train_loader):
            images, depths = images.to(device), depths.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, depths)
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * images.size(0)
        train_loss /= len(train_loader.dataset)

        # Validation
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for images, depths in val_loader:
                images, depths = images.to(device), depths.to(device)
                outputs = model(images)
                loss = criterion(outputs, depths)
                val_loss += loss.item() * images.size(0)
                
        val_loss /= len(val_loader.dataset)
        rmse = np.sqrt(val_loss)
        print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Val RMSE: {rmse:.4f} m")

        # Save best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), model_path)
            print("✅ Best model saved.")

        # Scheduler step
        scheduler.step(val_loss)

    print("Training complete")
    return best_val_loss



# --------------------------
# Run training
# --------------------------


# --------------------------
# Device
# --------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --------------------------
# Model
# --------------------------
model = DepthModel().to(device)


# --------------------------
# Datasets & Loaders
# --------------------------
train_folder = "/kaggle/input/nyu-depth-split-dataset/nyu_split/train"
valid_folder = "/kaggle/input/nyu-depth-split-dataset/nyu_split/val"
max_depth = 10

train_dataset = DepthDataset(train_folder, paired_transform, max_depth)
val_dataset = DepthDataset(valid_folder, paired_transform, max_depth)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=2)

# --------------------------
# Freeze backbone initially
# --------------------------
for param in model.encoder.parameters():
    param.requires_grad = False

# --------------------------
# Training Parameters
# --------------------------
num_epoch_head = 10       # Train classifier head first
num_epoch_finetune = 40  # Fine-tune full model
MODEL_PATH = "/kaggle/working/depth_classifier.pth"

model = DepthModel().to(device)

# 1. Train decoder head first
best_val_loss = train_depth_model(model, train_loader, val_loader, epochs=num_epoch_head, freeze_encoder=True, 
                  model_path= MODEL_PATH)

# 2. Fine-tune full model
for param in model.encoder.parameters():
    param.requires_grad = True
train_depth_model(model, train_loader, val_loader, epochs=num_epoch_finetune, freeze_encoder=False, 
                  model_path="depth_model.pth", best_val_loss = best_val_loss)

model.safetensors:   0%|          | 0.00/49.3M [00:00<?, ?B/s]


Epoch 1/10 (Frozen Encoder)


100%|██████████| 19/19 [01:43<00:00,  5.46s/it]


Train Loss: 2.6969 | Val Loss: 1.9382 | Val RMSE: 1.3922 m
✅ Best model saved.

Epoch 2/10 (Frozen Encoder)


100%|██████████| 19/19 [01:35<00:00,  5.05s/it]


Train Loss: 1.8146 | Val Loss: 1.6446 | Val RMSE: 1.2824 m
✅ Best model saved.

Epoch 3/10 (Frozen Encoder)


100%|██████████| 19/19 [01:37<00:00,  5.16s/it]


Train Loss: 1.4265 | Val Loss: 1.6849 | Val RMSE: 1.2980 m

Epoch 4/10 (Frozen Encoder)


100%|██████████| 19/19 [01:37<00:00,  5.11s/it]


Train Loss: 1.1774 | Val Loss: 1.7670 | Val RMSE: 1.3293 m

Epoch 5/10 (Frozen Encoder)


100%|██████████| 19/19 [01:35<00:00,  5.03s/it]


Train Loss: 1.2991 | Val Loss: 1.2664 | Val RMSE: 1.1253 m
✅ Best model saved.

Epoch 6/10 (Frozen Encoder)


100%|██████████| 19/19 [01:36<00:00,  5.06s/it]


Train Loss: 1.1601 | Val Loss: 1.3712 | Val RMSE: 1.1710 m

Epoch 7/10 (Frozen Encoder)


100%|██████████| 19/19 [01:36<00:00,  5.09s/it]


Train Loss: 1.0705 | Val Loss: 0.9960 | Val RMSE: 0.9980 m
✅ Best model saved.

Epoch 8/10 (Frozen Encoder)


100%|██████████| 19/19 [01:39<00:00,  5.23s/it]


Train Loss: 1.1693 | Val Loss: 1.0636 | Val RMSE: 1.0313 m

Epoch 9/10 (Frozen Encoder)


100%|██████████| 19/19 [01:39<00:00,  5.21s/it]


Train Loss: 1.1247 | Val Loss: 1.5226 | Val RMSE: 1.2339 m

Epoch 10/10 (Frozen Encoder)


100%|██████████| 19/19 [01:39<00:00,  5.26s/it]


Train Loss: 1.0843 | Val Loss: 1.3552 | Val RMSE: 1.1641 m
Training complete

Epoch 1/40 (Fine-tune)


100%|██████████| 19/19 [03:07<00:00,  9.87s/it]


Train Loss: 0.9890 | Val Loss: 1.3862 | Val RMSE: 1.1774 m

Epoch 2/40 (Fine-tune)


100%|██████████| 19/19 [02:57<00:00,  9.35s/it]


Train Loss: 0.8503 | Val Loss: 1.0904 | Val RMSE: 1.0442 m

Epoch 3/40 (Fine-tune)


100%|██████████| 19/19 [02:54<00:00,  9.19s/it]


Train Loss: 0.8586 | Val Loss: 0.9287 | Val RMSE: 0.9637 m
✅ Best model saved.

Epoch 4/40 (Fine-tune)


100%|██████████| 19/19 [02:55<00:00,  9.22s/it]


Train Loss: 0.8378 | Val Loss: 0.9828 | Val RMSE: 0.9914 m

Epoch 5/40 (Fine-tune)


100%|██████████| 19/19 [02:56<00:00,  9.29s/it]


Train Loss: 0.7803 | Val Loss: 1.0295 | Val RMSE: 1.0146 m

Epoch 6/40 (Fine-tune)


100%|██████████| 19/19 [02:52<00:00,  9.08s/it]


Train Loss: 0.7581 | Val Loss: 0.8720 | Val RMSE: 0.9338 m
✅ Best model saved.

Epoch 7/40 (Fine-tune)


100%|██████████| 19/19 [02:54<00:00,  9.19s/it]


Train Loss: 0.7349 | Val Loss: 0.9666 | Val RMSE: 0.9831 m

Epoch 8/40 (Fine-tune)


100%|██████████| 19/19 [02:52<00:00,  9.08s/it]


Train Loss: 0.6795 | Val Loss: 1.0922 | Val RMSE: 1.0451 m

Epoch 9/40 (Fine-tune)


100%|██████████| 19/19 [03:04<00:00,  9.71s/it]


Train Loss: 0.6293 | Val Loss: 0.8665 | Val RMSE: 0.9309 m
✅ Best model saved.

Epoch 10/40 (Fine-tune)


100%|██████████| 19/19 [02:54<00:00,  9.17s/it]


Train Loss: 0.6568 | Val Loss: 1.1461 | Val RMSE: 1.0706 m

Epoch 11/40 (Fine-tune)


100%|██████████| 19/19 [02:53<00:00,  9.11s/it]


Train Loss: 0.6608 | Val Loss: 0.9372 | Val RMSE: 0.9681 m

Epoch 12/40 (Fine-tune)


100%|██████████| 19/19 [02:55<00:00,  9.22s/it]


Train Loss: 0.6115 | Val Loss: 0.9403 | Val RMSE: 0.9697 m

Epoch 13/40 (Fine-tune)


100%|██████████| 19/19 [02:53<00:00,  9.14s/it]


Train Loss: 0.5823 | Val Loss: 0.9089 | Val RMSE: 0.9533 m

Epoch 14/40 (Fine-tune)


100%|██████████| 19/19 [02:56<00:00,  9.31s/it]


Train Loss: 0.6050 | Val Loss: 0.9910 | Val RMSE: 0.9955 m

Epoch 15/40 (Fine-tune)


100%|██████████| 19/19 [02:53<00:00,  9.15s/it]


Train Loss: 0.5922 | Val Loss: 0.8633 | Val RMSE: 0.9291 m
✅ Best model saved.

Epoch 16/40 (Fine-tune)


100%|██████████| 19/19 [02:54<00:00,  9.18s/it]


Train Loss: 0.6034 | Val Loss: 0.9030 | Val RMSE: 0.9503 m

Epoch 17/40 (Fine-tune)


100%|██████████| 19/19 [02:51<00:00,  9.04s/it]


Train Loss: 0.5267 | Val Loss: 0.6373 | Val RMSE: 0.7983 m
✅ Best model saved.

Epoch 18/40 (Fine-tune)


100%|██████████| 19/19 [02:57<00:00,  9.35s/it]


Train Loss: 0.5557 | Val Loss: 1.1782 | Val RMSE: 1.0855 m

Epoch 19/40 (Fine-tune)


100%|██████████| 19/19 [02:53<00:00,  9.15s/it]


Train Loss: 0.5739 | Val Loss: 0.9841 | Val RMSE: 0.9920 m

Epoch 20/40 (Fine-tune)


100%|██████████| 19/19 [02:54<00:00,  9.20s/it]


Train Loss: 0.5997 | Val Loss: 0.8024 | Val RMSE: 0.8958 m

Epoch 21/40 (Fine-tune)


100%|██████████| 19/19 [02:53<00:00,  9.15s/it]


Train Loss: 0.5262 | Val Loss: 0.7489 | Val RMSE: 0.8654 m

Epoch 22/40 (Fine-tune)


100%|██████████| 19/19 [02:56<00:00,  9.27s/it]


Train Loss: 0.5193 | Val Loss: 0.9147 | Val RMSE: 0.9564 m

Epoch 23/40 (Fine-tune)


100%|██████████| 19/19 [02:54<00:00,  9.17s/it]


Train Loss: 0.6084 | Val Loss: 0.9044 | Val RMSE: 0.9510 m

Epoch 24/40 (Fine-tune)


100%|██████████| 19/19 [02:52<00:00,  9.09s/it]


Train Loss: 0.5827 | Val Loss: 0.7407 | Val RMSE: 0.8607 m

Epoch 25/40 (Fine-tune)


100%|██████████| 19/19 [02:53<00:00,  9.13s/it]


Train Loss: 0.5975 | Val Loss: 0.9096 | Val RMSE: 0.9537 m

Epoch 26/40 (Fine-tune)


100%|██████████| 19/19 [02:56<00:00,  9.30s/it]


Train Loss: 0.5172 | Val Loss: 0.9268 | Val RMSE: 0.9627 m

Epoch 27/40 (Fine-tune)


100%|██████████| 19/19 [02:54<00:00,  9.16s/it]


Train Loss: 0.5695 | Val Loss: 0.9469 | Val RMSE: 0.9731 m

Epoch 28/40 (Fine-tune)


100%|██████████| 19/19 [02:57<00:00,  9.32s/it]


Train Loss: 0.6316 | Val Loss: 0.7591 | Val RMSE: 0.8712 m

Epoch 29/40 (Fine-tune)


100%|██████████| 19/19 [02:56<00:00,  9.27s/it]


Train Loss: 0.5627 | Val Loss: 0.9502 | Val RMSE: 0.9748 m

Epoch 30/40 (Fine-tune)


100%|██████████| 19/19 [02:53<00:00,  9.12s/it]


Train Loss: 0.5882 | Val Loss: 1.0064 | Val RMSE: 1.0032 m

Epoch 31/40 (Fine-tune)


100%|██████████| 19/19 [02:54<00:00,  9.18s/it]


Train Loss: 0.5745 | Val Loss: 1.0221 | Val RMSE: 1.0110 m

Epoch 32/40 (Fine-tune)


100%|██████████| 19/19 [02:52<00:00,  9.10s/it]


Train Loss: 0.5586 | Val Loss: 1.0384 | Val RMSE: 1.0190 m

Epoch 33/40 (Fine-tune)


100%|██████████| 19/19 [02:54<00:00,  9.16s/it]


Train Loss: 0.5631 | Val Loss: 0.9524 | Val RMSE: 0.9759 m

Epoch 34/40 (Fine-tune)


100%|██████████| 19/19 [02:53<00:00,  9.11s/it]


Train Loss: 0.5620 | Val Loss: 0.7480 | Val RMSE: 0.8649 m

Epoch 35/40 (Fine-tune)


100%|██████████| 19/19 [02:57<00:00,  9.34s/it]


Train Loss: 0.5811 | Val Loss: 0.8173 | Val RMSE: 0.9041 m

Epoch 36/40 (Fine-tune)


100%|██████████| 19/19 [02:55<00:00,  9.25s/it]


Train Loss: 0.5790 | Val Loss: 0.8079 | Val RMSE: 0.8989 m

Epoch 37/40 (Fine-tune)


100%|██████████| 19/19 [02:58<00:00,  9.38s/it]


Train Loss: 0.5636 | Val Loss: 0.8252 | Val RMSE: 0.9084 m

Epoch 38/40 (Fine-tune)


100%|██████████| 19/19 [02:55<00:00,  9.21s/it]


Train Loss: 0.5793 | Val Loss: 0.9424 | Val RMSE: 0.9708 m

Epoch 39/40 (Fine-tune)


100%|██████████| 19/19 [02:54<00:00,  9.21s/it]


Train Loss: 0.5516 | Val Loss: 0.7693 | Val RMSE: 0.8771 m

Epoch 40/40 (Fine-tune)


100%|██████████| 19/19 [02:53<00:00,  9.12s/it]


Train Loss: 0.5463 | Val Loss: 1.0670 | Val RMSE: 1.0330 m
Training complete


0.6373271135183481

In [3]:
# # --------------------------
# # Testing / Inference
# # --------------------------
# import torch
# import torch.nn as nn
# import numpy as np
# import matplotlib.pyplot as plt
# from tqdm import tqdm


# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print("Using device:", device)

# # --------------------------
# # Test Dataset & DataLoader
# # --------------------------
# test_folder = "/kaggle/input/nyu-test-colors-depths/nyu2_test_colors_depths"
# test_dataset = DepthDataset(test_folder, paired_transform, max_depth=10.0)
# test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=2)

# # Load trained model
# test_model = DepthModel().to(device)
# test_model.load_state_dict(torch.load(
#     "/kaggle/input/depthpredicterv2/pytorch/default/1/depth_classifierV2.pth", 
#     map_location=device
# ))
# test_model.eval()

# # Loss function for evaluation
# criterion = nn.MSELoss()

# # --------------------------
# # Evaluation with tqdm
# # --------------------------
# val_loss, val_mae = 0.0, 0.0
# with torch.no_grad():
#     for images, depths in tqdm(test_loader, desc="Evaluating on test set"):
#         images, depths = images.to(device), depths.to(device)
#         outputs = test_model(images)

#         # Loss metrics
#         mse_loss = criterion(outputs, depths)
#         mae_loss = torch.mean(torch.abs(outputs - depths))

#         val_loss += mse_loss.item() * images.size(0)
#         val_mae += mae_loss.item() * images.size(0)

# val_loss /= len(test_loader.dataset)
# val_mae /= len(test_loader.dataset)
# val_rmse = np.sqrt(val_loss)

# print(f"✅ Test Results -> MSE: {val_loss:.4f}, RMSE: {val_rmse:.4f} m, MAE: {val_mae:.4f} m")

# # --------------------------
# # Visualization
# # --------------------------
# def show_predictions(model, dataset, num_samples=5):
#     model.eval()
#     fig, axes = plt.subplots(num_samples, 3, figsize=(12, 4 * num_samples))

#     for i in tqdm(range(num_samples), desc="Visualizing predictions"):
#         color, depth_gt = dataset[i]
#         color_batch = color.unsqueeze(0).to(device)

#         with torch.no_grad():
#             depth_pred = model(color_batch).cpu().squeeze().numpy()

#         depth_gt = depth_gt.squeeze().numpy()

#         # Plot RGB
#         axes[i, 0].imshow(np.transpose(color.numpy(), (1, 2, 0)))
#         axes[i, 0].set_title("RGB Input")
#         axes[i, 0].axis("off")

#         # Plot Ground Truth Depth
#         axes[i, 1].imshow(depth_gt, cmap="inferno")
#         axes[i, 1].set_title("Ground Truth Depth")
#         axes[i, 1].axis("off")

#         # Plot Predicted Depth
#         axes[i, 2].imshow(depth_pred, cmap="inferno")
#         axes[i, 2].set_title("Predicted Depth")
#         axes[i, 2].axis("off")

#     plt.tight_layout()
#     plt.show()

# # Visualize predictions from test set
# show_predictions(test_model, test_dataset, num_samples=5)