# CSIRO Image2Biomass - V8: DINOv2 + Depth Fusion

Combines the DINOv2 foundation model with Depth Anything v2 depth estimation:
- **Backbone**: DINOv2 ViT-Base (best foundation model)
- **Depth**: Depth Anything v2 Small
- **Ensemble**: 5-fold cross-validation
- **Val Loss**: 2.08 +/- 0.99

## Setup
1. Add the model dataset and the competition data to the notebook
2. **Set Internet to OFF**
3. Run all cells

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from pathlib import Path
from tqdm import tqdm
import timm
import albumentations as A
from albumentations.pytorch import ToTensorV2
from transformers import AutoImageProcessor, AutoModelForDepthEstimation
import gc

# Report the installed PyTorch version and whether a CUDA device is visible,
# so environment problems show up before any model is loaded.
print(f"PyTorch: {torch.__version__}")
print(f"CUDA: {torch.cuda.is_available()}")

In [None]:
# Competition data (CSV files and the test image directory)
TRAIN_CSV = '/kaggle/input/csiro-biomass/train.csv'
TEST_CSV = '/kaggle/input/csiro-biomass/test.csv'
TEST_IMG_DIR = '/kaggle/input/csiro-biomass/test'

# Trained artefacts: the depth network lives in a sub-directory of MODEL_BASE
MODEL_BASE = '/kaggle/input/image2biomass-dinov2-depth/pytorch/default/1/dinov2_depth_model'
DEPTH_MODEL_PATH = f'{MODEL_BASE}/depth_anything_v2_small'
N_FOLDS = 5

# One regression output per entry
TARGET_NAMES = [
    'Dry_Clover_g',
    'Dry_Dead_g',
    'Dry_Green_g',
    'Dry_Total_g',
    'GDM_g',
]

# Model / inference hyper-parameters
CONFIG = {
    'backbone': 'vit_base_patch14_dinov2',
    'image_size': 518,
    'features': 768,
    'batch_size': 4,
    'dropout': 0.3,
}

# Per-channel statistics used to normalise the RGB input
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

# Prefer the GPU when one is available
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
print(f"Device: {DEVICE}")

## Shared Depth Estimator

In [None]:
class SharedDepthEstimator(nn.Module):
    """Frozen depth-estimation network wrapped as an ``nn.Module``.

    The wrapped model is loaded from ``model_path``, switched to eval mode and
    has ``requires_grad`` disabled on every parameter.
    """

    def __init__(self, model_path):
        super().__init__()
        self.model = AutoModelForDepthEstimation.from_pretrained(model_path)
        self.model.eval()
        for weight in self.model.parameters():
            weight.requires_grad = False

    @torch.no_grad()
    def forward(self, images):
        """Predict depth for a batch and rescale each map to [0, 1].

        Args:
            images: 4-D batch ``(B, C, H, W)``.

        Returns:
            Tensor of shape ``(B, 1, H, W)``: the predicted depth, bilinearly
            resized to the input resolution and min-max scaled per image.
            Assumes ``predicted_depth`` is ``(B, h, w)`` -- TODO confirm.
        """
        batch_size, _, height, width = images.shape

        raw_depth = self.model(images).predicted_depth
        depth = F.interpolate(
            raw_depth.unsqueeze(1),
            size=(height, width),
            mode='bilinear',
            align_corners=False,
        )

        # Per-image extrema, reshaped so they broadcast over (1, H, W)
        per_image = depth.view(batch_size, -1)
        lowest = per_image.min(dim=1, keepdim=True).values.view(batch_size, 1, 1, 1)
        highest = per_image.max(dim=1, keepdim=True).values.view(batch_size, 1, 1, 1)

        # The epsilon keeps a constant depth map from dividing by zero
        return (depth - lowest) / (highest - lowest + 1e-8)

# Build the depth network a single time, then move it to the inference device.
print("Loading shared depth model...")
shared_depth_model = SharedDepthEstimator(DEPTH_MODEL_PATH)
shared_depth_model = shared_depth_model.to(DEVICE)
print("Depth model loaded!")

## DINOv2 + Depth Fusion Model

In [None]:
class DINOv2DepthFusionModel(nn.Module):
    """Inference model fusing RGB backbone features with depth-map features.

    The image goes through a timm backbone, the single-channel depth map
    through a small convolutional encoder producing 256 features; both vectors
    are concatenated and fed to one regression head per target name.
    """

    def __init__(self, backbone_name, num_features=768, dropout=0.3):
        """Build backbone, depth encoder and heads.

        Args:
            backbone_name: timm model identifier for the RGB backbone.
            num_features: width of the backbone's output feature vector.
            dropout: dropout probability used in the depth encoder and heads.
        """
        super().__init__()
        self.target_names = TARGET_NAMES

        # DINOv2 backbone; weights are expected to come from a checkpoint,
        # hence pretrained=False. num_classes=0 requests no classifier head.
        self.backbone = timm.create_model(backbone_name, pretrained=False, num_classes=0)

        # Depth encoder (layer order must match training so that the
        # state-dict indices line up)
        depth_layers = [
            nn.Conv2d(1, 32, kernel_size=7, stride=2, padding=3),
            nn.BatchNorm2d(32),
            nn.GELU(),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
            nn.Conv2d(32, 64, kernel_size=3, stride=2, padding=1),
            nn.BatchNorm2d(64),
            nn.GELU(),
            nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1),
            nn.BatchNorm2d(128),
            nn.GELU(),
            nn.AdaptiveAvgPool2d(1),
            nn.Flatten(),
            nn.Linear(128, 256),
            nn.GELU(),
            nn.Dropout(dropout),
        ]
        self.depth_encoder = nn.Sequential(*depth_layers)

        # Backbone features plus the 256 depth features
        fused_features = num_features + 256

        # Regression heads, one per target
        self.heads = nn.ModuleDict()
        for target in self.target_names:
            self.heads[target] = self._make_head(fused_features, dropout)

    @staticmethod
    def _make_head(in_features, dropout):
        """Return a three-layer MLP mapping ``in_features`` to one scalar."""
        return nn.Sequential(
            nn.Linear(in_features, 256),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(256, 64),
            nn.GELU(),
            nn.Linear(64, 1),
        )

    def forward(self, images, depth_maps):
        """Return ``{target_name: tensor of shape (B,)}`` for the batch.

        Args:
            images: input batch for the RGB backbone.
            depth_maps: input batch for the depth encoder.
        """
        fused = torch.cat(
            [self.backbone(images), self.depth_encoder(depth_maps)],
            dim=1,
        )
        outputs = {}
        for target in self.target_names:
            # Each head ends in Linear(64, 1); drop that trailing dimension
            outputs[target] = self.heads[target](fused).squeeze(-1)
        return outputs


print("DINOv2DepthFusionModel defined")

## Dataset and Transforms

In [None]:
class BiomassTestDataset(Dataset):
    """Dataset yielding one item per unique image referenced in a CSV.

    The CSV must provide ``sample_id`` and ``image_path`` columns. The image id
    is the part of ``sample_id`` before the first ``'__'``.
    """

    def __init__(self, csv_path, img_dir, transform=None):
        """Read the CSV and index the images it references.

        Args:
            csv_path: path of the CSV to read.
            img_dir: directory that holds the image files.
            transform: optional callable invoked as ``transform(image=arr)``
                and returning a mapping with an ``'image'`` entry.
        """
        self.img_dir = Path(img_dir)
        self.transform = transform

        frame = pd.read_csv(csv_path)
        frame['image_id'] = frame['sample_id'].str.split('__').str[0]
        self.df = frame

        # Unique ids in order of first appearance, and the first path per id
        self.image_ids = frame['image_id'].unique()
        first_path_per_image = frame.groupby('image_id')['image_path'].first()
        self.image_paths = first_path_per_image.to_dict()

    def __len__(self):
        """Number of unique images."""
        return len(self.image_ids)

    def __getitem__(self, idx):
        """Load image ``idx`` as RGB and return it with its id."""
        image_id = self.image_ids[idx]

        # Only the file name from the CSV path is used; the directory always
        # comes from ``img_dir``.
        file_name = Path(self.image_paths[image_id]).name
        pil_image = Image.open(self.img_dir / file_name).convert('RGB')
        image = np.array(pil_image)

        if self.transform:
            image = self.transform(image=image)['image']
        return {'image': image, 'image_id': image_id}

# Simple transform (no TTA for now): resize to a square, normalise with the
# ImageNet statistics, convert to a tensor.
_val_steps = [
    A.Resize(CONFIG['image_size'], CONFIG['image_size']),
    A.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ToTensorV2(),
]
val_transform = A.Compose(_val_steps)

print("Dataset class defined")

## Get Target Statistics

In [None]:
# Reshape the long-format training table into one row per image and one
# column per target.
train_df = pd.read_csv(TRAIN_CSV)
train_df['image_id'] = train_df['sample_id'].str.split('__').str[0]
train_wide = train_df.pivot_table(
    index='image_id',
    columns='target_name',
    values='target',
    aggfunc='first',
)


def _summarise(column):
    """Return mean and (epsilon-padded) std of one target column."""
    values = column.values
    # np.std uses ddof=0 by default; the epsilon guards later divisions/multiplications
    return {'mean': float(np.mean(values)), 'std': float(np.std(values)) + 1e-8}


target_stats = {name: _summarise(train_wide[name]) for name in TARGET_NAMES}

print("Target stats:")
for name, stats in target_stats.items():
    print(f"  {name:<15} mean: {stats['mean']:>8.2f}  std: {stats['std']:>8.2f}")

## Pre-compute Depth Maps

In [None]:
# Run the depth network once over the whole test set and keep, per image id,
# the normalised RGB tensor and its depth map on the CPU. The fold models
# below reuse these caches, so the depth network can be released afterwards.
print("Pre-computing depth maps...")

dataset = BiomassTestDataset(TEST_CSV, TEST_IMG_DIR, transform=val_transform)
loader = DataLoader(dataset, batch_size=CONFIG['batch_size'], shuffle=False, num_workers=0)

depth_maps_cache = {}
images_cache = {}

with torch.no_grad():
    for batch in tqdm(loader, desc="Computing depth"):
        cpu_images = batch['image']
        image_ids = batch['image_id']

        images = cpu_images.to(DEVICE)
        depth_maps = shared_depth_model(images)

        for i, img_id in enumerate(image_ids):
            # Slices keep a leading batch dimension of 1 so they can be fed
            # to a model directly.
            depth_maps_cache[img_id] = depth_maps[i:i+1].cpu()
            # The loader already delivers the images on the CPU; cache that
            # copy instead of transferring it back from the device.
            images_cache[img_id] = cpu_images[i:i+1]

# Drop the references to the last batch's device tensors, otherwise their
# memory stays allocated and cannot be returned by empty_cache().
images = None
depth_maps = None

# Free depth model: collect garbage first so that empty_cache() can actually
# release the blocks that belonged to the deleted model.
del shared_depth_model
gc.collect()
torch.cuda.empty_cache()

print(f"Pre-computed depth for {len(depth_maps_cache)} images")

## Load Models and Generate Predictions

In [None]:
def denormalize(pred_dict, stats):
    """Map standardised predictions back to the original target scale.

    Args:
        pred_dict: mapping of target name to standardised prediction.
        stats: mapping of target name to a dict with ``'mean'`` and ``'std'``.

    Returns:
        New dict with ``value * std + mean`` for every entry of ``pred_dict``.
    """
    restored = {}
    for name, val in pred_dict.items():
        entry = stats[name]
        restored[name] = (val * entry['std']) + entry['mean']
    return restored

# Run every fold model over the cached images and collect de-normalised
# predictions: all_fold_predictions[fold][image_id][target_name] -> float.
all_fold_predictions = []
image_ids_order = list(images_cache.keys())

for fold_idx in range(N_FOLDS):
    print(f"\nFold {fold_idx + 1}/{N_FOLDS}...")

    model = DINOv2DepthFusionModel(
        backbone_name=CONFIG['backbone'],
        num_features=CONFIG['features'],
        dropout=CONFIG['dropout']
    ).to(DEVICE)

    checkpoint_path = Path(MODEL_BASE) / f'fold_{fold_idx}' / 'best_model.pth'
    # NOTE(security): weights_only=False unpickles arbitrary Python objects.
    # Only load checkpoints from a trusted source.
    # The checkpoint is loaded onto the CPU so that entries the model does not
    # use never occupy device memory; load_state_dict() copies the needed
    # tensors into the model's device parameters.
    checkpoint = torch.load(checkpoint_path, map_location='cpu', weights_only=False)

    # Load weights (skip depth_estimator): keep only the entries this model
    # defines and ignore anything extra stored in the checkpoint.
    state_dict = checkpoint['model_state_dict']
    expected_keys = list(model.state_dict().keys())

    # A parameter absent from the checkpoint would silently keep its random
    # initialisation and corrupt the predictions, so fail loudly instead.
    missing_keys = [key for key in expected_keys if key not in state_dict]
    if missing_keys:
        raise RuntimeError(
            f"Checkpoint {checkpoint_path} lacks {len(missing_keys)} model "
            f"parameter(s), e.g. {missing_keys[:5]}"
        )

    model.load_state_dict({key: state_dict[key] for key in expected_keys})
    model.eval()
    print(f"  Loaded (Val Loss: {checkpoint['best_val_loss']:.4f})")

    # The weights now live in the model; release the loaded copy.
    del checkpoint, state_dict

    fold_preds = {}

    with torch.no_grad():
        for img_id in tqdm(image_ids_order, desc=f"Fold {fold_idx + 1}"):
            images = images_cache[img_id].to(DEVICE)
            depth_maps = depth_maps_cache[img_id].to(DEVICE)

            pred = model(images, depth_maps)
            # Cached tensors carry a batch dimension of 1; take its only entry
            pred_dict = {name: pred[name][0].item() for name in TARGET_NAMES}
            pred_denorm = denormalize(pred_dict, target_stats)
            fold_preds[img_id] = pred_denorm

    all_fold_predictions.append(fold_preds)

    del model
    torch.cuda.empty_cache()

print(f"\nGenerated predictions from {N_FOLDS} folds")

## Ensemble and Apply Constraints

In [None]:
# Average across folds: one dict of target -> mean prediction per image
ensemble_predictions = {
    img_id: {
        name: np.mean([fp[img_id][name] for fp in all_fold_predictions])
        for name in TARGET_NAMES
    }
    for img_id in image_ids_order
}

# Apply biological constraints (in place on each image's prediction dict)
print("Applying constraints...")
component_names = ('Dry_Clover_g', 'Dry_Dead_g', 'Dry_Green_g')

for img_id in image_ids_order:
    pred = ensemble_predictions[img_id]

    # Clip negatives
    for name in TARGET_NAMES:
        pred[name] = max(0.0, pred[name])

    clover, dead, green = (pred[component] for component in component_names)
    total = pred['Dry_Total_g']

    # Enforce: Total = Clover + Dead + Green.
    # The reconciled total is the midpoint between the predicted total and the
    # sum of the predicted components; the components are then rescaled so
    # that they add up to it.
    component_sum = clover + dead + green
    new_total = (total + component_sum) / 2

    # NOTE(review): when all components are 0 they stay 0 while the total
    # becomes total / 2, so the equality above does not hold in that case --
    # confirm this is intended.
    if component_sum > 0:
        scale = new_total / component_sum
        for component, value in zip(component_names, (clover, dead, green)):
            pred[component] = value * scale
    pred['Dry_Total_g'] = new_total

print("\nPredictions summary:")
for name in TARGET_NAMES:
    vals = [ensemble_predictions[img_id][name] for img_id in image_ids_order]
    print(f"  {name:<15} mean: {np.mean(vals):>8.2f}  min: {np.min(vals):>8.2f}  max: {np.max(vals):>8.2f}")

## Create Submission

In [None]:
# Build the submission: one row per test sample, in the order of the test CSV.
test_df = pd.read_csv(TEST_CSV)

# Iterate over the two needed columns directly instead of DataFrame.iterrows(),
# which builds a Series object for every row. The image id is the part of
# sample_id before the first '__'.
submission_targets = [
    ensemble_predictions[sample_id.split('__')[0]][target_name]
    for sample_id, target_name in zip(test_df['sample_id'], test_df['target_name'])
]

# Passing the columns explicitly guarantees the header is written even when
# the test CSV contains no rows.
submission_df = pd.DataFrame(
    {
        'sample_id': test_df['sample_id'].tolist(),
        'target': submission_targets,
    },
    columns=['sample_id', 'target'],
)
submission_df.to_csv('submission.csv', index=False)

print("Submission created!")
print(f"Shape: {submission_df.shape}")
print(submission_df.head(10))

In [None]:
# Closing banner. The validation-loss line is a fixed string, not a value
# computed in this notebook.
summary_lines = [
    "\n" + "="*70,
    "V8: DINOv2 + Depth Fusion",
    f"Backbone: {CONFIG['backbone']}",
    f"Image size: {CONFIG['image_size']}",
    f"Ensemble: {N_FOLDS}-fold",
    "Val Loss: 2.08 +/- 0.99",
    "Submission file ready: submission.csv",
    "="*70,
]
print(*summary_lines, sep="\n")