# CSIRO Image2Biomass - V4: 5-Fold EfficientNetV2-M Ensemble (MSE Loss)

This notebook generates predictions using a **5-fold ensemble** of EfficientNetV2-M models:
- **Loss function**: MSE (better aligned with R^2 metric)
- Mean validation loss: 1.95 +/- 0.36 (MSE)
- Image size: 512x512
- Conservative augmentation during training
- Ensemble reduces variance and improves robustness

## Setup Instructions
1. Add the model dataset (image2biomass-efficientnetv2-kfold-mse)
2. Add the competition data
3. **Set Internet to OFF** (required for submission)
4. Run all cells to generate `submission.csv`

## 1. Imports

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from pathlib import Path
from tqdm import tqdm
import timm
import albumentations as A
from albumentations.pytorch import ToTensorV2

# Log the PyTorch and timm versions and whether PyTorch can see a CUDA device.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"timm version: {timm.__version__}")

## 2. Configuration

In [None]:
# --- Competition data locations ---
TEST_CSV = '/kaggle/input/csiro-biomass/test.csv'
TEST_IMG_DIR = '/kaggle/input/csiro-biomass/test'
TRAIN_CSV = '/kaggle/input/csiro-biomass/train.csv'
TRAIN_IMG_DIR = '/kaggle/input/csiro-biomass/train'

# --- Root directory of the trained fold checkpoints, and how many folds ---
CHECKPOINT_BASE = '/kaggle/input/image2biomass-efficientnetv2-kfold-mse/pytorch/default/1/checkpoints_kfold_mse_kaggle'
N_FOLDS = 5

# --- Names of the regression targets the model predicts ---
TARGET_NAMES = ['Dry_Clover_g', 'Dry_Dead_g', 'Dry_Green_g', 'Dry_Total_g', 'GDM_g']

# --- Model / inference settings (MSE loss version) ---
CONFIG = dict(
    backbone='tf_efficientnetv2_m',
    image_size=512,
    batch_size=16,
    num_workers=0,  # Set to 0 for Kaggle
    dropout=0.5,
    head_hidden_dim=512,
)

# --- Per-channel ImageNet mean / std used for input normalization ---
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

# Run on the GPU when PyTorch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
print(f"Using device: {DEVICE}")

## 3. Model Architecture (EfficientNetV2-M with timm)

In [None]:
class MultiTaskEfficientNet(nn.Module):
    """Shared timm backbone feeding one small regression head per target.

    The backbone is created without a classification layer and with average
    pooling. Every name in the module-level ``TARGET_NAMES`` gets its own
    two-layer head, stored in ``self.heads`` under that name.

    Note: ``num_targets`` is accepted but never read; the number of heads is
    determined by ``TARGET_NAMES``.
    """

    def __init__(self, backbone='tf_efficientnetv2_m', num_targets=5,
                 dropout=0.5, head_hidden_dim=512, pretrained=True):
        super().__init__()

        # num_classes=0 asks timm for the feature extractor only.
        self.backbone = timm.create_model(
            backbone,
            pretrained=pretrained,
            num_classes=0,
            global_pool='avg',
        )
        feature_dim = self.backbone.num_features

        # Build the heads in TARGET_NAMES order, one per target.
        heads = {}
        for target_name in TARGET_NAMES:
            heads[target_name] = self._make_head(feature_dim, head_hidden_dim, dropout)
        self.heads = nn.ModuleDict(heads)

    def _make_head(self, in_features, hidden_dim, dropout):
        """Return a Linear -> ReLU -> Dropout -> Linear(1) head."""
        layers = [
            nn.Linear(in_features, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, 1),
        ]
        return nn.Sequential(*layers)

    def forward(self, x):
        """Return a dict mapping each target name to its head's output.

        The trailing size-1 dimension of every head output is squeezed away.
        """
        features = self.backbone(x)

        outputs = {}
        for target_name in TARGET_NAMES:
            head = self.heads[target_name]
            outputs[target_name] = head(features).squeeze(-1)
        return outputs

print("Model architecture defined")

## 4. Dataset Class (with Albumentations)

In [None]:
class BiomassTestDataset(Dataset):
    """Image-level inference dataset driven by a CSV of sample rows.

    Rows are collapsed to one entry per image: the image id is the part of
    ``sample_id`` before the ``'__'`` separator, and the first ``image_path``
    seen for an id is the one used. Each item is a dict holding the
    (optionally transformed) image and its id.
    """

    def __init__(self, csv_path, img_dir, transform=None, target_stats=None):
        self.csv_path = csv_path
        self.img_dir = Path(img_dir)
        self.transform = transform
        self.target_stats = target_stats  # stored only; not read by this class

        # Read the CSV and derive the per-row image id.
        frame = pd.read_csv(csv_path)
        frame['image_id'] = frame['sample_id'].str.split('__').str[0]
        self.df = frame

        # Distinct ids in order of first appearance, plus an id -> path lookup.
        self.image_ids = frame['image_id'].unique()
        first_paths = frame.groupby('image_id')['image_path'].first()
        self.image_paths = first_paths.to_dict()

    def __len__(self):
        """Number of distinct images."""
        return len(self.image_ids)

    def __getitem__(self, idx):
        """Load image ``idx`` as RGB and return ``{'image', 'image_id'}``."""
        image_id = self.image_ids[idx]

        # Only the file name of the CSV path is used; the directory comes
        # from ``img_dir``.
        image_path = self.img_dir / Path(self.image_paths[image_id]).name

        # Albumentations works on numpy arrays, so convert right after loading.
        image = np.array(Image.open(image_path).convert('RGB'))

        if self.transform is not None:
            image = self.transform(image=image)['image']

        return {'image': image, 'image_id': image_id}

print("Dataset class defined (Albumentations)")

## 5. Get Target Statistics from Training Data

In [None]:
# Read the long-format training labels and derive the image id, which is the
# part of sample_id before the '__' separator.
train_df = pd.read_csv(TRAIN_CSV)
train_df['image_id'] = train_df['sample_id'].str.split('__').str[0]

# Reshape to one row per image and one column per target.
train_wide = train_df.pivot_table(
    index='image_id', columns='target_name', values='target', aggfunc='first'
)

# Per-target mean and (population) standard deviation; the small epsilon keeps
# the std strictly positive.
target_stats = {
    name: {
        'mean': float(np.mean(train_wide[name].values)),
        'std': float(np.std(train_wide[name].values)) + 1e-8,
    }
    for name in TARGET_NAMES
}

print("Target normalization statistics:")
for name, stat in target_stats.items():
    print(f"  {name:<20} mean: {stat['mean']:>8.2f}  std: {stat['std']:>8.2f}")

## 6. Create Test Dataset and DataLoader

In [None]:
# Deterministic preprocessing: resize to a square, normalize with the ImageNet
# statistics, convert to a tensor. (Stated by the notebook to match training.)
side = CONFIG['image_size']
val_transform = A.Compose([
    A.Resize(side, side),
    A.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ToTensorV2(),
])

# One dataset item per distinct test image.
test_dataset = BiomassTestDataset(
    TEST_CSV,
    TEST_IMG_DIR,
    transform=val_transform,
    target_stats=target_stats,
)

# shuffle=False keeps batches in dataset order.
loader_options = dict(
    batch_size=CONFIG['batch_size'],
    shuffle=False,
    num_workers=CONFIG['num_workers'],
    pin_memory=False,
)
test_loader = DataLoader(test_dataset, **loader_options)

print(f"Test dataset: {len(test_dataset)} samples")
print(f"Test batches: {len(test_loader)}")

## 7. Load All Fold Models

In [None]:
# Build one model per fold and restore its weights from
# <CHECKPOINT_BASE>/fold_<k>/best_model.pth.
fold_models = []

# Constructor arguments are the same for every fold; pretrained=False because
# the weights come from the checkpoint.
model_kwargs = dict(
    backbone=CONFIG['backbone'],
    num_targets=len(TARGET_NAMES),
    dropout=CONFIG['dropout'],
    head_hidden_dim=CONFIG['head_hidden_dim'],
    pretrained=False,
)
checkpoint_root = Path(CHECKPOINT_BASE)

print(f"Loading {N_FOLDS} fold models (MSE loss)...")
for fold_idx in range(N_FOLDS):
    checkpoint_path = checkpoint_root / f'fold_{fold_idx}' / 'best_model.pth'

    print(f"\nFold {fold_idx + 1}/{N_FOLDS}:")
    print(f"  Loading from: {checkpoint_path}")

    model = MultiTaskEfficientNet(**model_kwargs).to(DEVICE)

    # NOTE: weights_only=False uses the full pickle loader, which can execute
    # arbitrary code; only load checkpoints from a trusted source.
    ckpt = torch.load(checkpoint_path, map_location=DEVICE, weights_only=False)
    model.load_state_dict(ckpt['model_state_dict'])
    model.eval()  # inference mode for dropout / batch-norm layers

    print(f"  Epoch: {ckpt['epoch']}")
    print(f"  Val Loss (MSE): {ckpt['best_val_loss']:.4f}")

    fold_models.append(model)

print(f"\nSuccessfully loaded {len(fold_models)} models")

n_params = sum(param.numel() for param in fold_models[0].parameters())
print(f"Parameters per model: {n_params:,}")

## 8. Generate Ensemble Predictions

In [None]:
def denormalize_predictions(pred_dict, target_stats):
    """Map standardized predictions back to the original target scale.

    Each value in ``pred_dict`` is multiplied by its target's ``'std'`` and
    shifted by its ``'mean'``, both looked up in ``target_stats`` under the
    same key. Returns a new dict; ``pred_dict`` is not modified.
    """
    return {
        name: (z * target_stats[name]['std']) + target_stats[name]['mean']
        for name, z in pred_dict.items()
    }

def enforce_constraint(predictions, method='average'):
    """Enforce constraint: Dry_Total = Dry_Clover + Dry_Dead + Dry_Green.

    Args:
        predictions: Mapping of image id -> dict of target name -> value.
            Every inner dict must contain 'Dry_Clover_g', 'Dry_Dead_g',
            'Dry_Green_g' and 'Dry_Total_g'; other keys are passed through
            untouched.
        method: Reconciliation strategy. Only 'average' is implemented: the
            new total is the mean of the predicted total and the component
            sum, and the three components are rescaled proportionally so
            they add up to it. Any other value returns unmodified copies.

    Returns:
        A new dict with the same image ids. Inner dicts are copies, so the
        input is never mutated.

    With 'average', if the component sum or the reconciled total is not
    positive there is nothing that can be distributed proportionally without
    producing negative values, so all three components and the total are set
    to 0.0. This keeps the constraint satisfied in that case too.
    """
    component_names = ('Dry_Clover_g', 'Dry_Dead_g', 'Dry_Green_g')
    enforced = {}

    for image_id, pred_dict in predictions.items():
        pred = pred_dict.copy()

        clover = pred['Dry_Clover_g']
        dead = pred['Dry_Dead_g']
        green = pred['Dry_Green_g']
        total = pred['Dry_Total_g']

        component_sum = clover + dead + green

        if method == 'average':
            # Average the predicted total and sum of components
            new_total = (total + component_sum) / 2

            if component_sum > 0 and new_total > 0:
                # Distribute discrepancy proportionally
                scale = new_total / component_sum
                pred['Dry_Clover_g'] = clover * scale
                pred['Dry_Dead_g'] = dead * scale
                pred['Dry_Green_g'] = green * scale
                pred['Dry_Total_g'] = new_total
            else:
                # Zero the components as well as the total; zeroing only the
                # total would leave total != sum(components), and a negative
                # scale would flip the sign of every component.
                for name in component_names:
                    pred[name] = 0.0
                pred['Dry_Total_g'] = 0.0

        enforced[image_id] = pred

    return enforced

# Run every fold model over the whole test set. Each fold produces a dict of
# image_id -> {target_name: value}, with values already mapped back to the
# original target scale via denormalize_predictions.
print("Generating predictions from ensemble...")
all_fold_predictions = []

for fold_idx, model in enumerate(fold_models):
    print(f"\nFold {fold_idx + 1}/{N_FOLDS}:")
    fold_preds = {}

    with torch.no_grad():
        for batch in tqdm(test_loader, desc=f"Fold {fold_idx + 1}"):
            images = batch['image'].to(DEVICE)
            image_ids = batch['image_id']

            # Get predictions
            pred = model(images)

            # Move each target's outputs to the host once per batch. Calling
            # .cpu().item() per image and per target would force a separate
            # device synchronisation for every scalar; .tolist() yields the
            # same Python floats in a single transfer.
            batch_values = {
                target_name: pred[target_name].cpu().tolist()
                for target_name in TARGET_NAMES
            }

            # Store predictions for each image
            for i, image_id in enumerate(image_ids):
                pred_dict = {
                    target_name: batch_values[target_name][i]
                    for target_name in TARGET_NAMES
                }
                # Denormalize
                pred_dict = denormalize_predictions(pred_dict, target_stats)
                fold_preds[image_id] = pred_dict

    all_fold_predictions.append(fold_preds)
    print(f"  Generated {len(fold_preds)} predictions")

# Average predictions across folds
print("\nAveraging predictions across folds...")
ensemble_predictions = {}

# Every fold saw the same loader, so the first fold's keys cover all images.
all_image_ids = list(all_fold_predictions[0].keys())

for image_id in all_image_ids:
    ensemble_pred = {}
    for target_name in TARGET_NAMES:
        # Collect predictions from all folds
        fold_values = [fold_preds[image_id][target_name] for fold_preds in all_fold_predictions]
        # Average them
        ensemble_pred[target_name] = np.mean(fold_values)
    ensemble_predictions[image_id] = ensemble_pred

print(f"Generated ensemble predictions for {len(ensemble_predictions)} images")

# Apply constraint enforcement
print("\nApplying constraint enforcement...")
ensemble_predictions = enforce_constraint(ensemble_predictions, method='average')

# Check constraint violations: absolute gap between the total and the sum of
# its three components, per image.
violations = []
for image_id, pred in ensemble_predictions.items():
    total = pred['Dry_Total_g']
    component_sum = pred['Dry_Clover_g'] + pred['Dry_Dead_g'] + pred['Dry_Green_g']
    violation = abs(total - component_sum)
    violations.append(violation)

print("Constraint violations:")
print(f"  Mean: {np.mean(violations):.6f}g")
print(f"  Max: {np.max(violations):.6f}g")
print(f"  All exact: {all(v < 1e-6 for v in violations)}")

## 9. Create Submission File

In [None]:
# Re-read test.csv so the submission follows its row order exactly.
test_df = pd.read_csv(TEST_CSV)

# One output row per test row: the image id is the part of sample_id before
# '__', and the value is that image's ensemble prediction for the row's target.
submission_rows = [
    {
        'sample_id': sample_id,
        'target': ensemble_predictions[sample_id.split('__')[0]][target_name],
    }
    for sample_id, target_name in zip(test_df['sample_id'], test_df['target_name'])
]

# Write the two-column submission file without the index.
submission_df = pd.DataFrame(submission_rows)
submission_df.to_csv('submission.csv', index=False)

print("Submission file created!")
print(f"Shape: {submission_df.shape}")
print("\nFirst few predictions:")
print(submission_df.head(10))
print("\nSummary statistics:")
print(submission_df['target'].describe())

## 10. Display Predictions Summary

In [None]:
divider = "=" * 70

# Mean / min / max of the ensemble prediction for each target over all images.
print("\nEnsemble predictions by target:")
for target_name in TARGET_NAMES:
    values = [per_image[target_name] for per_image in ensemble_predictions.values()]
    print(f"  {target_name:<20} mean: {np.mean(values):>8.2f}  "
          f"min: {np.min(values):>8.2f}  max: {np.max(values):>8.2f}")

# For the first image, show each fold's prediction next to the ensemble value
# so the spread between folds is visible.
print("\n" + divider)
print("Individual fold predictions (for first image):")
print(divider)
first_image_id = list(ensemble_predictions)[0]
for target_name in TARGET_NAMES:
    print(f"\n{target_name}:")
    for fold_idx in range(N_FOLDS):
        fold_value = all_fold_predictions[fold_idx][first_image_id][target_name]
        print(f"  Fold {fold_idx + 1}: {fold_value:>8.2f}")
    ensemble_value = ensemble_predictions[first_image_id][target_name]
    print(f"  Ensemble: {ensemble_value:>8.2f}")

print("\n" + divider)
print("Submission file ready: submission.csv")
print("5-Fold Ensemble with EfficientNetV2-M (MSE Loss)")
print("Ready to submit with Internet OFF!")
print(divider)