# Import necessary libraries

Import all required packages for the biomass prediction model including PyTorch for deep learning, pandas for data handling, PIL for image processing, and scikit-learn for data splitting.

**Note:** If Optuna is not installed, run: `pip install optuna optuna-dashboard` (or `!pip install optuna` in a notebook cell)


# Analyze Image Sizes in Dataset

Define a function to scan through image directories and analyze the distribution of image dimensions. This helps understand the dataset structure and ensures proper image sizing for the model. Displays results in a formatted table showing width, height, and quantity of each size.


In [1]:
# Install required packages from requirements.txt
# This ensures all dependencies are installed before running the notebook
import sys
import subprocess

def install_requirements():
    """Install packages from requirements.txt"""
    try:
        # Try to read requirements.txt
        with open('requirements.txt', 'r') as f:
            requirements = f.read()
        print("📦 Installing packages from requirements.txt...")
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', '-r', 'requirements.txt'])
        print("✅ All packages installed successfully!")
    except FileNotFoundError:
        print("⚠️  requirements.txt not found. Installing core packages individually...")
        packages = [
            'torch>=2.0.0',
            'torchvision>=0.15.0',
            'numpy>=1.21.0',
            'pandas>=1.3.0',
            'Pillow>=9.0.0',
            'scikit-learn>=1.0.0',
            'matplotlib>=3.5.0',
            'seaborn>=0.11.0',
            'optuna>=3.0.0'
        ]
        for package in packages:
            subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', package])
        print("✅ Core packages installed!")
    except Exception as e:
        print(f"❌ Error installing packages: {e}")
        print("Please install packages manually or check your internet connection.")

# Install packages
install_requirements()

📦 Installing packages from requirements.txt...
✅ All packages installed successfully!


# Environment Configuration

Detect whether running on Kaggle or locally and set up data paths accordingly.


In [2]:
# Detect environment and set data paths
import os

# Check if running on Kaggle
IS_KAGGLE = os.path.exists('/kaggle') or os.getenv('KAGGLE_KERNEL_TYPE') is not None

if IS_KAGGLE:
    # Kaggle paths
    DATA_DIR = '/kaggle/input/csiro-biomass'
    TRAIN_DIR = '/kaggle/input/csiro-biomass/train'
    TEST_DIR = '/kaggle/input/csiro-biomass/test'
    TRAIN_CSV = '/kaggle/input/csiro-biomass/train.csv'
    TEST_CSV = '/kaggle/input/csiro-biomass/test.csv'
    print("🔵 Running on Kaggle")
else:
    # Local paths - adjust these to match your local data structure
    # Assuming data is in ./data/ directory relative to notebook
    DATA_DIR = '.'
    TRAIN_DIR = './train'
    TEST_DIR = './test'
    TRAIN_CSV = './train.csv'
    TEST_CSV = './test.csv'
    print("🟢 Running locally")
    
print(f"Data directory: {DATA_DIR}")
print(f"Train CSV: {TRAIN_CSV}")
print(f"Test CSV: {TEST_CSV}")


🟢 Running locally
Data directory: .
Train CSV: ./train.csv
Test CSV: ./test.csv


In [3]:
import os
import torch
import numpy as np
import pandas as pd
import seaborn as sns
import torch.nn as nn
import matplotlib.pyplot as plt
import torchvision.transforms as transforms
import optuna

from PIL import Image
from collections import defaultdict
from torchvision.models import resnet34
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


# Load training data

Load the training CSV file which contains metadata about the images including image paths, target values, and target names (biomass types).


In [4]:
def get_unique_sizes(directory):
    size_counts = defaultdict(int)
    if not os.path.exists(directory):
        return size_counts
    for root, _, files in os.walk(directory):
        for file in files:
            if file.lower().endswith(('.png', '.jpg', '.jpeg', 'JPG')):
                try:
                    with Image.open(os.path.join(root, file)) as img:
                        size = img.size
                        size_counts[size] += 1
                except Exception as e:
                    print(f"Error {file}: {e}")

    return size_counts

folders = [
    TRAIN_DIR,
    TEST_DIR,
]

for folder in folders:
    print(f"\n📂 Folder: {folder}")
    if not os.path.exists(folder):
        print(f"⚠️  Directory does not exist: {folder}")
        continue
    sizes = get_unique_sizes(folder)

    if not sizes:
        print("No images or mistake in code")
        continue
    
    sorted_sizes = sorted(sizes.items(), key=lambda x: x[1], reverse=True)

    print("┌───────────────┬───────────────┬─────────┐")
    print("│  Width (px)  │ Height (px)  │Quantity│")
    print("├───────────────┼───────────────┼─────────┤")
    for (w, h), count in sorted_sizes:
        print(f"│ {w:<13} │ {h:<13} │ {count:<7} │")
    print("└───────────────┴───────────────┴─────────┘")


📂 Folder: ./train
┌───────────────┬───────────────┬─────────┐
│  Width (px)  │ Height (px)  │Quantity│
├───────────────┼───────────────┼─────────┤
│ 2000          │ 1000          │ 357     │
└───────────────┴───────────────┴─────────┘

📂 Folder: ./test
┌───────────────┬───────────────┬─────────┐
│  Width (px)  │ Height (px)  │Quantity│
├───────────────┼───────────────┼─────────┤
│ 2000          │ 1000          │ 1       │
└───────────────┴───────────────┴─────────┘


# Custom Dataset Class and Data Transforms

Define a custom PyTorch Dataset class that:
- Loads images from paths in the dataframe
- Applies transformations (resizing, augmentation for training)
- Maps different target types (biomass categories) to indices
- Returns images with their corresponding target values and types

Also defines two transform pipelines:
- `train_transform`: Includes data augmentation (random flips, color jitter) for training
- `val_transform`: Only resizing and normalization for validation/testing


# Model Architecture: Multi-Head ResNet34

Define a ResNet34-based model with multiple output heads (one for each target type):
- Uses ResNet34 as the feature extraction backbone
- Extracts shared features through fully connected layers
- Has 5 separate heads (one for each biomass type: Dry_Green_g, Dry_Dead_g, Dry_Clover_g, GDM_g, Dry_Total_g)
- Each head outputs a single value (regression prediction)
- The model selects which head to use based on the target_type during training


# Training Function

Define the model training loop that:
- Trains the model for specified number of epochs
- Iterates through training batches, computes loss, and updates weights
- Evaluates on validation set after each epoch
- Tracks and returns training and validation losses
- Uses the appropriate model head based on target_type for each sample


# Split Data into Train and Validation Sets

Split the training data into train (80%) and validation (20%) sets while maintaining stratification by target_name to ensure balanced distribution of biomass types in both sets.


# Create Datasets and Data Loaders

Instantiate the training and validation datasets with their respective transforms, then create DataLoader objects for efficient batch loading during training. Also determine and set the computation device (GPU if available, else CPU).


# Initialize Model and Train

Create the ResNet34 model instance, define loss function (MSE for regression), optimizer (Adam), and learning rate scheduler. Then train the model for the specified number of epochs.


# Visualize Training History

Plot the training and validation loss curves to visualize model performance over epochs and check for overfitting or convergence.


# Generate Predictions on Test Set

Load the test dataset and make predictions:
- Load test CSV and create test dataset with validation transforms
- Run inference on test images
- For each sample, select the appropriate output head based on the target_name
- Collect predictions and corresponding sample IDs


# Create Submission File

Format the predictions into a submission CSV file with sample_id and target columns, and save it for competition submission.


In [5]:
train = pd.read_csv(TRAIN_CSV)
train.head()

Unnamed: 0,sample_id,image_path,Sampling_Date,State,Species,Pre_GSHH_NDVI,Height_Ave_cm,target_name,target
0,ID1011485656__Dry_Clover_g,train/ID1011485656.jpg,2015/9/4,Tas,Ryegrass_Clover,0.62,4.6667,Dry_Clover_g,0.0
1,ID1011485656__Dry_Dead_g,train/ID1011485656.jpg,2015/9/4,Tas,Ryegrass_Clover,0.62,4.6667,Dry_Dead_g,31.9984
2,ID1011485656__Dry_Green_g,train/ID1011485656.jpg,2015/9/4,Tas,Ryegrass_Clover,0.62,4.6667,Dry_Green_g,16.2751
3,ID1011485656__Dry_Total_g,train/ID1011485656.jpg,2015/9/4,Tas,Ryegrass_Clover,0.62,4.6667,Dry_Total_g,48.2735
4,ID1011485656__GDM_g,train/ID1011485656.jpg,2015/9/4,Tas,Ryegrass_Clover,0.62,4.6667,GDM_g,16.275


In [6]:
class DatasetCS(Dataset):
    def __init__(self, df, images_dir, transform=None, is_test=False):
        self.df = df
        self.images_dir = images_dir
        self.transform = transform
        self.is_test = is_test
        
        if not is_test:
            self.target_mapping = {
                'Dry_Green_g': 0, 'Dry_Dead_g': 1, 'Dry_Clover_g': 2,
                'GDM_g': 3, 'Dry_Total_g': 4
            }
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        image_path = os.path.join(self.images_dir, row['image_path'])
        
        image = Image.open(image_path).convert('RGB')
        
        if self.transform:
            image = self.transform(image)
        
        if self.is_test:
            return image, row['sample_id']
        else:
            target_value = row['target']
            target_type = self.target_mapping[row['target_name']]
            return image, torch.tensor(target_value, dtype=torch.float32), target_type

train_transform = transforms.Compose([
    transforms.Resize((500, 250)),
    transforms.RandomHorizontalFlip(p=0.3),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

val_transform = transforms.Compose([
    transforms.Resize((500, 250)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

In [7]:
class ResNet34(nn.Module):
    def __init__(self, num_targets=5, hidden1=512, hidden2=256, head_hidden=128, 
                 dropout_shared=0.3, dropout_head=0.2):
        super(ResNet34, self).__init__()
        self.backbone = resnet34(weights=None)
        self.backbone.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        
        in_features = self.backbone.fc.in_features
        self.backbone.fc = nn.Identity()
        
        self.shared_features = nn.Sequential(
            nn.Linear(in_features, hidden1),
            nn.BatchNorm1d(hidden1),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout_shared),
            nn.Linear(hidden1, hidden2),
            nn.BatchNorm1d(hidden2),
            nn.ReLU(inplace=True),
        )
        
        self.heads = nn.ModuleList([
            nn.Sequential(
                nn.Dropout(dropout_head),
                nn.Linear(hidden2, head_hidden),
                nn.ReLU(inplace=True),
                nn.Linear(head_hidden, 1)
            ) for _ in range(num_targets)
        ])
        
        self._initialize_weights()
    
    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.constant_(m.bias, 0)
    
    def forward(self, x, target_type=None):
        features = self.backbone(x)
        shared_out = self.shared_features(features)
        
        if target_type is not None:
            outputs = []
            for i, t_type in enumerate(target_type):
                outputs.append(self.heads[t_type](shared_out[i].unsqueeze(0)))
            return torch.cat(outputs, dim=0)
        else:
            all_outputs = [head(shared_out) for head in self.heads]
            return torch.cat(all_outputs, dim=1)

In [8]:
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=15, 
                scheduler=None, verbose=True, return_best_val_loss=False):
    train_losses = []
    val_losses = []
    best_val_loss = float('inf')
    
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        
        for images, targets, target_types in train_loader:
            images = images.to(device)
            targets = targets.to(device)
            target_types = target_types.to(device)
            
            optimizer.zero_grad()
            outputs = model(images, target_types).squeeze()
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item()
        
        model.eval()
        val_loss = 0.0
        
        with torch.no_grad():
            for images, targets, target_types in val_loader:
                images = images.to(device)
                targets = targets.to(device)
                target_types = target_types.to(device)
                
                outputs = model(images, target_types).squeeze()
                loss = criterion(outputs, targets)
                val_loss += loss.item()
        
        train_loss /= len(train_loader)
        val_loss /= len(val_loader)
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        
        if val_loss < best_val_loss:
            best_val_loss = val_loss
        
        if scheduler is not None:
            scheduler.step(val_loss)
        
        if verbose:
            print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')
    
    if return_best_val_loss:
        return train_losses, val_losses, best_val_loss
    return train_losses, val_losses

In [9]:
train_indices, val_indices = train_test_split(
    range(len(train)), 
    test_size=0.2, 
    random_state=42, 
    stratify=train['target_name']
)

train_subset = train.iloc[train_indices].reset_index(drop=True)
val_subset = train.iloc[val_indices].reset_index(drop=True)

print(f"train length: {len(train_subset)}")
print(f"val length: {len(val_subset)}")

train length: 1428
val length: 357


In [10]:
train_dataset = DatasetCS(train_subset, DATA_DIR, transform=train_transform)
val_dataset = DatasetCS(val_subset, DATA_DIR, transform=val_transform)

# Adjust num_workers for local runs (0 for local to avoid multiprocessing issues, 2 for Kaggle)
# Using 0 for local runs avoids DataLoader worker errors on macOS/Windows
num_workers = 0 if not IS_KAGGLE else 2

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=num_workers)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, num_workers=num_workers)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
device

Using device: cpu


device(type='cpu')

# Hyperparameter Optimization with Optuna

Define an objective function for Optuna that creates and trains a model with different hyperparameters. Optuna will search for the best combination of learning rate, batch size, hidden layer sizes, dropout rates, and optimizer settings.


In [11]:
def objective(trial):
    """Optuna objective function to optimize hyperparameters"""
    
    # Suggest hyperparameters
    lr = trial.suggest_float('lr', 1e-5, 1e-2, log=True)
    batch_size = trial.suggest_categorical('batch_size', [8, 16, 32])
    hidden1 = trial.suggest_categorical('hidden1', [256, 512, 768])
    hidden2 = trial.suggest_categorical('hidden2', [128, 256, 512])
    head_hidden = trial.suggest_categorical('head_hidden', [64, 128, 256])
    dropout_shared = trial.suggest_float('dropout_shared', 0.1, 0.5)
    dropout_head = trial.suggest_float('dropout_head', 0.1, 0.4)
    optimizer_name = trial.suggest_categorical('optimizer', ['Adam', 'AdamW', 'RMSprop'])
    
    # Adjust num_workers for local runs (0 for local to avoid multiprocessing issues, 2 for Kaggle)
    num_workers_opt = 0 if not IS_KAGGLE else 2
    
    # Create data loaders with new batch size
    train_loader_opt = DataLoader(
        train_dataset, 
        batch_size=batch_size, 
        shuffle=True, 
        num_workers=num_workers_opt
    )
    val_loader_opt = DataLoader(
        val_dataset, 
        batch_size=batch_size, 
        shuffle=False, 
        num_workers=num_workers_opt
    )
    
    # Create model with hyperparameters
    model = ResNet34(
        num_targets=5,
        hidden1=hidden1,
        hidden2=hidden2,
        head_hidden=head_hidden,
        dropout_shared=dropout_shared,
        dropout_head=dropout_head
    ).to(device)
    
    # Create optimizer
    if optimizer_name == 'Adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    elif optimizer_name == 'AdamW':
        optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    else:
        optimizer = torch.optim.RMSprop(model.parameters(), lr=lr)
    
    criterion = nn.MSELoss()
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, patience=2, factor=0.5
    )
    
    # Train for a few epochs (fewer for optimization)
    _, _, best_val_loss = train_model(
        model,
        train_loader_opt,
        val_loader_opt,
        criterion,
        optimizer,
        num_epochs=5,  # Reduced epochs for faster optimization
        scheduler=scheduler,
        verbose=False,
        return_best_val_loss=True
    )
    
    return best_val_loss


# Run Hyperparameter Optimization

Create an Optuna study and run optimization trials. This will search for the best hyperparameters by training multiple models with different configurations and comparing their validation losses.


In [None]:
# Create Optuna study
study = optuna.create_study(
    direction='minimize',
    study_name='biomass_optimization',
    pruner=optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=3)
)

# Run optimization (adjust n_trials based on available time)
print("Starting hyperparameter optimization...")
study.optimize(objective, n_trials=20, show_progress_bar=True)

print("\nBest trial:")
trial = study.best_trial
print(f"  Value (best validation loss): {trial.value:.4f}")
print("\n  Params:")
for key, value in trial.params.items():
    print(f"    {key}: {value}")
    
# Store best parameters
best_params = trial.params


[I 2025-10-29 10:53:51,368] A new study created in memory with name: biomass_optimization


Starting hyperparameter optimization...


  0%|          | 0/20 [00:00<?, ?it/s]

# Visualize Optimization Results

Plot the optimization history and parameter importance to understand which hyperparameters have the most impact on model performance.


In [None]:
# Plot optimization history
fig1 = optuna.visualization.plot_optimization_history(study)
fig1.show()

# Plot parameter importance
fig2 = optuna.visualization.plot_param_importances(study)
fig2.show()


# Train Final Model with Best Hyperparameters

Train the model using the optimized hyperparameters found by Optuna. This should yield better performance than the manually selected hyperparameters.


In [None]:
# Create data loaders with best batch size
best_batch_size = best_params['batch_size']
# Adjust num_workers for local runs
num_workers_best = 0 if not IS_KAGGLE else 2
train_loader_best = DataLoader(
    train_dataset, 
    batch_size=best_batch_size, 
    shuffle=True, 
    num_workers=num_workers_best
)
val_loader_best = DataLoader(
    val_dataset, 
    batch_size=best_batch_size, 
    shuffle=False, 
    num_workers=num_workers_best
)

# Create model with best hyperparameters
model_best = ResNet34(
    num_targets=5,
    hidden1=best_params['hidden1'],
    hidden2=best_params['hidden2'],
    head_hidden=best_params['head_hidden'],
    dropout_shared=best_params['dropout_shared'],
    dropout_head=best_params['dropout_head']
).to(device)

# Create optimizer with best learning rate
if best_params['optimizer'] == 'Adam':
    optimizer_best = torch.optim.Adam(model_best.parameters(), lr=best_params['lr'])
elif best_params['optimizer'] == 'AdamW':
    optimizer_best = torch.optim.AdamW(model_best.parameters(), lr=best_params['lr'])
else:
    optimizer_best = torch.optim.RMSprop(model_best.parameters(), lr=best_params['lr'])

criterion = nn.MSELoss()
scheduler_best = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer_best, patience=3, factor=0.5
)

# Train with best hyperparameters
train_losses_best, val_losses_best = train_model(
    model_best,
    train_loader_best,
    val_loader_best,
    criterion,
    optimizer_best,
    num_epochs=15,  # Full training with best hyperparameters
    scheduler=scheduler_best
)

# Store the best model for inference
model = model_best


In [None]:
# Original training without optimization (for comparison)
# Uncomment below to train with default hyperparameters instead of optimized ones

# model = ResNet34(num_targets=5).to(device)
# criterion = nn.MSELoss()
# optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, factor=0.5)
# train_losses, val_losses = train_model(
#     model,
#     train_loader,
#     val_loader,
#     criterion,
#     optimizer,
#     num_epochs=3
# )

In [None]:
plt.figure(figsize=(10, 4))

plt.plot(train_losses_best, label='Train Loss (Optimized)')
plt.plot(val_losses_best, label='Val Loss (Optimized)')

plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training History with Optimized Hyperparameters')

plt.legend()
plt.grid(True)
plt.show()

In [None]:
test = pd.read_csv(TEST_CSV)
test_dataset = DatasetCS(test, DATA_DIR, transform=val_transform, is_test=True)
# Adjust num_workers for local runs (0 for local to avoid multiprocessing issues, 2 for Kaggle)
num_workers = 0 if not IS_KAGGLE else 2
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, num_workers=num_workers)

model.eval()
predictions = []
sample_ids = []

target_mapping = {
    'Dry_Green_g': 0, 'Dry_Dead_g': 1, 'Dry_Clover_g': 2,
    'GDM_g': 3, 'Dry_Total_g': 4
}

with torch.no_grad():
    for images, batch_sample_ids in test_loader:
        images = images.to(device)
        batch_outputs = model(images)
        
        for i, sample_id in enumerate(batch_sample_ids):
            row = test[test['sample_id'] == sample_id].iloc[0]
            target_idx = target_mapping[row['target_name']]
            prediction = batch_outputs[i, target_idx].item()
            predictions.append(prediction)
            sample_ids.append(sample_id)

In [None]:
submission = pd.DataFrame({
    'sample_id': sample_ids,
    'target': predictions
})

submission.to_csv('submission.csv', index=False)
submission