In [1]:
# ======================================
# Part 1: Data Preparation and Dataset Class
# ======================================

# ======================================
# Import Necessary Libraries
# ======================================
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import h5py
import matplotlib.pyplot as plt
import random
import os
from torchvision import models

# Install necessary packages if not already installed
try:
    from fvcore.nn import FlopCountAnalysis, parameter_count
except ImportError:
    !pip install fvcore -q
    from fvcore.nn import FlopCountAnalysis, parameter_count

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ======================================
# Reproducibility
# ======================================
def set_seed(seed=42):
    """Seed every random number generator this script relies on.

    Seeds, in order, PyTorch (CPU), NumPy's global generator and Python's
    ``random`` module; when CUDA is available, all CUDA devices are seeded too.

    Args:
        seed (int): Value handed to each generator.
    """
    for seed_fn in (torch.manual_seed, np.random.seed, random.seed):
        seed_fn(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


set_seed(42)

# ======================================
# Device Configuration
# ======================================
# Use the GPU when PyTorch can see one; otherwise run on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Using device: {device}')

# ======================================
# Data Preparation
# ======================================
# Location of the HDF5 file; the same path is later handed to the dataset class.
data_path = '/kaggle/input/tcir-cpac-io-sh-h5-file/TCIR-CPAC_IO_SH.h5'

# Read the per-sample metadata table stored under the "info" key.
try:
    data_info = pd.read_hdf(data_path, key="info", mode='r')
except Exception as e:
    print(f"Error loading HDF5 file: {e}")
    raise

# Keep only rows whose 'data_set' column is 'SH'.
_is_sh = data_info['data_set'].isin(['SH'])
data_info_filtered = data_info[_is_sh]

# Undersampling: rows at or below the 35th percentile of Vmax are thinned to
# 30% (drawn without replacement); all rows above the threshold are kept.
_vmax = data_info_filtered['Vmax']
low_vmax_threshold = np.percentile(_vmax.values, 35)
low_vmax_indices = data_info_filtered[_vmax <= low_vmax_threshold].index
remaining_indices = data_info_filtered[_vmax > low_vmax_threshold].index
undersample_ratio = 0.3
undersample_size = int(len(low_vmax_indices) * undersample_ratio)

if undersample_size <= 0:
    # Nothing to draw from the low-Vmax group: keep only the remaining rows.
    balanced_indices = remaining_indices
else:
    undersample_indices = np.random.choice(
        low_vmax_indices, undersample_size, replace=False)
    balanced_indices = np.concatenate(
        (undersample_indices, remaining_indices))

# reset_index() (without drop) moves the original row labels into an 'index'
# column, which the dataset class reads to locate each sample in the file.
data_info_balanced = data_info_filtered.loc[balanced_indices]
data_info_balanced = data_info_balanced.reset_index()

# Define Transformations
# All four pipelines convert to a PIL image, centre-crop to 152x152, resize to
# the network input size and convert back to a tensor. The training pipelines
# additionally apply a random rotation (0-360 degrees, zero fill) before the
# crop. The "common" pipelines resize to 224x224, the InceptionV3 ones to
# 299x299.
def _build_transform(output_size, augment):
    """Assemble one pipeline; ``augment`` toggles the random rotation step."""
    steps = [transforms.ToPILImage()]
    if augment:
        steps.append(transforms.RandomRotation(degrees=(0, 360), fill=0))
    steps.extend([
        transforms.CenterCrop(size=(152, 152)),
        transforms.Resize((output_size, output_size)),
        transforms.ToTensor(),
    ])
    return transforms.Compose(steps)


common_train_transform = _build_transform(224, augment=True)
common_val_test_transform = _build_transform(224, augment=False)
inception_train_transform = _build_transform(299, augment=True)
inception_val_test_transform = _build_transform(299, augment=False)

# Dataset Class
class TCIRLazyDataset(Dataset):
    """Dataset that reads one sample at a time from an HDF5 file.

    Each item is an ``(image, label)`` pair: ``image`` holds the selected
    channels of one row of the file's ``'matrix'`` dataset, clipped from above
    and scaled by a per-channel constant; ``label`` is that row's ``'Vmax'``.

    The HDF5 handle is opened lazily, on first access in each process, and is
    never pickled. An h5py handle opened in the parent process must not be
    shared with ``DataLoader`` worker processes, so every worker opens its own.

    Args:
        hdf5_file: Path of the HDF5 file containing the ``'matrix'`` dataset.
        data_info: DataFrame addressed by label through ``.at[idx, ...]`` with
            ``idx`` in ``range(len(data_info))``; it must provide the columns
            ``'index'`` (row of ``'matrix'`` to read) and ``'Vmax'`` (label).
        channels: Indices along the last axis of ``'matrix'`` to load.
            NOTE(review): h5py list selections are expected to be in
            increasing order - confirm before passing a custom order.
        transform: Optional callable applied to the CxHxW float32 tensor.

    Raises:
        Exception: whatever ``h5py.File`` raises if the file cannot be opened
            (checked once at construction time).
    """

    # Class-level default so instances created without __init__ (for example
    # by unpickling) still start with no open handle.
    _hf = None

    def __init__(self, hdf5_file, data_info, channels=(0, 1, 3),
                 transform=None):
        self.hdf5_file = hdf5_file
        self.data_info = data_info
        # Copy into a fresh list: the default is no longer a shared mutable
        # object, and h5py receives the list form it was given before.
        self.channels = list(channels)
        self.transform = transform
        # Per-channel upper bound used both for clipping and for scaling;
        # channels missing from this map are scaled by 1.0.
        self.channel_norm_values = {0: 350, 1: 275, 3: 4.35}
        self._hf = None
        # Fail fast on an unreadable file, but do not keep the handle: it is
        # reopened lazily in whichever process actually reads samples.
        try:
            with h5py.File(self.hdf5_file, 'r'):
                pass
        except Exception as e:
            print(f"Error opening HDF5 file: {e}")
            raise

    @property
    def hf(self):
        """Open HDF5 file handle, created on first use in the current process."""
        if self._hf is None:
            self._hf = h5py.File(self.hdf5_file, 'r')
        return self._hf

    def __getstate__(self):
        # Drop the handle when the dataset is pickled (e.g. spawn-based
        # workers); the receiving process reopens the file on demand.
        state = self.__dict__.copy()
        state['_hf'] = None
        return state

    def __len__(self):
        return len(self.data_info)

    def __getitem__(self, idx):
        try:
            hdf5_index = self.data_info.at[idx, 'index']
            data_matrix = self.hf['matrix'][hdf5_index, :, :, self.channels]
        except Exception as e:
            print(f"Error accessing data at index {idx}: {e}")
            raise

        # Normalize image data: clip each channel from above at its constant,
        # divide by that constant and replace NaN/inf via np.nan_to_num.
        image = np.empty_like(data_matrix, dtype=np.float32)
        for i, ch in enumerate(self.channels):
            norm_value = self.channel_norm_values.get(ch, 1.0)
            channel_data = data_matrix[:, :, i]
            channel_data = np.clip(channel_data, None, norm_value)
            image[:, :, i] = np.nan_to_num(channel_data / norm_value)

        # Convert HxWxC to a CxHxW tensor and apply transformations
        image = torch.tensor(image).permute(2, 0, 1)
        if self.transform:
            image = self.transform(image)

        # Retrieve the label (Vmax)
        label = torch.tensor(
            self.data_info.at[idx, 'Vmax'], dtype=torch.float32)
        return image, label

    def __del__(self):
        handle = self._hf
        if handle is not None:
            # Best-effort cleanup: never raise from a finalizer.
            try:
                handle.close()
            except Exception:
                pass
            self._hf = None

# Split Dataset
# Shuffle the row positions once, then cut them 70% / 15% / remainder into
# train, validation and test partitions.
full_dataset_size = len(data_info_balanced)
indices = list(range(full_dataset_size))
np.random.shuffle(indices)
train_size = int(0.7 * full_dataset_size)
val_size = int(0.15 * full_dataset_size)
test_size = full_dataset_size - train_size - val_size
_val_end = train_size + val_size
train_indices, val_indices, test_indices = (
    indices[:train_size],
    indices[train_size:_val_end],
    indices[_val_end:],
)

# Each partition gets a fresh 0..N-1 index so it can be addressed by position.
train_data_info, val_data_info, test_data_info = (
    data_info_balanced.iloc[_rows].reset_index(drop=True)
    for _rows in (train_indices, val_indices, test_indices)
)

print(f'Total Samples - Train: {len(train_data_info)}, '
      f'Validation: {len(val_data_info)}, Test: {len(test_data_info)}')

# ======================================
# Part 2: Model Definitions
# ======================================

# ======================================
# Function to Create Models Dynamically
# ======================================
def _replace_stem_conv(old_conv, num_channels):
    """Build a new first conv taking ``num_channels`` inputs.

    Kernel size, stride, padding, output width and the presence of a bias are
    copied from ``old_conv``; the weights are freshly initialised (the
    pretrained 3-channel filters are not reused).
    """
    return nn.Conv2d(
        num_channels,
        old_conv.out_channels,
        kernel_size=old_conv.kernel_size,
        stride=old_conv.stride,
        padding=old_conv.padding,
        bias=old_conv.bias is not None)


def create_model(architecture, num_channels=3):
    """
    Creates a single-output regression model for the given architecture.

    The torchvision model is loaded with its default pretrained weights, its
    final linear layer is replaced by ``nn.Linear(in_features, 1)`` and, when
    ``num_channels != 3``, its first convolution is replaced by a freshly
    initialised one accepting ``num_channels`` inputs.

    Args:
        architecture (str): One of 'efficientnet_v2_s', 'efficientnet_v2_m',
            'inception_v3', 'convnext_tiny', 'convnext_small'.
        num_channels (int): Number of input channels.

    Returns:
        model (nn.Module): The constructed model.
        train_transform (Compose): Transformation for training data.
        val_test_transform (Compose): Transformation for validation/test data.
        input_size (tuple): Input size expected by the model, as
            (num_channels, height, width).

    Raises:
        ValueError: If ``architecture`` is not one of the supported names.
    """
    # Mapping from architecture name to its pretrained-weights enum. The
    # ConvNeXt entries make the ConvNeXt handling below reachable.
    weights_dict = {
        'efficientnet_v2_s': models.EfficientNet_V2_S_Weights.DEFAULT,
        'efficientnet_v2_m': models.EfficientNet_V2_M_Weights.DEFAULT,
        'inception_v3': models.Inception_V3_Weights.DEFAULT,
        'convnext_tiny': models.ConvNeXt_Tiny_Weights.DEFAULT,
        'convnext_small': models.ConvNeXt_Small_Weights.DEFAULT,
    }

    if architecture not in weights_dict:
        raise ValueError(f"Architecture {architecture} not supported.")

    weights = weights_dict[architecture]

    if architecture == 'inception_v3':
        model = models.inception_v3(weights=weights)
        if num_channels != 3:
            model.Conv2d_1a_3x3.conv = _replace_stem_conv(
                model.Conv2d_1a_3x3.conv, num_channels)
            # Loading pretrained weights turns on transform_input, which
            # re-normalises exactly channels 0..2 and re-stacks them into a
            # 3-channel tensor; that cannot work with a different channel
            # count, so switch it off.
            model.transform_input = False
        model.fc = nn.Linear(model.fc.in_features, 1)
        # Disable the auxiliary classifier. Clearing the flag makes forward()
        # return a plain tensor; dropping the module also stops it from being
        # evaluated (and discarded) on every training step and removes its
        # unused parameters from the model.
        model.aux_logits = False
        model.AuxLogits = None
        # InceptionV3 uses its own 299x299 transforms.
        return (model, inception_train_transform,
                inception_val_test_transform, (num_channels, 299, 299))

    # EfficientNetV2 and ConvNeXt share the same layout: the stem conv is
    # features[0][0]; only the position of the final Linear differs.
    model = getattr(models, architecture)(weights=weights)
    if num_channels != 3:
        model.features[0][0] = _replace_stem_conv(
            model.features[0][0], num_channels)
    head_index = 1 if architecture.startswith('efficientnet') else 2
    in_features = model.classifier[head_index].in_features
    model.classifier[head_index] = nn.Linear(in_features, 1)
    return (model, common_train_transform,
            common_val_test_transform, (num_channels, 224, 224))

# ======================================
# Part 3: Training, Evaluation, and Results Compilation
# ======================================

# ======================================
# Training and Evaluation Functions
# ======================================
def _run_epoch(model, loader, criterion, optimizer=None):
    """Run one pass over ``loader`` and return the mean per-sample loss.

    With an ``optimizer`` the model is put in train mode and updated after
    every batch; without one it is put in eval mode and gradients are off.
    """
    training = optimizer is not None
    model.train(training)
    running_loss = 0.0
    with torch.set_grad_enabled(training):
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            if training:
                optimizer.zero_grad()
            outputs = model(images).squeeze(1)
            loss = criterion(outputs, labels)
            if training:
                loss.backward()
                optimizer.step()
            # criterion averages over the batch; weight by batch size so the
            # epoch value is a true per-sample mean even with a short last batch.
            running_loss += loss.item() * images.size(0)
    return running_loss / len(loader.dataset)


def train_and_evaluate(model, train_loader, val_loader,
                       num_epochs=100, patience=15,
                       checkpoint_path='best_model.pth'):
    """
    Trains the model and evaluates it on the validation set.

    Uses SmoothL1 loss, AdamW (lr=1e-4, weight_decay=1e-5) and a
    ReduceLROnPlateau scheduler driven by the validation loss. The weights
    with the lowest validation loss are written to ``checkpoint_path`` and
    reloaded before returning. Training stops early after ``patience``
    consecutive epochs without improvement.

    Args:
        model (nn.Module): The model to train.
        train_loader (DataLoader): DataLoader for training data.
        val_loader (DataLoader): DataLoader for validation data.
        num_epochs (int): Maximum number of epochs.
        patience (int): Early stopping patience.
        checkpoint_path (str): Path to save the best model.

    Returns:
        train_losses (list): List of training losses per epoch.
        val_losses (list): List of validation losses per epoch.
        model (nn.Module): The model with the best weights loaded. If no
            epoch ever improved the validation loss (for example because it
            was NaN), the model is returned with its last weights.
    """
    model.to(device)
    criterion = nn.SmoothL1Loss()
    optimizer = torch.optim.AdamW(
        model.parameters(), lr=1e-4, weight_decay=1e-5)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.1, patience=5)

    best_val_loss = float('inf')
    early_stop_counter = 0
    checkpoint_saved = False
    train_losses, val_losses = [], []

    for epoch in range(num_epochs):
        train_loss = _run_epoch(model, train_loader, criterion, optimizer)
        train_losses.append(train_loss)

        val_loss = _run_epoch(model, val_loader, criterion)
        val_losses.append(val_loss)

        scheduler.step(val_loss)
        print(f"Epoch [{epoch + 1}/{num_epochs}], "
              f"Train Loss: {train_loss:.3f}, "
              f"Val Loss: {val_loss:.3f}")

        # Early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            early_stop_counter = 0
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_saved = True
        else:
            early_stop_counter += 1
            if early_stop_counter >= patience:
                print("Early stopping triggered")
                break

    # Load the best model. Only trust a checkpoint written by this call: a
    # file left at the same path by an earlier run must not be picked up.
    if checkpoint_saved:
        model.load_state_dict(torch.load(
            checkpoint_path, map_location=device, weights_only=True))
    else:
        print("Warning: validation loss never improved; "
              "no checkpoint was saved, returning the last weights")
    return train_losses, val_losses, model

def evaluate_model(model, test_loader):
    """
    Runs the model over the test loader and scores its predictions.

    The model is moved to the module-level ``device`` and put in eval mode;
    predictions are gathered without gradient tracking.

    Args:
        model (nn.Module): The trained model.
        test_loader (DataLoader): DataLoader yielding (images, labels) batches.

    Returns:
        mae (float): Mean Absolute Error.
        rmse (float): Root Mean Squared Error.
        r2 (float): R² Score.
        predictions (np.array): Model predictions, in loader order.
        actuals (np.array): Actual labels, in loader order.
    """
    model.to(device)
    model.eval()

    predicted_batches, actual_batches = [], []
    with torch.no_grad():
        for images, labels in test_loader:
            batch_output = model(images.to(device)).squeeze(1)
            predicted_batches.append(batch_output.cpu().numpy())
            actual_batches.append(labels.numpy())

    # Stitch the per-batch arrays together; an empty loader yields empty arrays.
    predictions = (np.concatenate(predicted_batches)
                   if predicted_batches else np.array([]))
    actuals = (np.concatenate(actual_batches)
               if actual_batches else np.array([]))

    mae = mean_absolute_error(actuals, predictions)
    mse = mean_squared_error(actuals, predictions)
    rmse = np.sqrt(mse)
    r2 = r2_score(actuals, predictions)
    return mae, rmse, r2, predictions, actuals

def get_model_complexity(model, input_size):
    """
    Reports the model's parameter count and its FLOP count for one input.

    The model is moved to the CPU and switched to eval mode (it is left that
    way on return). FLOPs are taken from fvcore's ``FlopCountAnalysis`` on a
    random batch of size 1; if that analysis fails, the error is printed and
    ``None`` is returned for the FLOPs.

    Args:
        model (nn.Module): The model to analyze.
        input_size (tuple): Size of one input sample, without the batch axis.

    Returns:
        total_params (int): Total number of parameters.
        total_flops (int): Total FLOPs reported by fvcore, or None on failure.
    """
    model.to('cpu').eval()
    dummy_input = torch.randn(1, *input_size)

    total_flops = None
    try:
        total_flops = FlopCountAnalysis(model, dummy_input).total()
    except Exception as e:
        print(f"Error calculating FLOPs: {e}")

    # parameter_count maps module names to counts; '' is the whole model.
    total_params = parameter_count(model)['']
    return total_params, total_flops

# ======================================
# Model Architectures to Train
# ======================================
architectures = [
    'efficientnet_v2_s',
    'efficientnet_v2_m',
    'inception_v3',
    # 'convnext_tiny',
    # 'convnext_small'
]

# One summary row per successfully trained architecture.
results = []

# Train, test and profile each architecture in turn.
for architecture in architectures:
    model_name = architecture.upper()
    print(f"\nTraining {model_name}...\n")

    # Build the network together with the transforms matching its input size;
    # an unsupported name is reported and skipped.
    try:
        model, train_transform, val_test_transform, input_size = create_model(
            architecture, num_channels=3)
    except ValueError as ve:
        print(f"Error creating model {model_name}: {ve}")
        continue

    # Datasets: only the training split uses the augmenting transform.
    train_dataset, val_dataset, test_dataset = (
        TCIRLazyDataset(data_path, _frame, transform=_transform)
        for _frame, _transform in (
            (train_data_info, train_transform),
            (val_data_info, val_test_transform),
            (test_data_info, val_test_transform),
        )
    )

    # DataLoaders: identical settings except that only training is shuffled.
    loader_options = dict(batch_size=32, num_workers=3, pin_memory=True)
    train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
    val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)
    test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)

    # Each architecture keeps its own best-weights file.
    checkpoint_path = f"best_model_{architecture}.pth"

    # Train the model
    train_losses, val_losses, best_model = train_and_evaluate(
        model, train_loader, val_loader,
        num_epochs=100, patience=15,
        checkpoint_path=checkpoint_path)

    # Evaluate on test set
    print(f"\nEvaluating {model_name} on test set...\n")
    mae, rmse, r2, predictions, actuals = evaluate_model(
        best_model, test_loader)
    print(f"{model_name} - Test MAE: {mae:.3f}, "
          f"RMSE: {rmse:.3f}, R2 Score: {r2:.3f}")

    # Compute FLOPs and Params
    print(f"\nCalculating FLOPs and Parameters for {model_name}...\n")
    total_params, total_flops = get_model_complexity(best_model, input_size)
    flops_display = "N/A" if total_flops is None else f"{total_flops:,}"
    print(f"{model_name} - Total Params: "
          f"{total_params:,}, Total FLOPs: {flops_display}")

    # Save the per-epoch loss curves for this architecture.
    history_csv_path = f"training_history_{architecture}.csv"
    history_df = pd.DataFrame({
        'Epoch': range(1, len(train_losses) + 1),
        'Train Loss': train_losses,
        'Val Loss': val_losses
    })
    history_df.to_csv(history_csv_path, index=False)
    print(f"\nTraining history saved to '{history_csv_path}'")

    # Store results
    results.append({
        'Model': model_name,
        'MAE': mae,
        'RMSE': rmse,
        'R2 Score': r2,
        'Total Params': total_params,
        'Total FLOPs': total_flops
    })

# ======================================
# Compile Results into a Table
# ======================================
results_df = pd.DataFrame(results)
print("\nFinal Results:\n")
print(results_df)

# Save results to a CSV file
results_csv_path = 'model_evaluation_results_experiment2.csv'
results_df.to_csv(results_csv_path, index=False)
print(f"\nModel evaluation results saved to '{results_csv_path}'")


Using device: cuda
Total Samples - Train: 9532, Validation: 2042, Test: 2044

Training EFFICIENTNET_V2_S...



Downloading: "https://download.pytorch.org/models/efficientnet_v2_s-dd5fe13b.pth" to /root/.cache/torch/hub/checkpoints/efficientnet_v2_s-dd5fe13b.pth
100%|██████████| 82.7M/82.7M [00:00<00:00, 170MB/s]


Epoch [1/100], Train Loss: 43.172, Val Loss: 34.715
Epoch [2/100], Train Loss: 27.560, Val Loss: 20.620
Epoch [3/100], Train Loss: 16.547, Val Loss: 11.806
Epoch [4/100], Train Loss: 11.525, Val Loss: 10.607
Epoch [5/100], Train Loss: 10.497, Val Loss: 9.371
Epoch [6/100], Train Loss: 9.896, Val Loss: 9.312
Epoch [7/100], Train Loss: 9.497, Val Loss: 8.727
Epoch [8/100], Train Loss: 9.219, Val Loss: 8.781
Epoch [9/100], Train Loss: 9.099, Val Loss: 9.091
Epoch [10/100], Train Loss: 8.861, Val Loss: 8.551
Epoch [11/100], Train Loss: 8.530, Val Loss: 8.325
Epoch [12/100], Train Loss: 8.407, Val Loss: 8.429
Epoch [13/100], Train Loss: 8.189, Val Loss: 8.349
Epoch [14/100], Train Loss: 8.037, Val Loss: 7.944
Epoch [15/100], Train Loss: 7.804, Val Loss: 8.394
Epoch [16/100], Train Loss: 7.354, Val Loss: 7.992
Epoch [17/100], Train Loss: 7.293, Val Loss: 7.822
Epoch [18/100], Train Loss: 7.040, Val Loss: 7.907
Epoch [19/100], Train Loss: 6.672, Val Loss: 8.381
Epoch [20/100], Train Loss: 6.4

Downloading: "https://download.pytorch.org/models/efficientnet_v2_m-dc08266a.pth" to /root/.cache/torch/hub/checkpoints/efficientnet_v2_m-dc08266a.pth
100%|██████████| 208M/208M [00:03<00:00, 66.0MB/s]


Epoch [1/100], Train Loss: 33.661, Val Loss: 22.673
Epoch [2/100], Train Loss: 15.867, Val Loss: 11.253
Epoch [3/100], Train Loss: 11.135, Val Loss: 9.386
Epoch [4/100], Train Loss: 10.283, Val Loss: 9.302
Epoch [5/100], Train Loss: 9.695, Val Loss: 8.970
Epoch [6/100], Train Loss: 9.286, Val Loss: 9.126
Epoch [7/100], Train Loss: 9.064, Val Loss: 8.525
Epoch [8/100], Train Loss: 8.700, Val Loss: 8.695
Epoch [9/100], Train Loss: 8.599, Val Loss: 8.552
Epoch [10/100], Train Loss: 8.275, Val Loss: 8.456
Epoch [11/100], Train Loss: 8.173, Val Loss: 8.248
Epoch [12/100], Train Loss: 7.814, Val Loss: 7.878
Epoch [13/100], Train Loss: 7.526, Val Loss: 7.990
Epoch [14/100], Train Loss: 7.141, Val Loss: 8.134
Epoch [15/100], Train Loss: 7.077, Val Loss: 8.350
Epoch [16/100], Train Loss: 6.750, Val Loss: 7.936
Epoch [17/100], Train Loss: 6.607, Val Loss: 7.596
Epoch [18/100], Train Loss: 6.361, Val Loss: 7.497
Epoch [19/100], Train Loss: 6.096, Val Loss: 7.648
Epoch [20/100], Train Loss: 5.988,

Downloading: "https://download.pytorch.org/models/inception_v3_google-0cc3c7bd.pth" to /root/.cache/torch/hub/checkpoints/inception_v3_google-0cc3c7bd.pth
100%|██████████| 104M/104M [00:00<00:00, 176MB/s]


Epoch [1/100], Train Loss: 42.315, Val Loss: 29.541
Epoch [2/100], Train Loss: 21.147, Val Loss: 13.092
Epoch [3/100], Train Loss: 11.753, Val Loss: 10.253
Epoch [4/100], Train Loss: 10.349, Val Loss: 9.735
Epoch [5/100], Train Loss: 9.744, Val Loss: 8.974
Epoch [6/100], Train Loss: 9.400, Val Loss: 9.239
Epoch [7/100], Train Loss: 9.365, Val Loss: 8.709
Epoch [8/100], Train Loss: 8.994, Val Loss: 8.439
Epoch [9/100], Train Loss: 8.854, Val Loss: 8.572
Epoch [10/100], Train Loss: 8.610, Val Loss: 8.870
Epoch [11/100], Train Loss: 8.525, Val Loss: 8.690
Epoch [12/100], Train Loss: 8.215, Val Loss: 8.452
Epoch [13/100], Train Loss: 7.987, Val Loss: 8.006
Epoch [14/100], Train Loss: 7.832, Val Loss: 8.118
Epoch [15/100], Train Loss: 7.620, Val Loss: 8.482
Epoch [16/100], Train Loss: 7.471, Val Loss: 8.087
Epoch [17/100], Train Loss: 7.265, Val Loss: 8.136
Epoch [18/100], Train Loss: 7.203, Val Loss: 8.020
Epoch [19/100], Train Loss: 7.002, Val Loss: 8.332
Epoch [20/100], Train Loss: 6.457