# Dataset Preparation and Preprocessing (Bonn FCD-II)

This notebook implements the dataset setup and preprocessing pipeline described in the **Methodology** section. It processes the **Bonn FCD-II dataset** (N=85), applies the standard train/test split (57 training, 28 testing) recorded in the participants spreadsheet, and prepares the FLAIR images for **nnU-Net v2** automated preprocessing.

---
**Paper Reference:** *Evaluation of nnU-Net for FCD II Lesion Segmentation in FLAIR MRI*

# Imports

In [None]:
import sys
import os
# Make the sibling ``src`` directory importable. The path is resolved from the
# current working directory, so the notebook has to be started from a folder
# that sits next to ``src`` for the import below to succeed.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..", "src")))

# The star import pulls every public name of config into the notebook
# namespace; later cells rely on at least RAW_DATA_PATH, EXCEL_PATH and
# setup_env coming from here.
from config import *
# NOTE(review): setup_env() is defined in config and not visible here.
# Presumably it exports the nnU-Net environment variables (a later cell reads
# os.environ["nnUNet_raw"] without setting it) — confirm against config.py.
setup_env()


In [None]:
import numpy as np 
import pandas as pd 
import os
import shutil
import json
import glob

# !pip install nnunetv2

# Environment Setup and Directory Creation

## Clear previous folders

In [None]:
import shutil, os

# nnU-Net working folders (raw, preprocessed, results) wiped before a fresh run
paths_to_clear = [
    "../data/nnUNet_raw_data_base",
    "../data/nnUNet_preprocessed",
    "../data/nnUNet_results",
]

# Only folders that are present are deleted; missing ones are skipped silently.
for stale_dir in filter(os.path.exists, paths_to_clear):
    print(f"🗑️ Removing {stale_dir} ...")
    shutil.rmtree(stale_dir)
print("✅ All previous nnUNet folders cleared.")

In [None]:
# Root folder under which the raw nnU-Net dataset tree is created. It is
# exported through the environment so later cells can read it back.
os.environ['nnUNet_raw_data_base'] = '../data/nnUNet_raw_data_base'

# Create the dataset folder with os.makedirs rather than the IPython-only
# `!mkdir -p` shell escape: the cell then also runs as plain Python and does
# not depend on a POSIX shell expanding the environment variable.
os.makedirs(
    os.path.join(
        os.environ['nnUNet_raw_data_base'],
        'nnUNet_raw',
        'Dataset002_BonnFCD_FLAIR',
    ),
    exist_ok=True,  # same semantics as `mkdir -p`: no error if it already exists
)

print("nnUNet environment setup complete.")

In [None]:
# Folder (relative to the notebook) that holds the participants metadata file
UPLOADED_DATASET_NAME = 'participants-data'
CSV_DATASET_ROOT = f'../data/{UPLOADED_DATASET_NAME}'

print(f"Listing files in the root of your uploaded metadata dataset ({CSV_DATASET_ROOT}):")
# Report a missing folder first; otherwise show the directory entries so the
# actual metadata file name (e.g. participants.csv or participants.xlsx) is visible.
if not os.path.exists(CSV_DATASET_ROOT):
    print(f"Error: The dataset root {CSV_DATASET_ROOT} was not found. Please verify the uploaded dataset name.")
else:
    print(os.listdir(CSV_DATASET_ROOT))

# Data Conversion and dataset.json Generation

In [None]:
# --- 🎯 FINAL PATHS AND NAMES 🎯 ---
BASE_DIR = RAW_DATA_PATH  # root of the per-subject folders, loaded from config.py
UPLOADED_DATASET_NAME = 'participants-data'
EXCEL_FILE_NAME = 'participants.xlsx'
# EXCEL_PATH is used exactly as provided by config.py (star-imported earlier);
# it needs no re-assignment here.
# ---------------------------------------------

TASK_ID = 2
TASK_NAME = 'Dataset002_BonnFCD_FLAIR'
NNUNET_RAW_DATA_DIR = os.path.join(os.environ['nnUNet_raw_data_base'], 'nnUNet_raw', TASK_NAME)

# Split sizes stated in the notebook introduction (57 training / 28 testing).
# They are only used for a sanity warning below, never to alter the split.
EXPECTED_TRAIN_COUNT = 57
EXPECTED_TEST_COUNT = 28

# Target directories inside the nnU-Net raw dataset folder
IMAGES_TR_DIR = os.path.join(NNUNET_RAW_DATA_DIR, 'imagesTr')
LABELS_TR_DIR = os.path.join(NNUNET_RAW_DATA_DIR, 'labelsTr')
IMAGES_TS_DIR = os.path.join(NNUNET_RAW_DATA_DIR, 'imagesTs')
LABELS_TS_DIR = os.path.join(NNUNET_RAW_DATA_DIR, 'labelsTs')  # optional test labels

# Create them if missing
for d in [IMAGES_TR_DIR, LABELS_TR_DIR, IMAGES_TS_DIR, LABELS_TS_DIR]:
    os.makedirs(d, exist_ok=True)

# --- 1. Load and Filter the Excel Data ---
print(f"Attempting to read Excel file from: {EXCEL_PATH}")

try:
    participants_df = pd.read_excel(EXCEL_PATH, sheet_name='participants')
except ValueError:
    # Fallback used when the 'participants' sheet cannot be read by name:
    # list the available sheets and load the first one instead.
    xls = pd.ExcelFile(EXCEL_PATH)
    print(f"Available sheets: {xls.sheet_names}")
    participants_df = pd.read_excel(xls, sheet_name=xls.sheet_names[0])
except Exception as e:
    print(f"FATAL ERROR: Could not read Excel file. Please verify the EXCEL_PATH: {EXCEL_PATH}. Error: {e}")
    raise

# Ensure required columns exist
required_columns = {'participant_id', 'group', 'split'}
missing_columns = required_columns - set(participants_df.columns)
if missing_columns:
    raise ValueError(f"Missing required columns in Excel file: {missing_columns}")

# Normalise the two categorical columns once. astype(str) keeps the `.str`
# accessor from failing on non-text cells (empty cells become 'nan' and simply
# do not match), and strip() stops padded values such as ' fcd' from being
# dropped silently.
group_norm = participants_df['group'].astype(str).str.strip().str.lower()
split_norm = participants_df['split'].astype(str).str.strip().str.lower()
is_fcd = group_norm == 'fcd'

# Filter FCD subjects for training and testing
train_fcd_subjects = participants_df.loc[is_fcd & (split_norm == 'train'), 'participant_id'].tolist()
test_fcd_subjects = participants_df.loc[is_fcd & (split_norm == 'test'), 'participant_id'].tolist()

print(f"Total FCD subjects for TRAINING: {len(train_fcd_subjects)}")
print(f"Total FCD subjects for TESTING: {len(test_fcd_subjects)}")

# Warn (without aborting) when the spreadsheet does not yield the documented split.
if (len(train_fcd_subjects), len(test_fcd_subjects)) != (EXPECTED_TRAIN_COUNT, EXPECTED_TEST_COUNT):
    print(
        f"⚠️ Expected {EXPECTED_TRAIN_COUNT} training / {EXPECTED_TEST_COUNT} testing FCD subjects; "
        "please check the 'group' and 'split' columns of the spreadsheet."
    )

# Accumulators filled while the subjects are converted
training_files = []
test_files = []
skipped_subjects = []

# --- Helper Function to Process Subjects ---
def process_subject(subject_id, target_images_dir, is_training=True):
    """Copy one subject's FLAIR image (and training label) into the nnU-Net tree.

    The subject's files are looked up in ``BASE_DIR/<subject_id>/anat``. The
    FLAIR volume is copied to ``target_images_dir`` as ``<subject_id>_0000.nii``
    and, for training subjects, the ROI mask is copied to ``LABELS_TR_DIR`` as
    ``<subject_id>.nii``.

    Args:
        subject_id: Participant identifier; also the name of the subject folder.
        target_images_dir: Destination folder for the image.
        is_training: When true, exactly one ``*_FLAIR_roi.nii`` label is
            required and copied as well.

    Returns:
        ``(True, entry)`` on success, where ``entry`` is the dict recorded in
        dataset.json (``image`` and, for training, ``label``), or
        ``(False, reason)`` with a human-readable reason when the subject is
        skipped.
    """
    anat_dir = os.path.join(BASE_DIR, subject_id, 'anat')
    if not os.path.isdir(anat_dir):
        return False, "'anat' folder missing"

    # Escape glob metacharacters so that only the literal '*' below acts as a
    # wildcard, even if the data path or the id contains '[', ']', '?' or '*'.
    escaped_anat = glob.escape(anat_dir)
    escaped_id = glob.escape(subject_id)
    flair_files = glob.glob(os.path.join(escaped_anat, f'{escaped_id}*_FLAIR.nii'))
    label_files = glob.glob(os.path.join(escaped_anat, f'{escaped_id}*_FLAIR_roi.nii'))

    # Exactly one match is required; zero or several are both rejected.
    if len(flair_files) != 1:
        return False, f"Missing or ambiguous FLAIR (found {len(flair_files)})."
    if is_training and len(label_files) != 1:
        return False, f"Training subject missing label file (found {len(label_files)})."

    # "_0000" marks the single input channel (FLAIR only).
    image_dst = os.path.join(target_images_dir, f'{subject_id}_0000.nii')
    label_dst = os.path.join(LABELS_TR_DIR, f'{subject_id}.nii') if is_training else None

    try:
        shutil.copy(flair_files[0], image_dst)
        if is_training:
            shutil.copy(label_files[0], label_dst)
    except OSError as e:
        # Roll back partial output so a skipped subject never leaves an image
        # without its label (or a truncated file) in the dataset folders.
        for partial in (image_dst, label_dst):
            if partial is not None and os.path.exists(partial):
                try:
                    os.remove(partial)
                except OSError:
                    pass  # best-effort cleanup; the copy error is what gets reported
        return False, f"Copy error: {e}"

    if is_training:
        return True, {"image": f"./imagesTr/{subject_id}", "label": f"./labelsTr/{subject_id}.nii"}
    return True, {"image": f"./imagesTs/{subject_id}"}

# --- 2. Process Training and Test Subjects ---
# One row per split: banner, subject ids, image folder, training flag,
# list collecting successful entries, and the tag used in skip reasons.
split_jobs = (
    ("\n--- Processing Training Subjects ---", train_fcd_subjects, IMAGES_TR_DIR, True, training_files, "TRAIN"),
    ("\n--- Processing Test Subjects ---", test_fcd_subjects, IMAGES_TS_DIR, False, test_files, "TEST"),
)
for banner, subject_ids, images_dir, training_flag, collected, tag in split_jobs:
    print(banner)
    for subject_id in subject_ids:
        ok, outcome = process_subject(subject_id, images_dir, is_training=training_flag)
        if not ok:
            # On failure `outcome` is the reason string returned by the helper.
            skipped_subjects.append((subject_id, f"{tag} - {outcome}"))
            continue
        collected.append(outcome)

# --- 3. Generate dataset.json ---
# Key order is kept as-is because it determines the layout of the written file.
dataset_json = {
    "name": "FCD Lesion Segmentation",
    "description": "Focal Cortical Dysplasia Lesion Segmentation Dataset (Pre-defined Splits)",
    "reference": "your/publication/link/here",
    "licence": "CC-BY-4.0",
    "release": "1.0",
    "channel_names": {"0": "FLAIR"},
    "labels": {"background": 0, "lesion": 1},
    "numTraining": len(training_files),
    "file_ending": ".nii",
    "training": training_files,
    "test": test_files,
}

# Save JSON next to the image/label folders of the dataset
dataset_json_path = os.path.join(NNUNET_RAW_DATA_DIR, 'dataset.json')
with open(dataset_json_path, 'w') as fh:
    fh.write(json.dumps(dataset_json, indent=4))

print(f"\n✅ Conversion complete!")
print(f"  Training subjects: {len(training_files)}")
print(f"  Test subjects: {len(test_files)}")

# List every subject that could not be converted, with the recorded reason.
if skipped_subjects:
    print("\n--- ⚠️ Skipped Subjects Summary ---")
    print("\n".join(f"  {subj}: {reason}" for subj, reason in skipped_subjects))

In [None]:
# --- Copy test labels if available (optional evaluation) ---
LABELS_TS_DIR = os.path.join(NNUNET_RAW_DATA_DIR, 'labelsTs')
os.makedirs(LABELS_TS_DIR, exist_ok=True)

for subject_id in test_fcd_subjects:
    anat_folder = os.path.join(BASE_DIR, subject_id, 'anat')
    # Escape glob metacharacters in the path and id so that only the literal
    # '*' acts as a wildcard (same lookup rule as for the training labels).
    label_search_pattern = os.path.join(
        glob.escape(anat_folder), f'{glob.escape(subject_id)}*_FLAIR_roi.nii'
    )
    # glob returns matches in arbitrary order; sorting makes the choice of the
    # "first" file reproducible between runs and machines.
    label_files = sorted(glob.glob(label_search_pattern))

    if not label_files:
        # Test labels are optional, so a missing mask is reported, not fatal.
        print(f"⚠️ No label found for test subject {subject_id}; skipping")
        continue
    if len(label_files) > 1:
        print(
            f"⚠️ {len(label_files)} candidate labels for test subject {subject_id}; "
            f"using {os.path.basename(label_files[0])}"
        )

    shutil.copy(label_files[0], os.path.join(LABELS_TS_DIR, f'{subject_id}.nii'))
    print(f"✅ Copied label for test subject {subject_id}")


# Data Preprocessing

## 1. Set nnU-Net environment paths

In [None]:
import os, subprocess

# Set environment variables for nnU-Net
os.environ["nnUNet_preprocessed"] = "../data/nnUNet_preprocessed"
os.environ["nnUNet_results"] = "../data/nnUNet_results"

# nnU-Net v2 locates raw datasets through `nnUNet_raw`. A value exported
# earlier is kept (setdefault never overwrites); otherwise it is derived from
# the raw-data base under which the dataset tree was created. This way the
# print below cannot fail with KeyError and the nnU-Net commands started from
# this notebook inherit a usable path.
os.environ.setdefault(
    "nnUNet_raw",
    os.path.join(
        os.environ.get("nnUNet_raw_data_base", "../data/nnUNet_raw_data_base"),
        "nnUNet_raw",
    ),
)

print("✅ nnU-Net environment variables set")
print("RAW:", os.environ["nnUNet_raw"])

## 2. Verify dataset integrity

In [None]:
print("\n🔍 Verifying dataset structure for Dataset002_BonnFCD_FLAIR...")
# Command line for the integrity check; "-d 2" selects Dataset002_.
verify_command = "nnUNetv2_plan_and_preprocess -d 2 --verify_dataset_integrity".split()

# check=False: a non-zero exit status of the tool does not raise here.
subprocess.run(verify_command, check=False)

## 3. Run preprocessing

In [None]:
print("\n⚙️ Starting preprocessing (this may take several minutes)...")

# "-d 2" selects Dataset002_.
preprocess_command = [
    "nnUNetv2_plan_and_preprocess",
    "-d", "2"
]

# check=False keeps the notebook running when the tool fails, so the exit
# status is inspected here instead of announcing success unconditionally.
preprocess_result = subprocess.run(preprocess_command, check=False)

if preprocess_result.returncode == 0:
    print("\n✅ Preprocessing complete!")
    print("Preprocessed data stored in: ../data/nnUNet_preprocessed/Dataset002_BonnFCD_FLAIR/")
else:
    print(
        f"\n❌ Preprocessing failed (exit code {preprocess_result.returncode}). "
        "See the command output above."
    )


In [None]:
# 4. (Optional) Verify output paths
import os

# Folder where the preprocessing step is expected to leave its results
pre_dir = "../data/nnUNet_preprocessed/Dataset002_BonnFCD_FLAIR"
# Report the missing folder first, otherwise list what was produced.
if not os.path.exists(pre_dir):
    print("❌ Preprocessed folder not found!")
else:
    print("✅ Folder exists! Contents:")
    print(os.listdir(pre_dir))
