In [1]:
import os
import pydicom
import numpy as np
import nibabel as nib
import pandas as pd
import torch
from monai.data import DataLoader, Dataset
from monai.transforms import Compose, LoadImaged, EnsureChannelFirstD, Resized, ToTensord
from monai.transforms import Compose, LoadImaged, EnsureChannelFirstD, Spacingd, Orientationd, ScaleIntensityRanged, CropForegroundd, Resized, ToTensord
from monai.utils import set_determinism

set_determinism(seed=0)

def convert_dicom_to_nifti(source_folder, target_folder, target_filename='output.nii.gz'):
    """Converts a series of DICOM images within a folder into a single compressed NIfTI file."""
    
    # Check if the NIfTI file already exists
    nifti_path = os.path.join(target_folder, target_filename)
    if os.path.exists(nifti_path):
        print(f"NIfTI file already exists at {nifti_path}. Skipping conversion.")
        return nifti_path
    
    dicom_files = [os.path.join(source_folder, f) for f in os.listdir(source_folder) if f.endswith('.dcm')]
    if not dicom_files:
        print(f"No DICOM files found in {source_folder}.")
        return None

    try:
        dicom_images = [pydicom.dcmread(file_path) for file_path in dicom_files]
        dicom_images.sort(key=lambda x: int(x.InstanceNumber))
        image_data = np.stack([img.pixel_array for img in dicom_images])
    except Exception as e:
        print(f"Failed to read DICOM files: {e}")
        return None

    try:
        affine = np.diag([1, 1, 1, 1])  # Placeholder affine matrix
        nifti_image = nib.Nifti1Image(image_data, affine=affine)
        os.makedirs(target_folder, exist_ok=True)
        nib.save(nifti_image, nifti_path)
        print(f"NIfTI file saved at {nifti_path}")
        return nifti_path
    except Exception as e:
        print(f"Failed to convert to NIfTI: {e}")
        return None


def load_images(root_path):
    """Navigate through folder levels to find and convert DICOM files to NIfTI."""
    for root, _, _ in os.walk(root_path):
        dicom_files = [f for f in os.listdir(root) if f.endswith('.dcm')]
        if len(dicom_files) > 1:
            nifti_folder = os.path.join(root_path, 'nifti')
            nifti_file = os.path.join(nifti_folder, 'output.nii.gz')
            
            # Check if the NIfTI file already exists
            if os.path.exists(nifti_file):
                print(f"NIfTI file already exists at {nifti_file}. Skipping conversion.")
                return nifti_file
            
            os.makedirs(nifti_folder, exist_ok=True)
            return convert_dicom_to_nifti(root, nifti_folder)
    return None


class CustomDataset(Dataset):
    def __init__(self, root_dir, excel_path, transform=None):
        self.root_dir = root_dir
        self.data_frame = pd.read_excel(excel_path)
        self.patient_folders = [os.path.join(root_dir, name) for name in os.listdir(root_dir) if os.path.isdir(os.path.join(root_dir, name))]
        self.transform = transform

    def __getitem__(self, idx):
        folder_name = self.patient_folders[idx]
        patient_id = os.path.basename(folder_name)
        row = self.data_frame[self.data_frame['Patient-ID'] == patient_id]
        image_path = load_images(folder_name)

        if image_path is None:
            return None  # Handle according to your application's needs

        # Prepare a dictionary for the transform
        image_dict = {"vol": image_path}

        # Apply transforms
        if self.transform:
            image_dict = self.transform(image_dict)  # Pass the dictionary containing the file path

        # The transformed dictionary should now have "vol" as a tensor
        image = image_dict["vol"]  # This is now the tensor directly

        # Return the image and other data
        survival_time = row['overall_survival_months'].values[0]
        vital_status = row['vital_status'].values[0]
        
        return image, survival_time, vital_status

    def __len__(self):
        return len(self.patient_folders)

# Define transformations
train_transforms = Compose([
    LoadImaged(keys=["vol"]),  # Use the correct key in your dataset
    EnsureChannelFirstD(keys=["vol"]),
    Spacingd(keys=["vol"], pixdim=(1.0, 1.0, 1.0), mode='trilinear'),
    Orientationd(keys=["vol"], axcodes='RAS'),
    ScaleIntensityRanged(keys=["vol"], a_min=-1000, a_max=400, b_min=0.0, b_max=1.0, clip=True),
    Resized(keys=["vol"], spatial_size=(128, 128, 64)),
    ToTensord(keys=["vol"])
])


# Instantiate the dataset
dataset = CustomDataset(root_dir='/home/nikhil/Downloads/tcia_download/NBIA-Download', 
                        excel_path='/home/nikhil/Downloads/tcia_download/Dataset/crlm.xlsx', 
                        transform=train_transforms)

# Load data and print
for i, data in enumerate(dataset):
    if data is not None:
        print(data)  # Load each sample
    # if i == 5:
    #     break




  from .autonotebook import tqdm as notebook_tqdm
Unknown extension is not supported and will be removed


NIfTI file already exists at /home/nikhil/Downloads/tcia_download/NBIA-Download/CRLM-CT-1143/nifti/output.nii.gz. Skipping conversion.
(metatensor([[[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],

         [[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],

         [[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],

         ...,

         [[0., 0., 0.,  ..., 0., 0., 0

In [2]:
print("Total samples in dataset:", len(dataset))

# Create data loader
data_loader = DataLoader(dataset, batch_size=3)
dataiter = iter(data_loader)
images = next(dataiter)
img, labels, event = images
print("img.shape", img.shape)

for i, img in enumerate(images):
    print("single_img.shape", img.shape)

Total samples in dataset: 197
NIfTI file already exists at /home/nikhil/Downloads/tcia_download/NBIA-Download/CRLM-CT-1143/nifti/output.nii.gz. Skipping conversion.
NIfTI file already exists at /home/nikhil/Downloads/tcia_download/NBIA-Download/CRLM-CT-1083/nifti/output.nii.gz. Skipping conversion.
NIfTI file already exists at /home/nikhil/Downloads/tcia_download/NBIA-Download/CRLM-CT-1010/nifti/output.nii.gz. Skipping conversion.
img.shape torch.Size([3, 1, 128, 128, 64])
single_img.shape torch.Size([3, 1, 128, 128, 64])
single_img.shape torch.Size([3])
single_img.shape torch.Size([3])


In [3]:
# Load the Excel file
excel_path = '/home/nikhil/Downloads/tcia_download/Dataset/crlm.xlsx'
data_frame = pd.read_excel(excel_path)

patient_ids = data_frame['Patient-ID'].tolist()
vital_statuses = data_frame['vital_status'].tolist()  # Ensure 'vital_status' is in your DataFrame



Unknown extension is not supported and will be removed


In [4]:
from sklearn.model_selection import train_test_split
# Split the dataset into training and testing sets (e.g., 80% training, 20% testing)
# Split the dataset into training and testing sets (e.g., 80% training, 20% testing)
train_ids, test_ids = train_test_split(
    patient_ids, 
    test_size=0.25, 
    stratify=vital_statuses,  # Stratify based on vital_status
    random_state=42
)
print(len(train_ids))
print(len(test_ids))

147
50


In [5]:
# Create datasets for training and testing
train_dataset = CustomDataset(root_dir='/home/nikhil/Downloads/tcia_download/NBIA-Download', 
                               excel_path=excel_path, 
                               transform=train_transforms)
test_dataset = CustomDataset(root_dir='/home/nikhil/Downloads/tcia_download/NBIA-Download', 
                              excel_path=excel_path, 
                              transform=train_transforms)

Unknown extension is not supported and will be removed
Unknown extension is not supported and will be removed


In [6]:
# Filter datasets based on train_ids and test_ids
train_dataset.patient_folders = [os.path.join(train_dataset.root_dir, pid) for pid in train_ids if os.path.isdir(os.path.join(train_dataset.root_dir, pid))]
test_dataset.patient_folders = [os.path.join(test_dataset.root_dir, pid) for pid in test_ids if os.path.isdir(os.path.join(test_dataset.root_dir, pid))]


In [7]:
test_dataset.patient_folders

['/home/nikhil/Downloads/tcia_download/NBIA-Download/CRLM-CT-1071',
 '/home/nikhil/Downloads/tcia_download/NBIA-Download/CRLM-CT-1092',
 '/home/nikhil/Downloads/tcia_download/NBIA-Download/CRLM-CT-1056',
 '/home/nikhil/Downloads/tcia_download/NBIA-Download/CRLM-CT-1154',
 '/home/nikhil/Downloads/tcia_download/NBIA-Download/CRLM-CT-1119',
 '/home/nikhil/Downloads/tcia_download/NBIA-Download/CRLM-CT-1080',
 '/home/nikhil/Downloads/tcia_download/NBIA-Download/CRLM-CT-1177',
 '/home/nikhil/Downloads/tcia_download/NBIA-Download/CRLM-CT-1153',
 '/home/nikhil/Downloads/tcia_download/NBIA-Download/CRLM-CT-1070',
 '/home/nikhil/Downloads/tcia_download/NBIA-Download/CRLM-CT-1026',
 '/home/nikhil/Downloads/tcia_download/NBIA-Download/CRLM-CT-1186',
 '/home/nikhil/Downloads/tcia_download/NBIA-Download/CRLM-CT-1195',
 '/home/nikhil/Downloads/tcia_download/NBIA-Download/CRLM-CT-1122',
 '/home/nikhil/Downloads/tcia_download/NBIA-Download/CRLM-CT-1002',
 '/home/nikhil/Downloads/tcia_download/NBIA-Down

In [8]:
# Create data loaders for training and testing
train_loader = DataLoader(train_dataset, batch_size=3, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=3, shuffle=False)
print(len(train_loader))
print(len(test_loader))

49
17


In [9]:
def check_class_distribution(data_loader, label_index=1):
    """Check and print the distribution of classes in the DataLoader."""
    count_dict = {}
    total_samples = 0

    for _, _, labels in data_loader:  # Assuming labels are at index 1
        total_samples += labels.shape[0]  # Number of samples in the batch
        for label in labels:
            label_value = label.item()  # Get the label value
            if label_value not in count_dict:
                count_dict[label_value] = 0
            count_dict[label_value] += 1

    # Print the distribution
    print("Class distribution:")
    for label_value, count in count_dict.items():
        proportion = count / total_samples * 100
        print(f"Label {label_value}: {count} samples ({proportion:.2f}%)")
    
    print(f"Total samples in DataLoader: {total_samples}")

# Check class distribution for training and testing data loaders
print("Training DataLoader Distribution:")
check_class_distribution(train_loader)

print("\nTesting DataLoader Distribution:")
check_class_distribution(test_loader)


Training DataLoader Distribution:
NIfTI file already exists at /home/nikhil/Downloads/tcia_download/NBIA-Download/CRLM-CT-1105/nifti/output.nii.gz. Skipping conversion.
NIfTI file already exists at /home/nikhil/Downloads/tcia_download/NBIA-Download/CRLM-CT-1115/nifti/output.nii.gz. Skipping conversion.
NIfTI file already exists at /home/nikhil/Downloads/tcia_download/NBIA-Download/CRLM-CT-1005/nifti/output.nii.gz. Skipping conversion.
NIfTI file already exists at /home/nikhil/Downloads/tcia_download/NBIA-Download/CRLM-CT-1160/nifti/output.nii.gz. Skipping conversion.
NIfTI file already exists at /home/nikhil/Downloads/tcia_download/NBIA-Download/CRLM-CT-1113/nifti/output.nii.gz. Skipping conversion.
NIfTI file already exists at /home/nikhil/Downloads/tcia_download/NBIA-Download/CRLM-CT-1049/nifti/output.nii.gz. Skipping conversion.
NIfTI file already exists at /home/nikhil/Downloads/tcia_download/NBIA-Download/CRLM-CT-1116/nifti/output.nii.gz. Skipping conversion.
NIfTI file already ex

In [10]:
# Load data from the training DataLoader and print shapes
print("Training Data:")
for i, data in enumerate(train_loader):
    if data is not None:
        img, labels, event = data
        print(f"Batch {i + 1}: img.shape = {img.shape}, labels = {labels}, event = {event}")
    if i == 2:  # Show only the first 3 batches
        break

Training Data:
NIfTI file already exists at /home/nikhil/Downloads/tcia_download/NBIA-Download/CRLM-CT-1046/nifti/output.nii.gz. Skipping conversion.
NIfTI file already exists at /home/nikhil/Downloads/tcia_download/NBIA-Download/CRLM-CT-1086/nifti/output.nii.gz. Skipping conversion.
NIfTI file already exists at /home/nikhil/Downloads/tcia_download/NBIA-Download/CRLM-CT-1051/nifti/output.nii.gz. Skipping conversion.
Batch 1: img.shape = torch.Size([3, 1, 128, 128, 64]), labels = tensor([  3.7667,  54.0667, 102.4333], dtype=torch.float64), event = tensor([1, 1, 0])
NIfTI file already exists at /home/nikhil/Downloads/tcia_download/NBIA-Download/CRLM-CT-1169/nifti/output.nii.gz. Skipping conversion.
NIfTI file already exists at /home/nikhil/Downloads/tcia_download/NBIA-Download/CRLM-CT-1009/nifti/output.nii.gz. Skipping conversion.
NIfTI file already exists at /home/nikhil/Downloads/tcia_download/NBIA-Download/CRLM-CT-1033/nifti/output.nii.gz. Skipping conversion.
Batch 2: img.shape = tor

In [11]:
# Load data from the testing DataLoader and print shapes
print("\nTesting Data:")
for i, data in enumerate(test_loader):
    if data is not None:
        img, labels, event = data
        print(f"Batch {i + 1}: img.shape = {img.shape}, labels = {labels}, event = {event}")
    if i == 2:  # Show only the first 3 batches
        break


Testing Data:
NIfTI file already exists at /home/nikhil/Downloads/tcia_download/NBIA-Download/CRLM-CT-1071/nifti/output.nii.gz. Skipping conversion.
NIfTI file already exists at /home/nikhil/Downloads/tcia_download/NBIA-Download/CRLM-CT-1092/nifti/output.nii.gz. Skipping conversion.
NIfTI file already exists at /home/nikhil/Downloads/tcia_download/NBIA-Download/CRLM-CT-1056/nifti/output.nii.gz. Skipping conversion.
Batch 1: img.shape = torch.Size([3, 1, 128, 128, 64]), labels = tensor([71.1000, 81.5000, 67.5667], dtype=torch.float64), event = tensor([1, 1, 1])
NIfTI file already exists at /home/nikhil/Downloads/tcia_download/NBIA-Download/CRLM-CT-1154/nifti/output.nii.gz. Skipping conversion.
NIfTI file already exists at /home/nikhil/Downloads/tcia_download/NBIA-Download/CRLM-CT-1119/nifti/output.nii.gz. Skipping conversion.
NIfTI file already exists at /home/nikhil/Downloads/tcia_download/NBIA-Download/CRLM-CT-1080/nifti/output.nii.gz. Skipping conversion.
Batch 2: img.shape = torch.