In [2]:
import torch
import matplotlib.pyplot as plt
import nibabel as nib
import pydicom as dicom
import cv2
from PIL import Image
import numpy as np
import os
import shutil
import pandas as pd
import csv
from dataset_utils import *

Transform .nii files to .jpeg format

In [None]:
# Export one JPEG per NIfTI volume: the middle slice along the third axis,
# written next to the source file with the '.nii.gz' suffix replaced by '.jpeg'.
dataset_path = r'C:\Users\marin\Desktop\deepL\project\BraTS2021_Training_Data'

NIFTI_SUFFIX = '.nii.gz'

patients = os.listdir(dataset_path)

for patient in patients:
    # Filter on the folder name itself (not the full path), consistent with the
    # selection cell that follows; only names containing '_0' are processed.
    if '_0' not in patient:
        continue

    dir_patient = os.path.join(dataset_path, patient)
    # Stray files inside the dataset folder would make os.scandir fail.
    if not os.path.isdir(dir_patient):
        continue

    print(f"Opening patient files: {dir_patient}")
    with os.scandir(dir_patient) as entries:
        for entry in entries:
            # Only compressed NIfTI volumes are converted
            if not (entry.is_file() and entry.name.endswith(NIFTI_SUFFIX)):
                continue

            # Strip the suffix from the end of the path; searching for the first
            # '.nii.gz' could cut the path short if a folder name contained it.
            jpeg_path = entry.path[:-len(NIFTI_SUFFIX)] + '.jpeg'

            # Load the volume and pick the middle slice along the third axis
            volume = nib.load(entry.path).get_fdata()
            middle_slice = volume[..., volume.shape[2] // 2]

            # Render without axes or padding, then close the figure so figures
            # do not accumulate in memory across the loop.
            fig, ax = plt.subplots()
            ax.imshow(middle_slice, cmap='gray')
            ax.axis('off')
            fig.savefig(jpeg_path, format='jpeg', bbox_inches='tight', pad_inches=0)
            plt.close(fig)
        

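Select patients from the original folder (the `'_0'` folder-name filter controls which, and how many, patients are used), rotate each exported JPEG by 90° and resize it to 224×224, then save it to a single output folder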

In [None]:
# Gather the JPEG slices of the selected patients into one flat folder,
# rotated by 90 degrees and resized to 224x224.
dataset_path = r'C:\Users\marin\Desktop\deepL\project\BraTS2021_Training_Data'
output_path = r'C:\Users\marin\Desktop\deepL\project\BraTS2021_full'
os.makedirs(output_path, exist_ok=True)
patients = os.listdir(dataset_path)

for patient in patients:
    # Only patient folders whose name contains '_0' are used
    if '_0' not in patient:
        continue

    dir_patient = os.path.join(dataset_path, patient)

    for file in os.listdir(dir_patient):
        if '.jpeg' not in file:
            continue

        in_dir = os.path.join(dir_patient, file)
        # The context manager releases the file handle after each image
        # instead of leaving one open per processed file.
        with Image.open(in_dir) as img:
            im_rot = img.rotate(90)
            im_resized = im_rot.resize((224, 224), Image.Resampling.LANCZOS)

        # Output keeps the source file name; all patients share one folder
        out_dir = os.path.join(output_path, file)
        im_resized.save(out_dir, 'jpeg')
            
            

Split modalities into different folders for usage

In [None]:

# Source folder holding every exported slice, and the root of the per-modality copies
dataset_path = r'C:\Users\marin\Desktop\deepL\project\BraTS2021_full\\'
output_base_path = r'C:\Users\marin\Desktop\deepL\project\BraTS2021_full_classified'

# Names of the per-modality sub-folders
modalities = ['Flair', 't1ce', 't1', 't2', 'seg']

# Create every destination folder up front
for modality in modalities:
    os.makedirs(os.path.join(output_base_path, modality), exist_ok=True)

# (filename tag, destination folder) pairs, tested in order.
# '_t1ce' has to be tested before '_t1' because '_t1' is a prefix of it.
tag_to_folder = [
    ('_flair', 'Flair'),
    ('_t1ce', 't1ce'),
    ('_t1', 't1'),
    ('_t2', 't2'),
    ('_seg', 'seg'),
]

files = os.listdir(dataset_path)

for file in files:
    dir_file = os.path.join(dataset_path, file)

    # First tag found in the file name decides the modality
    matches = [(tag, folder) for tag, folder in tag_to_folder if tag in file]
    if not matches:
        print(f"Unknown modality for file: {file}")
        continue
    tag, modality = matches[0]

    # Drop the modality tag (and everything after it) from the copied file's name
    file_idx = file.find(tag)
    file_name = file[:file_idx] + '.jpeg'
    out_path = os.path.join(output_base_path, modality, file_name)

    # Segmentation files are copied as well, into the 'seg' folder
    shutil.copy(dir_file, out_path)
    print(f"Copied {file} to {out_path}")

print("File separation complete!")


Generate the text prompts, stored in a `metadata.csv` file, for each modality folder

In [None]:
# Build one metadata.csv (columns: file_name, text) per modality folder, with a
# text prompt derived from each segmentation mask.

# Thresholds used to bucket tumor_size into small / medium / large.
# NOTE(review): the names suggest quartiles of the tumor-size distribution — confirm.
q4 = 19000
q1 = 5900

# Define paths
out_dir = r'C:\Users\marin\Desktop\deepL\project\BraTS2021_full_classified'
ref_dir = os.path.join(out_dir, 'seg')  # REF. Contains the segmentation guides

# Modality names used in the prompt text, paired by position with their folders
modalities = ['t1', 't1ce', 't2', 'flair']
out_dirs = [os.path.join(out_dir, folder) for folder in ('t1', 't1ce', 't2', 'Flair')]

# Rows are collected in memory so each CSV is opened and written exactly once
rows_by_dir = {modality_dir: [] for modality_dir in out_dirs}

# Loop through segmentation files
for file in os.listdir(ref_dir):
    dir_file = os.path.join(ref_dir, file)

    # cv2.imread returns None (rather than raising) when the file cannot be decoded
    mask = cv2.imread(dir_file, cv2.IMREAD_GRAYSCALE)
    if mask is None:
        print(f"Skipping unreadable segmentation file: {dir_file}")
        continue

    # Process tumor mask
    tumor_mask = get_tumor_mask(mask)
    tumor_size = get_size(tumor_mask)

    # Size bucket and position depend only on the mask, so they are computed
    # once per file instead of once per modality.
    # NOTE(review): assumes get_position_label is a pure function of the mask.
    has_tumor = tumor_size > 0
    if has_tumor:
        position = get_position_label(tumor_mask)
        if tumor_size >= q4:
            size = 'large'
        elif tumor_size > q1:
            size = 'medium'
        else:
            size = 'small'

    # The same file name is recorded in every modality folder's metadata
    for modality, modality_dir in zip(modalities, out_dirs):
        if has_tumor:
            # For classification (test diffusion training) the prompt would be '1'
            prompt = f'Brain {modality} with {size} tumor on the {position}'
        else:
            # For classification (test diffusion training) the prompt would be '0'
            prompt = f'Brain MRI {modality} healthy'
        rows_by_dir[modality_dir].append([file, prompt])

# Write header + rows for each modality folder
for modality_dir, rows in rows_by_dir.items():
    if not rows:
        # Nothing to record: leave any existing metadata.csv untouched
        continue
    csv_file = os.path.join(modality_dir, 'metadata.csv')
    with open(csv_file, mode='w', newline='', encoding='utf-8') as csv_file_obj:
        csv_writer = csv.writer(csv_file_obj)
        csv_writer.writerow(['file_name', 'text'])
        csv_writer.writerows(rows)


The following code selects a balanced sample of healthy and diseased cases to build the training and test sets

In [None]:
# Draw a fixed-size sample of healthy and diseased images for one modality and
# write them, together with a shuffled metadata.csv, into a separate folder.

# NOTE: Folder containing the samples used for training / classifying
folder_task = r'C:\Users\marin\Desktop\deepL\project\BraTS2021_full_classified\\'

# NOTE: Modality used for training / testing
modality = 't1ce'
# NOTE: Folder containing the metadata.csv file with the prompts
image_folder = folder_task + modality + '\\'
# NOTE: Destination of the selected images and of the new metadata file
output_dir = folder_task + 'sample_train\\'

data = pd.read_csv(image_folder + 'metadata.csv')

os.makedirs(output_dir, exist_ok=True)

# Diffusion training uses text prompts: a case is healthy when its prompt says so.
# (For classification the labels are simplified to 1 = tumor / 0 = healthy, in
# which case the split would compare data['text'] against 0 and 1 instead.)
is_healthy = data['text'].str.contains('healthy', case=False)
healthy_cases = data[is_healthy]
diseased_cases = data[~is_healthy]

# NOTE: Max number of healthy images is 180.
# Generate the train and test sets with different random states.
healthy_cases_sample = healthy_cases.sample(n=120, random_state=33)
diseased_cases_sample = diseased_cases.sample(n=180, random_state=33)

# Merge both samples, then shuffle the rows
merged_cases = pd.concat([healthy_cases_sample, diseased_cases_sample])
combined_cases = merged_cases.sample(frac=1, random_state=42)

# One metadata.csv describing the whole selection
combined_output_csv = os.path.join(output_dir, 'metadata.csv')
combined_cases.to_csv(combined_output_csv, index=False)

# Copy the selected images: healthy first, then diseased
for sampled_cases in (healthy_cases_sample, diseased_cases_sample):
    copy_images(sampled_cases['file_name'], output_dir, image_folder)

## As written, this cell fills sample_train. The original note says two folders,
## sample_test and sample_train, get generated from this code — presumably by
## re-running it with a different output folder and random states.

Additional utils

In [None]:
## The following lines of code are to transform text labels to binary labels
## (0 = healthy, 1 = with tumor). The metadata file is overwritten in place.

metadata_path = '/home/mcrespo/migros_deepL/BraTS2021_final/sample_test/metadata.csv'
metadata = pd.read_csv(metadata_path)

# Row-wise flag: True when any column of the row mentions 'healthy'.
# The mask is built once and negated as a boolean Series; applying `~` to the
# scalar result of `.any()` only works while that scalar is a NumPy bool (on a
# plain Python bool it yields -1 / -2).
healthy_mask = metadata.astype(str).apply(
    lambda col: col.str.contains('healthy', case=False)
).any(axis=1)
healthy_files = metadata[healthy_mask]
diseased_files = metadata[~healthy_mask]


def _to_binary_label(value):
    """Map a prompt to 0 (healthy) or 1 (tumor); keep labels that are already 0/1.

    Because this cell overwrites its own input file, it may be run on metadata
    that was already converted. Without the pass-through, every numeric label
    (including 0) would be turned into 1 on the second run.
    """
    if isinstance(value, str):
        return 0 if 'healthy' in value.lower() else 1
    if value in (0, 1):
        return int(value)
    # Anything else (e.g. a missing value) is treated as non-healthy
    return 1


# Work on a copy of the already-loaded CSV instead of reading the file twice
df = metadata.copy()

# Replace the text prompts with binary labels
df['text'] = df['text'].apply(_to_binary_label)

output_file = '/home/mcrespo/migros_deepL/BraTS2021_final/sample_test/metadata.csv'  # Replace with your desired output file name
df.to_csv(output_file, index=False)

In [None]:
## Additional utils to resize images (if needed).
## Every file whose name contains '.jpeg' is resized to 224x224 and overwritten in place.
folder_resized =  'C:\\Users\\marin\\Desktop\\deepL\\project\\BraTS2021_simplified\\sample_t1ce_300' # Specify folder to use
for file_name in os.listdir(folder_resized):
    if '.jpeg' not in file_name:
        continue

    img_dir = os.path.join(folder_resized, file_name)
    # Close the source file before overwriting it: the resized copy is an
    # independent in-memory image, so it can be saved after the handle is released.
    with Image.open(img_dir) as img:
        im_resized = img.resize((224, 224), Image.Resampling.LANCZOS)
    im_resized.save(img_dir)
                