### Library

In [5]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import pydicom
import cv2
import os
import zipfile
import warnings

### Configure

In [6]:
# Seed: one value applied to both NumPy's and PyTorch's global random generators
SEED = 202406
np.random.seed(SEED)
torch.manual_seed(SEED)

# Constants
TEST_SIZE = 0.02  # presumably the hold-out fraction for a train/validation split -- TODO confirm (unused in the cells shown)
HEIGHT = 256
WIDTH = 256
CHANNELS = 3  # bsb_window stacks three windowed copies of each slice
TRAIN_BATCH_SIZE = 32  # not used in the cells shown; presumably for a data loader -- TODO confirm
VALID_BATCH_SIZE = 64  # not used in the cells shown; presumably for a data loader -- TODO confirm
SHAPE = (HEIGHT, WIDTH, CHANNELS)  # shape of the all-zero fallback image built by _read

# Folders
# Paths are built by plain string concatenation, so every directory must keep its trailing '/'.
DATA_DIR = './rsna-intracranial-hemorrhage-detection/'
TEST_IMAGES_DIR = DATA_DIR + 'stage_2_test/'
TRAIN_IMAGES_DIR = DATA_DIR + 'stage_2_train/'

### Windowing

In [7]:
def correct_dcm(dcm):
    """Re-encode the pixel data of ``dcm`` in place.

    Every pixel value is shifted up by 1000; values that reach 4096 or more
    are wrapped around by subtracting 4096. The result is written back to
    ``dcm.PixelData`` and ``dcm.RescaleIntercept`` is set to -1000.

    Parameters:
    - dcm: Object exposing ``pixel_array`` (a NumPy array); it is modified in place.
    """
    wrap_at = 4096
    shifted = dcm.pixel_array + 1000
    overflowed = shifted >= wrap_at
    shifted[overflowed] = shifted[overflowed] - wrap_at
    dcm.PixelData = shifted.tobytes()
    dcm.RescaleIntercept = -1000

def window_image(dcm, window_center, window_width):
    """Rescale a DICOM slice, resize it to the configured size and clip it to a window.

    Parameters:
    - dcm: A pydicom dataset. It may be modified in place by ``correct_dcm``.
    - window_center (int): Centre of the intensity window.
    - window_width (int): Full width of the intensity window.

    Returns:
    - numpy array of shape (HEIGHT, WIDTH) with values clipped to
      [window_center - window_width // 2, window_center + window_width // 2].
    """
    # 12-bit unsigned files whose intercept is above -100 are re-encoded first.
    if (dcm.BitsStored == 12) and (dcm.PixelRepresentation == 0) and (int(dcm.RescaleIntercept) > -100):
        correct_dcm(dcm)
    img = dcm.pixel_array * dcm.RescaleSlope + dcm.RescaleIntercept

    # cv2.resize takes dsize as (width, height), the reverse of SHAPE's
    # (HEIGHT, WIDTH, CHANNELS) order. Passing SHAPE[:2] directly only works
    # while HEIGHT == WIDTH.
    target_height, target_width = SHAPE[:2]
    img = cv2.resize(img, (target_width, target_height), interpolation=cv2.INTER_LINEAR)

    half_width = window_width // 2
    img_min = window_center - half_width
    img_max = window_center + half_width
    return np.clip(img, img_min, img_max)

def bsb_window(dcm):
    """Build a three-channel image from three intensity windows of one slice.

    The slice is windowed with (center, width) = (40, 80), (80, 200) and
    (40, 380). Each windowed image is shifted by the lower bound of its
    window and divided by the window width, which maps the clipped range
    onto [0, 1].

    Parameters:
    - dcm: A pydicom dataset, passed through to ``window_image``.

    Returns:
    - numpy array with the three windows as the last axis.
    """
    # (center, width) pairs in channel order: brain, subdural, soft tissue.
    window_settings = ((40, 80), (80, 200), (40, 380))

    channels = []
    for center, width in window_settings:
        lower_bound = center - width // 2
        windowed = window_image(dcm, center, width)
        channels.append((windowed - lower_bound) / width)

    # Stack channel-first, then move the channel axis to the end.
    return np.array(channels).transpose(1, 2, 0)

def _read(path, SHAPE):
    """Load one DICOM file and return its three-window image.

    Parameters:
    - path (str): Path of the DICOM file to read.
    - SHAPE (tuple): Shape of the all-zero image returned when windowing fails.

    Returns:
    - numpy array: The output of ``bsb_window``, or ``np.zeros(SHAPE)`` if
      windowing raised an exception.

    Note: ``pydicom.dcmread`` is called outside the ``try`` block, so a
    missing or unreadable file still raises.
    """
    dcm = pydicom.dcmread(path)
    try:
        img = bsb_window(dcm)
    except Exception as exc:
        # Best-effort fallback kept from the original, but no longer silent,
        # and no longer catching KeyboardInterrupt/SystemExit.
        warnings.warn(
            f"Could not window {path!r} ({exc!r}); using an all-zero image instead.",
            RuntimeWarning,
            stacklevel=2,
        )
        img = np.zeros(SHAPE)
    return img

### Read Dataset

In [8]:
def read_testset(filename=DATA_DIR + "stage_2_sample_submission.csv"):
    """Load the sample-submission CSV as a wide table.

    The first 12 characters of each ``ID`` become ``Image`` and the characters
    from position 13 onward become ``Diagnosis``. The result is indexed by
    ``Image`` with one ``Label`` column per ``Diagnosis`` value.

    Parameters:
    - filename (str): Path of the CSV file to read.

    Returns:
    - pandas DataFrame: The unstacked label table.
    """
    submission = pd.read_csv(filename)
    ids = submission["ID"]
    submission["Image"] = ids.str.slice(stop=12)
    submission["Diagnosis"] = ids.str.slice(start=13)

    long_table = submission[["Label", "Diagnosis", "Image"]]
    return long_table.set_index(["Image", "Diagnosis"]).unstack(level=-1)

def read_trainset(filename=DATA_DIR + "stage_2_train.csv"):
    """Load the training-label CSV as a wide table, minus known duplicate rows.

    The first 12 characters of each ``ID`` become ``Image`` and the characters
    from position 13 onward become ``Diagnosis``. A hard-coded list of row
    labels is dropped before the table is unstacked to one row per ``Image``
    with one ``Label`` column per ``Diagnosis`` value.

    Parameters:
    - filename (str): Path of the CSV file to read.

    Returns:
    - pandas DataFrame: The unstacked label table.
    """
    # Four runs of six consecutive row labels of the freshly read CSV.
    duplicate_rows = [
        *range(56346, 56352),
        *range(1171830, 1171836),
        *range(3705312, 3705318),
        *range(3842478, 3842484),
    ]

    labels = pd.read_csv(filename)
    ids = labels["ID"]
    labels["Image"] = ids.str.slice(stop=12)
    labels["Diagnosis"] = ids.str.slice(start=13)

    labels = labels.drop(index=duplicate_rows).reset_index(drop=True)
    long_table = labels[["Label", "Diagnosis", "Image"]]
    return long_table.set_index(["Image", "Diagnosis"]).unstack(level=-1)

# Read the test and train label tables from their default CSV locations under DATA_DIR.
# Each table has one row per image and one Label column per diagnosis.
test_df = read_testset()
train_df = read_trainset()

FileNotFoundError: [Errno 2] No such file or directory: './rsna-intracranial-hemorrhage-detection/stage_2_sample_submission.csv'

### Plot Dataset Samples

In [4]:
def plot_multiple_samples(data_list, labels_list, images_per_row=5):
    """Show a grid of images, each titled with its label.

    Parameters:
    - data_list (sequence): Images accepted by ``Axes.imshow``.
    - labels_list (sequence): One label per image, shown as ``Label: <label>``.
    - images_per_row (int): Number of grid columns (default is 5).

    Raises:
    - ValueError: If ``images_per_row`` is smaller than 1.

    Nothing is drawn when ``data_list`` is empty.
    """
    if images_per_row < 1:
        raise ValueError(f"images_per_row must be at least 1, got {images_per_row}")

    num_images = len(data_list)
    if num_images == 0:
        # plt.subplots rejects a grid with zero rows, so there is nothing to show.
        return

    num_rows = -(-num_images // images_per_row)  # ceiling division

    # squeeze=False always returns a 2-D array of axes, so ravel() also works
    # for a 1x1 grid (where plt.subplots would otherwise return a bare Axes).
    _, axs = plt.subplots(
        num_rows, images_per_row, figsize=(15, num_rows * 3), squeeze=False
    )
    axs = axs.ravel()  # Flatten the array of axes for easy iteration

    for i in range(num_images):
        axs[i].imshow(data_list[i], cmap='gray')
        axs[i].set_title(f'Label: {labels_list[i]}')
        axs[i].axis('off')  # Hide axes ticks

    # Hide any unused subplots
    for unused_ax in axs[num_images:]:
        unused_ax.axis('off')

    plt.subplots_adjust(wspace=0.5, hspace=0.5)
    plt.show()

# # Load and plot the first 20 images from the training dataset
# images_to_plot = []
# labels_to_plot = []

# for i in range(20):
#     img = _read(TRAIN_IMAGES_DIR + train_df.index[i] + ".dcm", SHAPE)
#     images_to_plot.append(img)
#     labels_to_plot.append(train_df.values[i])  # Assuming labels are in train_df

# plot_multiple_samples(images_to_plot, labels_to_plot)


In [None]:
# def print_dicom_info(directory, num_files=1):
#     """
#     Prints DICOM information for the first `num_files` in the specified directory.
    
#     Parameters:
#     - directory (str): The path to the directory containing DICOM files.
#     - num_files (int): The number of DICOM files to process (default is 1).
#     """
#     # Get a list of DICOM files in the directory
#     dicom_files = [f for f in os.listdir(directory) if f.endswith('.dcm')]
    
#     # Process only the specified number of files
#     for i, file_name in enumerate(dicom_files[:num_files]):
#         file_path = os.path.join(directory, file_name)
#         ds = pydicom.dcmread(file_path)
        
#         # Print all DICOM tags and their values
#         for elem in ds:
#             print(f"{elem.tag}: {elem.name} = {elem.value}")

# # Example usage
# print_dicom_info(TRAIN_IMAGES_DIR)

### Create a Dataframe

In [5]:
# Silence UserWarning messages raised from the pydicom.valuerep module;
# all other warnings are still shown.
warnings.filterwarnings("ignore", category=UserWarning, module='pydicom.valuerep')

def create_dicom_dataframe(zip_file_path, train_images_dir):
    """
    Creates a pandas DataFrame from DICOM files in the specified directory within a ZIP archive.

    Only header metadata is read; pixel data is skipped. The six label columns
    ('any', 'epidural', 'intraparenchymal', 'intraventricular', 'subarachnoid',
    'subdural') are placeholders filled with 0.

    Parameters:
    - zip_file_path (str): The path to the ZIP file containing the DICOM files.
    - train_images_dir (str): The path to the directory within the ZIP file containing the DICOM files.
      A trailing '/' is optional; an empty string selects every '.dcm' member.

    Returns:
    - pandas DataFrame: One row per DICOM file (empty if no file matched).
    """
    # Match on "<dir>/" so that a sibling such as "<dir>_extra/" is not picked up.
    prefix = train_images_dir.rstrip('/') + '/' if train_images_dir else ''
    label_columns = ('any', 'epidural', 'intraparenchymal',
                     'intraventricular', 'subarachnoid', 'subdural')

    data = []

    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        # Get a list of DICOM files in the directory within the ZIP
        dicom_files = [f for f in zip_ref.namelist()
                       if f.startswith(prefix) and f.endswith('.dcm')]

        for file_path in dicom_files:
            with zip_ref.open(file_path) as file:
                # Every tag used below precedes the pixel data, so stop before
                # decoding it.
                ds = pydicom.dcmread(file, stop_before_pixels=True)

                row = {'filename': os.path.basename(file_path)}
                row.update(dict.fromkeys(label_columns, 0))
                row.update({
                    'patient_id': ds.PatientID,
                    'study_instance_uid': ds.StudyInstanceUID,
                    'series_instance_uid': ds.SeriesInstanceUID,
                    # Multi-valued tags are stored as their string form.
                    'image_position': str(ds.ImagePositionPatient),
                    'samples_per_pixel': ds.SamplesPerPixel,
                    'pixel_spacing': str(ds.PixelSpacing),
                    'pixel_representation': ds.PixelRepresentation,
                    # NOTE(review): some files carry several window values, so
                    # these two columns can mix scalars and lists.
                    'window_center': ds.WindowCenter,
                    'window_width': ds.WindowWidth,
                    'rescale_intercept': ds.RescaleIntercept,
                    'rescale_slope': ds.RescaleSlope,
                })

                data.append(row)

    return pd.DataFrame(data)

# Build the metadata table for every '.dcm' file under the training directory
# of the competition archive. Both paths are relative to the working directory.
zip_file_path = 'rsna-intracranial-hemorrhage-detection.zip'
train_images_dir = 'rsna-intracranial-hemorrhage-detection/stage_2_train'
dicom_df = create_dicom_dataframe(zip_file_path, train_images_dir)

### Save dataframe to CSV file

In [6]:
# Save the DataFrame to a CSV file (without the index column) so that later
# cells can reload it instead of re-reading the archive.
dicom_df.to_csv('raw_training_dataset.csv', index=False)
print("DataFrame saved to raw_training_dataset.csv")

# Show the first rows as a quick sanity check.
dicom_df.head()

DataFrame saved to raw_training_dataset.csv


Unnamed: 0,filename,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural,patient_id,study_instance_uid,series_instance_uid,image_position,samples_per_pixel,pixel_spacing,pixel_representation,window_center,window_width,rescale_intercept,rescale_slope
0,ID_000012eaf.dcm,0,0,0,0,0,0,ID_f15c0eee,ID_30ea2b02d4,ID_0ab5820b2a,"[-125.000000, -115.897980, 77.970825]",1,"[0.488281, 0.488281]",1,30.0,80.0,-1024.0,1.0
1,ID_000039fa0.dcm,0,0,0,0,0,0,ID_eeaf99e7,ID_134d398b61,ID_5f8484c3e0,"[-125.000000, -141.318451, 62.720940]",1,"[0.488281, 0.488281]",1,30.0,80.0,-1024.0,1.0
2,ID_00005679d.dcm,0,0,0,0,0,0,ID_18f2d431,ID_b5c26cda09,ID_203cd6ec46,"[-134.463, -110.785, -39.569]",1,"[0.460938, 0.460938]",1,50.0,100.0,-1024.0,1.0
3,ID_00008ce3c.dcm,0,0,0,0,0,0,ID_ce8a3cd2,ID_974735bf79,ID_3780d48b28,"[-125, -83.0468112, 175.995344]",1,"[0.48828125, 0.48828125]",0,"[00040, 00040]","[00080, 00080]",-1024.0,1.0
4,ID_0000950d7.dcm,0,0,0,0,0,0,ID_d278c67b,ID_8881b1c4b1,ID_84296c3845,"[-126.437378, -126.437378, 157.500000]",1,"[0.494863, 0.494863]",1,35.0,135.0,-1024.0,1.0


### Sort the DataFrame by Patient ID, Study, and Slice Position

In [9]:
import ast

def sort_dicom_dataframe(df):
    """
    Sorts the DataFrame by patient_id, study_instance_uid, and the last value in image_position.

    All three keys are sorted in descending order. The input DataFrame is left
    untouched; the temporary sort key exists only on an internal copy.

    Parameters:
    - df (pandas DataFrame): The DataFrame containing DICOM information.
      ``image_position`` is expected to hold list literals such as
      "[-125.0, -115.9, 77.97]"; non-string entries get a NaN sort key.

    Returns:
    - pandas DataFrame: The sorted DataFrame.
    """
    sort_key = 'image_position_last'

    # Convert image_position strings to lists and extract the last value for sorting
    last_coordinate = df['image_position'].apply(
        lambda x: ast.literal_eval(x)[-1] if isinstance(x, str) else float('nan')
    )

    # assign() works on a copy, so the caller's DataFrame keeps its columns.
    sorted_df = (
        df.assign(**{sort_key: last_coordinate})
        .sort_values(
            by=['patient_id', 'study_instance_uid', sort_key],
            ascending=[False, False, False],
        )
        .drop(columns=[sort_key])
    )

    return sorted_df

# Reload the metadata table from the CSV written earlier, so this cell does
# not depend on the archive having been re-read in the current session.
dicom_df = pd.read_csv('raw_training_dataset.csv')

# Sort by patient, study and slice position (all descending).
sorted_dicom_df = sort_dicom_dataframe(dicom_df)

In [10]:
# Save the sorted DataFrame to a new CSV file (without the index column).
# The message names the file that is actually written.
sorted_dicom_df.to_csv('sorted_training_dataset_descending.csv', index=False)
print("Sorted DataFrame saved to sorted_training_dataset_descending.csv")

Sorted DataFrame saved to sorted_training_dataset.csv


In [11]:
# Preview the first rows of the sorted table; the original row labels are kept.
sorted_dicom_df.head()

Unnamed: 0,filename,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural,patient_id,study_instance_uid,series_instance_uid,image_position,samples_per_pixel,pixel_spacing,pixel_representation,window_center,window_width,rescale_intercept,rescale_slope
655283,ID_deb85caf0.dcm,0,0,0,0,0,0,ID_fffc2bd6,ID_dbf89aa342,ID_5f23c752ef,"[-125.000000, -122.084023, 174.759506]",1,"[0.488281, 0.488281]",1,30,80,-1024.0,1.0
334384,ID_716b72762.dcm,0,0,0,0,0,0,ID_fffc2bd6,ID_dbf89aa342,ID_5f23c752ef,"[-125.000000, -122.084023, 169.421448]",1,"[0.488281, 0.488281]",1,30,80,-1024.0,1.0
496845,ID_a8aca4f40.dcm,0,0,0,0,0,0,ID_fffc2bd6,ID_dbf89aa342,ID_5f23c752ef,"[-125.000000, -122.084023, 164.083405]",1,"[0.488281, 0.488281]",1,30,80,-1024.0,1.0
192889,ID_4184c4f03.dcm,0,0,0,0,0,0,ID_fffc2bd6,ID_dbf89aa342,ID_5f23c752ef,"[-125.000000, -122.084023, 158.745346]",1,"[0.488281, 0.488281]",1,30,80,-1024.0,1.0
338774,ID_72e823e2c.dcm,0,0,0,0,0,0,ID_fffc2bd6,ID_dbf89aa342,ID_5f23c752ef,"[-125.000000, -122.084023, 153.409485]",1,"[0.488281, 0.488281]",1,30,80,-1024.0,1.0


### Plot Images With Specific Patient ID

In [None]:
def plot_images_by_patient_id(df, patient_id, images_per_row=5):
    """
    Plots all images for a specific patient_id from the DataFrame.

    Images are loaded from TRAIN_IMAGES_DIR in the row order of ``df`` and
    each one is titled with the patient id.

    Parameters:
    - df (pandas DataFrame): The DataFrame containing DICOM information
      (needs the 'patient_id' and 'filename' columns).
    - patient_id (str): The patient_id for which to plot images.
    - images_per_row (int): Number of images to display per row in the plot.

    Raises:
    - ValueError: If ``df`` has no row for ``patient_id``.
    """
    # Filter the DataFrame for the specific patient_id
    patient_data = df[df['patient_id'] == patient_id]

    # Fail early with a clear message instead of a matplotlib error about an
    # empty grid.
    if patient_data.empty:
        raise ValueError(f"No images found for patient_id {patient_id!r}")

    # Load images corresponding to the filenames
    images_to_plot = [
        _read(os.path.join(TRAIN_IMAGES_DIR, file_name), SHAPE)
        for file_name in patient_data['filename']
    ]
    # Every image is labelled with the patient id; swap in another column if needed.
    labels_to_plot = patient_data['patient_id'].tolist()

    # Plot the images using the provided function
    plot_multiple_samples(images_to_plot, labels_to_plot, images_per_row)

# Example usage: plot every slice of one patient in the order of the sorted table.
plot_images_by_patient_id(sorted_dicom_df, 'ID_0002cd41')

### Analyze Patient Data

In [12]:
def analyze_patient_data(df):
    """
    Prints how many distinct patients the DataFrame holds and how many images each has.

    Four lines are printed: the number of unique patient IDs, then the minimum,
    maximum and average (two decimals) number of rows per patient ID.

    Parameters:
    - df (pandas DataFrame): The DataFrame containing DICOM information
      (needs the 'patient_id' column).
    """
    # Rows per patient, and the number of distinct patients.
    images_per_patient = df.groupby('patient_id').size()
    patient_total = df['patient_id'].nunique()

    smallest = images_per_patient.min()
    largest = images_per_patient.max()
    average = images_per_patient.mean()

    report = (
        f"Number of unique patient IDs: {patient_total}",
        f"Minimum number of images per patient ID: {smallest}",
        f"Maximum number of images per patient ID: {largest}",
        f"Average number of images per patient ID: {average:.2f}",
    )
    for line in report:
        print(line)

# Example usage: summarise the number of images per patient in the sorted table.
analyze_patient_data(sorted_dicom_df)

Number of unique patient IDs: 18938
Minimum number of images per patient ID: 20
Maximum number of images per patient ID: 548
Average number of images per patient ID: 39.75
