In [5]:
import os
import shutil
import pandas as pd
from sklearn.model_selection import train_test_split

# Paths
original_images_dir = r"C:\Users\Admin\Documents\rsna-pneumonia-detection-challenge\stage_2_train_images"
labels_file = r"C:\Users\Admin\Documents\rsna-pneumonia-detection-challenge\stage_2_train_labels.csv"
train_dir = "D:/data/train"
val_dir = "D:/data/val"

# Create the main train and val directories if they don't exist
os.makedirs(train_dir, exist_ok=True)
os.makedirs(val_dir, exist_ok=True)

# Create class-specific subdirectories (0/ and 1/) for train and val directories
os.makedirs(os.path.join(train_dir, '0'), exist_ok=True)
os.makedirs(os.path.join(train_dir, '1'), exist_ok=True)
os.makedirs(os.path.join(val_dir, '0'), exist_ok=True)
os.makedirs(os.path.join(val_dir, '1'), exist_ok=True)

# Read the labels CSV file
labels_df = pd.read_csv(labels_file)

# Remove duplicates based on patientId (keep the first occurrence)
labels_df_unique = labels_df.drop_duplicates(subset='patientId')

# Add the .dcm extension to patientId to match the image filenames
labels_df_unique['image_filename'] = labels_df_unique['patientId'].apply(lambda x: x + '.dcm')

# Split the data into training and validation sets (80-20 split)
train_df, val_df = train_test_split(labels_df_unique, test_size=0.2, stratify=labels_df_unique['Target'], random_state=42)

# Copy the images into the corresponding directories based on labels

# Process the train set
for index, row in train_df.iterrows():
    src = os.path.join(original_images_dir, row['image_filename'])
    
    # Determine the target directory based on the label (0 or 1)
    target_dir = os.path.join(train_dir, str(row['Target']))
    
    # Construct the destination path
    dst = os.path.join(target_dir, row['image_filename'])
    
    # Copy the image if it exists
    if os.path.exists(src):
        shutil.copy(src, dst)  # Copy image to the appropriate class folder

# Process the validation set
for index, row in val_df.iterrows():
    src = os.path.join(original_images_dir, row['image_filename'])
    
    # Determine the target directory based on the label (0 or 1)
    target_dir = os.path.join(val_dir, str(row['Target']))
    
    # Construct the destination path
    dst = os.path.join(target_dir, row['image_filename'])
    
    # Copy the image if it exists
    if os.path.exists(src):
        shutil.copy(src, dst)  # Copy image to the appropriate class folder

print("Data split complete.")




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labels_df_unique['image_filename'] = labels_df_unique['patientId'].apply(lambda x: x + '.dcm')


Data split complete.


In [6]:
import os
import pydicom
from PIL import Image

# Function to convert DICOM to image
def dicom_to_image(dicom_file, output_folder, output_format="png"):
    dicom_data = pydicom.dcmread(dicom_file)
    
    # Convert DICOM pixel data to an image
    img_array = dicom_data.pixel_array
    
    # Convert to PIL Image
    img = Image.fromarray(img_array)
    
    # Ensure the output folder exists
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    
    # Generate a valid filename (without extension)
    filename = os.path.splitext(os.path.basename(dicom_file))[0] + f'.{output_format}'
    output_path = os.path.join(output_folder, filename)
    
    # Save the image
    img.save(output_path)
    print(f"Saved {output_path}")
    
    # Delete the original DICOM file after conversion
    os.remove(dicom_file)
    print(f"Deleted {dicom_file}")

# Root folder where your dataset is located
root_folder = 'D:/data'

# Loop over all subdirectories (train/0, train/1, val/0, val/1)
for subfolder in ['train', 'val']:
    for class_folder in ['0', '1']:
        input_folder = os.path.join(root_folder, subfolder, class_folder)
        output_folder = os.path.join(root_folder, subfolder, class_folder)
        
        # Convert all .dcm files in the input folder
        for filename in os.listdir(input_folder):
            if filename.endswith('.dcm'):
                dicom_file = os.path.join(input_folder, filename)
                dicom_to_image(dicom_file, output_folder, output_format="png")  # Change to "jpg" if you prefer


Saved D:/data\train\0\0004cfab-14fd-4e49-80ba-63a80b6bddd6.png
Deleted D:/data\train\0\0004cfab-14fd-4e49-80ba-63a80b6bddd6.dcm
Saved D:/data\train\0\000924cf-0f8d-42bd-9158-1af53881a557.png
Deleted D:/data\train\0\000924cf-0f8d-42bd-9158-1af53881a557.dcm
Saved D:/data\train\0\0010f549-b242-4e94-87a8-57d79de215fc.png
Deleted D:/data\train\0\0010f549-b242-4e94-87a8-57d79de215fc.dcm
Saved D:/data\train\0\0022995a-45eb-4cfa-9a59-cd15f5196c64.png
Deleted D:/data\train\0\0022995a-45eb-4cfa-9a59-cd15f5196c64.dcm
Saved D:/data\train\0\0025d2de-bd78-4d36-9f72-e15a5e22ca82.png
Deleted D:/data\train\0\0025d2de-bd78-4d36-9f72-e15a5e22ca82.dcm
Saved D:/data\train\0\002c591d-df62-4e34-8eda-838c664430a9.png
Deleted D:/data\train\0\002c591d-df62-4e34-8eda-838c664430a9.dcm
Saved D:/data\train\0\00313ee0-9eaa-42f4-b0ab-c148ed3241cd.png
Deleted D:/data\train\0\00313ee0-9eaa-42f4-b0ab-c148ed3241cd.dcm
Saved D:/data\train\0\00322d4d-1c29-4943-afc9-b6754be640eb.png
Deleted D:/data\train\0\00322d4d-1c29-494