In [1]:
import zipfile
import os

def extract_folder_from_zip(zip_path, folder_name, output_path):
    """
    Extracts all files from a specific folder in a ZIP file to a destination directory,
    preserving the folder structure relative to that folder.

    Parameters:
    zip_path (str): Path to the ZIP file.
    folder_name (str): Name of the folder within the ZIP file to extract, written with
        '/' separators as stored in the archive. A trailing '/' is accepted.
    output_path (str): Destination directory where files will be extracted.

    Raises:
    ValueError: If a matching archive entry contains a '..' path component, since
        extracting it could write outside output_path.
    """
    # Local import: the import block of this cell only provides zipfile and os.
    import shutil

    # Normalise so that both 'folder' and 'folder/' select the same entries.
    prefix = folder_name.rstrip('/') + '/'

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        for file_info in zip_ref.infolist():
            # Only entries located under the requested folder are of interest
            if not file_info.filename.startswith(prefix):
                continue

            # Path components relative to the requested folder; empty and '.'
            # components carry no information and are dropped.
            relative_parts = [
                part
                for part in file_info.filename[len(prefix):].split('/')
                if part not in ('', '.')
            ]
            if '..' in relative_parts:
                raise ValueError(
                    f"Unsafe path in ZIP archive (contains '..'): {file_info.filename}"
                )

            extracted_path = os.path.join(output_path, *relative_parts)

            # Directory entries (names ending in '/') have no content to write;
            # opening them as files would fail or create a file where a
            # directory is needed later.
            if file_info.is_dir():
                if extracted_path:
                    os.makedirs(extracted_path, exist_ok=True)
                continue

            # Create any necessary parent directories
            parent_dir = os.path.dirname(extracted_path)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)

            # Stream the member to disk instead of loading it fully into memory
            with zip_ref.open(file_info) as source_file, open(extracted_path, 'wb') as target_file:
                shutil.copyfileobj(source_file, target_file)

# Example usage: unpack the stage-2 training folder from the downloaded archive
zip_file_path = 'rsna-intracranial-hemorrhage-detection.zip'  # archive to read from
folder_to_extract = 'rsna-intracranial-hemorrhage-detection/stage_2_train'  # folder inside the archive
destination_directory = './rsna-ich-mil/'  # directory that receives the extracted files

extract_folder_from_zip(
    zip_path=zip_file_path,
    folder_name=folder_to_extract,
    output_path=destination_directory,
)

In [2]:
import os
import pandas as pd
import shutil

def organize_dicom_files(csv_file, source_dir):
    """
    Moves the files listed in a CSV into per-study subfolders of source_dir.

    The CSV must provide the columns 'patient_id', 'study_instance_uid' and
    'filename'. Every literal 'ID_' occurrence is removed from the two id
    columns, and each file is moved from source_dir/<filename> to
    source_dir/<patient_id>_<study_instance_uid>/<filename>.

    The function can be re-run safely: a file that is no longer in source_dir
    but already sits in its subfolder is skipped silently. A file found in
    neither place is reported with a "File not found" message.

    Parameters:
    csv_file (str): Path to the CSV file describing the files.
    source_dir (str): Directory containing the files; subfolders are created here.
    """
    # Read the CSV file into a DataFrame
    df = pd.read_csv(csv_file)

    # Strip the literal 'ID_' marker from patient_id and study_instance_uid
    df['patient_id'] = df['patient_id'].str.replace('ID_', '', regex=False)
    df['study_instance_uid'] = df['study_instance_uid'].str.replace('ID_', '', regex=False)

    # One group per (patient, study) pair. NOTE: pandas groupby skips rows whose
    # key is missing by default, so such rows are not processed.
    grouped = df.groupby(['patient_id', 'study_instance_uid'])

    for (patient_id, study_instance_uid), group in grouped:
        # Subfolder named after the patient and the study
        subfolder_name = f"{patient_id}_{study_instance_uid}"
        subfolder_path = os.path.join(source_dir, subfolder_name)

        # Create the subfolder if it does not exist
        os.makedirs(subfolder_path, exist_ok=True)

        # Only the filename column is needed, so iterate it directly
        for filename in group['filename']:
            source_file_path = os.path.join(source_dir, filename)
            destination_file_path = os.path.join(subfolder_path, filename)

            if os.path.exists(source_file_path):
                shutil.move(source_file_path, destination_file_path)
            elif os.path.exists(destination_file_path):
                # Already organized by an earlier run; nothing to do
                continue
            else:
                print(f"File not found: {source_file_path}")

# Example usage: sort the extracted DICOM files into one folder per patient/study
csv_file_path = './rsna/data_analyze/training_dataset.csv'  # CSV listing the files
source_directory = 'rsna-ich-mil/'  # directory that holds the DICOM files

organize_dicom_files(
    csv_file=csv_file_path,
    source_dir=source_directory,
)