In [None]:
import pandas as pd

# Load the training table from CSV. The rest of this cell reads its
# 'patient_id' and 'study_instance_uid' columns plus the six label columns
# ('any', 'epidural', 'intraparenchymal', 'intraventricular', 'subarachnoid',
# 'subdural').
# NOTE(review): presumably one row per DICOM slice -- confirm against the CSV.
df = pd.read_csv('./rsna/data_analyze/sorted_training_dataset_with_labels.csv')

# Function to compute patient label
def compute_patient_label(group):
    """Return one binary label for a group of rows.

    Parameters:
    group (pd.DataFrame): Rows to summarise; must contain the columns
        'any', 'epidural', 'intraparenchymal', 'intraventricular',
        'subarachnoid' and 'subdural'.

    Returns:
    int: 0 when the grand total of those six columns over all rows is zero,
        otherwise 1.
    """
    label_columns = [
        'any',
        'epidural',
        'intraparenchymal',
        'intraventricular',
        'subarachnoid',
        'subdural',
    ]
    # First sum collapses the rows (one total per column), second sum
    # collapses the columns into a single number.
    per_column_totals = group[label_columns].sum()
    grand_total = per_column_totals.sum()
    if grand_total == 0:
        return 0
    return 1

# Reduce the rows of every (patient_id, study_instance_uid) pair to a single
# binary value stored in a 'patient_label' column
study_keys = ['patient_id', 'study_instance_uid']
grouped = df.groupby(study_keys)
patient_labels = grouped.apply(compute_patient_label)
patient_labels = patient_labels.reset_index(name='patient_label')

# Attach that label to every row of the original table
result = df.merge(patient_labels, on=study_keys)

# Preview the first rows
print(result.head())

# Persist the labelled table as a new CSV file
result.to_csv('./rsna/data_analyze/training_dataset.csv', index=False)

In [1]:
import pandas as pd
import zipfile
import os

def create_structured_zip(source_zip_path, destination_zip_path, dataset):
    """Copy DICOM files from a flat source archive into per-study subfolders
    of a destination archive.

    Every file listed in ``dataset`` is looked up in the source archive under
    ``rsna-intracranial-hemorrhage-detection/stage_2_train/`` and written to
    the destination archive as ``<patient>_<study>/<filename>``, where
    ``<patient>`` and ``<study>`` are the ``patient_id`` and
    ``study_instance_uid`` values with ``'ID_'`` removed.

    Parameters:
    source_zip_path (str): Path to the archive to read from.
    destination_zip_path (str): Path to the archive to create or append to.
    dataset: Anything ``pd.DataFrame(...)`` accepts, providing the columns
        'filename', 'patient_id' and 'study_instance_uid'.

    Files missing from the source archive are reported on stdout and skipped.
    Members already present in the destination archive are left untouched, so
    calling the function again does not create duplicate entries.
    """
    frame = pd.DataFrame(dataset)

    # Subfolder name per row. Kept as a separate Series (not assigned as a
    # column) so a DataFrame passed in by the caller is never modified.
    subfolder_names = (
        frame['patient_id'].str.replace('ID_', '', regex=False)
        + '_'
        + frame['study_instance_uid'].str.replace('ID_', '', regex=False)
    )

    source_prefix = "rsna-intracranial-hemorrhage-detection/stage_2_train/"

    # Open the source first so a wrong source path fails before a destination
    # archive is created.
    with zipfile.ZipFile(source_zip_path, 'r') as source_zip, \
            zipfile.ZipFile(destination_zip_path, 'a') as dest_zip:
        # Build the membership sets once; calling namelist() for every file
        # would rescan the whole archive listing each time.
        source_members = set(source_zip.namelist())
        existing_members = set(dest_zip.namelist())

        # groupby sorts by subfolder name and keeps row order inside a group.
        for subfolder, filenames in frame['filename'].groupby(subfolder_names):
            for filename in filenames:
                source_file_path = f"{source_prefix}{filename}"

                if source_file_path not in source_members:
                    print(f"File {source_file_path} not found in source zip.")
                    continue

                # Archive member names always use '/', whatever the host OS.
                dest_file_path = f"{subfolder}/{filename}"
                if dest_file_path in existing_members:
                    # Already archived (earlier run or duplicate row).
                    continue

                with source_zip.open(source_file_path) as source_file:
                    dest_zip.writestr(dest_file_path, source_file.read())
                existing_members.add(dest_file_path)

# Small in-memory sample: seven files that share one patient_id / study_instance_uid
sample_filenames = [
    'ID_45785016b.dcm',
    'ID_37f32aed2.dcm',
    'ID_1b9de2922.dcm',
    'ID_d61a6a7b9.dcm',
    'ID_406c82112.dcm',
    'ID_47beede43.dcm',
    'ID_bad62dc58.dcm',
]
dataset = {
    'filename': sample_filenames,
    'patient_id': ['ID_0002cd41'] * len(sample_filenames),
    'study_instance_uid': ['ID_66929e09d4'] * len(sample_filenames),
}

# Alternative input: the full table from CSV instead of the sample above
# dataset = pd.read_csv('./rsna/data_analyze/training_dataset.csv')

# Archive to read from and archive to build
source_zip_path = 'rsna-intracranial-hemorrhage-detection.zip'
destination_zip_path = 'rsna-ich-mil-temp.zip'

# Build the structured archive
create_structured_zip(
    source_zip_path=source_zip_path,
    destination_zip_path=destination_zip_path,
    dataset=dataset,
)

In [5]:
# Open the structured archive and print its first 10 entries to check the layout
with zipfile.ZipFile('rsna-ich-mil.zip', 'r') as zip_ref:
    # Take the listing once and slice it: slicing copes with archives that
    # hold fewer than 10 entries (indexing by position would raise
    # IndexError) and avoids rebuilding the name list on every iteration.
    for entry_name in zip_ref.namelist()[:10]:
        print(entry_name)

0002cd41_66929e09d4/ID_45785016b.dcm
0002cd41_66929e09d4/ID_37f32aed2.dcm
0002cd41_66929e09d4/ID_1b9de2922.dcm
0002cd41_66929e09d4/ID_d61a6a7b9.dcm
0002cd41_66929e09d4/ID_406c82112.dcm
0002cd41_66929e09d4/ID_47beede43.dcm
0002cd41_66929e09d4/ID_bad62dc58.dcm
0002cd41_66929e09d4/ID_40b991168.dcm
0002cd41_66929e09d4/ID_00e680819.dcm
0002cd41_66929e09d4/ID_54ef737a6.dcm


In [4]:
# Print every archive entry whose name begins with the given folder name
folder_path = '00326f32_81646f5161'
with zipfile.ZipFile('rsna-ich-mil.zip', 'r') as zip_ref:
    for name in zip_ref.namelist():
        if not name.startswith(folder_path):
            continue
        print(name)

00326f32_81646f5161/ID_6ae043a75.dcm
00326f32_81646f5161/ID_c20fb17ad.dcm
00326f32_81646f5161/ID_08189d9eb.dcm
00326f32_81646f5161/ID_618aa2870.dcm
00326f32_81646f5161/ID_97e302c22.dcm
00326f32_81646f5161/ID_fd83b2d03.dcm
00326f32_81646f5161/ID_c46cd6e1d.dcm
00326f32_81646f5161/ID_a7038750e.dcm
00326f32_81646f5161/ID_7d97d2ab8.dcm
00326f32_81646f5161/ID_53d4db036.dcm
00326f32_81646f5161/ID_001b4d902.dcm
00326f32_81646f5161/ID_b4fe793cd.dcm
00326f32_81646f5161/ID_479a5c561.dcm
00326f32_81646f5161/ID_a09f63171.dcm
00326f32_81646f5161/ID_b16c9e181.dcm
00326f32_81646f5161/ID_25520db15.dcm
00326f32_81646f5161/ID_8892b92c6.dcm
00326f32_81646f5161/ID_7c638957c.dcm
00326f32_81646f5161/ID_8de5edd02.dcm
00326f32_81646f5161/ID_7bebe71fc.dcm
00326f32_81646f5161/ID_51bc23a4e.dcm
00326f32_81646f5161/ID_22aa802b5.dcm
00326f32_81646f5161/ID_527954d58.dcm
00326f32_81646f5161/ID_20ac73f3d.dcm
00326f32_81646f5161/ID_adfd9c480.dcm
00326f32_81646f5161/ID_095be076f.dcm
00326f32_81646f5161/ID_c0d6b4bd5.dcm
0

In [1]:
import zipfile
import os

def extract_folder_from_zip(zip_path, folder_name, output_path):
    """
    Extracts all files from a specific folder in a ZIP file to a destination directory,
    preserving the folder structure below that folder.

    Parameters:
    zip_path (str): Path to the ZIP file.
    folder_name (str): Name of the folder within the ZIP file to extract.
        A trailing '/' is accepted and ignored.
    output_path (str): Destination directory where files will be extracted.

    Explicit directory entries in the archive are recreated as directories.
    Entries whose path would resolve outside ``output_path`` (for example via
    '..' components) are reported on stdout and skipped.
    """
    # Normalise so that 'folder' and 'folder/' select the same members.
    folder = folder_name.rstrip('/')
    member_prefix = folder + '/'

    output_root = os.path.abspath(output_path)
    # output_root with exactly one trailing separator, for the containment test.
    output_root_with_sep = os.path.join(output_root, '')

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        for file_info in zip_ref.infolist():
            if not file_info.filename.startswith(member_prefix):
                continue

            # Path of the member relative to the selected folder
            relative_path = os.path.relpath(file_info.filename, start=folder)
            extracted_path = os.path.join(output_path, relative_path)

            # Refuse members that would be written outside the destination.
            resolved_path = os.path.abspath(extracted_path)
            if (resolved_path != output_root
                    and not resolved_path.startswith(output_root_with_sep)):
                print(f"Skipping unsafe path in zip: {file_info.filename}")
                continue

            if file_info.is_dir():
                # A directory entry has no content; opening it as a file
                # would fail or leave a stray empty file in its place.
                os.makedirs(resolved_path, exist_ok=True)
                continue

            # Create any necessary directories
            os.makedirs(os.path.dirname(extracted_path), exist_ok=True)
            # Extract the file
            with zip_ref.open(file_info) as source_file, open(extracted_path, 'wb') as target_file:
                target_file.write(source_file.read())

# Example usage: pull the stage_2_train folder out of the source archive
zip_file_path = 'rsna-intracranial-hemorrhage-detection.zip'  # ZIP file to read
folder_to_extract = 'rsna-intracranial-hemorrhage-detection/stage_2_train'  # folder inside the ZIP
destination_directory = './rsna-ich-mil/'  # directory that receives the files

extract_folder_from_zip(
    zip_path=zip_file_path,
    folder_name=folder_to_extract,
    output_path=destination_directory,
)

In [4]:
# Delete every .dcm file directly inside the current directory (subfolders are not searched)
import os
import shutil

# Directory to clean
folder_path = './'
for filename in os.listdir(folder_path):
    # Leave everything that is not a .dcm file alone
    if not filename.endswith('.dcm'):
        continue
    file_path = os.path.join(folder_path, filename)
    os.remove(file_path)

In [2]:
import os
import pandas as pd
import shutil

def organize_dicom_files(csv_file, source_dir):
    """Move DICOM files inside ``source_dir`` into one subfolder per study.

    The CSV must provide the columns 'filename', 'patient_id' and
    'study_instance_uid'. For each (patient_id, study_instance_uid) pair a
    subfolder ``<patient>_<study>`` (both values with 'ID_' removed) is
    created inside ``source_dir`` and the listed files are moved into it.

    Parameters:
    csv_file (str): Path to the CSV file describing the files.
    source_dir (str): Directory that holds the files and receives the subfolders.

    A file is reported as not found only when it is in neither its original
    location nor its target subfolder, so running the function again after a
    completed (or interrupted) run does not report already-moved files.
    """
    # Read the CSV file into a DataFrame
    df = pd.read_csv(csv_file)

    # Remove 'ID_' prefix from patient_id and study_instance_uid
    df['patient_id'] = df['patient_id'].str.replace('ID_', '', regex=False)
    df['study_instance_uid'] = df['study_instance_uid'].str.replace('ID_', '', regex=False)

    # One group per (patient_id, study_instance_uid) pair
    grouped = df.groupby(['patient_id', 'study_instance_uid'])

    for (patient_id, study_instance_uid), group in grouped:
        subfolder_name = f"{patient_id}_{study_instance_uid}"
        subfolder_path = os.path.join(source_dir, subfolder_name)

        # Create the subfolder if it does not exist
        os.makedirs(subfolder_path, exist_ok=True)

        # Only the filename column is needed, so iterate over it directly
        # instead of materialising every row with iterrows().
        for filename in group['filename']:
            source_file_path = os.path.join(source_dir, filename)
            destination_file_path = os.path.join(subfolder_path, filename)

            if os.path.exists(source_file_path):
                shutil.move(source_file_path, destination_file_path)
            elif not os.path.exists(destination_file_path):
                # Missing from both places: genuinely not found.
                print(f"File not found: {source_file_path}")

# Example usage: sort the DICOM files into per-study subfolders
csv_file_path = './rsna/data_analyze/training_dataset.csv'  # CSV describing the files
source_directory = 'rsna-ich-mil/'        # directory that holds the DICOM files

organize_dicom_files(csv_file=csv_file_path, source_dir=source_directory)