In [1]:
# import zipfile
# import os
#
# def extract_folder_from_zip(zip_path, folder_name, output_path):
#     """
#     Extracts all files from a specific folder in a ZIP file to a destination directory,
#     preserving the original folder structure.
#
#     Parameters:
#     zip_path (str): Path to the ZIP file.
#     folder_name (str): Name of the folder within the ZIP file to extract.
#     output_path (str): Destination directory where files will be extracted.
#     """
#     with zipfile.ZipFile(zip_path, 'r') as zip_ref:
#         # Iterate over all the items in the ZIP file
#         for file_info in zip_ref.infolist():
#             # Check if the current item is in the specified folder
#             if file_info.filename.startswith(folder_name + '/'):
#                 # Define the full path for extraction
#                 # Use os.path.relpath to keep the structure relative to folder_name
#                 extracted_path = os.path.join(output_path, os.path.relpath(file_info.filename, start=folder_name))
#                 # Create any necessary directories
#                 os.makedirs(os.path.dirname(extracted_path), exist_ok=True)
#                 # Extract the file
#                 with zip_ref.open(file_info) as source_file, open(extracted_path, 'wb') as target_file:
#                     target_file.write(source_file.read())
#                 # print(f"Extracted: {file_info.filename} to {extracted_path}")
#
# # Example usage
# zip_file_path = '/media02/tdhoang01/21127112-21127734/data/rsna-intracranial-hemorrhage-detection.zip'  # Path to your ZIP file
# folder_to_extract = 'rsna-intracranial-hemorrhage-detection/stage_2_train'  # Folder inside the ZIP to extract
# destination_directory = '/media02/tdhoang01/21127112-21127734/data/rsna-ich-mil/'  # Where to extract files
#
# extract_folder_from_zip(zip_file_path, folder_to_extract, destination_directory)

In [2]:
# import os
# import pandas as pd
# import shutil
#
# def organize_dicom_files(csv_file, source_dir):
#     # Read the CSV file into a DataFrame
#     df = pd.read_csv(csv_file)
#
#     # Remove 'ID_' prefix from patient_id and study_instance_uid
#     df['patient_id'] = df['patient_id'].str.replace('ID_', '', regex=False)
#     df['study_instance_uid'] = df['study_instance_uid'].str.replace('ID_', '', regex=False)
#
#     # Group by patient_id and study_instance_uid
#     grouped = df.groupby(['patient_id', 'study_instance_uid'])
#
#     for (patient_id, study_instance_uid), group in grouped:
#         # Create a subfolder name based on patient_id and study_instance_uid
#         subfolder_name = f"{patient_id}_{study_instance_uid}"
#         subfolder_path = os.path.join(source_dir, subfolder_name)
#
#         # Create the subfolder if it does not exist
#         os.makedirs(subfolder_path, exist_ok=True)
#
#         # Move each file in the group to the respective subfolder
#         for _, row in group.iterrows():
#             filename = row['filename']
#             source_file_path = os.path.join(source_dir, filename)
#             destination_file_path = os.path.join(subfolder_path, filename)
#
#             # Move the file
#             if os.path.exists(source_file_path):
#                 shutil.move(source_file_path, destination_file_path)
#                 # print(f"Moved: {source_file_path} to {destination_file_path}")
#             else:
#                 print(f"File not found: {source_file_path}")
#
# # Example usage
# csv_file_path = '/media02/tdhoang01/21127112-21127734/data/training_dataset.csv'  # Path to your CSV file
# source_directory = '/media02/tdhoang01/21127112-21127734/data/rsna-ich-mil/'        # Source directory containing DICOM files
#
# organize_dicom_files(csv_file_path, source_directory)

In [3]:
import zipfile
import os
import pandas as pd

def _parse_filename_list(cell):
    """Turn a CSV cell such as "['a.dcm', 'b.dcm']" into ['a.dcm', 'b.dcm'].

    Non-string cells (e.g. NaN for a missing value) and empty lists ("[]")
    yield an empty list, so they never produce an empty filename.
    """
    if not isinstance(cell, str):
        return []
    stripped = cell.strip().strip("[]").replace("'", "").replace('"', "")
    # Split on bare commas and trim each piece so both "a, b" and "a,b" work.
    return [name for name in (part.strip() for part in stripped.split(",")) if name]


def extract_and_organize_from_zip(zip_path, csv_file, output_path, n, folder_name):
    """
    Extracts specific files from a ZIP file based on the filenames listed in a CSV,
    and organizes them into subfolders named "<patient_id>_<study_instance_uid>"
    (with any 'ID_' text removed from both ids), but only for the first n rows in the CSV.
    Only files within a specific folder in the ZIP are considered.

    Files are matched by base name. If the same filename is listed in several CSV rows,
    the first row that lists it decides the destination subfolder. Directory entries in
    the ZIP and empty/missing filename cells in the CSV are ignored.

    Parameters:
    zip_path (str): Path to the ZIP file.
    csv_file (str): Path to the CSV file containing filenames and metadata.
    output_path (str): Destination directory where files will be extracted and organized.
    n (int): Number of rows to process from the CSV file.
    folder_name (str): The specific folder within the ZIP file to read files from
        (a trailing '/' is accepted).

    Raises:
    ValueError: If the CSV lacks the 'patient_id', 'study_instance_uid' or 'filename' column.
    """
    # Read the CSV file into a DataFrame, limiting to the first n rows
    df = pd.read_csv(csv_file, nrows=n)

    # Ensure the CSV column names match the expected column names
    df.columns = df.columns.str.strip()  # Remove leading/trailing spaces from column names
    required_columns = ('patient_id', 'study_instance_uid', 'filename')
    if any(column not in df.columns for column in required_columns):
        raise ValueError("CSV file must contain 'patient_id', 'study_instance_uid', and 'filename' columns")

    # Remove 'ID_' from patient_id and study_instance_uid
    patient_ids = df['patient_id'].str.replace('ID_', '', regex=False)
    study_uids = df['study_instance_uid'].str.replace('ID_', '', regex=False)

    # Map every requested filename to its destination subfolder once, up front,
    # so each ZIP entry costs a dict lookup instead of a scan of the DataFrame.
    # setdefault keeps the first CSV row that mentions a filename.
    subfolder_by_filename = {}
    for patient_id, study_uid, cell in zip(patient_ids, study_uids, df['filename']):
        subfolder_name = f"{patient_id}_{study_uid}"
        for name in _parse_filename_list(cell):
            subfolder_by_filename.setdefault(name, subfolder_name)

    # Normalise so that both 'folder' and 'folder/' select the same ZIP entries
    folder_prefix = folder_name.rstrip('/') + '/'
    chunk_size = 1024 * 1024  # copy in 1 MiB pieces instead of loading a whole member

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        # Iterate over all the items in the ZIP file
        for file_info in zip_ref.infolist():
            # Directory entries have an empty base name and cannot be written as files
            if file_info.is_dir() or not file_info.filename.startswith(folder_prefix):
                continue

            base_name = os.path.basename(file_info.filename)
            subfolder_name = subfolder_by_filename.get(base_name)
            if subfolder_name is None:
                continue  # not one of the files requested by the CSV

            # Create the subfolder if it does not exist
            subfolder_path = os.path.join(output_path, subfolder_name)
            os.makedirs(subfolder_path, exist_ok=True)

            # Define the full path for extraction (flattened: base name only)
            extracted_path = os.path.join(subfolder_path, base_name)

            # Extract the file
            with zip_ref.open(file_info) as source_file, open(extracted_path, 'wb') as target_file:
                for chunk in iter(lambda: source_file.read(chunk_size), b''):
                    target_file.write(chunk)
            # print(f"Extracted: {file_info.filename} to {extracted_path}")

In [7]:
# Example usage: where to read from, where to write to, and how much to process
zip_file_path = "D:/Datasets/rsna-intracranial-hemorrhage-detection.zip"  # Source ZIP archive
csv_file_path = "rsna/data_analyze/testing_dataset_150.csv"  # CSV listing the files to extract
destination_directory = 'D:/Datasets/rsna-ich-mil/'  # Root folder that receives the organized files
n_rows = 2000  # Only the first n_rows rows of the CSV are used
folder_to_read = 'rsna-intracranial-hemorrhage-detection/stage_2_train'  # Folder inside the ZIP to read from

# Keyword arguments make the mapping between the settings above and the parameters explicit
extract_and_organize_from_zip(
    zip_path=zip_file_path,
    csv_file=csv_file_path,
    output_path=destination_directory,
    n=n_rows,
    folder_name=folder_to_read,
)

In [5]:
# import pandas as pd
#
# df = pd.read_csv('./rsna/data_analyze/training_dataset_1.csv')
#
# multi_label_columns = ['any', 'epidural', 'intraparenchymal', 'intraventricular', 'subarachnoid', 'subdural']
#
# for column in multi_label_columns:
#     df[column] = df[column].apply(
#         lambda x: eval(x) if isinstance(x, str) else x
#     )
#
# # Create one new binary indicator column (suffix '_') for each of the 6 multi-label columns
# for column in multi_label_columns:
#         df[column + f'_'] = df[column].apply(lambda x: 1 if sum(x) > 0 else 0)
# df.head()
#
# # df.to_csv('temp.csv', index=False)

In [6]:
# # Count the number of rows that have more than 1 label in these columns
# multi_label_columns = ['any_', 'epidural_', 'intraparenchymal_', 'intraventricular_', 'subarachnoid_', 'subdural_']
# df['multi_label'] = df[multi_label_columns].sum(axis=1)
# df['multi_label'].value_counts()


In [13]:
# Load the metadata of the training and testing splits
df_train = pd.read_csv('rsna/data_analyze/training_dataset_1150.csv')
df_test = pd.read_csv('rsna/data_analyze/testing_dataset_150.csv')

# Show the (rows, columns) shape of both splits side by side
print(df_train.shape, df_test.shape)

# Build one "<patient_id>_<study_instance_uid>" key per row of each split
train_ids, test_ids = (
    set(frame['patient_id'] + '_' + frame['study_instance_uid'])
    for frame in (df_train, df_test)
)

# Keys present in both splits, i.e. studies shared between train and test
duplicate_ids = train_ids & test_ids
duplicate_ids

(1000, 19) (150, 19)


{'ID_e0d2de32_ID_00047d6503'}