In [1]:
import numpy as np
import pandas as pd
import zipfile

In [2]:
import pandas as pd
import zipfile

# Location of the competition archive, relative to this notebook.
# (Earlier location, kept for reference: '../rsna-intracranial-hemorrhage-detection.zip')
zip_file_path = '../../rsna-intracranial-hemorrhage-detection.zip'

# Member inside the archive that holds the training labels.
csv_filename = 'rsna-intracranial-hemorrhage-detection/stage_2_train.csv'

# Open the archive and stream the CSV member straight into pandas;
# both handles are closed when the `with` block exits.
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref, zip_ref.open(csv_filename) as csv_file:
    labels_df = pd.read_csv(csv_file)

# Preview the first rows of the label table
labels_df.head(n=5)

Unnamed: 0,ID,Label
0,ID_12cadc6af_epidural,0
1,ID_12cadc6af_intraparenchymal,0
2,ID_12cadc6af_intraventricular,0
3,ID_12cadc6af_subarachnoid,0
4,ID_12cadc6af_subdural,0


In [3]:
# Inspect the ID column
print(labels_df['ID'].unique())  # Check unique IDs for format issues

# Raw IDs look like 'ID_<image id>_<diagnosis>' (e.g. 'ID_12cadc6af_epidural').
# The split below overwrites labels_df['ID'] in place, so it is guarded on the
# 'Diagnosis' column: re-running this cell after it has already run would
# otherwise try to split IDs that no longer contain the expected underscores.
if 'Diagnosis' not in labels_df.columns:
    # n=2 allows at most two splits, i.e. three parts: 'ID', image id, diagnosis
    split_ids = labels_df['ID'].str.split('_', n=2, expand=True)
    labels_df['ID'] = split_ids[1]
    labels_df['Diagnosis'] = split_ids[2]

# Check for any rows where Diagnosis is NaN (indicating an issue with the split)
print(labels_df[labels_df['Diagnosis'].isna()])
labels_df.head()

['ID_12cadc6af_epidural' 'ID_12cadc6af_intraparenchymal'
 'ID_12cadc6af_intraventricular' ... 'ID_4a85a3a3f_subarachnoid'
 'ID_4a85a3a3f_subdural' 'ID_4a85a3a3f_any']
Empty DataFrame
Columns: [ID, Label, Diagnosis]
Index: []


Unnamed: 0,ID,Label,Diagnosis
0,12cadc6af,0,epidural
1,12cadc6af,0,intraparenchymal
2,12cadc6af,0,intraventricular
3,12cadc6af,0,subarachnoid
4,12cadc6af,0,subdural


In [4]:
# Count rows whose (ID, Diagnosis) pair occurs more than once; keep=False
# marks every member of a duplicated group, not just the repeats.
duplicates = labels_df.loc[labels_df.duplicated(subset=['ID', 'Diagnosis'], keep=False)]
print("Duplicates found without Label:")
print(duplicates.shape[0])

# Same count, but rows must also agree on Label to be considered duplicates.
duplicates = labels_df.loc[labels_df.duplicated(subset=['ID', 'Diagnosis', 'Label'], keep=False)]
print("Duplicates found with Label:")
print(duplicates.shape[0])

# Keep only the first row of each exact (ID, Diagnosis, Label) group
labels_df = labels_df.drop_duplicates(subset=['ID', 'Diagnosis', 'Label'], keep='first')

Duplicates found without Label:
48
Duplicates found with Label:
48


In [5]:
# Sanity check: number of missing IDs, followed by the current column set
print(labels_df['ID'].isna().sum())
print(labels_df.columns)

0
Index(['ID', 'Label', 'Diagnosis'], dtype='object')


In [6]:
# Collapse to one row per (ID, Diagnosis). If a pair still occurs more than
# once, the largest Label among its rows is kept.
# as_index=False keeps the group keys as ordinary columns, so the result has
# the columns ID, Diagnosis and Label.
labels_df = labels_df.groupby(['ID', 'Diagnosis'], as_index=False)['Label'].max()

In [7]:
# Preview the aggregated long-format labels
labels_df.head(n=5)

Unnamed: 0,ID,Diagnosis,Label
0,000012eaf,any,0
1,000012eaf,epidural,0
2,000012eaf,intraparenchymal,0
3,000012eaf,intraventricular,0
4,000012eaf,subarachnoid,0


In [8]:
# Reshape from long to wide: one row per ID, one column per diagnosis.
# Missing (ID, diagnosis) combinations are filled with 0, and ID is turned
# back into an ordinary column.
labels_pivot = (
    labels_df
    .pivot(index='ID', columns='Diagnosis', values='Label')
    .fillna(0)
    .reset_index()
)

In [9]:
# Preview the wide-format labels
labels_pivot.head(n=5)

Diagnosis,ID,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural
0,000012eaf,0,0,0,0,0,0
1,000039fa0,0,0,0,0,0,0
2,00005679d,0,0,0,0,0,0
3,00008ce3c,0,0,0,0,0,0
4,0000950d7,0,0,0,0,0,0


In [10]:
# Report the number of missing values in each column of the pivoted labels
nan_counts = labels_pivot.isna().sum()
for column in labels_pivot.columns:
    print(f"NaN values in {column}: {nan_counts[column]}")

NaN values in ID: 0
NaN values in any: 0
NaN values in epidural: 0
NaN values in intraparenchymal: 0
NaN values in intraventricular: 0
NaN values in subarachnoid: 0
NaN values in subdural: 0


In [11]:
# Load the per-image table from sorted_training_dataset_descending.csv
sorted_dicom_df = pd.read_csv(filepath_or_buffer='sorted_training_dataset_descending.csv')
sorted_dicom_df.head(n=5)

Unnamed: 0,filename,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural,patient_id,study_instance_uid,series_instance_uid,image_position,samples_per_pixel,pixel_spacing,pixel_representation,window_center,window_width,rescale_intercept,rescale_slope
0,ID_deb85caf0.dcm,0,0,0,0,0,0,ID_fffc2bd6,ID_dbf89aa342,ID_5f23c752ef,"[-125.000000, -122.084023, 174.759506]",1,"[0.488281, 0.488281]",1,30,80,-1024.0,1.0
1,ID_716b72762.dcm,0,0,0,0,0,0,ID_fffc2bd6,ID_dbf89aa342,ID_5f23c752ef,"[-125.000000, -122.084023, 169.421448]",1,"[0.488281, 0.488281]",1,30,80,-1024.0,1.0
2,ID_a8aca4f40.dcm,0,0,0,0,0,0,ID_fffc2bd6,ID_dbf89aa342,ID_5f23c752ef,"[-125.000000, -122.084023, 164.083405]",1,"[0.488281, 0.488281]",1,30,80,-1024.0,1.0
3,ID_4184c4f03.dcm,0,0,0,0,0,0,ID_fffc2bd6,ID_dbf89aa342,ID_5f23c752ef,"[-125.000000, -122.084023, 158.745346]",1,"[0.488281, 0.488281]",1,30,80,-1024.0,1.0
4,ID_72e823e2c.dcm,0,0,0,0,0,0,ID_fffc2bd6,ID_dbf89aa342,ID_5f23c752ef,"[-125.000000, -122.084023, 153.409485]",1,"[0.488281, 0.488281]",1,30,80,-1024.0,1.0


In [12]:
# Compare the (rows, columns) of the two tables before joining them
print(sorted_dicom_df.shape, labels_pivot.shape, sep='\n')

(752803, 18)
(752803, 7)


In [13]:
# Step 1: Derive the join key from the filename, e.g. 'ID_deb85caf0.dcm' -> 'deb85caf0':
# take what follows the first underscore, then keep the part before the first dot.
sorted_dicom_df['ID'] = (
    sorted_dicom_df['filename']
    .str.split('_', n=1, expand=True)[1]
    .str.split('.')
    .str[0]
)

In [14]:
# NOTE: disabled Dask-based variant of the label merge on ID; nothing in this cell runs.
# import dask.dataframe as dd

# # Convert to Dask DataFrames
# sorted_dicom_dd = dd.from_pandas(sorted_dicom_df, npartitions=10)
# labels_pivot_dd = dd.from_pandas(labels_pivot, npartitions=10)

# # Assign values from labels_pivot_dd to sorted_dicom_dd based on ID
# diagnosis_columns = ['any', 'epidural', 'intraparenchymal', 'intraventricular', 'subarachnoid', 'subdural']
# sorted_dicom_dd = sorted_dicom_dd.merge(labels_pivot_dd[['ID'] + diagnosis_columns], on='ID', how='left')

# # Compute the result
# sorted_dicom_df = sorted_dicom_dd.compute()

In [15]:
# sorted_dicom_df.head()

In [16]:
# Step 2: Left-join the pivoted labels onto the DICOM rows by ID.
# Both tables carry the diagnosis columns, so the ones coming from
# labels_pivot get the '_label' suffix while the existing ones keep their names.
merged_df = pd.merge(
    sorted_dicom_df,
    labels_pivot,
    how='left',
    on='ID',
    suffixes=('', '_label'),
)

merged_df.head(n=5)

Unnamed: 0,filename,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural,patient_id,study_instance_uid,series_instance_uid,...,window_width,rescale_intercept,rescale_slope,ID,any_label,epidural_label,intraparenchymal_label,intraventricular_label,subarachnoid_label,subdural_label
0,ID_deb85caf0.dcm,0,0,0,0,0,0,ID_fffc2bd6,ID_dbf89aa342,ID_5f23c752ef,...,80,-1024.0,1.0,deb85caf0,0,0,0,0,0,0
1,ID_716b72762.dcm,0,0,0,0,0,0,ID_fffc2bd6,ID_dbf89aa342,ID_5f23c752ef,...,80,-1024.0,1.0,716b72762,0,0,0,0,0,0
2,ID_a8aca4f40.dcm,0,0,0,0,0,0,ID_fffc2bd6,ID_dbf89aa342,ID_5f23c752ef,...,80,-1024.0,1.0,a8aca4f40,0,0,0,0,0,0
3,ID_4184c4f03.dcm,0,0,0,0,0,0,ID_fffc2bd6,ID_dbf89aa342,ID_5f23c752ef,...,80,-1024.0,1.0,4184c4f03,0,0,0,0,0,0
4,ID_72e823e2c.dcm,0,0,0,0,0,0,ID_fffc2bd6,ID_dbf89aa342,ID_5f23c752ef,...,80,-1024.0,1.0,72e823e2c,0,0,0,0,0,0


In [17]:
# Step 3: Overwrite the diagnosis columns of sorted_dicom_df with the merged labels.
# IDs without a match in labels_pivot come out of the left join as NaN and are
# written as 0 here.
# NOTE(review): the column assignment aligns on the index, so this assumes
# merged_df rows line up one-to-one with sorted_dicom_df rows — confirm.
diagnosis_columns = ['any', 'epidural', 'intraparenchymal', 'intraventricular', 'subarachnoid', 'subdural']
for col in diagnosis_columns:
    label_values = merged_df[f'{col}_label']
    sorted_dicom_df[col] = label_values.fillna(0).astype(int)

# Show the first rows of the updated table
sorted_dicom_df.head(n=5)

Unnamed: 0,filename,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural,patient_id,study_instance_uid,series_instance_uid,image_position,samples_per_pixel,pixel_spacing,pixel_representation,window_center,window_width,rescale_intercept,rescale_slope,ID
0,ID_deb85caf0.dcm,0,0,0,0,0,0,ID_fffc2bd6,ID_dbf89aa342,ID_5f23c752ef,"[-125.000000, -122.084023, 174.759506]",1,"[0.488281, 0.488281]",1,30,80,-1024.0,1.0,deb85caf0
1,ID_716b72762.dcm,0,0,0,0,0,0,ID_fffc2bd6,ID_dbf89aa342,ID_5f23c752ef,"[-125.000000, -122.084023, 169.421448]",1,"[0.488281, 0.488281]",1,30,80,-1024.0,1.0,716b72762
2,ID_a8aca4f40.dcm,0,0,0,0,0,0,ID_fffc2bd6,ID_dbf89aa342,ID_5f23c752ef,"[-125.000000, -122.084023, 164.083405]",1,"[0.488281, 0.488281]",1,30,80,-1024.0,1.0,a8aca4f40
3,ID_4184c4f03.dcm,0,0,0,0,0,0,ID_fffc2bd6,ID_dbf89aa342,ID_5f23c752ef,"[-125.000000, -122.084023, 158.745346]",1,"[0.488281, 0.488281]",1,30,80,-1024.0,1.0,4184c4f03
4,ID_72e823e2c.dcm,0,0,0,0,0,0,ID_fffc2bd6,ID_dbf89aa342,ID_5f23c752ef,"[-125.000000, -122.084023, 153.409485]",1,"[0.488281, 0.488281]",1,30,80,-1024.0,1.0,72e823e2c


In [18]:
# Persist the labelled table to CSV without writing the row index.
# The 'ID' column currently present on sorted_dicom_df is written as well.
sorted_dicom_df.to_csv(
    path_or_buf='sorted_training_dataset_descending_with_labels.csv',
    index=False,
)

In [19]:
# Print every row whose intraparenchymal label equals 1
print(sorted_dicom_df.loc[sorted_dicom_df['intraparenchymal'].eq(1)])

                filename  any  epidural  intraparenchymal  intraventricular  \
40      ID_55f1fd2ef.dcm    1         0                 1                 0   
41      ID_032d5f85e.dcm    1         0                 1                 0   
42      ID_c225e3d45.dcm    1         0                 1                 0   
43      ID_390dbc246.dcm    1         0                 1                 1   
44      ID_487ed181a.dcm    1         0                 1                 1   
...                  ...  ...       ...               ...               ...   
752560  ID_7af428783.dcm    1         0                 1                 0   
752561  ID_98e61691f.dcm    1         0                 1                 0   
752562  ID_424bb2303.dcm    1         0                 1                 0   
752563  ID_8c4be97d5.dcm    1         0                 1                 0   
752564  ID_f30726c97.dcm    1         0                 1                 0   

        subarachnoid  subdural   patient_id study_i