# MIMIC

* Reading in the labels
* Reading in metadata
* Only keeping AP view

* Reading in the embedding list
* Reading the reports
* Merging df
* Create train/val/test split


In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
df_labels = pd.read_csv('/mnt/data2/datasets_lfay/MedImageInsights/data/MIMIC-v1.0-512/mimic-cxr-2.0.0-chexpert.csv')
df_labels_expert = pd.read_csv('/mnt/data2/datasets_lfay/MedImageInsights/data/MIMIC-v1.0-512/mimic-cxr-2.1.0-test-set-labeled.csv')
print(df_labels.shape)
print(df_labels_expert.shape)

(227827, 16)
(687, 15)


In [3]:
# Keep all columns of df_labels, but exchange with df_labels_expert where available

# Merge the dataframes to bring updated rows
df_merged = pd.merge(
    df_labels, 
    df_labels_expert, 
    on='study_id', 
    how='left', 
    suffixes=('', '_expert')
)

# Replace values in df_labels with corresponding values from df_labels_expert where available
for col in df_labels_expert.columns:
    if "Airspace" in col:
        continue
    if col != 'study_id':  # Avoid replacing 'study_id' itself
        df_merged[col] = df_merged[col + '_expert'].combine_first(df_merged[col])

# Drop the "_expert" columns used for merging
df_merged = df_merged[df_labels.columns]
print(df_merged.shape)

(227827, 16)


In [4]:
# get duplicates
duplicates = df_merged[df_merged.duplicated(subset=['study_id'], keep=False)]
print(duplicates.shape)


(0, 16)


In [5]:
df_merged[(df_merged['subject_id'] == 13042648) & (df_merged['study_id'] == 58779246)]


Unnamed: 0,subject_id,study_id,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,No Finding,Pleural Effusion,Pleural Other,Pneumonia,Pneumothorax,Support Devices
68970,13042648,58779246,,,1.0,,,,,1.0,,,,1.0,,1.0


In [6]:
# replace nan and -1 with 0
df_merged = df_merged.fillna(0)
df_merged = df_merged.replace(-1, 0)
# get duplicates
duplicates = df_merged[df_merged.duplicated(subset=['study_id'], keep=False)]
print(duplicates.shape)

(0, 16)


### Metadata

In [7]:
df_meta = pd.read_csv("/mnt/data2/datasets_lfay/MedImageInsights/data/MIMIC-v1.0-512/mimic-cxr-2.0.0-metadata.csv")

# drop row if ViewPosition is not AP
df_meta = df_meta[df_meta['ViewPosition'] == 'AP']
df_meta.shape

(147173, 12)

In [8]:
# Merge based on study_id and subject_id only keep rows that are in df_meta
df_merge = pd.merge(
    df_merged, 
    df_meta[['study_id', 'subject_id', 'dicom_id']],  # Only keep relevant columns for merging
    on=['study_id', 'subject_id'], 
    how='inner'
)

df_merge.shape

(147169, 17)

In [9]:
# get duplicates
duplicates = df_merged[df_merged.duplicated(subset=['study_id'], keep=False)]
print(duplicates.shape)

(0, 16)


### Embeddings

In [10]:
df_embeddings = pd.read_csv("/mnt/data2/datasets_lfay/MedImageInsights/data/MIMIC-v1.0-512/all_embeddings.csv")
# rename Unnamed:0 to Path
df_embeddings = df_embeddings.rename(columns={"Unnamed: 0": "Path"})
df_embeddings.head(2)

Unnamed: 0,Path,0,1,2,3,4,5,6,7,8,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,/mnt/data2/datasets_lfay/MedImageInsights/data...,0.008716,0.023156,-0.022518,0.014674,-0.012687,0.023082,-0.019689,-0.020924,0.007848,...,-0.008098,-0.006153,-0.072708,-0.060997,0.04044,-0.012472,-0.003973,0.007015,-0.035673,0.005564
1,/mnt/data2/datasets_lfay/MedImageInsights/data...,-3.4e-05,0.013148,-0.014591,0.017788,-0.012439,-0.011676,-0.006333,-0.030729,0.025696,...,0.043519,0.011985,-0.019353,-0.046151,-0.000618,-0.032512,-0.003576,0.017097,-0.038961,0.035584


In [11]:
img = '/mnt/data2/datasets_lfay/MedImageInsights/data/MIMIC-v1.0-512/files/p13/p13042648/s58779246/24f6c834-7e14cae0-66293a0a-6c8044c9-4a87967d.jpg'
df_embeddings[df_embeddings['Path'] == img]

Unnamed: 0,Path,0,1,2,3,4,5,6,7,8,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
243961,/mnt/data2/datasets_lfay/MedImageInsights/data...,-0.019185,0.031098,-0.001581,-0.013201,0.007646,-0.005413,-0.023828,-0.034046,-0.01463,...,0.011989,-0.018302,-0.015856,-0.07872,-0.00882,-0.012025,0.031064,0.01205,-0.02878,0.007968


In [12]:
# Extract subject_id and study_id
df_embeddings['subject_id'] = df_embeddings['Path'].str.extract(r'/p\d+/p(\d+)/s')
df_embeddings['study_id'] = df_embeddings['Path'].str.extract(r'/s(\d+)/')
df_embeddings['dicom_id'] = df_embeddings['Path'].str.extract(r'/([^/]+).jpg')

# Convert to integers
df_embeddings['subject_id'] = df_embeddings['subject_id'].astype(int)
df_embeddings['study_id'] = df_embeddings['study_id'].astype(int)

df_embeddings.head(2)

Unnamed: 0,Path,0,1,2,3,4,5,6,7,8,...,1017,1018,1019,1020,1021,1022,1023,subject_id,study_id,dicom_id
0,/mnt/data2/datasets_lfay/MedImageInsights/data...,0.008716,0.023156,-0.022518,0.014674,-0.012687,0.023082,-0.019689,-0.020924,0.007848,...,-0.060997,0.04044,-0.012472,-0.003973,0.007015,-0.035673,0.005564,11769941,53344629,bc9373a9-19cc43db-50ef2d5c-c5201f96-c8859ecc
1,/mnt/data2/datasets_lfay/MedImageInsights/data...,-3.4e-05,0.013148,-0.014591,0.017788,-0.012439,-0.011676,-0.006333,-0.030729,0.025696,...,-0.046151,-0.000618,-0.032512,-0.003576,0.017097,-0.038961,0.035584,11769941,58953102,28a90887-a94ecd9f-df2ed39e-319cc783-74b2bf5e


In [13]:
df_merged[(df_merged['subject_id'] == 13042648) & (df_merged['study_id'] == 58779246)]


Unnamed: 0,subject_id,study_id,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,No Finding,Pleural Effusion,Pleural Other,Pneumonia,Pneumothorax,Support Devices
68970,13042648,58779246,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0


In [14]:
# Reorder columns to place 'study_id' and 'subject_id' after 'Path'
columns = list(df_embeddings.columns)
columns.remove('study_id')
columns.remove('subject_id')
columns.remove('dicom_id')
columns.insert(columns.index('Path') + 1, 'study_id')
columns.insert(columns.index('Path') + 2, 'subject_id')
columns.insert(columns.index('Path') + 3, 'dicom_id')

df_embeddings = df_embeddings[columns]
df_embeddings.shape


(294257, 1028)

### Merge embeddings with labels


In [15]:

# Perform the merge
df_result = pd.merge(
    df_merge, 
    df_embeddings, 
    on=['study_id', 'subject_id', 'dicom_id'],  # Merge on these common columns
    how='left'  # Keep only rows from df_merge
)

print("Merge complete. Resulting dataframe:")
print(df_result.shape)
df_result.head(2)

# remove duplicates
df_result = df_result.drop_duplicates(subset=['Path'])
df_result.shape


Merge complete. Resulting dataframe:
(147169, 1042)


(147169, 1042)

In [16]:
# Reorder columns to make 'Path' the first column
columns = ['Path'] + [col for col in df_result.columns if col != 'Path']
df_result = df_result[columns]
df_result.head(2)

Unnamed: 0,Path,subject_id,study_id,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,/mnt/data2/datasets_lfay/MedImageInsights/data...,10000032,53911762,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.018527,0.012001,-0.020366,-0.039709,0.004157,-0.010063,-0.009169,-0.039814,-0.035478,0.002569
1,/mnt/data2/datasets_lfay/MedImageInsights/data...,10000032,53911762,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.005905,-0.016116,-0.010489,-0.0429,-0.005026,0.004733,-0.010128,-0.029413,-0.030221,-0.01709


## Read reports

In [17]:
import os
import pandas as pd

# Assuming df_result is your DataFrame
report_texts = []

# Iterate through each row in the DataFrame
for report_path in df_result['Path'].values:
    try:
        # Construct the path to the report file
        report_file_path = os.path.join("/", *report_path.split('/')[:-1]) + '.txt'
        
        # Read the content of the report file
        with open(report_file_path, 'r') as f:
            report_text = f.read()
    except FileNotFoundError:
        # Handle cases where the file is not found
        report_text = None
        print(f"File not found: {report_file_path}")
    except Exception as e:
        # Handle other exceptions
        report_text = None
        print(f"Error reading file {report_file_path}: {e}")
    
    # Append the report text (or None if file not found) to the list
    report_texts.append(report_text)

# Add the report texts to a new column in the DataFrame
df_result['report'] = report_texts


In [18]:
# Move the 'report' column to be the fourth column (after 'study_id')
# First, get the list of columns
columns = list(df_result.columns)

# Find the index of 'study_id' column
study_id_index = columns.index('study_id')

# Rearrange the columns: Insert 'report' right after 'study_id'
columns.insert(study_id_index + 1, columns.pop(columns.index('report')))

# Reassign the DataFrame with the new column order
df_result = df_result[columns]

# Check the new column order
df_result.head(2)


Unnamed: 0,Path,subject_id,study_id,report,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,/mnt/data2/datasets_lfay/MedImageInsights/data...,10000032,53911762,FINAL REPORT\...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.018527,0.012001,-0.020366,-0.039709,0.004157,-0.010063,-0.009169,-0.039814,-0.035478,0.002569
1,/mnt/data2/datasets_lfay/MedImageInsights/data...,10000032,53911762,FINAL REPORT\...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.005905,-0.016116,-0.010489,-0.0429,-0.005026,0.004733,-0.010128,-0.029413,-0.030221,-0.01709


In [19]:
# remove /mnt/data2/datasets_lfay from Path
df_result['Path'] = df_result['Path'].str.replace('/mnt/data2/datasets_lfay', '')
df_result.head(2)

Unnamed: 0,Path,subject_id,study_id,report,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,/MedImageInsights/data/MIMIC-v1.0-512/files/p1...,10000032,53911762,FINAL REPORT\...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.018527,0.012001,-0.020366,-0.039709,0.004157,-0.010063,-0.009169,-0.039814,-0.035478,0.002569
1,/MedImageInsights/data/MIMIC-v1.0-512/files/p1...,10000032,53911762,FINAL REPORT\...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.005905,-0.016116,-0.010489,-0.0429,-0.005026,0.004733,-0.010128,-0.029413,-0.030221,-0.01709


### extract findings and impressions

In [20]:
import re

def extract_findings_and_impressions(reports, combine=False):
    """
    Extract findings and impressions from reports.

    Args:
        reports (list of str): List of report texts.
        combine (bool): If True, combines findings and impressions into one string.

    Returns:
        dict: A dictionary with keys 'findings', 'impressions', and optionally 'combined'.
    """
    findings_list = []
    impressions_list = []
    combined_list = [] if combine else None
    
    for report in reports:
        # Normalize spaces and line breaks
        report = " ".join(report.split())
        
        # Extract findings
        findings_match = re.search(r"FINDINGS:\s(.*?)(?=\sIMPRESSION:|\sCONCLUSION:|$)", report, re.IGNORECASE)
        findings = findings_match.group(1).strip() if findings_match else None
        findings = f"FINDINGS: {findings}" if findings else None
        findings_list.append(findings)
        
        # Extract impressions
        impression_match = re.search(r"IMPRESSION:\s(.*?)(?=\sFINDINGS:|$)", report, re.IGNORECASE)
        impression = impression_match.group(1).strip() if impression_match else None
        impression = f"IMPRESSION: {impression}" if impression else None
        impressions_list.append(impression)
        
    
    result = {
        "findings": findings_list,
        "impressions": impressions_list
    }

    return result


In [24]:

# Add section_findings and section_impressions to df_result
report_texts = df_result['report'].values
extracted = extract_findings_and_impressions(report_texts)
df_result['section_findings'] = extracted['findings']
df_result['section_impression'] = extracted['impressions']

# move column after report
columns = list(df_result.columns)
columns.insert(columns.index('report') + 1, columns.pop(columns.index('section_findings')))
columns.insert(columns.index('section_findings') + 1, columns.pop(columns.index('section_impression')))
df_result = df_result[columns]

df_result.head(2)


Unnamed: 0,Path,report,section_findings,section_impression,section_impressions,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,/MedImageInsights/data/MIMIC-v1.0-512/files/p1...,FINAL REPORT\...,FINDINGS: Single frontal view of the chest pro...,IMPRESSION: No acute intrathoracic process.,IMPRESSION: No acute intrathoracic process.,0.0,0.0,0.0,0.0,0.0,...,0.018527,0.012001,-0.020366,-0.039709,0.004157,-0.010063,-0.009169,-0.039814,-0.035478,0.002569
1,/MedImageInsights/data/MIMIC-v1.0-512/files/p1...,FINAL REPORT\...,FINDINGS: Single frontal view of the chest pro...,IMPRESSION: No acute intrathoracic process.,IMPRESSION: No acute intrathoracic process.,0.0,0.0,0.0,0.0,0.0,...,0.005905,-0.016116,-0.010489,-0.0429,-0.005026,0.004733,-0.010128,-0.029413,-0.030221,-0.01709


In [25]:
# drop columns subject_id and study_id
df_result = df_result.drop(columns=['subject_id', 'study_id'])
df_result.head(2)

KeyError: "['subject_id', 'study_id'] not found in axis"

## Create Train/Val/Test 

In [26]:
from sklearn.model_selection import train_test_split

train_df, temp_df = train_test_split(df_result, test_size=0.4, stratify=df_result['Pneumonia'], random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.9, stratify=temp_df['Pneumonia'], random_state=42)

# Print the number of samples in each set (optional)
print(f"Training set: {len(train_df)} samples, -->  {len(train_df)/len(df_result)*100:.2f}%")
print(f"Validation set: {len(val_df)} samples -->  {len(val_df)/len(df_result)*100:.2f}%")
print(f"Test set: {len(test_df)} samples -->  {len(test_df)/len(df_result)*100:.2f}%")

Training set: 88301 samples, -->  60.00%
Validation set: 5886 samples -->  4.00%
Test set: 52982 samples -->  36.00%


In [27]:
# value count normalized
print(train_df.Pneumonia.value_counts(normalize=True))
print(val_df.Pneumonia.value_counts(normalize=True))
print(test_df.Pneumonia.value_counts(normalize=True))

Pneumonia
0.0    0.92273
1.0    0.07727
Name: proportion, dtype: float64
Pneumonia
0.0    0.922698
1.0    0.077302
Name: proportion, dtype: float64
Pneumonia
0.0    0.922747
1.0    0.077253
Name: proportion, dtype: float64


In [28]:
# save the dataframes
train_df.to_csv("/mnt/data2/datasets_lfay/MedImageInsights/data/MIMIC-v1.0-512/train.csv", index=False)
val_df.to_csv("/mnt/data2/datasets_lfay/MedImageInsights/data/MIMIC-v1.0-512/val.csv", index=False)
test_df.to_csv("/mnt/data2/datasets_lfay/MedImageInsights/data/MIMIC-v1.0-512/test.csv", index=False)

In [None]:
df = pd.read_csv("/mnt/data2/datasets_lfay/MedImageInsights/data/MIMIC-v1.0-512/train.csv")

In [None]:
img = '/MedImageInsights/data/MIMIC-v1.0-512/files/p13/p13042648/s58779246/24f6c834-7e14cae0-66293a0a-6c8044c9-4a87967d.jpg'

In [None]:
# get row where df.Path == img

df[df['Path'] == img]

Unnamed: 0,Path,subject_id,study_id,report,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
44322,/MedImageInsights/data/MIMIC-v1.0-512/files/p1...,13042648,58779246,FINAL REPORT\...,0.0,0.0,1.0,0.0,0.0,0.0,...,0.011989,-0.018302,-0.015856,-0.07872,-0.00882,-0.012025,0.031064,0.01205,-0.02878,0.007968


# Create TEST set with expert labels

In [None]:
df_labels = pd.read_csv('/mnt/data2/datasets_lfay/MedImageInsights/data/MIMIC-v1.0-512/mimic-cxr-2.0.0-chexpert.csv')
df_labels_expert = pd.read_csv('/mnt/data2/datasets_lfay/MedImageInsights/data/MIMIC-v1.0-512/mimic-cxr-2.1.0-test-set-labeled.csv')
print(df_labels.shape)
print(df_labels_expert.shape)

(227827, 16)
(687, 15)


In [None]:
# add subject_id to df_labels_expert from df_labels
df_labels_expert = pd.merge(
    df_labels_expert,
    df_labels[['study_id', 'subject_id']],
    on='study_id',
    how='left'
)


In [None]:
df_meta = pd.read_csv("/mnt/data2/datasets_lfay/MedImageInsights/data/MIMIC-v1.0-512/mimic-cxr-2.0.0-metadata.csv")

# drop row if ViewPosition is not AP
df_meta = df_meta[df_meta['ViewPosition'] == 'AP']
df_meta.shape

(147173, 12)

In [None]:
# Merge based on study_id and subject_id only keep rows that are in df_meta
df_merged = pd.merge(
    df_labels_expert, 
    df_meta[['study_id', 'subject_id', 'dicom_id']],  # Only keep relevant columns for merging
    on=['study_id', 'subject_id'], 
    how='left'
)

df_merged.shape

(735, 17)

In [None]:
# replace nan and -1 with 0
df_merged = df_merged.fillna(0)
df_merged = df_merged.replace(-1, 0)
df_merged.shape


(735, 17)

In [None]:
df_merged = df_merged[~df_merged.duplicated(subset=['study_id'], keep=False)]
df_merged.shape

(639, 17)

In [None]:
df_merged.Pneumonia.value_counts(normalize=True)

Pneumonia
0.0    0.917058
1.0    0.082942
Name: proportion, dtype: float64

In [None]:
df_embeddings = pd.read_csv("/mnt/data2/datasets_lfay/MedImageInsights/data/MIMIC-v1.0-512/all_embeddings.csv")
# rename Unnamed:0 to Path
df_embeddings = df_embeddings.rename(columns={"Unnamed: 0": "Path"})
df_embeddings.head(2)

Unnamed: 0,Path,0,1,2,3,4,5,6,7,8,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,/mnt/data2/datasets_lfay/MedImageInsights/data...,0.008716,0.023156,-0.022518,0.014674,-0.012687,0.023082,-0.019689,-0.020924,0.007848,...,-0.008098,-0.006153,-0.072708,-0.060997,0.04044,-0.012472,-0.003973,0.007015,-0.035673,0.005564
1,/mnt/data2/datasets_lfay/MedImageInsights/data...,-3.4e-05,0.013148,-0.014591,0.017788,-0.012439,-0.011676,-0.006333,-0.030729,0.025696,...,0.043519,0.011985,-0.019353,-0.046151,-0.000618,-0.032512,-0.003576,0.017097,-0.038961,0.035584


In [None]:
# Extract subject_id and study_id
df_embeddings['subject_id'] = df_embeddings['Path'].str.extract(r'/p\d+/p(\d+)/s')
df_embeddings['study_id'] = df_embeddings['Path'].str.extract(r'/s(\d+)/')
df_embeddings['dicom_id'] = df_embeddings['Path'].str.extract(r'/([^/]+).jpg')

# Convert to integers
df_embeddings['subject_id'] = df_embeddings['subject_id'].astype(int)
df_embeddings['study_id'] = df_embeddings['study_id'].astype(int)

df_embeddings.shape

(294257, 1028)

In [None]:
# Reorder columns to place 'study_id' and 'subject_id' after 'Path'
columns = list(df_embeddings.columns)
columns.remove('study_id')
columns.remove('subject_id')
columns.remove('dicom_id')
columns.insert(columns.index('Path') + 1, 'study_id')
columns.insert(columns.index('Path') + 2, 'subject_id')
columns.insert(columns.index('Path') + 3, 'dicom_id')

df_embeddings = df_embeddings[columns]
df_embeddings.shape
# Perform the merge
df_result = pd.merge(
    df_merged, 
    df_embeddings, 
    on=['study_id', 'subject_id', 'dicom_id'],  # Merge on these common columns
    how='left'  # Keep only rows from df_merge
)

print("Merge complete. Resulting dataframe:")
print(df_result.shape)
df_result.head(2)

# remove duplicates
df_result = df_result.drop_duplicates(subset=['Path'])
df_result.shape


Merge complete. Resulting dataframe:
(639, 1042)


(381, 1042)

In [None]:
df_result.Pneumonia.value_counts()

Pneumonia
0.0    358
1.0     23
Name: count, dtype: int64

In [None]:
df_result["Path"][0]

'/mnt/data2/datasets_lfay/MedImageInsights/data/MIMIC-v1.0-512/files/p18/p18874374/s58085167/4d9de708-0d2bc7fe-d09123f4-cddd314f-81bceaad.jpg'

In [None]:
# Reorder columns to make 'Path' the first column
columns = ['Path'] + [col for col in df_result.columns if col != 'Path']
df_result = df_result[columns]
df_result.head(2)

Unnamed: 0,Path,study_id,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,Lung Lesion,Airspace Opacity,Edema,Consolidation,Pneumonia,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,/mnt/data2/datasets_lfay/MedImageInsights/data...,58085167,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.004936,-0.008383,-0.030736,-0.052354,0.058367,-0.021667,-0.016113,-0.014594,-0.050418,0.015657
1,/mnt/data2/datasets_lfay/MedImageInsights/data...,57798090,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,-0.006477,-0.021117,-0.028051,-0.03983,0.035866,-0.006744,-0.035062,-0.031807,-0.028329,0.041227


In [None]:
df_result.to_csv("/mnt/data2/datasets_lfay/MedImageInsights/data/MIMIC-v1.0-512/test_expert.csv", index=False)

# Read In

In [3]:
import pandas as pd
train_df = pd.read_csv("/mnt/data2/datasets_lfay/MedImageInsights/data/MIMIC-v1.0-512/train.csv")
val_df = pd.read_csv("/mnt/data2/datasets_lfay/MedImageInsights/data/MIMIC-v1.0-512/val.csv")
test_df = pd.read_csv("/mnt/data2/datasets_lfay/MedImageInsights/data/MIMIC-v1.0-512/test.csv")

# remove /MedImageInsights/data from path
train_df['Path'] = train_df['Path'].str.replace('/MedImageInsights/data', '')
val_df['Path'] = val_df['Path'].str.replace('/MedImageInsights/data', '')
test_df['Path'] = test_df['Path'].str.replace('/MedImageInsights/data', '')

train_df.to_csv("/mnt/data2/datasets_lfay/MedImageInsights/data/MIMIC-v1.0-512/train.csv", index=False)
val_df.to_csv("/mnt/data2/datasets_lfay/MedImageInsights/data/MIMIC-v1.0-512/val.csv", index=False)
test_df.to_csv("/mnt/data2/datasets_lfay/MedImageInsights/data/MIMIC-v1.0-512/test.csv", index=False)


In [4]:
train_df

Unnamed: 0,Path,report,section_findings,section_impression,section_impressions,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,/MIMIC-v1.0-512/files/p18/p18163289/s57380856/...,FINAL REPORT\...,FINDINGS: Single AP view of the chest provided...,IMPRESSION: 1. An ET tube appears to be at the...,IMPRESSION: 1. An ET tube appears to be at the...,1.0,0.0,0.0,1.0,0.0,...,0.029090,-0.011777,-0.029023,-0.074285,-0.001163,-0.037435,-0.012301,0.020248,-0.020471,0.008760
1,/MIMIC-v1.0-512/files/p10/p10455613/s53917954/...,FINAL REPORT\...,,IMPRESSION: As compared to the previous radiog...,IMPRESSION: As compared to the previous radiog...,0.0,1.0,0.0,0.0,0.0,...,0.015124,0.004441,-0.006895,-0.040284,0.016083,-0.037795,-0.028646,-0.023374,-0.026763,0.031974
2,/MIMIC-v1.0-512/files/p15/p15383083/s59807648/...,FINAL REPORT\...,,IMPRESSION: Sequential images show advancement...,IMPRESSION: Sequential images show advancement...,0.0,0.0,1.0,0.0,0.0,...,0.037863,0.009386,0.004832,-0.013751,0.005484,-0.004412,-0.029964,-0.022406,-0.010008,0.014529
3,/MIMIC-v1.0-512/files/p16/p16625317/s52607953/...,FINAL REPORT\...,FINDINGS: The heart is enlarged. The hilar and...,IMPRESSION: 1. Mild pulmonary edema. 2. Blunti...,IMPRESSION: 1. Mild pulmonary edema. 2. Blunti...,0.0,0.0,0.0,1.0,0.0,...,0.008044,0.008688,-0.026819,-0.046973,0.019281,-0.009734,-0.018037,-0.027236,-0.036859,0.040492
4,/MIMIC-v1.0-512/files/p13/p13922124/s58117141/...,FINAL REPORT\...,FINDINGS: A right PIC line terminates in the l...,IMPRESSION: Dobhoff tube coils in the stomach.,IMPRESSION: Dobhoff tube coils in the stomach.,0.0,0.0,0.0,0.0,0.0,...,0.031592,-0.010245,0.010824,-0.019900,0.012419,-0.014236,-0.039484,0.001872,-0.004139,0.021636
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88296,/MIMIC-v1.0-512/files/p15/p15544660/s52430684/...,FINAL REPORT\...,,,,0.0,1.0,1.0,0.0,0.0,...,0.010052,-0.003003,-0.032199,-0.048011,0.002558,0.019582,-0.000823,0.019632,-0.044928,-0.003979
88297,/MIMIC-v1.0-512/files/p17/p17699811/s50961383/...,FINAL REPORT\...,FINDINGS: AP and lateral chest radiographs wer...,IMPRESSION: No acute cardiopulmonary process.,IMPRESSION: No acute cardiopulmonary process.,0.0,0.0,0.0,0.0,0.0,...,0.029585,0.028872,-0.007239,-0.027074,-0.004145,-0.001319,-0.016202,-0.030308,-0.024119,0.032040
88298,/MIMIC-v1.0-512/files/p14/p14367016/s55563599/...,FINAL REPORT\...,FINDINGS: Inspiratory volumes are slightly low...,IMPRESSION: 1. Patchy opacity at the left base...,IMPRESSION: 1. Patchy opacity at the left base...,0.0,1.0,0.0,0.0,0.0,...,0.008715,-0.020348,-0.020625,-0.042130,0.028435,-0.037658,-0.044479,-0.045562,-0.028675,0.006779
88299,/MIMIC-v1.0-512/files/p10/p10550799/s54894693/...,FINAL REPORT\...,FINDINGS: Tip of endotracheal tube terminates ...,,,0.0,0.0,0.0,0.0,0.0,...,0.042242,0.010866,-0.037455,-0.049514,-0.021316,-0.022871,0.033111,0.057285,-0.039146,-0.001051
