## Import packages

In [18]:
from pathlib import Path
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from skimage.io import imread

In [19]:
import skimage

# Report the versions of the libraries this notebook relies on, one
# `name==version` line each, so the environment can be recreated.
version_lines = [
    f'torch=={torch.__version__}',
    f'skimage=={skimage.__version__}',
    f'pandas=={pd.__version__}',
]
print('\n'.join(version_lines))

torch==1.12.1+cu102
skimage==0.19.3
pandas==1.4.4


## Necessary paths

In [28]:
# All locations are derived from the project root, taken to be the parent of
# the current working directory.
project_path = Path.cwd().parent
processed_dir = project_path / 'data' / 'processed'
mri_png_dir = processed_dir / 'mri'          # MRI .png images, one sub-directory per split
csv_dir = processed_dir / 'csv'              # input and output csv files
modalities_csv = csv_dir / 'modalities.csv'  # per-patient modality flags

print(f'project_path = {project_path}')
print(f'train, val and test MRI images are in {mri_png_dir}')
print(f'needed csv files are in {csv_dir}')

project_path = /home/dk/Desktop/projects/multimodal-healthcare/mri
train, val and test MRI images are in /home/dk/Desktop/projects/multimodal-healthcare/mri/data/processed/mri
needed csv files are in /home/dk/Desktop/projects/multimodal-healthcare/mri/data/processed/csv


## Keep patients with the MRI modality

In [46]:
# Only the patient id and the MRI flag are needed from the modalities table;
# the flag is cast to an integer so it can be compared with 1 below.
modalities_df = (
    pd.read_csv(modalities_csv)[['id', 'mri']]
    .astype({'mri': int})
)
# Rows of the patients that have the MRI modality.
mri_modality = modalities_df.loc[modalities_df['mri'] == 1]
mri_modality

Unnamed: 0,id,mri
4,a58de3fd-f026-902b-55c1-872dc042e0c5,1
8,75425ac1-ff8b-76eb-5b4b-633b619c87c3,1
13,af7d5693-35b4-2ea1-fba8-fb43ca2551ed,1
15,aead835e-66f2-d3b9-099c-c33844a70748,1
20,60530d29-0468-98b7-9bbd-4b3d001f9f67,1
...,...,...
1146,75257665-718a-053f-bd68-038a3f50e374,1
1147,47a14779-8178-a001-ca12-2d2c96fb9f49,1
1149,7dcecf55-4ddc-8abe-0214-fe4f1d0a69e1,1
1151,c12614ed-2db3-cb59-33c6-be45d445192e,1


In [49]:
# Plain Python list of the ids of all patients flagged with the MRI modality;
# used below to filter each split with `isin`.
mri_ids = mri_modality['id'].tolist()
mri_ids

['a58de3fd-f026-902b-55c1-872dc042e0c5',
 '75425ac1-ff8b-76eb-5b4b-633b619c87c3',
 'af7d5693-35b4-2ea1-fba8-fb43ca2551ed',
 'aead835e-66f2-d3b9-099c-c33844a70748',
 '60530d29-0468-98b7-9bbd-4b3d001f9f67',
 'c679fa5a-8269-3842-70ff-947e832afe2c',
 'fec6d99f-1cfd-f397-e740-e3952410ea2a',
 '1e224d43-3b3b-a903-585d-d10e572b9468',
 '4f112535-f726-3d10-153d-2f658331bb93',
 '76061e2b-4d4c-4115-f7fb-0a9de1582962',
 'e8116fd7-c1a4-ddca-c425-6c63cb9ab2ec',
 '869b6878-2252-18df-0b86-83cf1ad6b915',
 'e7ed3dea-2bc1-ca48-2e6b-ec6236c3a0b0',
 'a11f80dc-e770-e8d3-a08a-6fd36449dbb2',
 '752d1134-a8fb-47e8-b656-0224d4892c8a',
 '57d5b77b-cdb9-fcd5-9a9c-ecacbde4c2d9',
 '7614e7b9-3f0a-7522-02e6-1be4e535094b',
 '99bc67a4-f0e3-23de-06c9-b8d8e49af77d',
 '963b8d68-0855-8b37-45e9-b22286ece7d7',
 '083e1df1-21cc-3885-48c7-56aa760e28ec',
 'f774cb31-e225-d925-87a0-cf71c5d4c42a',
 'c9f4ee85-7e93-e0d0-3342-d5c794e825ff',
 '745a318e-857e-da43-e673-8ce73f417c28',
 'f1792e8b-3270-ff2d-7ded-18aa6006bd89',
 'f75f3740-09ad-

## Process output conditions

In [21]:
# Load the train-split patient table and keep only the patient id together
# with the five outcome columns.
cols = ['patient', 'heart_failure', 'coronary_heart', 'myocardial_infarction', 'stroke', 'cardiac_arrest']
train_df = pd.read_csv(csv_dir / 'train_patients.csv').loc[:, cols]
train_df

Unnamed: 0,patient,heart_failure,coronary_heart,myocardial_infarction,stroke,cardiac_arrest
0,c2a58661-9352-1929-fc29-c51fad4c6950,True,False,False,False,False
1,adf80417-f70b-2fe8-a2da-0fdcf70b2509,False,True,False,True,False
2,6cb86432-5f9e-6f74-2289-a8295e84c64e,False,True,False,True,False
3,b8e7ebe2-ef2c-5562-7390-e216389cdba8,False,True,False,True,False
4,fbb75ebb-8b10-2a28-5634-3b8f4da7442b,True,True,False,False,False
...,...,...,...,...,...,...
761,9996a530-4e4a-3e49-ff16-d52837fd3b49,True,False,False,True,False
762,e7a9a523-6df7-6fcb-9496-4380f13c9b71,False,False,False,False,True
763,79ff8a86-85fb-6ff3-1c2e-6b7a2d04d240,False,False,False,False,True
764,13ff227e-4038-4634-2eb2-e64dc710c9b3,False,False,False,True,False


Convert `False` and `True` to `0` and `1`

In [22]:
# Names of the outcome columns that together form the prediction target.
labels = ['heart_failure', 'coronary_heart', 'myocardial_infarction', 'stroke', 'cardiac_arrest']
# Cast every outcome column to integers (False -> 0, True -> 1); the
# `patient` column is left untouched.
train_df = train_df.astype({column: int for column in labels})
train_df

Unnamed: 0,patient,heart_failure,coronary_heart,myocardial_infarction,stroke,cardiac_arrest
0,c2a58661-9352-1929-fc29-c51fad4c6950,1,0,0,0,0
1,adf80417-f70b-2fe8-a2da-0fdcf70b2509,0,1,0,1,0
2,6cb86432-5f9e-6f74-2289-a8295e84c64e,0,1,0,1,0
3,b8e7ebe2-ef2c-5562-7390-e216389cdba8,0,1,0,1,0
4,fbb75ebb-8b10-2a28-5634-3b8f4da7442b,1,1,0,0,0
...,...,...,...,...,...,...
761,9996a530-4e4a-3e49-ff16-d52837fd3b49,1,0,0,1,0
762,e7a9a523-6df7-6fcb-9496-4380f13c9b71,0,0,0,0,1
763,79ff8a86-85fb-6ff3-1c2e-6b7a2d04d240,0,0,0,0,1
764,13ff227e-4038-4634-2eb2-e64dc710c9b3,0,0,0,1,0


Check if any values are `NaN`

In [23]:
# True if at least one cell of the train table is missing.
check_nan = train_df.isna().to_numpy().any()
print(check_nan)

False


In [24]:
# Pack the outcome columns of each patient into a single list-valued `label`
# column, in the order given by `labels`.
label_vectors = train_df[labels].to_numpy().tolist()
train_df['label'] = label_vectors
train_df

Unnamed: 0,patient,heart_failure,coronary_heart,myocardial_infarction,stroke,cardiac_arrest,label
0,c2a58661-9352-1929-fc29-c51fad4c6950,1,0,0,0,0,"[1, 0, 0, 0, 0]"
1,adf80417-f70b-2fe8-a2da-0fdcf70b2509,0,1,0,1,0,"[0, 1, 0, 1, 0]"
2,6cb86432-5f9e-6f74-2289-a8295e84c64e,0,1,0,1,0,"[0, 1, 0, 1, 0]"
3,b8e7ebe2-ef2c-5562-7390-e216389cdba8,0,1,0,1,0,"[0, 1, 0, 1, 0]"
4,fbb75ebb-8b10-2a28-5634-3b8f4da7442b,1,1,0,0,0,"[1, 1, 0, 0, 0]"
...,...,...,...,...,...,...,...
761,9996a530-4e4a-3e49-ff16-d52837fd3b49,1,0,0,1,0,"[1, 0, 0, 1, 0]"
762,e7a9a523-6df7-6fcb-9496-4380f13c9b71,0,0,0,0,1,"[0, 0, 0, 0, 1]"
763,79ff8a86-85fb-6ff3-1c2e-6b7a2d04d240,0,0,0,0,1,"[0, 0, 0, 0, 1]"
764,13ff227e-4038-4634-2eb2-e64dc710c9b3,0,0,0,1,0,"[0, 0, 0, 1, 0]"


Only keep the `patient` and the `label` columns

In [25]:
# Drop the individual outcome columns now that they are packed into `label`.
train_patient_label_df = train_df.loc[:, ['patient', 'label']]
train_patient_label_df

Unnamed: 0,patient,label
0,c2a58661-9352-1929-fc29-c51fad4c6950,"[1, 0, 0, 0, 0]"
1,adf80417-f70b-2fe8-a2da-0fdcf70b2509,"[0, 1, 0, 1, 0]"
2,6cb86432-5f9e-6f74-2289-a8295e84c64e,"[0, 1, 0, 1, 0]"
3,b8e7ebe2-ef2c-5562-7390-e216389cdba8,"[0, 1, 0, 1, 0]"
4,fbb75ebb-8b10-2a28-5634-3b8f4da7442b,"[1, 1, 0, 0, 0]"
...,...,...
761,9996a530-4e4a-3e49-ff16-d52837fd3b49,"[1, 0, 0, 1, 0]"
762,e7a9a523-6df7-6fcb-9496-4380f13c9b71,"[0, 0, 0, 0, 1]"
763,79ff8a86-85fb-6ff3-1c2e-6b7a2d04d240,"[0, 0, 0, 0, 1]"
764,13ff227e-4038-4634-2eb2-e64dc710c9b3,"[0, 0, 0, 1, 0]"


Only keep patients with MRI modality

In [54]:
# Boolean mask: True for train patients whose id appears in `mri_ids`.
train_has_mri = train_patient_label_df['patient'].isin(mri_ids)
train_mri_patients = train_patient_label_df.loc[train_has_mri]
train_mri_patients

Unnamed: 0,patient,label
2,6cb86432-5f9e-6f74-2289-a8295e84c64e,"[0, 1, 0, 1, 0]"
4,fbb75ebb-8b10-2a28-5634-3b8f4da7442b,"[1, 1, 0, 0, 0]"
9,d76a5009-afb2-63eb-4161-006a2c9332d2,"[0, 0, 0, 0, 1]"
10,967d5226-f8c4-60a8-b882-6ef803af88a6,"[0, 0, 0, 1, 0]"
12,a2710800-5a46-ae6b-0e28-f3706d55c841,"[0, 0, 0, 0, 0]"
...,...,...
717,e7ed3dea-2bc1-ca48-2e6b-ec6236c3a0b0,"[0, 0, 0, 0, 0]"
721,30fd2d55-8c78-bbc7-67a1-2d2ead91606e,"[0, 0, 0, 0, 0]"
724,869b6878-2252-18df-0b86-83cf1ad6b915,"[0, 0, 0, 0, 0]"
740,af7d5693-35b4-2ea1-fba8-fb43ca2551ed,"[0, 0, 0, 0, 0]"


In [57]:
# Persist the MRI-only train patients next to the other csv files. The
# DataFrame index is written as well (`to_csv` default).
train_mri_patients_csv = csv_dir / 'train_mri_patients.csv'
train_mri_patients.to_csv(train_mri_patients_csv)

## Get the paths of the MRI `.png` files and assign them the right output label

Loop through the `train_patient_label_df` dataframe. For each patient, find the paths to their MRI `.png` images by patient id.

In [26]:
# Directory that holds the MRI images of the train split.
train_mri_dir = mri_png_dir / 'train'
# Materialise the listing immediately and in sorted order:
# - `Path.glob` returns a one-shot generator, which would be silently empty
#   if it were iterated a second time;
# - the order in which glob yields entries depends on the filesystem, so
#   sorting makes the row order of the table built from it reproducible.
train_png_paths = sorted(train_mri_dir.glob('*'))

In [27]:
# Make sure the listing is a concrete list, then show how many files it holds.
train_png_paths = [*train_png_paths]
len(train_png_paths)

20470

Reminder from `data_process.ipynb`: only 178 patients in the train split, 62 patients in the val split and 57 patients in the test split have MRI brain scans. For each patient with an MRI scan we chose 115 MRI slices, so each such patient has 115 MRI `.png` images; e.g. for the train split, 20470 / 178 = 115.

In [10]:
def find_mri_png_paths(patient_id: str, mri_png_paths: list) -> list[Path]:
    """
    Select the MRI .png paths that belong to one patient.

    A path is considered to belong to the patient when `patient_id` occurs
    as a substring of the file name (the last path component); parent
    directories are not inspected.

    Args:
        patient_id: id of the patient to look up.
        mri_png_paths: candidate paths (objects exposing a `.name` attribute).

    Returns:
        The matching paths, in the order they appear in `mri_png_paths`;
        an empty list when nothing matches.
    """
    return [candidate for candidate in mri_png_paths if patient_id in candidate.name]

In [21]:
# One (png path, label) pair per MRI image: every image found for a patient
# receives that patient's label. Patients without any matching image
# contribute no pairs.
train_png_mri_label = [
    (png_path, label)
    for patient_id, label in zip(train_patient_label_df['patient'], train_patient_label_df['label'])
    for png_path in find_mri_png_paths(patient_id, train_png_paths)
]

In [28]:
# Turn the (path, label) pairs into a two-column table and save it as csv.
train_mri_df = pd.DataFrame(
    train_png_mri_label,
    columns=['mri_png_path', 'label'],
)
train_mri_csv = csv_dir / 'train_mri.csv'
train_mri_df.to_csv(train_mri_csv)

## Do the same process above for the val split

In [7]:
# Same preparation as for the train split, applied to the validation
# patients: keep the id and outcome columns, cast the outcomes to integers
# and pack them into a list-valued `label` column.
labels = ['heart_failure', 'coronary_heart', 'myocardial_infarction', 'stroke', 'cardiac_arrest']
cols = ['patient', *labels]
val_df = pd.read_csv(csv_dir / 'valid_patients.csv').loc[:, cols]
val_df = val_df.astype({column: int for column in labels})
val_df['label'] = val_df[labels].to_numpy().tolist()
val_df

Unnamed: 0,patient,heart_failure,coronary_heart,myocardial_infarction,stroke,cardiac_arrest,label
0,94938ad3-7ba9-dc6e-0948-714bc1f29546,0,0,0,1,0,"[0, 0, 0, 1, 0]"
1,8d3a78f7-944e-2a23-f358-a5ce6e1be86e,1,0,0,0,0,"[1, 0, 0, 0, 0]"
2,7b68ef38-85c5-3f4f-ae44-e5bb70b52962,1,0,0,0,0,"[1, 0, 0, 0, 0]"
3,d8b40876-f928-6f5e-a646-35e0a3fcafdf,0,1,0,1,0,"[0, 1, 0, 1, 0]"
4,df30e63f-374a-277a-83e7-37458e8d5261,1,0,0,0,0,"[1, 0, 0, 0, 0]"
...,...,...,...,...,...,...,...
251,5a676f76-34bb-39a9-87ed-e03e5137b098,0,1,1,0,0,"[0, 1, 1, 0, 0]"
252,2a8f230a-281c-8863-9271-a0f11c37a3eb,0,1,1,0,0,"[0, 1, 1, 0, 0]"
253,33a35db9-44ca-910d-863c-30a025d7ea1f,1,0,0,0,0,"[1, 0, 0, 0, 0]"
254,6e74ab3f-cc95-6122-61f5-870c538ee25d,0,1,0,1,0,"[0, 1, 0, 1, 0]"


In [8]:
# Keep only the patient id and the packed label vector.
val_patient_label_df = val_df.loc[:, ['patient', 'label']]
val_patient_label_df

Unnamed: 0,patient,label
0,94938ad3-7ba9-dc6e-0948-714bc1f29546,"[0, 0, 0, 1, 0]"
1,8d3a78f7-944e-2a23-f358-a5ce6e1be86e,"[1, 0, 0, 0, 0]"
2,7b68ef38-85c5-3f4f-ae44-e5bb70b52962,"[1, 0, 0, 0, 0]"
3,d8b40876-f928-6f5e-a646-35e0a3fcafdf,"[0, 1, 0, 1, 0]"
4,df30e63f-374a-277a-83e7-37458e8d5261,"[1, 0, 0, 0, 0]"
...,...,...
251,5a676f76-34bb-39a9-87ed-e03e5137b098,"[0, 1, 1, 0, 0]"
252,2a8f230a-281c-8863-9271-a0f11c37a3eb,"[0, 1, 1, 0, 0]"
253,33a35db9-44ca-910d-863c-30a025d7ea1f,"[1, 0, 0, 0, 0]"
254,6e74ab3f-cc95-6122-61f5-870c538ee25d,"[0, 1, 0, 1, 0]"


In [59]:
# Restrict the validation patients to those whose id appears in `mri_ids`
# and save the result as csv.
val_has_mri = val_patient_label_df['patient'].isin(mri_ids)
val_mri_patients = val_patient_label_df.loc[val_has_mri]
val_mri_patients.to_csv(csv_dir / 'val_mri_patients.csv')

In [12]:
# Directory that holds the MRI images of the validation split.
val_mri_dir = mri_png_dir / 'val'
# Sorted listing: the order in which glob yields entries depends on the
# filesystem, so sorting keeps the row order of the resulting table
# reproducible across runs and machines.
val_png_paths = sorted(val_mri_dir.glob('*'))
print(len(val_png_paths))

# One (png path, label) pair per MRI image: every image found for a patient
# receives that patient's label. Patients without any matching image
# contribute no pairs.
val_png_mri_label = [
    (png_path, label)
    for patient_id, label in zip(val_patient_label_df['patient'], val_patient_label_df['label'])
    for png_path in find_mri_png_paths(patient_id, val_png_paths)
]

7130


In [13]:
# Turn the (path, label) pairs into a two-column table and save it as csv.
val_mri_df = pd.DataFrame(
    val_png_mri_label,
    columns=['mri_png_path', 'label'],
)
val_mri_csv = csv_dir / 'val_mri.csv'
val_mri_df.to_csv(val_mri_csv)

## For the test split

In [15]:
# Same preparation as for the other splits, applied to the test patients:
# keep the id and outcome columns, cast the outcomes to integers and pack
# them into a list-valued `label` column.
labels = ['heart_failure', 'coronary_heart', 'myocardial_infarction', 'stroke', 'cardiac_arrest']
cols = ['patient', *labels]
test_df = pd.read_csv(csv_dir / 'test_patients.csv').loc[:, cols]
test_df = test_df.astype({column: int for column in labels})
test_df['label'] = test_df[labels].to_numpy().tolist()
# Keep only the patient id and the packed label vector.
test_patient_label_df = test_df.loc[:, ['patient', 'label']]
test_patient_label_df

Unnamed: 0,patient,label
0,1c3f1813-3502-df08-73f1-29806fc52373,"[0, 0, 0, 1, 0]"
1,5bb4f528-2e34-e116-9a6c-e142d774b18f,"[0, 0, 0, 1, 1]"
2,6aaf5ffe-502a-2c60-9eab-80c90f705926,"[0, 1, 0, 1, 0]"
3,bf40d134-5782-7dc1-9c9c-6ce11242e7af,"[0, 0, 0, 0, 0]"
4,39cdd1d4-37fc-8ca4-9cb6-e827bab46da4,"[0, 0, 0, 0, 1]"
...,...,...
251,523f4150-951a-633c-acb8-a6c9cc311d07,"[0, 0, 0, 0, 0]"
252,850c66af-7703-a9b0-1504-9dad84261b19,"[0, 0, 0, 1, 0]"
253,754aaa43-09df-dab5-3cf3-c8e021cb696e,"[0, 0, 0, 1, 0]"
254,8db378e0-9e07-8426-fa88-b431378f5fb9,"[0, 0, 0, 0, 0]"


In [60]:
# Restrict the test patients to those whose id appears in `mri_ids` and save
# the result as csv.
test_has_mri = test_patient_label_df['patient'].isin(mri_ids)
test_mri_patients = test_patient_label_df.loc[test_has_mri]
test_mri_patients.to_csv(csv_dir / 'test_mri_patients.csv')

In [16]:
# Directory that holds the MRI images of the test split.
test_mri_dir = mri_png_dir / 'test'
# Sorted listing: the order in which glob yields entries depends on the
# filesystem, so sorting keeps the row order of the resulting table
# reproducible across runs and machines.
test_png_paths = sorted(test_mri_dir.glob('*'))
print(len(test_png_paths))

# One (png path, label) pair per MRI image: every image found for a patient
# receives that patient's label. Patients without any matching image
# contribute no pairs.
test_png_mri_label = [
    (png_path, label)
    for patient_id, label in zip(test_patient_label_df['patient'], test_patient_label_df['label'])
    for png_path in find_mri_png_paths(patient_id, test_png_paths)
]

6555


In [17]:
# Turn the (path, label) pairs into a two-column table and save it as csv.
test_mri_df = pd.DataFrame(
    test_png_mri_label,
    columns=['mri_png_path', 'label'],
)
test_mri_csv = csv_dir / 'test_mri.csv'
test_mri_df.to_csv(test_mri_csv)