## Import packages

In [1]:
from pathlib import Path

import pandas as pd
import skimage
import torch
from skimage.io import imread
from torch.utils.data import Dataset, DataLoader

In [2]:
# Report the versions of the key libraries so the environment can be reproduced.
# `skimage` is imported in the notebook's import cell rather than here, so that
# all dependencies are declared in one place.
for pkg_name, pkg in (('torch', torch), ('skimage', skimage), ('pandas', pd)):
    print(f'{pkg_name}=={pkg.__version__}')

torch==1.12.1+cu102
skimage==0.19.3
pandas==1.4.4


## Necessary paths

In [3]:
# Resolve the project root (the parent of the current working directory) and
# the processed-data folders that live underneath it.
project_path = Path.cwd().parent
mri_png_dir = project_path / 'data' / 'processed' / 'mri'
csv_dir = project_path / 'data' / 'processed' / 'csv'

# Echo the resolved locations so a reader can confirm them at a glance.
print(f'project_path = {project_path}')
print(f'train, val and test MRI images are in {mri_png_dir}')
print(f'needed csv files are in {csv_dir}')

project_path = /home/dk/Desktop/projects/multimodal-healthcare/mri
train, val and test MRI images are in /home/dk/Desktop/projects/multimodal-healthcare/mri/data/processed/mri
needed csv files are in /home/dk/Desktop/projects/multimodal-healthcare/mri/data/processed/csv


## Process the target conditions (output labels)

In [4]:
# Columns to retain: the patient identifier plus the five condition flags.
cols = [
    'patient',
    'heart_failure',
    'coronary_heart',
    'myocardial_infarction',
    'stroke',
    'cardiac_arrest',
]
# Load the training patients table and keep only those columns.
train_df = pd.read_csv(csv_dir / 'train_patients.csv').loc[:, cols]
train_df

Unnamed: 0,patient,heart_failure,coronary_heart,myocardial_infarction,stroke,cardiac_arrest
0,c2a58661-9352-1929-fc29-c51fad4c6950,True,False,False,False,False
1,adf80417-f70b-2fe8-a2da-0fdcf70b2509,False,True,False,True,False
2,6cb86432-5f9e-6f74-2289-a8295e84c64e,False,True,False,True,False
3,b8e7ebe2-ef2c-5562-7390-e216389cdba8,False,True,False,True,False
4,fbb75ebb-8b10-2a28-5634-3b8f4da7442b,True,True,False,False,False
...,...,...,...,...,...,...
761,9996a530-4e4a-3e49-ff16-d52837fd3b49,True,False,False,True,False
762,e7a9a523-6df7-6fcb-9496-4380f13c9b71,False,False,False,False,True
763,79ff8a86-85fb-6ff3-1c2e-6b7a2d04d240,False,False,False,False,True
764,13ff227e-4038-4634-2eb2-e64dc710c9b3,False,False,False,True,False


Convert the boolean values `False` and `True` to the integers `0` and `1`

In [5]:
# Names of the condition columns that together form the prediction target.
labels = [
    'heart_failure',
    'coronary_heart',
    'myocardial_infarction',
    'stroke',
    'cardiac_arrest',
]
# Cast every condition flag to an integer (False -> 0, True -> 1).
train_df = train_df.astype(dict.fromkeys(labels, int))
train_df

Unnamed: 0,patient,heart_failure,coronary_heart,myocardial_infarction,stroke,cardiac_arrest
0,c2a58661-9352-1929-fc29-c51fad4c6950,1,0,0,0,0
1,adf80417-f70b-2fe8-a2da-0fdcf70b2509,0,1,0,1,0
2,6cb86432-5f9e-6f74-2289-a8295e84c64e,0,1,0,1,0
3,b8e7ebe2-ef2c-5562-7390-e216389cdba8,0,1,0,1,0
4,fbb75ebb-8b10-2a28-5634-3b8f4da7442b,1,1,0,0,0
...,...,...,...,...,...,...
761,9996a530-4e4a-3e49-ff16-d52837fd3b49,1,0,0,1,0
762,e7a9a523-6df7-6fcb-9496-4380f13c9b71,0,0,0,0,1
763,79ff8a86-85fb-6ff3-1c2e-6b7a2d04d240,0,0,0,0,1
764,13ff227e-4038-4634-2eb2-e64dc710c9b3,0,0,0,1,0


Check if any values are `NaN`

In [6]:
# True when at least one cell of train_df is missing.
check_nan = train_df.isna().to_numpy().any()
print(check_nan)

False


In [7]:
# Gather the per-condition integer columns into a single list-valued `label`
# column, one list per patient, ordered as in `labels`.
train_df = train_df.assign(label=train_df[labels].to_numpy().tolist())
train_df

Unnamed: 0,patient,heart_failure,coronary_heart,myocardial_infarction,stroke,cardiac_arrest,label
0,c2a58661-9352-1929-fc29-c51fad4c6950,1,0,0,0,0,"[1, 0, 0, 0, 0]"
1,adf80417-f70b-2fe8-a2da-0fdcf70b2509,0,1,0,1,0,"[0, 1, 0, 1, 0]"
2,6cb86432-5f9e-6f74-2289-a8295e84c64e,0,1,0,1,0,"[0, 1, 0, 1, 0]"
3,b8e7ebe2-ef2c-5562-7390-e216389cdba8,0,1,0,1,0,"[0, 1, 0, 1, 0]"
4,fbb75ebb-8b10-2a28-5634-3b8f4da7442b,1,1,0,0,0,"[1, 1, 0, 0, 0]"
...,...,...,...,...,...,...,...
761,9996a530-4e4a-3e49-ff16-d52837fd3b49,1,0,0,1,0,"[1, 0, 0, 1, 0]"
762,e7a9a523-6df7-6fcb-9496-4380f13c9b71,0,0,0,0,1,"[0, 0, 0, 0, 1]"
763,79ff8a86-85fb-6ff3-1c2e-6b7a2d04d240,0,0,0,0,1,"[0, 0, 0, 0, 1]"
764,13ff227e-4038-4634-2eb2-e64dc710c9b3,0,0,0,1,0,"[0, 0, 0, 1, 0]"


Only keep the `patient` and the `label` columns

In [24]:
# Reduced view of the table: patient identifier and its label list only.
train_patient_label_df = train_df.loc[:, ['patient', 'label']]
train_patient_label_df

Unnamed: 0,patient,label
0,c2a58661-9352-1929-fc29-c51fad4c6950,"[1, 0, 0, 0, 0]"
1,adf80417-f70b-2fe8-a2da-0fdcf70b2509,"[0, 1, 0, 1, 0]"
2,6cb86432-5f9e-6f74-2289-a8295e84c64e,"[0, 1, 0, 1, 0]"
3,b8e7ebe2-ef2c-5562-7390-e216389cdba8,"[0, 1, 0, 1, 0]"
4,fbb75ebb-8b10-2a28-5634-3b8f4da7442b,"[1, 1, 0, 0, 0]"
...,...,...
761,9996a530-4e4a-3e49-ff16-d52837fd3b49,"[1, 0, 0, 1, 0]"
762,e7a9a523-6df7-6fcb-9496-4380f13c9b71,"[0, 0, 0, 0, 1]"
763,79ff8a86-85fb-6ff3-1c2e-6b7a2d04d240,"[0, 0, 0, 0, 1]"
764,13ff227e-4038-4634-2eb2-e64dc710c9b3,"[0, 0, 0, 1, 0]"


## Get the paths of the MRI `.png` files and match each one with its target label

In [31]:
# Directory holding the training split of the MRI images.
train_mri_dir = mri_png_dir / 'train'
# `Path.glob` returns a one-shot generator in arbitrary order. Materialise it
# as a sorted list so later cells can iterate over it more than once and
# always see the entries in the same order.
train_png_paths = sorted(train_mri_dir.glob('*'))

In [30]:
# Peek at the structure of a single row: restrict the frame to its first row
# so the loop body runs at most once.
for i, row in train_patient_label_df.head(1).iterrows():
    print(row)

patient    c2a58661-9352-1929-fc29-c51fad4c6950
label                           [1, 0, 0, 0, 0]
Name: 0, dtype: object


## Building a data loading pipeline

In [13]:
class MRIDataset(Dataset):
    """Dataset skeleton for the MRI images and their condition labels.

    NOTE(review): this class is still a work in progress. It does not yet
    define `__len__` or `__getitem__`, so it cannot be passed to a
    `DataLoader` in its current form.
    """

    def __init__(self, mri_dir, csv_dir):
        """Store the locations the dataset will read from.

        Args:
            mri_dir: location of the MRI images (presumably a directory such
                as `mri_png_dir/'train'` -- confirm once loading is written).
            csv_dir: location of the label CSV files (presumably the folder
                containing `train_patients.csv` -- confirm).
        """
        self.mri_dir = mri_dir
        self.csv_dir = csv_dir

    def create_labels(self):
        """Build the labels for this dataset.

        Raises:
            NotImplementedError: always, until the label-building logic is
                written. The original method had no body, which made the whole
                cell a SyntaxError.
        """
        raise NotImplementedError('MRIDataset.create_labels is not implemented yet')

## Train a neural net for our problem (using transfer learning)

## Test