In [15]:
import os
import sys
import time
from pathlib import Path
module_path = os.path.abspath('../src/utils')
if module_path not in sys.path:
    sys.path.append(module_path)
    
import pandas as pd
import numpy as np
import ants

from base_mri import list_available_images, delete_useless_images, set_env_variables, load_mri, save_mri, create_file_name_from_path,save_batch_mri

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Getting list of images

In [140]:
mri_input_path = "/home/lucasthim1/mmml-alzheimer-diagnosis/data/mri/preprocessed/20210328"
mri_output_path = "/home/lucasthim1/mmml-alzheimer-diagnosis/data/mri/train"

imgs_list,_,_ = list_available_images(mri_input_path,file_format=".nii.gz")
# imgs_list = [x.as_posix() for x in imgs_list]

Looking for MRI raw images in path: /home/lucasthim1/mmml-alzheimer-diagnosis/data/mri/preprocessed/20210328 

Found a total of  716  images.
Found a total of  0  mask images.
Available images to process:  716 



In [267]:
def get_image_id(path,file_format):
    return path.split('_')[-1].replace(file_format,'')

def get_image_name(path):
    return path.split('/')[-1]

def list_image_dict(image_path,file_format=".nii.gz"):
    imgs_list,_,_ = list_available_images(image_path,file_format=file_format)
    img_dict = {}
    for path in imgs_list: 
        img_dict.update({get_image_id(path,file_format):path})
    return img_dict

# Getting Reference Labels

In [281]:
def load_reference_table(path = "/home/lucasthim1/mmml-alzheimer-diagnosis/data/mri/reference/MRI_MPRAGE.csv"):
    df = pd.read_csv(path)
    df.columns = [x.replace(' ','_').upper() for x in df.columns]
    return df

def transform_reference_table(df):
    df['MACRO_GROUP'] = df['GROUP']
    df.loc[df['MACRO_GROUP'] == 'SMC','MACRO_GROUP'] = 'CN'
    df.loc[df['MACRO_GROUP'] == 'EMCI','MACRO_GROUP'] = 'MCI'
    df.loc[df['MACRO_GROUP'] == 'LMCI','MACRO_GROUP'] = 'MCI'
    return df

def enhance_reference_table(df,filter_images,file_format='.nii.gz'):
    df_filtered = df.query("IMAGE_DATA_ID in @filter_images.keys()").sort_values("IMAGE_DATA_ID")
    df_filtered['IMAGE_DATA_NAME'] = np.sort(list(filter_images.values()))
    df_filtered['LABELED_IMAGE_DATA_NAME'] = df_filtered['IMAGE_DATA_NAME'].str.replace("ADNI_","").str.replace(file_format,"") + '_' + df_filtered['MACRO_GROUP'] + file_format
    return df_filtered


In [277]:
df_mprage = load_reference_table()
df_mprage = transform_reference_table(df_mprage)
filter_images = list_image_dict("/home/lucasthim1/mmml-alzheimer-diagnosis/data/mri/preprocessed/20210320",'.npz')    
df_filtered = enhance_reference_table(df_mprage,filter_images,file_format='.npz')

Looking for MRI raw images in path: /home/lucasthim1/mmml-alzheimer-diagnosis/data/mri/preprocessed/20210320 

Found a total of  283  images.
Found a total of  0  mask images.
Available images to process:  283 



In [271]:
df_mprage.groupby("GROUP").count()

Unnamed: 0_level_0,IMAGE_DATA_ID,SUBJECT,SEX,AGE,VISIT,DESCRIPTION,TYPE,ACQ_DATE,MACRO_GROUP
GROUP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AD,2015,2015,2015,2015,2015,2015,2015,2015,2015
CN,3715,3715,3715,3715,3715,3715,3715,3715,3715
EMCI,1215,1215,1215,1215,1215,1215,1215,1215,1215
LMCI,630,630,630,630,630,630,630,630,630
MCI,5090,5090,5090,5090,5090,5090,5090,5090,5090
SMC,184,184,184,184,184,184,184,184,184


In [272]:
df_mprage.groupby("MACRO_GROUP").count()

Unnamed: 0_level_0,IMAGE_DATA_ID,SUBJECT,GROUP,SEX,AGE,VISIT,DESCRIPTION,TYPE,ACQ_DATE
MACRO_GROUP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AD,2015,2015,2015,2015,2015,2015,2015,2015,2015
CN,3899,3899,3899,3899,3899,3899,3899,3899,3899
MCI,6935,6935,6935,6935,6935,6935,6935,6935,6935


In [279]:
df_filtered.groupby("GROUP").count()

Unnamed: 0_level_0,IMAGE_DATA_ID,SUBJECT,SEX,AGE,VISIT,DESCRIPTION,TYPE,ACQ_DATE,MACRO_GROUP,IMAGE_DATA_NAME,LABELED_IMAGE_DATA_NAME
GROUP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AD,43,43,43,43,43,43,43,43,43,43,43
CN,76,76,76,76,76,76,76,76,76,76,76
EMCI,76,76,76,76,76,76,76,76,76,76,76
LMCI,34,34,34,34,34,34,34,34,34,34,34
MCI,34,34,34,34,34,34,34,34,34,34,34
SMC,20,20,20,20,20,20,20,20,20,20,20


In [280]:
df_filtered.groupby("MACRO_GROUP").count()

Unnamed: 0_level_0,IMAGE_DATA_ID,SUBJECT,GROUP,SEX,AGE,VISIT,DESCRIPTION,TYPE,ACQ_DATE,IMAGE_DATA_NAME,LABELED_IMAGE_DATA_NAME
MACRO_GROUP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AD,43,43,43,43,43,43,43,43,43,43,43
CN,96,96,96,96,96,96,96,96,96,96,96
MCI,144,144,144,144,144,144,144,144,144,144,144


# Renaming images

In [282]:
def rename_images_with_label(df):
    print('Adding label to MRI files...')
    for old_name,new_name in zip(df['IMAGE_DATA_NAME'],df['LABELED_IMAGE_DATA_NAME']):
        os.rename(old_name,new_name)

In [143]:
rename_images_with_label(df_filtered)

Adding label to MRI...


# Splitting into Train, Validation and Test

In [201]:
def train_validation_test_split(df,proportion = [0.5,0.3,0.2],classes = ['AD','CN'],class_column = ['GROUP']):

    train = []
    validation = []
    test = []
    df_classes = df[df[class_column].isin(classes)]

    for cl in classes:
        df_shuffled = df[df[class_column] == cl].sample(frac=1).reset_index(drop=True)

        df_train_cl = df_shuffled.iloc[:int(np.ceil(proportion[0] * df_shuffled.shape[0]))]
        df_validation_cl = df_shuffled.iloc[int(np.ceil(proportion[0] * df_shuffled.shape[0])):int(np.ceil((proportion[0] + proportion[1]) * df_shuffled.shape[0]))]
        df_test_cl = df_shuffled.iloc[int(np.ceil((proportion[0] + proportion[1]) * df_shuffled.shape[0])):]

        train.append(df_train_cl)
        validation.append(df_validation_cl)
        test.append(df_test_cl)

    df_train = pd.concat(train).sample(frac=1).reset_index(drop=True)
    df_validation = pd.concat(validation).sample(frac=1).reset_index(drop=True)
    df_test = pd.concat(test).sample(frac=1).reset_index(drop=True)
    return df_train,df_validation,df_test

def train_test_split(df,proportion = [0.8,0.2],classes = ['AD','CN'],class_column = ['GROUP']):

    train = []
    test = []
    df_classes = df[df[class_column].isin(classes)]
    
    for cl in classes:
        df_shuffled = df_classes[df_classes[class_column] == cl].sample(frac=1).reset_index(drop=True)

        df_train_cl = df_shuffled.iloc[:int(np.ceil(proportion[0] * df_shuffled.shape[0]))]
        df_test_cl = df_shuffled.iloc[int(np.ceil((proportion[0]) * df_shuffled.shape[0])):]

        train.append(df_train_cl)
        test.append(df_test_cl)

    df_train = pd.concat(train).sample(frac=1).reset_index(drop=True)
    df_test = pd.concat(test).sample(frac=1).reset_index(drop=True)
    return df_train,df_test

In [219]:
df_train,df_test = train_test_split(df=df,proportion = [0.7,0.2],classes = ['AD','CN'],class_column = 'GROUP')
df_train,df_validation = train_test_split(df=df_train,proportion = [0.8,0.2],classes = ['AD','CN'],class_column = 'GROUP')

df_train,df_validation,df_test = train_validation_test_split(df=df,proportion = [0.5,0.3,0.2],classes = ['AD','CN'],class_column = 'GROUP')

In [223]:
df_train

Unnamed: 0,IMAGE_DATA_ID,SUBJECT,GROUP,SEX,AGE,VISIT,DESCRIPTION,TYPE,ACQ_DATE,MACRO_GROUP,IMAGE_DATA_NAME,LABELED_IMAGE_DATA_NAME
0,I238644,041_S_4037,CN,M,76,22,MT1; GradWarp; N3m,Processed,2011-05-24,CN,/home/lucasthim1/mmml-alzheimer-diagnosis/data...,/home/lucasthim1/mmml-alzheimer-diagnosis/data...
1,I321232,011_S_4845,AD,F,68,22,MT1; GradWarp; N3m,Processed,2012-07-12,AD,/home/lucasthim1/mmml-alzheimer-diagnosis/data...,/home/lucasthim1/mmml-alzheimer-diagnosis/data...
2,I118705,010_S_0419,CN,M,71,3,MPR; ; N3; Scaled_2,Processed,2006-12-15,CN,/home/lucasthim1/mmml-alzheimer-diagnosis/data...,/home/lucasthim1/mmml-alzheimer-diagnosis/data...
3,I119129,033_S_0724,AD,M,79,2,MPR; GradWarp; N3; Scaled_2,Processed,2006-08-11,AD,/home/lucasthim1/mmml-alzheimer-diagnosis/data...,/home/lucasthim1/mmml-alzheimer-diagnosis/data...
4,I50423,062_S_0535,AD,M,77,1,MPR; GradWarp,Processed,2006-05-18,AD,/home/lucasthim1/mmml-alzheimer-diagnosis/data...,/home/lucasthim1/mmml-alzheimer-diagnosis/data...
...,...,...,...,...,...,...,...,...,...,...,...,...
145,I384090,018_S_5240,AD,F,63,22,MT1; N3m,Processed,2013-07-23,AD,/home/lucasthim1/mmml-alzheimer-diagnosis/data...,/home/lucasthim1/mmml-alzheimer-diagnosis/data...
146,I34561,099_S_0470,AD,F,87,1,MPR; GradWarp; B1 Correction; N3,Processed,2006-05-10,AD,/home/lucasthim1/mmml-alzheimer-diagnosis/data...,/home/lucasthim1/mmml-alzheimer-diagnosis/data...
147,I119024,010_S_0420,CN,M,74,1,MPR; ; N3; Scaled_2,Processed,2006-06-15,CN,/home/lucasthim1/mmml-alzheimer-diagnosis/data...,/home/lucasthim1/mmml-alzheimer-diagnosis/data...
148,I285157,010_S_4442,CN,F,74,22,MT1; N3m,Processed,2012-02-07,CN,/home/lucasthim1/mmml-alzheimer-diagnosis/data...,/home/lucasthim1/mmml-alzheimer-diagnosis/data...


Dataframes can be saved and then loaded to train the neural net. 
X = paths to load images
y = GROUP or MACRO_GROUP