In [205]:
import math
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.data import Dataset, Iterator

# OASIS Data Preprocessing

In [221]:
# NOTE(review): absolute, machine-specific path; later cells also build file paths from it.
data_dir = "/Users/tiffanysoebijantoro/Documents/DeepLearning/"
labels = pd.read_csv(data_dir + "oasis_label_2.csv")

# The last four characters of 'Label' are parsed as a day offset
# (e.g. OAS30001_ClinicalData_d0000 -> 0), then bucketed into 30- and 60-day bins.
labels['Days since MRI'] = pd.to_numeric(labels['Label'].str.slice(-4))
labels['Months since MRI'] = np.floor(labels['Days since MRI'].div(30))
labels['2 months since MRI'] = np.floor(labels['Days since MRI'].div(60))

# Keep a single row per subject within each 60-day bin
# (drop_duplicates retains the first occurrence by default).
new = labels.drop_duplicates(subset=['Subject', '2 months since MRI'])

# Order rows by subject, then by 30-day bin.
sorted_labels = new.sort_values(by=['Subject', 'Months since MRI'])

# change format OAS30001_ClinicalData_d0000 -> OAS30001_MR_d0000.nii

def rep(s):
    """Return `s` with every "_ClinicalData_" occurrence replaced by "_MR_"."""
    return s.replace("_ClinicalData_","_MR_")

# Build an MR-style file name for each clinical label: swap the tag via rep(), then add '.nii'.
# NOTE(review): the image table's file names end in '.nii.gz', and 'New Label' is not
# read by any of the visible cells — confirm it is still needed.
sorted_labels['New Label'] = sorted_labels['Label'].map(rep) + '.nii'

# add diagnosis label

def diagnose(dx):
    """Translate a clinical diagnosis string into its numeric class code.

    'Cognitively normal' -> 0, 'AD Dementia' -> 3, 'uncertain dementia' -> 5.
    Any other value (including missing values) yields None.
    """
    codes = {
        'Cognitively normal': 0,
        'uncertain dementia': 5,
        'AD Dementia': 3,
    }
    return codes.get(dx)

# Numeric diagnosis code per clinical visit; values diagnose() does not recognise become missing.
sorted_labels["Diagnosis"] = sorted_labels['dx1'].map(diagnose)

In [222]:
# Load the image table; its 'subject' and 'days' columns are matched against
# sorted_labels in the cells below.
# NOTE(review): read from the current working directory, whereas the label CSV is
# read from data_dir — confirm this is intended.
image_data = pd.read_csv("oasis_image_data_dates.csv")


def match(df, col1 = 'subject',col2 = 'days'):
    """Return the diagnosis code of the clinical visit closest in time to one scan.

    Written for ``DataFrame.apply(match, axis=1)``; reads the module-level
    ``sorted_labels`` frame.

    Parameters
    ----------
    df : one row of the image table (anything indexable by ``col1`` / ``col2``).
    col1 : key in ``df`` holding the subject identifier.
    col2 : key in ``df`` holding the scan's day offset.

    Returns
    -------
    The 'Diagnosis' value of the subject's visit whose 'Days since MRI' is
    closest to the scan day, provided that visit lies strictly within 180 days
    of the scan. Otherwise None. None is also returned when the subject has no
    rows in ``sorted_labels`` (this case used to raise from ``idxmin`` on an
    empty selection).
    """
    visits = sorted_labels[sorted_labels["Subject"] == df[col1]]
    if visits.empty:
        # No clinical record for this subject: nothing to match against.
        return None

    scan_day = df[col2]
    # Index label of the visit with the smallest absolute day difference
    # (the first one wins on ties).
    nearest = (visits["Days since MRI"] - scan_day).abs().idxmin()
    row = visits.loc[nearest]

    # Both bounds are exclusive: a visit exactly 180 days away does not count.
    # The diagnosis is returned, not the matched day.
    if scan_day - 180 < row['Days since MRI'] < scan_day + 180:
        return row['Diagnosis']
    return None

# Attach to every scan the diagnosis code of its nearest clinical visit
# (missing when no visit falls inside the window used by match()).
image_data["match_label"] = image_data.apply(match, axis=1, col1='subject', col2='days')

In [259]:
# add gender and age columns to image_data

def addcol(df, col1, col2 = 'subject'):
    """Look up a per-subject attribute from ``sorted_labels`` for one image row.

    Written for ``DataFrame.apply(addcol, axis=1, col1=...)``; reads the
    module-level ``sorted_labels`` frame.

    Parameters
    ----------
    df : one row of the image table (anything indexable by ``col2``).
    col1 : name of the ``sorted_labels`` column to read.
    col2 : key in ``df`` holding the subject identifier.

    Returns
    -------
    The ``col1`` value from the first ``sorted_labels`` row of that subject, or
    None when the subject has no rows there (this case used to raise
    IndexError from ``iloc[0]``).
    """
    subject_rows = sorted_labels[sorted_labels["Subject"] == df[col2]]
    if subject_rows.empty:
        return None
    return subject_rows.iloc[0][col1]

# Copy each subject's gender and entry age from the label table onto every scan row.
image_data["gender"] = image_data.apply(addcol, axis=1, col1="Gender", col2='subject')
image_data["age"] = image_data.apply(addcol, axis=1, col1="ageAtEntry", col2='subject')

# Min-max scale age using the extremes observed in this table.
min_age, max_age = image_data["age"].min(), image_data["age"].max()
image_data["age_norm"] = image_data["age"].sub(min_age).div(max_age - min_age)

# Relative path of each scan file under fs_t1/.
image_data["file address"] = "fs_t1/" + image_data["actual file name"]

# codify gender
def code_gender(gender):
    """Encode a gender string as an integer: 'male' -> 0, 'female' -> 1, anything else -> None."""
    return {'male': 0, 'female': 1}.get(gender)

# Integer sex code per scan (0 male, 1 female); unrecognised values become missing.
image_data["Sex"] = image_data['gender'].map(code_gender)


image_data.head()

Unnamed: 0,Count,actual file name,subject,type,file_name_day,day label,days,match_label,gender,age,age_norm,file address,Sex
0,17814,OAS30001_MR_d0129.nii.gz,OAS30001,MR,d0129.nii.gz,d0129,129,0.0,female,65.149895,0.427142,fs_t1/OAS30001_MR_d0129.nii.gz,1
1,15818,OAS30001_MR_d0757.nii.gz,OAS30001,MR,d0757.nii.gz,d0757,757,0.0,female,65.149895,0.427142,fs_t1/OAS30001_MR_d0757.nii.gz,1
2,16079,OAS30001_MR_d2430.nii.gz,OAS30001,MR,d2430.nii.gz,d2430,2430,,female,65.149895,0.427142,fs_t1/OAS30001_MR_d2430.nii.gz,1
3,17075,OAS30001_MR_d3132.nii.gz,OAS30001,MR,d3132.nii.gz,d3132,3132,0.0,female,65.149895,0.427142,fs_t1/OAS30001_MR_d3132.nii.gz,1
4,22537,OAS30002_MR_d0371.nii.gz,OAS30002,MR,d0371.nii.gz,d0371,371,,male,67.206024,0.465866,fs_t1/OAS30002_MR_d0371.nii.gz,0


In [260]:
# Partition the scans by matched diagnosis code (0 normal, 3 AD, 5 uncertain).
# Scans with a missing match_label satisfy none of the filters and are left out.
sample_normal = image_data.loc[image_data['match_label'].eq(0)]
sample_AD = image_data.loc[image_data['match_label'].eq(3)]
sample_uncertain = image_data.loc[image_data['match_label'].eq(5)]

print(len(sample_normal), len(sample_AD), len(sample_uncertain))

1281 193 97


In [261]:
# Default split fractions. split() reads the first two when called without
# explicit fractions; test_percentage is not read by split() — the test set is
# whatever remains after the train and validation slices.
train_percentage = 0.8
val_percentage = 0.1
test_percentage = 0.1

def split(sample, train_frac=None, val_frac=None):
    """Cut `sample` into consecutive train / validation / test slices.

    Parameters
    ----------
    sample : any sliceable sequence with a length (DataFrame, list, array).
    train_frac : fraction of rows for the training slice; defaults to the
        module-level ``train_percentage``.
    val_frac : fraction of rows for the validation slice; defaults to the
        module-level ``val_percentage``.

    Returns
    -------
    (train_set, val_set, test_set). Slice sizes are floored, so rounding
    leftovers end up in the test slice.

    Notes
    -----
    Rows are taken in their existing order; nothing is shuffled.
    NOTE(review): slicing is by row position, so when `sample` holds several
    rows per subject, one subject's rows can straddle two splits — confirm
    this is acceptable.
    """
    if train_frac is None:
        train_frac = train_percentage
    if val_frac is None:
        val_frac = val_percentage

    n_train = math.floor(len(sample) * train_frac)
    n_val = math.floor(len(sample) * val_frac)
    val_end = n_train + n_val

    return sample[:n_train], sample[n_train:val_end], sample[val_end:]

# Split each diagnosis group separately so every split keeps the same class mix.
train_normal, val_normal, test_normal = split(sample_normal)
train_AD, val_AD, test_AD = split(sample_AD)
train_uncertain, val_uncertain, test_uncertain = split(sample_uncertain)

# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0. Row order and index labels are the same as before.
train_data_oasis = pd.concat([train_normal, train_AD, train_uncertain])
val_data_oasis = pd.concat([val_normal, val_AD, val_uncertain])
test_data_oasis = pd.concat([test_normal, test_AD, test_uncertain])

print(len(train_data_oasis), len(val_data_oasis), len(test_data_oasis))

1255 156 160


In [262]:
# Preview the first ten rows of the OASIS training split.
train_data_oasis.head(10)

Unnamed: 0,Count,actual file name,subject,type,file_name_day,day label,days,match_label,gender,age,age_norm,file address,Sex
0,17814,OAS30001_MR_d0129.nii.gz,OAS30001,MR,d0129.nii.gz,d0129,129,0.0,female,65.149895,0.427142,fs_t1/OAS30001_MR_d0129.nii.gz,1
1,15818,OAS30001_MR_d0757.nii.gz,OAS30001,MR,d0757.nii.gz,d0757,757,0.0,female,65.149895,0.427142,fs_t1/OAS30001_MR_d0757.nii.gz,1
3,17075,OAS30001_MR_d3132.nii.gz,OAS30001,MR,d3132.nii.gz,d3132,3132,0.0,female,65.149895,0.427142,fs_t1/OAS30001_MR_d3132.nii.gz,1
5,18189,OAS30002_MR_d0653.nii.gz,OAS30002,MR,d0653.nii.gz,d0653,653,0.0,male,67.206024,0.465866,fs_t1/OAS30002_MR_d0653.nii.gz,0
6,17453,OAS30002_MR_d2340.nii.gz,OAS30002,MR,d2340.nii.gz,d2340,2340,0.0,male,67.206024,0.465866,fs_t1/OAS30002_MR_d2340.nii.gz,0
7,18150,OAS30002_MR_d2345.nii.gz,OAS30002,MR,d2345.nii.gz,d2345,2345,0.0,male,67.206024,0.465866,fs_t1/OAS30002_MR_d2345.nii.gz,0
10,14014,OAS30003_MR_d2669.nii.gz,OAS30003,MR,d2669.nii.gz,d2669,2669,0.0,female,58.77344,0.307054,fs_t1/OAS30003_MR_d2669.nii.gz,1
11,16103,OAS30003_MR_d2682.nii.gz,OAS30003,MR,d2682.nii.gz,d2682,2682,0.0,female,58.77344,0.307054,fs_t1/OAS30003_MR_d2682.nii.gz,1
12,16548,OAS30003_MR_d3731.nii.gz,OAS30003,MR,d3731.nii.gz,d3731,3731,0.0,female,58.77344,0.307054,fs_t1/OAS30003_MR_d3731.nii.gz,1
13,18382,OAS30004_MR_d1101.nii.gz,OAS30004,MR,d1101.nii.gz,d1101,1101,0.0,female,55.096508,0.237806,fs_t1/OAS30004_MR_d1101.nii.gz,1


# NACC Data Preprocessing

In [242]:
# NACC diagnosis codes: 0 normal, 1 normal-to-MCI, 2 MCI, 3 AD, 4 other dementia
# NACC sex codes: 0 male, 1 female

nacc_data = pd.read_csv("NACC_LABELS_CLASSIFICATION.csv")
# Relative path of each scan file under fs_t1_nacc/.
nacc_data['Address_Name'] = "fs_t1_nacc/" + nacc_data['File_Name']

# One frame per diagnosis code.
nacc_normal = nacc_data.loc[nacc_data['Diagnosis'].eq(0)]
nacc_normalMCI = nacc_data.loc[nacc_data['Diagnosis'].eq(1)]
nacc_MCI = nacc_data.loc[nacc_data['Diagnosis'].eq(2)]
nacc_AD = nacc_data.loc[nacc_data['Diagnosis'].eq(3)]
nacc_other = nacc_data.loc[nacc_data['Diagnosis'].eq(4)]

print(len(nacc_normal), len(nacc_normalMCI), len(nacc_MCI),
      len(nacc_AD), len(nacc_other))

3034 193 625 2056 289


In [243]:
# Split each NACC diagnosis group separately so every split keeps the same class mix.
train_normal2, val_normal2, test_normal2 = split(nacc_normal)
train_normalMCI, val_normalMCI, test_normalMCI = split(nacc_normalMCI)
train_MCI, val_MCI, test_MCI = split(nacc_MCI)
train_AD2, val_AD2, test_AD2 = split(nacc_AD)
train_other, val_other, test_other = split(nacc_other)

# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0. Group order is unchanged: other, normal, normal-to-MCI, MCI, AD.
train_data_nacc = pd.concat([train_other, train_normal2, train_normalMCI, train_MCI, train_AD2])
val_data_nacc = pd.concat([val_other, val_normal2, val_normalMCI, val_MCI, val_AD2])
test_data_nacc = pd.concat([test_other, test_normal2, test_normalMCI, test_MCI, test_AD2])

print(len(train_data_nacc), len(val_data_nacc), len(test_data_nacc))

4956 617 624


In [244]:
# Preview the first ten rows of the NACC training split.
train_data_nacc.head(10)

Unnamed: 0,Map_ID,Image_ID,MRI_ID,Address_Name,File_Name,Label_ID,Patient_ID,Diagnosis,Visit_Year,MRI_Year,Data_Set,Sex,Age_Norm
3,4,4,mri4908,fs_t1_nacc/mri4908_SAG_T1_20130424203248_2.nii.gz,mri4908_SAG_T1_20130424203248_2.nii.gz,5521,NACC725009,4,2013,2013,NACC,0,0.53012
24,25,34,mri5254,fs_t1_nacc/mri5254_IR-FSPGR_TI=500_FA=10_20120...,mri5254_IR-FSPGR_TI=500_FA=10_20120608093049_9...,5528,NACC726763,4,2012,2012,NACC,0,0.722892
35,36,48,mri7785,fs_t1_nacc/mri7785_COR_GRADIENT_T1_20070515135...,mri7785_COR_GRADIENT_T1_20070515135332_2.nii.gz,4911,NACC273669,4,2007,2007,NACC,1,0.819277
38,39,51,mri3558,fs_t1_nacc/mri3558_ep2d_t1w_128_2mm_2008022114...,mri3558_ep2d_t1w_128_2mm_20080221145345_19.nii.gz,5021,NACC353834,4,2008,2008,NACC,0,0.698795
42,43,55,mri2629,fs_t1_nacc/mri2629_MPRAGE-ADNI_19890323123135_...,mri2629_MPRAGE-ADNI_19890323123135_2.nii.gz,4598,NACC065134,4,2008,2008,NACC,1,0.156627
45,46,58,mri5553,fs_t1_nacc/mri5553_IR-FSPGR_TI=500_FA=10_20120...,mri5553_IR-FSPGR_TI=500_FA=10_20120507104932_9...,4993,NACC335444,4,2012,2012,NACC,1,0.771084
52,53,66,mri3835,fs_t1_nacc/mri3835_Sagittal_T1_20130530115553_...,mri3835_Sagittal_T1_20130530115553_3.nii.gz,5311,NACC568769,4,2013,2013,NACC,0,0.746988
64,65,80,mri3852,fs_t1_nacc/mri3852_MPRAGE_SAG_ISO_201102071457...,mri3852_MPRAGE_SAG_ISO_20110207145732_2.nii.gz,5913,NACC990347,4,2011,2011,NACC,1,0.590361
74,75,95,mri5438,fs_t1_nacc/mri5438_FSPGR_3D_SAG_20130508100940...,mri5438_FSPGR_3D_SAG_20130508100940_8.nii.gz,5849,NACC941984,4,2013,2013,NACC,1,0.759036
91,92,114,mri1465,fs_t1_nacc/mri1465_FSPGR_rhuser1=-376_rhuser2=...,mri1465_FSPGR_rhuser1=-376_rhuser2=10_20100923...,4950,NACC307886,4,2010,2010,NACC,1,0.915663


# NACC & OASIS Data Merge

In [292]:
# OASIS side of the training split.
train_img_oasis = train_data_oasis['file address'].values
train_age_oasis = train_data_oasis['age_norm'].values
train_sex_oasis = train_data_oasis['Sex'].values
train_label_oasis = train_data_oasis['match_label'].values

# NACC side of the training split.
train_img_nacc = train_data_nacc['Address_Name'].values
train_age_nacc = train_data_nacc['Age_Norm'].values
train_sex_nacc = train_data_nacc['Sex'].values
train_label_nacc = train_data_nacc['Diagnosis'].values

# Merge into flat arrays, OASIS rows first and NACC rows after
# (axis=None flattens the inputs before joining them).
# NOTE(review): the OASIS labels use codes 0/3/5 while the NACC comment lists 0-4,
# so the merged label array can contain a code (5) outside the NACC scheme.
train_img = np.concatenate([train_img_oasis, train_img_nacc], axis=None)
train_age = np.concatenate([train_age_oasis, train_age_nacc], axis=None)
train_sex = np.concatenate([train_sex_oasis, train_sex_nacc], axis=None)
train_label = np.concatenate([train_label_oasis, train_label_nacc], axis=None)

train_label.shape

(6211,)

In [293]:
# OASIS side of the validation split.
val_img_oasis = val_data_oasis['file address'].values
val_age_oasis = val_data_oasis['age_norm'].values
val_sex_oasis = val_data_oasis['Sex'].values
val_label_oasis = val_data_oasis['match_label'].values

# NACC side of the validation split.
val_img_nacc = val_data_nacc['Address_Name'].values
val_age_nacc = val_data_nacc['Age_Norm'].values
val_sex_nacc = val_data_nacc['Sex'].values
val_label_nacc = val_data_nacc['Diagnosis'].values

# Merge into flat arrays, OASIS rows first and NACC rows after
# (axis=None flattens the inputs before joining them).
val_img = np.concatenate([val_img_oasis, val_img_nacc], axis=None)
val_age = np.concatenate([val_age_oasis, val_age_nacc], axis=None)
val_sex = np.concatenate([val_sex_oasis, val_sex_nacc], axis=None)
val_label = np.concatenate([val_label_oasis, val_label_nacc], axis=None)

val_label.shape

(773,)

In [295]:
# Persist the merged input arrays under data_dir so later sessions can reload
# them without redoing the preprocessing above.
for _npy_name, _npy_array in (
    ("train_img.npy", train_img),
    ("train_age.npy", train_age),
    ("train_sex.npy", train_sex),
    ("train_label.npy", train_label),
    ("val_img.npy", val_img),
    ("val_age.npy", val_age),
    ("val_sex.npy", val_sex),
    ("val_label.npy", val_label),
):
    np.save(data_dir + _npy_name, _npy_array)
del _npy_name, _npy_array

# Data Input to Tensorflow

In [None]:
# Reload the arrays saved under data_dir.
#
# The image-path arrays hold Python strings (object dtype), which np.save stores
# via pickle. np.load refuses pickled data by default (allow_pickle=False), so
# those two loads must opt in. Only do this for files written by this notebook:
# unpickling an untrusted file can execute arbitrary code.

train_img = np.load(data_dir+"train_img.npy", allow_pickle=True)
train_age = np.load(data_dir+"train_age.npy")
train_sex = np.load(data_dir+"train_sex.npy")
train_label = np.load(data_dir+"train_label.npy")

val_img = np.load(data_dir+"val_img.npy", allow_pickle=True)
val_age = np.load(data_dir+"val_age.npy")
val_sex = np.load(data_dir+"val_sex.npy")
val_label = np.load(data_dir+"val_label.npy")

In [207]:
# Wrap the numpy arrays as constant tensors.
# NOTE(review): the names are rebound from arrays to tensors, so this cell is
# meant to run once after the load cell; re-run the load cell before repeating it.
train_img = tf.constant(train_img)
train_age = tf.constant(train_age)
train_sex = tf.constant(train_sex)
train_label = tf.constant(train_label)

val_img = tf.constant(val_img)
val_age = tf.constant(val_age)
val_sex = tf.constant(val_sex)
val_label = tf.constant(val_label)

#test_img = tf.constant(test_data['actual file name'].values)
#test_age = tf.constant(test_data['age'].values)
#test_sex = tf.constant(test_data['gender'].values)
#test_label = tf.constant(test_data['match_label'].values)

# create TensorFlow Dataset objects
# Both datasets must yield the same (img, age, sex, label) structure: the
# reinitializable iterator below is built from trdata's types and shapes, and
# make_initializer() rejects a dataset whose structure differs. valdata used to
# carry only (img, label).
trdata = Dataset.from_tensor_slices((train_img, train_age, train_sex, train_label))
valdata = Dataset.from_tensor_slices((val_img, val_age, val_sex, val_label))

# create TensorFlow Iterator object
iterator = Iterator.from_structure(trdata.output_types, trdata.output_shapes)
next_el = iterator.get_next()

# create 2 initialization ops
# NOTE(review): the component dtypes must also agree between the two datasets
# (e.g. train_sex vs val_sex) — confirm after loading.
tr_init_op = iterator.make_initializer(trdata)
val_init_op = iterator.make_initializer(valdata)

In [141]:
# Depth of the one-hot label vector.
# NOTE(review): the OASIS labels include code 5 ('uncertain dementia'), which is
# out of range for a depth of 5; tf.one_hot encodes out-of-range indices as an
# all-zero row. Decide whether to remap that code or raise the depth.
NUM_CLASSES = 5

def input_parser(img_path, label):
    """Turn one (file path, class label) pair into (decoded image, one-hot label).

    Parameters
    ----------
    img_path : scalar string tensor with the path of the image file.
    label : scalar numeric tensor with the class index.

    Returns
    -------
    (img_decoded, one_hot): the decoded image tensor and a length-NUM_CLASSES
    one-hot vector.

    NOTE(review): the datasets in this notebook yield four components
    (img, age, sex, label) while this parser takes two, and the listed paths
    end in .nii.gz, which tf.image.decode_image (BMP/GIF/JPEG/PNG) is not
    documented to read — both need resolving before this is mapped over the data.
    """
    # tf.one_hot requires integer indices, but the merged label arrays are
    # floating point (the OASIS labels come from a column with missing values).
    class_index = tf.cast(label, tf.int32)
    one_hot = tf.one_hot(class_index, NUM_CLASSES)

    # read the raw file bytes and decode them as an image
    img_file = tf.read_file(img_path)
    img_decoded = tf.image.decode_image(img_file, channels=3)

    return img_decoded, one_hot

Unnamed: 0,Count,actual file name,subject,type,file_name_day,day label,days,match_label
0,17814,OAS30001_MR_d0129.nii.gz,OAS30001,MR,d0129.nii.gz,d0129,129,0.0
1,15818,OAS30001_MR_d0757.nii.gz,OAS30001,MR,d0757.nii.gz,d0757,757,0.0
3,17075,OAS30001_MR_d3132.nii.gz,OAS30001,MR,d3132.nii.gz,d3132,3132,0.0
5,18189,OAS30002_MR_d0653.nii.gz,OAS30002,MR,d0653.nii.gz,d0653,653,0.0
6,17453,OAS30002_MR_d2340.nii.gz,OAS30002,MR,d2340.nii.gz,d2340,2340,0.0
7,18150,OAS30002_MR_d2345.nii.gz,OAS30002,MR,d2345.nii.gz,d2345,2345,0.0
10,14014,OAS30003_MR_d2669.nii.gz,OAS30003,MR,d2669.nii.gz,d2669,2669,0.0
11,16103,OAS30003_MR_d2682.nii.gz,OAS30003,MR,d2682.nii.gz,d2682,2682,0.0
12,16548,OAS30003_MR_d3731.nii.gz,OAS30003,MR,d3731.nii.gz,d3731,3731,0.0
13,18382,OAS30004_MR_d1101.nii.gz,OAS30004,MR,d1101.nii.gz,d1101,1101,0.0


In [241]:
#saving input tensors