In [33]:
!pip install nibabel
!pip install pandas

You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [34]:
import pandas as pd
import numpy as np
import os
import glob
import nibabel as nib
from shutil import copyfile
from os import path


# Prior Steps:
1) Download oasis3 data - folders are organized by patient id

2) Run session match up bash or python script in the oasis-3 repo. This generates a csv lining up scans with cdr values within n number of days between scan and clinical visit

3) This colab contains loading of all T1w nii.gz compressed files and saving them into folders based on their CDR value

Initial model uses the scans from patients who were not known to have dementia or AD throughout this study. This is in order to get a proper representation of the 'normal' distribution

In [35]:
label_df = pd.read_csv('session_matchup_output.csv')

In [36]:
all_t1w_mr_files = glob.glob('/tmp/data/**/**/*T1w.nii.gz')

In [37]:
all_t1w_mr_files

['/tmp/data/labeled/ad/sub-OAS30016_ses-d0021_T1w.nii.gz',
 '/tmp/data/labeled/ad/sub-OAS30024_ses-d0084_T1w.nii.gz',
 '/tmp/data/labeled/ad/sub-OAS30029_ses-d0893_T1w.nii.gz',
 '/tmp/data/labeled/ad/sub-OAS30031_ses-d0427_run-01_T1w.nii.gz',
 '/tmp/data/labeled/ad/sub-OAS30031_ses-d0427_run-02_T1w.nii.gz',
 '/tmp/data/labeled/ad/sub-OAS30051_ses-d1286_T1w.nii.gz',
 '/tmp/data/labeled/ad/sub-OAS30056_ses-d3491_T1w.nii.gz',
 '/tmp/data/labeled/ad/sub-OAS30058_ses-d1316_run-01_T1w.nii.gz',
 '/tmp/data/labeled/ad/sub-OAS30058_ses-d1316_run-02_T1w.nii.gz',
 '/tmp/data/labeled/ad/sub-OAS30058_ses-d2236_T1w.nii.gz',
 '/tmp/data/labeled/ad/sub-OAS30064_ses-d0687_run-01_T1w.nii.gz',
 '/tmp/data/labeled/ad/sub-OAS30064_ses-d0687_run-02_T1w.nii.gz',
 '/tmp/data/labeled/ad/sub-OAS30070_ses-d0070_T1w.nii.gz',
 '/tmp/data/labeled/ad/sub-OAS30072_ses-d2219_T1w.nii.gz',
 '/tmp/data/labeled/ad/sub-OAS30085_ses-d1566_T1w.nii.gz',
 '/tmp/data/labeled/ad/sub-OAS30091_ses-d0092_run-01_T1w.nii.gz',
 '/tmp/

In [38]:
mapping = []
none_list = []
# iterate through all mr id records
for mr_id in label_df['MR ID_MR']:
    # indicate that a record has not been appended
    inner_list = []
    # loop through all filenames
    for file_path in all_t1w_mr_files:
        # append the file path if mr id is found
        if mr_id in file_path:
            inner_list.append(file_path)
            #print(f"Found {mr_id} in {file_path}")
            #break
    if len(inner_list) == 0:
        mapping.append(None)
        none_list.append(mr_id)
    else:
        mapping.append(inner_list)
        

In [39]:
df = pd.DataFrame(none_list)
# Save off to download remaining files later
#df.to_csv('to_download.csv', index=False)

### I'm missing data from complete set of OASIS 3 files! Last record found is for OAS30704_MR_d0584
#### Nothing from OAS30705_MR_d0070 onwards - can get the list by filtering on NaNs in File_Path column


In [40]:
len(mapping)

1156

In [41]:
label_df.shape

(1156, 35)

In [42]:
label_df['File_Path'] = mapping

In [43]:
label_df.head(5)

Unnamed: 0.1,Unnamed: 0,ADRC_ADRCCLINICALDATA ID,Subject,Date,Age,mmse,ageAtEntry,cdr,commun,dx1,...,Day,MR ID_MR,Subject_MR,Age_MR,Scanner_MR,Scans_MR,FreeSurfers_MR,PUPs_MR,Day_MR,File_Path
0,2,OAS30001_ClinicalData_d0722,OAS30001,,,30.0,65.149895,0.0,0.0,Cognitively normal,...,722,OAS30001_MR_d0757,OAS30001,67.0,3.0T,"bold(2), dwi(1), minIP(1), swi(1), T1w(2), T2s...",1.0,,757.0,[/tmp/data/OAS30001_MR_d0757/anat2/sub-OAS3000...
1,13,OAS30002_ClinicalData_d0751,OAS30002,,,29.0,67.206024,0.0,0.0,Cognitively normal,...,751,OAS30002_MR_d0653,OAS30002,68.0,3.0T,"bold(2), T1w(2), T2w(2)",1.0,,653.0,[/tmp/data/OAS30002_MR_d0653/anat2/sub-OAS3000...
2,17,OAS30002_ClinicalData_d2263,OAS30002,,,30.0,67.206024,0.0,0.0,Cognitively normal,...,2263,OAS30002_MR_d2345,OAS30002,73.0,3.0T,"asl(2), bold(2), dwi(2), fieldmap(3), GRE(2), ...",1.0,,2345.0,[/tmp/data/OAS30002_MR_d2345/anat2/sub-OAS3000...
3,23,OAS30003_ClinicalData_d2630,OAS30003,,,29.0,58.77344,0.0,0.0,Cognitively normal,...,2630,OAS30003_MR_d2682,OAS30003,66.0,3.0T,"angio(5), asl(1), dwi(1), fieldmap(3), FLAIR(1...",1.0,,2682.0,[/tmp/data/OAS30003_MR_d2682/anat8/sub-OAS3000...
4,25,OAS30003_ClinicalData_d3288,OAS30003,,,30.0,58.77344,0.0,0.0,Cognitively normal,...,3288,OAS30003_MR_d3320,OAS30003,,,"angio(1), T1w(1)",1.0,,3320.0,[/tmp/data/OAS30003_MR_d3320/anat2/sub-OAS3000...


### Split the list as strings under File_Path column
https://stackoverflow.com/questions/45758646/pandas-convert-string-into-list-of-strings

Organize the data: "/tmp/data/categorized > train/test > 0/1"
Just make copies since

In [44]:
# Bucket images with labels
base_path = "/tmp/data/labeled/"
for index, row in label_df.iterrows():
    if row["cdr"] == 0:
        final_path = base_path+"normal/"
    else:
        final_path = base_path+"ad/"
        #print("Found AD:", row["File_Path"])
    if row["File_Path"] == None:
        continue
    files = list(row["File_Path"])
    for file in files:
        copy_to = final_path + file.split("/")[-1]
        if not path.exists(copy_to):
            copyfile(file, copy_to)

In [45]:
label_df.to_csv('label_df_saved.csv')

In [46]:
len(none_list)

5

In [47]:
none_list

['OAS30770_MR_d2159',
 'OAS30801_MR_d0097',
 'OAS30812_MR_d0055',
 'OAS30844_MR_d0035',
 'OAS30871_MR_d1103']