# Experiment Details

TBA



In [1]:
pip install tqdm

Note: you may need to restart the kernel to use updated packages.


In [None]:
import nobrainer

In [18]:
import multiprocessing as mp
from pathlib import Path
import tensorflow as tf
import functools
import tempfile
import sys, os
from tqdm import tqdm

from nondefaced_detector.preprocessing.normalization import clip, normalize, standardize
from nondefaced_detector.preprocessing.conform       import conform_data
from nondefaced_detector.helpers                     import utils

print("Number of processors: ", mp.cpu_count())
print(os.sched_getaffinity(0))

from nobrainer.io import read_csv, verify_features_labels


# verify_features_labels(temp)

def preprocess(
    vol_path,
    conform_volume_to=(128, 128, 128),
    conform_zooms=(2.0, 2.0, 2.0),
    save_path=None,
    with_label=False,
):
    
    try:
        vpath = vol_path
        if with_label:
            if len(vol_path) != 2:
                raise ValueError(
                    "The vol_path must have length of 2 when with_label=True"
                )
            
            vpath, label = vol_path
        
        spath = os.path.join(os.path.dirname(vpath), 'preprocessed')
        if save_path:
            spath = os.path.join(save_path, 'preprocessed')
        
        os.makedirs(spath, exist_ok=True)

        volume, affine, _ = utils.load_vol(vpath)

        # Prepocessing
        volume = clip(volume, q=90)
        volume = normalize(volume)
        volume = standardize(volume)
        
        
        tmp_preprocess_vol = tempfile.NamedTemporaryFile(
            suffix=".nii.gz",
            delete=True,
            dir=spath,
        )
        
        utils.save_vol(tmp_preprocess_vol.name, volume, affine)
    
    
        tmp_conform_vol = os.path.join(spath, os.path.basename(vpath))
        
        conform_data(
            tmp_preprocess_vol.name,
            out_file=tmp_conform_vol,
            out_size=conform_volume_to,
            out_zooms=conform_zooms)
        
        tmp_preprocess_vol.close()
        
        if with_label:
            return (tmp_conform_vol, label)
        return tmp_conform_vol
    
    except Exception as e:
        print(e)
        return
    
def cleanup_files(*args):
    for p in args:
        if os.path.exists(p):
            os.remove(p)
            
def preprocess_csv(
    volume_filepaths,
    num_parallel_calls=None,
    conform_volume_to=(128, 128, 128),
    conform_zooms=(2.0, 2.0, 2.0),
    save_path=None,
    with_label=True,
):

    try:
        map_fn = functools.partial(
            preprocess,
            conform_volume_to=conform_volume_to,
            conform_zooms=conform_zooms,
            save_path=save_path,
            with_label=with_label
        )
        
        if num_parallel_calls is None:
            # Get number of eligible CPUs.
            num_parallel_calls = len(os.sched_getaffinity(0))
        
        print("Preprocessing {} examples".format(len(volume_filepaths)))
        
        outputs = []
        
        if num_parallel_calls == 1:
            for vf in tqdm(volume_filepaths, total=len(volume_filepaths)):
                result = map_fn(vf)
                outputs.append(result)      
        else:
            pool = mp.Pool(num_parallel_calls)
            for result in tqdm(pool.imap(func=map_fn, iterable=volume_filepaths), total=len(volume_filepaths)):
                outputs.append(result)
                
        return outputs
                 
    except Exception as e:
        print(e)
        return

# import csv
# temp = []
# with open('/home/shank/Stanford/nondefaced-detector/examples/sample_vols/example.csv', 'r') as file:
#     reader = csv.reader(file)
#     for row in reader:
#         temp.append(row[0])
    

temp = read_csv('/home/shank/Stanford/nondefaced-detector/examples/sample_vols/example.csv', skip_header=False)

# vpaths = list(zip(*temp))[0]
# print(preprocess(temp[0], with_label=True))
# outputs = preprocess_csv(temp)
# print(outputs)
print(temp)

Number of processors:  16
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
[('/home/shank/Stanford/nondefaced-detector/examples/sample_vols/faced/example1.nii.gz', '1'), ('/home/shank/Stanford/nondefaced-detector/examples/sample_vols/faced/example2.nii.gz', '1'), ('/home/shank/Stanford/nondefaced-detector/examples/sample_vols/faced/example3.nii.gz', '1'), ('/home/shank/Stanford/nondefaced-detector/examples/sample_vols/defaced/example1.nii.gz', '0'), ('/home/shank/Stanford/nondefaced-detector/examples/sample_vols/defaced/example2.nii.gz', '0'), ('/home/shank/Stanford/nondefaced-detector/examples/sample_vols/defaced/example3.nii.gz', '0')]


In [5]:
import multiprocessing as mp
from pathlib import Path
import tensorflow as tf
import functools
import tempfile
import sys, os
from tqdm import tqdm

from nondefaced_detector.preprocessing.normalization import clip, normalize, standardize
from nondefaced_detector.preprocessing.conform       import conform_data
from nondefaced_detector.helpers                     import utils
from nondefaced_detector.preprocess import preprocess_parallel

print("Number of processors: ", mp.cpu_count())
print(os.sched_getaffinity(0))

from nobrainer.io import read_csv, verify_features_labels

num_parallel_calls=-1
volume_shape=(128,128,128)
preprocess_path=None
volume_filepaths = read_csv('/home/shank/Stanford/nondefaced-detector/examples/sample_vols/example.csv', skip_header=False)

num_parallel_calls = None if num_parallel_calls == -1 else num_parallel_calls
if num_parallel_calls is None:
    # Get number of processes allocated to the current process.
    # Note the difference from `os.cpu_count()`.
    num_parallel_calls = len(os.sched_getaffinity(0))

invalid_pairs = verify_features_labels(
    volume_filepaths,
    check_labels_int=True,
    num_parallel_calls=num_parallel_calls,
    verbose=1,
)

## UNCOMMENT the following when https://github.com/neuronets/nobrainer/pull/125
## is merged
# if not invalid_pairs:
#     click.echo(click.style("Passed verification.", fg="green"))
# else:
#     click.echo(click.style("Failed verification.", fg="red"))
#     for pair in invalid_pairs:
#         click.echo(pair[0])
#         click.echo(pair[1])
#     sys.exit(-1)

ppaths = preprocess_parallel(
    volume_filepaths,
    conform_volume_to=volume_shape,
    num_parallel_calls=num_parallel_calls,
    save_path=preprocess_path,
)

invalid_pairs = verify_features_labels(
    ppaths,
    volume_shape=volume_shape,
    check_labels_int=True,
    num_parallel_calls=num_parallel_calls,
    verbose=1,
)

Number of processors:  16
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
Verifying 6 examples
Preprocessing 6 examples


100%|██████████| 6/6 [00:05<00:00,  1.04it/s]

Verifying 6 examples
0/6 [..............................] - ETA: 0s






In [19]:
import nobrainer


tfrecords_template = 'tfrecords/data-train_shard-{shard:03d}.tfrec'

os.makedirs(os.path.dirname(tfrecords_template), exist_ok=True)

print(tfrecords_path)

nobrainer.tfrecord.write(
        features_labels=ppaths,
        filename_template=tfrecords_template,
        examples_per_shard=3)



/home/shank/Stanford/nondefaced-detector/examples/sample_vols/faced/preprocessed/tfrecords/data-train_shard-{shard:03d}.tfrec


data-train_shard-000.tfrec  data-train_shard-001.tfrec


In [None]:
import os, sys
sys.path.append("..")
import numpy as np
from glob import glob
import pandas as pd
import random
from random import shuffle

# Define paths
ROOT_DIR = '/home/shank/HDDLinux/Stanford/data/mriqc-shared/conformed'

face_path = os.path.join(ROOT_DIR, 'face/128')
defaced_path = os.path.join(ROOT_DIR, 'face_defaced/128')
refaced_path = os.path.join(ROOT_DIR, 'face_refaced/128')

paths_d = []
paths_f = []
paths_r = []

for path in glob(defaced_path + "/*/*.nii*"):
    DS = path.split('/')[-2]
    paths_d.append(path)
    
for path in glob(refaced_path + "/*/*.nii*"):
    DS = path.split('/')[-2]
    paths_r.append(path)
    
for path in glob(face_path + "/*/*.nii*"):
    DS = path.split('/')[-2]
    paths_f.append(path)
    

def generate_datasets(fpaths, dpaths, size, typ ='faced'):
    
    if typ not in ['faced', 'refaced']:
        print("Incorrect value for t. Choose from [faced, refaced]")
        return
    
    random.shuffle(fpaths)
    test_f = fpaths[:size]
    main_f = fpaths[size:]

    test_d = []
    for t in test_f:
        if typ == 'faced':
            test_d.append(t.replace('face', 'face_defaced'))
        
        if typ == 'refaced':
            DS = t.split('/')[-2]
            sub = t.split('/')[-1].replace('_defaced_refaced', '').split('.nii.gz')[0]
            search_pattern = os.path.join(DS, sub)
            
            # match pattern from defaced dataset
            for _d in dpaths:
                if search_pattern in _d:
                    test_d.append(_d)
                

    test = test_f + test_d
    labels_test = [1]*len(test_f) + [0]*len(test_d)
    
    # remove T_A_D from defaced volume set
    main_d = list(set(dpaths) - set(test_d))
    
    labels_main = [1]*len(main_f) + [0]*len(main_d)
    main = main_f + main_d
    
    return main, labels_main, test, labels_test

A_2, L_A_2, T_A, L_T_A = generate_datasets(paths_f, paths_d, 49, typ='faced')
B_2, L_B_2, T_B, L_T_B = generate_datasets(paths_r, paths_d, 49, typ='refaced')

print(len(A_2), len(T_A))
print(len(B_2), len(T_B))


In [None]:
from nondefaced_detector import preprocess
vol_path = '../../examples/sample_vols/IXI002-Guys-0828-T1.nii.gz'
save_path = ''
ppath, cpath = preprocess.preprocess(vol_path, save_path=save_path)
print(ppath, cpath)

## Generate n-fold CV Datasets

In [None]:
from operator import itemgetter
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import pandas as pd
import random
from random import shuffle
import os

def generate_CSV(paths, labels, save_path, test_paths=None, test_labels=None, n=15, mode='CV'):
    
    os.makedirs(save_path, exist_ok=True)
    
    df = pd.DataFrame()
    df["X"] = paths
    df["Y"] = labels
    df.to_csv(os.path.join(save_path, "all.csv"))
    
    if mode == 'CV':
        SPLITS = n
        skf = StratifiedKFold(n_splits=SPLITS)
        fold_no = 1

        for train_index, test_index in skf.split(paths, labels):
            out_path = os.path.join(save_path, "train_test_fold_{}/csv/".format(fold_no))

            if not os.path.exists(out_path):
                os.makedirs(out_path)

            image_train, image_test = (
                itemgetter(*train_index)(paths),
                itemgetter(*test_index)(paths),
            )

            label_train, label_test = (
                itemgetter(*train_index)(labels),
                itemgetter(*test_index)(labels),
            )

            train_data = {"X": image_train , "Y": label_train}
            df_train = pd.DataFrame(train_data)
            df_train.to_csv(os.path.join(out_path, "training.csv"), index=False)

            validation_data = {"X": image_test, "Y": label_test}
            df_validation = pd.DataFrame(validation_data)
            df_validation.to_csv(os.path.join(out_path, "validation.csv"), index=False)

            fold_no += 1
    else:
        train_data = {"X": paths , "Y": labels}
        df_train = pd.DataFrame(train_data)
        df_train.to_csv(os.path.join(save_path, "training.csv"), index=False)
        
        test_data = {"X": test_paths , "Y": test_labels}
        df_test = pd.DataFrame(test_data)
        df_test.to_csv(os.path.join(save_path, "testing.csv"), index=False)
        
ROOTDIR = '/home/shank/HDDLinux/Stanford/data/mriqc-shared/experiments'

## CROSS VALIDATION
# generate_CSV(A_2, L_A_2, "experiments/experiment_A/csv_F15")
generate_CSV(B_2, L_B_2, os.path.join(ROOTDIR, "experiment_B/128/csv_F15"), mode='CV')


## DEFINE A ROOT DIR where all the data will be stored <<<<<
# ROOTDIR = '/work/06850/sbansal6/maverick2/mriqc-shared/experiments' 

## FULL DATASET
# generate_CSV(A_2,
#              L_A_2,
#              os.path.join(ROOTDIR, 'experiment_A/128/csv_full'),
#              test_paths=T_A,
#              test_labels=L_T_A,
#              mode='full')

# generate_CSV(B_2,
#              L_B_2,
#              os.path.join(ROOTDIR, 'experiment_B/128/csv_full'),
#              test_paths=T_B,
#              test_labels=L_T_B,
#              mode='full')


# Generate tfrecords for n-fold CV datasets

In [None]:
import random
import nobrainer
import os, sys
sys.path.append("..")
import numpy as np
import nibabel as nb
from glob import glob
from pathlib import Path
from shutil import *
import subprocess
from operator import itemgetter
import pandas as pd


def generate_tfrecords(csv_path, records_save_path, mode='CV'):
    
    os.makedirs(records_save_path, exist_ok=True)
    train_csv_path = os.path.join(csv_path, "training.csv")
    train_paths = pd.read_csv(train_csv_path)["X"].values
    train_labels = pd.read_csv(train_csv_path)["Y"].values
    train_D = list(zip(train_paths, train_labels))
    
    random.shuffle(train_D)
    train_write_path = os.path.join(records_save_path, 'data-train_shard-{shard:03d}.tfrec')
    
    nobrainer.tfrecord.write(
        features_labels=train_D,
        filename_template=train_write_path,
        examples_per_shard=3)
    
    if mode =='CV':
        vt_csv_path = os.path.join(csv_path, "validation.csv")
        namefill = 'valid'
    else:
        vt_csv_path = os.path.join(csv_path, "testing.csv")
        namefill = 'test'
        
    vt_paths = pd.read_csv(vt_csv_path)["X"].values
    vt_labels = pd.read_csv(vt_csv_path)["Y"].values
    vt_D = list(zip(vt_paths, vt_labels))
    random.shuffle(vt_D)
    vt_write_path = os.path.join(records_save_path, 'data-{}_shard-{shard:03d}.tfrec'.format(namefill))

    nobrainer.tfrecord.write(
        features_labels=vt_D,
        filename_template=vt_write_path,
        examples_per_shard=1)
        

ROOTDIR = '/tf/shank/HDDLinux/Stanford/data/mriqc-shared/experiments'

# Cross-Validation 
# SPLITS = 15
# for fold in range(1, SPLITS+1):
#     print("FOLD: ", fold)
#     csv_path = os.path.join(
#         ROOTDIR, "experiment_B/128/csv_F15/train_test_fold_{}/csv/".format(fold)
#     )
    
#     tf_records_dir = os.path.join(
#         ROOTDIR, "experiment_B/128/tfrecords_F15/tfrecords_fold_{}/".format(fold)
#     )
#     generate_tfrecords(csv_path, tf_records_dir)


# Test (full dataset)
# experiment_A
# csv_path = os.path.join(ROOT_DIR, "experiment_A/128/csv_full")
# tf_records_dir = os.path.join(ROOT_DIR, "experiment_A/128/tfrecords_full")
# generate_tfrecords(csv_path, tf_records_dir, mode='test')

# experiment_B
# csv_path = os.path.join(ROOT_DIR, "experiment_B/128/csv_full")
# tf_records_dir = os.path.join(ROOT_DIR, "experiment_B/128/tfrecords_full")
# generate_tfrecords(csv_path, tf_records_dir, mode='test')

## Main held-out Test Dataset
csv_path = '/tf/shank/HDDLinux/Stanford/data/mriqc-shared/test_ixi/csv/testing.csv'
records_save_path = '/tf/shank/HDDLinux/Stanford/data/mriqc-shared/test_ixi/tfrecords_new'
paths = pd.read_csv(csv_path)["X"].values
labels = pd.read_csv(csv_path)["Y"].values

vt_D = list(zip(paths, labels))
random.shuffle(vt_D)

write_path = os.path.join(records_save_path, 'data-test_shard-{shard:03d}.tfrec')

nobrainer.tfrecord.write(
    features_labels=vt_D,
    filename_template=write_path,
    examples_per_shard=1)
