# Run Dataset

Submit jobs to AWS Batch, with increased timeout:
```
submit_subjects \
  --upload_metadata \
  --save_details \
  --stagger \
  -q reTHINQ-c5-spot \
  -t 1.0.0-rc.11 \
  --timeout 21600 \
  -l submission-log--20200625-mclaren--rethinq-1.0.0-rc.11--timeout-21600.json \
  -I s3://cmet-scratch/maclaren-cmeds/ \
  -o s3://cmet-scratch/20200625-mclaren--rethinq-1.0.0-rc.11--timeout-21600/
```

Save submission log to s3 output bucket:
```
aws s3 cp ./submission-log--20200625-mclaren--rethinq-1.0.0-rc.11--timeout-21600.json \
  s3://cmet-scratch/20200625-mclaren--rethinq-1.0.0-rc.11--timeout-21600/submission-log--20200625-mclaren--rethinq-1.0.0-rc.11--timeout-21600.json
```

# Copy Data Locally

```
mkdir -p /home/paul/cmet/data/20200625-mclaren--rethinq-1.0.0-rc.11--timeout-21600/
cd /home/paul/cmet/data/20200625-mclaren--rethinq-1.0.0-rc.11--timeout-21600/
aws s3 cp s3://cmet-scratch/maclaren-cmeds/demographics.tsv .
aws s3 cp \
  --recursive \
  --exclude "*" \
  --include "*subject_info.json" \
  --include "*.pdf" \
  s3://cmet-scratch/20200625-mclaren--rethinq-1.0.0-rc.11--timeout-21600/ .
```

If there are any failures, get rid of cached subject_info.jsons with this:
```
find . -type d -name 'cache' -exec rm -rf {} \;
```

In [4]:
import json
import os
import fnmatch
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# imports find_json_files(); load_json_file(); load_dataset();
from cmeds import *
# imports calc_cvs(); session_permute(); monte_carlo_perm_test
from test_retest import *

In [5]:
# Local paths.
# maclaren_dir is the folder the pipeline outputs were copied into; the
# demographics table lives directly inside it, so its path is derived from
# the directory instead of repeating the long literal.
maclaren_dir = '/home/paul/cmet/data/20200625-mclaren--rethinq-1.0.0-rc.11--timeout-21600/'
maclaren_tsv = os.path.join(maclaren_dir, 'demographics.tsv')

In [6]:
# Build the MacLaren dataframes from the local copy of the pipeline output.
# Volumetric data can be loaded either in mm^3 or as %ICV; both variants are
# loaded here. Every load yields two frames: the measurement values and the
# corresponding normative percentile estimates.

# Volumes in mm^3.
maclaren_vol_df, maclaren_vol_norm_df = load_dataset(
    maclaren_dir,
    maclaren_tsv,
    drop_subjects=[],
    vol_data_src='volume',
)

# Volumes as a percentage of intracranial volume.
maclaren_picv_df, maclaren_picv_norm_df = load_dataset(
    maclaren_dir,
    maclaren_tsv,
    drop_subjects=[],
    vol_data_src='volume_percent_icv',
)

Ignoring Subject (did it error out?) sub-01_run-02
Ignoring Subject (did it error out?) rethinq
Ignoring Subject (did it error out?) sub-01_run-09
Ignoring Subject (did it error out?) rethinq
Ignoring Subject (did it error out?) sub-01_run-08
Ignoring Subject (did it error out?) rethinq
Ignoring Subject (did it error out?) sub-01_run-24
Ignoring Subject (did it error out?) rethinq
Ignoring Subject (did it error out?) sub-01_run-13
Ignoring Subject (did it error out?) rethinq
Ignoring Subject (did it error out?) sub-01_run-16
Ignoring Subject (did it error out?) rethinq
Ignoring Subject (did it error out?) sub-01_run-14
Ignoring Subject (did it error out?) rethinq
Ignoring Subject (did it error out?) sub-01_run-32
Ignoring Subject (did it error out?) rethinq
Ignoring Subject (did it error out?) sub-01_run-01
Ignoring Subject (did it error out?) rethinq
Ignoring Subject (did it error out?) sub-01_run-36
Ignoring Subject (did it error out?) rethinq
Ignoring Subject (did it error out?) sub

In [31]:
# Full bilateral structure list, assembled from its region groups. The
# resulting order (and content) matches the flat, alphabetically sorted list
# this cell used to spell out.
# NOTE(review): the next cell assigns structs_of_interest again, so on a
# top-to-bottom run this value is replaced before it is used.
_bilateral_subcortical = [
    'Amygdala',
    'Caudate',
    'Cerebellum',
    'Hippocampus',
    'Lateral-Ventricle',
    'Putamen',
    'Thalamus',
    'White-Matter',
]
_bilateral_lobes = ['cortex', 'frontal', 'occipital', 'parietal', 'temporal']

structs_of_interest = (
    ['BrainSegVolNotVentSurf']
    + ['Left-' + region for region in _bilateral_subcortical]
    + ['Right-' + region for region in _bilateral_subcortical]
    + ['TotalGrayVol', 'White-Matter']
    + ['lh_' + lobe + '_volume' for lobe in _bilateral_lobes]
    + ['rh_' + lobe + '_volume' for lobe in _bilateral_lobes]
)

In [7]:
# Whole-brain measures plus left-hemisphere structures only, assembled from
# three groups. The order is deliberate (whole-brain, cortical lobes, then
# subcortical) and is identical to the flat list this cell used to spell out.
_whole_brain_measures = ['BrainSegVolNotVentSurf', 'TotalGrayVol', 'White-Matter']
_left_lobes = ['cortex', 'frontal', 'parietal', 'occipital', 'temporal']
_left_subcortical = [
    'White-Matter',
    'Lateral-Ventricle',
    'Hippocampus',
    'Amygdala',
    'Caudate',
    'Putamen',
    'Thalamus',
    'Cerebellum',
]

structs_of_interest = (
    list(_whole_brain_measures)
    + ['lh_' + lobe + '_volume' for lobe in _left_lobes]
    + ['Left-' + region for region in _left_subcortical]
)

In [33]:
# Names of the demographics.tsv columns holding session and subject info.
session_col = 'session'
subject_col = 'subject_num'

# Sessions 1 through 20 and subjects 2 and 3 are the ones analysed.
session_list = list(range(1, 21))
subject_list = [2, 3]

# Positional arguments shared by every calc_cvs call below; only the input
# dataframe and the method differ between calls.
_cv_args = (subject_list, session_list, subject_col, session_col, structs_of_interest)

# The MacLaren dataset can be processed by either the maclaren method or the
# generalized gluer method. As a sanity check, both should give the same
# results for this dataset.
cvs_macmethod = calc_cvs(maclaren_vol_df, *_cv_args, method='maclaren')
cvs_gluemethod = calc_cvs(maclaren_vol_df, *_cv_args, method='gluer')

# Same pair of computations, fed with percent-ICV data instead of mm^3.
cvs_macmethod_icv = calc_cvs(maclaren_picv_df, *_cv_args, method='maclaren')
cvs_gluemethod_icv = calc_cvs(maclaren_picv_df, *_cv_args, method='gluer')

In [11]:
# Short list of structure names without a hemisphere prefix.
# NOTE(review): this reassigns structs_of_interest, so on a top-to-bottom run
# the monte_carlo_perm_test cell further down receives this five-item list,
# which does not match the columns in that cell's recorded output. Confirm
# which list is intended there.
structs_of_interest = [
    'Hippocampus',
    'Lateral-Ventricles',
    'Amygdala',
    'Putamen',
    'Caudate',
]

# Show the loaded volume dataframe as this cell's output.
maclaren_vol_df

Unnamed: 0,age,sex,manufacturer,field_strength,diagnosis,file_type,scan_time,scan_date,subject_num,source,...,rh_rostralanteriorcingulate_volume,rh_rostralmiddlefrontal_volume,rh_superiorfrontal_volume,rh_superiorparietal_volume,rh_superiortemporal_volume,rh_supramarginal_volume,rh_temporal_volume,rh_temporalpole_volume,rh_transversetemporal_volume,sbTIV
sub-01_run-05,26,M,GE,3,HC,nifti,1413,19700101,1,https://openneuro.org/datasets/ds000239/versio...,...,2517.0,17946.0,20851.0,14645.0,13458.0,8584.0,58748.0,2594.0,824.0,1.561225e+06
sub-01_run-06,26,M,GE,3,HC,nifti,1422,19700101,1,https://openneuro.org/datasets/ds000239/versio...,...,2396.0,17397.0,20435.0,13634.0,13362.0,8567.0,57315.0,2183.0,760.0,1.557921e+06
sub-01_run-07,26,M,GE,3,HC,nifti,1305,19700101,1,https://openneuro.org/datasets/ds000239/versio...,...,2606.0,17939.0,20302.0,14547.0,13723.0,8313.0,58632.0,2125.0,834.0,1.548234e+06
sub-01_run-12,26,M,GE,3,HC,nifti,2144,19700101,1,https://openneuro.org/datasets/ds000239/versio...,...,2558.0,16709.0,20277.0,13922.0,13469.0,8148.0,58091.0,2358.0,809.0,1.568331e+06
sub-01_run-17,26,M,GE,3,HC,nifti,1928,19700101,1,https://openneuro.org/datasets/ds000239/versio...,...,2580.0,17981.0,20854.0,12982.0,13549.0,8929.0,57966.0,2303.0,839.0,1.566575e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
sub-03_run-36,30,F,GE,3,HC,nifti,850,19700101,3,https://openneuro.org/datasets/ds000239/versio...,...,1564.0,16563.0,22432.0,12190.0,11291.0,10795.0,55313.0,2199.0,946.0,1.418517e+06
sub-03_run-37,30,F,GE,3,HC,nifti,839,19700101,3,https://openneuro.org/datasets/ds000239/versio...,...,1614.0,16669.0,23076.0,12174.0,11444.0,10871.0,54945.0,2361.0,932.0,1.417778e+06
sub-03_run-38,30,F,GE,3,HC,nifti,848,19700101,3,https://openneuro.org/datasets/ds000239/versio...,...,1533.0,16670.0,22464.0,12229.0,11349.0,10805.0,54956.0,2209.0,946.0,1.417339e+06
sub-03_run-39,30,F,GE,3,HC,nifti,1507,19700101,3,https://openneuro.org/datasets/ds000239/versio...,...,1515.0,16436.0,22687.0,12256.0,11528.0,10844.0,55210.0,2184.0,913.0,1.418929e+06


## This should be comparable to Table 1 in [1]

In [34]:
# Number of iterations handed to the permutation test via n_itrs.
n = 1000

# NOTE(review): no random seed is set in this cell; if monte_carlo_perm_test
# draws random permutations, the p-values will differ between runs. Confirm
# whether it seeds internally.
monte_carlo_perm_test(
    maclaren_vol_df,
    subject_list,
    session_list,
    subject_col,
    session_col,
    structs_of_interest,
    n_itrs=n,
    method='gluer',
)

Unnamed: 0,BrainSegVolNotVentSurf,TotalGrayVol,White-Matter,lh_cortex_volume,lh_frontal_volume,lh_parietal_volume,lh_occipital_volume,lh_temporal_volume,Left-White-Matter,Left-Lateral-Ventricle,Left-Hippocampus,Left-Amygdala,Left-Caudate,Left-Putamen,Left-Thalamus,Left-Cerebellum
mean-vol,1112280.0,604897.9,481112.5875,252287.775,95962.275,54824.475,24258.2625,58952.5625,238154.65,8567.565,4289.37375,1552.05,3417.185,5642.37,6592.445,67584.10375
total-cov,0.4137697,0.677517,0.486445,1.152907,1.303041,1.878735,1.607167,2.02355,0.610462,1.577902,1.099042,1.994225,1.27964,2.168283,0.944958,0.706172
session-cov,0.3274156,0.670641,0.44774,0.940043,0.979732,1.389655,1.192399,1.331097,0.508252,0.967618,1.247299,1.922322,1.12534,1.765608,0.861241,0.319214
abs-diff-cov,0.08635402,0.006876,0.038705,0.212864,0.323309,0.48908,0.414768,0.692453,0.10221,0.610283,0.148257,0.071902,0.1543,0.402675,0.083717,0.386958
p-vals,0.012,0.892,0.364,0.016,0.003,0.0,0.002,0.0,0.048,0.0,0.095,0.678,0.122,0.014,0.27,0.0
