# Setup 

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import subprocess
from datetime import date
import re
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('max_colwidth',500)

In [None]:
## EDIT THESE VARIABLES
# Tag appended to the swarm command file name and to the swarm log directory name.
analysis_version = "2017_08_25"
# Root directory from which the BIDS, mriqc helper-file and derivatives paths are derived.
project_dir = Path('/data/NNDSP') # needs to be pathlib.Path object
# Both values are kept as strings because they are concatenated directly into
# the mriqc command line (--n_procs / --ants-nthreads and --mem_gb) and are
# also interpolated into the swarm call (-t and -g).
ncpus = '10'
ram = '16'

In [None]:
# Input dataset that is handed to mriqc as its bids_dir argument.
bids_dir = project_dir.joinpath('bids_2017_07_14')

# Locations for helper files, mriqc derivatives, the shared work area and swarm logs.
mriqc_dir = project_dir.joinpath('anal/mriqc_files/other_files')
output_folder = project_dir / 'derivatives' / 'mriqc'
base_work_dir = output_folder.joinpath('work')
log_dir = mriqc_dir.joinpath('swarm_output_' + analysis_version)

# parents=True creates missing intermediate directories (several of these paths
# are nested more than one level below project_dir); exist_ok=True makes the
# cell safe to re-run and avoids the check-then-create race.
for required_dir in (mriqc_dir, output_folder, base_work_dir, log_dir):
    required_dir.mkdir(parents=True, exist_ok=True)

# File that will hold one mriqc command per line for the swarm submission.
swarm_path = mriqc_dir.joinpath('mriqc_' + analysis_version + '.cmd')
# Singularity image used to run mriqc.
sing_image = Path('/data/FMRIF/mriqc/images/mriqc_parsing2.img')
# sing_image = Path('/data/Hippo_hr/moa/anal/mriqc_files/other_files/poldracklab_mriqc-2017-07-26-9f304acebfe0.img')

In [None]:
# Switch the notebook's working directory to the project root; %pwd is called
# before and after the %cd so the change can be checked.
%pwd
%cd {project_dir}
%pwd

In [None]:
ls {output_folder}

# Generating subject list

In [None]:
# Collect every T1w anatomical image below the BIDS directory. The paths are
# sorted because glob order is arbitrary and the row order determines the line
# order of the swarm file written later.
scan_paths = sorted(p.as_posix() for p in bids_dir.glob('**/anat/*T1w.nii.gz'))
df_scans = pd.DataFrame(columns=['scan_path'], data=scan_paths)

# Pull the subject and run labels out of each path. A raw string is used so
# that \d reaches the regex engine instead of being treated as an (invalid)
# Python string escape.
scan_entities = df_scans.scan_path.str.extract(
    r'.*(?P<subject>sub-\d{2,4})/.*(?P<run>run-[a-zA-Z0-9]{1,8})_.*',
    expand=True)
df_scans = pd.concat([df_scans, scan_entities], axis=1)

df_scans.head()

# Running mriqc on all scans

### Function for generating commands

In [None]:
# usage: mriqc [-h] [--version]
#              [--participant_label PARTICIPANT_LABEL [PARTICIPANT_LABEL ...]]
#              [--session-id SESSION_ID [SESSION_ID ...]]
#              [--run-id RUN_ID [RUN_ID ...]] [--task-id TASK_ID [TASK_ID ...]]
#              [-m [{T1w,bold,T2w} [{T1w,bold,T2w} ...]]] [-w WORK_DIR]
#              [--report-dir REPORT_DIR] [--verbose-reports] [--write-graph]
#              [--dry-run] [--profile] [--use-plugin USE_PLUGIN] [--no-sub]
#              [--email EMAIL] [-v] [--webapi-url WEBAPI_URL]
#              [--webapi-port WEBAPI_PORT] [--upload-strict] [--n_procs N_PROCS]
#              [--mem_gb MEM_GB] [--testing] [-f] [--ica] [--hmc-afni]
#              [--hmc-fsl] [--fft-spikes-detector] [--fd_thres FD_THRES]
#              [--ants-nthreads ANTS_NTHREADS] [--ants-settings ANTS_SETTINGS]
#              [--deoblique] [--despike] [--start-idx START_IDX]
#              [--stop-idx STOP_IDX] [--correct-slice-timing]
#              bids_dir output_dir {participant,group} [{participant,group} ...]

def generate_mriqc_command(df_row,bids_root=None,output_folder=None,ncpus='4',ram='8',image=None,email='johnleenimh@gmail.com'):
    """Build the shell command that runs participant-level mriqc for one scan.

    The returned one-liner loads the singularity module, creates a per-scan
    work directory and a report directory inside it, and then runs mriqc
    inside the singularity image, following the pattern
    ``mriqc bids-root/ output-folder/ participant --participant-label S01``.

    Parameters
    ----------
    df_row : object with ``subject`` and ``run`` string attributes
        For example one row of a DataFrame passed through ``apply(axis=1)``.
    bids_root : pathlib.Path or str
        Directory passed to mriqc as its ``bids_dir`` argument.
    output_folder : pathlib.Path or str
        Directory passed to mriqc as its ``output_dir`` argument; the work
        directory is ``<output_folder>/work/<subject>_<run>``.
    ncpus : str or int
        Value used for both ``--ants-nthreads`` and ``--n_procs``.
    ram : str or int
        Value used for ``--mem_gb``.
    image : pathlib.Path, str or None
        Singularity image to execute. When None, the notebook-level
        ``sing_image`` is used, which is what this function always did before
        it honoured the parameter.
    email : str
        Address passed to mriqc's ``--email`` option.

    Returns
    -------
    str
        The complete command line, with its parts separated by ``;``.
    """
    if image is None:
        # Backward-compatible fallback to the global the old implementation read.
        image = sing_image

    output_folder = Path(output_folder)
    work_dir = output_folder.joinpath('work', '_'.join([df_row.subject, df_row.run]))
    report_dir = work_dir.joinpath('report')
    # str() lets callers pass numbers as well as the string values used so far.
    n_threads = str(ncpus)

    parts = [
        'module load singularity;',
        'mkdir ' + work_dir.as_posix() + ';',
        'mkdir ' + report_dir.as_posix() + ';',
        'singularity exec ' + Path(image).as_posix(),
        'mriqc',
        '--run-id ' + df_row.run,
        '--ants-nthreads ' + n_threads,
        '--n_procs ' + n_threads,
        '--mem_gb ' + str(ram),
        '--email ' + email,
        '--verbose-reports',
        '--write-graph',
        '--report-dir ' + report_dir.as_posix(),
        '-w ' + work_dir.as_posix(),
        Path(bids_root).as_posix(),
        output_folder.as_posix(),
        'participant',
        '--participant-label ' + df_row.subject,
    ]
    return ' '.join(parts)

### Generate mriqc commands

In [None]:
# Build one mriqc command per scan row and attach it as the 'cmd' column.
scan_commands = df_scans.apply(
    lambda scan_row: generate_mriqc_command(df_row=scan_row,
                                            bids_root=bids_dir,
                                            output_folder=output_folder,
                                            ncpus=ncpus,
                                            ram=ram,
                                            image=sing_image),
    axis=1)
df_sing = df_scans.assign(cmd=scan_commands)

# Different scans can yield the same command, so only the distinct commands
# are written to the swarm file (one per line).
unique_commands = df_sing.cmd.drop_duplicates()
swarm_path.write_text('\n'.join(unique_commands))

# Show the first ten lines of the file that was just written.
swarm_path.read_text().splitlines()[:10]

In [None]:
(len(df_sing.cmd),len(df_sing.cmd.drop_duplicates()))

###  Run swarm

In [None]:
swarm_path

In [None]:
log_dir

In [None]:
# Submit the swarm command file; the `!` assignment captures the lines the
# command prints as a list. Memory (-g) and threads (-t) reuse the same
# ram/ncpus values that were written into each mriqc command.
job_id = !swarm -f  {swarm_path} -g {ram} -t {ncpus} --logdir {log_dir} --time 96:00:00 --partition=nimh,norm
# Keep only the first printed line; it is used later to glob the log files
# (presumably it is the swarm job id -- confirm against the swarm output).
job_id = job_id[0]
job_id

Job ID of the swarm submission that ran successfully: 48276164

### Exploring potential issues with swarm.

In [None]:
files_of_interest = []

# One row per stderr log (*.e) in the swarm log directory that belongs to this job id.
error_log_paths = [x.as_posix() for x in log_dir.glob('*' + job_id + '*.e')]
df_error_files_paths = pd.DataFrame(error_log_paths, columns=['paths'])

# Extract the numeric index that follows the job id in the log file name and
# sort by it. The pattern is a raw string (so \d is not treated as a Python
# string escape) and the dot before the extension is escaped so it only
# matches a literal '.'.
df_error_files = (
    df_error_files_paths
    .assign(run=lambda df: (df.paths
                            .str.extract(r'/.*swarm_\d*_(\d*)\.e$', expand=False)
                            .astype(int)))
    .sort_values('run')
)

# Default to every log file when no specific subset was requested above.
if not files_of_interest:
    files_of_interest = list(range(len(df_error_files_paths)))


df_error_files.head()

Some swarm jobs failed (observed on the dashboard). The paths of the error logs are printed below:

In [None]:
print('\n\n\n'.join(np.array(df_error_files.paths)))

In [None]:
error_files = [Path(x).read_text() for x in np.array(df_error_files.paths)]

In [None]:
print('\n\n\n'.join(error_files))

In [None]:
output_files = [Path(x).with_suffix('.o').read_text() for x in np.array(df_error_files.paths)]

In [None]:
print('\n\n\n'.join(output_files[:2]))


In [None]:
!ls {output_folder.joinpath('reports')}|wc

# Assessing output of the mriqc swarm

In [None]:
# List every entry in the mriqc reports directory, one path per row.
report_paths = [p.as_posix() for p in output_folder.joinpath('reports').iterdir()]
df_html = pd.DataFrame(columns=['file'], data=report_paths)

In [None]:
# Parse the subject and run labels out of each report path. A raw string is
# used so that \d reaches the regex engine instead of being treated as an
# (invalid) Python string escape.
report_entities = df_html.file.str.extract(
    r'.*(?P<subject>sub-\d{2,4}).*(?P<run>run-[a-zA-Z0-9]{1,8})_.*',
    expand=True)

# Rebuild the frame from the 'file' column only, so that re-running this cell
# does not keep appending duplicate 'subject' / 'run' columns.
df_html = pd.concat([df_html[['file']], report_entities], axis=1)

df_html.head()

In [None]:
# Run mriqc's group analysis level inside the singularity image, using the
# same BIDS directory and output folder as the per-scan commands and the
# shared base work directory.
!module load singularity; singularity exec {sing_image.as_posix()} mriqc --n_procs 5 --mem_gb 30 --email johnleenimh@gmail.com --verbose-reports --write-graph -w {base_work_dir.as_posix()} {bids_dir.as_posix()} {output_folder} group

In [None]:
ls {output_folder}

In [None]:
# Load the per-modality tables found in the mriqc output folder.
t1w = pd.read_csv(output_folder / 'T1w.csv')
t2w = pd.read_csv(output_folder / 'T2w.csv')

In [None]:
t1w.head() 

In [None]:
# Rebuild the 'sub-XXXX' / 'run-XXX' labels from the numeric ids in the mriqc
# table so that they can be matched against the labels parsed from the scan paths.
t1w['subject'] = ['sub-{n:04d}'.format(n = subject_number) for subject_number in t1w.subject_id]
t1w['run'] = ['run-{n:03d}'.format(n = run_number) for run_number in t1w.run_id]

# Outer merge with an indicator column, so scans without an mriqc row (and
# mriqc rows without a scan) remain visible in the result.
df_qc = df_scans.merge(t1w, on=['subject', 'run'], how='outer', indicator=True)

In [None]:
df_qc.query("_merge != 'both'")