# Test code and scratch space for MR preproc dashboard 

1. Use pickle to load subject info and expected preproc steps
2. Check directories to see output files 
3. Check logs to identify issues

**Expected use case:** 
    Run this script after preprocessing is complete to provide info on each subject

In [4]:
import pandas as pd
import numpy as np
import sys
import os

In [6]:
# Data paths: project root, its data folder, and the preprocessing pipeline checkout
proj_dir = '/Users/nikhil/projects/MR_preproc_dash/'
data_dir = '{}data/'.format(proj_dir)

preproc_pipeline_dir = '/Users/nikhil/code/git_repos/nist_mni_pipelines/'
# Put the pipeline checkout on the import path exactly once
# (presumably so the pipeline's pickled objects can be loaded -- confirm)
if sys.path.count(preproc_pipeline_dir) == 0:
    sys.path.append(preproc_pipeline_dir)

# Step 1: read the pickle to get subject-specific parameters and preproc stages

In [83]:
# Output directories looked for under every timepoint directory
output_dirs = ['clp','clp2','stx','stx2','vbm','cls','add','vol','lng']

# File-name prefixes expected inside each output directory
# (files are looked up as <prefix>_<subject>_<timepoint>_t1.mnc).
# NOTE(review): 'csl' may be a typo for 'cls' -- confirm against the pipeline's real file names
task_file_names_dict = {
    'clp': ['clp','den','nuc'],
    'clp2': ['clp2'],
    'cls': ['csl','lob'],
    'stx': ['stx','nsstx'],
    'stx2': ['stx2'],
}

# Load the pipeline state saved for this subject
pipeline_data_pickle = pd.read_pickle(data_dir + 'logs/long_pipeline_052_S_4807.pickle')

# Build the per-timepoint table, then refine the status columns:
# first by directory presence, then by file presence
df = parse_pickle(pipeline_data_pickle,output_dirs)
df, missing_tp, missing_dir = check_output_dirs(df,output_dirs)
df, missing_file = check_output_files(df,task_file_names_dict)
df

Unnamed: 0,subject_idx,subject_dir,tp_idx,denoise,mask_N3,advanced_N4,mri3T,model_name,beast_dir,run_skull_registration,...,nsstx,clp,clp2,stx,stx2,vbm,cls,add,vol,lng
0,052_S_4807,/data/ipl/scratch15/Mahsa/ADNI/LP/3T//052_S_4807/,20121019,expected,na,expected,expected,model_t1w,/ipl/quarantine/models/beast,na,...,expected,file_exists,file_exists,file_exists,file_exists,dir_exists,file_missing,dir_missing,dir_exists,dir_exists
1,052_S_4807,/data/ipl/scratch15/Mahsa/ADNI/LP/3T//052_S_4807/,20120727,expected,na,expected,expected,model_t1w,/ipl/quarantine/models/beast,na,...,expected,timepoint_missing,timepoint_missing,timepoint_missing,timepoint_missing,timepoint_missing,timepoint_missing,timepoint_missing,timepoint_missing,timepoint_missing
2,052_S_4807,/data/ipl/scratch15/Mahsa/ADNI/LP/3T//052_S_4807/,20130215,expected,na,expected,expected,model_t1w,/ipl/quarantine/models/beast,na,...,expected,file_exists,file_exists,file_exists,file_exists,dir_exists,file_missing,dir_missing,dir_exists,dir_exists
3,052_S_4807,/data/ipl/scratch15/Mahsa/ADNI/LP/3T//052_S_4807/,20150804,expected,na,expected,expected,model_t1w,/ipl/quarantine/models/beast,na,...,expected,file_missing,file_exists,file_missing,file_exists,dir_missing,file_missing,dir_missing,dir_exists,dir_exists
4,052_S_4807,/data/ipl/scratch15/Mahsa/ADNI/LP/3T//052_S_4807/,20140807,expected,na,expected,expected,model_t1w,/ipl/quarantine/models/beast,na,...,expected,file_exists,file_exists,file_exists,file_exists,dir_exists,file_missing,dir_missing,dir_exists,dir_exists
5,052_S_4807,/data/ipl/scratch15/Mahsa/ADNI/LP/3T//052_S_4807/,20130813,expected,na,expected,expected,model_t1w,/ipl/quarantine/models/beast,na,...,expected,file_exists,file_exists,file_exists,file_exists,dir_exists,file_missing,dir_missing,dir_exists,dir_exists


In [84]:
# Summary report: subject id and missing timepoints first, then each remaining
# list preceded by a blank separator line
print('subject: {}'.format(df['subject_idx'].values[0]))
print('missing timepoints: {}'.format(missing_tp))
for template, entries in (('missing dir: {}', missing_dir),
                          ('missing files: {}', missing_file)):
    print('')
    print(template.format(entries))


subject: 052_S_4807
missing timepoints: ['20120727']

missing dir: ['20121019/add', '20130215/add', '20150804/stx', '20150804/vbm', '20150804/cls', '20150804/add', '20140807/add', '20130813/add']

missing files: ['20121019/cls/csl_052_S_4807_20121019_t1.mnc', '20121019/cls/lob_052_S_4807_20121019_t1.mnc', '20130215/cls/csl_052_S_4807_20130215_t1.mnc', '20130215/cls/lob_052_S_4807_20130215_t1.mnc', '20150804/clp/clp_052_S_4807_20150804_t1.mnc', '20150804/clp/nuc_052_S_4807_20150804_t1.mnc', '20150804/cls/csl_052_S_4807_20150804_t1.mnc', '20150804/cls/lob_052_S_4807_20150804_t1.mnc', '20150804/stx/stx_052_S_4807_20150804_t1.mnc', '20150804/stx/nsstx_052_S_4807_20150804_t1.mnc', '20140807/cls/csl_052_S_4807_20140807_t1.mnc', '20140807/cls/lob_052_S_4807_20140807_t1.mnc', '20130813/cls/csl_052_S_4807_20130813_t1.mnc', '20130813/cls/lob_052_S_4807_20130813_t1.mnc']


In [82]:
# Parse subject -> timepoint info
def parse_pickle(pkl, output_dirs):
    """Build a table with one row per timepoint from a loaded pipeline pickle.

    Parameters
    ----------
    pkl : mapping-like object
        Loaded pipeline state. It must support ``len()`` and ``keys()`` (one
        key per timepoint) and expose the attributes read below (``id``,
        ``patientdir``, ``denoise``, ``mask_n3``, ``n4``, ``donl``,
        ``dolngcls``, ``mri3T``, ``beastdir``, ``modelname``, ``skullreg``,
        ``beastresolution``, ``pipeline_version``).
    output_dirs : list of str
        Names of the per-timepoint output directories; each becomes a column.

    Returns
    -------
    pandas.DataFrame
        One row per timepoint, indexed 0..n-1. Boolean values are rendered as
        'expected' (True) / 'na' (False). Columns named in ``output_dirs``
        that are not among the commonly run tasks are left empty (NaN).
    """
    # the task columns represent the current state of the task (na/expected/completed/failed)
    info_cols = ['subject_idx','subject_dir','tp_idx','denoise','mask_N3','advanced_N4','mri3T','model_name',
                'beast_dir','run_skull_registration','beastresolution','number_of_timepoints','pipeline_version',
                'donl','dolngcls','nuc','den','lob','nsstx']

    # Commonly done preproc tasks: flagged as expected for every timepoint
    common_tasks = ['nuc','den','clp','clp2','stx','nsstx','stx2','vbm','cls','lob','add','vol','lng']

    # Declared column order first; any task column the caller did not list is
    # appended so that no task flag is silently dropped.
    columns = []
    for col in info_cols + list(output_dirs) + common_tasks:
        if col not in columns:
            columns.append(col)

    number_of_tp = len(pkl)
    rows = []
    for tp in pkl.keys():
        row = {
            'subject_idx': pkl.id,
            'subject_dir': pkl.patientdir,
            'tp_idx': tp,
            'denoise': pkl.denoise,
            'mask_N3': pkl.mask_n3,
            'advanced_N4': pkl.n4,
            'donl': pkl.donl,
            'dolngcls': pkl.dolngcls,
            'mri3T': pkl.mri3T,
            'beast_dir': pkl.beastdir,
            'model_name': pkl.modelname,
            'run_skull_registration': pkl.skullreg,
            'beastresolution': pkl.beastresolution,
            'number_of_timepoints': number_of_tp,
            'pipeline_version': pkl.pipeline_version,
        }
        for task in common_tasks:
            row[task] = True
        rows.append(row)

    # Build the frame in one go (object dtype, so mixed str/bool/number cells
    # are stored as-is) instead of growing it cell by cell.
    subject_df = pd.DataFrame(rows, columns=columns, dtype=object)

    # Translate the boolean flags once, after all rows exist, rather than
    # re-scanning the whole frame on every timepoint.
    return subject_df.replace({True:'expected',False:'na'})
    

# Check directory tree created at the beginning of the pipeline (catch permission failures)
def check_output_dirs(subject_df, output_dirs, subject_dir=None):
    """Record, per timepoint, which expected output directories exist on disk.

    Parameters
    ----------
    subject_df : pandas.DataFrame
        Table with a 'tp_idx' column (timepoint directory names, as strings)
        and one column per entry of ``output_dirs``. It is updated in place.
    output_dirs : list of str
        Directory names expected under every timepoint directory.
    subject_dir : str, optional
        Directory holding the timepoint directories. When omitted, the local
        test location ``data_dir + '052_S_4807/'`` is used; on the BIC system
        pass ``subject_df['subject_dir'].values[0]`` instead.

    Returns
    -------
    tuple
        ``(subject_df, missing_tp, missing_dir)``: the updated table, the
        timepoints whose directory is absent, and the absent output
        directories as '<timepoint>/<dir>' strings.
    """
    if subject_dir is None:
        subject_dir = data_dir + '052_S_4807/' #for local tests

    missing_tp = []
    missing_dir = []
    for tp in subject_df['tp_idx'].values:
        tp_rows = subject_df['tp_idx']==tp
        tp_path = os.path.join(subject_dir, tp)

        if not os.path.isdir(tp_path):
            # Whole timepoint absent: flag every output column at once
            missing_tp.append(tp)
            subject_df.loc[tp_rows,output_dirs] = 'timepoint_missing'
            continue

        for out_dir in output_dirs:
            if os.path.isdir(os.path.join(tp_path, out_dir)):
                status = 'dir_exists'
            else:
                status = 'dir_missing'
                missing_dir.append(tp + '/' + out_dir)
            subject_df.loc[tp_rows,out_dir] = status

    return subject_df, missing_tp, missing_dir

# Check output files created at each stage of the pipeline (catch processing errors)
def check_output_files(subject_df, task_file_names_dict, subject_dir=None):
    """Record, per timepoint, whether each directory holds all its expected files.

    Files are looked up as
    ``<subject_dir>/<timepoint>/<out_dir>/<prefix>_<subject>_<timepoint>_t1.mnc``.
    A directory is marked 'file_exists' only when every expected file is
    present; if any is absent it is marked 'file_missing'. Timepoints whose
    directory does not exist are skipped and keep their current status.

    Parameters
    ----------
    subject_df : pandas.DataFrame
        Table with 'subject_idx' and 'tp_idx' columns plus one column per key
        of ``task_file_names_dict``. It is updated in place. The subject id is
        taken from the first row.
    task_file_names_dict : dict
        Maps an output directory name to the list of file-name prefixes
        expected in it. Directories with an empty list are left untouched.
    subject_dir : str, optional
        Directory holding the timepoint directories. When omitted, the local
        test location ``data_dir + '052_S_4807/'`` is used; on the BIC system
        pass ``subject_df['subject_dir'].values[0]`` instead.

    Returns
    -------
    tuple
        ``(subject_df, missing_file)``: the updated table and the absent files
        as '<timepoint>/<dir>/<file>' strings.
    """
    missing_file = []
    if len(subject_df) == 0:
        # No rows means no subject id to read and nothing to check
        return subject_df, missing_file

    if subject_dir is None:
        subject_dir = data_dir + '052_S_4807/' #for local tests
    subject_idx = subject_df['subject_idx'].values[0]

    for tp in subject_df['tp_idx'].values:
        tp_path = os.path.join(subject_dir, tp)
        if not os.path.isdir(tp_path):
            continue

        tp_rows = subject_df['tp_idx']==tp
        for out_dir, prefixes in task_file_names_dict.items():
            if not prefixes:
                continue

            # Collect every absent file first, so the directory status reflects
            # all expected files rather than only the last one checked.
            absent = []
            for prefix in prefixes:
                file_name = '{}_{}_{}_t1.mnc'.format(prefix,subject_idx,tp)
                if not os.path.isfile(os.path.join(tp_path, out_dir, file_name)):
                    absent.append(tp + '/' + out_dir + '/' + file_name)

            subject_df.loc[tp_rows,out_dir] = 'file_missing' if absent else 'file_exists'
            missing_file.extend(absent)

    return subject_df, missing_file