# Description

This notebook creates the swarm jobs that run CPM for 100 iterations on the real data and for 10,000 iterations on randomized data. 

This happens separately for each question in the SNYCQ.

In [1]:
import os.path as osp
import os
from datetime import datetime
import getpass
from utils.basics import get_sbj_scan_list
from utils.basics import PRJ_DIR, SCRIPTS_DIR, RESOURCES_CPM_DIR, DATA_DIR, FB_400ROI_ATLAS_NAME
from cpm.cpm import read_fc_matrices

In [2]:
# CPM configuration shared by every swarm job generated in this notebook
CPM_NITERATIONS      = 100                      # Number of iterations on real data (to evaluate robustness against fold generation)
CPM_NULL_NITERATIONS = 10000                    # Number of iterations used to build a null distribution
CORR_TYPE            = 'pearson'                # Correlation type to use on the edge-selection step
E_SUMMARY_METRIC     = 'sum'                    # How to summarize across selected edges on the final model
E_THR_P              = 0.01                     # Threshold used on the edge-selection step
SPLIT_MODE           = 'subject_aware'          # Split mode for cross validation
# Label used in swarm/log file names and output folders. It is derived from the two
# settings it describes so that the label cannot drift away from the parameters
# actually passed to the jobs (evaluates to 'pearson_sum' with the values above).
MODEL_TYPE           = '{corr}_{metric}'.format(corr=CORR_TYPE, metric=E_SUMMARY_METRIC)
CONFOUNDS            = 'conf_not_residualized'  # Label only: in this notebook it is used solely to build file and folder names
ATLAS                = FB_400ROI_ATLAS_NAME     # Atlas name used to tag the FC file, swarm files, logs and output folders

In [3]:
# The login name of whoever runs the notebook is used later to build per-user folders
username = getpass.getuser()
print(f'++ INFO: user working now --> {username}')

++ INFO: user working now --> javiergc


***

## 0. Prepare all necessary data for the CPM

1. Create resources folder for CPM analyses

In [4]:
# Create the CPM resources folder the first time the notebook runs; later runs are silent
cpm_resources_missing = not osp.exists(RESOURCES_CPM_DIR)
if cpm_resources_missing:
    print(f'++ INFO: Creating resources folder for CPM analyses [{RESOURCES_CPM_DIR}]')
    os.makedirs(RESOURCES_CPM_DIR)

2. Load list of scans that passed all QAs

In [5]:
# Retrieve the subject list, the scan list and the SNYCQ table for the 'post_motion' selection
sbj_list, scan_list, snycq_df = get_sbj_scan_list(
    when='post_motion',
    return_snycq=True,
)

++ [post_motion] Number of subjects: 133 subjects
++ [post_motion] Number of scans:    471 scans


In [6]:
# Every column of the SNYCQ table becomes one prediction target (one set of swarm files each)
targets = [question for question in snycq_df.columns]
print(targets)

['Positive', 'Negative', 'Future', 'Past', 'Myself', 'People', 'Surroundings', 'Vigilance', 'Images', 'Words', 'Specific', 'Intrusive']


3. Load FC data into memory

In [7]:
# Load FC for every scan using the atlas selected in the configuration cell (ATLAS).
# The saved FC file and all swarm paths are labelled with ATLAS, so the data must be
# loaded with that same name; otherwise changing ATLAS would mislabel the saved data.
fc_data = read_fc_matrices(scan_list, DATA_DIR, ATLAS)

Scan: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 471/471 [00:14<00:00, 31.66it/s]


4. Save FC data in vectorized form for all scans into a single file for easy access for batch jobs

In [9]:
# Write the FC data to a single atlas-tagged CSV inside the CPM resources folder;
# this is the file handed to the batch jobs through FC_PATH.
out_path = osp.join(RESOURCES_CPM_DIR, f'fc_data_{ATLAS}.csv')
fc_data.to_csv(out_path)
print(f'++ INFO: FC data saved to disk [{out_path}]')

++ INFO: FC data saved to disk [/data/SFIMJGC_Introspec/2023_fc_introspection/code/fc_introspection/resources/cpm/fc_data_Schaefer2018_400Parcels_7Networks_AAL2.csv]


5. Save SNYCQ in the cpm resources folder

In [10]:
# Write the SNYCQ table next to the FC data; this is the file handed to the
# batch jobs through BEHAV_PATH.
out_path = osp.join(RESOURCES_CPM_DIR, 'behav_data.csv')
snycq_df.to_csv(out_path)
print(f'++ INFO: Behavioral data saved to disk [{out_path}]')

++ INFO: Behavioral data saved to disk [/data/SFIMJGC_Introspec/2023_fc_introspection/code/fc_introspection/resources/cpm/behav_data.csv]


***
## 1. Swarm Jobs for the real data

We will generate separate swarm files per question. Similarly we will separate the swarm jobs that are for computations on real data and those that are for the generation of the null distribution.

In [11]:
# User-specific folders
# =====================
# Both folders embed the current username in their name and are specific to this
# notebook (S15).
swarm_folder = osp.join(PRJ_DIR,'SwarmFiles.{username}'.format(username=username),'S15')
logs_folder  = osp.join(PRJ_DIR,'Logs.{username}'.format(username=username),'S15')
# Per-question lookup tables: question name --> swarm file path / log directory path
swarm_path,logdir_path={},{}

if not osp.exists(swarm_folder):
    os.makedirs(swarm_folder)
    print('++ INFO: New folder for swarm files created [%s]' % swarm_folder)

if not osp.exists(logs_folder):
    os.makedirs(logs_folder)
    # Report the folder that was actually created (the logs folder, not the swarm folder)
    print('++ INFO: New folder for log files created [%s]' % logs_folder)

# Paths for the real-data jobs: one swarm file and one log directory per question
for TARGET in targets:    
    swarm_path[TARGET]  = osp.join(swarm_folder,'S15_CPM-{atlas}-real-{sm}-{conf}-{mt}-{target}.SWARM.sh'.format(atlas=ATLAS,sm=SPLIT_MODE,conf=CONFOUNDS,mt=MODEL_TYPE, target=TARGET))
    logdir_path[TARGET] = osp.join(logs_folder, 'S15_CPM-{atlas}-real-{sm}-{conf}-{mt}-{target}.logs'.format(atlas=ATLAS,sm=SPLIT_MODE,conf=CONFOUNDS,mt=MODEL_TYPE, target=TARGET))

In [12]:
# Make sure each question has its log directory (real-data jobs)
# ==============================================================
# Directories that already exist are skipped silently; only new ones are reported.
for TARGET in targets:
    target_logdir = logdir_path[TARGET]
    if osp.exists(target_logdir):
        continue
    os.makedirs(target_logdir)
    print(f'++ INFO: New folder for log files created [{target_logdir}]')

++ INFO: New folder for log files created [/data/SFIMJGC_Introspec/2023_fc_introspection/Logs.javiergc/S15/S15_CPM-Schaefer2018_400Parcels_7Networks_AAL2-real-subject_aware-conf_not_residualized-pearson_sum-Positive.logs]
++ INFO: New folder for log files created [/data/SFIMJGC_Introspec/2023_fc_introspection/Logs.javiergc/S15/S15_CPM-Schaefer2018_400Parcels_7Networks_AAL2-real-subject_aware-conf_not_residualized-pearson_sum-Negative.logs]
++ INFO: New folder for log files created [/data/SFIMJGC_Introspec/2023_fc_introspection/Logs.javiergc/S15/S15_CPM-Schaefer2018_400Parcels_7Networks_AAL2-real-subject_aware-conf_not_residualized-pearson_sum-Future.logs]
++ INFO: New folder for log files created [/data/SFIMJGC_Introspec/2023_fc_introspection/Logs.javiergc/S15/S15_CPM-Schaefer2018_400Parcels_7Networks_AAL2-real-subject_aware-conf_not_residualized-pearson_sum-Past.logs]
++ INFO: New folder for log files created [/data/SFIMJGC_Introspec/2023_fc_introspection/Logs.javiergc/S15/S15_CPM-Sch

In [13]:
# Write one swarm file per question for the real-data runs.
# Each file holds two header comment lines (creation time and the suggested swarm
# command) followed by one job line per iteration.
# NOTE: swarm_path/logdir_path must still hold the 'real' paths at this point; the
# null-distribution section below rebinds the same dictionary entries.
for TARGET in targets:
    # All iterations of a question share one output folder, so it is resolved and
    # created once per question instead of once per iteration.
    out_dir = osp.join(RESOURCES_CPM_DIR,'swarm_outputs','real',ATLAS,SPLIT_MODE,CONFOUNDS,MODEL_TYPE,TARGET)
    os.makedirs(out_dir, exist_ok=True)
    # The context manager closes the swarm file even if one of the writes fails
    with open(swarm_path[TARGET], "w") as swarm_file:
        # Log the date and time when the SWARM file is created
        swarm_file.write('#Create Time: %s' % datetime.now().strftime("%d/%m/%Y %H:%M:%S"))
        swarm_file.write('\n')
        # Insert comment line with SWARM command
        swarm_file.write('#swarm -f {swarm_path} -g 8 -t 8 -b 10 --time 00:15:00 --partition quick,norm --logdir {logdir_path}'.format(swarm_path=swarm_path[TARGET],logdir_path=logdir_path[TARGET]))
        swarm_file.write('\n')
        for n_iter in range(CPM_NITERATIONS):
            # One job per iteration; RANDOMIZE_BEHAVIOR=False marks these as real-data runs
            swarm_file.write("export BEHAV_PATH={behav_path} FC_PATH={fc_path} OUT_DIR={output_dir} BEHAVIOR={behavior} NUM_FOLDS={k} NUM_ITER={n_iter} CORR_TYPE={corr_type} E_SUMMARY_METRIC={e_summary_metric} E_THR_P={e_thr_p} SPLIT_MODE={split_mode} VERBOSE=True RANDOMIZE_BEHAVIOR=False; sh {scripts_folder}/S15_cpm_batch.sh".format(scripts_folder = SCRIPTS_DIR,
                               behav_path       = osp.join(RESOURCES_CPM_DIR,'behav_data.csv'),
                               fc_path          = osp.join(RESOURCES_CPM_DIR,f'fc_data_{ATLAS}.csv'),
                               output_dir       = out_dir,
                               behavior         = TARGET,
                               k                = 10,          # number of cross-validation folds (NUM_FOLDS)
                               n_iter           = n_iter + 1,  # iterations are numbered starting at 1
                               corr_type        = CORR_TYPE,
                               split_mode       = SPLIT_MODE,
                               e_summary_metric = E_SUMMARY_METRIC,
                               e_thr_p          = E_THR_P))
            swarm_file.write('\n')

***
## 2. Swarm jobs for the Null Distributions

In [14]:
# Point the per-question path tables at the null-distribution swarm files and log folders.
# NOTE: this overwrites the 'real' entries stored earlier in swarm_path/logdir_path.
for TARGET in targets:
    null_job_stem = f'S15_CPM-{ATLAS}-null-{SPLIT_MODE}-{CONFOUNDS}-{MODEL_TYPE}-{TARGET}'
    swarm_path[TARGET]  = osp.join(swarm_folder, null_job_stem + '.SWARM.sh')
    logdir_path[TARGET] = osp.join(logs_folder, null_job_stem + '.logs')

In [15]:
# Make sure each question has its log directory (null-distribution jobs)
# ======================================================================
# Directories that already exist are skipped silently; only new ones are reported.
for TARGET in targets:
    target_logdir = logdir_path[TARGET]
    if osp.exists(target_logdir):
        continue
    os.makedirs(target_logdir)
    print(f'++ INFO: New folder for log files created [{target_logdir}]')

++ INFO: New folder for log files created [/data/SFIMJGC_Introspec/2023_fc_introspection/Logs.javiergc/S15/S15_CPM-Schaefer2018_400Parcels_7Networks_AAL2-null-subject_aware-conf_not_residualized-pearson_sum-Positive.logs]
++ INFO: New folder for log files created [/data/SFIMJGC_Introspec/2023_fc_introspection/Logs.javiergc/S15/S15_CPM-Schaefer2018_400Parcels_7Networks_AAL2-null-subject_aware-conf_not_residualized-pearson_sum-Negative.logs]
++ INFO: New folder for log files created [/data/SFIMJGC_Introspec/2023_fc_introspection/Logs.javiergc/S15/S15_CPM-Schaefer2018_400Parcels_7Networks_AAL2-null-subject_aware-conf_not_residualized-pearson_sum-Future.logs]
++ INFO: New folder for log files created [/data/SFIMJGC_Introspec/2023_fc_introspection/Logs.javiergc/S15/S15_CPM-Schaefer2018_400Parcels_7Networks_AAL2-null-subject_aware-conf_not_residualized-pearson_sum-Past.logs]
++ INFO: New folder for log files created [/data/SFIMJGC_Introspec/2023_fc_introspection/Logs.javiergc/S15/S15_CPM-Sch

In [16]:
# Write one swarm file per question for the null-distribution runs.
# Each file holds two header comment lines (creation time and the suggested swarm
# command) followed by one job line per null iteration.
# NOTE: swarm_path/logdir_path must hold the 'null' paths set at the start of this section.
for TARGET in targets:
    # All iterations of a question share one output folder, so it is resolved and
    # created once per question instead of once per iteration.
    out_dir = osp.join(RESOURCES_CPM_DIR,'swarm_outputs','null',ATLAS,SPLIT_MODE,CONFOUNDS,MODEL_TYPE,TARGET)
    os.makedirs(out_dir, exist_ok=True)
    # The context manager closes the swarm file even if one of the writes fails
    with open(swarm_path[TARGET], "w") as swarm_file:
        # Log the date and time when the SWARM file is created
        swarm_file.write('#Create Time: %s' % datetime.now().strftime("%d/%m/%Y %H:%M:%S"))
        swarm_file.write('\n')
        # Insert comment line with SWARM command
        swarm_file.write('#swarm -f {swarm_path} -g 8 -t 8 -b 50 --time 00:04:30 --partition quick,norm --logdir {logdir_path}'.format(swarm_path=swarm_path[TARGET],logdir_path=logdir_path[TARGET]))
        swarm_file.write('\n')
        for n_iter in range(CPM_NULL_NITERATIONS):
            # One job per iteration; RANDOMIZE_BEHAVIOR=True marks these as null-distribution runs
            swarm_file.write("export BEHAV_PATH={behav_path} FC_PATH={fc_path} OUT_DIR={output_dir} BEHAVIOR={behavior} NUM_FOLDS={k} NUM_ITER={n_iter} CORR_TYPE={corr_type} E_SUMMARY_METRIC={e_summary_metric} E_THR_P={e_thr_p} SPLIT_MODE={split_mode} VERBOSE=True RANDOMIZE_BEHAVIOR=True; sh {scripts_folder}/S15_cpm_batch.sh".format(scripts_folder = SCRIPTS_DIR,
                               behav_path       = osp.join(RESOURCES_CPM_DIR,'behav_data.csv'),
                               fc_path          = osp.join(RESOURCES_CPM_DIR,f'fc_data_{ATLAS}.csv'),
                               output_dir       = out_dir,
                               behavior         = TARGET,
                               k                = 10,          # number of cross-validation folds (NUM_FOLDS)
                               n_iter           = n_iter + 1,  # iterations are numbered starting at 1
                               corr_type        = CORR_TYPE,
                               split_mode       = SPLIT_MODE,
                               e_summary_metric = E_SUMMARY_METRIC,
                               e_thr_p          = E_THR_P))
            swarm_file.write('\n')

```bash
python ./S15b_GatherSwarmResults.py  -i /data/SFIMJGC_Introspec/2023_fc_introspection/code/fc_introspection/resources/cpm/swarm_outputs/real/Schaefer2018_400Parcels_7Networks_AAL2/subject_aware/conf_not_residualized/pearson_sum/ -o /data/SFIMJGC_Introspec/2023_fc_introspection/code/fc_introspection/resources/cpm/real-Schaefer2018_400Parcels_7Networks_AAL2-subject_aware-conf_not_residualized-pearson_sum.pkl -n 100 -c pearson

python ./S15b_GatherSwarmResults.py  -i /data/SFIMJGC_Introspec/2023_fc_introspection/code/fc_introspection/resources/cpm/swarm_outputs/null/Schaefer2018_400Parcels_7Networks_AAL2/subject_aware/conf_not_residualized/pearson_sum/ -o /data/SFIMJGC_Introspec/2023_fc_introspection/code/fc_introspection/resources/cpm/null-Schaefer2018_400Parcels_7Networks_AAL2-subject_aware-conf_not_residualized-pearson_sum.pkl -n 10000 -c pearson
```