# Generate CLI commands to run `sbatch` on MareNostrum4

This helper notebook generates `sbatch` commands for running different **GRN inference** or **community analysis** pipelines at scale on MareNostrum 4. The commands are printed as cell outputs in the notebook; the user then has to copy them and run them in the command line.

## Contacts

For any problems/questions, please contact `mkriukov.job@gmail.com`.

## Setting up namespace

In [2]:
import os  # file system
from termcolor import colored  # colored text output

import pandas as pd  # tabular data

# Switch the working directory to the `scGRN_analysis` folder inside the user's home
# (presumably so that the local `scGRN` package imported below is found - confirm)
home_dir = os.path.expanduser('~')
os.chdir(os.path.expanduser('~/scGRN_analysis'))

# Project-local toolkit for GRN analysis; imported only after the chdir above
import scGRN


# Helpers that wrap text into ANSI colors, used to highlight parts of the printed commands
def cyan_color(x):
    """Return `x` colored cyan (used for sbatch flags)."""
    return colored(x, 'cyan')


def red_color(x):
    """Return `x` colored red (used for `sbatch` itself and script names)."""
    return colored(x, 'red')


def green_color(x):
    """Return `x` colored green (used for headings and highlighted arguments)."""
    return colored(x, 'green')


# Network flavours: all gene-gene connections, TF-target connections, enriched TF-target connections
_NET_TYPES = ['all', 'TF', 'ctx']

# Locations of the data files on the shared file system
_PROJ_HOME = '/gpfs/projects/bsc08/shared_projects/scGRN_analysis'  # also scGRN._PROJ_HOME
_FMETA = _PROJ_HOME + '/Data_home/data/GSE145926_RAW/metadata.tsv'  # also scGRN._META_FILE
_DATA_HOME = _PROJ_HOME + '/Data_home/res/covid_19'  # also scGRN._DATA_HOME
_GREASY_DIR = _PROJ_HOME + '/sbatch/greasy'
_TF_LIST_lambert = _PROJ_HOME + '/Data_home/data/TF_lists/lambert2018.txt'

# Metadata table, indexed by patient ID (one row per patient)
full_meta = scGRN.ana.get_meta(_DATA_HOME, _FMETA)

# Per-patient cell counts for each cell type: everything after the first three columns
full_cell_type_dist = full_meta.iloc[:, 3:]

# Lookup structures describing the available patients and cell types
_ALL_CELL_TYPES = list(full_cell_type_dist.columns)
_ALL_PATIENTS = list(full_meta.index)

# Cell types present (non-NaN count) in each patient
_ALL_PAT_CELL_TYPES = dict(
    (pat, list(full_cell_type_dist.loc[pat].dropna().index))
    for pat in _ALL_PATIENTS
)

# Data file stems per patient: the whole-patient 'raw_data' plus one 'raw_data_<cell type>' each
_ALL_PAT_FILENAMES = dict(
    (pat, ['raw_data'] + ['raw_data_' + ctype for ctype in cell_types])
    for pat, cell_types in _ALL_PAT_CELL_TYPES.items()
)

# Show the complete metadata table as the cell output
print('Full metadata:')
full_meta

Full metadata:


Unnamed: 0_level_0,group,file,num_cells,Macrophage,T_cells,DC,Pre-B_cell_CD34-,Monocyte,NK_cell,B_cell,...,Neutrophils,GMP,Erythroblast,Gametocytes,Neurons,Fibroblasts,Smooth_muscle_cells,Hepatocytes,Keratinocytes,Pro-Myelocyte
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C51,C,/gpfs/projects/bsc08/shared_projects/scGRN_ana...,9431.0,8348.0,608.0,215.0,98.0,70.0,68.0,9.0,...,,,,,,,,,,
C52,C,/gpfs/projects/bsc08/shared_projects/scGRN_ana...,8696.0,8611.0,13.0,23.0,3.0,14.0,5.0,2.0,...,,,,,,,,,,
C100,C,/gpfs/projects/bsc08/shared_projects/scGRN_ana...,907.0,338.0,411.0,45.0,5.0,51.0,20.0,12.0,...,,,,,,,,,,
C141,M,/gpfs/projects/bsc08/shared_projects/scGRN_ana...,1449.0,197.0,932.0,48.0,5.0,86.0,96.0,33.0,...,11.0,1.0,,,,,,,,
C142,M,/gpfs/projects/bsc08/shared_projects/scGRN_ana...,1790.0,482.0,996.0,39.0,13.0,67.0,113.0,20.0,...,14.0,1.0,,,,,,,,
C144,M,/gpfs/projects/bsc08/shared_projects/scGRN_ana...,452.0,37.0,181.0,41.0,8.0,73.0,34.0,14.0,...,3.0,,2.0,,,,,,,
C143,S,/gpfs/projects/bsc08/shared_projects/scGRN_ana...,14933.0,2048.0,1394.0,154.0,33.0,7489.0,562.0,72.0,...,3005.0,2.0,,2.0,1.0,,,,,
C145,S,/gpfs/projects/bsc08/shared_projects/scGRN_ana...,15550.0,6960.0,719.0,859.0,46.0,5616.0,421.0,58.0,...,635.0,2.0,,,,,,,,
C146,S,/gpfs/projects/bsc08/shared_projects/scGRN_ana...,2545.0,247.0,61.0,36.0,,127.0,14.0,3.0,...,1632.0,,,1.0,,2.0,1.0,1.0,1.0,
C148,S,/gpfs/projects/bsc08/shared_projects/scGRN_ana...,1165.0,98.0,122.0,24.0,,641.0,36.0,8.0,...,178.0,1.0,,1.0,,,,,,


## `sc_pipeline` - Running single-cell processing using `Seurat` and `SingleR`

Obtain `sbatch` commands to run the COVID-19 single cell data processing pipeline using `Seurat` package. The script is located in `scGRN/single_cell_processing/sc_pipeline/sc_pipeline.sh`.

<font color=#8B8000>*Please move to* `scGRN/single_cell_processing/sc_pipeline` *or specify* `CURR_DIR="<PATH_TO_scGRN>/scGRN/single_cell_processing/sc_pipeline"` with `sc_pipeline.sh` script *in the cell below.*</font>

In [3]:
# Folder containing `sc_pipeline.sh`; it is also handed to sbatch as --chdir
CURR_DIR = '/gpfs/home/bsc08/bsc08890/scGRN_analysis/scGRN/single_cell_processing/sc_pipeline'

# Positional arguments of sc_pipeline.sh, listed in the order in which they are printed
META = '../sc_metadata.tsv'
OUT = '/gpfs/projects/bsc08/shared_projects/scGRN_analysis/Data_home/res/covid_19'
ANNO = 'HumanPrimaryCellAtlasData'
ANNO_F = '/gpfs/projects/bsc08/shared_projects/scGRN_analysis/Data_home/data/SingleR'
N_PROC = 12
PRE_MERGED = 'F'
SOBJ = 'T'
SFOBJ = 'F'
VERB = 'F'

# Slurm log files (`%j` is the Slurm filename pattern for the job ID)
log_folder = '/gpfs/projects/bsc08/shared_projects/scGRN_analysis/sbatch/logs'  # <- choose log folder!
log_out = os.path.join(log_folder, 'sc_pipeline_%j.out')
log_err = os.path.join(log_folder, 'sc_pipeline_%j.err')

# Requested resources
cpus_per_task = 48
hours = 12

# Optional request for high-memory nodes (for large datasets)
high_mem = False
if high_mem:
    high_mem_cmd = f' {cyan_color("--constraint")}=highmem '
else:
    high_mem_cmd = ' '

# Optional debug QOS queue; the time limit is shortened to 2 hours in that mode
qos_debug = True
if qos_debug:
    qos_debug_cmd = f' {cyan_color("--qos")}=debug '
    hours = 2
else:
    qos_debug_cmd = ' '

# Assemble the command in two pieces: the sbatch options and the script arguments
sbatch_flags = (
    f"{cyan_color('--time')}='{hours}:00:00'{qos_debug_cmd}{high_mem_cmd}"
    f"{cyan_color('--cpus-per-task')}={cpus_per_task} "
    f"{cyan_color('--job-name')}='sc_pipeline' "
    f"{cyan_color('--chdir')}={CURR_DIR} "
    f"{cyan_color('--ntasks')}=1 "
    f"{cyan_color('--output')}={log_out} "
    f"{cyan_color('--error')}={log_err}"
)
script_args = ' '.join(
    str(arg) for arg in (META, OUT, ANNO, ANNO_F, N_PROC, PRE_MERGED, SOBJ, SFOBJ, VERB)
)

print(f"{red_color('sbatch')} {sbatch_flags} {CURR_DIR}/{red_color('sc_pipeline.sh')} {script_args}")

[31msbatch[0m [36m--time[0m='2:00:00' [36m--qos[0m=debug  [36m--cpus-per-task[0m=48 [36m--job-name[0m='sc_pipeline' [36m--chdir[0m=/gpfs/home/bsc08/bsc08890/scGRN_analysis/scGRN/single_cell_processing/sc_pipeline [36m--ntasks[0m=1 [36m--output[0m=/gpfs/projects/bsc08/shared_projects/scGRN_analysis/sbatch/logs/sc_pipeline_%j.out [36m--error[0m=/gpfs/projects/bsc08/shared_projects/scGRN_analysis/sbatch/logs/sc_pipeline_%j.err /gpfs/home/bsc08/bsc08890/scGRN_analysis/scGRN/single_cell_processing/sc_pipeline/[31msc_pipeline.sh[0m ../sc_metadata_debug.tsv /gpfs/projects/bsc08/shared_projects/scGRN_analysis/Data_home/res/covid_19 HumanPrimaryCellAtlasData /gpfs/projects/bsc08/shared_projects/scGRN_analysis/Data_home/data/SingleR 12 F T F F


## `regulon_pipeline` pipeline - Regulon activity inference using `VIPER`

Obtain the `sbatch` command to run the **full** TF-target regulon activity inference using the `VIPER` algorithm. The script is located in `scGRN/single_cell_processing/regulon_pipeline/regulon_pipeline.sh`.

<font color=#8B8000>*Please move to* `scGRN/single_cell_processing/regulon_pipeline` *or specify* `CURR_DIR="<PATH_TO_scGRN>/scGRN/single_cell_processing/regulon_pipeline"` with the `regulon_pipeline.sh` script *in the cell below.*</font>

In [9]:
# Folder containing `regulon_pipeline.sh`; it is also handed to sbatch as --chdir
CURR_DIR = '/gpfs/home/bsc08/bsc08890/scGRN_analysis/scGRN/single_cell_processing/regulon_pipeline'

# Positional arguments of regulon_pipeline.sh, listed in the order in which they are printed
META = '/gpfs/home/bsc08/bsc08890/scGRN_analysis/scGRN/single_cell_processing/sc_metadata_debug.tsv'
META_CTYPE = '/gpfs/projects/bsc08/shared_projects/scGRN_analysis/Data_home/res/covid_19/cell_type_meta_debug.tsv'
OUT = '/gpfs/projects/bsc08/shared_projects/scGRN_analysis/Data_home/res/covid_19'
REGULON = 'pyscenic'
Q_THRESH = 0.95
PLEIOT_CORR = 'T'
N_PROC = 12
SOBJ = 'T'
VERB = 'F'

# Job name, also reused as the stem of the Slurm log files
job_name = f'VIPER_{REGULON}_pipeline'

# Slurm log files (`%j` is the Slurm filename pattern for the job ID)
log_folder = '/gpfs/projects/bsc08/shared_projects/scGRN_analysis/sbatch/logs'  # <- choose log folder!
log_out = os.path.join(log_folder, job_name + '_%j.out')
log_err = os.path.join(log_folder, job_name + '_%j.err')

# Requested resources
cpus_per_task = 24
hours = 12

# Optional request for high-memory nodes (for large datasets)
high_mem = False
if high_mem:
    high_mem_cmd = f' {cyan_color("--constraint")}=highmem '
else:
    high_mem_cmd = ' '

# Optional debug QOS queue; the time limit is shortened to 2 hours in that mode
qos_debug = True
if qos_debug:
    qos_debug_cmd = f' {cyan_color("--qos")}=debug '
    hours = 2
else:
    qos_debug_cmd = ' '

# Assemble the command in two pieces: the sbatch options and the script arguments
sbatch_flags = (
    f"{cyan_color('--time')}='{hours}:00:00'{qos_debug_cmd}{high_mem_cmd}"
    f"{cyan_color('--cpus-per-task')}={cpus_per_task} "
    f"{cyan_color('--job-name')}='{job_name}' "
    f"{cyan_color('--chdir')}={CURR_DIR} "
    f"{cyan_color('--ntasks')}=1 "
    f"{cyan_color('--output')}={log_out} "
    f"{cyan_color('--error')}={log_err}"
)
script_args = ' '.join(
    str(arg) for arg in (META, META_CTYPE, OUT, REGULON, Q_THRESH, PLEIOT_CORR, N_PROC, SOBJ, VERB)
)

print(f"{red_color('sbatch')} {sbatch_flags} {CURR_DIR}/{red_color('regulon_pipeline.sh')} {script_args}")

[31msbatch[0m [36m--time[0m='2:00:00' [36m--qos[0m=debug  [36m--cpus-per-task[0m=24 [36m--job-name[0m='VIPER_pyscenic_pipeline' [36m--chdir[0m=/gpfs/home/bsc08/bsc08890/scGRN_analysis/scGRN/single_cell_processing/regulon_pipeline [36m--ntasks[0m=1 [36m--output[0m=/gpfs/projects/bsc08/shared_projects/scGRN_analysis/sbatch/logs/VIPER_pyscenic_pipeline_%j.out [36m--error[0m=/gpfs/projects/bsc08/shared_projects/scGRN_analysis/sbatch/logs/VIPER_pyscenic_pipeline_%j.err /gpfs/home/bsc08/bsc08890/scGRN_analysis/scGRN/single_cell_processing/regulon_pipeline/[31mregulon_pipeline.sh[0m /gpfs/home/bsc08/bsc08890/scGRN_analysis/scGRN/single_cell_processing/sc_metadata_debug.tsv /gpfs/projects/bsc08/shared_projects/scGRN_analysis/Data_home/res/covid_19/cell_type_meta_debug.tsv /gpfs/projects/bsc08/shared_projects/scGRN_analysis/Data_home/res/covid_19 pyscenic 0.95 T 12 T F


## `network_inference` pipeline using `grnboost2` or `genie3`

### Running for any scRNA-seq file

Obtain `sbatch` command to run GRN inference **based on passed scRNA-seq matrix file** in `.tsv` format.

<font color=#8B8000>*Please move to* `scGRN/network_inference` *or specify* `SCRIPT_DIR="<PATH_TO_scGRN>/scGRN/network_inference"` with `infer_GRN.sh` script *in the cell below.*</font>

In [37]:
# Main parameters
JOB_ID = 'debug_C51_T_cells'
SCRIPT_DIR = '/gpfs/home/bsc08/bsc08890/scGRN_analysis/scGRN/network_inference'
IN_PATH = '/gpfs/projects/bsc08/shared_projects/scGRN_analysis/Data_home/res/covid_19/C51/data/Seurat/raw_data_T_cells.tsv'
OUT_PATH = '/gpfs/projects/bsc08/shared_projects/scGRN_analysis/debug'
# Slurm's own stdout/stderr are redirected to /dev/null in the commands below; LOG_FOLDER is
# instead passed to infer_GRN.sh as an argument (presumably the script logs there - confirm)
LOG_FOLDER = '/gpfs/projects/bsc08/shared_projects/scGRN_analysis/sbatch/logs'

# Run parameters
method = 'grnboost2'
Q_THRESH = 0.95
hours = 40  # upper limit of hours
cpus_per_task = 48  # number of CPUs allocated per sbatch command
num_workers_per_task = 12  # the level of parallelization, the bigger - the higher parallelization

# Supplementary data (only passed for the TF-target run)
TF_LIST_PATH = _TF_LIST_lambert
DB_NAMES = '/gpfs/projects/bsc08/shared_projects/scGRN_analysis/Data_home/data/SCENIC/hg38__refseq-r80__10kb_up_and_down_tss.mc9nr.feather'
MOTIF_ANNOTATION = '/gpfs/projects/bsc08/shared_projects/scGRN_analysis/Data_home/data/SCENIC/motifs-v9-nr.hgnc-m0.001-o0.0.tbl'

high_mem = False  # using high-memory nodes for large datasets
high_mem_cmd = f' {cyan_color("--constraint")}=highmem ' if high_mem else ' '  # formatting high_mem

qos_debug = True  # whether to use QOS queue (debug or not)
qos_debug_cmd = f' {cyan_color("--qos")}=debug ' if qos_debug else ' '  # formatting qos_debug
hours = 2 if qos_debug else hours  # decreasing the number of hours for debug mode


def _infer_GRN_command(job_name, extra_args=()):
    """Build the colored `sbatch` command line that launches `infer_GRN.sh`.

    Both printed commands share every sbatch option and the first six script
    arguments; they differ only in the job name and in the optional trailing
    arguments, so the template lives in one place.

    Parameters
    ----------
    job_name : str
        Value placed inside the quotes of `--job-name`.
    extra_args : iterable, optional
        Additional positional arguments appended after `LOG_FOLDER`.

    Returns
    -------
    str
        The complete command, including ANSI color codes.
    """
    script_args = [method, IN_PATH, OUT_PATH, num_workers_per_task, Q_THRESH, LOG_FOLDER, *extra_args]
    return (
        f"{red_color('sbatch')} {cyan_color('--time')}='{hours}:00:00'{qos_debug_cmd}{high_mem_cmd}{cyan_color('--cpus-per-task')}={cpus_per_task} "
        f"{cyan_color('--job-name')}='{job_name}' {cyan_color('--chdir')}={SCRIPT_DIR} {cyan_color('--ntasks')}=1 "
        f"{cyan_color('--output')}=/dev/null {cyan_color('--error')}=/dev/null "
        f"{SCRIPT_DIR}/{red_color('infer_GRN.sh')} {' '.join(map(str, script_args))}"
    )


print(green_color('\nThe command for inferring TF-target network:'))
print(_infer_GRN_command(f'{JOB_ID}_TF_{method}', (TF_LIST_PATH, DB_NAMES, MOTIF_ANNOTATION)))
print(green_color('\nThe command for inferring gene-gene network:'))
print(_infer_GRN_command(f'{JOB_ID}_{method}'))

[32m
The command for inferring TF-target network:[0m
[31msbatch[0m [36m--time[0m='2:00:00' [36m--qos[0m=debug  [36m--cpus-per-task[0m=48 [36m--job-name[0m='debug_C51_T_cells_TF_grnboost2' [36m--chdir[0m=/gpfs/home/bsc08/bsc08890/scGRN_analysis/scGRN/network_inference [36m--ntasks[0m=1 [36m--output[0m=/gpfs/projects/bsc08/shared_projects/scGRN_analysis/logs/sc_pipeline_%j.out [36m--error[0m=/gpfs/projects/bsc08/shared_projects/scGRN_analysis/logs/sc_pipeline_%j.err /gpfs/home/bsc08/bsc08890/scGRN_analysis/scGRN/network_inference/[31minfer_GRN.sh[0m grnboost2 /gpfs/projects/bsc08/shared_projects/scGRN_analysis/Data_home/res/covid_19/C51/data/Seurat/raw_data_T_cells.tsv /gpfs/projects/bsc08/shared_projects/scGRN_analysis/logs 12 0.95 /gpfs/projects/bsc08/shared_projects/scGRN_analysis/logs /gpfs/projects/bsc08/shared_projects/scGRN_analysis/Data_home/data/TF_lists/lambert2018.txt /gpfs/projects/bsc08/shared_projects/scGRN_analysis/Data_home/data/SCENIC/hg38__refseq-r

### Patient-specific

#### Run separately

Obtain `sbatch` commands to run **patient-specific** GRN inference pipelines (one per cell type available in corresponding patient). Please choose a `patient` for which we will produce a list of `sbatch` commands to run the GRN inference.

<font color=#8B8000>*Please move to* `scGRN/network_inference/ana_scripts` *or specify* `CURR_DIR="<PATH_TO_scGRN>/scGRN/network_inference/ana_scripts"` *with* `infer_pat_GRN.sh` *in the cell below.*</font>

In [46]:
CURR_DIR = '/gpfs/home/bsc08/bsc08890/scGRN_analysis/scGRN/network_inference/ana_scripts'

# Current constants
patient = 'C51'  # choose a patient: any ID listed in `_ALL_PATIENTS` (the index of the metadata table)
method = 'grnboost2'
Q_THRESH = 0.95

# Fail early with a readable message instead of a bare KeyError for an unknown patient
if patient not in _ALL_PAT_FILENAMES:
    raise ValueError(
        f"Unknown patient '{patient}'; available patients: {', '.join(map(str, _ALL_PATIENTS))}"
    )

# Turn file stems into the cell-type argument of infer_pat_GRN.sh: 'raw_data_<type>' -> '<type>',
# while the whole-patient file 'raw_data' becomes an empty shell string ('')
data_ctypes = [
    "''" if fname == 'raw_data' else fname.replace('raw_data_', '')
    for fname in _ALL_PAT_FILENAMES[patient]
]

hours = 40  # upper limit of hours
cpus_per_task = 48  # number of CPUs allocated per sbatch command
num_workers_per_task = 8  # the level of parallelization, the bigger - the higher parallelization
high_mem = bool(full_meta.loc[patient, 'num_cells'] > 10000)  # using high-memory nodes for large datasets
high_mem_cmd = f' {cyan_color("--constraint")}=highmem ' if high_mem else ' '  # formatting high_mem

qos_debug = True  # whether to use QOS queue (debug or not)
qos_debug_cmd = f' {cyan_color("--qos")}=debug ' if qos_debug else ' '  # formatting qos_debug
hours = 2 if qos_debug else hours  # decreasing the number of hours for debug mode


def _infer_pat_GRN_command(ctype, job_suffix, tf_list):
    """Build the colored `sbatch` command line that launches `infer_pat_GRN.sh`.

    Parameters
    ----------
    ctype : str
        Cell-type argument (one entry of `data_ctypes`), also embedded in the job name.
    job_suffix : str
        Inserted into the job name between the cell type and the method
        ('_TF' for TF-target runs, '' for gene-gene runs).
    tf_list : str
        TF-list argument handed to the script ("''" for gene-gene runs).

    Returns
    -------
    str
        The complete command, including ANSI color codes.
    """
    return (
        f"{red_color('sbatch')} {cyan_color('--time')}='{hours}:00:00'{qos_debug_cmd}{high_mem_cmd}{cyan_color('--cpus-per-task')}={cpus_per_task} "
        f"{cyan_color('--job-name')}='{patient}_{ctype}{job_suffix}_{method}' {cyan_color('--chdir')}={CURR_DIR} {cyan_color('--ntasks')}=1 "
        f"{cyan_color('--output')}=/dev/null {cyan_color('--error')}=/dev/null "
        f"{CURR_DIR}/{red_color('infer_pat_GRN.sh')} {method} {patient} {ctype} {num_workers_per_task} {Q_THRESH} {tf_list} SBATCH"
    )


display(pd.DataFrame(full_cell_type_dist.loc[patient].dropna()).T)  # displaying the cell types in chosen patient

print(green_color('\nThe commands for inferring TF-target networks:'))
for d in data_ctypes:
    print(_infer_pat_GRN_command(d, '_TF', _TF_LIST_lambert))
print(green_color('\nThe commands for inferring gene-gene networks:'))
for d in data_ctypes:
    print(_infer_pat_GRN_command(d, '', "''"))

Unnamed: 0,Macrophage,T_cells,DC,Pre-B_cell_CD34-,Monocyte,NK_cell,B_cell,Epithelial_cells,BM,Pro-B_cell_CD34+,HSC_-G-CSF
C51,8348.0,608.0,215.0,98.0,70.0,68.0,9.0,7.0,4.0,3.0,1.0


[32m
The commands for inferring TF-target networks:[0m
[31msbatch[0m [36m--time[0m='2:00:00' [36m--qos[0m=debug  [36m--cpus-per-task[0m=48 [36m--job-name[0m='C51_''_TF_grnboost2' [36m--chdir[0m=/gpfs/home/bsc08/bsc08890/scGRN_analysis/scGRN/network_inference/ana_scripts [36m--ntasks[0m=1 [36m--output[0m=/dev/null [36m--error[0m=/dev/null /gpfs/home/bsc08/bsc08890/scGRN_analysis/scGRN/network_inference/ana_scripts/[31minfer_pat_GRN.sh[0m grnboost2 C51 '' 8 0.95 /gpfs/projects/bsc08/shared_projects/scGRN_analysis/Data_home/data/TF_lists/lambert2018.txt SBATCH
[31msbatch[0m [36m--time[0m='2:00:00' [36m--qos[0m=debug  [36m--cpus-per-task[0m=48 [36m--job-name[0m='C51_Macrophage_TF_grnboost2' [36m--chdir[0m=/gpfs/home/bsc08/bsc08890/scGRN_analysis/scGRN/network_inference/ana_scripts [36m--ntasks[0m=1 [36m--output[0m=/dev/null [36m--error[0m=/dev/null /gpfs/home/bsc08/bsc08890/scGRN_analysis/scGRN/network_inference/ana_scripts/[31minfer_pat_GRN.sh[0m 

#### Run together using `greasy`

Here we use the highly parallelized `greasy` framework (one `greasy` run per patient, covering all cell types available in that patient), which aggregates the individual `sbatch` runs into one command. The cell below generates `greasy` task files **for each patient**, which the user can execute using `greasy_pat_GRN.sh`.

<font color=#8B8000>*Please move to* `scGRN/network_inference/ana_scripts` *or specify* `CURR_DIR="<PATH_TO_scGRN>/scGRN/network_inference/ana_scripts"` *with* `greasy_pat_GRN.sh` *in the cell below.*</font>

In [42]:
CURR_DIR = '/gpfs/home/bsc08/bsc08890/scGRN_analysis/scGRN/network_inference/ana_scripts'

_PRINT_TASKS_HERE = False  # whether to print tasks here or save only in greasy task file

# Current constants
method = 'grnboost2'
Q_THRESH = 0.95

# Global configs (identical for every patient, so computed once outside the loop)
n_nodes = 4  # total number of nodes allocated to greasy job (1 node = 48 cores in MN4)
hours = 36  # upper limit of hours
total_avail_cpus = n_nodes * 48  # calculate total number of CPUs

qos_debug = False  # whether to use QOS queue (debug or not)
qos_debug_cmd = f' {cyan_color("--qos")}=debug ' if qos_debug else ' '  # formatting qos_debug
hours = 2 if qos_debug else hours  # decreasing the number of hours for debug mode

# Make sure the task-file directory exists before any file is written,
# otherwise the loop would fail half-way after already printing commands
os.makedirs(_GREASY_DIR, exist_ok=True)

for patient in _ALL_PATIENTS:

    # Cell-type arguments: 'raw_data_<type>' -> '<type>', whole-patient 'raw_data' -> '' (empty shell string)
    data_ctypes = [
        "''" if fname == 'raw_data' else fname.replace('raw_data_', '')
        for fname in _ALL_PAT_FILENAMES[patient]
    ]
    pat_ctype_count = full_meta.loc[patient].drop(['file', 'group']).dropna()

    GBs_per_worker = 8 if pat_ctype_count['num_cells'] < 8000 else 16  # number of gigabytes per one process
    cpus_per_worker = GBs_per_worker // 2  # calculate number of CPUs per process (low-mem nodes have 2 GB RAM per 1 CPU)
    cpus_per_task = cpus_per_worker * 4  # calculate number CPUs per greasy task
    n_tasks = total_avail_cpus // cpus_per_task  # calculate number of greasy tasks executed simultaneously (normally `n_tasks` < `total_n_tasks`)
    num_workers_per_task = cpus_per_task // cpus_per_worker  # calculate number of parallel processes per task, level of parallelization

    # Print out the sbatch command to run greasy
    print(f"{red_color('sbatch')} {cyan_color('--job-name')}='GREASY_pat_{patient}_{method}' "
          f"{cyan_color('--ntasks')}={n_tasks} {cyan_color('--time')}='{hours}:00:00' "
          f"{cyan_color('--output')}=/dev/null {cyan_color('--error')}=/dev/null "
          f"{cyan_color('--cpus-per-task')}={cpus_per_task}{qos_debug_cmd}"
          f"{CURR_DIR}/{red_color('greasy_pat_GRN.sh')} {method} {green_color(patient)}")

    # Generate list of tasks; the trailing number is a running 1-based task index,
    # TF-target tasks come first and gene-gene tasks continue the numbering
    task_stub = f"{CURR_DIR}/infer_pat_GRN.sh {method} {patient}"
    run_args = f"{num_workers_per_task} {Q_THRESH}"
    tf_tasks = [
        f"{task_stub} {d} {run_args} {_TF_LIST_lambert} GREASY {i}"
        for i, d in enumerate(data_ctypes, start=1)
    ]
    gene_tasks = [
        f"{task_stub} {d} {run_args} '' GREASY {i}"
        for i, d in enumerate(data_ctypes, start=len(tf_tasks) + 1)
    ]

    if _PRINT_TASKS_HERE:
        for task in tf_tasks:
            print(task)
    print()  # blank line after each sbatch command (or between the two task groups)
    if _PRINT_TASKS_HERE:
        for task in gene_tasks:
            print(task)
        print()
        print()

    # One task per line, without a trailing newline
    tasks = '\n'.join(tf_tasks + gene_tasks)
    with open(f'{_GREASY_DIR}/greasy_tasks_pat_{patient}_{method}', 'w') as f:
        f.write(tasks)

print(f'The greasy task files is generated at directory: {green_color(_GREASY_DIR)}')

[31msbatch[0m [36m--job-name[0m='GREASY_pat_C51_grnboost2' [36m--ntasks[0m=6 [36m--time[0m='36:00:00' [36m--output[0m=/dev/null [36m--error[0m=/dev/null [36m--cpus-per-task[0m=32 /gpfs/home/bsc08/bsc08890/scGRN_analysis/scGRN/network_inference/ana_scripts/[31mgreasy_pat_GRN.sh[0m grnboost2 [32mC51[0m

[31msbatch[0m [36m--job-name[0m='GREASY_pat_C52_grnboost2' [36m--ntasks[0m=6 [36m--time[0m='36:00:00' [36m--output[0m=/dev/null [36m--error[0m=/dev/null [36m--cpus-per-task[0m=32 /gpfs/home/bsc08/bsc08890/scGRN_analysis/scGRN/network_inference/ana_scripts/[31mgreasy_pat_GRN.sh[0m grnboost2 [32mC52[0m

[31msbatch[0m [36m--job-name[0m='GREASY_pat_C100_grnboost2' [36m--ntasks[0m=12 [36m--time[0m='36:00:00' [36m--output[0m=/dev/null [36m--error[0m=/dev/null [36m--cpus-per-task[0m=16 /gpfs/home/bsc08/bsc08890/scGRN_analysis/scGRN/network_inference/ana_scripts/[31mgreasy_pat_GRN.sh[0m grnboost2 [32mC100[0m

[31msbatch[0m [36m--job-name[0

### Cell type-specific
#### Run separately

Obtain `sbatch` commands to run **cell type-specific** GRN inference pipelines (patient-aggregated per each cell type). For each cell type we will have 4 GRN inference runs:
- based on all cells
- based on cells from control (C) patients
- based on cells from mild (M) patients
- based on cells from severe (S) patients

<font color=#8B8000>*Please move to* `scGRN/network_inference/ana_scripts` *or specify* `CURR_DIR="<PATH_TO_scGRN>/scGRN/network_inference/ana_scripts"` *with* `infer_agg_GRN.sh` *in the cell below.*</font>

In [4]:
CURR_DIR = '/gpfs/home/bsc08/bsc08890/scGRN_analysis/scGRN/network_inference/ana_scripts'

# Current constants
method = 'grnboost2'
pat_types = ['all_patients', 'C', 'M', 'S']
# Total number of cells per cell type over all patients ('num_cells' is relabelled 'all_data').
# Only the numeric columns (from 'num_cells' onwards) are summed, so the string columns
# 'group' and 'file' are never concatenated and a missing string cannot break the reduction.
data_sizes = full_meta.iloc[:, 2:].sum().sort_values(ascending=False).rename(index={'num_cells': 'all_data'})
Q_THRESH = 0.95

num_workers_per_task = 12  # the level of parallelization, the bigger - the higher parallelization

qos_debug = True  # whether to use QOS queue (debug or not)
qos_debug_cmd = f' {cyan_color("--qos")}=debug ' if qos_debug else ' '  # formatting qos_debug

# Report number of cells for each cell types
display(pd.DataFrame(full_cell_type_dist.sum().sort_values(ascending=False).rename('Number of cells')).T)

i = 1  # running 1-based index appended as the last argument of every command

for cell_type, data_size in data_sizes.items():
    print(green_color(f'The cell type: {cell_type}'))

    # Resources depend only on the cell type, so they are computed once per cell type.
    # NOTE: `data_size` is the total over all patients and is also applied to the C/M/S subsets.
    is_large = data_size > 8000
    hours = 48 if is_large else 24  # upper limit of hours
    hours = 2 if qos_debug else hours  # decreasing the number of hours for debug mode
    cpus_per_task = 48 if is_large else 24  # number of CPUs allocated per sbatch command
    high_mem = bool(data_size > 10000)  # using high-memory nodes for large datasets
    high_mem_cmd = f' {cyan_color("--constraint")}=highmem ' if high_mem else ' '  # formatting high_mem

    # Parts of the command shared by all runs of this cell type
    sbatch_head = (
        f"{red_color('sbatch')} {cyan_color('--time')}='{hours}:00:00'{qos_debug_cmd}{high_mem_cmd}"
        f"{cyan_color('--cpus-per-task')}={cpus_per_task} "
    )
    sbatch_tail = (
        f"{cyan_color('--chdir')}={CURR_DIR} {cyan_color('--ntasks')}=1 "
        f"{cyan_color('--output')}=/dev/null {cyan_color('--error')}=/dev/null "
        f"{CURR_DIR}/{red_color('infer_agg_GRN.sh')} {method} {cell_type}"
    )

    for pat_type in pat_types:
        # First the TF-target network command, then the gene-gene one (empty TF list)
        for job_suffix, tf_list in (('_TF', _TF_LIST_lambert), ('', "''")):
            print(f"{sbatch_head}{cyan_color('--job-name')}='{cell_type}_{pat_type}{job_suffix}_{method}' "
                  f"{sbatch_tail} {green_color(pat_type)} {num_workers_per_task} {Q_THRESH} {tf_list} SBATCH {i}")
            i += 1
        print()
    print()

Unnamed: 0,Macrophage,Monocyte,T_cells,Neutrophils,DC,NK_cell,Epithelial_cells,B_cell,Pre-B_cell_CD34-,HSC_-G-CSF,...,CMP,BM,Gametocytes,Erythroblast,Fibroblasts,Neurons,Smooth_muscle_cells,Hepatocytes,Keratinocytes,Pro-Myelocyte
Number of cells,28008.0,15720.0,6515.0,5672.0,1605.0,1502.0,1242.0,586.0,388.0,61.0,...,16.0,13.0,4.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0


[32mThe cell type: all_data[0m
[31msbatch[0m [36m--time[0m='2:00:00' [36m--qos[0m=debug  [36m--constraint[0m=highmem [36m--cpus-per-task[0m=48 [36m--job-name[0m='all_data_all_patients_TF_grnboost2' [36m--chdir[0m=/gpfs/home/bsc08/bsc08890/scGRN_analysis/scGRN/network_inference/ana_scripts [36m--ntasks[0m=1 [36m--output[0m=/dev/null [36m--error[0m=/dev/null /gpfs/home/bsc08/bsc08890/scGRN_analysis/scGRN/network_inference/ana_scripts/[31minfer_agg_GRN.sh[0m grnboost2 all_data [32mall_patients[0m 12 0.95 /gpfs/projects/bsc08/shared_projects/scGRN_analysis/Data_home/data/TF_lists/lambert2018.txt SBATCH 1
[31msbatch[0m [36m--time[0m='2:00:00' [36m--qos[0m=debug  [36m--constraint[0m=highmem [36m--cpus-per-task[0m=48 [36m--job-name[0m='all_data_all_patients_grnboost2' [36m--chdir[0m=/gpfs/home/bsc08/bsc08890/scGRN_analysis/scGRN/network_inference/ana_scripts [36m--ntasks[0m=1 [36m--output[0m=/dev/null [36m--error[0m=/dev/null /gpfs/home/bsc08/bsc0

#### Run together using `greasy`

Here we use the highly parallelized `greasy` framework (one `greasy` run per cell type, covering all patient types in which that cell type is present), which aggregates the individual `sbatch` runs into one command. The cell below generates `greasy` task files **for each cell type**, which the user can execute using `greasy_agg_GRN.sh`.

<font color=#8B8000>*Please move to* `scGRN/network_inference/ana_scripts` *or specify* `CURR_DIR="<PATH_TO_scGRN>/scGRN/network_inference/ana_scripts"` *with* `greasy_agg_GRN.sh` *in the cell below.*</font>

In [44]:
CURR_DIR = '/gpfs/home/bsc08/bsc08890/scGRN_analysis/scGRN/network_inference/ana_scripts'

_PRINT_TASKS_HERE = False  # whether to print tasks here or save only in greasy task file

# Current constants
method = 'grnboost2'
pat_types = ['all_patients', 'C', 'M', 'S']
# Total number of cells per cell type over all patients ('num_cells' is relabelled 'all_data').
# Only the numeric columns (from 'num_cells' onwards) are summed, so the string columns
# 'group' and 'file' are never concatenated and a missing string cannot break the reduction.
data_sizes = full_meta.iloc[:, 2:].sum().sort_values(ascending=False).rename(index={'num_cells': 'all_data'})
Q_THRESH = 0.95

# Global configs (identical for every cell type, so computed once outside the loop)
n_nodes = 4  # total number of nodes allocated to greasy job (1 node = 48 cores in MN4)
hours = 48  # upper limit of hours
total_avail_cpus = n_nodes * 48  # calculate total number of CPUs

qos_debug = False  # whether to use QOS queue (debug or not)
qos_debug_cmd = f' {cyan_color("--qos")}=debug ' if qos_debug else ' '  # formatting qos_debug
hours = 2 if qos_debug else hours  # decreasing the number of hours for debug mode

# Make sure the task-file directory exists before any file is written,
# otherwise the loop would fail half-way after already printing commands
os.makedirs(_GREASY_DIR, exist_ok=True)

for cell_type, data_size in data_sizes.items():

    cell_count = scGRN.ana.get_num_cells(pat='all', cell_type=cell_type, meta=full_meta)

    GBs_per_worker = 8 if cell_count < 8000 else 16 if cell_count < 20000 else 24  # number of gigabytes per one process
    cpus_per_worker = GBs_per_worker // 2  # calculate number of CPUs per process (low-mem nodes have 2 GB RAM per 1 CPU)
    cpus_per_task = cpus_per_worker * 4  # calculate number CPUs per greasy task
    n_tasks = total_avail_cpus // cpus_per_task  # calculate number of greasy tasks executed simultaneously (normally `n_tasks` < `total_n_tasks`)
    num_workers_per_task = cpus_per_task // cpus_per_worker  # calculate number of parallel processes per task, level of parallelization

    high_mem = bool(data_size > 10000)  # using high-memory nodes for large datasets
    high_mem_cmd = f' {cyan_color("--constraint")}=highmem ' if high_mem else ' '  # formatting high_mem

    # Print out the sbatch command to run greasy
    print(f"{red_color('sbatch')} {cyan_color('--job-name')}='GREASY_agg_{cell_type}_{method}' "
          f"{cyan_color('--ntasks')}={n_tasks} {cyan_color('--time')}='{hours}:00:00' "
          f"{cyan_color('--output')}=/dev/null {cyan_color('--error')}=/dev/null "
          f"{cyan_color('--cpus-per-task')}={cpus_per_task}{qos_debug_cmd}{high_mem_cmd}"
          f"{CURR_DIR}/{red_color('greasy_agg_GRN.sh')} {method} {green_color(cell_type)}")

    # Generate list of tasks; the trailing number is a running 1-based task index,
    # TF-target tasks come first and gene-gene tasks continue the numbering
    task_stub = f"{CURR_DIR}/infer_agg_GRN.sh {method} {cell_type}"
    run_args = f"{num_workers_per_task} {Q_THRESH}"
    tf_tasks = [
        f"{task_stub} {pat_type} {run_args} {_TF_LIST_lambert} GREASY {i}"
        for i, pat_type in enumerate(pat_types, start=1)
    ]
    gene_tasks = [
        f"{task_stub} {pat_type} {run_args} '' GREASY {i}"
        for i, pat_type in enumerate(pat_types, start=len(tf_tasks) + 1)
    ]

    if _PRINT_TASKS_HERE:
        for task in tf_tasks:
            print(task)
    print()  # blank line after each sbatch command (or between the two task groups)
    if _PRINT_TASKS_HERE:
        for task in gene_tasks:
            print(task)
        print()
        print()

    # One task per line, without a trailing newline
    tasks = '\n'.join(tf_tasks + gene_tasks)
    with open(f'{_GREASY_DIR}/greasy_tasks_agg_{cell_type}_{method}', 'w') as f:
        f.write(tasks)

[31msbatch[0m [36m--job-name[0m='GREASY_agg_all_data_grnboost2' [36m--ntasks[0m=4 [36m--time[0m='48:00:00' [36m--output[0m=/dev/null [36m--error[0m=/dev/null [36m--cpus-per-task[0m=48  [36m--constraint[0m=highmem /gpfs/home/bsc08/bsc08890/scGRN_analysis/scGRN/network_inference/ana_scripts/[31mgreasy_agg_GRN.sh[0m grnboost2 [32mall_data[0m

[31msbatch[0m [36m--job-name[0m='GREASY_agg_Macrophage_grnboost2' [36m--ntasks[0m=4 [36m--time[0m='48:00:00' [36m--output[0m=/dev/null [36m--error[0m=/dev/null [36m--cpus-per-task[0m=48  [36m--constraint[0m=highmem /gpfs/home/bsc08/bsc08890/scGRN_analysis/scGRN/network_inference/ana_scripts/[31mgreasy_agg_GRN.sh[0m grnboost2 [32mMacrophage[0m

[31msbatch[0m [36m--job-name[0m='GREASY_agg_Monocyte_grnboost2' [36m--ntasks[0m=6 [36m--time[0m='48:00:00' [36m--output[0m=/dev/null [36m--error[0m=/dev/null [36m--cpus-per-task[0m=32  [36m--constraint[0m=highmem /gpfs/home/bsc08/bsc08890/scGRN_analysis/sc

## COMMUNITY ANALYSIS CELL TYPES AGGREGATED

In [28]:
# Uses the `cyan_color` / `red_color` helpers defined in the namespace-setup cell
# (they were previously re-defined here, duplicating that cell)

algo = 'leiden'
cell_types = ['all_data'] + list(full_meta.columns[3:])
# Data argument per cell type: 'raw_data' for the aggregated data, 'raw_data_<cell type>' otherwise
datas = ['raw_data' if ctype == 'all_data' else f'raw_data_{ctype}' for ctype in cell_types]
Q_THRESH = 0.95  # NOTE(review): not passed to community_ana_cell_type.sh below - confirm whether it should be

# Resources are the same for every cell type, so they are set once
cpus_per_task = 24
hours = 12

qos_debug = False  # whether to use QOS queue (debug or not)
qos_debug_cmd = f' {cyan_color("--qos")}=debug ' if qos_debug else ' '  # formatting qos_debug

display(full_cell_type_dist.sum().sort_values(ascending=False))

# One single-task sbatch command per cell type.
# NOTE(review): the script is referenced by a relative name and no --chdir is given, so the
# printed commands only work from the directory that contains community_ana_cell_type.sh.
for d, cell_type in zip(datas, cell_types):
    print(f"{red_color('sbatch')} {cyan_color('--job-name')}='{cell_type}_community_ana_{algo}' "
          f"{cyan_color('--ntasks')}=1 {cyan_color('--time')}='{hours}:00:00' "
          f"{cyan_color('--output')}=/dev/null {cyan_color('--error')}=/dev/null "
          f"{cyan_color('--cpus-per-task')}={cpus_per_task}{qos_debug_cmd}{red_color('community_ana_cell_type.sh')} {d} {algo}")

Macrophage             27418.0
Monocyte               15630.0
T_cells                 6403.0
Neutrophils             5638.0
DC                      1585.0
NK_cell                 1481.0
Epithelial_cells        1242.0
B_cell                   591.0
Pre-B_cell_CD34-         384.0
HSC_-G-CSF                61.0
Pro-B_cell_CD34+          45.0
GMP                       20.0
CMP                       16.0
BM                        14.0
Gametocytes                4.0
Erythroblast               2.0
Fibroblasts                2.0
Neurons                    1.0
Smooth_muscle_cells        1.0
Hepatocytes                1.0
Keratinocytes              1.0
Pro-Myelocyte              1.0
dtype: float64

[31msbatch[0m [36m--job-name[0m='all_data_community_ana_leiden' [36m--ntasks[0m=1 [36m--time[0m='12:00:00' [36m--output[0m=/dev/null [36m--error[0m=/dev/null [36m--cpus-per-task[0m=24 [31mcommunity_ana_cell_type.sh[0m raw_data leiden
[31msbatch[0m [36m--job-name[0m='Macrophage_community_ana_leiden' [36m--ntasks[0m=1 [36m--time[0m='12:00:00' [36m--output[0m=/dev/null [36m--error[0m=/dev/null [36m--cpus-per-task[0m=24 [31mcommunity_ana_cell_type.sh[0m raw_data_Macrophage leiden
[31msbatch[0m [36m--job-name[0m='T_cells_community_ana_leiden' [36m--ntasks[0m=1 [36m--time[0m='12:00:00' [36m--output[0m=/dev/null [36m--error[0m=/dev/null [36m--cpus-per-task[0m=24 [31mcommunity_ana_cell_type.sh[0m raw_data_T_cells leiden
[31msbatch[0m [36m--job-name[0m='DC_community_ana_leiden' [36m--ntasks[0m=1 [36m--time[0m='12:00:00' [36m--output[0m=/dev/null [36m--error[0m=/dev/null [36m--cpus-per-task[0m=24 [31mcommunity_ana_cell_type.sh[0m ra

## COMMUNITY ANALYSIS CELL TYPES (ONE PATIENT TYPE) AGGREGATED

In [29]:
# Print one `sbatch` command per (dataset, patient type) pair.
# Each command runs `community_ana.sh <patient type> <data> <algo>` as a single-task job.
# `cyan_color` / `red_color` are the helpers defined in the "Setting up namespace" cell.

# Job settings shared by every generated command.
# The algorithm must be bound to `algo` because that is the name the command template reads;
# binding it to another name makes the cell depend on state left behind by a different cell.
algo = 'leiden'
Q_THRESH = 0.95  # NOTE(review): not read anywhere in this cell - confirm whether it can be dropped
cpus_per_task = 24
hours = 12
qos_debug = False  # when True, `--qos=debug` is added to every command
qos_debug_cmd = f' {cyan_color("--qos")}=debug ' if qos_debug else ' '

# 'all_data' maps to the aggregated `raw_data` file, any other entry to `raw_data_<cell type>`
cell_types = ['all_data', 'Macrophage', 'T_cells', 'DC', 'Monocyte', 'NK_cell', 'B_cell', 'Epithelial_cells', 'Neutrophils', 'Pre-B_cell_CD34-']
datas = ['raw_data' if name == 'all_data' else f'raw_data_{name}' for name in cell_types]

# Total number of cells for the selected cell types, shown in the order of `cell_types`
# (selecting by a list of labels fixes the order, so no sorting is needed beforehand)
display(full_cell_type_dist.sum()[cell_types[1:]])

for d, cell_type in zip(datas, cell_types):
    for t in ['C', 'M', 'S']:  # patient types, passed as the first argument of `community_ana.sh`
        print(f"{red_color('sbatch')} {cyan_color('--job-name')}='{cell_type}_{t}_type_community_ana_{algo}' "
              f"{cyan_color('--ntasks')}=1 {cyan_color('--time')}='{hours}:00:00' "
              f"{cyan_color('--output')}=/dev/null {cyan_color('--error')}=/dev/null "
              f"{cyan_color('--cpus-per-task')}={cpus_per_task}{qos_debug_cmd}{red_color('community_ana.sh')} {t} {d} {algo}")

Macrophage          27418.0
T_cells              6403.0
DC                   1585.0
Monocyte            15630.0
NK_cell              1481.0
B_cell                591.0
Epithelial_cells     1242.0
Neutrophils          5638.0
Pre-B_cell_CD34-      384.0
dtype: float64

[31msbatch[0m [36m--job-name[0m='all_data_C_type_community_ana_leiden' [36m--ntasks[0m=1 [36m--time[0m='12:00:00' [36m--output[0m=/dev/null [36m--error[0m=/dev/null [36m--cpus-per-task[0m=24 [31mcommunity_ana.sh[0m C raw_data leiden
[31msbatch[0m [36m--job-name[0m='all_data_M_type_community_ana_leiden' [36m--ntasks[0m=1 [36m--time[0m='12:00:00' [36m--output[0m=/dev/null [36m--error[0m=/dev/null [36m--cpus-per-task[0m=24 [31mcommunity_ana.sh[0m M raw_data leiden
[31msbatch[0m [36m--job-name[0m='all_data_S_type_community_ana_leiden' [36m--ntasks[0m=1 [36m--time[0m='12:00:00' [36m--output[0m=/dev/null [36m--error[0m=/dev/null [36m--cpus-per-task[0m=24 [31mcommunity_ana.sh[0m S raw_data leiden
[31msbatch[0m [36m--job-name[0m='Macrophage_C_type_community_ana_leiden' [36m--ntasks[0m=1 [36m--time[0m='12:00:00' [36m--output[0m=/dev/null [36m--error[0m=/dev/null [36m--cpus-per-task[0m=24 [31mcommunity_ana.sh[0m C raw_data_Macrophag

##### COMMUNITY ANALYSIS

In [27]:
# For every patient: print the `sbatch` command that launches `greasy_community_ana.sh`
# and write the matching GREASY task file (one `community_ana.sh` call per data file).
# `cyan_color` / `red_color` are the helpers defined in the "Setting up namespace" cell.

# Job settings shared by every patient
algo = 'leiden'
cpus_per_task = 48
hours = 24
qos_debug = False  # when True, `--qos=debug` is added to every command
qos_debug_cmd = f' {cyan_color("--qos")}=debug ' if qos_debug else ' '

for patient in ['C141', 'C142', 'C143', 'C144', 'C145', 'C146', 'C51', 'C52', 'C100', 'C148', 'C149', 'C152']:
    datas = _ALL_FILE_PREFIXES[patient]

    # Running only half of tasks in parallel; never request fewer than one task,
    # otherwise a patient with a single data file would produce `--ntasks=0`
    n_tasks = max(1, len(datas) // 2)

    print(f"{red_color('sbatch')} {cyan_color('--job-name')}='{patient}_GREASY_community_ana_{algo}' "
          f"{cyan_color('--ntasks')}={n_tasks} {cyan_color('--time')}='{hours}:00:00' "
          f"{cyan_color('--output')}=/dev/null {cyan_color('--error')}=/dev/null "
          f"{cyan_color('--cpus-per-task')}={cpus_per_task}{qos_debug_cmd}{red_color('greasy_community_ana.sh')} {patient} {algo}")
    print()

    # One task per line, numbered from 1, without a trailing newline
    tasks = '\n'.join(
        f"../community_ana.sh {patient} {d} {algo} GREASY {task_id}"
        for task_id, d in enumerate(datas, start=1)
    )

    # NOTE(review): this location differs from `_GREASY_DIR`, which the aggregated GRN cell uses
    # for its task files - confirm which directory `greasy_community_ana.sh` actually reads.
    with open(f'/gpfs/projects/bsc08/bsc08890/sbatch/greasy/greasy_tasks_community_ana_{patient}_{algo}', 'w') as f:
        f.write(tasks)

### Report of available data

Below we report which networks were inferred. Pay attention to the "?" cells (shown in red): they mark networks for which the GRN inference pipeline **failed**.

In [86]:
# Availability matrices: one for the adjacency lists, one for the NetworkX graph objects
_AVAIL_ADJ_LISTs = scGRN.ana.get_avail_adj_lists(data_home=_DATA_HOME, meta_file=_FMETA)
_AVAIL_Gs = scGRN.ana.get_avail_nx_graphs(data_home=_DATA_HOME, meta_file=_FMETA)

# Show the styled graph availability matrix of every network type under its own title
for net_type in _NET_TYPES:
    title = colored(net_type, 'cyan') + '-type networks'
    print(title)
    display(scGRN.util.style_bool_df(_AVAIL_Gs[net_type]))

# Legend for the symbols used in the matrices (green, red and yellow respectively)
check, missing, cross = '\u2713', '?', '\u2715'
legend_lines = [
    colored('Legend:', 'green'),
    'all_data: The aggregated data, e.g. ("all_data", "Macrophage") - all macrophages, ("C152", "all_data") - all cells from C152.',
    f'{check}: Data is processed and ready for use',
    f'{missing}: Failed to detect the data file, although the corresponding cell type is present in patient',
    f'{cross}: The corresponding cell type is not present in patient',
]
print(*legend_lines, sep='\n')

[36mall[0m-type networks


Unnamed: 0,all_data,Macrophage,T_cells,DC,Pre-B_cell_CD34-,Monocyte,NK_cell,B_cell,Epithelial_cells,BM,Pro-B_cell_CD34+,HSC_-G-CSF,CMP,Neutrophils,GMP,Erythroblast,Gametocytes,Neurons,Fibroblasts,Smooth_muscle_cells,Hepatocytes,Keratinocytes,Pro-Myelocyte
all_data,?,✓,✓,✓,✓,✓,✓,✓,✓,?,?,?,?,✓,?,?,?,?,?,?,?,?,?
C,✓,✓,✓,✓,✓,✓,✓,✓,✓,?,?,?,?,✕,✕,✕,✕,✕,✕,✕,✕,✕,✕
M,✓,✓,✓,✓,✓,✓,✓,✓,✓,?,?,?,?,✓,?,?,✕,✕,✕,✕,✕,✕,✕
S,?,✓,✓,✓,✓,✓,✓,✓,✓,?,?,?,?,✓,?,✕,?,?,?,?,?,?,?
C51,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✕,✕,✕,✕,✕,✕,✕,✕,✕,✕,✕
C52,✓,✓,✓,✓,✓,✓,✓,✓,✓,✕,✕,✕,✕,✕,✕,✕,✕,✕,✕,✕,✕,✕,✕
C100,✓,✓,✓,✓,✓,✓,✓,✓,✓,✕,✓,✕,✓,✕,✕,✕,✕,✕,✕,✕,✕,✕,✕
C141,✓,✓,✓,✓,✓,✓,✓,✓,✓,✕,✕,✕,✓,✓,✓,✕,✕,✕,✕,✕,✕,✕,✕
C142,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✕,✕,✕,✕,✕,✕,✕,✕
C144,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✕,✓,✕,✕,✕,✕,✕,✕,✕


[36mTF[0m-type networks


Unnamed: 0,all_data,Macrophage,T_cells,DC,Pre-B_cell_CD34-,Monocyte,NK_cell,B_cell,Epithelial_cells,BM,Pro-B_cell_CD34+,HSC_-G-CSF,CMP,Neutrophils,GMP,Erythroblast,Gametocytes,Neurons,Fibroblasts,Smooth_muscle_cells,Hepatocytes,Keratinocytes,Pro-Myelocyte
all_data,✓,✓,✓,✓,✓,✓,✓,✓,✓,?,?,?,?,✓,?,?,?,?,?,?,?,?,?
C,✓,✓,✓,✓,✓,✓,✓,✓,✓,?,?,?,?,✕,✕,✕,✕,✕,✕,✕,✕,✕,✕
M,✓,✓,✓,✓,✓,✓,✓,✓,✓,?,?,?,?,✓,?,?,✕,✕,✕,✕,✕,✕,✕
S,✓,✓,✓,✓,✓,✓,✓,✓,✓,?,?,?,?,✓,?,✕,?,?,?,?,?,?,?
C51,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✕,✕,✕,✕,✕,✕,✕,✕,✕,✕,✕
C52,✓,✓,✓,✓,✓,✓,✓,✓,✓,✕,✕,✕,✕,✕,✕,✕,✕,✕,✕,✕,✕,✕,✕
C100,✓,✓,✓,✓,✓,✓,✓,✓,✓,✕,✓,✕,✓,✕,✕,✕,✕,✕,✕,✕,✕,✕,✕
C141,✓,✓,✓,✓,✓,✓,✓,✓,✓,✕,✕,✕,✓,✓,✓,✕,✕,✕,✕,✕,✕,✕,✕
C142,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✕,✕,✕,✕,✕,✕,✕,✕
C144,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✕,✓,✕,✕,✕,✕,✕,✕,✕


[36mctx[0m-type networks


Unnamed: 0,all_data,Macrophage,T_cells,DC,Pre-B_cell_CD34-,Monocyte,NK_cell,B_cell,Epithelial_cells,BM,Pro-B_cell_CD34+,HSC_-G-CSF,CMP,Neutrophils,GMP,Erythroblast,Gametocytes,Neurons,Fibroblasts,Smooth_muscle_cells,Hepatocytes,Keratinocytes,Pro-Myelocyte
all_data,✓,✓,✓,✓,✓,✓,✓,✓,✓,?,?,?,?,✓,?,?,?,?,?,?,?,?,?
C,✓,✓,✓,✓,✓,✓,✓,✓,✓,?,?,?,?,✕,✕,✕,✕,✕,✕,✕,✕,✕,✕
M,✓,✓,✓,✓,✓,✓,✓,✓,✓,?,?,?,?,✓,?,?,✕,✕,✕,✕,✕,✕,✕
S,✓,✓,✓,✓,✓,✓,✓,✓,✓,?,?,?,?,✓,?,✕,?,?,?,?,?,?,?
C51,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,?,✕,✕,✕,✕,✕,✕,✕,✕,✕,✕,✕
C52,✓,✓,✓,✓,✓,✓,✓,?,✓,✕,✕,✕,✕,✕,✕,✕,✕,✕,✕,✕,✕,✕,✕
C100,✓,✓,✓,✓,✓,✓,✓,✓,✓,✕,?,✕,✓,✕,✕,✕,✕,✕,✕,✕,✕,✕,✕
C141,✓,✓,✓,✓,✓,✓,✓,✓,✓,✕,✕,✕,✓,✓,?,✕,✕,✕,✕,✕,✕,✕,✕
C142,✓,✓,✓,✓,✓,✓,✓,✓,✓,?,?,?,✓,✓,?,✕,✕,✕,✕,✕,✕,✕,✕
C144,✓,✓,✓,✓,✓,✓,✓,✓,✓,?,?,?,?,✓,✕,?,✕,✕,✕,✕,✕,✕,✕


[32mLegend:[0m
all_data: The aggregated data, e.g. ("all_data", "Macrophage") - all macrophages, ("C152", "all_data") - all cells from C152.
✓: Data is processed and ready for use
?: Failed to detect the data file, although the corresponding cell type is present in patient
✕: The corresponding cell type is not present in patient
