# Readme

This script contains the code for the data simulation and method evaluation described in the manuscript titled 'Benchmarking algorithms for joint integration of unpaired and paired single-cell RNA-seq and ATAC-seq data'. 

There are five major scenarios and one real data scenario described in this script, designed to answer three questions related to single-cell multi-omic integration. 

Scenario 1 tests if the multiome dataset helps the integration of unpaired scRNA-seq and snATAC-seq dataset. 

Scenario 2 tests how the sequencing depth of the multiome dataset influences the integration result and compares the methods' performance on cell type annotation, peak-gene recovery, cell type mixing, and batch mixing.  

Scenario 3 compares methods when the three data types are collected from different donors or research sites; in other words, it challenges each method's ability to combat batch effects. 

Scenario 4 compares methods when there are cell population(s) missing from one of the single-modality datasets (scRNA-seq or snATAC-seq), thus testing each method's ability to integrate datasets in the case of unshared cell populations. 

Scenario 5 compares methods when there are cell population(s) missing from the multiome dataset or exclusive to one of the single-modality datasets (scRNA-seq or snATAC-seq), thus testing each method's ability to integrate datasets in another type of unshared cell population scenario. 

Section 6 applies the methods to a real data scenario where multiome, scRNA-seq, and snATAC-seq datasets from the Human Pancreas Analysis Program are integrated. 

In each scenario, we created multiple challenges using two publicly available datasets (PBMC and BMMC). Open this script in Google Colab or Jupyter Notebook for the best visualization, as the code is divided into 6 sections and 17 challenges in total. 

For each challenge, there are two parts. The first part demonstrates how the data is simulated and which parameters are used. The second part shows the code used to run all methods and the files used to evaluate the integration results. We ran all evaluations on our LPC server by submitting one non-interactive job per challenge per method. Please adapt this code to the environment in which you are running it.  

This script assumes that a conda environment has been created for each method.  

In [None]:
# Run with a python kernel

In [1]:
%load_ext autoreload
%autoreload 2

# load libraries
import h5py
import numpy as np
import pandas as pd
import scanpy as sc
from anndata import AnnData
import anndata as ad
from copy import deepcopy
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
import pickle
import scipy.io as sio
import os

# Path to R [modify as needed]
os.environ['R_HOME'] = '/home/myylee/anaconda3/envs/r_py/lib/R/'
import utils_eval


from data_simulation import pair_unpair_split_size,downsample_samples, data_simulation, data_simulation_batch
from data_simulation import eval_test_all


# Scenario 1

## PBMC 

### Data simulation

In [41]:
# Scenario 1 / PBMC: simulate datasets with a varying number of multiome cells.
source_dir = "dataset/multiome_pbmc_10k/"

# RNA and ATAC profiles of the PBMC multiome dataset
adata_rna = ad.read_h5ad(source_dir+"pbmc_10x_rna_public.h5ad")
adata_atac = ad.read_h5ad(source_dir+"pbmc_10x_atac_public.h5ad")

# directory passed to data_simulation as its first argument
in_dir = "dataset/multiome_pbmc_10k/pbmc_vary_cell_test/"

# one condition per entry; here the entries are the multiome cell numbers
iter_list = [1000,3000,8000]

# depth settings are all 1 for every condition (and downsample=False is passed below)
depth_multiome_list = [1]*len(iter_list)
depth_scrna_list = [1]*len(iter_list)
depth_snatac_list = [1]*len(iter_list)

# single-modality cell numbers are fixed at 1000; only the multiome size varies
n_scrna_list = [1000]*len(iter_list)
n_snatac_list = [1000]*len(iter_list)
n_multiome_list = iter_list

repeats = 5

# if do not create fragment file symlink, use fragment_path = None 
fragment_path = "{}/dataset/multiome_pbmc_10k/pbmc_granulocyte_sorted_10k_atac_fragments.tsv.gz".format(os.getcwd())

# prefix of the per-condition folder names (e.g. "nmulti1000_1" in the cells below)
cond_key = "nmulti"

def to_str(s): return(str(s))

# FIX: pass the AnnData objects loaded in this cell. The original passed
# adata_rna_sel / adata_atac_sel, which this cell never defines: on a fresh
# kernel that is a NameError, and in a used kernel it silently picks up
# leftovers from other cells (e.g. the BMMC subset).
data_simulation(in_dir,adata_rna,adata_atac,iter_list,
                depth_multiome_list,depth_scrna_list,depth_snatac_list,
                n_multiome_list,n_scrna_list,n_snatac_list,
                repeats,fragment_path,cond_key,
                to_str, downsample=False)


### Running method + evaluations

In [43]:
# ---- Settings shared by every eval_test_all call in this cell (PBMC) ----
# working_dir 
folder_dir="/home/myylee/scmint/methods_eval/"
# bash script for job submission
job_submission_script = "submit_job_per_condition_n_eval2.sh"
# python script for metric evaluation 
eval_script = "run_metric_eval_single.py"

# presumably the barcode -> cell type reference used for evaluation; confirm against eval_script
ct_ref = "dataset/multiome_pbmc_10k/pbmc_10x_bc_ct3.csv"
# NOTE(review): looks like the number of clusters used in evaluation -- confirm in eval_test_all
nclust = 7

# passed below as gp_script / gp_truth
gp_eval_path = "eval_missing_modality_prediction_single.R"
gp_truth = "dataset/multiome_pbmc_10k/pbmc_10x_pmat_sig_links_50kb_unique.csv"

# same directory / condition key / conditions / repeats as the PBMC data simulation cell
dir_path = "dataset/multiome_pbmc_10k/pbmc_vary_cell_test/"
cond_key = "nmulti"

iter_list = [1000,3000,8000]
repeats = 5


# ===== multiome-guided (python functions) =====
# The lists below are index-aligned: entry k of each list describes method k.

conda_envs = ["multivi","cobolt","scmomat"]
# python script for running the method
method_scripts = ["run_multivi_2.py","run_cobolt.py","run_scmomat_pbmc.py"]
# if the script should be run in python environment 
py_langs = [True]*3

file_paths = ["multivi/multivi_result.csv",
              "cobolt/cobolt_result.csv",
              "scmomat/scmomat_result.csv"]

method_keys = ["multivi","cobolt","scmomat"]


# submit 2 jobs at a time, wait, submit new jobs until the 2 were done
eval_test_all(folder_dir,job_submission_script,eval_script,conda_envs,
              method_scripts,py_langs,file_paths,method_keys,
              ct_ref,nclust,dir_path,cond_key,
              iter_list,repeats,repeat_start=1,
              wait=True, wait_time=2*60,batch=2,output_folder="results_single_mod",
              ncore=8,gp_script=gp_eval_path,gp_truth=gp_truth)


# ===== multiome-guided (R functions) =====

conda_envs = ["seurat"]
# R script for running the method
method_scripts = ["run_seurat4_3.R"]
# if the script should be run in python environment (False: this is an R script)
py_langs = [False]

file_paths = ["seurat4/seurat4_result.csv"]

method_keys = ["seurat4"]

# submit all jobs at once
eval_test_all(folder_dir,job_submission_script,eval_script,conda_envs,
              method_scripts,py_langs,file_paths,method_keys,
              ct_ref,nclust,dir_path,cond_key,
              iter_list,repeats,repeat_start=1,
              wait=False, wait_time=2*60,batch=2,output_folder="results_single_mod",
              ncore=8,gp_script=gp_eval_path,gp_truth=gp_truth)

# ===== unpaired =====
# The lists below are index-aligned: entry k of each list describes method k,
# so every list must have exactly one entry per method.
conda_envs = ["seurat","bindsc","figr","liger"]
# R script for running each method
method_scripts = ["run_seurat3_single.R",
                  "run_rbindsc_single.R",
                  "run_rfigr_single.R",
                  "run_rliger_single.R"]
# if the script should be run in python environment
# FIX: exactly four entries (the original had a stray fifth `True`)
py_langs = [False]*4

# FIX: added the comma after the rfigr entry. Without it Python concatenates the
# two adjacent string literals into one path, leaving only three entries.
file_paths = ["seurat3/seurat3_result.csv",
              "rbindsc/rbindsc_result.csv",
              "rfigr/rfigr_result.csv",
              "rliger/rliger_result.csv"]

method_keys = ["seurat3","rbindsc","rfigr","rliger"]

# submit all jobs at once
eval_test_all(folder_dir,job_submission_script,eval_script,conda_envs,
              method_scripts,py_langs,file_paths,method_keys,
              ct_ref,nclust,dir_path,cond_key,
              iter_list,repeats,repeat_start=1,
              wait=False, wait_time=2*60,batch=2,output_folder="results_single_mod",
              ncore=8,gp_script=gp_eval_path,gp_truth=gp_truth)

# GLUE 
# ===== unpaired =====
conda_envs = ["glue2"]
# python script for running the method
method_scripts = ["run_glue_hg38_single.py"]
# if the script should be run in python environment 
py_langs = [True]

file_paths = ["glue/glue_result.csv"]

method_keys = ["glue"]

# submit all jobs at once
eval_test_all(folder_dir,job_submission_script,eval_script,conda_envs,
              method_scripts,py_langs,file_paths,method_keys,
              ct_ref,nclust,dir_path,cond_key,
              iter_list,repeats,repeat_start=1,
              wait=False, wait_time=2*60,batch=2,output_folder="results_single_mod",
              ncore=8,gp_script=gp_eval_path,gp_truth=gp_truth)

# ===== unpaired (multiome-split) ===== 
# The lists below are index-aligned: entry k of each list describes method k.
conda_envs = ["seurat","bindsc","figr","liger"]
# R script for running each method
method_scripts = ["run_seurat3.R","run_rbindsc.R","run_rfigr_2.R","run_rliger.R"]
# if the script should be run in python environment
# FIX: exactly four entries (the original had a stray fifth `True`)
py_langs = [False]*4

file_paths = ["seurat3/seurat3_result.csv",
              "rbindsc/rbindsc_result.csv",
              "rfigr/rfigr_result.csv",
              "rliger/rliger_result.csv"]

method_keys = ["seurat3","rbindsc","rfigr","rliger"]

# submit all jobs at once
eval_test_all(folder_dir,job_submission_script,eval_script,conda_envs,
              method_scripts,py_langs,file_paths,method_keys,
              ct_ref,nclust,dir_path,cond_key,
              iter_list,repeats,repeat_start=1,
              wait=False, wait_time=2*60,batch=2,output_folder="results_single_same_cell_number",
              ncore=8,gp_script=gp_eval_path,gp_truth=gp_truth)

# GLUE 
# ===== unpaired (multiome-split) =====
conda_envs = ["glue2"]
# python script for running the method
method_scripts = ["run_glue_hg38.py"]
# if the script should be run in python environment 
py_langs = [True]

file_paths = ["glue/glue_result.csv"]

method_keys = ["glue"]

# submit all jobs at once
# FIX: write to the same output folder as the other multiome-split runs above.
# The original used "results_single_mod", the folder the run_glue_hg38_single.py
# submission also writes glue/glue_result.csv into, so the two GLUE runs collided.
# NOTE(review): confirm this matches the folder layout used for the published results.
eval_test_all(folder_dir,job_submission_script,eval_script,conda_envs,
              method_scripts,py_langs,file_paths,method_keys,
              ct_ref,nclust,dir_path,cond_key,
              iter_list,repeats,repeat_start=1,
              wait=False, wait_time=2*60,batch=2,output_folder="results_single_same_cell_number",
              ncore=8,gp_script=gp_eval_path,gp_truth=gp_truth)


### Add dashed line in peak-gene recovery percentage with the true paired RNA and ATAC profiles

In [None]:
# copy over the true paired RNA-seq data
# For every condition/repeat, build a "truth" entry under the output folder:
# the unpaired ATAC files are copied as-is, and the RNA profiles of the same
# barcodes are pulled from the full PBMC RNA dataset.
import shutil
dir_path = "dataset/multiome_pbmc_10k/pbmc_vary_cell_test/"
cond_key = "nmulti"

iter_list = [1000,3000,8000]
repeats = 5
output_folder="results_single_same_cell_number"

# The full RNA dataset does not depend on the condition or repeat, so read it
# once here instead of once per loop iteration.
in_dir = "dataset/multiome_pbmc_10k/"
adata_rna = ad.read_h5ad(in_dir+"pbmc_10x_rna_public.h5ad")

# (file name under unpaired_ATAC, file name under truth/predicted/ATAC)
atac_files = [("barcodes.tsv","barcodes.tsv"),
              ("peak.tsv","peak.tsv"),
              ("ATAC_counts.mtx","counts.mtx")]

for n_multi in iter_list:
    for j in range(1,repeats+1):
        in_dir_i = os.path.join(dir_path,"{}{}_{}".format(cond_key,n_multi,j))
        out_dir_i = os.path.join(in_dir_i,output_folder)
        truth_atac_dir = os.path.join(out_dir_i,"truth","predicted","ATAC")

        os.makedirs(truth_atac_dir, exist_ok=True)
        # save ATAC to the right location
        for src_name, dst_name in atac_files:
            shutil.copy2(os.path.join(in_dir_i,"unpaired_ATAC",src_name),
                         os.path.join(truth_atac_dir,dst_name))

        # load in ATAC barcodes (one barcode per line, no header)
        atac_bc = pd.read_csv(os.path.join(in_dir_i,"unpaired_ATAC","barcodes.tsv"),header=None)
        idx_sel = atac_bc[0].tolist()

        # get RNA profile of the corresponding cells 
        adata_rna_sel = AnnData(csr_matrix(deepcopy(adata_rna[idx_sel,:].X.todense())),
                               obs=adata_rna.obs.loc[idx_sel,:],
                               var=adata_rna.var,dtype=np.float32)
        adata_rna_sel.obs['rna.bc'] = adata_rna_sel.obs_names
        # save RNA to the right location
        utils_eval.write_adata(adata_rna_sel, os.path.join(out_dir_i,"truth","predicted","RNA"),"RNA","gene",feature_name='feature',transpose=True)



In [None]:
# run eval_missing_modality_prediction on the "truth" entries (PBMC)

# working_dir 
folder_dir="/home/myylee/scmint/methods_eval/"
# bash script for job submission
job_submission_script = "submit_job_per_missing_mod_eval.sh"
# python script for metric evaluation 
eval_script = "run_metric_eval_single.py"

ct_ref = "dataset/multiome_pbmc_10k/pbmc_10x_bc_ct3.csv"
nclust = 7

dir_path = "dataset/multiome_pbmc_10k/pbmc_vary_cell_test/"
cond_key = "nmulti"

iter_list = [1000,3000,8000]
repeats = 5

# passed below as gp_script / gp_truth
gp_eval_path = "eval_missing_modality_prediction_single.R"
gp_truth = "dataset/multiome_pbmc_10k/pbmc_10x_pmat_sig_links_50kb_unique.csv"
# "truth" is the folder name created under the output folder by the copy cell
method_keys = ["truth"]


#----- All of these can be arbitrary strings; they are not used by the script, but must not be empty ----
conda_envs = ["truth_na"]
# placeholder script name (see note above)
method_scripts = ["run_truth.csv"]
# if the script should be run in python environment 
py_langs = [False]
file_paths = ["truth/truth_result.csv"]
#------ END ----- 


eval_test_all(folder_dir,job_submission_script,eval_script,conda_envs,
              method_scripts,py_langs,file_paths,method_keys,
              ct_ref,nclust,dir_path,cond_key,
              iter_list,repeats,repeat_start=1,
              wait=False, wait_time=2*60,batch=2,output_folder="results_single_same_cell_number",
              ncore=8,gp_script=gp_eval_path,gp_truth=gp_truth)





## BMMC

### Data simulation

In [50]:
# Scenario 1 / BMMC: simulate datasets with a varying number of multiome cells.
# using pmat dataset, generated from bmmc_process R script 
source_dir = "dataset/bmmc/"

adata_atac = ad.read_h5ad(source_dir+"multiome_bmmc_site1_or_donor1_ATAC_pmat.h5ad")
adata_rna = ad.read_h5ad(source_dir+"multiome_bmmc_site1_or_donor1_RNA.h5ad")

# expose the existing 'features' column under the name 'feature'
adata_atac.var['feature'] = adata_atac.var['features'] 
adata_rna.var['feature'] = adata_rna.var['features'] 


# boolean mask selecting the cells whose batch is "s1d2"
# NOTE(review): the mask is computed from the ATAC obs and applied to the RNA
# object too -- this assumes both objects list the cells in the same order; confirm.
idx_sel = list(adata_atac.obs['batch'].isin(["s1d2"]))

# Paired - RNA
adata_rna_sel = AnnData(csr_matrix(deepcopy(adata_rna.X[idx_sel,:].todense())),
                       obs=adata_rna.obs.iloc[idx_sel,:],
                       var=adata_rna.var,dtype=np.float32)
# Paired - ATAC
adata_atac_sel = AnnData(csr_matrix(deepcopy(adata_atac.X[idx_sel,:].todense())),
                       obs=adata_atac.obs.iloc[idx_sel,:],
                       var=adata_atac.var,dtype=np.float32)


# directory passed to data_simulation as its first argument
in_dir = "dataset/bmmc/bmmc_vary_cell_test/"
# one condition per entry; here the entries are the multiome cell numbers
iter_list = [1000,2000,4000]
depth_multiome_list = [1]*len(iter_list)
depth_scrna_list = [1]*len(iter_list) 
depth_snatac_list = [1]*len(iter_list)
# single-modality cell numbers are fixed at 1000; only the multiome size varies
n_scrna_list  = [1000]*len(iter_list)
n_snatac_list = [1000]*len(iter_list)
n_multiome_list = iter_list
repeats = 5

# if do not create fragment file symlink, use fragment_path = None 
# fragment file was created using correct_s1d2_fragments_sel.ipynb, where barcode was made the same as stored in adata  
fragment_path = "{}/dataset/bmmc/s1d2_atac_fragments.tsv.gz".format(os.getcwd())
cond_key = "nmulti"

def to_str(s): return(str(s))


data_simulation(in_dir,adata_rna_sel,adata_atac_sel,iter_list,
                depth_multiome_list,depth_scrna_list,depth_snatac_list,
                n_multiome_list,n_scrna_list,n_snatac_list,
                repeats,fragment_path,cond_key,
                to_str, downsample=False)



### Running method + evaluations

In [49]:
# ---- Settings shared by every eval_test_all call in this cell (BMMC) ----
# working_dir 
folder_dir="/home/myylee/scmint/methods_eval/"
# bash script for job submission
job_submission_script = "submit_job_per_condition_n_eval2.sh"
# python script for metric evaluation 
eval_script = "run_metric_eval_single.py"

# presumably the barcode -> cell type reference used for evaluation; confirm against eval_script
ct_ref = "dataset/bmmc/bmmc_all_bc_ct3.csv"
# NOTE(review): looks like the number of clusters used in evaluation -- confirm in eval_test_all
nclust = 21

# passed below as gp_script / gp_truth
gp_eval_path = "eval_missing_modality_prediction_single.R"
gp_truth = "dataset/bmmc/bmmc_s1d2_pmat_all_ct_sig_links_50kb_unique.csv"

# same directory / condition key / conditions / repeats as the BMMC data simulation cell
dir_path = "dataset/bmmc/bmmc_vary_cell_test/"
cond_key = "nmulti"

iter_list = [1000,2000,4000]
repeats = 5


# ===== multiome-guided (python functions) =====
# The lists below are index-aligned: entry k of each list describes method k.

conda_envs = ["multivi","cobolt","scmomat"]
# python script for running the method
method_scripts = ["run_multivi_2.py","run_cobolt.py","run_scmomat_bmmc.py"]
# if the script should be run in python environment 
py_langs = [True]*3

file_paths = ["multivi/multivi_result.csv",
              "cobolt/cobolt_result.csv",
              "scmomat/scmomat_result.csv"]

method_keys = ["multivi","cobolt","scmomat"]


# submit 2 jobs at a time, wait, submit new jobs until the 2 were done
eval_test_all(folder_dir,job_submission_script,eval_script,conda_envs,
              method_scripts,py_langs,file_paths,method_keys,
              ct_ref,nclust,dir_path,cond_key,
              iter_list,repeats,repeat_start=1,
              wait=True, wait_time=2*60,batch=2,output_folder="results_single_mod",
              ncore=8,gp_script=gp_eval_path,gp_truth=gp_truth)


# ===== multiome-guided (R functions) =====

conda_envs = ["seurat"]
# R script for running the method
method_scripts = ["run_seurat4_3.R"]
# if the script should be run in python environment (False: this is an R script)
py_langs = [False]

file_paths = ["seurat4/seurat4_result.csv"]

method_keys = ["seurat4"]

# submit all jobs at once
eval_test_all(folder_dir,job_submission_script,eval_script,conda_envs,
              method_scripts,py_langs,file_paths,method_keys,
              ct_ref,nclust,dir_path,cond_key,
              iter_list,repeats,repeat_start=1,
              wait=False, wait_time=2*60,batch=2,output_folder="results_single_mod",
              ncore=8,gp_script=gp_eval_path,gp_truth=gp_truth)

# ===== unpaired =====
# The lists below are index-aligned: entry k of each list describes method k,
# so every list must have exactly one entry per method.
conda_envs = ["seurat","bindsc","figr","liger"]
# R script for running each method
method_scripts = ["run_seurat3_single.R",
                  "run_rbindsc_single.R",
                  "run_rfigr_single.R",
                  "run_rliger_single.R"]
# if the script should be run in python environment
# FIX: exactly four entries (the original had a stray fifth `True`)
py_langs = [False]*4

# FIX: added the comma after the rfigr entry. Without it Python concatenates the
# two adjacent string literals into one path, leaving only three entries.
file_paths = ["seurat3/seurat3_result.csv",
              "rbindsc/rbindsc_result.csv",
              "rfigr/rfigr_result.csv",
              "rliger/rliger_result.csv"]

method_keys = ["seurat3","rbindsc","rfigr","rliger"]

# submit all jobs at once
eval_test_all(folder_dir,job_submission_script,eval_script,conda_envs,
              method_scripts,py_langs,file_paths,method_keys,
              ct_ref,nclust,dir_path,cond_key,
              iter_list,repeats,repeat_start=1,
              wait=False, wait_time=2*60,batch=2,output_folder="results_single_mod",
              ncore=8,gp_script=gp_eval_path,gp_truth=gp_truth)

# GLUE 
# ===== unpaired =====
conda_envs = ["glue2"]
# python script for running the method
method_scripts = ["run_glue_hg38_single.py"]
# if the script should be run in python environment 
py_langs = [True]

file_paths = ["glue/glue_result.csv"]

method_keys = ["glue"]

# submit all jobs at once
eval_test_all(folder_dir,job_submission_script,eval_script,conda_envs,
              method_scripts,py_langs,file_paths,method_keys,
              ct_ref,nclust,dir_path,cond_key,
              iter_list,repeats,repeat_start=1,
              wait=False, wait_time=2*60,batch=2,output_folder="results_single_mod",
              ncore=8,gp_script=gp_eval_path,gp_truth=gp_truth)

# ===== unpaired (multiome-split) ===== 
# The lists below are index-aligned: entry k of each list describes method k.
conda_envs = ["seurat","bindsc","figr","liger"]
# R script for running each method
method_scripts = ["run_seurat3.R","run_rbindsc.R","run_rfigr_2.R","run_rliger.R"]
# if the script should be run in python environment
# FIX: exactly four entries (the original had a stray fifth `True`)
py_langs = [False]*4

file_paths = ["seurat3/seurat3_result.csv",
              "rbindsc/rbindsc_result.csv",
              "rfigr/rfigr_result.csv",
              "rliger/rliger_result.csv"]

method_keys = ["seurat3","rbindsc","rfigr","rliger"]

# submit all jobs at once
eval_test_all(folder_dir,job_submission_script,eval_script,conda_envs,
              method_scripts,py_langs,file_paths,method_keys,
              ct_ref,nclust,dir_path,cond_key,
              iter_list,repeats,repeat_start=1,
              wait=False, wait_time=2*60,batch=2,output_folder="results_single_same_cell_number",
              ncore=8,gp_script=gp_eval_path,gp_truth=gp_truth)

# GLUE 
# ===== unpaired (multiome-split) =====
conda_envs = ["glue2"]
# python script for running the method
method_scripts = ["run_glue_hg38.py"]
# if the script should be run in python environment 
py_langs = [True]

file_paths = ["glue/glue_result.csv"]

method_keys = ["glue"]

# submit all jobs at once
# FIX: write to the same output folder as the other multiome-split runs above.
# The original used "results_single_mod", the folder the run_glue_hg38_single.py
# submission also writes glue/glue_result.csv into, so the two GLUE runs collided.
# NOTE(review): confirm this matches the folder layout used for the published results.
eval_test_all(folder_dir,job_submission_script,eval_script,conda_envs,
              method_scripts,py_langs,file_paths,method_keys,
              ct_ref,nclust,dir_path,cond_key,
              iter_list,repeats,repeat_start=1,
              wait=False, wait_time=2*60,batch=2,output_folder="results_single_same_cell_number",
              ncore=8,gp_script=gp_eval_path,gp_truth=gp_truth)


### Add dashed line in peak-gene recovery percentage with the true paired RNA and ATAC profiles

In [None]:
# copy over the true paired RNA-seq data
# For every condition/repeat, build a "truth" entry under the output folder:
# the unpaired ATAC files are copied as-is, and the RNA profiles of the same
# barcodes are pulled from the BMMC RNA dataset.
import shutil
dir_path = "dataset/bmmc/bmmc_vary_cell_test/"
cond_key = "nmulti"

iter_list = [1000,2000,4000]
repeats = 5
output_folder="results_single_same_cell_number"

# The RNA dataset and its annotations do not depend on the condition or repeat,
# so load and annotate once here instead of once per loop iteration.
in_dir = "dataset/bmmc/"
adata_rna = ad.read_h5ad(in_dir+"multiome_bmmc_site1_or_donor1_RNA.h5ad")
adata_rna.obs['rna.bc'] = adata_rna.obs_names
adata_rna.var['feature'] = adata_rna.var['features'] 

# (file name under unpaired_ATAC, file name under truth/predicted/ATAC)
atac_files = [("barcodes.tsv","barcodes.tsv"),
              ("peak.tsv","peak.tsv"),
              ("ATAC_counts.mtx","counts.mtx")]

for n_multi in iter_list:
    for j in range(1,repeats+1):
        in_dir_i = os.path.join(dir_path,"{}{}_{}".format(cond_key,n_multi,j))
        out_dir_i = os.path.join(in_dir_i,output_folder)
        truth_atac_dir = os.path.join(out_dir_i,"truth","predicted","ATAC")

        os.makedirs(truth_atac_dir, exist_ok=True)
        # save ATAC to the right location
        for src_name, dst_name in atac_files:
            shutil.copy2(os.path.join(in_dir_i,"unpaired_ATAC",src_name),
                         os.path.join(truth_atac_dir,dst_name))

        # load in ATAC barcodes (one barcode per line, no header)
        atac_bc = pd.read_csv(os.path.join(in_dir_i,"unpaired_ATAC","barcodes.tsv"),header=None)
        idx_sel = atac_bc[0].tolist()

        # get RNA profile of the corresponding cells 
        adata_rna_sel = AnnData(csr_matrix(deepcopy(adata_rna[idx_sel,:].X.todense())),
                               obs=adata_rna.obs.loc[idx_sel,:],
                               var=adata_rna.var,dtype=np.float32)
        # save RNA to the right location
        utils_eval.write_adata(adata_rna_sel, os.path.join(out_dir_i,"truth","predicted","RNA"),"RNA","gene",feature_name='feature',transpose=True)




In [None]:
# run eval_missing_modality_prediction on the "truth" entries (BMMC)

# working_dir 
folder_dir="/home/myylee/scmint/methods_eval/"
# bash script for job submission
job_submission_script = "submit_job_per_missing_mod_eval.sh"
# python script for metric evaluation 
eval_script = "run_metric_eval_single.py"

ct_ref = "dataset/bmmc/bmmc_all_bc_ct3.csv"
nclust = 21

# passed below as gp_script / gp_truth
gp_eval_path = "eval_missing_modality_prediction_single.R"
gp_truth = "dataset/bmmc/bmmc_s1d2_pmat_all_ct_sig_links_50kb_unique.csv"

dir_path = "dataset/bmmc/bmmc_vary_cell_test/"
cond_key = "nmulti"

iter_list = [1000,2000,4000]
repeats = 5

# "truth" is the folder name created under the output folder by the copy cell
method_keys = ["truth"]

#----- All of these can be arbitrary strings; they are not used by the script, but must not be empty ----
conda_envs = ["truth_na"]
# placeholder script name (see note above)
method_scripts = ["run_truth.csv"]
# if the script should be run in python environment 
py_langs = [False]
file_paths = ["truth/truth_result.csv"]
#------ END ----- 


eval_test_all(folder_dir,job_submission_script,eval_script,conda_envs,
              method_scripts,py_langs,file_paths,method_keys,
              ct_ref,nclust,dir_path,cond_key,
              iter_list,repeats,repeat_start=1,
              wait=False, wait_time=2*60,batch=2,output_folder="results_single_same_cell_number",
              ncore=8,gp_script=gp_eval_path,gp_truth=gp_truth)





## SHARE-seq 

### Data simulation

In [None]:
# Scenario 1 / SHARE-seq (mouse skin): simulate datasets with a varying number of multiome cells.
source_dir = "dataset/mouse_skin/"

adata_rna = ad.read_h5ad(source_dir+"mouse_skin_shareseq_rna.h5ad")
adata_atac = ad.read_h5ad(source_dir+"mouse_skin_shareseq_atac.h5ad")

# expose the feature names as a 'feature' column in var
adata_rna.var['feature'] = deepcopy(adata_rna.var_names)
adata_atac.var['feature'] = deepcopy(adata_atac.var_names)

# expose the existing 'celltype' annotation under the name 'ct3'
adata_rna.obs['ct3'] = deepcopy(adata_rna.obs['celltype'])
adata_atac.obs['ct3'] = deepcopy(adata_atac.obs['celltype'])

# FIX: write the cell type table next to the source data. The original used
# in_dir here, before this cell assigns it, so the file went to whatever in_dir
# an earlier cell had left behind (or raised NameError on a fresh kernel). The
# evaluation cell reads it from "dataset/mouse_skin/mouse_skin_shareseq_bc_ct3.csv".
adata_rna.obs['ct3'].to_csv(source_dir+"mouse_skin_shareseq_bc_ct3.csv")

# directory passed to data_simulation as its first argument
in_dir = "dataset/mouse_skin/multiome_ncells_pmat/"

# one condition per entry; here the entries are the multiome cell numbers
iter_list = [5000,10000,15000]

depth_multiome_list = [1]*len(iter_list)
depth_scrna_list = [1]*len(iter_list) 
depth_snatac_list = [1]*len(iter_list)
# single-modality cell numbers are fixed at 8000; only the multiome size varies
n_scrna_list  = [8000]*len(iter_list)
n_snatac_list = [8000]*len(iter_list)
n_multiome_list = iter_list
repeats = 5

# if do not create fragment file symlink, use fragment_path = None 
fragment_path = "{}/dataset/mouse_skin/mouse_skin_shareseq_fragments.tsv.gz".format(os.getcwd())

cond_key = "nmulti"

# formats a fraction as an integer percentage string (not used in this cell)
def str_norm(s):
    return(str(int(s*100)))
    
def to_str(s): return(str(s))

data_simulation(in_dir,adata_rna,adata_atac,iter_list,
                depth_multiome_list,depth_scrna_list,depth_snatac_list,
                n_multiome_list,n_scrna_list,n_snatac_list,
                repeats,fragment_path,cond_key,
                to_str, downsample=False)

### Running method + evaluations

In [None]:
# ---- Settings shared by every eval_test_all call in this cell (SHARE-seq mouse skin) ----
# working_dir 
folder_dir="/home/myylee/scmint/methods_eval/"
# bash script for job submission
job_submission_script = "submit_job_per_condition_n_eval2.sh"
# python script for metric evaluation 
eval_script = "run_metric_eval_single.py"

# cell type table (the 'ct3' column written to CSV in the SHARE-seq data simulation cell)
ct_ref = "dataset/mouse_skin/mouse_skin_shareseq_bc_ct3.csv"
# NOTE(review): looks like the number of clusters used in evaluation -- confirm in eval_test_all
nclust = 22

dir_path = "dataset/mouse_skin/multiome_ncells_pmat/"
cond_key = "nmulti"

# same conditions as the data simulation cell, listed largest first
iter_list =  [15000,10000,5000]
repeats = 5

# passed below as gp_script / gp_truth
gp_eval_path = "eval_missing_modality_prediction_single_mm10.R"
gp_truth = "dataset/mouse_skin/mouse_skin_shareseq_pmat_all_ct_sig_links_50kb_unique.csv"


# ===== multiome-guided (python functions) =====
# The lists below are index-aligned: entry k of each list describes method k.

conda_envs = ["multivi","cobolt","scmomat"]
# python script for running the method
method_scripts = ["run_multivi_2.py","run_cobolt.py","run_scmomat_mouse_skin.py"]
# if the script should be run in python environment 
py_langs = [True]*3

file_paths = ["multivi/multivi_result.csv",
              "cobolt/cobolt_result.csv",
              "scmomat/scmomat_result.csv"]

method_keys = ["multivi","cobolt","scmomat"]


# submit 2 jobs at a time, wait, submit new jobs until the 2 were done
eval_test_all(folder_dir,job_submission_script,eval_script,conda_envs,
              method_scripts,py_langs,file_paths,method_keys,
              ct_ref,nclust,dir_path,cond_key,
              iter_list,repeats,repeat_start=1,
              wait=True, wait_time=2*60,batch=2,output_folder="results_single_mod",
              ncore=8,gp_script=gp_eval_path,gp_truth=gp_truth)


# ===== multiome-guided (R functions) =====

conda_envs = ["seurat"]
# R script for running the method
method_scripts = ["run_seurat4_3.R"]
# if the script should be run in python environment (False: this is an R script)
py_langs = [False]

file_paths = ["seurat4/seurat4_result.csv"]

method_keys = ["seurat4"]

# submit all jobs at once
eval_test_all(folder_dir,job_submission_script,eval_script,conda_envs,
              method_scripts,py_langs,file_paths,method_keys,
              ct_ref,nclust,dir_path,cond_key,
              iter_list,repeats,repeat_start=1,
              wait=False, wait_time=2*60,batch=2,output_folder="results_single_mod",
              ncore=8,gp_script=gp_eval_path,gp_truth=gp_truth)

# ===== unpaired =====
# The lists below are index-aligned: entry k of each list describes method k,
# so every list must have exactly one entry per method.
conda_envs = ["seurat","bindsc","figr","liger"]
# R script for running each method
method_scripts = ["run_seurat3_single.R",
                  "run_rbindsc_single.R",
                  "run_rfigr_single.R",
                  "run_rliger_single.R"]
# if the script should be run in python environment
# FIX: exactly four entries (the original had a stray fifth `True`)
py_langs = [False]*4

# FIX: added the comma after the rfigr entry. Without it Python concatenates the
# two adjacent string literals into one path, leaving only three entries.
file_paths = ["seurat3/seurat3_result.csv",
              "rbindsc/rbindsc_result.csv",
              "rfigr/rfigr_result.csv",
              "rliger/rliger_result.csv"]

method_keys = ["seurat3","rbindsc","rfigr","rliger"]

# submit all jobs at once
eval_test_all(folder_dir,job_submission_script,eval_script,conda_envs,
              method_scripts,py_langs,file_paths,method_keys,
              ct_ref,nclust,dir_path,cond_key,
              iter_list,repeats,repeat_start=1,
              wait=False, wait_time=2*60,batch=2,output_folder="results_single_mod",
              ncore=8,gp_script=gp_eval_path,gp_truth=gp_truth)

# GLUE 
# ===== unpaired =====
conda_envs = ["glue2"]
# python script for running the method
method_scripts = ["run_glue_mm10_single.py"]
# if the script should be run in python environment 
py_langs = [True]

file_paths = ["glue/glue_result.csv"]

method_keys = ["glue"]

# submit all jobs at once
eval_test_all(folder_dir,job_submission_script,eval_script,conda_envs,
              method_scripts,py_langs,file_paths,method_keys,
              ct_ref,nclust,dir_path,cond_key,
              iter_list,repeats,repeat_start=1,
              wait=False, wait_time=2*60,batch=2,output_folder="results_single_mod",
              ncore=8,gp_script=gp_eval_path,gp_truth=gp_truth)

# ===== unpaired (multiome-split) ===== 
# The lists below are index-aligned: entry k of each list describes method k.
conda_envs = ["seurat","bindsc","figr","liger"]
# R script for running each method
method_scripts = ["run_seurat3.R","run_rbindsc.R","run_rfigr_2.R","run_rliger.R"]
# if the script should be run in python environment
# FIX: exactly four entries (the original had a stray fifth `True`)
py_langs = [False]*4

file_paths = ["seurat3/seurat3_result.csv",
              "rbindsc/rbindsc_result.csv",
              "rfigr/rfigr_result.csv",
              "rliger/rliger_result.csv"]

method_keys = ["seurat3","rbindsc","rfigr","rliger"]

# submit all jobs at once
eval_test_all(folder_dir,job_submission_script,eval_script,conda_envs,
              method_scripts,py_langs,file_paths,method_keys,
              ct_ref,nclust,dir_path,cond_key,
              iter_list,repeats,repeat_start=1,
              wait=False, wait_time=2*60,batch=2,output_folder="results_single_same_cell_number",
              ncore=8,gp_script=gp_eval_path,gp_truth=gp_truth)

# GLUE 
# ===== unpaired (multiome-split) =====
conda_envs = ["glue2"]
# python script for running the method
method_scripts = ["run_glue_mm10.py"]
# if the script should be run in python environment 
py_langs = [True]

file_paths = ["glue/glue_result.csv"]

method_keys = ["glue"]

# submit all jobs at once
# FIX: write to the same output folder as the other multiome-split runs above.
# The original used "results_single_mod", the folder the run_glue_mm10_single.py
# submission also writes glue/glue_result.csv into, so the two GLUE runs collided.
# NOTE(review): confirm this matches the folder layout used for the published results.
eval_test_all(folder_dir,job_submission_script,eval_script,conda_envs,
              method_scripts,py_langs,file_paths,method_keys,
              ct_ref,nclust,dir_path,cond_key,
              iter_list,repeats,repeat_start=1,
              wait=False, wait_time=2*60,batch=2,output_folder="results_single_same_cell_number",
              ncore=8,gp_script=gp_eval_path,gp_truth=gp_truth)


### Add dashed line in peak-gene recovery percentage with the true paired RNA and ATAC profiles 

In [None]:
import shutil
dir_path = "dataset/mouse_skin/multiome_ncells_pmat/"
cond_key = "nmulti"

iter_list = [5000]
repeats = 5
output_folder="results_single_same_cell_number"

# The full RNA object is identical for every condition/repeat, so it is read
# once here rather than once per loop iteration.
in_dir = "dataset/mouse_skin/"
adata_rna = ad.read_h5ad(in_dir+"mouse_skin_shareseq_rna.h5ad")

# (file name under <run>/unpaired_ATAC, file name under <out>/truth/predicted/ATAC)
atac_files = [("barcodes.tsv","barcodes.tsv"),
              ("peak.tsv","peak.tsv"),
              ("ATAC_counts.mtx","counts.mtx")]

for cond_value in iter_list:
    for j in range(1,repeats+1):
        # one simulated run lives in <dir_path>/<cond_key><value>_<repeat>
        in_dir_i = os.path.join(dir_path,"{}{}_{}".format(cond_key,cond_value,j))
        out_dir_i = os.path.join(in_dir_i,output_folder)

        atac_out_dir = os.path.join(out_dir_i,"truth","predicted","ATAC")
        os.makedirs(atac_out_dir, exist_ok=True)
        # save ATAC to the right location
        for src_name, dst_name in atac_files:
            shutil.copy2(os.path.join(in_dir_i,"unpaired_ATAC",src_name),
                         os.path.join(atac_out_dir,dst_name))

        # load in ATAC barcodes (no header row; the first column holds the barcodes)
        atac_bc = pd.read_csv(os.path.join(in_dir_i,"unpaired_ATAC","barcodes.tsv"),header=None)

        # get RNA profile of the corresponding cells 
        idx_sel = atac_bc[0].tolist()
        # RNA
        adata_rna_sel = AnnData(csr_matrix(deepcopy(adata_rna[idx_sel,:].X.todense())),
                               obs=adata_rna.obs.loc[idx_sel,:],
                               var=adata_rna.var,dtype=np.float32)
        adata_rna_sel.obs['rna.bc'] = adata_rna_sel.obs_names
        adata_rna_sel.var['feature'] = deepcopy(adata_rna_sel.var_names)

        # save RNA to the right location
        utils_eval.write_adata(adata_rna_sel, os.path.join(out_dir_i,"truth","predicted","RNA"),"RNA","gene",feature_name='feature',transpose=True)



In [None]:
# run eval_missing_modality_prediction on the true paired profiles

# working_dir 
folder_dir="/home/myylee/scmint/methods_eval/"
# bash script for job submission
job_submission_script = "submit_job_per_missing_mod_eval.sh"
# python script for metric evaluation 
eval_script = "run_metric_eval_single.py"

ct_ref = "dataset/mouse_skin/mouse_skin_shareseq_bc_ct3.csv"
nclust = 22

dir_path = "dataset/mouse_skin/multiome_ncells_pmat/"
cond_key = "nmulti"

iter_list = [5000]
repeats = 5

gp_eval_path = "eval_missing_modality_prediction_single_mm10.R"
gp_truth = "dataset/mouse_skin/mouse_skin_shareseq_pmat_all_ct_sig_links_50kb_unique.csv"

method_keys = ["truth"]

#----- All of these can be random strings, are not used in script, but need not to be empty strings ----
conda_envs = ["truth_na"]
# placeholder for the method script
method_scripts = ["run_truth.csv"]
# if the script should be run in python environment 
py_langs = [False]
file_paths = ["truth/truth_result.csv"]
#------ END ----- 


# Start from repeat 1 so that every repeat (1..repeats) is evaluated; this
# matches the other eval_test_all calls in this file.
eval_test_all(folder_dir,job_submission_script,eval_script,conda_envs,
              method_scripts,py_langs,file_paths,method_keys,
              ct_ref,nclust,dir_path,cond_key,
              iter_list,repeats,repeat_start=1,
              wait=False, wait_time=2*60,batch=2,output_folder="results_single_same_cell_number",
              ncore=8,gp_script=gp_eval_path,gp_truth=gp_truth)





# Scenario 2 

## PBMC 2000 cells 

### Data simulation

In [None]:
# RNA and ATAC objects of the PBMC multiome dataset are read from source_dir.
source_dir = "dataset/multiome_pbmc_10k/"
adata_rna, adata_atac = (
    ad.read_h5ad(source_dir+"pbmc_10x_rna_public.h5ad"),
    ad.read_h5ad(source_dir+"pbmc_10x_atac_public.h5ad"),
)

# Folder handed to data_simulation (the evaluation cell uses the same path as
# dir_path) and the key that tags each condition.
in_dir = "dataset/multiome_pbmc_10k/nmulti2000_7ct_vdepth_test/"
cond_key = "depthmulti"
repeats = 5

# Only the multiome depth changes between conditions; all other settings are
# repeated once per condition.
iter_list = [0.25,0.5,0.75,1]
depth_multiome_list = iter_list
depth_scrna_list = [1 for _ in iter_list]
depth_snatac_list = [1 for _ in iter_list]
n_multiome_list = [2000 for _ in iter_list]
n_scrna_list = [1000 for _ in iter_list]
n_snatac_list = [1000 for _ in iter_list]

# if do not create fragment file symlink, use fragment_path = None 
fragment_path = "{}/dataset/multiome_pbmc_10k/pbmc_granulocyte_sorted_10k_atac_fragments.tsv.gz".format(os.getcwd())

def str_norm(s):
    """Return the fraction `s` as an integer-percentage string (0.25 -> "25").

    The product is rounded before conversion because `s*100` is a float and
    can land just below the intended integer (e.g. 0.29*100 is
    28.999999999999996), which plain `int()` would truncate to the wrong label.
    """
    return str(int(round(s*100)))

# Simulate from the PBMC objects loaded in this cell. The call previously passed
# adata_rna_sel / adata_atac_sel, which this cell never defines, so it silently
# reused whatever objects an earlier cell had left behind.
# NOTE(review): if the PBMC objects need a cell/feature selection step before
# simulation, add it to this cell and pass the selected objects instead.
data_simulation(in_dir,adata_rna,adata_atac,iter_list,
                depth_multiome_list,depth_scrna_list,depth_snatac_list,
                n_multiome_list,n_scrna_list,n_snatac_list,
                repeats,fragment_path,cond_key,
                str_norm, downsample=True)



### Running method + evaluations

In [51]:
# working_dir 
folder_dir="/home/myylee/scmint/methods_eval/"
# bash script for job submission
job_submission_script = "submit_job_per_condition_n_eval2.sh"
# python script for metric evaluation 
eval_script = "run_metric_eval_fair.py"

ct_ref = "dataset/multiome_pbmc_10k/pbmc_10x_bc_ct3.csv"
nclust = 7

gp_eval_path = "eval_missing_modality_prediction_single.R"
gp_truth = "dataset/multiome_pbmc_10k/pbmc_10x_pmat_sig_links_50kb_unique.csv"

dir_path = "dataset/multiome_pbmc_10k/nmulti2000_7ct_vdepth_test/"
cond_key = "depthmulti"

iter_list = [25,50,75,100]
repeats = 5

# One tuple per submission:
#   (conda_envs, method_scripts, py_langs, file_paths, method_keys, wait, batch)
# The first five members are parallel lists with one entry per method.
_method_groups = [
    # ===== multiome-guided (python functions) =====
    # submit 2 jobs at a time, wait, submit new jobs until the 2 were done
    # ("scmomat" is not listed here: it has no script/key in this group and is
    #  submitted in the last group below)
    (["multivi","cobolt"],
     ["run_multivi_2.py","run_cobolt.py"],
     [True]*2,
     ["multivi/multivi_result.csv",
      "cobolt/cobolt_result.csv"],
     ["multivi","cobolt"],
     True, 2),
    # ===== multiome-guided (R functions) =====
    # submit all jobs at once
    (["seurat"],
     ["run_seurat4_3.R"],
     [False],
     ["seurat4/seurat4_result.csv"],
     ["seurat4"],
     False, 2),
    # ===== unpaired (multiome-split) =====
    # submit all jobs at once
    (["seurat","bindsc","figr","liger"],
     ["run_seurat3.R","run_rbindsc.R","run_rfigr_2.R","run_rliger.R"],
     [False]*4,
     ["seurat3/seurat3_result.csv",
      "rbindsc/rbindsc_result.csv",
      "rfigr/rfigr_result.csv",
      "rliger/rliger_result.csv"],
     ["seurat3","rbindsc","rfigr","rliger"],
     False, 2),
    # Two additional methods
    (["glue2","scmomat"],
     ["run_glue_hg38.py","run_scmomat_pbmc.py"],
     [True]*2,
     ["glue/glue_result.csv","scmomat/scmomat_result.csv"],
     ["glue","scmomat"],
     True, 1),
]

# Submissions are issued in the order listed above.
for (conda_envs, method_scripts, py_langs, file_paths, method_keys,
     _wait, _batch) in _method_groups:
    eval_test_all(folder_dir,job_submission_script,eval_script,conda_envs,
                  method_scripts,py_langs,file_paths,method_keys,
                  ct_ref,nclust,dir_path,cond_key,
                  iter_list,repeats,repeat_start=1,
                  wait=_wait, wait_time=2*60,batch=_batch,output_folder="results",
                  ncore=8,gp_script=gp_eval_path,gp_truth=gp_truth)



## BMMC 2000 cells 

### Data simulation

In [56]:
# using pmat dataset, generated from bmmc_process R script 
in_dir = "dataset/bmmc/"

adata_atac = ad.read_h5ad(in_dir+"multiome_bmmc_site1_or_donor1_ATAC_pmat.h5ad")
adata_rna = ad.read_h5ad(in_dir+"multiome_bmmc_site1_or_donor1_RNA.h5ad")

# expose the 'features' column under the name 'feature' as well
adata_atac.var['feature'] = adata_atac.var['features'] 
adata_rna.var['feature'] = adata_rna.var['features'] 

# aggregate the selected donors (boolean mask over cells, taken from the ATAC obs)
idx_sel = list(adata_atac.obs['batch'].isin(["s1d2"]))

# Paired - RNA
adata_rna_sel = AnnData(csr_matrix(deepcopy(adata_rna.X[idx_sel,:].todense())),
                       obs=adata_rna.obs.iloc[idx_sel,:],
                       var=adata_rna.var,dtype=np.float32)
# Paired - ATAC
adata_atac_sel = AnnData(csr_matrix(deepcopy(adata_atac.X[idx_sel,:].todense())),
                       obs=adata_atac.obs.iloc[idx_sel,:],
                       var=adata_atac.var,dtype=np.float32)

# ensure that var_names are feature names and obs_names are cell barcodes
adata_rna_sel.var_names = adata_rna.var['feature'].tolist()
adata_atac_sel.var_names = adata_atac.var['feature'].tolist()

in_dir = "dataset/bmmc/nmulti2000_21ct_vdepth_test/"
# Full set of depth conditions and repeats. A reduced debugging setting
# ([0.25] with 2 repeats) would not produce the 25/50/75/100 x 5-repeat
# layout that the evaluation cell for this folder iterates over.
iter_list = [0.25,0.5,0.75,1]
depth_multiome_list = iter_list
depth_scrna_list = [1]*len(iter_list) 
depth_snatac_list = [1]*len(iter_list)
n_scrna_list  = [1000]*len(iter_list)
n_snatac_list = [1000]*len(iter_list)
n_multiome_list = [2000]*len(iter_list)
repeats = 5


# if do not create fragment file symlink, use fragment_path = None 
# fragment file was created using correct_s1d2_fragments_sel.ipynb, where barcode was made the same as stored in adata  
fragment_path = "{}/dataset/bmmc/s1d2_atac_fragments.tsv.gz".format(os.getcwd())
cond_key = "depthmulti"

def str_norm(s):
    """Return the fraction `s` as an integer-percentage string (0.25 -> "25").

    The product is rounded before conversion because `s*100` is a float and
    can land just below the intended integer (e.g. 0.29*100 is
    28.999999999999996), which plain `int()` would truncate to the wrong label.
    """
    return str(int(round(s*100)))

# Run the simulation with the settings defined in this cell; str_norm is the
# function that turns each iter_list value into a string.
data_simulation(
    in_dir,
    adata_rna_sel,
    adata_atac_sel,
    iter_list,
    depth_multiome_list,
    depth_scrna_list,
    depth_snatac_list,
    n_multiome_list,
    n_scrna_list,
    n_snatac_list,
    repeats,
    fragment_path,
    cond_key,
    str_norm,
    downsample=True,
)




### Running method + evaluations

In [None]:
# ---- settings shared by every submission in this cell ----
folder_dir = "/home/myylee/scmint/methods_eval/"  # working_dir
job_submission_script = "submit_job_per_condition_n_eval2.sh"  # bash script for job submission
eval_script = "run_metric_eval_fair.py"  # python script for metric evaluation

ct_ref = "dataset/bmmc/bmmc_all_bc_ct3.csv"
nclust = 21

# passed to eval_test_all as gp_script / gp_truth
gp_eval_path = "eval_missing_modality_prediction_single.R"
gp_truth = "dataset/bmmc/bmmc_s1d2_pmat_all_ct_sig_links_50kb_unique.csv"

dir_path = "dataset/bmmc/nmulti2000_21ct_vdepth_test/"
cond_key = "depthmulti"

iter_list = [25, 50, 75, 100]
repeats = 5

# One tuple per submission:
#   (conda_envs, method_scripts, py_langs, file_paths, method_keys, wait, batch)
# The first five members are parallel lists with one entry per method.
_method_groups = [
    # ===== multiome-guided (python functions) =====
    # submit 2 jobs at a time, wait, submit new jobs until the 2 were done
    (["multivi", "cobolt"],
     ["run_multivi_2.py", "run_cobolt.py"],
     [True, True],
     ["multivi/multivi_result.csv", "cobolt/cobolt_result.csv"],
     ["multivi", "cobolt"],
     True, 2),
    # ===== multiome-guided (R functions) =====
    # submit all jobs at once
    (["seurat"],
     ["run_seurat4_3.R"],
     [False],
     ["seurat4/seurat4_result.csv"],
     ["seurat4"],
     False, 2),
    # ===== unpaired (multiome-split) =====
    # submit all jobs at once
    (["seurat", "bindsc", "figr", "liger"],
     ["run_seurat3.R", "run_rbindsc.R", "run_rfigr_2.R", "run_rliger.R"],
     [False, False, False, False],
     ["seurat3/seurat3_result.csv",
      "rbindsc/rbindsc_result.csv",
      "rfigr/rfigr_result.csv",
      "rliger/rliger_result.csv"],
     ["seurat3", "rbindsc", "rfigr", "rliger"],
     False, 2),
    # Two additional methods
    (["glue2", "scmomat"],
     ["run_glue_hg38.py", "run_scmomat_bmmc.py"],
     [True, True],
     ["glue/glue_result.csv", "scmomat/scmomat_result.csv"],
     ["glue", "scmomat"],
     True, 1),
]

# Submissions are issued in the order listed above.
for (conda_envs, method_scripts, py_langs, file_paths, method_keys,
     _wait, _batch) in _method_groups:
    eval_test_all(folder_dir, job_submission_script, eval_script, conda_envs,
                  method_scripts, py_langs, file_paths, method_keys,
                  ct_ref, nclust, dir_path, cond_key,
                  iter_list, repeats, repeat_start=1,
                  wait=_wait, wait_time=2*60, batch=_batch, output_folder="results",
                  ncore=8, gp_script=gp_eval_path, gp_truth=gp_truth)



## BMMC 4000 cells 

### Data simulation

In [None]:
def _subset_cells(adata, keep):
    """Build a float32 AnnData holding only the cells flagged in `keep`.

    `keep` is a boolean list over the rows of `adata`; var is reused unchanged.
    """
    counts = csr_matrix(deepcopy(adata.X[keep,:].todense()))
    return AnnData(counts, obs=adata.obs.iloc[keep,:], var=adata.var, dtype=np.float32)


# using pmat dataset, generated from bmmc_process R script 
in_dir = "dataset/bmmc/"
adata_rna = ad.read_h5ad(in_dir+"multiome_bmmc_site1_or_donor1_RNA.h5ad")
adata_atac = ad.read_h5ad(in_dir+"multiome_bmmc_site1_or_donor1_ATAC_pmat.h5ad")

# expose the 'features' column under the name 'feature' as well
adata_rna.var['feature'] = adata_rna.var['features']
adata_atac.var['feature'] = adata_atac.var['features']

# aggregate the selected donors (boolean mask over cells, taken from the ATAC obs)
idx_sel = list(adata_atac.obs['batch'].isin(["s1d2"]))

# Paired RNA and ATAC profiles of the selected cells
adata_rna_sel = _subset_cells(adata_rna, idx_sel)
adata_atac_sel = _subset_cells(adata_atac, idx_sel)

# ensure that var_names are feature names and obs_names are cell barcodes
adata_rna_sel.var_names = adata_rna.var['feature'].tolist()
adata_atac_sel.var_names = adata_atac.var['feature'].tolist()

# Simulation settings: only the multiome depth varies across conditions.
in_dir = "dataset/bmmc/nmulti4000_21ct_vdepth_test/"
cond_key = "depthmulti"
repeats = 5
iter_list = [0.25,0.5,0.75,1]
depth_multiome_list = iter_list
depth_scrna_list = [1 for _ in iter_list]
depth_snatac_list = [1 for _ in iter_list]
n_multiome_list = [4000 for _ in iter_list]
n_scrna_list = [1000 for _ in iter_list]
n_snatac_list = [1000 for _ in iter_list]

# if do not create fragment file symlink, use fragment_path = None 
# fragment file was created using correct_s1d2_fragments_sel.ipynb, where barcode was made the same as stored in adata  
fragment_path = "{}/dataset/bmmc/s1d2_atac_fragments.tsv.gz".format(os.getcwd())

def str_norm(s):
    """Return the fraction `s` as an integer-percentage string (0.25 -> "25").

    The product is rounded before conversion because `s*100` is a float and
    can land just below the intended integer (e.g. 0.29*100 is
    28.999999999999996), which plain `int()` would truncate to the wrong label.
    """
    return str(int(round(s*100)))

# Run the simulation with the settings defined in this cell; str_norm is the
# function that turns each iter_list value into a string.
data_simulation(
    in_dir,
    adata_rna_sel,
    adata_atac_sel,
    iter_list,
    depth_multiome_list,
    depth_scrna_list,
    depth_snatac_list,
    n_multiome_list,
    n_scrna_list,
    n_snatac_list,
    repeats,
    fragment_path,
    cond_key,
    str_norm,
    downsample=True,
)




### Running method + evaluations

In [None]:
# ---- settings shared by every submission in this cell ----
folder_dir = "/home/myylee/scmint/methods_eval/"  # working_dir
job_submission_script = "submit_job_per_condition_n_eval2.sh"  # bash script for job submission
eval_script = "run_metric_eval_fair.py"  # python script for metric evaluation

ct_ref = "dataset/bmmc/bmmc_all_bc_ct3.csv"
nclust = 21

# passed to eval_test_all as gp_script / gp_truth
gp_eval_path = "eval_missing_modality_prediction_single.R"
gp_truth = "dataset/bmmc/bmmc_s1d2_pmat_all_ct_sig_links_50kb_unique.csv"

dir_path = "dataset/bmmc/nmulti4000_21ct_vdepth_test/"
cond_key = "depthmulti"

iter_list = [25, 50, 75, 100]
repeats = 5

# One tuple per submission:
#   (conda_envs, method_scripts, py_langs, file_paths, method_keys, wait, batch)
# The first five members are parallel lists with one entry per method.
_method_groups = [
    # ===== multiome-guided (python functions) =====
    # submit 2 jobs at a time, wait, submit new jobs until the 2 were done
    (["multivi", "cobolt"],
     ["run_multivi_2.py", "run_cobolt.py"],
     [True, True],
     ["multivi/multivi_result.csv", "cobolt/cobolt_result.csv"],
     ["multivi", "cobolt"],
     True, 2),
    # ===== multiome-guided (R functions) =====
    # submit all jobs at once
    (["seurat"],
     ["run_seurat4_3.R"],
     [False],
     ["seurat4/seurat4_result.csv"],
     ["seurat4"],
     False, 2),
    # ===== unpaired (multiome-split) =====
    # submit all jobs at once
    (["seurat", "bindsc", "figr", "liger"],
     ["run_seurat3.R", "run_rbindsc.R", "run_rfigr_2.R", "run_rliger.R"],
     [False, False, False, False],
     ["seurat3/seurat3_result.csv",
      "rbindsc/rbindsc_result.csv",
      "rfigr/rfigr_result.csv",
      "rliger/rliger_result.csv"],
     ["seurat3", "rbindsc", "rfigr", "rliger"],
     False, 2),
    # Two additional methods
    (["glue2", "scmomat"],
     ["run_glue_hg38.py", "run_scmomat_bmmc.py"],
     [True, True],
     ["glue/glue_result.csv", "scmomat/scmomat_result.csv"],
     ["glue", "scmomat"],
     True, 1),
]

# Submissions are issued in the order listed above.
for (conda_envs, method_scripts, py_langs, file_paths, method_keys,
     _wait, _batch) in _method_groups:
    eval_test_all(folder_dir, job_submission_script, eval_script, conda_envs,
                  method_scripts, py_langs, file_paths, method_keys,
                  ct_ref, nclust, dir_path, cond_key,
                  iter_list, repeats, repeat_start=1,
                  wait=_wait, wait_time=2*60, batch=_batch, output_folder="results",
                  ncore=8, gp_script=gp_eval_path, gp_truth=gp_truth)


## BMMC Increasing number of cells; 100% depth

### Data simulation

In [None]:
def _subset_cells(adata, keep):
    """Build a float32 AnnData holding only the cells flagged in `keep`.

    `keep` is a boolean list over the rows of `adata`; var is reused unchanged.
    """
    counts = csr_matrix(deepcopy(adata.X[keep,:].todense()))
    return AnnData(counts, obs=adata.obs.iloc[keep,:], var=adata.var, dtype=np.float32)


source_dir = "dataset/bmmc/"
adata_rna = ad.read_h5ad(source_dir+"multiome_bmmc_site1_or_donor1_RNA.h5ad")
adata_atac = ad.read_h5ad(source_dir+"multiome_bmmc_site1_or_donor1_ATAC_pmat.h5ad")

# expose the 'features' column under the name 'feature' as well
adata_rna.var['feature'] = adata_rna.var['features']
adata_atac.var['feature'] = adata_atac.var['features']

# aggregate the selected donors (boolean mask over cells, taken from the ATAC obs)
idx_sel = list(adata_atac.obs['batch'].isin(["s1d2"]))

# Paired RNA and ATAC profiles of the selected cells
adata_rna_sel = _subset_cells(adata_rna, idx_sel)
adata_atac_sel = _subset_cells(adata_atac, idx_sel)

# ensure that var_names are feature names and obs_names are cell barcodes
adata_rna_sel.var_names = adata_rna.var['feature'].tolist()
adata_atac_sel.var_names = adata_atac.var['feature'].tolist()

# Simulation settings: only the number of multiome cells varies across conditions.
in_dir = "dataset/bmmc/bmmc_vcells_intervals/"
cond_key = "nmulti"
repeats = 5
# ncell increase interval, each interval: increase by 400 cells. 400 cells of
# 10k atac fragment and 2500 nCounts_RNA is roughly equal to 10% increment in
# ATAC and RNA sequencing depth, assuming 4000 cells
iter_list = [int(x) for x in np.linspace(start=1000, stop=1000+400*10, num=11)]
n_multiome_list = iter_list
depth_multiome_list = [1 for _ in iter_list]
depth_scrna_list = [1 for _ in iter_list]
depth_snatac_list = [1 for _ in iter_list]
n_scrna_list = [1000 for _ in iter_list]
n_snatac_list = [1000 for _ in iter_list]

# if do not create fragment file symlink, use fragment_path = None 
# fragment file was created using correct_s1d2_fragments_sel.ipynb, where barcode was made the same as stored in adata  
fragment_path = "{}/dataset/bmmc/s1d2_atac_fragments.tsv.gz".format(os.getcwd())
    
def to_str(s):
    """Return `s` converted with the built-in str()."""
    text = str(s)
    return text


# Run the simulation with the settings defined in this cell; to_str is the
# function that turns each iter_list value into a string. No downsampling here.
data_simulation(
    in_dir,
    adata_rna_sel,
    adata_atac_sel,
    iter_list,
    depth_multiome_list,
    depth_scrna_list,
    depth_snatac_list,
    n_multiome_list,
    n_scrna_list,
    n_snatac_list,
    repeats,
    fragment_path,
    cond_key,
    to_str,
    downsample=False,
)



### Running method + evaluations

In [None]:
# working_dir 
folder_dir="/home/myylee/scmint/methods_eval/"
# bash script for job submission
job_submission_script = "submit_job_per_condition_n_eval2.sh"
# python script for metric evaluation 
eval_script = "run_metric_eval_fair.py"

ct_ref = "dataset/bmmc/bmmc_all_bc_ct3.csv"
nclust = 21

gp_eval_path = "eval_missing_modality_prediction_single.R"
gp_truth = "dataset/bmmc/bmmc_s1d2_pmat_all_ct_sig_links_50kb_unique.csv"

dir_path = "dataset/bmmc/bmmc_vcells_intervals/"
# Must match the key used when this folder was simulated: the
# bmmc_vcells_intervals data vary the number of multiome cells and were
# generated with cond_key "nmulti", not "depthmulti".
cond_key = "nmulti"

# ncell increase interval, each interval: increase by 400 cells. 400 cells of
# 10k atac fragment and 2500 nCounts_RNA is roughly equal to 10% increment in
# ATAC and RNA sequencing depth, assuming 4000 cells
iter_list = [int(x) for x in np.linspace(start=1000, stop=1000+400*10, num=11)]
repeats = 5

# One tuple per submission:
#   (conda_envs, method_scripts, py_langs, file_paths, method_keys)
# Both groups run R scripts in the "seurat" environment and submit all jobs at once.
_method_groups = [
    # ===== Seurat v4 ====
    (["seurat"], ["run_seurat4_3.R"], [False],
     ["seurat4/seurat4_result.csv"], ["seurat4"]),
    # ===== Seurat v3 =====
    (["seurat"], ["run_seurat3.R"], [False],
     ["seurat3/seurat3_result.csv"], ["seurat3"]),
]

for conda_envs, method_scripts, py_langs, file_paths, method_keys in _method_groups:
    eval_test_all(folder_dir,job_submission_script,eval_script,conda_envs,
                  method_scripts,py_langs,file_paths,method_keys,
                  ct_ref,nclust,dir_path,cond_key,
                  iter_list,repeats,repeat_start=1,
                  wait=False, wait_time=2*60,batch=2,output_folder="results",
                  ncore=8,gp_script=gp_eval_path,gp_truth=gp_truth)



## BMMC Increasing depth; 4000 cells

### Data simulation

In [None]:
def _subset_cells(adata, keep):
    """Build a float32 AnnData holding only the cells flagged in `keep`.

    `keep` is a boolean list over the rows of `adata`; var is reused unchanged.
    """
    counts = csr_matrix(deepcopy(adata.X[keep,:].todense()))
    return AnnData(counts, obs=adata.obs.iloc[keep,:], var=adata.var, dtype=np.float32)


source_dir = "dataset/bmmc/"
adata_rna = ad.read_h5ad(source_dir+"multiome_bmmc_site1_or_donor1_RNA.h5ad")
adata_atac = ad.read_h5ad(source_dir+"multiome_bmmc_site1_or_donor1_ATAC_pmat.h5ad")

# expose the 'features' column under the name 'feature' as well
adata_rna.var['feature'] = adata_rna.var['features']
adata_atac.var['feature'] = adata_atac.var['features']

# aggregate the selected donors (boolean mask over cells, taken from the ATAC obs)
idx_sel = list(adata_atac.obs['batch'].isin(["s1d2"]))

# Paired RNA and ATAC profiles of the selected cells
adata_rna_sel = _subset_cells(adata_rna, idx_sel)
adata_atac_sel = _subset_cells(adata_atac, idx_sel)

# ensure that var_names are feature names and obs_names are cell barcodes
adata_rna_sel.var_names = adata_rna.var['feature'].tolist()
adata_atac_sel.var_names = adata_atac.var['feature'].tolist()

# Simulation settings: multiome depth goes from 10% to 100% in ten steps.
# NOTE(review): an earlier comment said results are saved under
# /project/mingyaolpc/ to avoid running out of storage, but in_dir below is a
# relative path — confirm where this folder actually lives.
in_dir = "dataset/bmmc/bmmc_vdepth_intervals/"
cond_key = "depthmulti"
repeats = 5
iter_list = np.linspace(start=10, stop=100, num=10)/100
depth_multiome_list = iter_list
depth_scrna_list = [1 for _ in iter_list]
depth_snatac_list = [1 for _ in iter_list]
n_multiome_list = [4000 for _ in iter_list]
n_scrna_list = [1000 for _ in iter_list]
n_snatac_list = [1000 for _ in iter_list]

# if do not create fragment file symlink, use fragment_path = None 
# fragment file was created using correct_s1d2_fragments_sel.ipynb, where barcode was made the same as stored in adata  
fragment_path = "{}/dataset/bmmc/s1d2_atac_fragments.tsv.gz".format(os.getcwd())

def str_norm(s):
    """Return the fraction `s` as an integer-percentage string (0.25 -> "25").

    The product is rounded before conversion because `s*100` is a float and
    can land just below the intended integer (e.g. 0.29*100 is
    28.999999999999996), which plain `int()` would truncate to the wrong label.
    """
    return str(int(round(s*100)))

# Run the simulation with the settings defined in this cell; str_norm is the
# function that turns each iter_list value into a string.
data_simulation(
    in_dir,
    adata_rna_sel,
    adata_atac_sel,
    iter_list,
    depth_multiome_list,
    depth_scrna_list,
    depth_snatac_list,
    n_multiome_list,
    n_scrna_list,
    n_snatac_list,
    repeats,
    fragment_path,
    cond_key,
    str_norm,
    downsample=True,
)



### Running method + evaluations

In [None]:
# working_dir 
folder_dir="/home/myylee/scmint/methods_eval/"
# bash script for job submission
job_submission_script = "submit_job_per_condition_n_eval2.sh"
# python script for metric evaluation 
eval_script = "run_metric_eval_fair.py"

ct_ref = "dataset/bmmc/bmmc_all_bc_ct3.csv"
nclust = 21

gp_eval_path = "eval_missing_modality_prediction_single.R"
gp_truth = "dataset/bmmc/bmmc_s1d2_pmat_all_ct_sig_links_50kb_unique.csv"

dir_path = "dataset/bmmc/bmmc_vdepth_intervals/"
cond_key = "depthmulti"

# Condition labels are integer percentages (10, 20, ..., 100). The simulation
# for this folder passes fractions through str_norm, which turns 0.1 into "10",
# so the evaluation must iterate over the percentage labels, as the other
# depth evaluations in this file do ([25,50,75,100]).
iter_list = list(range(10, 101, 10))
repeats = 5

# One tuple per submission:
#   (conda_envs, method_scripts, py_langs, file_paths, method_keys)
# Both groups run R scripts in the "seurat" environment and submit all jobs at once.
_method_groups = [
    # ===== Seurat v4 =====
    (["seurat"], ["run_seurat4_3.R"], [False],
     ["seurat4/seurat4_result.csv"], ["seurat4"]),
    # ===== Seurat v3 =====
    (["seurat"], ["run_seurat3.R"], [False],
     ["seurat3/seurat3_result.csv"], ["seurat3"]),
]

for conda_envs, method_scripts, py_langs, file_paths, method_keys in _method_groups:
    eval_test_all(folder_dir,job_submission_script,eval_script,conda_envs,
                  method_scripts,py_langs,file_paths,method_keys,
                  ct_ref,nclust,dir_path,cond_key,
                  iter_list,repeats,repeat_start=1,
                  wait=False, wait_time=2*60,batch=2,output_folder="results",
                  ncore=8,gp_script=gp_eval_path,gp_truth=gp_truth)



# Scenario 3

## Technical batch effect 

### Data simulation

In [None]:
def _subset_cells(adata, keep):
    """Build a float32 AnnData holding only the cells flagged in `keep`.

    `keep` is a boolean list over the rows of `adata`; var is reused unchanged.
    """
    counts = csr_matrix(deepcopy(adata.X[keep,:].todense()))
    return AnnData(counts, obs=adata.obs.iloc[keep,:], var=adata.var, dtype=np.float32)


# using pmat dataset, generated from bmmc_process R script 
source_dir = "dataset/bmmc/"
adata_rna = ad.read_h5ad(source_dir+"multiome_bmmc_site1_or_donor1_RNA.h5ad")
adata_atac = ad.read_h5ad(source_dir+"multiome_bmmc_site1_or_donor1_ATAC_pmat.h5ad")

# expose the 'features' column under the name 'feature' as well
adata_rna.var['feature'] = adata_rna.var['features']
adata_atac.var['feature'] = adata_atac.var['features']

# Paired (multiome) profiles: cells whose batch is 's1d1'
idx_s1d1 = list(adata_atac.obs['batch'] == 's1d1')
adata_rna_s1d1 = _subset_cells(adata_rna, idx_s1d1)
adata_atac_s1d1 = _subset_cells(adata_atac, idx_s1d1)

# snRNA / snATAC profiles: cells whose batch is 's2d1'
idx_s2d1 = list(adata_atac.obs['batch'] == 's2d1')
adata_rna_s2d1 = _subset_cells(adata_rna, idx_s2d1)
adata_atac_s2d1 = _subset_cells(adata_atac, idx_s2d1)

# Simulation settings: only the number of multiome cells varies across conditions.
in_dir = "dataset/bmmc/bmmc_technical_batch_test/"
cond_key = "nmulti"
repeats = 5
iter_list = [1000,3000,5000]
n_multiome_list = iter_list
depth_multiome_list = [1 for _ in iter_list]
depth_scrna_list = [1 for _ in iter_list]
depth_snatac_list = [1 for _ in iter_list]
n_scrna_list = [1000 for _ in iter_list]
n_snatac_list = [1000 for _ in iter_list]

# if do not create fragment file symlink, use fragment_path = None 
# NOTE(review): an earlier comment credited correct_s1d2_fragments_sel.ipynb for
# the fragment file, but the path below is the site1/donor1 file — confirm how
# it was produced.
fragment_path = "{}/dataset/bmmc/bmmc_site1_or_donor1_atac_fragments.tsv.gz".format(os.getcwd())
    
def to_str(s): return(str(s))

# assume there are ['rna.bc','batch'] in each adata.obs
# assume there is ['feature'] in each adata.var
data_simulation_batch(in_dir,adata_rna_s1d1,adata_atac_s1d1,adata_rna_s2d1,adata_atac_s2d1,
                      iter_list,depth_multiome_list,depth_scrna_list,
                      depth_snatac_list,n_multiome_list,
                      n_scrna_list,n_snatac_list,
                      repeats,fragment_path,cond_key,
                      to_str, downsample=False,same_unpair_origin=True)



### Running method + evaluations

In [None]:
# Run every integration method on the technical batch effect simulation and
# evaluate the results. Jobs are submitted through eval_test_all.

# working_dir
folder_dir = "/home/myylee/scmint/methods_eval/"
# bash script for job submission
job_submission_script = "submit_job_per_condition_n_eval2.sh"
# python script for metric evaluation
eval_script = "run_metric_eval_batch2.py"

ct_ref = "dataset/bmmc/bmmc_all_bc_ct3.csv"
nclust = 21

gp_eval_path = "false"
gp_truth = "false"

dir_path = "dataset/bmmc/bmmc_technical_batch_test/"
cond_key = "nmulti"

iter_list = [1000, 3000, 5000]
repeats = 5


def _submit_methods(wait):
    """Submit the method group currently described by the module-level lists.

    Reads conda_envs, method_scripts, py_langs, file_paths and method_keys
    (plus the shared settings above) at call time, so they must be assigned
    before each call. `wait` is forwarded to eval_test_all unchanged.
    """
    eval_test_all(folder_dir, job_submission_script, eval_script, conda_envs,
                  method_scripts, py_langs, file_paths, method_keys,
                  ct_ref, nclust, dir_path, cond_key,
                  iter_list, repeats, repeat_start=1,
                  wait=wait, wait_time=2*60, batch=2, output_folder="results",
                  ncore=8, gp_script=gp_eval_path, gp_truth=gp_truth)


# ===== multiome-guided (python functions) =====

conda_envs = ["multivi", "cobolt", "scmomat"]
# python script for running each method
method_scripts = ["run_multivi_batch.py", "run_cobolt.py", "run_scmomat_batch_bmmc_2.py"]
# whether each script should be run in a python environment
py_langs = [True]*3

file_paths = ["multivi/multivi_result.csv",
              "cobolt/cobolt_result.csv",
              "scmomat/scmomat_result.csv"]

method_keys = ["multivi", "cobolt", "scmomat"]

# submit 2 jobs at a time, wait, submit new jobs until the 2 were done
_submit_methods(wait=True)


# ===== multiome-guided (R functions) =====

conda_envs = ["seurat"]
# R script for running the method
method_scripts = ["run_seurat4_3.R"]
# whether each script should be run in a python environment
py_langs = [False]

file_paths = ["seurat4/seurat4_result.csv"]

method_keys = ["seurat4"]

# submit all jobs at once
_submit_methods(wait=False)


# ===== unpaired (multiome-split) =====

conda_envs = ["seurat", "bindsc", "figr", "liger"]
# R script for running each method
method_scripts = ["run_seurat3.R", "run_rbindsc.R", "run_rfigr_2.R", "run_rliger.R"]
# whether each script should be run in a python environment
py_langs = [False]*4

file_paths = ["seurat3/seurat3_result.csv",
              "rbindsc/rbindsc_result.csv",
              "rfigr/rfigr_result.csv",
              "rliger/rliger_result.csv"]

method_keys = ["seurat3", "rbindsc", "rfigr", "rliger"]

# submit all jobs at once
_submit_methods(wait=False)


# ===== additional python method =====

conda_envs = ["glue2"]
# python script for running the method
method_scripts = ["run_glue_hg38_batch.py"]
# whether each script should be run in a python environment
py_langs = [True]

file_paths = ["glue/glue_result.csv"]

method_keys = ["glue"]

_submit_methods(wait=True)


## Biological batch effect 

### Data simulation

In [None]:
# Biological batch effect: paired (multiome) cells are taken from batch 's1d1',
# unpaired scRNA-seq / snATAC-seq cells are taken from batch 's1d2'.
# Input is the pmat dataset generated from the bmmc_process R script.
source_dir = "dataset/bmmc/"

adata_atac = ad.read_h5ad(source_dir + "multiome_bmmc_site1_or_donor1_ATAC_pmat.h5ad")
adata_rna = ad.read_h5ad(source_dir + "multiome_bmmc_site1_or_donor1_RNA.h5ad")

# the simulation expects the feature names in var['feature']
adata_atac.var['feature'] = adata_atac.var['features']
adata_rna.var['feature'] = adata_rna.var['features']


def _subset_cells(adata, keep):
    """Build a new AnnData (dtype=np.float32) from the cells of `adata` flagged in `keep`.

    `keep` is a boolean list with one entry per cell. The selected rows of X are
    densified, deep-copied and stored again as CSR; obs is subset with the same
    mask and var is passed through unchanged.
    """
    counts = csr_matrix(deepcopy(adata.X[keep, :].todense()))
    return AnnData(counts,
                   obs=adata.obs.iloc[keep, :],
                   var=adata.var,
                   dtype=np.float32)


# both masks are computed on the ATAC obs and applied to RNA and ATAC alike
idx_s1d1 = list(adata_atac.obs['batch'] == 's1d1')
idx_s1d2 = list(adata_atac.obs['batch'] == 's1d2')

# Paired (multiome) - RNA and ATAC
adata_rna_s1d1 = _subset_cells(adata_rna, idx_s1d1)
adata_atac_s1d1 = _subset_cells(adata_atac, idx_s1d1)
# Unpaired - snRNA and snATAC
adata_rna_s1d2 = _subset_cells(adata_rna, idx_s1d2)
adata_atac_s1d2 = _subset_cells(adata_atac, idx_s1d2)

in_dir = "dataset/bmmc/bmmc_biological_batch_test/"
cond_key = "nmulti"
repeats = 5

# one simulated condition per number of multiome cells
iter_list = [1000, 3000, 5000]
n_multiome_list = iter_list
n_scrna_list = [1000]*len(iter_list)
n_snatac_list = [1000]*len(iter_list)
depth_multiome_list = [1]*len(iter_list)
depth_scrna_list = [1]*len(iter_list)
depth_snatac_list = [1]*len(iter_list)

# if do not create fragment file symlink, use fragment_path = None
# fragment file was created using correct_s1d2_fragments_sel.ipynb, where barcode was made the same as stored in adata
fragment_path = "{}/dataset/bmmc/bmmc_site1_or_donor1_atac_fragments.tsv.gz".format(os.getcwd())


def to_str(s):
    """Convert a condition value to its string form."""
    return str(s)


# assume there are ['rna.bc','batch'] in each adata.obs
# assume there is ['feature'] in each adata.var
# The unpaired inputs are the s1d2 subsets built in this cell; the s2d1
# objects belong to the technical batch cell and must not be used here.
data_simulation_batch(in_dir,
                      adata_rna_s1d1, adata_atac_s1d1,
                      adata_rna_s1d2, adata_atac_s1d2,
                      iter_list, depth_multiome_list, depth_scrna_list,
                      depth_snatac_list, n_multiome_list,
                      n_scrna_list, n_snatac_list,
                      repeats, fragment_path, cond_key,
                      to_str, downsample=False, same_unpair_origin=True)


### Running method + evaluations

In [None]:
# Run every integration method on the biological batch effect simulation and
# evaluate the results. Jobs are submitted through eval_test_all.

# working_dir
folder_dir = "/home/myylee/scmint/methods_eval/"
# bash script for job submission
job_submission_script = "submit_job_per_condition_n_eval2.sh"
# python script for metric evaluation
eval_script = "run_metric_eval_batch2.py"

ct_ref = "dataset/bmmc/bmmc_all_bc_ct3.csv"
nclust = 21

gp_eval_path = "false"
gp_truth = "false"

dir_path = "dataset/bmmc/bmmc_biological_batch_test/"
cond_key = "nmulti"

iter_list = [1000, 3000, 5000]
repeats = 5


def _submit_methods(wait):
    """Submit the method group currently described by the module-level lists.

    Reads conda_envs, method_scripts, py_langs, file_paths and method_keys
    (plus the shared settings above) at call time, so they must be assigned
    before each call. `wait` is forwarded to eval_test_all unchanged.
    """
    eval_test_all(folder_dir, job_submission_script, eval_script, conda_envs,
                  method_scripts, py_langs, file_paths, method_keys,
                  ct_ref, nclust, dir_path, cond_key,
                  iter_list, repeats, repeat_start=1,
                  wait=wait, wait_time=2*60, batch=2, output_folder="results",
                  ncore=8, gp_script=gp_eval_path, gp_truth=gp_truth)


# ===== multiome-guided (python functions) =====

conda_envs = ["multivi", "cobolt", "scmomat"]
# script used to run each method
method_scripts = ["run_multivi_batch.py", "run_cobolt.py", "run_scmomat_batch_bmmc_2.py"]
# whether each script should be run in a python environment
py_langs = [True]*3

file_paths = ["multivi/multivi_result.csv",
              "cobolt/cobolt_result.csv",
              "scmomat/scmomat_result.csv"]

method_keys = ["multivi", "cobolt", "scmomat"]

# submit 2 jobs at a time, wait, submit new jobs until the 2 were done
_submit_methods(wait=True)


# ===== multiome-guided (R functions) =====

conda_envs = ["seurat"]
# script used to run the method
method_scripts = ["run_seurat4_3.R"]
# whether each script should be run in a python environment
py_langs = [False]

file_paths = ["seurat4/seurat4_result.csv"]

method_keys = ["seurat4"]

# submit all jobs at once
_submit_methods(wait=False)


# ===== unpaired (multiome-split) =====

conda_envs = ["seurat", "bindsc", "figr", "liger"]
# script used to run each method
method_scripts = ["run_seurat3.R", "run_rbindsc.R", "run_rfigr_2.R", "run_rliger.R"]
# whether each script should be run in a python environment
py_langs = [False]*4

file_paths = ["seurat3/seurat3_result.csv",
              "rbindsc/rbindsc_result.csv",
              "rfigr/rfigr_result.csv",
              "rliger/rliger_result.csv"]

method_keys = ["seurat3", "rbindsc", "rfigr", "rliger"]

# submit all jobs at once
_submit_methods(wait=False)


# ===== additional python method =====

conda_envs = ["glue2"]
# script used to run the method
method_scripts = ["run_glue_hg38_batch.py"]
# whether each script should be run in a python environment
py_langs = [True]

file_paths = ["glue/glue_result.csv"]

method_keys = ["glue"]

_submit_methods(wait=True)



## Complex test #1

### Data simulation

In [None]:
# Complex test #1: paired (multiome) cells are pooled from batches 's1d1' and
# 's1d3', unpaired scRNA-seq / snATAC-seq cells are taken from batch 's1d2'.
# Input is the pmat dataset generated from the bmmc_process R script.
source_dir = "dataset/bmmc/"

adata_atac = ad.read_h5ad(source_dir + "multiome_bmmc_site1_or_donor1_ATAC_pmat.h5ad")
adata_rna = ad.read_h5ad(source_dir + "multiome_bmmc_site1_or_donor1_RNA.h5ad")

# the simulation expects the feature names in var['feature']
adata_atac.var['feature'] = adata_atac.var['features']
adata_rna.var['feature'] = adata_rna.var['features']


def _subset_cells(adata, keep):
    """Build a new AnnData (dtype=np.float32) from the cells of `adata` flagged in `keep`.

    `keep` is a boolean list with one entry per cell. The selected rows of X are
    densified, deep-copied and stored again as CSR; obs is subset with the same
    mask and var is passed through unchanged.
    """
    counts = csr_matrix(deepcopy(adata.X[keep, :].todense()))
    return AnnData(counts,
                   obs=adata.obs.iloc[keep, :],
                   var=adata.var,
                   dtype=np.float32)


# both masks are computed on the ATAC obs and applied to RNA and ATAC alike
idx_paired = list(adata_atac.obs['batch'].isin(['s1d1', 's1d3']))
idx_s1d2 = list(adata_atac.obs['batch'] == 's1d2')

# Paired (multiome) - RNA and ATAC
adata_rna_paired = _subset_cells(adata_rna, idx_paired)
adata_atac_paired = _subset_cells(adata_atac, idx_paired)
# Unpaired - snRNA and snATAC
adata_rna_s1d2 = _subset_cells(adata_rna, idx_s1d2)
adata_atac_s1d2 = _subset_cells(adata_atac, idx_s1d2)

in_dir = "dataset/bmmc/bmmc_complex1_test/"
cond_key = "nmulti"
repeats = 5

# one simulated condition per number of multiome cells
iter_list = [1000, 3000, 5000]
n_multiome_list = iter_list
n_scrna_list = [1000]*len(iter_list)
n_snatac_list = [1000]*len(iter_list)
depth_multiome_list = [1]*len(iter_list)
depth_scrna_list = [1]*len(iter_list)
depth_snatac_list = [1]*len(iter_list)

# if do not create fragment file symlink, use fragment_path = None
# fragment file was created using correct_s1d2_fragments_sel.ipynb, where barcode was made the same as stored in adata
fragment_path = "{}/dataset/bmmc/bmmc_site1_or_donor1_atac_fragments.tsv.gz".format(os.getcwd())


def to_str(s):
    """Convert a condition value to its string form."""
    return str(s)


# assume there are ['rna.bc','batch'] in each adata.obs
# assume there is ['feature'] in each adata.var
# Pass the subsets built in this cell: the pooled paired data and the s1d2
# unpaired data (not the s1d1 / s2d1 objects of the earlier batch cells).
data_simulation_batch(in_dir,
                      adata_rna_paired, adata_atac_paired,
                      adata_rna_s1d2, adata_atac_s1d2,
                      iter_list, depth_multiome_list, depth_scrna_list,
                      depth_snatac_list, n_multiome_list,
                      n_scrna_list, n_snatac_list,
                      repeats, fragment_path, cond_key,
                      to_str, downsample=False, same_unpair_origin=True)


### Running method + evaluations

In [None]:
# Run every integration method on the complex test #1 simulation and
# evaluate the results. Jobs are submitted through eval_test_all.

# working_dir
folder_dir = "/home/myylee/scmint/methods_eval/"
# bash script for job submission
job_submission_script = "submit_job_per_condition_n_eval2.sh"
# python script for metric evaluation
eval_script = "run_metric_eval_batch2.py"

ct_ref = "dataset/bmmc/bmmc_all_bc_ct3.csv"
nclust = 21

gp_eval_path = "false"
gp_truth = "false"

dir_path = "dataset/bmmc/bmmc_complex1_test/"
cond_key = "nmulti"

iter_list = [1000, 3000, 5000]
repeats = 5


def _submit_methods(wait):
    """Submit the method group currently described by the module-level lists.

    Reads conda_envs, method_scripts, py_langs, file_paths and method_keys
    (plus the shared settings above) at call time, so they must be assigned
    before each call. `wait` is forwarded to eval_test_all unchanged.
    """
    eval_test_all(folder_dir, job_submission_script, eval_script, conda_envs,
                  method_scripts, py_langs, file_paths, method_keys,
                  ct_ref, nclust, dir_path, cond_key,
                  iter_list, repeats, repeat_start=1,
                  wait=wait, wait_time=2*60, batch=2, output_folder="results",
                  ncore=8, gp_script=gp_eval_path, gp_truth=gp_truth)


# ===== multiome-guided (python functions) =====

conda_envs = ["multivi", "cobolt", "scmomat"]
# python script for running each method
method_scripts = ["run_multivi_batch.py", "run_cobolt.py", "run_scmomat_batch_bmmc_2.py"]
# whether each script should be run in a python environment
py_langs = [True]*3

file_paths = ["multivi/multivi_result.csv",
              "cobolt/cobolt_result.csv",
              "scmomat/scmomat_result.csv"]

method_keys = ["multivi", "cobolt", "scmomat"]

# submit 2 jobs at a time, wait, submit new jobs until the 2 were done
_submit_methods(wait=True)


# ===== multiome-guided (R functions) =====

conda_envs = ["seurat", "seurat"]
# R script for running each method
method_scripts = ["run_seurat4_3.R", "run_seurat4_4.R"]
# whether each script should be run in a python environment
py_langs = [False]*2

file_paths = ["seurat4/seurat4_result.csv", "seurat4int/seurat4int_result.csv"]

# one key per script, in the same order as method_scripts / file_paths
method_keys = ["seurat4", "seurat4int"]

# submit all jobs at once
_submit_methods(wait=False)


# ===== unpaired (multiome-split) =====

conda_envs = ["seurat", "bindsc", "figr", "liger"]
# R script for running each method
method_scripts = ["run_seurat3.R", "run_rbindsc.R", "run_rfigr_2.R", "run_rliger.R"]
# whether each script should be run in a python environment
py_langs = [False]*4

file_paths = ["seurat3/seurat3_result.csv",
              "rbindsc/rbindsc_result.csv",
              "rfigr/rfigr_result.csv",
              "rliger/rliger_result.csv"]

method_keys = ["seurat3", "rbindsc", "rfigr", "rliger"]

# submit all jobs at once
_submit_methods(wait=False)


# ===== additional python method =====

conda_envs = ["glue2"]
# python script for running the method
method_scripts = ["run_glue_hg38_batch.py"]
# whether each script should be run in a python environment
py_langs = [True]

file_paths = ["glue/glue_result.csv"]

method_keys = ["glue"]

_submit_methods(wait=True)


## Complex test #2

### Data simulation

In [None]:
# Complex test #2: paired (multiome) cells are pooled from batches 's4d1' and
# 's1d3', unpaired scRNA-seq / snATAC-seq cells are taken from batch 's2d1'.
# Input is the pmat dataset generated from the bmmc_process R script.
source_dir = "dataset/bmmc/"

adata_atac = ad.read_h5ad(source_dir + "multiome_bmmc_site1_or_donor1_ATAC_pmat.h5ad")
adata_rna = ad.read_h5ad(source_dir + "multiome_bmmc_site1_or_donor1_RNA.h5ad")

# the simulation expects the feature names in var['feature']
adata_atac.var['feature'] = adata_atac.var['features']
adata_rna.var['feature'] = adata_rna.var['features']


def _subset_cells(adata, keep):
    """Build a new AnnData (dtype=np.float32) from the cells of `adata` flagged in `keep`.

    `keep` is a boolean list with one entry per cell. The selected rows of X are
    densified, deep-copied and stored again as CSR; obs is subset with the same
    mask and var is passed through unchanged.
    """
    counts = csr_matrix(deepcopy(adata.X[keep, :].todense()))
    return AnnData(counts,
                   obs=adata.obs.iloc[keep, :],
                   var=adata.var,
                   dtype=np.float32)


# both masks are computed on the ATAC obs and applied to RNA and ATAC alike
idx_paired = list(adata_atac.obs['batch'].isin(['s4d1', 's1d3']))
idx_s2d1 = list(adata_atac.obs['batch'] == 's2d1')

# Paired (multiome) - RNA and ATAC
adata_rna_paired = _subset_cells(adata_rna, idx_paired)
adata_atac_paired = _subset_cells(adata_atac, idx_paired)
# Unpaired - snRNA and snATAC
adata_rna_s2d1 = _subset_cells(adata_rna, idx_s2d1)
adata_atac_s2d1 = _subset_cells(adata_atac, idx_s2d1)

in_dir = "dataset/bmmc/bmmc_complex2_test/"
cond_key = "nmulti"
repeats = 5

# a single simulated condition with 10000 multiome cells
iter_list = [10000]
n_multiome_list = iter_list
n_scrna_list = [2000]*len(iter_list)
n_snatac_list = [2000]*len(iter_list)
depth_multiome_list = [1]*len(iter_list)
depth_scrna_list = [1]*len(iter_list)
depth_snatac_list = [1]*len(iter_list)

# if do not create fragment file symlink, use fragment_path = None
# fragment file was created using correct_s1d2_fragments_sel.ipynb, where barcode was made the same as stored in adata
fragment_path = "{}/dataset/bmmc/bmmc_site1_or_donor1_atac_fragments.tsv.gz".format(os.getcwd())


def to_str(s):
    """Convert a condition value to its string form."""
    return str(s)


# assume there are ['rna.bc','batch'] in each adata.obs
# assume there is ['feature'] in each adata.var
# Pass the pooled paired subsets built in this cell (not the s1d1 objects of
# the earlier batch cells) together with the s2d1 unpaired subsets.
data_simulation_batch(in_dir,
                      adata_rna_paired, adata_atac_paired,
                      adata_rna_s2d1, adata_atac_s2d1,
                      iter_list, depth_multiome_list, depth_scrna_list,
                      depth_snatac_list, n_multiome_list,
                      n_scrna_list, n_snatac_list,
                      repeats, fragment_path, cond_key,
                      to_str, downsample=False, same_unpair_origin=True)


### Running method + evaluations

In [None]:
# Run every integration method on the complex test #2 simulation and
# evaluate the results. Jobs are submitted through eval_test_all.

# working_dir
folder_dir = "/home/myylee/scmint/methods_eval/"
# bash script for job submission
job_submission_script = "submit_job_per_condition_n_eval2.sh"
# python script for metric evaluation
eval_script = "run_metric_eval_batch2.py"

ct_ref = "dataset/bmmc/bmmc_all_bc_ct3.csv"
nclust = 21

gp_eval_path = "false"
gp_truth = "false"

dir_path = "dataset/bmmc/bmmc_complex2_test/"
cond_key = "nmulti"

iter_list = [10000]
repeats = 5


def _submit_methods(wait):
    """Submit the method group currently described by the module-level lists.

    Reads conda_envs, method_scripts, py_langs, file_paths and method_keys
    (plus the shared settings above) at call time, so they must be assigned
    before each call. `wait` is forwarded to eval_test_all unchanged.
    """
    eval_test_all(folder_dir, job_submission_script, eval_script, conda_envs,
                  method_scripts, py_langs, file_paths, method_keys,
                  ct_ref, nclust, dir_path, cond_key,
                  iter_list, repeats, repeat_start=1,
                  wait=wait, wait_time=2*60, batch=2, output_folder="results",
                  ncore=8, gp_script=gp_eval_path, gp_truth=gp_truth)


# ===== multiome-guided (python functions) =====

conda_envs = ["multivi", "cobolt", "scmomat"]
# python script for running each method
method_scripts = ["run_multivi_batch.py", "run_cobolt.py", "run_scmomat_batch_bmmc_2.py"]
# whether each script should be run in a python environment
py_langs = [True]*3

file_paths = ["multivi/multivi_result.csv",
              "cobolt/cobolt_result.csv",
              "scmomat/scmomat_result.csv"]

method_keys = ["multivi", "cobolt", "scmomat"]

# submit 2 jobs at a time, wait, submit new jobs until the 2 were done
_submit_methods(wait=True)


# ===== multiome-guided (R functions) =====

conda_envs = ["seurat", "seurat"]
# R script for running each method
method_scripts = ["run_seurat4_3.R", "run_seurat4_4.R"]
# whether each script should be run in a python environment
py_langs = [False]*2

file_paths = ["seurat4/seurat4_result.csv", "seurat4int/seurat4int_result.csv"]

# one key per script, in the same order as method_scripts / file_paths
method_keys = ["seurat4", "seurat4int"]

# submit all jobs at once
_submit_methods(wait=False)


# ===== unpaired (multiome-split) =====

conda_envs = ["seurat", "bindsc", "figr", "liger"]
# R script for running each method
method_scripts = ["run_seurat3.R", "run_rbindsc.R", "run_rfigr_2.R", "run_rliger.R"]
# whether each script should be run in a python environment
py_langs = [False]*4

file_paths = ["seurat3/seurat3_result.csv",
              "rbindsc/rbindsc_result.csv",
              "rfigr/rfigr_result.csv",
              "rliger/rliger_result.csv"]

method_keys = ["seurat3", "rbindsc", "rfigr", "rliger"]

# submit all jobs at once
_submit_methods(wait=False)


# ===== additional python method =====

conda_envs = ["glue2"]
# python script for running the method
method_scripts = ["run_glue_hg38_batch.py"]
# whether each script should be run in a python environment
py_langs = [True]

file_paths = ["glue/glue_result.csv"]

method_keys = ["glue"]

_submit_methods(wait=True)



# Scenario 4

## PBMC

### Data simulation

In [None]:
# Scenario 4 (PBMC): simulate datasets in which the 'NK' population is removed
# from one single-modality dataset, one condition key per removal pattern.
from data_simulation import simulate_missing_fixed

source_dir = "dataset/multiome_pbmc_10k/"

adata_rna_sel = ad.read_h5ad(source_dir + "pbmc_10x_rna_public.h5ad")
adata_atac_sel = ad.read_h5ad(source_dir + "pbmc_10x_atac_public.h5ad")

# cell type counts; the value is not stored, it is only useful when inspected interactively
adata_rna_sel.obs['ct3'].value_counts()

in_dir = "dataset/pbmc/single_modality_fixed_missing_ct/"

# the obs column storing cell type annotation
ct_col = 'ct3'

# one entry per condition key: cell types removed from each dataset
cond_keys = ["noMiss_nmulti", "rnaMissNK_nmulti", "atacMissNK_nmulti"]
cts_remove_multiome_list = [[None], [None], [None]]
cts_remove_scrna_list = [[None], ['NK'], [None]]
cts_remove_snatac_list = [[None], [None], ['NK']]

percent_single_mod_ct = {'NK': 0.046}

# one entry per number of multiome cells
iter_list = [1000, 3000, 6000]
n_sizes = len(iter_list)
n_multiome_list = iter_list
n_scrna_list = [1000] * n_sizes
n_snatac_list = [1000] * n_sizes
depth_multiome_list = [1] * n_sizes
depth_scrna_list = [1] * n_sizes
depth_snatac_list = [1] * n_sizes

repeats = 5

# if do not create fragment file symlink, use fragment_path = None
fragment_path = "dataset/multiome_pbmc_10k/pbmc_granulocyte_sorted_10k_atac_fragments.tsv.gz"


def to_str(s):
    """Convert a condition value to its string form."""
    return str(s)


# each condition key has its own cell type structure; every call simulates all
# multiome cell numbers in iter_list for that structure
for key, rm_multiome, rm_scrna, rm_snatac in zip(cond_keys,
                                                 cts_remove_multiome_list,
                                                 cts_remove_scrna_list,
                                                 cts_remove_snatac_list):
    simulate_missing_fixed(
        in_dir, adata_rna_sel, adata_atac_sel, iter_list,
        depth_multiome_list, depth_scrna_list, depth_snatac_list,
        n_multiome_list, n_scrna_list, n_snatac_list,
        rm_multiome, rm_scrna, rm_snatac,
        percent_single_mod_ct, repeats, fragment_path,
        key,
        to_str, downsample=False, ct_col=ct_col,
    )



### Running method + evaluations

In [None]:
# Shared settings for running and evaluating methods on the PBMC
# missing cell type simulation.
folder_dir = "/home/myylee/scmint/methods_eval/"

# bash script for job submission
job_submission_script = "submit_job_per_condition_n_missing_ct_eval.sh"

# python script for metric evaluation
# this one evaluation script works for single-modality focused analysis and all three datasets analysis
# Because F1 score is calculated per cell type per modality
eval_script = "run_metric_eval_fair_missing_ct_perMod.py"

dir_path = "dataset/pbmc/single_modality_fixed_missing_ct/"

# write the rare_ct_list to folder, so the evaluation part knows which cell
# type(s) to calculate the F1 score for
rare_ct_list = ['NK']
rare_ct_path = os.path.join(dir_path, "rare_ct_list.csv")
rare_ct_table = pd.DataFrame(rare_ct_list)
rare_ct_table.to_csv(rare_ct_path)

ct_ref = "dataset/multiome_pbmc_10k/pbmc_10x_bc_ct3.csv"
nclust = 7

gp_eval_path = "false"
gp_truth = "false"

cond_keys = ["noMiss_nmulti", "rnaMissNK_nmulti", "atacMissNK_nmulti"]

iter_list = [1000, 3000, 6000]
repeats = 5


In [None]:
# Run every method on each PBMC missing cell type condition. The shared
# settings (folder_dir, eval_script, cond_keys, rare_ct_path, ...) come from
# the setup cell.


def _submit_condition(cond_key, wait, batch, output_folder):
    """Submit the current method group for one condition key.

    Reads conda_envs, method_scripts, py_langs, file_paths and method_keys
    (plus the shared settings) at call time; `wait`, `batch` and
    `output_folder` are forwarded to eval_test_all unchanged.
    """
    eval_test_all(folder_dir, job_submission_script, eval_script, conda_envs,
                  method_scripts, py_langs, file_paths, method_keys,
                  ct_ref, nclust, dir_path, cond_key,
                  iter_list, repeats, repeat_start=1,
                  wait=wait, wait_time=1*30,
                  batch=batch, output_folder=output_folder,
                  ncore=8, gp_script=gp_eval_path, gp_truth=gp_truth,
                  rare_ct_path=rare_ct_path)


### Unpaired + seurat 4

# all R functions
conda_envs = ["seurat", "bindsc", "seurat", "figr", "liger"]
# script used to run each method
method_scripts = ["run_seurat3_single_noPred.R", "run_rbindsc_single_noPred.R",
                  "run_seurat4_3_noPred.R", "run_rfigr_single.R", "run_rliger_single.R"]
# whether each script should be run in a python environment
py_langs = [False]*5

file_paths = ["seurat3/seurat3_result.csv",
              "rbindsc/rbindsc_result.csv",
              "seurat4/seurat4_result.csv",
              "rfigr/rfigr_result.csv",
              "rliger/rliger_result.csv"]

method_keys = ["seurat3", "rbindsc", "seurat4", "rfigr", "rliger"]

for cond_key in cond_keys:
    _submit_condition(cond_key, wait=False, batch=1,
                      output_folder="results_single_mod")


### Unpaired (multiome-split)

conda_envs = ["seurat", "bindsc", "figr", "liger"]
# script used to run each method
method_scripts = ["run_seurat3_noPred.R", "run_rbindsc_noPred.R", "run_rfigr_2.R", "run_rliger.R"]
# whether each script should be run in a python environment
py_langs = [False]*4

file_paths = ["seurat3/seurat3_result.csv",
              "rbindsc/rbindsc_result.csv",
              "rfigr/rfigr_result.csv",
              "rliger/rliger_result.csv"]

method_keys = ["seurat3", "rbindsc", "rfigr", "rliger"]

for cond_key in cond_keys:
    _submit_condition(cond_key, wait=False, batch=1,
                      output_folder="results_single_same_cell_number")


### python functions
conda_envs = ["scmomat", "cobolt", "glue2", "multivi"]
# script used to run each method
method_scripts = ["run_scmomat_pbmc.py",
                  "run_cobolt.py",
                  "run_glue_hg38_single_noPred.py",
                  "run_multivi_2_noPred.py"]
# whether each script should be run in a python environment
py_langs = [True]*4

file_paths = ["scmomat/scmomat_result.csv",
              "cobolt/cobolt_result.csv",
              "glue/glue_result.csv",
              "multivi/multivi_result.csv"]

method_keys = ["scmomat", "cobolt", "glue", "multivi"]

for cond_key in cond_keys:
    _submit_condition(cond_key, wait=True, batch=4,
                      output_folder="results_single_mod")


### GLUE on the multiome-split data
conda_envs = ["glue2"]
# script used to run the method
method_scripts = ["run_glue_hg38_noPred.py"]
# whether each script should be run in a python environment
py_langs = [True]

file_paths = ["glue/glue_result.csv"]

method_keys = ["glue"]

for cond_key in cond_keys:
    _submit_condition(cond_key, wait=True, batch=2,
                      output_folder="results_single_same_cell_number")
    

## SHARE-seq 

### Data simulation

In [None]:
# load 
source_dir = "dataset/mouse_skin/"
adata_rna = ad.read_h5ad(source_dir+"mouse_skin_shareseq_rna_10k.h5ad")
adata_atac = ad.read_h5ad(source_dir+"mouse_skin_shareseq_atac_10k.h5ad")


in_dir = "dataset/mouse_skin/single_modality_fixed_missing_ct/"

iter_list = [1000,3000,6000]
cts_remove_multiome_list = [[None],[None],[None],[None],[None],[None],[None],[None],[None]]
cts_remove_scrna_list = [[None],['HS'],[None],['Endo'],[None],['HS','Endo'],[None],['HS'],['Endo']]
cts_remove_snatac_list = [[None],[None],['HS'],[None],['Endo'],[None],['HS','Endo'],['Endo'],['HS']]

percent_single_mod_ct = {'HS': 0.1, 'Endo':0.05}

depth_multiome_list = [1]*len(iter_list)
depth_scrna_list = [1]*len(iter_list)
depth_snatac_list = [1]*len(iter_list)

n_scrna_list = [1000]*len(iter_list)
n_snatac_list = [1000]*len(iter_list)
n_multiome_list = iter_list

repeats = 5


# if do not create fragment file symlink, use fragment_path = None 
fragment_path = "dataset/mouse_skin/mouse_skin_shareseq_fragments.tsv.gz"

cond_keys = ["noMiss_nmulti","rnaMissHS_nmulti","atacMissHS_nmulti","rnaMissEndo_nmulti","atacMissEndo_nmulti",
            "rnaMissTwo_nmulti","atacMissTwo_nmulti","eachMissOne_nmulti","eachMissOneAlt_nmulti"]

# the obs column storing cell type annotation 
ct_col = 'ct3'

def to_str(s): return(str(s))

from data_simulation import simulate_missing_fixed

# One simulation call per condition key. Position idx of every cts_remove_* list
# holds the cell-type structure of cond_keys[idx].
for idx, sim_cond_key in enumerate(cond_keys):
    simulate_missing_fixed(
        in_dir, adata_rna, adata_atac, iter_list,
        depth_multiome_list, depth_scrna_list, depth_snatac_list,
        n_multiome_list, n_scrna_list, n_snatac_list,
        cts_remove_multiome_list[idx],
        cts_remove_scrna_list[idx],
        cts_remove_snatac_list[idx],
        percent_single_mod_ct, repeats, fragment_path,
        sim_cond_key,
        to_str, downsample=False, ct_col=ct_col,
    )



### Running method + evaluations

In [None]:
# Working directory.
folder_dir = "/home/myylee/scmint/methods_eval/"

# Bash script for job submission.
job_submission_script = "submit_job_per_condition_n_missing_ct_eval.sh"

# Python script for metric evaluation. The same script serves the single-modality
# focused analysis and the all-three-datasets analysis, because the F1 score is
# calculated per cell type per modality.
eval_script = "run_metric_eval_fair_missing_ct_perMod.py"

# Data directory of this challenge, reference annotation and cluster number.
dir_path = "dataset/mouse_skin/single_modality_fixed_missing_ct/"
ct_ref = "dataset/mouse_skin/mouse_skin_shareseq_bc_ct3_10k.csv"
nclust = 12

# Write the rare cell types into the data directory so the evaluation step knows
# for which cell types the F1 score has to be calculated.
rare_ct_list = ['HS', 'Endo']
rare_ct_path = os.path.join(dir_path, "rare_ct_list.csv")
rare_ct_table = pd.DataFrame(rare_ct_list)
rare_ct_table.to_csv(rare_ct_path)

# Both gp_* settings are the string "false" here.
# NOTE(review): presumably this switches the gene-peak evaluation off - confirm in eval_test_all.
gp_eval_path = "false"
gp_truth = "false"

# Same condition keys, iteration values and repeat count as in the data simulation.
cond_keys = [
    "noMiss_nmulti",
    "rnaMissHS_nmulti",
    "atacMissHS_nmulti",
    "rnaMissEndo_nmulti",
    "atacMissEndo_nmulti",
    "rnaMissTwo_nmulti",
    "atacMissTwo_nmulti",
    "eachMissOne_nmulti",
    "eachMissOneAlt_nmulti",
]
iter_list = [1000, 3000, 6000]
repeats = 5

In [None]:
### Unpaired + seurat 4

# All methods in this group are R scripts.
# Each row: (method key, conda environment, script running the method).
method_table = [
    ("seurat3", "seurat", "run_seurat3_single_noPred.R"),
    ("rbindsc", "bindsc", "run_rbindsc_single_noPred.R"),
    ("seurat4", "seurat", "run_seurat4_3_noPred.R"),
    ("rfigr", "figr", "run_rfigr_single.R"),
    ("rliger", "liger", "run_rliger_single.R"),
]
method_keys = [key for key, _env, _script in method_table]
conda_envs = [env for _key, env, _script in method_table]
method_scripts = [script for _key, _env, script in method_table]

# Result CSV of each method, laid out as "<method key>/<method key>_result.csv".
file_paths = [f"{key}/{key}_result.csv" for key in method_keys]

# None of these scripts should be run in a Python environment.
py_langs = [False] * len(method_keys)

# Call eval_test_all once per condition key with the method settings defined above.
for cond_key in cond_keys:
    eval_test_all(
        folder_dir, job_submission_script, eval_script, conda_envs,
        method_scripts, py_langs, file_paths, method_keys,
        ct_ref, nclust, dir_path, cond_key,
        iter_list, repeats,
        repeat_start=1,
        wait=False, wait_time=1 * 30,
        batch=1, output_folder="results_single_mod",
        ncore=8, gp_script=gp_eval_path, gp_truth=gp_truth,
        rare_ct_path=rare_ct_path,
    )


### Unpaired (multiome-split)

# Each row: (method key, conda environment, R script running the method).
method_table = [
    ("seurat3", "seurat", "run_seurat3_noPred.R"),
    ("rbindsc", "bindsc", "run_rbindsc_noPred.R"),
    ("rfigr", "figr", "run_rfigr_2.R"),
    ("rliger", "liger", "run_rliger.R"),
]
method_keys = [key for key, _env, _script in method_table]
conda_envs = [env for _key, env, _script in method_table]
method_scripts = [script for _key, _env, script in method_table]

# Result CSV of each method, laid out as "<method key>/<method key>_result.csv".
file_paths = [f"{key}/{key}_result.csv" for key in method_keys]

# None of these scripts should be run in a Python environment.
py_langs = [False] * len(method_keys)

# Call eval_test_all once per condition key with the method settings defined above.
for cond_key in cond_keys:
    eval_test_all(
        folder_dir, job_submission_script, eval_script, conda_envs,
        method_scripts, py_langs, file_paths, method_keys,
        ct_ref, nclust, dir_path, cond_key,
        iter_list, repeats,
        repeat_start=1,
        wait=False, wait_time=1 * 30,
        batch=1, output_folder="results_single_same_cell_number",
        ncore=8, gp_script=gp_eval_path, gp_truth=gp_truth,
        rare_ct_path=rare_ct_path,
    )


### python functions

# Each row: (method key, conda environment, Python script running the method).
method_table = [
    ("scmomat", "scmomat", "run_scmomat_mouse_skin.py"),
    ("cobolt", "cobolt", "run_cobolt.py"),
    ("glue", "glue2", "run_glue_mm10_single_noPred.py"),
    ("multivi", "multivi", "run_multivi_2_noPred.py"),
]
method_keys = [key for key, _env, _script in method_table]
conda_envs = [env for _key, env, _script in method_table]
method_scripts = [script for _key, _env, script in method_table]

# Result CSV of each method, laid out as "<method key>/<method key>_result.csv".
file_paths = [f"{key}/{key}_result.csv" for key in method_keys]

# Every script in this group should be run in a Python environment.
py_langs = [True] * len(method_keys)

# Call eval_test_all once per condition key with the method settings defined above.
for cond_key in cond_keys:
    eval_test_all(
        folder_dir, job_submission_script, eval_script, conda_envs,
        method_scripts, py_langs, file_paths, method_keys,
        ct_ref, nclust, dir_path, cond_key,
        iter_list, repeats,
        repeat_start=1,
        wait=True, wait_time=1 * 30,
        batch=4, output_folder="results_single_mod",
        ncore=8, gp_script=gp_eval_path, gp_truth=gp_truth,
        rare_ct_path=rare_ct_path,
    )


# Each row: (method key, conda environment, Python script running the method).
method_table = [
    ("glue", "glue2", "run_glue_mm10_noPred.py"),
]
method_keys = [key for key, _env, _script in method_table]
conda_envs = [env for _key, env, _script in method_table]
method_scripts = [script for _key, _env, script in method_table]

# Result CSV of each method, laid out as "<method key>/<method key>_result.csv".
file_paths = [f"{key}/{key}_result.csv" for key in method_keys]

# The script should be run in a Python environment.
py_langs = [True] * len(method_keys)


# Call eval_test_all once per condition key with the method settings defined above.
for cond_key in cond_keys:
    eval_test_all(
        folder_dir, job_submission_script, eval_script, conda_envs,
        method_scripts, py_langs, file_paths, method_keys,
        ct_ref, nclust, dir_path, cond_key,
        iter_list, repeats,
        repeat_start=1,
        wait=True, wait_time=1 * 30,
        batch=2, output_folder="results_single_same_cell_number",
        ncore=8, gp_script=gp_eval_path, gp_truth=gp_truth,
        rare_ct_path=rare_ct_path,
    )
    

# Scenario 5

## PBMC

### Data simulation

In [None]:
# Read the RNA and ATAC h5ad files of the public 10x PBMC dataset.
source_dir = "dataset/multiome_pbmc_10k/"

adata_rna_sel = ad.read_h5ad(source_dir + "pbmc_10x_rna_public.h5ad")
adata_atac_sel = ad.read_h5ad(source_dir + "pbmc_10x_atac_public.h5ad")

# Number of cells per 'ct3' label. Printed explicitly: a bare expression in the
# middle of a notebook cell is evaluated but never displayed.
print(adata_rna_sel.obs['ct3'].value_counts())

# Directory handed to simulate_missing_fixed as in_dir.
in_dir = "dataset/pbmc/multiome_fixed_missing_ct/"

# Set fragment_path = None to skip creating the fragment-file symlink.
fragment_path = "dataset/multiome_pbmc_10k/pbmc_granulocyte_sorted_10k_atac_fragments.tsv.gz"

# The obs column storing the cell type annotation.
ct_col = 'ct3'

repeats = 5

# Values iterated over within each condition; n_multiome_list is this very list object.
iter_list = [3000]
n_multiome_list = iter_list
n_scrna_list = [3000] * len(iter_list)
n_snatac_list = [3000] * len(iter_list)

# One depth value per entry of iter_list, all set to 1.
depth_multiome_list = [1] * len(iter_list)
depth_scrna_list = [1] * len(iter_list)
depth_snatac_list = [1] * len(iter_list)

# Per-cell-type values passed to simulate_missing_fixed as percent_single_mod_ct.
percent_single_mod_ct = {'NK': 0.046}

# One row per condition:
# (cond_key, cell types removed from multiome, from scRNA, from snATAC).
# NOTE(review): [None] presumably means "remove nothing" - confirm in simulate_missing_fixed.
missing_ct_design = [
    ("noMiss_nmulti",      [None], [None], [None]),
    ("multiMissNK_nmulti", ['NK'], [None], [None]),
    ("rnaOnlyNK_nmulti",   ['NK'], [None], ['NK']),
    ("atacOnlyNK_nmulti",  ['NK'], ['NK'], [None]),
]
cond_keys = [row[0] for row in missing_ct_design]
cts_remove_multiome_list = [row[1] for row in missing_ct_design]
cts_remove_scrna_list = [row[2] for row in missing_ct_design]
cts_remove_snatac_list = [row[3] for row in missing_ct_design]


def to_str(s):
    """Return the string form of s."""
    return str(s)

from data_simulation import simulate_missing_fixed

# One simulation call per condition key. Position idx of every cts_remove_* list
# holds the cell-type structure of cond_keys[idx].
for idx, sim_cond_key in enumerate(cond_keys):
    simulate_missing_fixed(
        in_dir, adata_rna_sel, adata_atac_sel, iter_list,
        depth_multiome_list, depth_scrna_list, depth_snatac_list,
        n_multiome_list, n_scrna_list, n_snatac_list,
        cts_remove_multiome_list[idx],
        cts_remove_scrna_list[idx],
        cts_remove_snatac_list[idx],
        percent_single_mod_ct, repeats, fragment_path,
        sim_cond_key,
        to_str, downsample=False, ct_col=ct_col,
    )



### Running method + evaluations

In [None]:
# Working directory.
folder_dir = "/home/myylee/scmint/methods_eval/"

# Bash script for job submission.
job_submission_script = "submit_job_per_condition_n_missing_ct_eval.sh"

# Python script for metric evaluation. The same script serves the single-modality
# focused analysis and the all-three-datasets analysis, because the F1 score is
# calculated per cell type per modality.
eval_script = "run_metric_eval_fair_missing_ct_perMod.py"

# Data directory of this challenge, reference annotation and cluster number.
dir_path = "dataset/pbmc/multiome_fixed_missing_ct/"
ct_ref = "dataset/multiome_pbmc_10k/pbmc_10x_bc_ct3.csv"
nclust = 7

# Write the rare cell type list (a single cell type here) into the data directory so
# the evaluation step knows for which cell type(s) the F1 score has to be calculated.
rare_ct_list = ['NK']
rare_ct_path = os.path.join(dir_path, "rare_ct_list.csv")
rare_ct_table = pd.DataFrame(rare_ct_list)
rare_ct_table.to_csv(rare_ct_path)

# Both gp_* settings are the string "false" here.
# NOTE(review): presumably this switches the gene-peak evaluation off - confirm in eval_test_all.
gp_eval_path = "false"
gp_truth = "false"

# Same condition keys, iteration values and repeat count as in the data simulation.
cond_keys = [
    "noMiss_nmulti",
    "multiMissNK_nmulti",
    "rnaOnlyNK_nmulti",
    "atacOnlyNK_nmulti",
]
iter_list = [3000]
repeats = 5

In [None]:
### Unpaired + seurat 4

# All methods in this group are R scripts.
# Each row: (method key, conda environment, script running the method).
method_table = [
    ("seurat3", "seurat", "run_seurat3_single_noPred.R"),
    ("rbindsc", "bindsc", "run_rbindsc_single_noPred.R"),
    ("seurat4", "seurat", "run_seurat4_3_noPred.R"),
    ("rfigr", "figr", "run_rfigr_single.R"),
    ("rliger", "liger", "run_rliger_single.R"),
]
method_keys = [key for key, _env, _script in method_table]
conda_envs = [env for _key, env, _script in method_table]
method_scripts = [script for _key, _env, script in method_table]

# Result CSV of each method, laid out as "<method key>/<method key>_result.csv".
file_paths = [f"{key}/{key}_result.csv" for key in method_keys]

# None of these scripts should be run in a Python environment.
py_langs = [False] * len(method_keys)

# Call eval_test_all once per condition key with the method settings defined above.
for cond_key in cond_keys:
    eval_test_all(
        folder_dir, job_submission_script, eval_script, conda_envs,
        method_scripts, py_langs, file_paths, method_keys,
        ct_ref, nclust, dir_path, cond_key,
        iter_list, repeats,
        repeat_start=1,
        wait=False, wait_time=1 * 30,
        batch=1, output_folder="results_single_mod",
        ncore=8, gp_script=gp_eval_path, gp_truth=gp_truth,
        rare_ct_path=rare_ct_path,
    )


### Unpaired (multiome-split)

# Each row: (method key, conda environment, R script running the method).
method_table = [
    ("seurat3", "seurat", "run_seurat3_noPred.R"),
    ("rbindsc", "bindsc", "run_rbindsc_noPred.R"),
    ("rfigr", "figr", "run_rfigr_2.R"),
    ("rliger", "liger", "run_rliger.R"),
]
method_keys = [key for key, _env, _script in method_table]
conda_envs = [env for _key, env, _script in method_table]
method_scripts = [script for _key, _env, script in method_table]

# Result CSV of each method, laid out as "<method key>/<method key>_result.csv".
file_paths = [f"{key}/{key}_result.csv" for key in method_keys]

# None of these scripts should be run in a Python environment.
py_langs = [False] * len(method_keys)

# Call eval_test_all once per condition key with the method settings defined above.
for cond_key in cond_keys:
    eval_test_all(
        folder_dir, job_submission_script, eval_script, conda_envs,
        method_scripts, py_langs, file_paths, method_keys,
        ct_ref, nclust, dir_path, cond_key,
        iter_list, repeats,
        repeat_start=1,
        wait=False, wait_time=1 * 30,
        batch=1, output_folder="results_single_same_cell_number",
        ncore=8, gp_script=gp_eval_path, gp_truth=gp_truth,
        rare_ct_path=rare_ct_path,
    )


### python functions

# Each row: (method key, conda environment, Python script running the method).
method_table = [
    ("scmomat", "scmomat", "run_scmomat_pbmc.py"),
    ("cobolt", "cobolt", "run_cobolt.py"),
    ("glue", "glue2", "run_glue_hg38_single_noPred.py"),
    ("multivi", "multivi", "run_multivi_2_noPred.py"),
]
method_keys = [key for key, _env, _script in method_table]
conda_envs = [env for _key, env, _script in method_table]
method_scripts = [script for _key, _env, script in method_table]

# Result CSV of each method, laid out as "<method key>/<method key>_result.csv".
file_paths = [f"{key}/{key}_result.csv" for key in method_keys]

# Every script in this group should be run in a Python environment.
py_langs = [True] * len(method_keys)

# Call eval_test_all once per condition key with the method settings defined above.
for cond_key in cond_keys:
    eval_test_all(
        folder_dir, job_submission_script, eval_script, conda_envs,
        method_scripts, py_langs, file_paths, method_keys,
        ct_ref, nclust, dir_path, cond_key,
        iter_list, repeats,
        repeat_start=1,
        wait=True, wait_time=1 * 30,
        batch=4, output_folder="results_single_mod",
        ncore=8, gp_script=gp_eval_path, gp_truth=gp_truth,
        rare_ct_path=rare_ct_path,
    )


# Each row: (method key, conda environment, Python script running the method).
method_table = [
    ("glue", "glue2", "run_glue_hg38_noPred.py"),
]
method_keys = [key for key, _env, _script in method_table]
conda_envs = [env for _key, env, _script in method_table]
method_scripts = [script for _key, _env, script in method_table]

# Result CSV of each method, laid out as "<method key>/<method key>_result.csv".
file_paths = [f"{key}/{key}_result.csv" for key in method_keys]

# The script should be run in a Python environment.
py_langs = [True] * len(method_keys)

# Call eval_test_all once per condition key with the method settings defined above.
for cond_key in cond_keys:
    eval_test_all(
        folder_dir, job_submission_script, eval_script, conda_envs,
        method_scripts, py_langs, file_paths, method_keys,
        ct_ref, nclust, dir_path, cond_key,
        iter_list, repeats,
        repeat_start=1,
        wait=True, wait_time=1 * 30,
        batch=2, output_folder="results_single_same_cell_number",
        ncore=8, gp_script=gp_eval_path, gp_truth=gp_truth,
        rare_ct_path=rare_ct_path,
    )
    

## SHARE-seq 

### Data simulation

In [None]:
# Read the RNA and ATAC h5ad files of the mouse-skin SHARE-seq dataset.
source_dir = "dataset/mouse_skin/"
adata_rna = ad.read_h5ad(f"{source_dir}mouse_skin_shareseq_rna_10k.h5ad")
adata_atac = ad.read_h5ad(f"{source_dir}mouse_skin_shareseq_atac_10k.h5ad")

# Directory handed to simulate_missing_fixed as in_dir.
in_dir = "dataset/mouse_skin/multiome_fixed_missing_ct/"

# Set fragment_path = None to skip creating the fragment-file symlink.
fragment_path = "dataset/mouse_skin/mouse_skin_shareseq_fragments.tsv.gz"

# The obs column storing the cell type annotation.
ct_col = 'ct3'

repeats = 5

# Values iterated over within each condition; n_multiome_list is this very list object.
iter_list = [3000]
n_multiome_list = iter_list
n_scrna_list = [3000] * len(iter_list)
n_snatac_list = [3000] * len(iter_list)

# One depth value per entry of iter_list, all set to 1.
depth_multiome_list = [1] * len(iter_list)
depth_scrna_list = [1] * len(iter_list)
depth_snatac_list = [1] * len(iter_list)

# Per-cell-type values passed to simulate_missing_fixed as percent_single_mod_ct.
percent_single_mod_ct = {'HS': 0.1, 'Endo': 0.05}

# One row per condition:
# (cond_key, cell types removed from multiome, from scRNA, from snATAC).
# NOTE(review): [None] presumably means "remove nothing" - confirm in simulate_missing_fixed.
missing_ct_design = [
    ("noMiss_nmulti",        [None],   [None],   [None]),
    ("multiMissHS_nmulti",   ['HS'],   [None],   [None]),
    ("rnaOnlyHS_nmulti",     ['HS'],   [None],   ['HS']),
    ("atacOnlyHS_nmulti",    ['HS'],   ['HS'],   [None]),
    ("multiMissEndo_nmulti", ['Endo'], [None],   [None]),
    ("rnaOnlyEndo_nmulti",   ['Endo'], [None],   ['Endo']),
    ("atacOnlyEndo_nmulti",  ['Endo'], ['Endo'], [None]),
]
cond_keys = [row[0] for row in missing_ct_design]
cts_remove_multiome_list = [row[1] for row in missing_ct_design]
cts_remove_scrna_list = [row[2] for row in missing_ct_design]
cts_remove_snatac_list = [row[3] for row in missing_ct_design]


def to_str(s):
    """Return the string form of s."""
    return str(s)

from data_simulation import simulate_missing_fixed

# One simulation call per condition key. Position idx of every cts_remove_* list
# holds the cell-type structure of cond_keys[idx].
for idx, sim_cond_key in enumerate(cond_keys):
    simulate_missing_fixed(
        in_dir, adata_rna, adata_atac, iter_list,
        depth_multiome_list, depth_scrna_list, depth_snatac_list,
        n_multiome_list, n_scrna_list, n_snatac_list,
        cts_remove_multiome_list[idx],
        cts_remove_scrna_list[idx],
        cts_remove_snatac_list[idx],
        percent_single_mod_ct, repeats, fragment_path,
        sim_cond_key,
        to_str, downsample=False, ct_col=ct_col,
    )



### Running method + evaluations

In [None]:
# Working directory.
folder_dir = "/home/myylee/scmint/methods_eval/"

# Bash script for job submission.
job_submission_script = "submit_job_per_condition_n_missing_ct_eval.sh"

# Python script for metric evaluation. The same script serves the single-modality
# focused analysis and the all-three-datasets analysis, because the F1 score is
# calculated per cell type per modality.
eval_script = "run_metric_eval_fair_missing_ct_perMod.py"

# Data directory of this challenge, reference annotation and cluster number.
dir_path = "dataset/mouse_skin/multiome_fixed_missing_ct/"
ct_ref = "dataset/mouse_skin/mouse_skin_shareseq_bc_ct3_10k.csv"
nclust = 12

# Write the rare cell types into the data directory so the evaluation step knows
# for which cell types the F1 score has to be calculated.
rare_ct_list = ['HS', 'Endo']
rare_ct_path = os.path.join(dir_path, "rare_ct_list.csv")
rare_ct_table = pd.DataFrame(rare_ct_list)
rare_ct_table.to_csv(rare_ct_path)

# Both gp_* settings are the string "false" here.
# NOTE(review): presumably this switches the gene-peak evaluation off - confirm in eval_test_all.
gp_eval_path = "false"
gp_truth = "false"

# Same condition keys, iteration values and repeat count as in the data simulation.
cond_keys = [
    "noMiss_nmulti",
    "multiMissHS_nmulti",
    "rnaOnlyHS_nmulti",
    "atacOnlyHS_nmulti",
    "multiMissEndo_nmulti",
    "rnaOnlyEndo_nmulti",
    "atacOnlyEndo_nmulti",
]
iter_list = [3000]
repeats = 5


In [None]:
### Unpaired + seurat 4

# All methods in this group are R scripts.
# Each row: (method key, conda environment, script running the method).
method_table = [
    ("seurat3", "seurat", "run_seurat3_single_noPred.R"),
    ("rbindsc", "bindsc", "run_rbindsc_single_noPred.R"),
    ("seurat4", "seurat", "run_seurat4_3_noPred.R"),
    ("rfigr", "figr", "run_rfigr_single.R"),
    ("rliger", "liger", "run_rliger_single.R"),
]
method_keys = [key for key, _env, _script in method_table]
conda_envs = [env for _key, env, _script in method_table]
method_scripts = [script for _key, _env, script in method_table]

# Result CSV of each method, laid out as "<method key>/<method key>_result.csv".
file_paths = [f"{key}/{key}_result.csv" for key in method_keys]

# None of these scripts should be run in a Python environment.
py_langs = [False] * len(method_keys)

# Call eval_test_all once per condition key with the method settings defined above.
for cond_key in cond_keys:
    eval_test_all(
        folder_dir, job_submission_script, eval_script, conda_envs,
        method_scripts, py_langs, file_paths, method_keys,
        ct_ref, nclust, dir_path, cond_key,
        iter_list, repeats,
        repeat_start=1,
        wait=False, wait_time=1 * 30,
        batch=1, output_folder="results_single_mod",
        ncore=8, gp_script=gp_eval_path, gp_truth=gp_truth,
        rare_ct_path=rare_ct_path,
    )


### Unpaired (multiome-split)

# Each row: (method key, conda environment, R script running the method).
method_table = [
    ("seurat3", "seurat", "run_seurat3_noPred.R"),
    ("rbindsc", "bindsc", "run_rbindsc_noPred.R"),
    ("rfigr", "figr", "run_rfigr_2.R"),
    ("rliger", "liger", "run_rliger.R"),
]
method_keys = [key for key, _env, _script in method_table]
conda_envs = [env for _key, env, _script in method_table]
method_scripts = [script for _key, _env, script in method_table]

# Result CSV of each method, laid out as "<method key>/<method key>_result.csv".
file_paths = [f"{key}/{key}_result.csv" for key in method_keys]

# None of these scripts should be run in a Python environment.
py_langs = [False] * len(method_keys)

# Call eval_test_all once per condition key with the method settings defined above.
for cond_key in cond_keys:
    eval_test_all(
        folder_dir, job_submission_script, eval_script, conda_envs,
        method_scripts, py_langs, file_paths, method_keys,
        ct_ref, nclust, dir_path, cond_key,
        iter_list, repeats,
        repeat_start=1,
        wait=False, wait_time=1 * 30,
        batch=1, output_folder="results_single_same_cell_number",
        ncore=8, gp_script=gp_eval_path, gp_truth=gp_truth,
        rare_ct_path=rare_ct_path,
    )


### python functions

# Each row: (method key, conda environment, Python script running the method).
method_table = [
    ("scmomat", "scmomat", "run_scmomat_mouse_skin.py"),
    ("cobolt", "cobolt", "run_cobolt.py"),
    ("glue", "glue2", "run_glue_mm10_single_noPred.py"),
    ("multivi", "multivi", "run_multivi_2_noPred.py"),
]
method_keys = [key for key, _env, _script in method_table]
conda_envs = [env for _key, env, _script in method_table]
method_scripts = [script for _key, _env, script in method_table]

# Result CSV of each method, laid out as "<method key>/<method key>_result.csv".
file_paths = [f"{key}/{key}_result.csv" for key in method_keys]

# Every script in this group should be run in a Python environment.
py_langs = [True] * len(method_keys)

# Call eval_test_all once per condition key with the method settings defined above.
for cond_key in cond_keys:
    eval_test_all(
        folder_dir, job_submission_script, eval_script, conda_envs,
        method_scripts, py_langs, file_paths, method_keys,
        ct_ref, nclust, dir_path, cond_key,
        iter_list, repeats,
        repeat_start=1,
        wait=True, wait_time=1 * 30,
        batch=4, output_folder="results_single_mod",
        ncore=8, gp_script=gp_eval_path, gp_truth=gp_truth,
        rare_ct_path=rare_ct_path,
    )


# Each row: (method key, conda environment, Python script running the method).
method_table = [
    ("glue", "glue2", "run_glue_mm10_noPred.py"),
]
method_keys = [key for key, _env, _script in method_table]
conda_envs = [env for _key, env, _script in method_table]
method_scripts = [script for _key, _env, script in method_table]

# Result CSV of each method, laid out as "<method key>/<method key>_result.csv".
file_paths = [f"{key}/{key}_result.csv" for key in method_keys]

# The script should be run in a Python environment.
py_langs = [True] * len(method_keys)


# Call eval_test_all once per condition key with the method settings defined above.
for cond_key in cond_keys:
    eval_test_all(
        folder_dir, job_submission_script, eval_script, conda_envs,
        method_scripts, py_langs, file_paths, method_keys,
        ct_ref, nclust, dir_path, cond_key,
        iter_list, repeats,
        repeat_start=1,
        wait=True, wait_time=1 * 30,
        batch=2, output_folder="results_single_same_cell_number",
        ncore=8, gp_script=gp_eval_path, gp_truth=gp_truth,
        rare_ct_path=rare_ct_path,
    )
    

# HPAP integration

In [4]:
# Working directory.
folder_dir = "/home/myylee/scmint/methods_eval/"

# Bash script for job submission and Python script for metric evaluation,
# both given as absolute paths inside the working directory.
job_submission_script = folder_dir + "submit_job_per_condition_n_eval2.sh"
eval_script = folder_dir + "run_metric_eval_batch2_hpap.py"

# HPAP reference annotation and data directory share one parent directory.
hpap_dir = "/project/mingyaolpc/myylee/scmint/methods_eval/dataset/hpap/"
ct_ref = hpap_dir + "hpap_all_bc_ct3.csv"
dir_path = hpap_dir + "real_data/"
nclust = 10

# A single condition key, a single iteration value and a single repeat.
cond_key = "all"
iter_list = [70000]
repeats = 1

# Both gp_* settings are the string "false" here.
gp_eval_path = "false"
gp_truth = "false"

# Forwarded to eval_test_all as mem_limit (the unit is not visible in this notebook).
mem_limit = 100

# ===== functions =====

# == Python functions ==

# Each row: (method key, conda environment, Python script running the method).
method_table = [
    ("multivi", "multivi", "run_multivi_batch.py"),
    ("glue", "glue2", "run_glue_hg38_batch.py"),
    ("cobolt", "cobolt", "run_cobolt.py"),
    ("scmomat", "scmomat", "run_scmomat_batch_hpap.py"),
]
method_keys = [key for key, _env, _script in method_table]
conda_envs = [env for _key, env, _script in method_table]
method_scripts = [script for _key, _env, script in method_table]

# Result CSV of each method, laid out as "<method key>/<method key>_result.csv".
file_paths = [f"{key}/{key}_result.csv" for key in method_keys]

# Every script in this group should be run in a Python environment.
py_langs = [True] * len(method_keys)


# Single eval_test_all call for the one HPAP condition, using the Python method settings above.
eval_test_all(
    folder_dir, job_submission_script, eval_script, conda_envs,
    method_scripts, py_langs, file_paths, method_keys,
    ct_ref, nclust, dir_path, cond_key,
    iter_list, repeats,
    repeat_start=1,
    wait=True, wait_time=2 * 60,
    batch=1, output_folder="results",
    ncore=8, gp_script=gp_eval_path, gp_truth=gp_truth,
    mem_limit=mem_limit,
)

# == R functions ==

# Each row: (method key, conda environment, R script running the method).
method_table = [
    ("seurat3", "seurat", "run_seurat3_batch_sequential.R"),
    ("rfigr", "figr", "run_rfigr_2_hpap_sequential.R"),
    ("rliger", "liger", "run_rliger_batch_sequential.R"),
    ("seurat4int", "seurat", "run_seurat4_4_sequential.R"),
    ("rbindsc", "bindsc", "run_rbindsc_batch_sequential.R"),
]
method_keys = [key for key, _env, _script in method_table]
conda_envs = [env for _key, env, _script in method_table]
method_scripts = [script for _key, _env, script in method_table]

# Result CSV of each method, laid out as "<method key>/<method key>_result.csv".
file_paths = [f"{key}/{key}_result.csv" for key in method_keys]

# None of these scripts should be run in a Python environment.
py_langs = [False] * len(method_keys)


# Single eval_test_all call for the one HPAP condition, using the R method settings above.
eval_test_all(
    folder_dir, job_submission_script, eval_script, conda_envs,
    method_scripts, py_langs, file_paths, method_keys,
    ct_ref, nclust, dir_path, cond_key,
    iter_list, repeats,
    repeat_start=1,
    wait=True, wait_time=2 * 60,
    batch=1, output_folder="results",
    ncore=8, gp_script=gp_eval_path, gp_truth=gp_truth,
    mem_limit=mem_limit,
)

