<a href="https://colab.research.google.com/github/myylee/benchmark_sc_multiomic_integration/blob/main/evaluate_vary_situations_public.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Run with a python kernel

In [None]:
%load_ext autoreload
%autoreload 2

# load libraries
import h5py
import numpy as np
import pandas as pd
import scanpy as sc
from anndata import AnnData
import anndata as ad
from copy import deepcopy
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
import pickle
import scipy.io as sio
import os

# Path to R [modify as needed]
os.environ['R_HOME'] = '/home/myylee/anaconda3/envs/r_py/lib/R/'
import utils_eval


from data_simulation import pair_unpair_split_size,downsample_samples, data_simulation, data_simulation_batch
from data_simulation import eval_test_all


# Scenario 1

## PBMC 

### Data simulation

In [None]:
# Scenario 1 / PBMC: load the public RNA and ATAC objects and define the
# settings for the simulation in which the number of multiome cells varies.
source_dir = "dataset/multiome_pbmc_10k/"
adata_rna = ad.read_h5ad(source_dir + "pbmc_10x_rna_public.h5ad")
adata_atac = ad.read_h5ad(source_dir + "pbmc_10x_atac_public.h5ad")

# Folder passed to data_simulation as its first argument.
in_dir = "dataset/multiome_pbmc_10k/pbmc_vary_cell_test/"

# Condition name and the values it takes; each setting is repeated `repeats` times.
cond_key = "nmulti"
iter_list = [1000, 3000, 8000]
repeats = 5

# Cell numbers per condition: only the multiome count changes (it is iter_list itself).
n_multiome_list = iter_list
n_scrna_list = [1000] * len(iter_list)
n_snatac_list = [1000] * len(iter_list)

# Depth settings per condition: all fixed at 1 in this scenario.
depth_multiome_list = [1] * len(iter_list)
depth_scrna_list = [1] * len(iter_list)
depth_snatac_list = [1] * len(iter_list)

# Absolute path to the ATAC fragment file; set fragment_path = None if no
# fragment-file symlink should be created.
fragment_path = "{}/dataset/multiome_pbmc_10k/pbmc_granulocyte_sorted_10k_atac_fragments.tsv.gz".format(os.getcwd())
    
def to_str(s):
    """Return ``str(s)``; passed to ``data_simulation`` as its formatter argument."""
    return str(s)

# Simulate from the PBMC objects loaded in this cell. The previous call passed
# `adata_rna_sel` / `adata_atac_sel`, which this cell never defines: on a fresh
# kernel that is a NameError, and otherwise it silently reuses whatever subset
# an earlier cell left in the notebook state.
data_simulation(
    in_dir, adata_rna, adata_atac, iter_list,
    depth_multiome_list, depth_scrna_list, depth_snatac_list,
    n_multiome_list, n_scrna_list, n_snatac_list,
    repeats, fragment_path, cond_key,
    to_str, downsample=False,
)


### Running method + evaluations

In [None]:
# ---- Settings shared by every eval_test_all call in this cell (Scenario 1, PBMC) ----

# Simulated data to evaluate: root folder, condition name, condition values, repeats.
dir_path = "dataset/multiome_pbmc_10k/pbmc_vary_cell_test/"
cond_key = "nmulti"
iter_list = [1000, 3000, 8000]
repeats = 5

# Working directory, bash script for job submission, python script for metric evaluation.
folder_dir = "/home/myylee/scmint/methods_eval/"
job_submission_script = "submit_job_per_condition_n_eval2.sh"
eval_script = "run_metric_eval_single.py"

# Forwarded to eval_test_all as ct_ref / nclust.
ct_ref = "dataset/multiome_pbmc_10k/pbmc_10x_bc_ct3.csv"
nclust = 7

# Forwarded to eval_test_all as gp_script / gp_truth.
gp_eval_path = "eval_missing_modality_prediction_single.R"
gp_truth = "dataset/multiome_pbmc_10k/pbmc_10x_pmat_sig_links_50kb_unique.csv"


# ===== multiome-guided methods (Python scripts) =====
# One entry per method, listed in the same method order in every list.
method_keys = ["multivi", "cobolt"]
conda_envs = ["multivi", "cobolt"]
method_scripts = ["run_multivi_2.py", "run_cobolt.py"]  # script that runs each method
py_langs = [True, True]  # each script is run in a python environment
file_paths = [
    "multivi/multivi_result.csv",
    "cobolt/cobolt_result.csv",
]

# wait=True, batch=2: submit 2 jobs at a time and wait until they are done
# before submitting new ones.
eval_test_all(
    folder_dir, job_submission_script, eval_script, conda_envs,
    method_scripts, py_langs, file_paths, method_keys,
    ct_ref, nclust, dir_path, cond_key,
    iter_list, repeats,
    repeat_start=1,
    wait=True,
    wait_time=2 * 60,
    batch=2,
    output_folder="results_single_mod",
    ncore=8,
    gp_script=gp_eval_path,
    gp_truth=gp_truth,
)


# ===== multiome-guided methods (R scripts) =====
method_keys = ["seurat4"]
conda_envs = ["seurat"]
method_scripts = ["run_seurat4_3.R"]  # R script that runs the method
py_langs = [False]  # not run in a python environment
file_paths = ["seurat4/seurat4_result.csv"]

# wait=False: submit all jobs at once.
eval_test_all(
    folder_dir, job_submission_script, eval_script, conda_envs,
    method_scripts, py_langs, file_paths, method_keys,
    ct_ref, nclust, dir_path, cond_key,
    iter_list, repeats,
    repeat_start=1,
    wait=False,
    wait_time=2 * 60,
    batch=2,
    output_folder="results_single_mod",
    ncore=8,
    gp_script=gp_eval_path,
    gp_truth=gp_truth,
)


# ===== unpaired =====
# The lists below are presumably consumed position-by-position by eval_test_all,
# so every list must name the methods in the same order:
# seurat3, rbindsc, rfigr, rliger.
# Previously file_paths had rliger/rfigr swapped relative to method_scripts and
# conda_envs, and method_keys listed "seurat4" instead of "rfigr"/"rliger".
conda_envs = ["seurat", "bindsc", "figr", "liger"]
# R script for running each method
method_scripts = ["run_seurat3_single.R",
                  "run_rbindsc_single.R",
                  "run_rfigr_single.R",
                  "run_rliger_single.R"]
# whether each script should be run in a python environment
py_langs = [False]*4

file_paths = ["seurat3/seurat3_result.csv",
              "rbindsc/rbindsc_result.csv",
              "rfigr/rfigr_result.csv",
              "rliger/rliger_result.csv"]

method_keys = ["seurat3", "rbindsc", "rfigr", "rliger"]

# submit all jobs at once
eval_test_all(folder_dir,job_submission_script,eval_script,conda_envs,
              method_scripts,py_langs,file_paths,method_keys,
              ct_ref,nclust,dir_path,cond_key,
              iter_list,repeats,repeat_start=1,
              wait=False, wait_time=2*60,batch=2,output_folder="results_single_mod",
              ncore=8,gp_script=gp_eval_path,gp_truth=gp_truth)

# ===== unpaired methods (multiome-split) =====
# One entry per method, listed in the same method order in every list.
method_keys = ["seurat3", "rbindsc", "rfigr", "rliger"]
conda_envs = ["seurat", "bindsc", "figr", "liger"]
method_scripts = [  # R script that runs each method
    "run_seurat3.R",
    "run_rbindsc.R",
    "run_rfigr_2.R",
    "run_rliger.R",
]
py_langs = [False, False, False, False]  # none are run in a python environment
file_paths = [
    "seurat3/seurat3_result.csv",
    "rbindsc/rbindsc_result.csv",
    "rfigr/rfigr_result.csv",
    "rliger/rliger_result.csv",
]

# wait=False: submit all jobs at once.
eval_test_all(
    folder_dir, job_submission_script, eval_script, conda_envs,
    method_scripts, py_langs, file_paths, method_keys,
    ct_ref, nclust, dir_path, cond_key,
    iter_list, repeats,
    repeat_start=1,
    wait=False,
    wait_time=2 * 60,
    batch=2,
    output_folder="results_single_same_cell_number",
    ncore=8,
    gp_script=gp_eval_path,
    gp_truth=gp_truth,
)



### Add dashed line in peak-gene recovery percentage with the true paired RNA and ATAC profiles

In [None]:
# Build the "truth" entry for every simulated PBMC dataset: copy the unpaired
# ATAC files and write the true RNA profiles of the same cells next to them.
import shutil
dir_path = "dataset/multiome_pbmc_10k/pbmc_vary_cell_test/"
cond_key = "nmulti"

iter_list = [1000,3000,8000]
repeats = 5
output_folder="results_single_same_cell_number"

# The full RNA object is only read inside the loops, so load it once instead
# of re-reading the h5ad file for every condition/repeat.
in_dir = "dataset/multiome_pbmc_10k/"
adata_rna = ad.read_h5ad(in_dir+"pbmc_10x_rna_public.h5ad")

# (file name in unpaired_ATAC, file name under truth/predicted/ATAC)
atac_files = (("barcodes.tsv", "barcodes.tsv"),
              ("peak.tsv", "peak.tsv"),
              ("ATAC_counts.mtx", "counts.mtx"))

for n_multi in iter_list:
    for j in range(1,repeats+1):
        in_dir_i = os.path.join(dir_path,"{}{}_{}".format(cond_key,n_multi,j))
        out_dir_i = os.path.join(in_dir_i,output_folder)
        atac_out_dir = os.path.join(out_dir_i,"truth","predicted","ATAC")

        os.makedirs(atac_out_dir, exist_ok=True)
        # save ATAC to the right location
        for src_name, dst_name in atac_files:
            shutil.copy2(os.path.join(in_dir_i,"unpaired_ATAC",src_name),
                         os.path.join(atac_out_dir,dst_name))

        # load in ATAC barcodes (single column, no header)
        atac_bc = pd.read_csv(os.path.join(in_dir_i,"unpaired_ATAC","barcodes.tsv"),header=None)
        idx_sel = atac_bc[0].tolist()

        # RNA profile of the cells named by the ATAC barcodes
        adata_rna_sel = AnnData(csr_matrix(deepcopy(adata_rna[idx_sel,:].X.todense())),
                               obs=adata_rna.obs.loc[idx_sel,:],
                               var=adata_rna.var,dtype=np.float32)
        adata_rna_sel.obs['rna.bc'] = adata_rna_sel.obs_names
        # save RNA to the right location
        utils_eval.write_adata(adata_rna_sel, os.path.join(out_dir_i,"truth","predicted","RNA"),"RNA","gene",feature_name='feature',transpose=True)



In [None]:
# Run the missing-modality-prediction evaluation for the "truth" entry (PBMC).

# Simulated data to evaluate: root folder, condition name, condition values, repeats.
dir_path = "dataset/multiome_pbmc_10k/pbmc_vary_cell_test/"
cond_key = "nmulti"
iter_list = [1000, 3000, 8000]
repeats = 5

# Working directory, bash script for job submission, python script for metric evaluation.
folder_dir = "/home/myylee/scmint/methods_eval/"
job_submission_script = "/home/myylee/scmint/methods_eval/submit_job_per_missing_mod_eval.sh"
eval_script = "/home/myylee/scmint/methods_eval/run_metric_eval_single.py"

# Forwarded to eval_test_all as ct_ref / nclust.
ct_ref = "dataset/multiome_pbmc_10k/pbmc_10x_bc_ct3.csv"
nclust = 7

# Forwarded to eval_test_all as gp_script / gp_truth.
gp_eval_path = "eval_missing_modality_prediction_single.R"
gp_truth = "dataset/multiome_pbmc_10k/pbmc_10x_pmat_sig_links_50kb_unique.csv"

method_keys = ["truth"]

# ----- Placeholders: not used by the submission script, but must be non-empty strings -----
conda_envs = ["truth_na"]
method_scripts = ["run_truth.csv"]
py_langs = [False]
file_paths = ["truth/truth_result.csv"]
# ----- END -----

eval_test_all(
    folder_dir, job_submission_script, eval_script, conda_envs,
    method_scripts, py_langs, file_paths, method_keys,
    ct_ref, nclust, dir_path, cond_key,
    iter_list, repeats,
    repeat_start=1,
    wait=False,
    wait_time=2 * 60,
    batch=2,
    output_folder="results_single_same_cell_number",
    ncore=8,
    gp_script=gp_eval_path,
    gp_truth=gp_truth,
)





## BMMC

### Data simulation

In [None]:
# Scenario 1 / BMMC: build the paired RNA + ATAC inputs and the simulation settings.
# The ATAC object is the pmat dataset generated from the bmmc_process R script.
source_dir = "dataset/bmmc/"
adata_rna = ad.read_h5ad(source_dir + "multiome_bmmc_site1_or_donor1_RNA.h5ad")
adata_atac = ad.read_h5ad(source_dir + "multiome_bmmc_site1_or_donor1_ATAC_pmat.h5ad")

# Make the 'features' column available under the name 'feature' as well.
adata_rna.var['feature'] = adata_rna.var['features']
adata_atac.var['feature'] = adata_atac.var['features']

# Boolean mask of the cells whose ATAC 'batch' is s1d2. The same mask is applied
# to the RNA object, which presumably relies on both objects listing the cells
# in the same order.
idx_sel = adata_atac.obs['batch'].isin(["s1d2"]).tolist()

# Paired - RNA
adata_rna_sel = AnnData(
    csr_matrix(deepcopy(adata_rna.X[idx_sel, :].todense())),
    obs=adata_rna.obs.iloc[idx_sel, :],
    var=adata_rna.var,
    dtype=np.float32,
)
# Paired - ATAC
adata_atac_sel = AnnData(
    csr_matrix(deepcopy(adata_atac.X[idx_sel, :].todense())),
    obs=adata_atac.obs.iloc[idx_sel, :],
    var=adata_atac.var,
    dtype=np.float32,
)

# Folder passed to data_simulation as its first argument.
in_dir = "dataset/bmmc/bmmc_vary_cell_test/"

# Condition name and the values it takes; each setting is repeated `repeats` times.
cond_key = "nmulti"
iter_list = [1000, 2000, 4000]
repeats = 5

# Cell numbers per condition: only the multiome count changes (it is iter_list itself).
n_multiome_list = iter_list
n_scrna_list = [1000] * len(iter_list)
n_snatac_list = [1000] * len(iter_list)

# Depth settings per condition: all fixed at 1 in this scenario.
depth_multiome_list = [1] * len(iter_list)
depth_scrna_list = [1] * len(iter_list)
depth_snatac_list = [1] * len(iter_list)

# Absolute path to the fragment file; set fragment_path = None if no
# fragment-file symlink should be created. The file was created using
# correct_s1d2_fragments_sel.ipynb, where the barcodes were made the same as
# those stored in adata.
fragment_path = "{}/dataset/bmmc/s1d2_atac_fragments.tsv.gz".format(os.getcwd())

def to_str(s):
    """Return ``str(s)``; passed to ``data_simulation`` as its formatter argument."""
    return str(s)


# Run the simulation on the s1d2 subsets with the settings defined above;
# this scenario is run with downsample=False.
data_simulation(
    in_dir, adata_rna_sel, adata_atac_sel, iter_list,
    depth_multiome_list, depth_scrna_list, depth_snatac_list,
    n_multiome_list, n_scrna_list, n_snatac_list,
    repeats, fragment_path, cond_key,
    to_str,
    downsample=False,
)



### Running method + evaluations

In [None]:
# ---- Settings shared by every eval_test_all call in this cell (Scenario 1, BMMC) ----

# Simulated data to evaluate: root folder, condition name, condition values, repeats.
dir_path = "dataset/bmmc/bmmc_vary_cell_test/"
cond_key = "nmulti"
iter_list = [1000, 2000, 4000]
repeats = 5

# Working directory, bash script for job submission, python script for metric evaluation.
folder_dir = "/home/myylee/scmint/methods_eval/"
job_submission_script = "submit_job_per_condition_n_eval2.sh"
eval_script = "run_metric_eval_single.py"

# Forwarded to eval_test_all as ct_ref / nclust.
ct_ref = "dataset/bmmc/bmmc_all_bc_ct3.csv"
nclust = 21

# Forwarded to eval_test_all as gp_script / gp_truth.
gp_eval_path = "eval_missing_modality_prediction_single.R"
gp_truth = "dataset/bmmc/bmmc_s1d2_pmat_all_ct_sig_links_50kb_unique.csv"


# ===== multiome-guided methods (Python scripts) =====
# One entry per method, listed in the same method order in every list.
method_keys = ["multivi", "cobolt"]
conda_envs = ["multivi", "cobolt"]
method_scripts = ["run_multivi_2.py", "run_cobolt.py"]  # script that runs each method
py_langs = [True, True]  # each script is run in a python environment
file_paths = [
    "multivi/multivi_result.csv",
    "cobolt/cobolt_result.csv",
]

# wait=True, batch=2: submit 2 jobs at a time and wait until they are done
# before submitting new ones.
eval_test_all(
    folder_dir, job_submission_script, eval_script, conda_envs,
    method_scripts, py_langs, file_paths, method_keys,
    ct_ref, nclust, dir_path, cond_key,
    iter_list, repeats,
    repeat_start=1,
    wait=True,
    wait_time=2 * 60,
    batch=2,
    output_folder="results_single_mod",
    ncore=8,
    gp_script=gp_eval_path,
    gp_truth=gp_truth,
)


# ===== multiome-guided methods (R scripts) =====
method_keys = ["seurat4"]
conda_envs = ["seurat"]
method_scripts = ["run_seurat4_3.R"]  # R script that runs the method
py_langs = [False]  # not run in a python environment
file_paths = ["seurat4/seurat4_result.csv"]

# wait=False: submit all jobs at once.
eval_test_all(
    folder_dir, job_submission_script, eval_script, conda_envs,
    method_scripts, py_langs, file_paths, method_keys,
    ct_ref, nclust, dir_path, cond_key,
    iter_list, repeats,
    repeat_start=1,
    wait=False,
    wait_time=2 * 60,
    batch=2,
    output_folder="results_single_mod",
    ncore=8,
    gp_script=gp_eval_path,
    gp_truth=gp_truth,
)


# ===== unpaired =====
# The lists below are presumably consumed position-by-position by eval_test_all,
# so every list must name the methods in the same order:
# seurat3, rbindsc, rfigr, rliger.
# Previously file_paths had rliger/rfigr swapped relative to method_scripts and
# conda_envs, and method_keys listed "seurat4" instead of "rfigr"/"rliger".
conda_envs = ["seurat", "bindsc", "figr", "liger"]
# R script for running each method
method_scripts = ["run_seurat3_single.R",
                  "run_rbindsc_single.R",
                  "run_rfigr_single.R",
                  "run_rliger_single.R"]
# whether each script should be run in a python environment
py_langs = [False]*4

file_paths = ["seurat3/seurat3_result.csv",
              "rbindsc/rbindsc_result.csv",
              "rfigr/rfigr_result.csv",
              "rliger/rliger_result.csv"]

method_keys = ["seurat3", "rbindsc", "rfigr", "rliger"]

# submit all jobs at once
eval_test_all(folder_dir,job_submission_script,eval_script,conda_envs,
              method_scripts,py_langs,file_paths,method_keys,
              ct_ref,nclust,dir_path,cond_key,
              iter_list,repeats,repeat_start=1,
              wait=False, wait_time=2*60,batch=2,output_folder="results_single_mod",
              ncore=8,gp_script=gp_eval_path,gp_truth=gp_truth)

# ===== unpaired methods (multiome-split) =====
# One entry per method, listed in the same method order in every list.
method_keys = ["seurat3", "rbindsc", "rfigr", "rliger"]
conda_envs = ["seurat", "bindsc", "figr", "liger"]
method_scripts = [  # R script that runs each method
    "run_seurat3.R",
    "run_rbindsc.R",
    "run_rfigr_2.R",
    "run_rliger.R",
]
py_langs = [False, False, False, False]  # none are run in a python environment
file_paths = [
    "seurat3/seurat3_result.csv",
    "rbindsc/rbindsc_result.csv",
    "rfigr/rfigr_result.csv",
    "rliger/rliger_result.csv",
]

# wait=False: submit all jobs at once.
eval_test_all(
    folder_dir, job_submission_script, eval_script, conda_envs,
    method_scripts, py_langs, file_paths, method_keys,
    ct_ref, nclust, dir_path, cond_key,
    iter_list, repeats,
    repeat_start=1,
    wait=False,
    wait_time=2 * 60,
    batch=2,
    output_folder="results_single_same_cell_number",
    ncore=8,
    gp_script=gp_eval_path,
    gp_truth=gp_truth,
)



### Add dashed line in peak-gene recovery percentage with the true paired RNA and ATAC profiles

In [None]:
# Build the "truth" entry for every simulated BMMC dataset: copy the unpaired
# ATAC files and write the true RNA profiles of the same cells next to them.
import shutil
dir_path = "dataset/bmmc/bmmc_vary_cell_test/"
cond_key = "nmulti"

iter_list = [1000,2000,4000]
repeats = 5
output_folder="results_single_same_cell_number"

# The full RNA object is only read inside the loops, so load and annotate it
# once instead of re-reading the h5ad file for every condition/repeat.
in_dir = "dataset/bmmc/"
adata_rna = ad.read_h5ad(in_dir+"multiome_bmmc_site1_or_donor1_RNA.h5ad")
adata_rna.obs['rna.bc'] = adata_rna.obs_names
adata_rna.var['feature'] = adata_rna.var['features']

# (file name in unpaired_ATAC, file name under truth/predicted/ATAC)
atac_files = (("barcodes.tsv", "barcodes.tsv"),
              ("peak.tsv", "peak.tsv"),
              ("ATAC_counts.mtx", "counts.mtx"))

for n_multi in iter_list:
    for j in range(1,repeats+1):
        in_dir_i = os.path.join(dir_path,"{}{}_{}".format(cond_key,n_multi,j))
        out_dir_i = os.path.join(in_dir_i,output_folder)
        atac_out_dir = os.path.join(out_dir_i,"truth","predicted","ATAC")

        os.makedirs(atac_out_dir, exist_ok=True)
        # save ATAC to the right location
        for src_name, dst_name in atac_files:
            shutil.copy2(os.path.join(in_dir_i,"unpaired_ATAC",src_name),
                         os.path.join(atac_out_dir,dst_name))

        # load in ATAC barcodes (single column, no header)
        atac_bc = pd.read_csv(os.path.join(in_dir_i,"unpaired_ATAC","barcodes.tsv"),header=None)
        idx_sel = atac_bc[0].tolist()

        # RNA profile of the cells named by the ATAC barcodes
        adata_rna_sel = AnnData(csr_matrix(deepcopy(adata_rna[idx_sel,:].X.todense())),
                               obs=adata_rna.obs.loc[idx_sel,:],
                               var=adata_rna.var,dtype=np.float32)
        # save RNA to the right location
        utils_eval.write_adata(adata_rna_sel, os.path.join(out_dir_i,"truth","predicted","RNA"),"RNA","gene",feature_name='feature',transpose=True)




In [None]:
# Run the missing-modality-prediction evaluation for the "truth" entry (BMMC).

# working_dir
folder_dir="/home/myylee/scmint/methods_eval/"
# bash script for job submission.
# NOTE(review): this cell previously used "submit_job_per_condition_n_eval2.sh",
# the script used elsewhere to run a method and then evaluate it. The
# placeholder method settings below are documented as "not used in script",
# which only fits the dedicated missing-modality script that the PBMC "truth"
# cell uses, so both script paths now mirror that cell. Confirm against the
# submission scripts.
job_submission_script = "/home/myylee/scmint/methods_eval/submit_job_per_missing_mod_eval.sh"
# python script for metric evaluation
eval_script = "/home/myylee/scmint/methods_eval/run_metric_eval_single.py"

ct_ref = "dataset/bmmc/bmmc_all_bc_ct3.csv"
nclust = 21

gp_eval_path = "eval_missing_modality_prediction_single.R"
gp_truth = "dataset/bmmc/bmmc_s1d2_pmat_all_ct_sig_links_50kb_unique.csv"

dir_path = "dataset/bmmc/bmmc_vary_cell_test/"
cond_key = "nmulti"

iter_list = [1000,2000,4000]
repeats = 5

method_keys = ["truth"]

#----- Placeholders: not used by the submission script, but must be non-empty strings ----
conda_envs = ["truth_na"]
# placeholder for the method script
method_scripts = ["run_truth.csv"]
# placeholder for the python-environment flag
py_langs = [False]
file_paths = ["truth/truth_result.csv"]
#------ END -----


eval_test_all(folder_dir,job_submission_script,eval_script,conda_envs,
              method_scripts,py_langs,file_paths,method_keys,
              ct_ref,nclust,dir_path,cond_key,
              iter_list,repeats,repeat_start=1,
              wait=False, wait_time=2*60,batch=2,output_folder="results_single_same_cell_number",
              ncore=8,gp_script=gp_eval_path,gp_truth=gp_truth)





# Scenario 2

## PBMC 2000 cells 

### Data simulation

In [None]:
# Scenario 2 / PBMC (2000 multiome cells): load the public RNA and ATAC objects
# and define the settings for the simulation in which the multiome depth varies.
source_dir = "dataset/multiome_pbmc_10k/"
adata_rna = ad.read_h5ad(source_dir + "pbmc_10x_rna_public.h5ad")
adata_atac = ad.read_h5ad(source_dir + "pbmc_10x_atac_public.h5ad")

# Folder passed to data_simulation as its first argument.
in_dir = "dataset/multiome_pbmc_10k/nmulti2000_7ct_vdepth_test/"

# Condition name and the values it takes; each setting is repeated `repeats` times.
cond_key = "depthmulti"
iter_list = [0.25, 0.5, 0.75, 1]
repeats = 5

# Depth settings per condition: only the multiome depth changes (it is iter_list itself).
depth_multiome_list = iter_list
depth_scrna_list = [1] * len(iter_list)
depth_snatac_list = [1] * len(iter_list)

# Cell numbers per condition: fixed for every depth.
n_multiome_list = [2000] * len(iter_list)
n_scrna_list = [1000] * len(iter_list)
n_snatac_list = [1000] * len(iter_list)

# Absolute path to the ATAC fragment file; set fragment_path = None if no
# fragment-file symlink should be created.
fragment_path = "{}/dataset/multiome_pbmc_10k/pbmc_granulocyte_sorted_10k_atac_fragments.tsv.gz".format(os.getcwd())

def str_norm(s):
    """Return the fraction ``s`` as a whole-number percentage string.

    For example ``0.25`` gives ``"25"`` and ``1`` gives ``"100"``. The product
    is rounded before the integer conversion because ``int()`` truncates, and
    binary floating point makes e.g. ``0.29 * 100`` evaluate to
    ``28.999999999999996``, which would otherwise come out as ``"28"``.
    """
    return str(int(round(s * 100)))

# Simulate from the PBMC objects loaded in this cell. The previous call passed
# `adata_rna_sel` / `adata_atac_sel`, which this cell never defines: when the
# notebook is run top to bottom they still hold the BMMC subsets built by an
# earlier cell, so BMMC data would be written into the PBMC folder (and on a
# fresh kernel the call fails with a NameError).
data_simulation(
    in_dir, adata_rna, adata_atac, iter_list,
    depth_multiome_list, depth_scrna_list, depth_snatac_list,
    n_multiome_list, n_scrna_list, n_snatac_list,
    repeats, fragment_path, cond_key,
    str_norm, downsample=True,
)



### Running method + evaluations

In [None]:
# ---- Settings shared by every eval_test_all call in this cell (Scenario 2, PBMC 2000 cells) ----

# Simulated data to evaluate: root folder, condition name, condition values, repeats.
# The values are the simulated depth fractions expressed as percentages.
dir_path = "dataset/multiome_pbmc_10k/nmulti2000_7ct_vdepth_test/"
cond_key = "depthmulti"
iter_list = [25, 50, 75, 100]
repeats = 5

# Working directory, bash script for job submission, python script for metric evaluation.
folder_dir = "/home/myylee/scmint/methods_eval/"
job_submission_script = "submit_job_per_condition_n_eval2.sh"
eval_script = "run_metric_eval_fair.py"

# Forwarded to eval_test_all as ct_ref / nclust.
ct_ref = "dataset/multiome_pbmc_10k/pbmc_10x_bc_ct3.csv"
nclust = 7

# Forwarded to eval_test_all as gp_script / gp_truth.
gp_eval_path = "eval_missing_modality_prediction_single.R"
gp_truth = "dataset/multiome_pbmc_10k/pbmc_10x_pmat_sig_links_50kb_unique.csv"


# ===== multiome-guided methods (Python scripts) =====
# One entry per method, listed in the same method order in every list.
method_keys = ["multivi", "cobolt"]
conda_envs = ["multivi", "cobolt"]
method_scripts = ["run_multivi_2.py", "run_cobolt.py"]  # script that runs each method
py_langs = [True, True]  # each script is run in a python environment
file_paths = [
    "multivi/multivi_result.csv",
    "cobolt/cobolt_result.csv",
]

# wait=True, batch=2: submit 2 jobs at a time and wait until they are done
# before submitting new ones.
eval_test_all(
    folder_dir, job_submission_script, eval_script, conda_envs,
    method_scripts, py_langs, file_paths, method_keys,
    ct_ref, nclust, dir_path, cond_key,
    iter_list, repeats,
    repeat_start=1,
    wait=True,
    wait_time=2 * 60,
    batch=2,
    output_folder="results",
    ncore=8,
    gp_script=gp_eval_path,
    gp_truth=gp_truth,
)


# ===== multiome-guided methods (R scripts) =====
method_keys = ["seurat4"]
conda_envs = ["seurat"]
method_scripts = ["run_seurat4_3.R"]  # R script that runs the method
py_langs = [False]  # not run in a python environment
file_paths = ["seurat4/seurat4_result.csv"]

# wait=False: submit all jobs at once.
eval_test_all(
    folder_dir, job_submission_script, eval_script, conda_envs,
    method_scripts, py_langs, file_paths, method_keys,
    ct_ref, nclust, dir_path, cond_key,
    iter_list, repeats,
    repeat_start=1,
    wait=False,
    wait_time=2 * 60,
    batch=2,
    output_folder="results",
    ncore=8,
    gp_script=gp_eval_path,
    gp_truth=gp_truth,
)


# ===== unpaired methods (multiome-split) =====
# One entry per method, listed in the same method order in every list.
method_keys = ["seurat3", "rbindsc", "rfigr", "rliger"]
conda_envs = ["seurat", "bindsc", "figr", "liger"]
method_scripts = [  # R script that runs each method
    "run_seurat3.R",
    "run_rbindsc.R",
    "run_rfigr_2.R",
    "run_rliger.R",
]
py_langs = [False, False, False, False]  # none are run in a python environment
file_paths = [
    "seurat3/seurat3_result.csv",
    "rbindsc/rbindsc_result.csv",
    "rfigr/rfigr_result.csv",
    "rliger/rliger_result.csv",
]

# wait=False: submit all jobs at once.
eval_test_all(
    folder_dir, job_submission_script, eval_script, conda_envs,
    method_scripts, py_langs, file_paths, method_keys,
    ct_ref, nclust, dir_path, cond_key,
    iter_list, repeats,
    repeat_start=1,
    wait=False,
    wait_time=2 * 60,
    batch=2,
    output_folder="results",
    ncore=8,
    gp_script=gp_eval_path,
    gp_truth=gp_truth,
)



## BMMC 2000 cells 

### Data simulation

In [None]:
# Scenario 2 / BMMC (2000 multiome cells): build the paired RNA + ATAC inputs
# and the settings for the simulation in which the multiome depth varies.
# The ATAC object is the pmat dataset generated from the bmmc_process R script.
in_dir = "dataset/bmmc/"

adata_atac = ad.read_h5ad(in_dir+"multiome_bmmc_site1_or_donor1_ATAC_pmat.h5ad")
adata_rna = ad.read_h5ad(in_dir+"multiome_bmmc_site1_or_donor1_RNA.h5ad")

# Make the 'features' column available under the name 'feature' as well.
adata_atac.var['feature'] = adata_atac.var['features']
adata_rna.var['feature'] = adata_rna.var['features']

# Boolean mask of the cells whose ATAC 'batch' is s1d2. The same mask is applied
# to the RNA object, which presumably relies on both objects listing the cells
# in the same order.
idx_sel = list(adata_atac.obs['batch'].isin(["s1d2"]))

# Paired - RNA
adata_rna_sel = AnnData(csr_matrix(deepcopy(adata_rna.X[idx_sel,:].todense())),
                       obs=adata_rna.obs.iloc[idx_sel,:],
                       var=adata_rna.var,dtype=np.float32)
# Paired - ATAC
adata_atac_sel = AnnData(csr_matrix(deepcopy(adata_atac.X[idx_sel,:].todense())),
                       obs=adata_atac.obs.iloc[idx_sel,:],
                       var=adata_atac.var,dtype=np.float32)

# ensure that var_names are feature names and obs_names are cell barcodes
adata_rna_sel.var_names = adata_rna.var['feature'].tolist()
adata_atac_sel.var_names = adata_atac.var['feature'].tolist()

# From here on in_dir is the folder passed to data_simulation.
in_dir = "dataset/bmmc/nmulti2000_21ct_vdepth_test/"
# Full set of depths and repeats. The leftover debugging overrides
# (iter_list = [0.25], repeats = 2) generated only part of what the evaluation
# cell for this folder asks for (25/50/75/100 with 5 repeats).
iter_list = [0.25,0.5,0.75,1]
depth_multiome_list = iter_list
depth_scrna_list = [1]*len(iter_list)
depth_snatac_list = [1]*len(iter_list)
n_scrna_list  = [1000]*len(iter_list)
n_snatac_list = [1000]*len(iter_list)
n_multiome_list = [2000]*len(iter_list)
repeats = 5


# Absolute path to the fragment file; set fragment_path = None if no
# fragment-file symlink should be created. The file was created using
# correct_s1d2_fragments_sel.ipynb, where the barcodes were made the same as
# those stored in adata.
fragment_path = "{}/dataset/bmmc/s1d2_atac_fragments.tsv.gz".format(os.getcwd())
cond_key = "depthmulti"

def str_norm(s):
    """Return the fraction ``s`` as a whole-number percentage string.

    For example ``0.25`` gives ``"25"`` and ``1`` gives ``"100"``. The product
    is rounded before the integer conversion because ``int()`` truncates, and
    binary floating point makes e.g. ``0.29 * 100`` evaluate to
    ``28.999999999999996``, which would otherwise come out as ``"28"``.
    """
    return str(int(round(s * 100)))

# Run the simulation on the s1d2 subsets with the settings defined above;
# this scenario is run with downsample=True and str_norm as the formatter.
data_simulation(
    in_dir, adata_rna_sel, adata_atac_sel, iter_list,
    depth_multiome_list, depth_scrna_list, depth_snatac_list,
    n_multiome_list, n_scrna_list, n_snatac_list,
    repeats, fragment_path, cond_key,
    str_norm,
    downsample=True,
)




### Running method + evaluations

In [None]:
# ---- Settings shared by every eval_test_all call in this cell (Scenario 2, BMMC 2000 cells) ----

# Simulated data to evaluate: root folder, condition name, condition values, repeats.
# The values are the simulated depth fractions expressed as percentages.
dir_path = "dataset/bmmc/nmulti2000_21ct_vdepth_test/"
cond_key = "depthmulti"
iter_list = [25, 50, 75, 100]
repeats = 5

# Working directory, bash script for job submission, python script for metric evaluation.
folder_dir = "/home/myylee/scmint/methods_eval/"
job_submission_script = "submit_job_per_condition_n_eval2.sh"
eval_script = "run_metric_eval_fair.py"

# Forwarded to eval_test_all as ct_ref / nclust.
ct_ref = "dataset/bmmc/bmmc_all_bc_ct3.csv"
nclust = 21

# Forwarded to eval_test_all as gp_script / gp_truth.
gp_eval_path = "eval_missing_modality_prediction_single.R"
gp_truth = "dataset/bmmc/bmmc_s1d2_pmat_all_ct_sig_links_50kb_unique.csv"

# ===== multiome-guided methods (Python scripts) =====
# One entry per method, listed in the same method order in every list.
method_keys = ["multivi", "cobolt"]
conda_envs = ["multivi", "cobolt"]
method_scripts = ["run_multivi_2.py", "run_cobolt.py"]  # script that runs each method
py_langs = [True, True]  # each script is run in a python environment
file_paths = [
    "multivi/multivi_result.csv",
    "cobolt/cobolt_result.csv",
]

# wait=True, batch=2: submit 2 jobs at a time and wait until they are done
# before submitting new ones.
eval_test_all(
    folder_dir, job_submission_script, eval_script, conda_envs,
    method_scripts, py_langs, file_paths, method_keys,
    ct_ref, nclust, dir_path, cond_key,
    iter_list, repeats,
    repeat_start=1,
    wait=True,
    wait_time=2 * 60,
    batch=2,
    output_folder="results",
    ncore=8,
    gp_script=gp_eval_path,
    gp_truth=gp_truth,
)


# ===== multiome-guided methods (R scripts) =====
method_keys = ["seurat4"]
conda_envs = ["seurat"]
method_scripts = ["run_seurat4_3.R"]  # R script that runs the method
py_langs = [False]  # not run in a python environment
file_paths = ["seurat4/seurat4_result.csv"]

# wait=False: submit all jobs at once.
eval_test_all(
    folder_dir, job_submission_script, eval_script, conda_envs,
    method_scripts, py_langs, file_paths, method_keys,
    ct_ref, nclust, dir_path, cond_key,
    iter_list, repeats,
    repeat_start=1,
    wait=False,
    wait_time=2 * 60,
    batch=2,
    output_folder="results",
    ncore=8,
    gp_script=gp_eval_path,
    gp_truth=gp_truth,
)


# ===== unpaired methods (multiome-split) =====
# One entry per method, listed in the same method order in every list.
method_keys = ["seurat3", "rbindsc", "rfigr", "rliger"]
conda_envs = ["seurat", "bindsc", "figr", "liger"]
method_scripts = [  # R script that runs each method
    "run_seurat3.R",
    "run_rbindsc.R",
    "run_rfigr_2.R",
    "run_rliger.R",
]
py_langs = [False, False, False, False]  # none are run in a python environment
file_paths = [
    "seurat3/seurat3_result.csv",
    "rbindsc/rbindsc_result.csv",
    "rfigr/rfigr_result.csv",
    "rliger/rliger_result.csv",
]

# wait=False: submit all jobs at once.
eval_test_all(
    folder_dir, job_submission_script, eval_script, conda_envs,
    method_scripts, py_langs, file_paths, method_keys,
    ct_ref, nclust, dir_path, cond_key,
    iter_list, repeats,
    repeat_start=1,
    wait=False,
    wait_time=2 * 60,
    batch=2,
    output_folder="results",
    ncore=8,
    gp_script=gp_eval_path,
    gp_truth=gp_truth,
)



## BMMC 4000 cells 

### Data simulation

In [None]:
# Scenario 2 / BMMC (4000 multiome cells): build the paired RNA + ATAC inputs
# and the settings for the simulation in which the multiome depth varies.
# The ATAC object is the pmat dataset generated from the bmmc_process R script.
in_dir = "dataset/bmmc/"
adata_rna = ad.read_h5ad(in_dir + "multiome_bmmc_site1_or_donor1_RNA.h5ad")
adata_atac = ad.read_h5ad(in_dir + "multiome_bmmc_site1_or_donor1_ATAC_pmat.h5ad")

# Make the 'features' column available under the name 'feature' as well.
adata_rna.var['feature'] = adata_rna.var['features']
adata_atac.var['feature'] = adata_atac.var['features']

# Boolean mask of the cells whose ATAC 'batch' is s1d2. The same mask is applied
# to the RNA object, which presumably relies on both objects listing the cells
# in the same order.
idx_sel = adata_atac.obs['batch'].isin(["s1d2"]).tolist()

# Paired - RNA
adata_rna_sel = AnnData(
    csr_matrix(deepcopy(adata_rna.X[idx_sel, :].todense())),
    obs=adata_rna.obs.iloc[idx_sel, :],
    var=adata_rna.var,
    dtype=np.float32,
)
# Paired - ATAC
adata_atac_sel = AnnData(
    csr_matrix(deepcopy(adata_atac.X[idx_sel, :].todense())),
    obs=adata_atac.obs.iloc[idx_sel, :],
    var=adata_atac.var,
    dtype=np.float32,
)

# ensure that var_names are feature names and obs_names are cell barcodes
adata_rna_sel.var_names = adata_rna.var['feature'].tolist()
adata_atac_sel.var_names = adata_atac.var['feature'].tolist()

# From here on in_dir is the folder passed to data_simulation.
in_dir = "dataset/bmmc/nmulti4000_21ct_vdepth_test/"

# Condition name and the values it takes; each setting is repeated `repeats` times.
cond_key = "depthmulti"
iter_list = [0.25, 0.5, 0.75, 1]
repeats = 5

# Depth settings per condition: only the multiome depth changes (it is iter_list itself).
depth_multiome_list = iter_list
depth_scrna_list = [1] * len(iter_list)
depth_snatac_list = [1] * len(iter_list)

# Cell numbers per condition: fixed for every depth.
n_multiome_list = [4000] * len(iter_list)
n_scrna_list = [1000] * len(iter_list)
n_snatac_list = [1000] * len(iter_list)

# Absolute path to the fragment file; set fragment_path = None if no
# fragment-file symlink should be created. The file was created using
# correct_s1d2_fragments_sel.ipynb, where the barcodes were made the same as
# those stored in adata.
fragment_path = "{}/dataset/bmmc/s1d2_atac_fragments.tsv.gz".format(os.getcwd())

def str_norm(s):
    """Format a fractional depth as an integer-percent string (0.25 -> "25").

    The product is rounded before conversion because plain ``int()``
    truncates floating-point error (0.29 * 100 == 28.999999999999996 would
    otherwise become "28").
    """
    return str(int(round(s * 100)))

# Run the project helper with the settings configured above; str_norm is the
# formatter passed for the condition values (presumably used to label outputs).
data_simulation(
    in_dir, adata_rna_sel, adata_atac_sel, iter_list,
    depth_multiome_list, depth_scrna_list, depth_snatac_list,
    n_multiome_list, n_scrna_list, n_snatac_list,
    repeats, fragment_path, cond_key,
    str_norm,
    downsample=True,
)




### Running method + evaluations

In [None]:
# Directory the evaluation pipeline works in
folder_dir = "/home/myylee/scmint/methods_eval/"
# Bash script that submits the jobs
job_submission_script = "submit_job_per_condition_n_eval2.sh"
# Python script that computes the evaluation metrics
eval_script = "run_metric_eval_fair.py"

# Cell-type reference table and number of clusters
ct_ref = "dataset/bmmc/bmmc_all_bc_ct3.csv"
nclust = 21

# R script and truth table handed to eval_test_all as gp_script / gp_truth
gp_eval_path = "eval_missing_modality_prediction_single.R"
gp_truth = "dataset/bmmc/bmmc_s1d2_pmat_all_ct_sig_links_50kb_unique.csv"

# Simulated data location and the conditions to evaluate
dir_path = "dataset/bmmc/nmulti4000_21ct_vdepth_test/"
cond_key = "depthmulti"

iter_list = list(range(25, 101, 25))
repeats = 5

# ===== multiome-guided (python functions) =====

method_keys = ["multivi", "cobolt"]
# One conda environment per method
conda_envs = ["multivi", "cobolt"]
# Python script that runs each method
method_scripts = ["run_multivi_2.py", "run_cobolt.py"]
# Flags: the scripts run in a python environment
py_langs = [True, True]
# Result file of each method
file_paths = [f"{key}/{key}_result.csv" for key in method_keys]


# Submit 2 jobs at a time and wait for them before submitting the next ones
eval_test_all(
    folder_dir, job_submission_script, eval_script, conda_envs,
    method_scripts, py_langs, file_paths, method_keys,
    ct_ref, nclust, dir_path, cond_key,
    iter_list, repeats,
    repeat_start=1,
    wait=True, wait_time=120, batch=2,
    output_folder="results", ncore=8,
    gp_script=gp_eval_path, gp_truth=gp_truth,
)


# ===== multiome-guided (R functions) =====

method_keys = ["seurat4"]
conda_envs = ["seurat"]
# R script that runs the method
method_scripts = ["run_seurat4_3.R"]
# Flag: the script does not run in a python environment
py_langs = [False]
# Result file of the method
file_paths = [f"{key}/{key}_result.csv" for key in method_keys]

# Submit all jobs at once (no waiting between batches)
eval_test_all(
    folder_dir, job_submission_script, eval_script, conda_envs,
    method_scripts, py_langs, file_paths, method_keys,
    ct_ref, nclust, dir_path, cond_key,
    iter_list, repeats,
    repeat_start=1,
    wait=False, wait_time=120, batch=2,
    output_folder="results", ncore=8,
    gp_script=gp_eval_path, gp_truth=gp_truth,
)


# ===== unpaired (multiome-split) =====
method_keys = ["seurat3", "rbindsc", "rfigr", "rliger"]
conda_envs = ["seurat", "bindsc", "figr", "liger"]
# R script that runs each method
method_scripts = ["run_seurat3.R", "run_rbindsc.R", "run_rfigr_2.R", "run_rliger.R"]
# Flags: none of the scripts run in a python environment
py_langs = [False, False, False, False]
# Result file of each method
file_paths = [f"{key}/{key}_result.csv" for key in method_keys]

# Submit all jobs at once (no waiting between batches)
eval_test_all(
    folder_dir, job_submission_script, eval_script, conda_envs,
    method_scripts, py_langs, file_paths, method_keys,
    ct_ref, nclust, dir_path, cond_key,
    iter_list, repeats,
    repeat_start=1,
    wait=False, wait_time=120, batch=2,
    output_folder="results", ncore=8,
    gp_script=gp_eval_path, gp_truth=gp_truth,
)



## Increasing number of cells; 100% depth

### Data simulation

In [None]:
# Load the BMMC multiome objects (ATAC peak matrix and RNA).
source_dir = "dataset/bmmc/"

adata_atac = ad.read_h5ad(f"{source_dir}multiome_bmmc_site1_or_donor1_ATAC_pmat.h5ad")
adata_rna = ad.read_h5ad(f"{source_dir}multiome_bmmc_site1_or_donor1_RNA.h5ad")

# Expose the 'features' column under the name 'feature' as well.
adata_atac.var['feature'] = adata_atac.var['features']
adata_rna.var['feature'] = adata_rna.var['features']


# Boolean mask of the s1d2 cells, taken from the ATAC obs table and applied to
# both modalities.
idx_sel = list(adata_atac.obs['batch'].isin(["s1d2"]))

# Paired subsets, built in the order RNA then ATAC.
adata_rna_sel, adata_atac_sel = (
    AnnData(
        csr_matrix(deepcopy(full.X[idx_sel, :].todense())),
        obs=full.obs.iloc[idx_sel, :],
        var=full.var,
        dtype=np.float32,
    )
    for full in (adata_rna, adata_atac)
)

# Use the feature names as var_names of the subsets.
adata_rna_sel.var_names = adata_rna.var['feature'].tolist()
adata_atac_sel.var_names = adata_atac.var['feature'].tolist()


in_dir = "dataset/bmmc/bmmc_vcells_intervals/"
# Multiome cell counts 1000, 1400, ..., 5000: each interval adds 400 cells.
# Author's rationale: 400 cells at 10k ATAC fragments and 2500 nCounts_RNA is
# roughly a 10% increment in ATAC and RNA sequencing depth, assuming 4000 cells.
iter_list = list(range(1000, 1000 + 400 * 10 + 1, 400))
depth_multiome_list = [1 for _ in iter_list]
depth_scrna_list = [1 for _ in iter_list]
depth_snatac_list = [1 for _ in iter_list]
n_scrna_list = [1000 for _ in iter_list]
n_snatac_list = [1000 for _ in iter_list]
n_multiome_list = iter_list
repeats = 5

# Set fragment_path = None if no fragment-file symlink should be created.
# The fragment file was created with correct_s1d2_fragments_sel.ipynb so that
# its barcodes match those stored in adata.
fragment_path = f"{os.getcwd()}/dataset/bmmc/s1d2_atac_fragments.tsv.gz"
cond_key = "nmulti"
    
def to_str(s):
    """Return *s* converted with ``str`` (e.g. 1000 -> "1000")."""
    return str(s)


# Run the project helper with the settings configured above; to_str is the
# formatter passed for the condition values (presumably used to label outputs).
data_simulation(
    in_dir, adata_rna_sel, adata_atac_sel, iter_list,
    depth_multiome_list, depth_scrna_list, depth_snatac_list,
    n_multiome_list, n_scrna_list, n_snatac_list,
    repeats, fragment_path, cond_key,
    to_str,
    downsample=False,
)



### Running method + evaluations

In [None]:
# working_dir
folder_dir="/home/myylee/scmint/methods_eval/"
# bash script for job submission
job_submission_script = "submit_job_per_condition_n_eval2.sh"
# python script for metric evaluation
eval_script = "run_metric_eval_fair.py"

# cell-type reference table and number of clusters
ct_ref = "dataset/bmmc/bmmc_all_bc_ct3.csv"
nclust = 21

# R script and truth table handed to eval_test_all as gp_script / gp_truth
gp_eval_path = "eval_missing_modality_prediction_single.R"
gp_truth = "dataset/bmmc/bmmc_s1d2_pmat_all_ct_sig_links_50kb_unique.csv"

dir_path = "dataset/bmmc/bmmc_vcells_intervals/"
# The datasets in dir_path are simulated with cond_key "nmulti" over multiome
# cell counts 1000, 1400, ..., 5000, so the evaluation has to use the same key
# and the same condition values (the depth settings "depthmulti" /
# [25,50,75,100] do not exist for this directory).
cond_key = "nmulti"

iter_list = [int(x) for x in np.linspace(start=1000, stop=1000+400*10, num=11)]
repeats = 5

# ===== multiome-guided (python functions) =====

conda_envs = ["multivi","cobolt"]
# python script for running the method
method_scripts = ["run_multivi_2.py","run_cobolt.py"]
# if the script should be run in python environment
py_langs = [True]*2

# result file of each method
file_paths = ["multivi/multivi_result.csv",
              "cobolt/cobolt_result.csv"]

method_keys = ["multivi","cobolt"]


# submit 2 jobs at a time, wait, submit new jobs until the 2 were done
eval_test_all(folder_dir,job_submission_script,eval_script,conda_envs,
              method_scripts,py_langs,file_paths,method_keys,
              ct_ref,nclust,dir_path,cond_key,
              iter_list,repeats,repeat_start=1,
              wait=True, wait_time=2*60,batch=2,output_folder="results",
              ncore=8,gp_script=gp_eval_path,gp_truth=gp_truth)


# ===== multiome-guided (R functions) =====

method_keys = ["seurat4"]
conda_envs = ["seurat"]
# R script that runs the method
method_scripts = ["run_seurat4_3.R"]
# Flag: the script does not run in a python environment
py_langs = [False]
# Result file of the method
file_paths = [f"{key}/{key}_result.csv" for key in method_keys]

# Submit all jobs at once (no waiting between batches)
eval_test_all(
    folder_dir, job_submission_script, eval_script, conda_envs,
    method_scripts, py_langs, file_paths, method_keys,
    ct_ref, nclust, dir_path, cond_key,
    iter_list, repeats,
    repeat_start=1,
    wait=False, wait_time=120, batch=2,
    output_folder="results", ncore=8,
    gp_script=gp_eval_path, gp_truth=gp_truth,
)


# ===== unpaired (multiome-split) =====
method_keys = ["seurat3", "rbindsc", "rfigr", "rliger"]
conda_envs = ["seurat", "bindsc", "figr", "liger"]
# R script that runs each method
method_scripts = ["run_seurat3.R", "run_rbindsc.R", "run_rfigr_2.R", "run_rliger.R"]
# Flags: none of the scripts run in a python environment
py_langs = [False, False, False, False]
# Result file of each method
file_paths = [f"{key}/{key}_result.csv" for key in method_keys]

# Submit all jobs at once (no waiting between batches)
eval_test_all(
    folder_dir, job_submission_script, eval_script, conda_envs,
    method_scripts, py_langs, file_paths, method_keys,
    ct_ref, nclust, dir_path, cond_key,
    iter_list, repeats,
    repeat_start=1,
    wait=False, wait_time=120, batch=2,
    output_folder="results", ncore=8,
    gp_script=gp_eval_path, gp_truth=gp_truth,
)



## Increasing depth; 4000 cells

### Data simulation

In [None]:
# Load the BMMC multiome objects (ATAC peak matrix and RNA).
source_dir = "dataset/bmmc/"

adata_atac = ad.read_h5ad(f"{source_dir}multiome_bmmc_site1_or_donor1_ATAC_pmat.h5ad")
adata_rna = ad.read_h5ad(f"{source_dir}multiome_bmmc_site1_or_donor1_RNA.h5ad")

# Expose the 'features' column under the name 'feature' as well.
adata_atac.var['feature'] = adata_atac.var['features']
adata_rna.var['feature'] = adata_rna.var['features']

# Boolean mask of the s1d2 cells, taken from the ATAC obs table and applied to
# both modalities.
idx_sel = list(adata_atac.obs['batch'].isin(["s1d2"]))

# Paired subsets, built in the order RNA then ATAC.
adata_rna_sel, adata_atac_sel = (
    AnnData(
        csr_matrix(deepcopy(full.X[idx_sel, :].todense())),
        obs=full.obs.iloc[idx_sel, :],
        var=full.var,
        dtype=np.float32,
    )
    for full in (adata_rna, adata_atac)
)

# Use the feature names as var_names of the subsets.
adata_rna_sel.var_names = adata_rna.var['feature'].tolist()
adata_atac_sel.var_names = adata_atac.var['feature'].tolist()


# Original note: everything is saved in the /project/mingyaolpc/ folder to
# avoid running out of storage.
# NOTE(review): in_dir below is a relative path -- confirm it resolves there.
in_dir = "dataset/bmmc/bmmc_vdepth_intervals/"
# Depth fractions 0.1, 0.2, ..., 1.0 (kept as a numpy array)
iter_list = np.linspace(10, 100, 10) / 100
depth_multiome_list = iter_list
depth_scrna_list = [1 for _ in iter_list]
depth_snatac_list = [1 for _ in iter_list]
n_scrna_list = [1000 for _ in iter_list]
n_snatac_list = [1000 for _ in iter_list]
n_multiome_list = [4000 for _ in iter_list]
repeats = 5


# Set fragment_path = None if no fragment-file symlink should be created.
# The fragment file was created with correct_s1d2_fragments_sel.ipynb so that
# its barcodes match those stored in adata.
fragment_path = f"{os.getcwd()}/dataset/bmmc/s1d2_atac_fragments.tsv.gz"
cond_key = "depthmulti"

def str_norm(s):
    """Format a fractional depth as an integer-percent string (0.25 -> "25").

    The product is rounded before conversion because plain ``int()``
    truncates floating-point error (0.29 * 100 == 28.999999999999996 would
    otherwise become "28").
    """
    return str(int(round(s * 100)))

# Run the project helper with the settings configured above; str_norm is the
# formatter passed for the condition values (presumably used to label outputs).
data_simulation(
    in_dir, adata_rna_sel, adata_atac_sel, iter_list,
    depth_multiome_list, depth_scrna_list, depth_snatac_list,
    n_multiome_list, n_scrna_list, n_snatac_list,
    repeats, fragment_path, cond_key,
    str_norm,
    downsample=True,
)



### Running method + evaluations

In [None]:
# working_dir
folder_dir="/home/myylee/scmint/methods_eval/"
# bash script for job submission
job_submission_script = "submit_job_per_condition_n_eval2.sh"
# python script for metric evaluation
eval_script = "run_metric_eval_fair.py"

# cell-type reference table and number of clusters
ct_ref = "dataset/bmmc/bmmc_all_bc_ct3.csv"
nclust = 21

# R script and truth table handed to eval_test_all as gp_script / gp_truth
gp_eval_path = "eval_missing_modality_prediction_single.R"
gp_truth = "dataset/bmmc/bmmc_s1d2_pmat_all_ct_sig_links_50kb_unique.csv"

dir_path = "dataset/bmmc/bmmc_vdepth_intervals/"
cond_key = "depthmulti"

# The datasets in dir_path are simulated at depths 10%, 20%, ..., 100%
# (np.linspace(10, 100, 10)/100 formatted as integer percent), so evaluate
# exactly those conditions; 25 and 75 were never simulated for this directory.
iter_list = list(range(10, 101, 10))
repeats = 5

# ===== multiome-guided (python functions) =====

conda_envs = ["multivi","cobolt"]
# python script for running the method
method_scripts = ["run_multivi_2.py","run_cobolt.py"]
# if the script should be run in python environment
py_langs = [True]*2

# result file of each method
file_paths = ["multivi/multivi_result.csv",
              "cobolt/cobolt_result.csv"]

method_keys = ["multivi","cobolt"]


# submit 2 jobs at a time, wait, submit new jobs until the 2 were done
eval_test_all(folder_dir,job_submission_script,eval_script,conda_envs,
              method_scripts,py_langs,file_paths,method_keys,
              ct_ref,nclust,dir_path,cond_key,
              iter_list,repeats,repeat_start=1,
              wait=True, wait_time=2*60,batch=2,output_folder="results",
              ncore=8,gp_script=gp_eval_path,gp_truth=gp_truth)


# ===== multiome-guided (R functions) =====

method_keys = ["seurat4"]
conda_envs = ["seurat"]
# R script that runs the method
method_scripts = ["run_seurat4_3.R"]
# Flag: the script does not run in a python environment
py_langs = [False]
# Result file of the method
file_paths = [f"{key}/{key}_result.csv" for key in method_keys]

# Submit all jobs at once (no waiting between batches)
eval_test_all(
    folder_dir, job_submission_script, eval_script, conda_envs,
    method_scripts, py_langs, file_paths, method_keys,
    ct_ref, nclust, dir_path, cond_key,
    iter_list, repeats,
    repeat_start=1,
    wait=False, wait_time=120, batch=2,
    output_folder="results", ncore=8,
    gp_script=gp_eval_path, gp_truth=gp_truth,
)


# ===== unpaired (multiome-split) =====
method_keys = ["seurat3", "rbindsc", "rfigr", "rliger"]
conda_envs = ["seurat", "bindsc", "figr", "liger"]
# R script that runs each method
method_scripts = ["run_seurat3.R", "run_rbindsc.R", "run_rfigr_2.R", "run_rliger.R"]
# Flags: none of the scripts run in a python environment
py_langs = [False, False, False, False]
# Result file of each method
file_paths = [f"{key}/{key}_result.csv" for key in method_keys]

# Submit all jobs at once (no waiting between batches)
eval_test_all(
    folder_dir, job_submission_script, eval_script, conda_envs,
    method_scripts, py_langs, file_paths, method_keys,
    ct_ref, nclust, dir_path, cond_key,
    iter_list, repeats,
    repeat_start=1,
    wait=False, wait_time=120, batch=2,
    output_folder="results", ncore=8,
    gp_script=gp_eval_path, gp_truth=gp_truth,
)



# Scenario 3

## Technical batch effect 

### Data simulation

In [None]:
# Load the BMMC multiome objects; per the original note the ATAC object is the
# peak matrix ("pmat") produced by the bmmc_process R script.
source_dir = "dataset/bmmc/"

adata_atac = ad.read_h5ad(f"{source_dir}multiome_bmmc_site1_or_donor1_ATAC_pmat.h5ad")
adata_rna = ad.read_h5ad(f"{source_dir}multiome_bmmc_site1_or_donor1_RNA.h5ad")

# Expose the 'features' column under the name 'feature' as well.
adata_atac.var['feature'] = adata_atac.var['features']
adata_rna.var['feature'] = adata_rna.var['features']

# Boolean masks from the ATAC obs table: s1d1 is the paired source, s2d1 the
# source for the single-modality (snRNA / snATAC) data.
idx_s1d1 = list(adata_atac.obs['batch'] == 's1d1')
idx_s2d1 = list(adata_atac.obs['batch'] == 's2d1')

# Paired RNA and ATAC (batch s1d1), built in that order.
adata_rna_s1d1, adata_atac_s1d1 = (
    AnnData(
        csr_matrix(deepcopy(full.X[idx_s1d1, :].todense())),
        obs=full.obs.iloc[idx_s1d1, :],
        var=full.var,
        dtype=np.float32,
    )
    for full in (adata_rna, adata_atac)
)

# snRNA and snATAC (batch s2d1), built in that order.
adata_rna_s2d1, adata_atac_s2d1 = (
    AnnData(
        csr_matrix(deepcopy(full.X[idx_s2d1, :].todense())),
        obs=full.obs.iloc[idx_s2d1, :],
        var=full.var,
        dtype=np.float32,
    )
    for full in (adata_rna, adata_atac)
)

# Simulation settings: one condition per number of multiome cells.
in_dir = "dataset/bmmc/bmmc_technical_batch_test/"
iter_list = [1000, 3000, 5000]
depth_multiome_list = [1 for _ in iter_list]
depth_scrna_list = [1 for _ in iter_list]
depth_snatac_list = [1 for _ in iter_list]
n_scrna_list = [1000 for _ in iter_list]
n_snatac_list = [1000 for _ in iter_list]
n_multiome_list = iter_list
repeats = 5

# Set fragment_path = None if no fragment-file symlink should be created.
# Original note: the fragment file was created with
# correct_s1d2_fragments_sel.ipynb so that barcodes match those stored in adata.
# NOTE(review): the path below is the site1_or_donor1 file -- confirm the note.
fragment_path = f"{os.getcwd()}/dataset/bmmc/bmmc_site1_or_donor1_atac_fragments.tsv.gz"
cond_key = "nmulti"
    
def to_str(s):
    """Return *s* converted with ``str`` (e.g. 1000 -> "1000")."""
    return str(s)

# Assumptions stated by the author: every adata.obs has ['rna.bc','batch'] and
# every adata.var has ['feature'].
# Paired inputs come from s1d1, single-modality inputs from s2d1.
data_simulation_batch(
    in_dir,
    adata_rna_s1d1, adata_atac_s1d1,
    adata_rna_s2d1, adata_atac_s2d1,
    iter_list,
    depth_multiome_list, depth_scrna_list, depth_snatac_list,
    n_multiome_list, n_scrna_list, n_snatac_list,
    repeats, fragment_path, cond_key,
    to_str,
    downsample=False,
    same_unpair_origin=True,
)



### Running method + evaluations

In [None]:
# Directory the evaluation pipeline works in
folder_dir = "/home/myylee/scmint/methods_eval/"
# Bash script that submits the jobs
job_submission_script = "submit_job_per_condition_n_eval2.sh"
# Python script that computes the evaluation metrics
eval_script = "run_metric_eval_batch.py"

# Cell-type reference table and number of clusters
ct_ref = "dataset/bmmc/bmmc_all_bc_ct3.csv"
nclust = 21

# The string "false" is passed as gp_script / gp_truth for this test
gp_eval_path = "false"
gp_truth = "false"

# Simulated data location and the conditions to evaluate
dir_path = "dataset/bmmc/bmmc_technical_batch_test/"
cond_key = "nmulti"

iter_list = [1000, 3000, 5000]
repeats = 5

# ===== multiome-guided (python functions) =====

method_keys = ["multivi", "cobolt"]
# One conda environment per method
conda_envs = ["multivi", "cobolt"]
# Python script that runs each method
method_scripts = ["run_multivi_batch.py", "run_cobolt.py"]
# Flags: the scripts run in a python environment
py_langs = [True, True]
# Result file of each method
file_paths = [f"{key}/{key}_result.csv" for key in method_keys]


# Submit 2 jobs at a time and wait for them before submitting the next ones
eval_test_all(
    folder_dir, job_submission_script, eval_script, conda_envs,
    method_scripts, py_langs, file_paths, method_keys,
    ct_ref, nclust, dir_path, cond_key,
    iter_list, repeats,
    repeat_start=1,
    wait=True, wait_time=120, batch=2,
    output_folder="results", ncore=8,
    gp_script=gp_eval_path, gp_truth=gp_truth,
)


# ===== multiome-guided (R functions) =====

method_keys = ["seurat4"]
conda_envs = ["seurat"]
# R script that runs the method
method_scripts = ["run_seurat4_3.R"]
# Flag: the script does not run in a python environment
py_langs = [False]
# Result file of the method
file_paths = [f"{key}/{key}_result.csv" for key in method_keys]

# Submit all jobs at once (no waiting between batches)
eval_test_all(
    folder_dir, job_submission_script, eval_script, conda_envs,
    method_scripts, py_langs, file_paths, method_keys,
    ct_ref, nclust, dir_path, cond_key,
    iter_list, repeats,
    repeat_start=1,
    wait=False, wait_time=120, batch=2,
    output_folder="results", ncore=8,
    gp_script=gp_eval_path, gp_truth=gp_truth,
)


# ===== unpaired (multiome-split) =====
method_keys = ["seurat3", "rbindsc", "rfigr", "rliger"]
conda_envs = ["seurat", "bindsc", "figr", "liger"]
# R script that runs each method
method_scripts = ["run_seurat3.R", "run_rbindsc.R", "run_rfigr_2.R", "run_rliger.R"]
# Flags: none of the scripts run in a python environment
py_langs = [False, False, False, False]
# Result file of each method
file_paths = [f"{key}/{key}_result.csv" for key in method_keys]

# Submit all jobs at once (no waiting between batches)
eval_test_all(
    folder_dir, job_submission_script, eval_script, conda_envs,
    method_scripts, py_langs, file_paths, method_keys,
    ct_ref, nclust, dir_path, cond_key,
    iter_list, repeats,
    repeat_start=1,
    wait=False, wait_time=120, batch=2,
    output_folder="results", ncore=8,
    gp_script=gp_eval_path, gp_truth=gp_truth,
)



## Biological batch effect 

### Data simulation

In [None]:
# Load the BMMC multiome objects; per the original note the ATAC object is the
# peak matrix ("pmat") produced by the bmmc_process R script.
source_dir = "dataset/bmmc/"

adata_atac = ad.read_h5ad(f"{source_dir}multiome_bmmc_site1_or_donor1_ATAC_pmat.h5ad")
adata_rna = ad.read_h5ad(f"{source_dir}multiome_bmmc_site1_or_donor1_RNA.h5ad")

# Expose the 'features' column under the name 'feature' as well.
adata_atac.var['feature'] = adata_atac.var['features']
adata_rna.var['feature'] = adata_rna.var['features']

# Boolean masks from the ATAC obs table: s1d1 is the paired source, s1d2 the
# source for the single-modality (snRNA / snATAC) data.
idx_s1d1 = list(adata_atac.obs['batch'] == 's1d1')
idx_s1d2 = list(adata_atac.obs['batch'] == 's1d2')

# Paired RNA and ATAC (batch s1d1), built in that order.
adata_rna_s1d1, adata_atac_s1d1 = (
    AnnData(
        csr_matrix(deepcopy(full.X[idx_s1d1, :].todense())),
        obs=full.obs.iloc[idx_s1d1, :],
        var=full.var,
        dtype=np.float32,
    )
    for full in (adata_rna, adata_atac)
)

# snRNA and snATAC (batch s1d2), built in that order.
adata_rna_s1d2, adata_atac_s1d2 = (
    AnnData(
        csr_matrix(deepcopy(full.X[idx_s1d2, :].todense())),
        obs=full.obs.iloc[idx_s1d2, :],
        var=full.var,
        dtype=np.float32,
    )
    for full in (adata_rna, adata_atac)
)

# Simulation settings: one condition per number of multiome cells.
in_dir = "dataset/bmmc/bmmc_biological_batch_test/"
iter_list = [1000, 3000, 5000]
depth_multiome_list = [1 for _ in iter_list]
depth_scrna_list = [1 for _ in iter_list]
depth_snatac_list = [1 for _ in iter_list]
n_scrna_list = [1000 for _ in iter_list]
n_snatac_list = [1000 for _ in iter_list]
n_multiome_list = iter_list
repeats = 5

# Set fragment_path = None if no fragment-file symlink should be created.
# Original note: the fragment file was created with
# correct_s1d2_fragments_sel.ipynb so that barcodes match those stored in adata.
# NOTE(review): the path below is the site1_or_donor1 file -- confirm the note.
fragment_path = f"{os.getcwd()}/dataset/bmmc/bmmc_site1_or_donor1_atac_fragments.tsv.gz"
cond_key = "nmulti"
    
def to_str(s):
    """Return *s* converted with ``str`` (e.g. 1000 -> "1000")."""
    return str(s)

# assume there are ['rna.bc','batch'] in each adata.obs
# assume there is ['feature'] in each adata.var
# Paired data come from s1d1 and the single-modality data from s1d2, the
# objects built in this cell. Passing adata_*_s2d1 here would either raise a
# NameError or silently reuse the objects left over from the technical-batch
# cell, turning this into the technical-batch simulation again.
data_simulation_batch(in_dir,adata_rna_s1d1,adata_atac_s1d1,adata_rna_s1d2,adata_atac_s1d2,
                      iter_list,depth_multiome_list,depth_scrna_list,
                      depth_snatac_list,n_multiome_list,
                      n_scrna_list,n_snatac_list,
                      repeats,fragment_path,cond_key,
                      to_str, downsample=False,same_unpair_origin=True)


### Running method + evaluations

In [None]:
# Directory the evaluation pipeline works in
folder_dir = "/home/myylee/scmint/methods_eval/"
# Bash script that submits the jobs
job_submission_script = "submit_job_per_condition_n_eval2.sh"
# Python script that computes the evaluation metrics
eval_script = "run_metric_eval_batch.py"

# Cell-type reference table and number of clusters
ct_ref = "dataset/bmmc/bmmc_all_bc_ct3.csv"
nclust = 21

# The string "false" is passed as gp_script / gp_truth for this test
gp_eval_path = "false"
gp_truth = "false"

# Simulated data location and the conditions to evaluate
dir_path = "dataset/bmmc/bmmc_biological_batch_test/"
cond_key = "nmulti"

iter_list = [1000, 3000, 5000]
repeats = 5

# ===== multiome-guided (python functions) =====

method_keys = ["multivi", "cobolt"]
# One conda environment per method
conda_envs = ["multivi", "cobolt"]
# Python script that runs each method
method_scripts = ["run_multivi_batch.py", "run_cobolt.py"]
# Flags: the scripts run in a python environment
py_langs = [True, True]
# Result file of each method
file_paths = [f"{key}/{key}_result.csv" for key in method_keys]


# Submit 2 jobs at a time and wait for them before submitting the next ones
eval_test_all(
    folder_dir, job_submission_script, eval_script, conda_envs,
    method_scripts, py_langs, file_paths, method_keys,
    ct_ref, nclust, dir_path, cond_key,
    iter_list, repeats,
    repeat_start=1,
    wait=True, wait_time=120, batch=2,
    output_folder="results", ncore=8,
    gp_script=gp_eval_path, gp_truth=gp_truth,
)


# ===== multiome-guided (R functions) =====

method_keys = ["seurat4"]
conda_envs = ["seurat"]
# R script that runs the method
method_scripts = ["run_seurat4_3.R"]
# Flag: the script does not run in a python environment
py_langs = [False]
# Result file of the method
file_paths = [f"{key}/{key}_result.csv" for key in method_keys]

# Submit all jobs at once (no waiting between batches)
eval_test_all(
    folder_dir, job_submission_script, eval_script, conda_envs,
    method_scripts, py_langs, file_paths, method_keys,
    ct_ref, nclust, dir_path, cond_key,
    iter_list, repeats,
    repeat_start=1,
    wait=False, wait_time=120, batch=2,
    output_folder="results", ncore=8,
    gp_script=gp_eval_path, gp_truth=gp_truth,
)


# ===== unpaired (multiome-split) =====
method_keys = ["seurat3", "rbindsc", "rfigr", "rliger"]
conda_envs = ["seurat", "bindsc", "figr", "liger"]
# R script that runs each method
method_scripts = ["run_seurat3.R", "run_rbindsc.R", "run_rfigr_2.R", "run_rliger.R"]
# Flags: none of the scripts run in a python environment
py_langs = [False, False, False, False]
# Result file of each method
file_paths = [f"{key}/{key}_result.csv" for key in method_keys]

# Submit all jobs at once (no waiting between batches)
eval_test_all(
    folder_dir, job_submission_script, eval_script, conda_envs,
    method_scripts, py_langs, file_paths, method_keys,
    ct_ref, nclust, dir_path, cond_key,
    iter_list, repeats,
    repeat_start=1,
    wait=False, wait_time=120, batch=2,
    output_folder="results", ncore=8,
    gp_script=gp_eval_path, gp_truth=gp_truth,
)



## Complex test #1

### Data simulation

In [None]:
# Load the BMMC multiome objects; per the original note the ATAC object is the
# peak matrix ("pmat") produced by the bmmc_process R script.
source_dir = "dataset/bmmc/"

adata_atac = ad.read_h5ad(f"{source_dir}multiome_bmmc_site1_or_donor1_ATAC_pmat.h5ad")
adata_rna = ad.read_h5ad(f"{source_dir}multiome_bmmc_site1_or_donor1_RNA.h5ad")

# Expose the 'features' column under the name 'feature' as well.
adata_atac.var['feature'] = adata_atac.var['features']
adata_rna.var['feature'] = adata_rna.var['features']

# Boolean masks from the ATAC obs table: s1d1 + s1d3 form the paired source,
# s1d2 is the source for the single-modality (snRNA / snATAC) data.
idx_paired = list(adata_atac.obs['batch'].isin(['s1d1', 's1d3']))
idx_s1d2 = list(adata_atac.obs['batch'] == 's1d2')

# Paired RNA and ATAC, built in that order.
adata_rna_paired, adata_atac_paired = (
    AnnData(
        csr_matrix(deepcopy(full.X[idx_paired, :].todense())),
        obs=full.obs.iloc[idx_paired, :],
        var=full.var,
        dtype=np.float32,
    )
    for full in (adata_rna, adata_atac)
)

# snRNA and snATAC (batch s1d2), built in that order.
adata_rna_s1d2, adata_atac_s1d2 = (
    AnnData(
        csr_matrix(deepcopy(full.X[idx_s1d2, :].todense())),
        obs=full.obs.iloc[idx_s1d2, :],
        var=full.var,
        dtype=np.float32,
    )
    for full in (adata_rna, adata_atac)
)

# Simulation settings: one condition per number of multiome cells.
in_dir = "dataset/bmmc/bmmc_complex1_test/"
iter_list = [1000, 3000, 5000]
depth_multiome_list = [1 for _ in iter_list]
depth_scrna_list = [1 for _ in iter_list]
depth_snatac_list = [1 for _ in iter_list]
n_scrna_list = [1000 for _ in iter_list]
n_snatac_list = [1000 for _ in iter_list]
n_multiome_list = iter_list
repeats = 5

# Set fragment_path = None if no fragment-file symlink should be created.
# Original note: the fragment file was created with
# correct_s1d2_fragments_sel.ipynb so that barcodes match those stored in adata.
# NOTE(review): the path below is the site1_or_donor1 file -- confirm the note.
fragment_path = f"{os.getcwd()}/dataset/bmmc/bmmc_site1_or_donor1_atac_fragments.tsv.gz"
cond_key = "nmulti"
    
def to_str(s):
    """Return *s* converted with ``str`` (e.g. 1000 -> "1000")."""
    return str(s)

# assume there are ['rna.bc','batch'] in each adata.obs
# assume there is ['feature'] in each adata.var
# Use the objects built in this cell: the paired data (s1d1 + s1d3) and the
# single-modality data from s1d2. The adata_*_s1d1 / adata_*_s2d1 names are
# not defined here and would only exist as leftovers from other cells.
data_simulation_batch(in_dir,adata_rna_paired,adata_atac_paired,adata_rna_s1d2,adata_atac_s1d2,
                      iter_list,depth_multiome_list,depth_scrna_list,
                      depth_snatac_list,n_multiome_list,
                      n_scrna_list,n_snatac_list,
                      repeats,fragment_path,cond_key,
                      to_str, downsample=False,same_unpair_origin=True)


### Running method + evaluations

In [None]:
# Directory the evaluation pipeline works in
folder_dir = "/home/myylee/scmint/methods_eval/"
# Bash script that submits the jobs
job_submission_script = "submit_job_per_condition_n_eval2.sh"
# Python script that computes the evaluation metrics
eval_script = "run_metric_eval_batch.py"

# Cell-type reference table and number of clusters
ct_ref = "dataset/bmmc/bmmc_all_bc_ct3.csv"
nclust = 21

# The string "false" is passed as gp_script / gp_truth for this test
gp_eval_path = "false"
gp_truth = "false"

# Simulated data location and the conditions to evaluate
dir_path = "dataset/bmmc/bmmc_complex1_test/"
cond_key = "nmulti"

iter_list = [1000, 3000, 5000]
repeats = 5


# ===== multiome-guided (python functions) =====

method_keys = ["multivi", "cobolt"]
# One conda environment per method
conda_envs = ["multivi", "cobolt"]
# Python script that runs each method
method_scripts = ["run_multivi_batch.py", "run_cobolt.py"]
# Flags: the scripts run in a python environment
py_langs = [True, True]
# Result file of each method
file_paths = [f"{key}/{key}_result.csv" for key in method_keys]


# Submit 2 jobs at a time and wait for them before submitting the next ones
eval_test_all(
    folder_dir, job_submission_script, eval_script, conda_envs,
    method_scripts, py_langs, file_paths, method_keys,
    ct_ref, nclust, dir_path, cond_key,
    iter_list, repeats,
    repeat_start=1,
    wait=True, wait_time=120, batch=2,
    output_folder="results", ncore=8,
    gp_script=gp_eval_path, gp_truth=gp_truth,
)


# ===== multiome-guided (R functions) =====

conda_envs = ["seurat","seurat"]
# R script for running each method
method_scripts = ["run_seurat4_3.R","run_seurat4_4.R"]
# if the script should be run in python environment
py_langs = [False]*2

# result file of each method
file_paths = ["seurat4/seurat4_result.csv","seurat4int/seurat4int_result.csv"]

# One key per method, in the same order as conda_envs / method_scripts /
# file_paths. The list previously held only "seurat4int", leaving it one entry
# shorter than the other per-method lists.
method_keys = ["seurat4","seurat4int"]

# submit all jobs at once
eval_test_all(folder_dir,job_submission_script,eval_script,conda_envs,
              method_scripts,py_langs,file_paths,method_keys,
              ct_ref,nclust,dir_path,cond_key,
              iter_list,repeats,repeat_start=1,
              wait=False, wait_time=2*60,batch=2,output_folder="results",
              ncore=8,gp_script=gp_eval_path,gp_truth=gp_truth)


# ===== unpaired (multiome-split) =====
method_keys = ["seurat3", "rbindsc", "rfigr", "rliger"]
conda_envs = ["seurat", "bindsc", "figr", "liger"]
# R script that runs each method
method_scripts = ["run_seurat3.R", "run_rbindsc.R", "run_rfigr_2.R", "run_rliger.R"]
# Flags: none of the scripts run in a python environment
py_langs = [False, False, False, False]
# Result file of each method
file_paths = [f"{key}/{key}_result.csv" for key in method_keys]

# Submit all jobs at once (no waiting between batches)
eval_test_all(
    folder_dir, job_submission_script, eval_script, conda_envs,
    method_scripts, py_langs, file_paths, method_keys,
    ct_ref, nclust, dir_path, cond_key,
    iter_list, repeats,
    repeat_start=1,
    wait=False, wait_time=120, batch=2,
    output_folder="results", ncore=8,
    gp_script=gp_eval_path, gp_truth=gp_truth,
)



## Complex test #2

### Data simulation

In [None]:
# Load the BMMC multiome objects; per the original note the ATAC object is the
# peak matrix ("pmat") produced by the bmmc_process R script.
source_dir = "dataset/bmmc/"

adata_atac = ad.read_h5ad(f"{source_dir}multiome_bmmc_site1_or_donor1_ATAC_pmat.h5ad")
adata_rna = ad.read_h5ad(f"{source_dir}multiome_bmmc_site1_or_donor1_RNA.h5ad")

# Expose the 'features' column under the name 'feature' as well.
adata_atac.var['feature'] = adata_atac.var['features']
adata_rna.var['feature'] = adata_rna.var['features']

# Boolean masks from the ATAC obs table: s4d1 + s1d3 form the paired source,
# s2d1 is the source for the single-modality (snRNA / snATAC) data.
idx_paired = list(adata_atac.obs['batch'].isin(['s4d1', 's1d3']))
idx_s2d1 = list(adata_atac.obs['batch'] == 's2d1')

# Paired RNA and ATAC, built in that order.
adata_rna_paired, adata_atac_paired = (
    AnnData(
        csr_matrix(deepcopy(full.X[idx_paired, :].todense())),
        obs=full.obs.iloc[idx_paired, :],
        var=full.var,
        dtype=np.float32,
    )
    for full in (adata_rna, adata_atac)
)

# snRNA and snATAC (batch s2d1), built in that order.
adata_rna_s2d1, adata_atac_s2d1 = (
    AnnData(
        csr_matrix(deepcopy(full.X[idx_s2d1, :].todense())),
        obs=full.obs.iloc[idx_s2d1, :],
        var=full.var,
        dtype=np.float32,
    )
    for full in (adata_rna, adata_atac)
)

# Simulation settings: a single condition with 10000 multiome cells.
in_dir = "dataset/bmmc/bmmc_complex2_test/"
iter_list = [10000]
depth_multiome_list = [1 for _ in iter_list]
depth_scrna_list = [1 for _ in iter_list]
depth_snatac_list = [1 for _ in iter_list]
n_scrna_list = [2000 for _ in iter_list]
n_snatac_list = [2000 for _ in iter_list]
n_multiome_list = iter_list
repeats = 5


# Set fragment_path = None if no fragment-file symlink should be created.
# Original note: the fragment file was created with
# correct_s1d2_fragments_sel.ipynb so that barcodes match those stored in adata.
# NOTE(review): the path below is the site1_or_donor1 file -- confirm the note.
fragment_path = f"{os.getcwd()}/dataset/bmmc/bmmc_site1_or_donor1_atac_fragments.tsv.gz"
cond_key = "nmulti"
    
def to_str(s): return(str(s))

# assume there are ['rna.bc','batch'] in each adata.obs
# assume there is ['feature'] in each adata.var
data_simulation_batch(in_dir,adata_rna_s1d1,adata_atac_s1d1,adata_rna_s2d1,adata_atac_s2d1,
                      iter_list,depth_multiome_list,depth_scrna_list,
                      depth_snatac_list,n_multiome_list,
                      n_scrna_list,n_snatac_list,
                      repeats,fragment_path,cond_key,
                      to_str, downsample=False,same_unpair_origin=True)


### Running method + evaluations

In [None]:
# ----- shared settings for every eval_test_all call in this cell -----

# Simulated-data directory and the conditions / repeats to evaluate.
dir_path = "dataset/bmmc/bmmc_complex2_test/"
cond_key = "nmulti"
iter_list = [10000]
repeats = 5

# Working directory, the bash script used for job submission and the python
# script used for metric evaluation.
folder_dir = "/home/myylee/scmint/methods_eval/"
job_submission_script = "submit_job_per_condition_n_eval2.sh"
eval_script = "run_metric_eval_batch.py"

# Reference CSV and cluster count handed to the evaluation.
# NOTE(review): presumably cell-type labels per barcode and the number of
# clusters to request - confirm against eval_test_all.
ct_ref = "dataset/bmmc/bmmc_all_bc_ct3.csv"
nclust = 21

# Both gp_* options are the string "false" for this scenario; they are passed
# to eval_test_all as gp_script / gp_truth.
gp_eval_path = gp_truth = "false"

# ===== multiome-guided (python functions) =====

# One entry per method in each list.
# NOTE(review): presumably eval_test_all reads these lists index-wise - confirm.
method_keys = ["multivi", "cobolt"]
# conda environment for each method
conda_envs = ["multivi", "cobolt"]
# python script for running each method
method_scripts = ["run_multivi_batch.py", "run_cobolt.py"]
# whether each script should be run in a python environment
py_langs = [True, True]
# result file for each method
file_paths = [
    "multivi/multivi_result.csv",
    "cobolt/cobolt_result.csv",
]

# submit 2 jobs at a time, wait, submit new jobs until the 2 were done
eval_test_all(
    folder_dir, job_submission_script, eval_script, conda_envs,
    method_scripts, py_langs, file_paths, method_keys,
    ct_ref, nclust, dir_path, cond_key,
    iter_list, repeats,
    repeat_start=1,
    wait=True,
    wait_time=120,
    batch=2,
    output_folder="results",
    ncore=8,
    gp_script=gp_eval_path,
    gp_truth=gp_truth,
)


# ===== multiome-guided (R functions) =====

# One entry per method in each list below; all lists must have the same length.
conda_envs = ["seurat","seurat"]
# R script for running the method
method_scripts = ["run_seurat4_3.R","run_seurat4_4.R"]
# if the script should be run in python environment (False: these are R scripts)
py_langs = [False]*2

file_paths = ["seurat4/seurat4_result.csv","seurat4int/seurat4int_result.csv"]

# Previously only ["seurat4int"] was listed, leaving this list one entry short
# of the four lists above. "seurat4" matches the first result path,
# seurat4/seurat4_result.csv.
method_keys = ["seurat4","seurat4int"]

# submit all jobs at once
eval_test_all(folder_dir,job_submission_script,eval_script,conda_envs,
              method_scripts,py_langs,file_paths,method_keys,
              ct_ref,nclust,dir_path,cond_key,
              iter_list,repeats,repeat_start=1,
              wait=False, wait_time=2*60,batch=2,output_folder="results",
              ncore=8,gp_script=gp_eval_path,gp_truth=gp_truth)


# ===== unpaired (multiome-split) =====

# One entry per method in each list.
# NOTE(review): presumably eval_test_all reads these lists index-wise - confirm.
method_keys = ["seurat3", "rbindsc", "rfigr", "rliger"]
# conda environment for each method
conda_envs = ["seurat", "bindsc", "figr", "liger"]
# R script for running each method
method_scripts = [
    "run_seurat3.R",
    "run_rbindsc.R",
    "run_rfigr_2.R",
    "run_rliger.R",
]
# whether each script should be run in a python environment (all False here)
py_langs = [False, False, False, False]
# result file for each method
file_paths = [
    "seurat3/seurat3_result.csv",
    "rbindsc/rbindsc_result.csv",
    "rfigr/rfigr_result.csv",
    "rliger/rliger_result.csv",
]

# submit all jobs at once
eval_test_all(
    folder_dir, job_submission_script, eval_script, conda_envs,
    method_scripts, py_langs, file_paths, method_keys,
    ct_ref, nclust, dir_path, cond_key,
    iter_list, repeats,
    repeat_start=1,
    wait=False,
    wait_time=120,
    batch=2,
    output_folder="results",
    ncore=8,
    gp_script=gp_eval_path,
    gp_truth=gp_truth,
)

