# Readme

This script contains the code for generating the UMAP plots in the manuscript titled 'Benchmarking algorithms for joint integration of unpaired and paired single-cell RNA-seq and ATAC-seq data'. Most of these figures are in Additional File 1. 

Please run steps described in evaluate_vary_situations_public.ipynb before this script. 

The same procedure is applied to all scenarios to generate the UMAP plots. One scenario is demonstrated here as an example.

There are two separate parts. The first part is run with a Python kernel; it saves the UMAP coordinates and cell labels into csv files. The second part is run with an R kernel; it reads in the csv files and generates the UMAP plots. 

# Part 1 - Python

## Functions

In [None]:
# use scib2 python kernel 
import sys 
import os
import pandas as pd
import numpy as np
import pickle
import utils_eval
import matplotlib.pyplot as plt
from anndata import AnnData
import scanpy as sc
import pickle
import anndata as ad 
os.environ['R_HOME'] = '/home/myylee/anaconda3/envs/scib2/lib/R/'
import scib
from copy import deepcopy 
import tempfile
import shutil

import random

# Function
# Generate UMAP plot: one with single modality cells, the other one with all cells. Will be different depending on what cells should be evaluated 
# inputs: 
# @folder_dir: folder_dir
# @dir_path: dir_path
# @output_folder: output_folder 
# @cond_key: cond_key
# @var_list: var_list
# @rep_list: rep_list 
# @method_list: method_list
# @res_list: res_list
# @ct_ref: ct_ref
# @nclust: nclust=None
# @plot_out_dir: plot_out_dir 

# single modality cells only 
def umap_plot(dir_path,ct_ref,nclust_target,plot_out_dir,folder_dir,output_folder,cond_key,var_list,rep_list,res_list,
              with_multi_folder="results_single_same_cell_number",legend_loc=None):
    pd_comb_list = [None]*len(res_list)
    os.makedirs(os.path.join("figures","umap",plot_out_dir),exist_ok=True)
    for i in range(len(var_list)):
        print(i)
        nclust = nclust_target
        if "seurat4" in res_list[i] or "liger" in res_list[i]:
            print("not clustering")
            nclust = None

        random.seed(1234)

        path = os.path.join(folder_dir,dir_path,
                            "{}{}_{}".format(cond_key,var_list[i],rep_list[i]),
                            output_folder[i],
                             res_list[i])
        res_df = pd.read_csv(path,index_col=0)

        col_sel = [col for col in res_df.columns if 'latent' in col]
        sub_df = res_df.loc[: , col_sel]

        cell_type = pd.read_csv(ct_ref,index_col=0)
        cell_type.columns = ['cell_type']
        # Assuming everything before '_' is not useful 
        cell_bc = [sub.split('_')[len(sub.split('_'))-1] for sub in list(res_df.index)]
        res_df['cell_type'] = list(cell_type.loc[cell_bc,'cell_type'])

        adata_plot = AnnData(res_df.loc[: , col_sel].to_numpy(),
                             obs=res_df.loc[:,['dataset','cell_type']],
                             dtype=np.float32)
        sc.pp.neighbors(adata_plot, n_neighbors=15, use_rep="X")
        umap_col_sel = [col for col in res_df.columns if 'umap' in col]
        if len(umap_col_sel) > 0: 
            print("using saved UMAP loading")
            adata_plot.obsm['umap'] = res_df.loc[: , umap_col_sel].to_numpy()
        else:
            print("calculating UMAP loading")
            sc.tl.umap(adata_plot, min_dist=0.2)

        if nclust is not None:
            print("clustering with Louvain")
            resolution = utils_eval.find_resolution_louvain(adata_plot,nclust)
            sc.tl.louvain(adata_plot, resolution = resolution, random_state = 0)
            adata_plot.obs['predicted_ct'] = adata_plot.obs['louvain']
        else:
            print("loading saved clustering result")
            adata_plot.obs['predicted_ct'] = res_df.loc[:,['predicted_ct']]
            adata_plot.obs['predicted_ct'] = adata_plot.obs['predicted_ct'].astype('category')

        adata_plot.obsm['embed'] = adata_plot.X
        adata_plot.obsm['X_emb'] = adata_plot.X

        add_str = ""
        if output_folder[i] == with_multi_folder:
            add_str = "2"
        plot_name = os.path.join(plot_out_dir,"{}{}_{}_{}".format(cond_key,var_list[i],rep_list[i],
                                                                    res_list[i].replace("/", "-").replace(".csv", add_str+".svg")))
        plot_name2 = os.path.join(plot_out_dir,"{}{}_{}_{}".format(cond_key,var_list[i],rep_list[i],
                                                                     res_list[i].replace("/", "-").replace(".csv", "_with_multiome"+add_str+".svg")))
        idx = list(adata_plot.obs['dataset'].isin(["scRNA","snATAC"]))
        adata_plot_sel = deepcopy(adata_plot)[idx,:]
        adata_plot_sel.obs = deepcopy(adata_plot.obs).loc[idx,:]
        print(adata_plot_sel)
        if len(umap_col_sel) > 0: 
            sc.pl.umap(adata_plot_sel, 
                       color=['cell_type','dataset','predicted_ct'],
                       use_raw =False,
                       layer='umap',
                       wspace=0.2,
                       save="/"+plot_name,
                       legend_fontsize="x-small",
                       legend_loc=legend_loc,
                      size = 10)
            pd_umap = pd.DataFrame(adata_plot_sel.obsm['umap'])
        else:
            sc.pl.umap(adata_plot_sel, 
                       color=['cell_type','dataset','predicted_ct'],
                       wspace=0.2,
                       save="/"+plot_name,
                       legend_fontsize="x-small",
                       legend_loc=legend_loc,
                      size = 10)
            pd_umap = pd.DataFrame(adata_plot_sel.obsm['X_umap'])

        # if multiome dataset is used, plot all cells as well in a different UMAP plot
        if adata_plot_sel.shape[0] != adata_plot.shape[0]:
            if len(umap_col_sel) > 0: 
                sc.pl.umap(adata_plot, 
                           color=['cell_type','dataset','predicted_ct'],
                           use_raw =False,
                           layer='umap',
                           wspace=0.2,
                           save="/"+plot_name2,
                       legend_fontsize="x-small",
                       legend_loc=legend_loc,
                      size = 10)
            else:
                sc.pl.umap(adata_plot, 
                           color=['cell_type','dataset','predicted_ct'],
                           wspace=0.2,
                           save="/"+plot_name2,
                       legend_fontsize="x-small",
                       legend_loc=legend_loc,
                      size = 10)
        
        pd_umap.index = adata_plot_sel.obs.index.tolist()
        method_name = res_list[i].split('/', 1) [0]
        if output_folder[i] in with_multi_folder:
            method_name =method_name+"_2"
        pd_umap['key'] = "{}{}_{}_{}".format(cond_key,var_list[i],rep_list[i],method_name)
        pd_comb_list[i] = pd.concat([pd_umap,adata_plot_sel.obs], axis=1)

    pd_comb_all = pd.concat(pd_comb_list, axis=0)
    return(pd_comb_all)


# Function
# not just the single-modality cells, plot the multiome cells as well, and return the embedding.
# half multiomeRNA + half multiomeATAC for the unpaired multiome-splitted methods
def umap_plot_all_cells(dir_path,ct_ref,nclust_target,plot_out_dir,folder_dir,output_folder,cond_key,var_list,rep_list,res_list,
              with_multi_folder="results_single_same_cell_number",legend_loc=None):
    from numpy.random import choice
    subsample_size = lambda vector, size: choice(vector, size = size, replace = False) 

    pd_comb_list = [None]*len(res_list)
    os.makedirs(os.path.join("figures","umap",plot_out_dir),exist_ok=True)
    for i in range(len(var_list)):
        print(i)
        nclust = nclust_target
        if "seurat4" in res_list[i] or "liger" in res_list[i]:
            print("not clustering")
            nclust = None

        random.seed(1234)

        path = os.path.join(folder_dir,dir_path,
                            "{}{}_{}".format(cond_key,var_list[i],rep_list[i]),
                            output_folder[i],
                             res_list[i])
        res_df = pd.read_csv(path,index_col=0)

        col_sel = [col for col in res_df.columns if 'latent' in col]
        sub_df = res_df.loc[: , col_sel]

        cell_type = pd.read_csv(ct_ref,index_col=0)
        cell_type.columns = ['cell_type']
        # Assuming everything before '_' is not useful 
        cell_bc = [sub.split('_')[len(sub.split('_'))-1] for sub in list(res_df.index)]
        res_df['cell_type'] = list(cell_type.loc[cell_bc,'cell_type'])

        adata_plot = AnnData(res_df.loc[: , col_sel].to_numpy(),
                             obs=res_df.loc[:,['dataset','cell_type']],
                             dtype=np.float32)
        sc.pp.neighbors(adata_plot, n_neighbors=15, use_rep="X")
        umap_col_sel = [col for col in res_df.columns if 'umap' in col]
        if len(umap_col_sel) > 0: 
            print("using saved UMAP loading")
            adata_plot.obsm['umap'] = res_df.loc[: , umap_col_sel].to_numpy()
        else:
            print("calculating UMAP loading")
            sc.tl.umap(adata_plot, min_dist=0.2)

        if nclust is not None:
            print("clustering with Louvain")
            resolution = utils_eval.find_resolution_louvain(adata_plot,nclust)
            sc.tl.louvain(adata_plot, resolution = resolution, random_state = 0)
            adata_plot.obs['predicted_ct'] = adata_plot.obs['louvain']
        else:
            print("loading saved clustering result")
            adata_plot.obs['predicted_ct'] = res_df.loc[:,['predicted_ct']]
            adata_plot.obs['predicted_ct'] = adata_plot.obs['predicted_ct'].astype('category')

        adata_plot.obsm['embed'] = adata_plot.X
        adata_plot.obsm['X_emb'] = adata_plot.X

        add_str = ""
        if output_folder[i] == with_multi_folder:
            add_str = "2"
        plot_name2 = os.path.join(plot_out_dir,"{}{}_{}_{}".format(cond_key,var_list[i],rep_list[i],
                                                                     res_list[i].replace("/", "-").replace(".csv", "_with_multiome"+add_str+".svg")))
        

        idx = list(adata_plot.obs['dataset'].isin(["Multiome-RNA","Multiome-ATAC","multiomeRNA","multiomeATAC"]))
        adata_plot.obs['dataset2'] = adata_plot.obs['dataset'].tolist()
        adata_plot.obs['dataset2'][idx] = 'Multiome'
        if len(idx) > 0:
            # select half of the rna cells and half of the atac cells, instead of randomly selecting from the giant list, which does not gaureentee always get half-half
            # get the barcode of every cell 
            adata_plot.obs['bc2'] = [s.split("_") [len(s.split("_"))-1] for s in adata_plot.obs_names]
            # get the id for RNA 
            idx_rna = list(adata_plot.obs['dataset'].isin(["Multiome-RNA","multiomeRNA"]))
            # get id for ATAC 
            idx_atac = list(adata_plot.obs['dataset'].isin(["Multiome-ATAC","multiomeATAC"]))
            # create barcode column with the right prefix
            adata_plot.obs['bc3'] = adata_plot.obs['bc2']
            adata_plot.obs['bc3'][idx_rna] = "prna_"+adata_plot.obs['bc3'][idx_rna] 
            adata_plot.obs['bc3'][idx_atac] = "patac_"+adata_plot.obs['bc3'][idx_atac]
            # select bc, get 
            npaired = round(sum(idx)/2)
            bc_select = (['prna_']*round(npaired/2) + ['patac_']*(npaired-round(npaired/2))) + adata_plot.obs['bc2'][idx].sample(frac=1).unique()
            idx_multi_kept = adata_plot.obs['bc3'].isin(bc_select)
            idx_multi_kept = [i for i, x in enumerate(idx_multi_kept) if x]
            print(len(idx_multi_kept))
            idx_kept = list(~adata_plot.obs['dataset'].isin(["Multiome-RNA","Multiome-ATAC","multiomeRNA","multiomeATAC"]))
            idx_kept = [i for i, x in enumerate(idx_kept) if x]
            idx_f = idx_kept + idx_multi_kept
            adata_plot_sel = deepcopy(adata_plot)[idx_f,:]
            adata_plot_sel.obs = deepcopy(adata_plot.obs).iloc[idx_f,:]
        else:
            adata_plot_sel = adata_plot

        if len(umap_col_sel) > 0: 
            sc.pl.umap(adata_plot_sel, 
                       color=['cell_type','dataset','predicted_ct'],
                       use_raw =False,
                       layer='umap',
                       wspace=0.4,
                       save="/"+plot_name2,
                       legend_loc=legend_loc,
                       legend_fontsize="xx-small")
            pd_umap = pd.DataFrame(adata_plot_sel.obsm['umap'])
        else:
            sc.pl.umap(adata_plot_sel, 
                       color=['cell_type','dataset','predicted_ct'],
                       wspace=0.4,
                       save="/"+plot_name2,
                       legend_loc=legend_loc,
                       legend_fontsize="xx-small")
            pd_umap = pd.DataFrame(adata_plot_sel.obsm['X_umap'])

        pd_umap.index = adata_plot_sel.obs.index.tolist()
        method_name = res_list[i].split('/', 1) [0]
        if output_folder[i] in with_multi_folder:
            method_name =method_name+"_2"
        pd_umap['key'] = "{}{}_{}_{}".format(cond_key,var_list[i],rep_list[i],method_name)
        pd_comb_list[i] = pd.concat([pd_umap,adata_plot_sel.obs], axis=1)

    pd_comb_all = pd.concat(pd_comb_list, axis=0)
    return(pd_comb_all)



## Applying on SHARE-seq mouse skin dataset

In [None]:
# SHARE-seq mouse skin
# working_dir 
folder_dir="/home/myylee/scmint/methods_eval/"
output_folder = (["results_single_mod"]*5+["results_single_same_cell_number"]*5+["results_single_mod"]*4)*2
cond_key = "nmulti"
var_list = [15000]*14+[5000]*14
rep_list = [2]*14+[1]*14
res_list = (["seurat3/seurat3_result.csv","rliger/rliger_result.csv","rfigr/rfigr_result.csv","rbindsc/rbindsc_result.csv","glue/glue_result.csv", # unpaired 
            "seurat3/seurat3_result.csv","rliger/rliger_result.csv","rfigr/rfigr_result.csv","rbindsc/rbindsc_result.csv","glue/glue_result.csv", # unpaired with multiome
           "seurat4/seurat4_result.csv","multivi/multivi_result.csv","cobolt/cobolt_result.csv" ,"scmomat/scmomat_result.csv"#multiome-guided
           ])*2

print(res_list)

#----- shareseq mouse skin 22 ct -----#
dir_path = "/project/mingyaolpc/myylee/scmint/methods_eval/dataset/mouse_skin/multiome_ncells_pmat/"
ct_ref = "/project/mingyaolpc/myylee/scmint/methods_eval/dataset/mouse_skin/mouse_skin_shareseq_bc_ct3.csv"
nclust_target = 22
plot_out_dir = "mouse_skin_22ct_all/"



df_to_plot = umap_plot(dir_path,
          ct_ref,
          nclust_target,
          plot_out_dir,
          folder_dir,
          output_folder,
          cond_key,
          var_list,
          rep_list,
          res_list,
          with_multi_folder="results_single_same_cell_number",
          legend_loc=None)
df_to_plot.to_csv(os.path.join(folder_dir,"figures/umap/",plot_out_dir,"df_to_plot.csv"))



# Part 2 - R 

## Functions

In [None]:
# ----- Function ----- # 
# Load the UMAP coordinates and metadata csv saved by the umap_plot function (evaluate_vary_situations_all.ipynb) and arrange the UMAP plots into one figure in R.
library(gridExtra)        
generate_umap_plots_raster <- function(df_plot,
                                       method_names,
                                       tops,
                                       column_names=c("cell_type","predicted_ct","dataset"),
                                       columns_to_plot= c("cell_type","predicted_ct","dataset"),
                                       sample_cp = NA,
                                       cell_type_cp = NA,
                                       pixels = c(128,128),
                                       pointsize = 0.1){
    require('scattermore')
    if((length(column_names)!=length(columns_to_plot))){print("columns names ane column to plot list are not the same size. Return without running"); return()}
    p_outer = list()
    counter_outer = 1
    for(x in columns_to_plot){
        df_plot[,x] <- as.factor(df_plot[,x])
    }
    add_left_title_n = length(unique(df_plot$key))/2
    add_top_title_2 = length(unique(df_plot$key))/2+1
    for (i in unique(df_plot$key)){
        p = list()
        counter = 1
        for (c_i in 1:length(columns_to_plot)){
            c = columns_to_plot[c_i]
            p[[counter]]<- df_plot %>% filter(key == i) %>%
                ggplot(aes_string(x="X0", y = "X1",color=c)) +
                geom_scattermore(
                    pointsize = pointsize,
                    pixels = pixels
                ) + 
                theme(
                    panel.grid.major = element_blank(),
                    panel.grid.minor = element_blank(),
                    strip.background = element_blank(),
                    panel.border = element_rect(colour = "black", fill = NA),
                    panel.background = element_rect(fill='transparent'),
                    legend.position="None",
                    axis.title.x=element_blank(),
                    axis.title.y=element_blank(),
                    axis.text.x=element_blank(), #remove x axis labels
                    axis.ticks.x=element_blank(), #remove x axis ticks
                    axis.text.y=element_blank(),  #remove y axis labels
                    axis.ticks.y=element_blank(),  #remove y axis ticks
                    plot.title = element_text(hjust = 0.5,size=6)
                ) 
            # use custom color palette for the 'Sample' plot
            if(!is.na(sample_cp)&& c =="sample"){
                p[[counter]]<- p[[counter]] + scale_color_manual(values=sample_cp)
            }
            if(!is.na(cell_type_cp)&& c =="cell_type"){
                p[[counter]]<- p[[counter]] + scale_color_manual(values=cell_type_cp)
            }
            
            counter = counter +1 
        }

        p_outer[[counter_outer]]<- grid.arrange(grobs=p,nrow=1)
        counter_outer = counter_outer + 1
    }
    half_n =  length(unique(df_plot$key))/2
    pos = list()
    for(i in 1:half_n){
        pos[[i]] = c(i, i+half_n)
    }
    lay <- do.call(rbind,pos)
    left_text_grobs <- lapply(method_names,function(t){
        textGrob(t, gp=gpar(fontsize=6))
    })
    top_text_grobs <-  lapply(tops,function(t){
        textGrob(t, gp=gpar(fontsize=9))
    })
    p_outer_all<- c(p_outer,top_text_grobs,left_text_grobs)
        
    # add blank space to top and bottom
    # suppose there is two columns 
    lay <- rbind(c(NA,(length(p_outer)+1),NA,(length(p_outer)+2)),#,ncol(lay)+2),# top text 
                 cbind(c((length(p_outer_all)-length(method_names)+1):length(p_outer_all)),#rep(NA,nrow(lay)),#left text 
                       lay[,1], 
                       rep(NA,nrow(lay)),
                       lay[,2]))
    p_return <-arrangeGrob(grobs=p_outer_all, layout_matrix = lay,
                           widths = c(2,9,1,9),
                           height = c(1,rep(8,nrow(lay)-1)))
    return(p_return)
}



## Applying on SHARE-seq mouse skin dataset

In [None]:
df_plot <- read.csv("figures/umap/mouse_skin_22ct_all/df_to_plot.csv")
unique(df_plot$key)

sceanrio = c("nmulti5000_1","nmulti15000_2")
methods = c("seurat3","rliger","rfigr","rbindsc","glue",
            "seurat3_2","rliger_2","rfigr_2","rbindsc_2","glue_2",
            "seurat4","multivi","cobolt","scmomat")
levels_order = paste0(rep(sceanrio,each=length(methods)),
                      "_",
                      rep(methods,length(sceanrio)))
levels_order

#create a factor, level it by d2
df_plot$key <- factor(df_plot$key, levels=levels_order)

#order
df_plot <- df_plot[order(df_plot$key),] 
unique(df_plot$key)

# load csv 
method_names = c("Seurat3\nUnpaired","LIGER\nUnpaired","FigR\nUnpaired","BindSC\nUnpaired","Glue\nUnpaired",
                "Seurat3\nUnpaired\n(Multiome-split)","LIGER\nUnpaired\n(Multiome-split)",
                 "FigR\nUnpaired\n(Multiome-split)","BindSC\nUnpaired\n(Multiome-split)",
                 "Glue\nUnpaired\n(Multiome-split)",
                "Seurat4\nMultiome-guided","MultiVI\nMultiome-guided","Cobolt\nMultiome-guided",
                "scMoMaT\nMultiome-guided")

#method_names = c("scMoMaT\nMultiome-guided","Glue\nUnpaired\n(Multiome-splitted)")
tops = c("n_multiome = 5000 cells","n_multiome = 15000 cells")
column_names=c("Cell type","Predicted","Modality")
df_plot$dataset <- factor(df_plot$dataset)

In [None]:
fig <- generate_umap_plots_raster(df_plot,method_names,tops,column_names)

ggsave(file="figures/umap/mouse_skin_22ct_all/df_to_plot_dot2_shape19_rev1_raster.pdf", fig, width=6, height=11) #saves g
