In [None]:
import sctoolbox
from sctoolbox.utilities import bgcolor

# Batch effect correction and comparisons
<hr style="border:2px solid black">

<h1><center>⬐ Fill in input data here ⬎</center></h1>

In [None]:
%bgcolor PowderBlue

# Set the species of the data
species = "human"

# Set number of cores to use for multiprocessing
threads = 4

# Options for highly variable genes
min_limit = 1000
max_limit = 5000

# Should preliminary clustering be performed?
do_clustering = True #True or False

# Options for batch correction
batch_column = "batch"  #a column in adata.obs containing batch information
perform_batch_correction = True
batch_methods = ["bbknn", "combat", "mnn", "harmony", "scanorama"]

<hr style="border:2px solid black">

## Set up

In [None]:
import scanpy as sc
import sctoolbox.utilities as utils
import sctoolbox.tools as tools
import sctoolbox.plotting as pl
# Read the sctoolbox settings from "config.yaml" using the key "03"
# NOTE(review): presumably "03" selects the config section belonging to this notebook — confirm
utils.settings_from_config("config.yaml", key="03")

## Loading the anndata

In [None]:
# Load the AnnData object from "anndata_2.h5ad" and display its summary
# NOTE(review): presumably this file is the output of the previous notebook — confirm
adata = utils.load_h5ad("anndata_2.h5ad")
display(adata)

In [None]:
# Fail early with an actionable message if the configured batch column is missing
if batch_column not in adata.obs.columns:
    raise KeyError(
        f"batch_column '{batch_column}' was not found in adata.obs. "
        f"Available columns: {list(adata.obs.columns)}"
    )

# Ensure that the batch column is a category
adata.obs[batch_column] = adata.obs[batch_column].astype("category")

In [None]:
# Show the entries stored under adata.uns["sctoolbox"]["obs_metrics"]
# (they are used further down as the QC columns for coloring the PCA plot)
adata.uns["sctoolbox"]["obs_metrics"]

## Normalization

In [None]:
# Save the raw counts in a separate layer before normalization.
# The copy is required: without it the layer references the very same matrix as
# adata.X, and the normalization/log-transform that follows can modify that matrix
# in place, which would silently overwrite the "raw" layer as well.
adata.layers["raw"] = adata.X.copy()

In [None]:
# Normalize the counts per cell. Per the scanpy documentation, target_sum=None scales
# every cell to the median of total counts, and exclude_highly_expressed=True leaves
# very highly expressed genes out of the computation of the normalization factor.
sc.pp.normalize_total(adata, target_sum=None, exclude_highly_expressed=True)
# Log-transform the normalized matrix: X = log(X + 1)
sc.pp.log1p(adata)

## Predict Cell Cycle
Predict the division phase of each cell.

In [None]:
# Predict the cell cycle phase of each cell for the configured species.
# NOTE(review): s_genes/g2m_genes are None — presumably species-specific default
# gene lists are used in that case; confirm in the sctoolbox documentation.
# NOTE(review): inplace=True presumably writes the prediction into adata itself — confirm.
tools.predict_cell_cycle(adata, species=species, s_genes=None, g2m_genes=None, inplace=True)

## Find highly variable genes

In [None]:
# Annotate highly variable genes, passing (min_limit, max_limit) as hvg_range;
# the `save` argument names the output figure "highly_variable.png".
# NOTE(review): hvg_range presumably bounds the number of selected genes — confirm
tools.annot_HVG(adata, hvg_range=(min_limit, max_limit), save="highly_variable.png")

In [None]:
# Number of variable genes selected (sum over the "highly_variable" column of adata.var)
adata.var["highly_variable"].sum()

## PCA and neighbors for uncorrected data

In [None]:
# Compute a PCA with 50 components, restricted to the highly variable genes
# NOTE(review): use_highly_variable is deprecated in recent scanpy releases in favor
# of mask_var — check the installed scanpy version before changing this call
sc.pp.pca(adata, svd_solver='arpack', n_comps=50, use_highly_variable=True)

In [None]:
# Color the PCA by every stored QC metric column plus the batch column
qc_columns = adata.uns["sctoolbox"]["obs_metrics"]
sc.pl.pca(
    adata,
    color=[*qc_columns, batch_column],
    ncols=3,
    show=False,  # do not show immediately; the figure is written to disk next
)
pl._save_figure("PCA.pdf")

In [None]:
# Compute the neighborhood graph of the uncorrected data with scanpy's default parameters
sc.pp.neighbors(adata)

## Batch correction (optional)

In [None]:
# Either run all requested correction methods, or keep only the uncorrected object
batch_corrections = (
    tools.wrap_corrections(adata, batch_key=batch_column, methods=batch_methods)
    if perform_batch_correction
    else {"uncorrected": adata}
)

### Plot overview of batch corrections

In [None]:
# Run standard UMAP for all AnnData objects in batch_corrections, using `threads` cores
tools.wrap_umap(batch_corrections.values(), threads=threads)

In [None]:
# Perform additional clustering if it was chosen
color_by = [batch_column]

if do_clustering:
    # Use a dedicated loop variable: iterating with the name `adata` would rebind the
    # notebook-level `adata` to whichever object happens to be last in batch_corrections.
    for corrected_adata in batch_corrections.values():
        sc.tl.leiden(corrected_adata)  # cluster labels are stored in .obs["leiden"] by default
    color_by.append("leiden")

##### LISI score:
The LISI score (stored in adata.obs) indicates the effective number of different categories represented in the local neighborhood of each cell. If the cells are well mixed, we expect the LISI score to be close to n for a dataset with n batches.

##### The higher the LISI score, the better the batch correction method removed the batch effect and mixed the cells from different batches.



In [None]:
# Calculate LISI scores for the batch column of every object in batch_corrections
# NOTE(review): inplace=True presumably stores the scores in each object's .obs — confirm
tools.wrap_batch_evaluation(batch_corrections, batch_key=batch_column, threads=threads, inplace=True)

In [None]:
# Plot the overview of batch correction methods, colored by the entries of color_by;
# the `output` argument names the file "batch_correction_overview.png".
# The return value is bound to `_` so that it is not echoed as cell output.
_ = pl.anndata_overview(batch_corrections, color_by=color_by, 
                        output="batch_correction_overview.png")

### Select the final object

In [None]:
%bgcolor PowderBlue

# Key of batch_corrections to continue with as the final object
# NOTE(review): presumably "uncorrected" or one of the entries of batch_methods — confirm
selected = "scanorama"

In [None]:
# Stop here with an actionable message if the selected key is not available,
# e.g. when batch correction was disabled and only "uncorrected" exists
if selected not in batch_corrections:
    raise KeyError(
        f"'{selected}' is not a key in batch_corrections. "
        f"Available keys: {list(batch_corrections.keys())}"
    )

In [None]:
# Continue with the selected object as the final adata
adata = batch_corrections[selected]

## Saving adata for next notebook

In [None]:
# Display a summary of the final object before it is saved
adata

In [None]:
# Save the selected AnnData object to "anndata_3.h5ad" for the next notebook
adata_output = "anndata_3.h5ad"
utils.save_h5ad(adata, adata_output)