In [None]:
# Notebook helpers from sctoolbox. `bgcolor` is never called by name in this notebook;
# presumably importing it makes the `%bgcolor` line magic of the input cells available - TODO confirm.
from sctoolbox.utils.jupyter import bgcolor, _compare_version

# File name of this notebook, handed to the version check below.
nb_name = "04_clustering.ipynb"

# NOTE(review): presumably checks this notebook's version against the installed sctoolbox - confirm.
_compare_version(nb_name)

#  04 - Embedding and clustering
<hr style="border:2px solid black">

## 1 - Description
### 1.1 Embedding
Embeddings are dimension reduction methods to transform high-dimensional data into lower-dimensional representations while preserving the inherent structure and relationships between individual cells.  
The sctoolbox supports the [Uniform Manifold Approximation and Projection (UMAP)](https://arxiv.org/abs/1802.03426) and the [t-distributed stochastic neighbor embedding (t-SNE)](https://www.jmlr.org/papers/volume9/vandermaaten08a/vandermaaten08a.pdf) methods for dimension reduction, with UMAP being set as the default value.
To learn more about the differences between these methods and to get more insight into parameter selection, have a look [here for UMAP](https://pair-code.github.io/understanding-umap/) and [here for t-SNE](https://distill.pub/2016/misread-tsne/?_ga=2.135835192.888864733.1531353600-1779571267.1531353600).
### 1.2 Clustering
Single cell clustering is used to group individual cells into clusters based on similarities in their gene expression. Clustering makes it possible to identify distinct cell types and to characterize cellular heterogeneity within a population.
The sctoolbox supports the [leiden](https://www.nature.com/articles/s41598-019-41695-z) and the [louvain](https://iopscience.iop.org/article/10.1088/1742-5468/2008/10/P10008) clustering methods; leiden is the newer algorithm and the recommended choice.

__________

## 2 - Setup

In [None]:
import pandas as pd
import scanpy as sc
from IPython.display import IFrame

import sctoolbox
import sctoolbox.plotting as pl
import sctoolbox.tools as tools
import sctoolbox.utils as utils

# Apply the sctoolbox settings stored under key "04" in config.yaml.
sctoolbox.settings.settings_from_config("config.yaml", key="04")

# Figure defaults: vector-friendly output, figures saved at 600 dpi, scanpy's own styling switched off (scanpy=False).
sc.set_figure_params(vector_friendly=True, dpi_save=600, scanpy=False)

__________

## 3 - Load anndata

In [None]:
# Read the input AnnData object (presumably the output of the previous notebook).
adata = utils.adata.load_h5ad("anndata_3.h5ad")

# Preview the object together with its obs and var tables:
# at most 5 rows per table, but without a limit on the number of columns.
with pd.option_context("display.max.rows", 5, "display.max.columns", None):
    display(adata, adata.obs, adata.var)

_______

## 4 - General input

<h1><center>⬐ Fill in input data here ⬎</center></h1>

In [None]:
%bgcolor PowderBlue

# Column (presumably in adata.obs) used to color the embeddings and to group the cell distribution barplot
condition_column = 'sample'

# Number of threads passed to the embedding parameter searches
threads = 4

# Search embedding parameters (or set parameters later)
embedding = "umap"   # "umap" or "tsne"
search_parameters = True

# NOTE(review): the *_range tuples presumably mean (start, stop, step) - confirm against sctoolbox
dist_range = (0.1, 0.3, 0.1) # min_dist range for the umap search
spread_range = (1.0, 2.0, 0.5) # spread range for the umap search
n_components = 2 # Number of umap components; the 3D plot further down only runs for values > 2
perplexity_range = (30, 60, 10)        # perplexity range for the tsne search
learning_rate_range = (600, 1000, 200)   # learning_rate range for the tsne search

# Search different clustering resolutions
search_clustering_parameters = True
clustering_method = "leiden" # "leiden" or "louvain"

# Annotate regions to genes
GTF_PATH = "test_data/hg38_genes.gtf" # gtf file with the genes used as annotation reference

_______

## 5 - Calculate UMAP/TSNE and find best setting

- NOTE: min_dist: the minimum distance between embedded points; smaller values make the plot look more 'clustered'.
- NOTE: spread: the effective scale of the embedded points. The default value is 1.

In [None]:
# Optional grid search over the embedding parameters configured above.
# Nothing runs unless search_parameters is set and embedding is "umap" or "tsne".
if search_parameters and embedding == "umap":
    # One panel per min_dist / spread combination, saved as a single PDF.
    pl.embedding.search_umap_parameters(
        adata,
        color=condition_column,
        n_components=n_components,
        min_dist_range=dist_range,
        spread_range=spread_range,
        threads=threads,
        save="UMAP_parameter_search.pdf",
    )
elif search_parameters and embedding == "tsne":
    # One panel per perplexity / learning_rate combination, saved as a single PDF.
    pl.embedding.search_tsne_parameters(
        adata,
        color=condition_column,
        perplexity_range=perplexity_range,
        learning_rate_range=learning_rate_range,
        threads=threads,
        save="TSNE_parameter_search.pdf",
    )

<h1><center>⬐ Fill in input data here ⬎</center></h1>

In [None]:
%bgcolor PowderBlue

# Final choice of min_dist / spread for umap (only used when embedding == "umap")
min_dist = 0.2
spread = 1.5

# Final choice of perplexity / learning_rate for tsne (only used when embedding == "tsne")
perplexity = 50
learning_rate = 800

_________

In [None]:
# Compute the final embedding with the parameters chosen above.
# Any value of `embedding` other than "umap" or "tsne" leaves adata untouched.
if embedding == "tsne":
    sc.tl.tsne(adata, learning_rate=learning_rate, perplexity=perplexity)
elif embedding == "umap":
    sc.tl.umap(adata, n_components=n_components, spread=spread, min_dist=min_dist)

___________

## 6 - Cell clustering
<hr style="border:2px solid black">

- NOTE: resolution: controls the coarseness of the clustering. Higher values lead to more clusters.

In [None]:
# Optionally plot the clustering at several resolutions to help pick the final one.
if search_clustering_parameters:
    pl.clustering.search_clustering_parameters(
        adata,
        method=clustering_method,
        ncols=4,
    )

<h1><center>⬐ Fill in input data here ⬎</center></h1>

In [None]:
%bgcolor PowderBlue

# Choose the final resolution: name of the adata.obs column that holds the clustering to continue with.
# NOTE(review): the column name presumably follows "<clustering_method>_<resolution>";
# adjust the prefix if clustering_method is not "leiden" - confirm.
clustering_column = "leiden_0.5"

_________

### 6.1 - Revise clustering (optional)
Here you can use the `tools.clustering.recluster` function to iteratively adjust the clustering. The two cells below are provided as templates for either *joining* clusters (first cell) or *splitting* clusters (second cell) - __copy the cells as needed__. Fill in the list at the top of the respective cell to activate *joining* or *splitting*.

In [None]:
%bgcolor PowderBlue
# combine multiple clusters into one
# skipped when empty
combine = [] # add cluster names

if combine:
    tools.clustering.recluster(
        adata=adata,
        column=clustering_column,
        clusters=combine,
        task="join",
        embedding=embedding,
        key_added="recluster"
    )

    clustering_column = "recluster"

In [None]:
%bgcolor PowderBlue
# split (recluster) one or more clusters
# skipped when empty
split = [] # add cluster names
resolution=0.15 # 0-1, small values create less clusters

if split:
    tools.clustering.recluster(
        adata=adata, 
        column=clustering_column,
        clusters=split,
        task="split",
        resolution=resolution,
        embedding=embedding,
        key_added="recluster"
    )

    clustering_column = "recluster"

In [None]:
 # Create the final clustering: store the chosen (possibly revised) labels in adata.obs["clustering"].
 # NOTE(review): rename_categories presumably relabels the cluster categories - confirm against sctoolbox.
adata.obs["clustering"] = utils.tables.rename_categories(adata.obs[clustering_column])

___________

### 6.2 - Final clustering of cells

In [None]:
# Plot the condition column and the final clustering on the chosen embedding.
sc.pl.embedding(
    adata,
    color=[condition_column, "clustering"],
    basis="X_" + embedding,
    show=False,
)
# Presumably show=False keeps the figure open so that it can be written to file here.
pl.general._save_figure("embedding_clustering.pdf")

___________

## 7 - Plot distribution of cells across clusters

In [None]:
# Bar plot of the cells per final cluster, grouped by the condition column.
# The return value is bound to `_` so it is not echoed as cell output.
_ = pl.qc_filter.n_cells_barplot(adata, "clustering", save="cell_distribution_barplot.pdf", groupby=condition_column)

________

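## 8 - Generating 3D Object with UMAP coordinates in HTML

This section only produces output if `embedding` is set to "umap" and `n_components` is greater than 2.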

<h1><center>⬐ Fill in input data here ⬎</center></h1>

In [None]:
%bgcolor PowderBlue

# Column to show in the 3D HTML plot. Defaults to the final "clustering" column of adata.obs;
# change it to an individual leiden column or any other column if needed.
column_3d = "clustering"

____

In [None]:
# Render the interactive 3D UMAP and embed the resulting HTML file below.
# Only runs for a UMAP embedding that was computed with more than two components.
# IFrame is imported with the other imports in the setup cell.
if embedding == "umap" and n_components > 2:
    # Build the base name once so the saved file and the displayed file cannot drift apart.
    umap_3d_basename = f"umap_3d_{column_3d}"
    pl.embedding.plot_3D_UMAP(adata, column_3d, save=umap_3d_basename)

    # NOTE(review): assumes plot_3D_UMAP writes "<full_figure_prefix><save>.html" - confirm.
    html_file = sctoolbox.settings.full_figure_prefix + umap_3d_basename + ".html"

    display(IFrame(src=html_file, width=800, height=400))

_________

## 9 - Annotate regions to genes
This function uses UROPA to annotate regions to genes with a gtf file containing the genes as reference.

In [None]:
# Annotate the regions in adata with genes from the GTF file configured in the general input cell.
# The thread count comes from the user-configured `threads` setting instead of a hard-coded value,
# so a single input controls the parallelism of the whole notebook.
# NOTE(review): with inplace=True the annotation is presumably written into adata (adata.var) - confirm.
tools.peak_annotation.annotate_adata(
    adata,
    GTF_PATH,
    config=None,
    best=True,
    threads=threads,
    coordinate_cols=None,
    temp_dir="tmp",
    inplace=True
)

In [None]:
# Show adata.var to inspect the table after the annotation step.
adata.var

__________

## 10 - Saving adata for next notebook

In [None]:
# Write the processed object to anndata_4.h5ad, the input of the next notebook.
utils.adata.save_h5ad(adata, "anndata_4.h5ad")

In [None]:
# Close the sctoolbox logfile as the last step of the notebook.
sctoolbox.settings.close_logfile()