In [None]:
from sctoolbox.utils.jupyter import bgcolor, _compare_version

# Change the background color of selected input cells.
# NOTE(review): `select` presumably lists the indices of the cells to color —
# re-check the list whenever cells are added or removed.
bgcolor("PowderBlue", select=[3, 5, 12, 13, 14, 18])

# File name of this notebook; passed to the version check below.
nb_name = "04_clustering.ipynb"

# NOTE(review): presumably checks the notebook version against the installed sctoolbox — confirm.
_compare_version(nb_name)

# 04 - Embedding and clustering
<hr style="border:2px solid black"> </hr>

## 1 - Description
This notebook aims to group cells based on how similar their gene expression profiles are. Ideally, the resulting clusters should represent different cell types. This grouping is achieved in two steps: **Embedding** and **Clustering**.

### 1.1 Embedding
Embeddings are dimension reduction methods to transform high-dimensional data into lower-dimensional representations while preserving the inherent structure and relationships between individual cells.  
The sctoolbox supports the [Uniform Manifold Approximation and Projection (UMAP)](https://arxiv.org/abs/1802.03426) and the [t-distributed stochastic neighbor embedding (t-SNE)](https://www.jmlr.org/papers/volume9/vandermaaten08a/vandermaaten08a.pdf) methods for dimension reduction, with UMAP being set as the default value.
To learn more about the differences between these methods and to gain more insight into parameter selection, have a look [here for UMAP](https://pair-code.github.io/understanding-umap/) and [here for t-SNE](https://distill.pub/2016/misread-tsne/?_ga=2.135835192.888864733.1531353600-1779571267.1531353600).
### 1.2 Clustering
Single cell clustering is used to group individual cells into clusters based on similarities in their gene expression. Clustering makes it possible to identify distinct cell types and to characterize cellular heterogeneity within a population.
The sctoolbox supports the [leiden](https://www.nature.com/articles/s41598-019-41695-z) and the [louvain](https://iopscience.iop.org/article/10.1088/1742-5468/2008/10/P10008) clustering methods, with the leiden clustering algorithm being newer and recommended to use.

-----------

## 2 - Setup

In [None]:
import scanpy as sc
import pandas as pd

import sctoolbox
import sctoolbox.utils as utils
import sctoolbox.tools as tools
import sctoolbox.plotting as pl

# Read the sctoolbox settings for this notebook from config.yaml (entry "04")
sctoolbox.settings.settings_from_config("config.yaml", key="04")

# Additional figure options applied through scanpy
sc.set_figure_params(
    scanpy=False,
    vector_friendly=True,
    dpi_save=600,
)

-----------

## 3 - Load anndata

In [None]:
# Load the input anndata object for this notebook
adata = utils.adata.load_h5ad("anndata_3.h5ad")

# Show a compact preview: at most 5 rows per table, but all columns.
# The full pandas option names are used on purpose; the previous spelling
# ("display.max.rows") only worked through pandas' regex-based partial matching,
# which can break once another option with a similar name is registered.
with pd.option_context("display.max_rows", 5, "display.max_columns", None):
    display(adata)
    display(adata.obs)
    display(adata.var)

------------

## 4 - General input
Choose the embedding and clustering method and adjust the range of parameters for each one of them. The specific parameters for **embedding** (`section 5`) and **clustering** (`section 7`) can be chosen after visually inspecting the results.

<h1><center>⬐ Fill in input data here ⬎</center></h1>

In [None]:
# Column of adata.obs used to color and group cells in the plots below
condition_column = "sample"

# Number of threads handed to the embedding parameter searches
threads = 4

# Embedding method and whether to run the parameter search (section 5).
# The final parameters can also be set directly further below.
embedding = "umap"   # "umap" or "tsne"
search_parameters = True

# UMAP parameters
# NOTE(review): the ranges look like (start, stop, step) — confirm whether `stop` is inclusive.
dist_range = (0.1, 0.31, 0.1)  # Range of min_dist values to search for UMAP
spread_range = (1, 2.5, 0.5)  # Range of spread values to search for UMAP
n_components = 2  # Number of UMAP components (>=3 enables the 3D UMAP in section 9 but likely degrades 2D results)

# t-SNE parameters
perplexity_range = (30, 60, 10)  # Range of perplexity values to search for t-SNE
learning_rate_range = (400, 1000, 200)  # Range of learning_rate values to search for t-SNE

# Clustering method and whether to search different clustering resolutions (section 7)
search_clustering_parameters = True
cluster_res_range = (0.1, 1, 0.1)  # Searched resolution range, from low to high resolution (fewer to more clusters)
clustering_method = "leiden"  # "leiden" or "louvain"
cluster_ncols = 4  # Number of columns in the clustering search plot

_____________

## 5 - Calculate UMAP/TSNE and find the best setting
<hr style="border:2px solid black"> </hr>

After visually inspecting the results, adjust the parameters shown below for the best embedding. While it is somewhat subjective what the "best" parameters for an embedding should be, the chosen embedding should display clear structures that are neither spread too thin nor too clumped up.

### 5.1 Parameter overview

|Method|Parameter|Description|
|------|---------|-----------|
|[UMAP](https://scanpy.readthedocs.io/en/stable/generated/scanpy.tl.umap.html)|`min_dist`|Minimum distance between embedded points. Smaller values make the plot look more 'clustered'.|
|[UMAP](https://scanpy.readthedocs.io/en/stable/generated/scanpy.tl.umap.html)|`spread`|The effective scale of embedded points. Together with `min_dist`, this determines how clustered the embedded points are.|
|[tSNE](https://scanpy.readthedocs.io/en/stable/generated/scanpy.tl.tsne.html)|`perplexity`|Related to the number of nearest-neighbors accounted for during embedding creation. Larger datasets require bigger values.|
|[tSNE](https://scanpy.readthedocs.io/en/stable/generated/scanpy.tl.tsne.html)|`learning_rate`|Can be important for embedding cluster formation.|

In [None]:
# Optional parameter search for the chosen embedding method.
# Nothing is run when `search_parameters` is disabled.
if search_parameters and embedding == "umap":
    # Compare UMAPs across the min_dist / spread ranges
    pl.embedding.search_umap_parameters(
        adata,
        color=condition_column,
        min_dist_range=dist_range,
        spread_range=spread_range,
        n_components=n_components,
        threads=threads,
        save="UMAP_parameter_search.pdf",
    )
elif search_parameters and embedding == "tsne":
    # Compare t-SNEs across the perplexity / learning_rate ranges
    pl.embedding.search_tsne_parameters(
        adata,
        color=condition_column,
        perplexity_range=perplexity_range,
        learning_rate_range=learning_rate_range,
        threads=threads,
        save="TSNE_parameter_search.pdf",
    )

<h1><center>⬐ Fill in input data here ⬎</center></h1>

In [None]:
# Final choice of min_dist / spread for UMAP (only used when embedding == "umap").
# NOTE(review): the default min_dist of 0.4 lies outside the default `dist_range`
# searched above (0.1 to 0.31) — adjust after inspecting the search plots.
min_dist = 0.4
spread = 2.5

# Final choice of perplexity / learning_rate for t-SNE (only used when embedding == "tsne")
perplexity = 50
learning_rate = 800

___

In [None]:
# Compute the final embedding with the parameters chosen above.
# Any other value of `embedding` leaves adata untouched.
if embedding == "umap":
    sc.tl.umap(
        adata,
        n_components=n_components,
        min_dist=min_dist,
        spread=spread,
    )
elif embedding == "tsne":
    sc.tl.tsne(
        adata,
        learning_rate=learning_rate,
        perplexity=perplexity,
    )

___

## 6 - Plot final embedding with quality measures

In [None]:
# QC columns to show in the plots: every key of the QC threshold report
# (except the "before"/"after" entries), plus the cell cycle phase and the condition column.
# Adjust this list as needed.
qc_threshold_report = adata.uns["sctoolbox"]["report"]["qc"]["obs"]["threshold"]
qc_columns = [key for key in qc_threshold_report.keys() if key not in ["before", "after"]]
qc_columns = qc_columns + ["phase", condition_column]

In [None]:
# Show the final embedding colored by each of the QC columns
_ = pl.embedding.plot_embedding(
    adata,
    method=embedding,
    color=qc_columns,
    ncols=3,
    save="embedding_quality.pdf",
)

In [None]:
# One embedding panel per group of `condition_column`, to compare how samples are distributed
_ = pl.embedding.plot_group_embeddings(
    adata,
    embedding=embedding,
    groupby=condition_column,
    save="embedding_sample_distribution.pdf",
)

In [None]:
# For every non-numeric QC column: compute the embedding density per group and plot it
for qc_col in qc_columns:
    # Only non-numeric columns can be used for grouping here; skip everything else
    if qc_col not in adata.obs.select_dtypes(exclude="number").columns:
        continue

    density_key = f"{qc_col}_density"
    sc.tl.embedding_density(adata, basis=embedding, groupby=qc_col, key_added=density_key)

    pl.embedding.plot_group_embeddings(
        adata,
        col=density_key,
        groupby=qc_col,
        embedding=embedding,
        ncols=3,
        color_map="YlOrRd",
        save=f"embedding_density_{qc_col}.pdf",
    )

----------

## 7 - Cell clustering
<hr style="border:2px solid black"> </hr>

This step assigns each cell to a cluster. Cells in the same cluster are assumed to be of the same cell type. Cells are assigned based on their distance within the nearest neighbor graph, which is loosely equivalent to their distance within the embedding. The resolution controls the coarseness of the clustering. A lower resolution results in fewer, larger clusters, while a higher resolution results in more, smaller clusters.

- `clustering_column`: To choose a resolution, change the number in `leiden_0.5`, for example `leiden_0.1` for a resolution of `0.1`. Higher values lead to more clusters.

In [None]:
# Optional: cluster at every resolution in `cluster_res_range` and plot the results side by side
if search_clustering_parameters:
    pl.clustering.search_clustering_parameters(
        adata,
        method=clustering_method,
        resolution_range=cluster_res_range,
        embedding=embedding,
        ncols=cluster_ncols,
        save="clustering_search.png",
    )

<h1><center>⬐ Fill in input data here ⬎</center></h1>

In [None]:
# Choose the final resolution by naming the adata.obs column that holds the clustering.
# NOTE(review): the "leiden_" prefix presumably has to match `clustering_method`
# (e.g. "louvain_0.5" when louvain is used) — confirm against the column names in adata.obs.
clustering_column = "leiden_0.5"

___

### 7.1 - Revise clustering (optional)
Here you can use the `tools.clustering.recluster` function to iteratively adjust clustering. The two cells below are provided as a template for either *joining* clusters (first cell) or *splitting* clusters (second cell). Fill in the list at the top of the respective cell to activate *joining* or *splitting*. You can __copy the cells as needed__ for more combining and splitting steps.

In [None]:
# Join several clusters into a single one.
# Leave the list empty to skip this step.
combine = []  # names of the clusters to join

if combine:
    tools.clustering.recluster(
        adata=adata,
        task="join",
        column=clustering_column,
        clusters=combine,
        embedding=embedding,
        key_added="recluster",
    )

    # Continue with the revised clustering from here on
    clustering_column = "recluster"

In [None]:
# Split (recluster) one or more clusters into smaller ones.
# Leave the list empty to skip this step.
split = []  # names of the clusters to split
resolution = 0.15  # 0-1, smaller values create fewer clusters

if split:
    tools.clustering.recluster(
        adata=adata,
        task="split",
        column=clustering_column,
        clusters=split,
        resolution=resolution,
        embedding=embedding,
        key_added="recluster",
    )

    # Continue with the revised clustering from here on
    clustering_column = "recluster"

In [None]:
# Store the selected (possibly revised) clustering under the fixed name "clustering",
# with its categories renamed by the sctoolbox helper
selected_clustering = adata.obs[clustering_column]
adata.obs["clustering"] = utils.tables.rename_categories(selected_clustering)

___

### 7.2 - Final clustering of cells

In [None]:
# Plot the final embedding colored by condition and by the final clustering
_ = pl.embedding.plot_embedding(
    adata,
    method=embedding,
    color=[condition_column, "clustering"],
    save="embedding_clustering.pdf",
)

---------

## 8 - Plot distribution of cells across clusters

In [None]:
# Barplot of the number of cells per cluster, grouped by the condition column
_ = pl.qc_filter.n_cells_barplot(adata, "clustering", groupby=condition_column, save="cell_distribution_barplot.pdf")

------

## 9 - Generating 3D Object with UMAP coordinates in HTML

This optional step can be used to create a 3-dimensional UMAP. However, this is **disabled by default** since the process of optimizing in 3D or higher-dimensional space usually **degrades the lower-dimensional (2D) representation of the UMAP**. Set `n_components` >= 3 in section 4 (General input) to enable this step, but be aware of the implications.

<h1><center>⬐ Fill in input data here ⬎</center></h1>

In [None]:
# adata.obs column used to color the 3D UMAP HTML plot.
# Defaults to the final "clustering"; change to an individual leiden column or another column if needed.
column_3d = "clustering"

___

In [None]:
# The 3D plot is only produced for a UMAP computed with three or more components
if embedding == "umap" and n_components > 2:
    from IPython.display import IFrame

    plot_name = f"umap_3d_{column_3d}"
    pl.embedding.plot_3D_UMAP(adata, column_3d, save=plot_name)

    # Embed the written HTML file in the notebook
    html_file = sctoolbox.settings.full_figure_prefix + plot_name + ".html"
    display(IFrame(src=html_file, width=800, height=400))

---------

## 10 - Saving adata for next notebook

In [None]:
# Write the anndata object, including the final embedding and "clustering" column, for the next notebook
utils.adata.save_h5ad(adata, "anndata_4.h5ad")

In [None]:
# Close the sctoolbox logfile at the end of the notebook
sctoolbox.settings.close_logfile()