# Clustering

## Settings

In [None]:
# Column used to color the embedding plots and to group the cell-count barplot
metacol = "sample"

# Thread count handed to the embedding parameter searches
threads = 4

# Embedding type: "umap" or "tsne"
embedding = "umap"
# Whether to run the embedding parameter search (final values are chosen in a later cell)
search_parameters = True

# --- UMAP settings ---
dist_range = (0.1, 0.3, 0.1)     # min_dist range to search
spread_range = (1.0, 2.0, 0.5)   # spread range to search
n_components = 2                 # number of UMAP components

# --- t-SNE settings ---
perplexity_range = (30, 60, 10)          # perplexity range to search
learning_rate_range = (600, 1000, 200)   # learning_rate range to search

# Clustering: algorithm ("leiden" or "louvain") and whether to plot a range of resolutions
clustering_method = "leiden"
search_clustering_parameters = True

## Loading packages and setup

In [None]:
import sctoolbox.tools as tools
import sctoolbox.plotting as pl
import sctoolbox.utils as utils
import scanpy as sc

# Apply the settings stored under key "04" of config.yaml
utils.settings_from_config("config.yaml", key="04")

# Figure defaults for saved plots
sc.set_figure_params(
    vector_friendly=True,
    dpi_save=600,
    scanpy=False,  # do not apply scanpy's own default matplotlib rcParams
)

## Load anndata 

In [None]:
# Read the AnnData object from "anndata_3.h5ad" and show its summary
adata = utils.load_h5ad("anndata_3.h5ad")
display(adata)

## Embedding

In [None]:
# Optional parameter search for the selected embedding; the resulting
# overview plot is written to a PDF. Nothing runs if search_parameters is False.
if search_parameters and embedding == "umap":
    pl.search_umap_parameters(
        adata,
        min_dist_range=dist_range,
        spread_range=spread_range,
        color=metacol,
        n_components=n_components,
        threads=threads,
        save="UMAP_parameter_search.pdf",
    )
elif search_parameters and embedding == "tsne":
    pl.search_tsne_parameters(
        adata,
        perplexity_range=perplexity_range,
        learning_rate_range=learning_rate_range,
        color=metacol,
        threads=threads,
        save="TSNE_parameter_search.pdf",
    )

In [None]:
# Final UMAP parameters (used when embedding == "umap")
min_dist = 0.2
spread = 1.5

# Final t-SNE parameters (used when embedding == "tsne")
perplexity = 50
learning_rate = 800

In [None]:
# Calculate the final embedding with the parameters chosen above
if embedding == "umap":
    sc.tl.umap(adata, min_dist=min_dist, spread=spread, n_components=n_components)
elif embedding == "tsne":
    sc.tl.tsne(adata, perplexity=perplexity, learning_rate=learning_rate)
else:
    # Fail early with a clear message instead of silently computing nothing
    # and hitting a missing-embedding error in the plotting cells below.
    raise ValueError(f"Unknown embedding '{embedding}': expected 'umap' or 'tsne'")

In [None]:
# Columns to color the embedding by: metacol followed by the entries of
# adata.uns["sctoolbox"]["obs_metrics"]. Adjust this list to change the plot.
qc_columns = [metacol, *adata.uns["sctoolbox"]["obs_metrics"]]

In [None]:
# Draw the final embedding colored by each entry of qc_columns (3 panels per row)
sc.pl.embedding(
    adata,
    basis=embedding,
    color=qc_columns,
    ncols=3,
    show=False,  # not shown here; the figure is saved on the next line
)
pl._save_figure("embedding_quality.pdf")

--------------

## Cell Clustering
- NOTE: The `resolution` parameter controls the coarseness of the clustering; higher values lead to more clusters.

In [None]:
# Optionally plot the clustering at different resolutions to help choose one
if search_clustering_parameters:
    pl.search_clustering_parameters(
        adata,
        method=clustering_method,
        ncols=4,
    )

In [None]:
# Choose the final resolution. It is kept as a string because it is used
# to build the name of the clustering column in the next cell.
resolution = "0.1"

In [None]:
# Name of the clustering column, following the "<method>_<resolution>" pattern.
# The prefix comes from clustering_method so that "louvain" works as well.
clustering_column = clustering_method + "_" + str(resolution)

# If the resolution search above was skipped (or did not cover this
# resolution), the column does not exist yet, so compute the clustering here.
if clustering_column not in adata.obs.columns:
    if clustering_method not in ("leiden", "louvain"):
        raise ValueError(f"Unknown clustering_method '{clustering_method}': expected 'leiden' or 'louvain'")
    cluster_func = getattr(sc.tl, clustering_method)
    cluster_func(adata, resolution=float(resolution), key_added=clustering_column)

# Record the chosen clustering column in the sctoolbox metadata
adata.uns['sctoolbox']['clustering'] = [clustering_column]

### Reclustering
- Based on the last two plots (mainly the heatmap), decide whether and how to recluster, i.e. which clusters to join or split.

In [None]:
# Decide whether to recluster: "yes" triggers the interactive prompts in the next cell.
# To answer interactively, uncomment the following line and remove the fixed value below.
#recluster_dec = input('Do you want to recluster? answer with yes or no: ')
recluster_dec = "no"

In [None]:
# Optionally join or split selected clusters of the chosen clustering
if recluster_dec.strip().lower() == 'yes':
    recluster_cols = input('Which clusters do you want to recluster? Enter cluster numbers sperated by commas: ')
    # Strip whitespace so that input such as "1, 2" matches the cluster names,
    # and drop empty entries caused by stray commas.
    recluster_cols = [name.strip() for name in recluster_cols.split(',') if name.strip()]
    # Normalise the answer so that e.g. "Join " is accepted
    # (assumes the task values are lowercase "join"/"split" — confirm against tools.recluster).
    join_split = input("Do you want to join or split the clusters? ").strip().lower()
    tools.recluster(adata, clustering_column, recluster_cols, task=join_split)
    clustering_column += "_recluster"  #update clustering column

In [None]:
# Create the final clustering: pass the chosen clustering column through
# utils.rename_categories and store the result as adata.obs["clustering"]
adata.obs["clustering"] = utils.rename_categories(adata.obs[clustering_column])

### Final clustering of cells

In [None]:
# Plot the final clustering next to the metacol annotation on the chosen embedding
sc.pl.embedding(
    adata,
    basis=f"X_{embedding}",
    color=[metacol, "clustering"],
    show=False,  # not shown here; the figure is saved on the next line
)
pl._save_figure("embedding_clustering.pdf")

## Plot distribution of cells across clusters

In [None]:
# Barplot of cells per final cluster, grouped by metacol; saved as PDF.
# The return value is assigned to "_" so it is not echoed as cell output.
_ = pl.n_cells_barplot(
    adata,
    "clustering",
    groupby=metacol,
    save="cell_distribution_barplot.pdf",
)

---------

## Save anndata

In [None]:
# Write the AnnData object, including the final clustering, to "anndata_4.h5ad"
utils.save_h5ad(adata, "anndata_4.h5ad")