In [None]:
from sctoolbox.utilities import bgcolor

# Embedding and clustering
<hr style="border:2px solid black">

<h1><center>⬐ Fill in input data here ⬎</center></h1>

In [None]:
%bgcolor PowderBlue

# ---------------------------------------------------------------------
# Run identifier
# ---------------------------------------------------------------------
run_id = "Run1"

# ---------------------------------------------------------------------
# Inputs
# ---------------------------------------------------------------------
# Directory where the preprocessed data is located (currently disabled):
#path_imput="/mnt/agnerds/loosolab_SC_RNA_framework/raw_data"

# ---------------------------------------------------------------------
# Stage of analysis
# ---------------------------------------------------------------------
# Set to "Yes" if this is the final run before sending results to collaborators.
is_it_the_final_run = "No"

# ---------------------------------------------------------------------
# Dimensionality reduction / embedding
# ---------------------------------------------------------------------
# Number of PCs: a value here overrides the automatic selection of PCs.
# Leave as None to have the number of PCs chosen automatically.
n_pcs = None

# Toggle the UMAP parameter grid search.
search_umap_parameters = True

# Ranges explored by the UMAP parameter search.
dist_range = (0.1, 1, 0.2)      # min_dist range for UMAP
spread_range = (1.0, 3.5, 0.5)  # spread range for UMAP

# ---------------------------------------------------------------------
# Clustering
# ---------------------------------------------------------------------
# Number of nearest neighbors to be used in clustering (default: 15).
n_neighbors = 15

# Toggle the clustering resolution search and pick the algorithm.
search_clustering_parameters = True
clustering_method = "leiden"  # "leiden" or "louvain"

# ---------------------------------------------------------------------
# Colors
# ---------------------------------------------------------------------
color_list = [
    'green', 'red', 'blue', 'pink', 'chartreuse', 'gray', 'yellow', 'brown', 'purple', 'orange',
    'wheat', 'lightseagreen', 'cyan', 'khaki', 'cornflowerblue', 'olive', 'gainsboro', 'darkmagenta', 'slategray', 'ivory',
    'darkorchid', 'papayawhip', 'paleturquoise', 'oldlace', 'orangered', 'lavenderblush', 'gold', 'seagreen', 'deepskyblue', 'lavender',
    'peru', 'silver', 'midnightblue', 'antiquewhite', 'blanchedalmond', 'firebrick', 'greenyellow', 'thistle', 'powderblue', 'darkseagreen',
    'darkolivegreen', 'moccasin', 'olivedrab', 'mediumseagreen', 'lightgray', 'darkgreen', 'tan', 'yellowgreen', 'peachpuff', 'cornsilk',
    'darkblue', 'violet', 'cadetblue', 'palegoldenrod', 'darkturquoise', 'sienna', 'mediumorchid', 'springgreen', 'darkgoldenrod', 'magenta',
    'steelblue', 'navy', 'lightgoldenrodyellow', 'saddlebrown', 'aliceblue', 'beige', 'hotpink', 'aquamarine', 'tomato', 'darksalmon',
    'navajowhite', 'lawngreen', 'lightsteelblue', 'crimson', 'mediumturquoise', 'mistyrose', 'lightcoral', 'mediumaquamarine', 'mediumblue', 'darkred',
    'lightskyblue', 'mediumspringgreen', 'darkviolet', 'royalblue', 'seashell', 'azure', 'lightgreen', 'fuchsia', 'floralwhite', 'mintcream',
    'lightcyan', 'bisque', 'deeppink', 'limegreen', 'lightblue', 'darkkhaki', 'maroon', 'aqua', 'lightyellow', 'plum',
    'indianred', 'linen', 'honeydew', 'burlywood', 'goldenrod', 'mediumslateblue', 'lime', 'lightslategray', 'forestgreen', 'dimgray',
    'lemonchiffon', 'darkgray', 'dodgerblue', 'darkcyan', 'orchid', 'blueviolet', 'mediumpurple', 'darkslategray', 'turquoise', 'salmon',
    'lightsalmon', 'coral', 'lightpink', 'slateblue', 'darkslateblue', 'white', 'sandybrown', 'chocolate', 'teal', 'mediumvioletred',
    'skyblue', 'snow', 'palegreen', 'ghostwhite', 'indigo', 'rosybrown', 'palevioletred', 'darkorange', 'whitesmoke',
]

<hr style="border:2px solid black">

## Loading packages

In [None]:
import os
from os import path
import scanpy as sc
from kneed import KneeLocator
import matplotlib.pyplot as plt

import sctoolbox.utilities as utils
import sctoolbox.analyser as analyser
import sctoolbox.plotting as pl

## Load anndata from previous notebook

In [None]:
# Load the AnnData object saved by notebook 3 and display its summary.
adata = utils.load_anndata(which_notebook=3, is_from_previous_note=True)
display(adata)

In [None]:
# Column to evaluate, read from the "infoprocess" metadata stored in adata.uns.
condition_column = adata.uns["infoprocess"]["data_to_evaluate"]

In [None]:
# Prefix for all figures written by this notebook: the stored path plus a trailing slash.
figure_path = f"{adata.uns['infoprocess']['Anndata_path']}/"

## Subset number of PCs 
 Find initial neighbors and calculate differential expression
- NOTE: trim: trims the neighbours of each cell, which can help to identify individual populations (default: 10; set to 0 to skip trimming). Lower values give more populations but a stronger batch effect.
- NOTE: n_pcs: number of PCA dimensions to use.

In [None]:
# Keep a user-supplied n_pcs; otherwise let analyser.define_PC choose one from adata.
n_pcs = analyser.define_PC(adata) if n_pcs is None else n_pcs

In [None]:
# Plot the PCA variance and mark the number of PCs that will be kept.
ax = pl.plot_pca_variance(adata)

# The marker is drawn at n_pcs - 0.5 — presumably so it falls between the last
# included PC and the first excluded one; confirm against the plot's x positions.
ax.axvline(n_pcs - 0.5, color="red", label=f"n PCs included: {n_pcs}")

# Build the legend on the same axes that owns the labelled line instead of
# relying on pyplot's notion of the "current" axes.
ax.legend()

utils.save_figure(figure_path + "PC_selection.pdf")

In [None]:
# Subset PCA
# Applies the chosen n_pcs to adata; presumably keeps only the first n_pcs
# components in place — confirm against sctoolbox.analyser.subset_PCA.
analyser.subset_PCA(adata, n_pcs)

## Rerun neighbors

In [None]:
# Recompute the neighbor graph using the configured neighbor count and number of PCs.
sc.pp.neighbors(
    adata,
    n_pcs=n_pcs,
    n_neighbors=n_neighbors,
)

## Calculate UMAP and find best setting

- NOTE: min_dist: the minimum distance between embedded points; smaller values make the plot look more 'clustered'.
- NOTE: spread: the effective scale of the embedded points (default: 1).

In [None]:
# Optional grid search over the configured min_dist / spread ranges;
# the overview figure is written to the figure directory.
if search_umap_parameters:
    pl.search_umap_parameters(
        adata,
        metacol=condition_column,
        dist_range=dist_range,
        spread_range=spread_range,
        save=figure_path + "UMAP_parameter_search.pdf",
    )

In [None]:
%bgcolor PowderBlue

# Final UMAP settings (min_dist, spread) used for the embedding below.
min_dist, spread = 0.4, 2.5

In [None]:
# Compute the final UMAP with the chosen min_dist / spread,
# then plot it colored by the condition column.
sc.tl.umap(adata, spread=spread, min_dist=min_dist)
sc.pl.umap(adata, color=condition_column)

In [None]:
# Show how the samples are distributed in the UMAP, grouped by the condition column,
# and save the figure.
pl.plot_group_embeddings(
    adata,
    groupby=condition_column,
    save=figure_path + "sample_distribution_umap.png",
)

## Cell clustering
- NOTE: resolution: controls the coarseness of the clustering. Higher values lead to more clusters.

In [None]:
# Optional search over clustering settings with the configured method;
# the overview figure (4 columns) is written to the figure directory.
if search_clustering_parameters:
    pl.search_clustering_parameters(
        adata,
        method=clustering_method,
        ncols=4,
        save=figure_path + "clustering_search.png",
    )

In [None]:
%bgcolor PowderBlue

# Choose the final resolution.
# The column name follows the configured clustering method so that switching
# clustering_method (leiden/louvain) does not leave this pointing at a column
# of the other algorithm. The "<method>_<resolution>" naming is inferred from
# the previous "leiden_0.5" value — confirm against the columns in adata.obs.
clustering_column = f"{clustering_method}_0.5"

### Reclustering

- Based on the last two plots, mainly the heatmap, decide how to recluster (i.e. which clusters to join or split).

In [None]:
# Join clusters "1" and "3" of the chosen clustering into a new column.
# The target column name is derived from clustering_column and passed
# explicitly, so it stays in sync with whichever clustering was chosen above
# instead of being pinned to one hard-coded name.
# NOTE: clustering_column is reassigned here; re-run the cell that sets it
# before re-running this cell.
recluster_column = f"{clustering_column}_recluster"
analyser.recluster(adata, clustering_column, ["1", "3"], task="join", key_added=recluster_column)
clustering_column = recluster_column  # update clustering column

In [None]:
# Split cluster "3" at resolution 0.15. key_added is the same column that is
# read, so the result overwrites the current clustering column.
analyser.recluster(
    adata,
    clustering_column,
    ["3"],
    task="split",
    resolution=0.15,
    key_added=clustering_column,
)

In [None]:
# Store the final clustering under the fixed column name "clustering".
# rename_categories presumably relabels the cluster categories — confirm in sctoolbox.analyser.
adata.obs["clustering"] = analyser.rename_categories(
    adata.obs[clustering_column]
)

### Final clustering

In [None]:
# Plot the final UMAP colored by condition and by the final clustering.
# show=False is passed so the figure is saved by utils.save_figure afterwards.
sc.pl.umap(
    adata,
    color=[condition_column, "clustering"],
    show=False,
)
utils.save_figure(figure_path + "umap_final.pdf")

## Plot distribution of cells across clusters

In [None]:
# Bar plot of cell counts per final cluster, grouped by the condition column.
pl.n_cells_barplot(
    adata,
    "clustering",
    groupby=condition_column,
    save=figure_path + "cell_distribution_barplot.pdf",
)

## Saving adata for next notebook

In [None]:
# Save adata as the output of notebook 4 so that the next notebook can load it.
utils.saving_anndata(adata, current_notebook=4)