# Clustering

## Settings

In [None]:
# ------------------------- PCA / embedding -------------------------
# True: scan a grid of UMAP parameters before the final embedding is computed.
run_umap = True
# Number of principal components to use; None lets tools.define_PC pick a value.
n_pcs = 20
# Forwarded as `start` to tools.subset_PCA.
first_pc = 0
# Used to color the UMAP parameter search (presumably an adata.obs column).
metacol = 'sample'

# ---------------------------- clustering ---------------------------
# Clustering method: 'leiden' or 'louvain'.
method = 'leiden'
# Number of nearest neighbors passed to sc.pp.neighbors (default: 15).
n_neighbors = 15
# True: a range of resolutions is calculated and plotted so the best fit
# can be chosen by eye. False: the resolution has to be set manually.
search_clustering_parameters = True
# NOTE(review): no visible cell reads `res`; the final resolution is
# assigned in the "Cell Clustering" section - confirm which one is intended.
res = 0.5

# ----------------------------- plotting ----------------------------
# Name of the condition column in adata.obs, used for plotting.
condition_column = 'sample'

# Whether figures should be saved.
save_figs = False

## Loading packages and setup

In [None]:
import sctoolbox.tools as tools
import sctoolbox.plotting as pl
import sctoolbox.utils as utils
import scanpy as sc

# Configure sctoolbox from the project config file, using the entry stored
# under key "04" (presumably the section belonging to this notebook).
utils.settings_from_config(
    "config.yaml",
    key="04",
)

## Load anndata 

In [None]:
# Read the AnnData object this notebook works on.
adata = utils.load_h5ad("anndata_3.h5ad")

# Show a summary of the loaded object in the notebook output.
display(adata)

## PCA

In [None]:
# maybe run PCA again after batch correction?
#sc.pp.pca(adata, svd_solver='arpack', n_comps=50, use_highly_variable=True)

### Subset number of PCs
Find initial neighbors and calculate differential expression.
- NOTE: trim: trims the neighbours of each cell, which can help to identify individual populations (default: 10; set to 0 to skip). Lower values give more populations but also more batch effect.
- NOTE: n_pcs: number of PCA dimensions to use.

In [None]:
# Keep a user-supplied number of PCs; otherwise let sctoolbox propose one.
n_pcs = tools.define_PC(adata) if n_pcs is None else n_pcs

In [None]:
# Plot the PCA variance with the selected number of PCs marked and write
# the figure to "PC_selection.pdf".
ax = pl.plot_pca_variance(
    adata,
    n_selected=n_pcs,
    save="PC_selection.pdf",
)

In [None]:
# Restrict the PCA stored in adata to the chosen components, starting at
# `first_pc`.
tools.subset_PCA(
    adata,
    n_pcs,
    start=first_pc,
)

## Calculate neighbours

In [None]:
# do we have to run neighbors again after batch correction?
# # Change to module
# if save_figs:
#     epi.pl.pca_overview(adata, color=['nb_features'], show=False)
#     #plt.savefig(f'{OUTPUT_FIGS}/pca_nb_features.png')
#     plt.show()
# else:
#     epi.pl.pca_overview(adata, color=['nb_features'])

In [None]:
# Build the nearest-neighbor graph used by the embedding and clustering steps.
# NOTE(review): if tools.subset_PCA drops leading components (first_pc > 0),
# the stored PCA may hold fewer than `n_pcs` columns - confirm that `n_pcs`
# is still valid here in that case.
sc.pp.neighbors(
    adata,
    n_neighbors=n_neighbors,
    n_pcs=n_pcs,
)

## UMAP

In [None]:
# Optional scan over UMAP parameters, colored by `metacol`, to help pick
# `min_dist` and `spread` for the final embedding.
# The range tuples are presumably (start, stop, step) - confirm against sctoolbox.
if run_umap:
    pl.search_umap_parameters(
        adata,
        min_dist_range=(0.1, 0.4, 0.1),
        spread_range=(2.0, 3.0, 0.5),
        color=metacol,
        n_components=2,
        verbose=True,
        threads=4,
        save=None,
    )

In [None]:
# Final UMAP parameters, picked after inspecting the parameter search above.
min_dist, spread = 0.2, 2.5

In [None]:
# Compute the UMAP embedding with the chosen parameters.
sc.tl.umap(
    adata,
    min_dist=min_dist,
    spread=spread,
)

# Show the embedding colored by the condition column.
sc.pl.umap(adata, color=condition_column)

## Cell Clustering
- NOTE: resolution: controls the coarseness of the clustering. Higher values lead to more clusters.

In [None]:
# Optionally plot the clustering at several resolutions (four panels per
# row) so that a suitable resolution can be chosen.
if search_clustering_parameters:
    pl.search_clustering_parameters(
        adata,
        method=method,
        ncols=4,
    )

In [None]:
# Final clustering resolution. Kept as a string because it is used as the
# suffix of the clustering column name.
resolution = "0.1"

In [None]:
# Name of the adata.obs column holding the chosen clustering. The prefix
# follows the configured `method` (instead of a hard-coded "leiden_") so the
# name stays correct when method = 'louvain'.
clustering_column = f"{method}_{resolution}"

# Record the selected clustering column in the sctoolbox metadata.
adata.uns['sctoolbox']['clustering'] = [clustering_column]

### Reclustering
- Based on the last two plots, mainly the heatmap, decide whether and how to recluster.

In [None]:
# Whether to recluster ("yes" / "no"). For an interactive prompt use instead:
# recluster_dec = input('Do you want to recluster? answer with yes or no: ')
recluster_dec = "no"

In [None]:
# Interactive reclustering: ask which clusters to change and whether to join
# or split them, then hand the request to tools.recluster.
if recluster_dec.strip().lower() == 'yes':
    raw_clusters = input('Which clusters do you want to recluster? Enter cluster numbers separated by commas: ')
    # Strip whitespace around every entry and drop empty ones, so that input
    # such as "1, 2," yields ["1", "2"] rather than ["1", " 2", ""].
    recluster_cols = [entry.strip() for entry in raw_clusters.split(',') if entry.strip()]

    # Normalize the answer so "Join " or "SPLIT" are passed on as "join" / "split".
    join_split = input("Do you want to join or split the clusters? ").strip().lower()

    tools.recluster(adata, clustering_column, recluster_cols, task=join_split)
    # Point at the reclustered column from here on (presumably the name under
    # which tools.recluster stores its result - confirm against sctoolbox).
    clustering_column += "_recluster"

In [None]:
# Store the final clustering under a fixed column name, with the categories
# renamed by utils.rename_categories.
adata.obs["clustering"] = utils.rename_categories(
    adata.obs[clustering_column]
)

### Final clustering

In [None]:
# Plot the UMAP colored by condition and by the final clustering.
sc.pl.umap(
    adata,
    color=[condition_column, "clustering"],
)
#utils.save_figure(figure_path + "umap_final.pdf")

## Plot distribution of cells across clusters

In [None]:
# n_cells_barplot(adata, "clustering", groupby=condition_column,
#                 save=figure_path + "cell_distribution_barplot.pdf")


## Save anndata

In [None]:
# Write the clustered AnnData object to disk for the next notebook.
utils.save_h5ad(
    adata,
    "anndata_4.h5ad",
)