# Clustering

## Settings

#Path related settings (these should be the same as for the previous notebook)
output_dir = '/home/rstudio/processed_data'
test = 'cropped_146'

In [128]:
# Location of the processed data and name of the run to load.
# Both values must match the ones used in the previous notebook.
output_dir = "/mnt/workspace/jdetlef/processed_data"
test = "Esophagus"

In [129]:
# Set to True to rerun the UMAP parameter search and embedding
run_umap = False

##################### CLUSTERING ########################
# Clustering method: 'leiden' or 'louvain'
method = "leiden"
# True:  a range of resolutions is calculated and plotted, so the best fit can be chosen.
# False: set the resolution parameter `res` below.
# NOTE(review): this flag has the same name as the function called in the
# "Cell Clustering" section; the star imports that run after this cell may
# rebind the name - confirm the flag still has an effect.
search_clustering_parameters = True
res = 0.5

# Name of the condition column in adata.obs; used for plotting
condition_column = "sample"

# Save figures
save_figs = False

## Loading packages and setup

In [130]:
# sctoolbox modules
import sctoolbox.atac_tree as sub_tree
import sctoolbox.creators as cr
import sctoolbox.annotation as an
from sctoolbox.qc_filter import *
from sctoolbox.plotting import *
from sctoolbox.atac_utils import *
from sctoolbox.analyser import *
import sctoolbox.atac as atac
# bind the module name itself: the star import above does not define `analyser`
import sctoolbox.analyser as analyser
# import episcanpy
import scanpy as sc
import episcanpy as epi
#from episcanpy.preprocessing import _decomposition
import numpy as np

## Setup path handling object 

In [131]:
# Create the path handling object
tree = sub_tree.ATAC_tree()
# Point it at the processing/output directory chosen in the settings
tree.processing_dir = output_dir
# Select the sample/experiment (run) chosen in the settings
tree.run = test

all directories existing
all directories existing


## Load anndata 

In [132]:
# Path of the norm_correction anndata as reported by the tree object.
# NOTE(review): the original comment states this path is "not yet implemented
# in generalized_tree" - confirm the attribute is available in the tree class used.
norm_correction_output = tree.norm_correction_anndata
# Read the AnnData object from that .h5ad file
adata = epi.read_h5ad(norm_correction_output)
# Last expression of the cell: displays a summary of the loaded object
adata

AnnData object with n_obs × n_vars = 45493 × 148047
    obs: 'TN', 'UM', 'PP', 'UQ', 'CM', 'file', 'sample', 'n_features_by_counts', 'log1p_n_features_by_counts', 'total_counts', 'log1p_total_counts', 'insertsize_count', 'mean_insertsize', 'n_total_fragments', 'n_fragments_in_promoters', 'pct_fragments_in_promoters', 'nb_features', 'log_nb_features'
    var: 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'n_cells', 'commonness', 'prop_shared_cells', 'variability_score'
    uns: 'color_set', 'infoprocess', 'insertsize_distribution', 'legend'
    layers: 'binary'

## PCA

In [138]:
# maybe run PCA again after batch correction?
#sc.pp.pca(adata, svd_solver='arpack', n_comps=50, use_highly_variable=True)

## Calculate neighbors

In [1]:
# do we have to run neighbors again after batch correction?
# # Change to module
# if save_figs:
#     epi.pl.pca_overview(adata, color=['nb_features'], show=False)
#     #plt.savefig(f'{OUTPUT_FIGS}/pca_nb_features.png')
#     plt.show()
# else:
#     epi.pl.pca_overview(adata, color=['nb_features'])

In [140]:
#sc.pp.neighbors(adata, n_neighbors=15, n_pcs=50, method='umap', metric='euclidean')

## UMAP

In [None]:
# Explore combinations of UMAP parameters; only runs when a rerun of the
# embedding was requested in the settings.
if run_umap:
    search_umap_parameters(
        adata,
        dist_range=(0.1, 0.4, 0.1),
        spread_range=(2.0, 3.0, 0.5),
        # Follow the `condition_column` setting instead of a hard-coded
        # 'sample', so this plot stays in sync with the other plots.
        metacol=condition_column,
        n_components=2,
        verbose=True,
        threads=4,
        save=None,
    )

In [2]:
# Compute the final UMAP from parameters entered by the user.
if run_umap:
    # Prompts are asked in this order: min_dist first, then spread.
    min_dist, spread = (
        float(input(prompt))
        for prompt in ('Input min_dist parameter: ', 'Input spread parameter: ')
    )
    # Calculate the embedding, then plot it colored by the condition column
    sc.tl.umap(adata, spread=spread, min_dist=min_dist)
    sc.pl.umap(adata, color=condition_column)

## Cell Clustering
- NOTE: `resolution` controls the coarseness of the clustering; higher values lead to more clusters.

In [None]:
# Plot different clustering resolutions so the final one can be chosen.
# The settings flag and the function share one name. If the settings cell is
# re-run after the imports, the bare name is the bool flag and calling it
# raises "TypeError: 'bool' object is not callable". Calling through the module
# works in both cases.
# NOTE(review): assumes sctoolbox.analyser provides search_clustering_parameters - confirm.
if search_clustering_parameters:
    analyser.search_clustering_parameters(adata, ncols=4, method=method)

In [None]:
# Choose the final resolution and derive the name of the clustering column.
if search_clustering_parameters:
    # input() returns a string; strip stray whitespace so the column name matches
    res = input('Input resolution for final clustering: ').strip()

# Build the name from the configured method instead of a hard-coded "leiden_".
# The f-string also handles the numeric `res` from the settings cell.
# NOTE(review): assumes the search stores its results as "<method>_<resolution>" - confirm.
clustering_column = f"{method}_{res}"

# If the column does not exist yet (search skipped, or a resolution that was
# not part of the search was entered), compute the clustering now, so the
# following cells always find `clustering_column` in adata.obs.
if clustering_column not in adata.obs.columns:
    cluster_functions = {"leiden": sc.tl.leiden, "louvain": sc.tl.louvain}
    if method not in cluster_functions:
        raise ValueError(f"Unknown clustering method '{method}'; use 'leiden' or 'louvain'.")
    cluster_functions[method](adata, resolution=float(res), key_added=clustering_column)

### Reclustering
- Based on the last two plots (mainly the heatmap), decide whether and how to recluster.

In [None]:
# Ask whether reclustering is wanted; the answer is compared against 'yes' in the next cell
recluster_dec = input('Do you want to recluster? answer with yes or no: ')

In [None]:
# Recluster only on an explicit "yes"; surrounding whitespace and letter case
# of the answer are ignored.
if recluster_dec.strip().lower() == 'yes':
    recluster_cols = input('Which clusters do you want to recluster? Enter cluster numbers sperated by commas: ')
    # "1, 2" must yield ['1', '2']: strip every entry and drop empty ones,
    # otherwise ' 2' would not match any cluster label
    recluster_cols = [name.strip() for name in recluster_cols.split(',') if name.strip()]
    # Normalise the task so that "Join " is accepted like "join"
    join_split = input("Do you want to join or split the clusters? ").strip().lower()
    recluster(adata, clustering_column, recluster_cols, task=join_split)
    clustering_column += "_recluster"  # update clustering column

In [None]:
# Create the final clustering: store the chosen clustering column under the
# fixed name "clustering", after passing it through rename_categories
# (presumably renumbers the cluster labels - confirm against sctoolbox).
adata.obs["clustering"] = analyser.rename_categories(adata.obs[clustering_column])

### Final clustering

In [None]:
# Plot the final UMAP, colored by the condition column and by the final clustering
sc.pl.umap(
    adata,
    color=[condition_column, "clustering"],
)
# utils.save_figure(figure_path + "umap_final.pdf")

## Plot distribution of cells across clusters

In [None]:
# Bar plot of the cells per final cluster, grouped by the condition column
n_cells_barplot(
    adata,
    "clustering",
    groupby=condition_column,
    # save=figure_path + "cell_distribution_barplot.pdf",
)


## Save anndata