## Notebook for re-clustering as needed

In [None]:
# shell out to `date` so the run's start time is recorded in the cell output
!date

#### import libraries

In [None]:
import scanpy as sc
from pandas import DataFrame
from matplotlib.pyplot import rc_context
import matplotlib.pyplot as plt
from seaborn import lineplot
from sklearn.metrics import silhouette_score
from numpy import arange, mean

# for white background of figures (only for docs rendering)
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
%config InlineBackend.figure_format='retina'

#### set notebook variables

In [None]:
# naming: prefix shared by the input, output and figure file names below
project = 'aging_phase2'

# directories
# NOTE(review): wrk_dir is a hard-coded absolute path; change it here when running elsewhere
wrk_dir = '/labshare/raph/datasets/adrd_neuro/brain_aging/phase2'
quants_dir = f'{wrk_dir}/quants'
figures_dir = f'{wrk_dir}/figures'
# point scanpy's figure directory at figures_dir (used by the `save=` plots below)
sc.settings.figdir = f'{figures_dir}/'

# in files: AnnData object read at the start of the notebook
anndata_file = f'{quants_dir}/{project}.multivi.curated.h5ad'

# out files: AnnData object written after re-clustering and re-labeling
final_file = f'{quants_dir}/{project}.multivi.curated_final.h5ad'

# variables
# DEBUG gates the extra display() output in the cells below
DEBUG = False

### load data

#### load the anndata object

In [None]:
%%time
# read the input AnnData object from anndata_file and print its summary
adata_multivi = sc.read_h5ad(anndata_file)
print(adata_multivi)
# optionally peek at the first rows of the per-cell annotations
if DEBUG:
    display(adata_multivi.obs.head())

In [None]:
# cell counts per curated type as loaded (debug only)
if DEBUG:
    display(adata_multivi.obs['curated_type'].value_counts())

### visualize the final curated cell-types

In [None]:
# UMAP colored by the curated cell-type labels, written via scanpy's `save=`
figure_file = f'_{project}.umap.curated_celltype.png'
plot_rc = {'figure.figsize': (8, 8), 'figure.dpi': 100}
with rc_context(plot_rc):
    plt.style.use('seaborn-v0_8-talk')
    sc.pl.umap(
        adata_multivi,
        color=['curated_type'],
        frameon=False,
        legend_loc='on data',
        save=figure_file,
    )

In [None]:
# UMAP colored by the `Cell_type` column, for comparison with the curated labels
figure_file = f'_{project}.umap.curated_prev_celltypes.png'
plot_rc = {'figure.figsize': (8, 8), 'figure.dpi': 100}
with rc_context(plot_rc):
    plt.style.use('seaborn-v0_8-talk')
    sc.pl.umap(
        adata_multivi,
        color=['Cell_type'],
        frameon=False,
        legend_loc='on data',
        save=figure_file,
    )

### check range of Leiden resolutions for clustering

In [None]:
%%time
resolutions_to_try = arange(0.3, 1.05, 0.05)
print(resolutions_to_try)
mean_scores = {}
largest_score = 0
best_res = 0
new_leiden_key = 'leiden_MultiVI'
for leiden_res in resolutions_to_try:
    # use only 2 decimals
    leiden_res = round(leiden_res, 2)    
    print(f'### using Leiden resolution of {leiden_res}')
    # neighbors were already computed using scVI
    sc.tl.leiden(adata_multivi, key_added=new_leiden_key, resolution=leiden_res)
    silhouette_avg = silhouette_score(adata_multivi.obsm['MultiVI_latent'], adata_multivi.obs[new_leiden_key])
    print((f'For res = {leiden_res:.2f}, average silhouette: {silhouette_avg:.3f} '
           f'for {adata_multivi.obs[new_leiden_key].nunique()} clusters'))
    # mean sample count per cluster
    df_grouped = adata_multivi.obs.groupby(new_leiden_key)['sample_id'].count()
    mean_sample_per_cluster = df_grouped.mean()
    # mean cell count per cluster
    df_grouped = adata_multivi.obs[new_leiden_key].value_counts()
    mean_cell_per_cluster = df_grouped.mean()        
    mean_scores[leiden_res] = [silhouette_avg, adata_multivi.obs[new_leiden_key].nunique(), 
                               mean_sample_per_cluster, mean_cell_per_cluster]
    # update best resolution info
    if silhouette_avg > largest_score:
        largest_score = silhouette_avg
        best_res = leiden_res

    figure_file = f'_{project}.umap.{leiden_res}.leiden.png'
    with rc_context({'figure.figsize': (8, 8), 'figure.dpi': 100}):
        plt.style.use('seaborn-v0_8-bright')
        sc.pl.umap(adata_multivi, color=[new_leiden_key], 
                   frameon=False, save=figure_file)

In [None]:

# tabulate the per-resolution metrics collected during the sweep (index = resolution)
score_columns = ['score', 'num_clusters', 'mean_samples', 'mean_cells']
scores_df = DataFrame(list(mean_scores.values()),
                      index=list(mean_scores.keys()),
                      columns=score_columns)

# row(s) with the highest silhouette score; the first one is taken as the best resolution
print('max score at')
top_score = scores_df.score.max()
best_result = scores_df.loc[scores_df.score == top_score]
display(best_result)
best_resolution = best_result.index.values[0]
print(f'best resolution found at {best_resolution}')
if DEBUG:
    display(scores_df)

# silhouette score versus resolution; this is the only plot written to disk
fig_filename = f'{figures_dir}/leiden_resolution_silhouette_score.png'
with rc_context({'figure.figsize': (8, 8), 'figure.dpi': 100}):
    plt.style.use('seaborn-v0_8-talk')
    lineplot(x=scores_df.index, y='score', data=scores_df)
    plt.xlabel('resolution')
    plt.savefig(fig_filename)
    plt.show()

# remaining metrics versus resolution, shown inline only
for metric in ('num_clusters', 'mean_samples', 'mean_cells'):
    lineplot(x=scores_df.index, y=metric, data=scores_df)
    plt.xlabel('resolution')
    plt.show()

### re-cluster at the best resolution found based on Silhouette score
Note: based on the Phase1 clustering, separating the PVALB and SST InN LHX6 clusters would require a Leiden resolution of 0.9, but for now stick with the best-scoring value of 0.5.

In [None]:
# snap the selected resolution to two decimals, then store the final clustering in obs
best_resolution = round(best_resolution, 2)
sc.tl.leiden(
    adata_multivi,
    resolution=best_resolution,
    key_added='leiden_MultiVI',
)

### assign the small number of cells that may have moved around during re-clustering or dropped in or out around thresholds
Make the assignment based on what the rest of the cluster was called.

In [None]:
# previous cell types of the cells that have no curated label (debug only)
if DEBUG:
    unlabeled = adata_multivi.obs['curated_type'].isna()
    display(adata_multivi.obs.loc[unlabeled, 'Cell_type'].value_counts())

In [None]:
# Majority vote: give every cell in a Leiden cluster the most common curated_type
# found in that cluster.
cluster_labels = adata_multivi.obs.leiden_MultiVI
for cluster_number in cluster_labels.unique():
    print(f'--- {cluster_number}')
    # boolean mask of the cells in this cluster, computed once and reused below
    in_cluster = cluster_labels == cluster_number
    type_counts = adata_multivi.obs.loc[in_cluster, 'curated_type'].value_counts()
    # A cluster with no curated labels has nothing to vote with: value_counts() is
    # empty for object columns and all zeros for categorical ones, where idxmax()
    # would silently pick an arbitrary category. Leave such clusters untouched.
    if type_counts.empty or type_counts.max() == 0:
        print(f'no curated_type labels in cluster {cluster_number}, leaving as is')
        continue
    new_assignment = type_counts.idxmax()
    if DEBUG:
        print(in_cluster.sum())
        display(type_counts)
        print(new_assignment)
    adata_multivi.obs.loc[in_cluster, 'curated_type'] = new_assignment

In [None]:
# cell counts per curated type after the per-cluster re-assignment (debug only)
if DEBUG:
    display(adata_multivi.obs['curated_type'].value_counts())

#### notes from the re-assignments above
Note particularly OPC-4, PeriVasc-20, and Micro-30, which had the largest re-assignments.

- OD-0 had 14 cells from 5 other types re-assigned to OD, 0%
- OPC-4 had 898 cells from OD re-assigned to OPC, ~7%
- Micro-3 had 9 cells from 3 other types re-assigned to Micro, 0%
- InN-5 had 4 cells from 2 other types re-assigned to InN, 0%
- InN-12 had 1 cell from OD re-assigned to InN, 0%
- InN-10 had 2 cells from 2 other types re-assigned to InN, 0%
- Astro-1 had 43 cells from 4 other types re-assigned to Astro, 0%
- PeriVasc-20 had 905 VLMC cells and 2 cells from 2 other types re-assigned to PeriVasc, 58.6%
- ExN-7 had 1 InN cell re-assigned to ExN, 0%
- ExN-2 had 1 InN cell re-assigned to ExN, 0%
- Micro-30 had 107 VLMC cells reassigned to Micro, 27.4%
- OD-21 had 7 cells from 2 other types assigned to OD, 0%
- InN-17 had 3 ExN cells re-assigned as InN, 0%
- VLMC-22 had 2 cells from 2 other types re-assigned as VLMC, 0%
- ExN-29 had 1 InN cell re-assigned as ExN, 0%
- ExN-28 had 8 OD cells re-assigned as ExN, 1%
- ExN-19 had 35 cells from 5 other types re-assigned as ExN, 1%
- Uncertain-16 had 2 Astro cells re-assigned as Uncertain, 0%
- RadialGlia-27 had 2 VLMC cells re-assigned as RadialGlia, 0%

ExN-37 only has one cell in it

### re-name cluster specific names

In [None]:
# cluster_name = "<curated type>-<leiden cluster id>", built from the string forms of both columns
type_part = adata_multivi.obs.curated_type.astype('str')
cluster_part = adata_multivi.obs.leiden_MultiVI.astype('str')
adata_multivi.obs['cluster_name'] = type_part + '-' + cluster_part
print(f'new shape of obs {adata_multivi.obs.shape}')
print(adata_multivi)
if DEBUG:
    # random rows plus the number of cells per named cluster
    display(adata_multivi.obs.sample(5))
    display(adata_multivi.obs.cluster_name.value_counts())

In [None]:
# final number of cells per curated cell type
adata_multivi.obs['curated_type'].value_counts()

In [None]:
# per-cluster breakdown of curated types and modalities (debug only)
if DEBUG:
    for cluster_num in adata_multivi.obs.leiden_MultiVI.unique():
        print(f'### cluster number: {cluster_num}')
        cluster_obs = adata_multivi.obs[adata_multivi.obs.leiden_MultiVI == cluster_num]
        display(cluster_obs.curated_type.value_counts())
        display(cluster_obs.modality.value_counts())

#### Notes about some counts that stand out in the clusters:
Smaller clusters that are almost exclusively single modality

- ExN-37 only has one cell in it and is an ATAC sample
- Astro-36 has 75 cells all ATAC
- ExN-31 has 397 cells all ATAC
- ExN-26 has 1040 cells 99.9% ATAC
- Astro-35 has 111 cells 99.9% ATAC
- RadialGlia-27 has 693 cells all GEX
- Uncertain-16 has 2914 cells all GEX
- Astro-34 has 209 cells 99.9% GEX
- ExN-25 has 1117 cells 99% GEX
- InN-13 has 3897 cells 99% ATAC

In [None]:
# UMAP colored by the curated cell types after re-assignment
figure_file = f'_{project}.umap.curated_celltype_final.png'
plot_rc = {'figure.figsize': (8, 8), 'figure.dpi': 100}
with rc_context(plot_rc):
    plt.style.use('seaborn-v0_8-talk')
    sc.pl.umap(
        adata_multivi,
        color=['curated_type'],
        frameon=False,
        legend_loc='on data',
        save=figure_file,
    )

In [None]:
# UMAP colored by the per-cluster names (curated type + Leiden cluster id)
figure_file = f'_{project}.umap.curated_cluster_names_final.png'
plot_rc = {'figure.figsize': (8, 8), 'figure.dpi': 100}
with rc_context(plot_rc):
    plt.style.use('seaborn-v0_8-talk')
    sc.pl.umap(
        adata_multivi,
        color=['cluster_name'],
        frameon=False,
        legend_loc='on data',
        save=figure_file,
    )

In [None]:
# two UMAP colorings in one figure: curated types and per-cluster names
figure_file = f'_{project}.umap.curated_final.png'
plot_rc = {'figure.figsize': (8, 8), 'figure.dpi': 100}
with rc_context(plot_rc):
    plt.style.use('seaborn-v0_8-talk')
    sc.pl.umap(
        adata_multivi,
        color=['curated_type', 'cluster_name'],
        frameon=False,
        legend_loc='on data',
        save=figure_file,
    )

### save the modified FINAL anndata object

In [None]:
%%time
# write the AnnData object, including the updated obs columns, to final_file
adata_multivi.write(final_file)

### visualize clusters of curated broad cell types

In [None]:
# One UMAP per broad curated cell type, colored by the cluster names within that type.
subset_rc = {'figure.figsize': (8, 8), 'figure.dpi': 100}
# dropna(): a missing label matches no cells under `==`, which would otherwise
# produce an empty subset and a figure file named after 'nan'
for curated_type in adata_multivi.obs.curated_type.dropna().unique():
    print(curated_type)
    # copy so the subset is a standalone object rather than a view of adata_multivi
    adata_sub = adata_multivi[(adata_multivi.obs.curated_type == curated_type)].copy()
    if DEBUG:
        print(adata_sub)
        display(adata_sub.obs.curated_type.value_counts())
    figure_file = f'_{project}.umap.{curated_type}_final.png'
    with rc_context(subset_rc):
        plt.style.use('seaborn-v0_8-talk')
        sc.pl.umap(adata_sub, color=['cluster_name'], frameon=False,
                   legend_loc='on data', save=figure_file)

In [None]:
# shell out to `date` so the run's end time is recorded in the cell output
!date