In [None]:
import warnings
# Ignore every FutureWarning raised from this point on.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [1]:
from sctoolbox.utils.jupyter import bgcolor

# Marker genes and cell type assignment
<hr style="border:2px solid black">

<h1><center>⬐ Fill in input data here ⬎</center></h1>

In [2]:
%bgcolor PowderBlue

# Final clustering column to use for cell type assignment
clustering_col = "clustering"

# SCSA options
# NOTE(review): `species` is not referenced by any later cell visible in this notebook
# (the run_scsa call passes species=None explicitly) — confirm whether it is still needed.
species = "zebrafish"

# Name of column to add with the marker gene predicted celltype
# NOTE(review): `celltype_column_name` is not referenced by any later cell visible in this
# notebook — confirm whether it is still needed.
celltype_column_name = "marker_pred_celltype"

<hr style="border:2px solid black">

## Loading packages

In [None]:
import scanpy as sc
import pandas as pd
pd.set_option('display.max_columns', None)  #no limit to the number of columns shown
import sctoolbox.utils as utils
import sctoolbox.tools as tools
import sctoolbox.plotting as pl
from sctoolbox._settings import settings

# Configure sctoolbox from config.yaml.
# NOTE(review): key="05" presumably selects the section belonging to this notebook — confirm.
settings.settings_from_config("config.yaml", key="05")

In [None]:
# marker genes for zebrafish heart cell types
# Read the tab-separated marker table and collect, for every 'cell_name', the list of its
# 'gene_name' entries: {cell_name: [gene_name, ...]}
marker_table = pd.read_csv('marker_list.csv', sep='\t')
marker_genes_dict = marker_table.groupby('cell_name')['gene_name'].agg(list).to_dict()

## Loading adata

In [None]:
# Load the AnnData object stored in anndata_4.h5ad and show its summary.
adata = utils.adata.load_h5ad("anndata_4.h5ad")
display(adata)

In [None]:
# Show the observation annotation table of the loaded object.
adata.obs

## Check for custom marker genes

In [None]:
# Run sctoolbox's marker-list check against adata and keep the returned dictionary.
# NOTE(review): presumably this drops marker genes that are not present in adata — confirm
# against the sctoolbox documentation.
marker_genes_dict = utils.checker.check_marker_lists(adata, marker_genes_dict)

--------------

## Automatic markers per cluster using rank_genes_groups

In [None]:
# Rank marker genes for every group in `clustering_col`.
# Adjust the group fractions and the fold change below to change how genes are filtered.
rank_filter_thresholds = {
    "min_in_group_fraction": 0.25,
    "min_fold_change": 0.5,
    "max_out_group_fraction": 0.8,
}
tools.marker_genes.run_rank_genes(adata, clustering_col, **rank_filter_thresholds)

In [None]:
# Plot dotplot of the filtered markers.
# The key is derived from `clustering_col` so that it follows the column configured at the
# top of the notebook instead of assuming the literal name "clustering". For the default
# configuration the resulting key is unchanged ("rank_genes_clustering_filtered").
# NOTE(review): assumes run_rank_genes stores its filtered result under
# "rank_genes_<groupby>_filtered" — confirm against the sctoolbox documentation.
_ = pl.marker_genes.rank_genes_plot(adata, key=f"rank_genes_{clustering_col}_filtered", n_genes=15,
                                    save=f"marker_genes_dotplot_{clustering_col}.pdf")

In [None]:
# Write marker genes to table (one table per cluster, also exported to Excel).
# The key is derived from `clustering_col` so that it follows the column configured at the
# top of the notebook instead of assuming the literal name "clustering". For the default
# configuration the resulting key is unchanged ("rank_genes_clustering_filtered").
# NOTE(review): assumes run_rank_genes stores its filtered result under
# "rank_genes_<groupby>_filtered" — confirm against the sctoolbox documentation.
marker_tables = tools.marker_genes.get_rank_genes_tables(adata, out_group_fractions=True,
                                                         key=f"rank_genes_{clustering_col}_filtered",
                                                         save_excel=f"rank_genes_{clustering_col}.xlsx")

In [None]:
# Markers for cluster "1" (change key to access markers for other clusters)
# Only the first rows of the table are displayed.
marker_tables["1"].head()

---------

## Automatic cell type prediction with SCSA

In [None]:
# Marker list from Marco; passed to SCSA as `user_db` in the next cell.
# NOTE(review): an earlier cell reads 'marker_list.csv' from the working directory, while this
# path points one directory up — confirm both refer to the intended file.
userDB = '../marker_list.csv'

In [None]:
# Predict cell types with SCSA from the (unfiltered) ranked genes, using the user-supplied
# marker database.
# NOTE(review): species=None presumably tells SCSA to rely on `user_db` only — confirm
# against the sctoolbox documentation.
# The rank-genes key is derived from `clustering_col` so that it follows the column
# configured at the top of the notebook; for the default configuration the key is unchanged
# ('rank_genes_clustering').
tools.celltype_annotation.run_scsa(adata, species=None, user_db=userDB, key=f"rank_genes_{clustering_col}")

In [None]:
# Embedding plot coloured by the 'SCSA_pred_celltype' column; `save` names the output PDF.
pl.embedding.plot_embedding(adata, color="SCSA_pred_celltype", title="Automatic celltype assignment", 
                            save="SCSA_assignment.pdf")

-------

## Plot expression of known markers

In [None]:
# Flatten the marker dictionary into two parallel lists for plotting:
# the gene names, and matching "<gene> (<cell type>)" panel titles.
# A cell type may map to a single gene given as a plain string, so that case is wrapped
# in a one-element list before iterating.
marker_pairs = [
    (celltype, marker)
    for celltype, markers in marker_genes_dict.items()
    for marker in ([markers] if isinstance(markers, str) else markers)
]
marker_gene_names = [marker for _celltype, marker in marker_pairs]
marker_gene_titles = [f"{marker} ({celltype})" for celltype, marker in marker_pairs]

In [None]:
# One UMAP panel per marker gene (3 panels per row), titled "<gene> (<cell type>)".
pl.embedding.plot_embedding(adata, method='umap', color=marker_gene_names, title=marker_gene_titles,
                  ncols=3,
                  save="marker_gene_expression_umap.png", dpi=100)

## Merge clusters

In [None]:
# merge clusters 2, 4 and 5 into one cluster
# NOTE(review): an earlier version of this comment listed clusters 1, 4 and 5, but the call
# below passes ["2", "4", "5"] — confirm which set is intended.
# NOTE(review): reclustering_column has the same value as the default clustering_col
# ("clustering"), so the result is presumably written back into that column — confirm the
# overwrite is intended.
reclustering_column = "clustering"
embedding = 'umap'

tools.clustering.recluster(adata, clustering_col, ["2", "4", "5"], task="join", resolution=0.1, embedding=embedding,
                            key_added=reclustering_column)

In [None]:
# Identify markers per cluster (adjust group fraction and fold change to filter genes)
# key_added is forwarded through **kwargs; the following cell reads the ranking back from
# adata.uns['rank_genes_groups_2'].
kwargs = {'key_added': 'rank_genes_groups_2'}  # to keep ranked genes for original clustering
tools.marker_genes.run_rank_genes(adata, reclustering_column,
                                 min_in_group_fraction=0.25,
                                 min_fold_change=0.5,
                                 max_out_group_fraction=0.8,
                                 **kwargs)

In [None]:
# Workaround for a bug in run_rank_genes: when a different key_added is used, its code is
# still hard-coded to 'rank_genes_groups'. Copy the ranking to a dedicated key and run the
# filtering step manually.
adata.uns['rank_genes_clustering_2'] = adata.uns['rank_genes_groups_2']

# filter with the same thresholds that were passed to run_rank_genes
sc.tl.filter_rank_genes_groups(adata, key='rank_genes_clustering_2',
                               groupby=reclustering_column,
                               min_in_group_fraction=0.25,
                               min_fold_change=0.5,
                               max_out_group_fraction=0.8, key_added='rank_genes_clustering_2_filtered')

In [None]:
# Plot dotplot of markers, using the manually filtered ranking stored under
# 'rank_genes_clustering_2_filtered'.
_ = pl.marker_genes.rank_genes_plot(adata, key="rank_genes_clustering_2_filtered", n_genes=15, 
                       save=f"marker_genes_dotplot_{reclustering_column}.pdf")

In [None]:
# Start the 'celltypes' column as a plain-string copy of the SCSA prediction so that
# individual labels can be overwritten afterwards.
adata.obs['celltypes'] = adata.obs['SCSA_pred_celltype'].astype(str)

In [None]:
# Collapse the SCSA labels listed below into a single 'Myocardium' label
# (originally annotated as: rename clusters 1,2,6 after merge to Myocardium).
myocardium_labels = ['Myocardium - Atrium', 'Myocardium - Ventricle', 'Nervous system - 2']
is_myocardium = adata.obs['SCSA_pred_celltype'].isin(myocardium_labels)
adata.obs.loc[is_myocardium, 'celltypes'] = 'Myocardium'

# change to categorical
adata.obs['celltypes'] = pd.Categorical(adata.obs['celltypes'])

### Add manual annotation

In [None]:
# Import the manual annotation exported from cellxgene.
# header=2 combined with explicit `names` makes pandas discard everything up to and including
# the header row and use the given column names instead.
annot = pd.read_csv("../test-cell-labels-ZB3TYJAL.csv", header=2, names=['index', 'celltype']).set_index('index')

# Look up the label of every cell by its obs index and store the result as a categorical;
# cells without an entry in the annotation file end up as missing values.
label_by_cell = annot['celltype'].to_dict()
manual_labels = pd.Categorical(adata.obs.index.map(label_by_cell))

# rename Blood - Immune cells to Other cell types
adata.obs['celltype_manual'] = manual_labels.rename_categories({'Blood - Immune cells': 'Other cell types'})

### Plot new celltypes

In [None]:
# Embedding plot coloured by the adjusted SCSA labels in the 'celltypes' column.
pl.embedding.plot_embedding(adata, color="celltypes", title="Cell types", 
                            save="SCSA_mod_assignment.pdf")

In [None]:
# Embedding plot coloured by the manual annotation in the 'celltype_manual' column.
pl.embedding.plot_embedding(adata, color="celltype_manual", title="Cell types", 
                            save="manual_celltype_assignment.pdf")

### Assign celltypes based on markers

In [None]:
# Compute a dendrogram over the groups of `clustering_col`, then plot the custom marker
# genes per cluster.
# NOTE(review): presumably the plot relies on the dendrogram computed here — confirm.
sc.tl.dendrogram(adata, clustering_col)
_ = pl.clustering.marker_gene_clustering(adata, clustering_col, marker_genes_dict, 
                                          save="marker_gene_expression_dotplot.png")

In [None]:
# scanpy dot plot of the custom marker genes per cluster, with the dendrogram drawn alongside.
sc.pl.dotplot(adata, marker_genes_dict, groupby=clustering_col, show=True, dendrogram=True)

In [None]:
# Embedding panels (2 per row) coloured by manual cell type, timepoint and phase.
pl.embedding.plot_embedding(adata, color=["celltype_manual", "timepoint", "phase"], ncols=2,
                            save="UMAP_celltype_timepoint_phase.pdf", wspace=0.3)

In [None]:
# NOTE(review): this reloads anndata_5.h5ad, which this notebook itself only writes further
# down (utils.adata.save_h5ad). On a fresh kernel the file therefore has to come from an
# earlier run, and the in-memory changes made above are replaced by whatever that file
# contains — confirm this is intended.
adata = utils.adata.load_h5ad("anndata_5.h5ad")
display(adata)

In [None]:
# Gene symbols used for the expression plots in the following cells.
neuregulins = ['nrg1', 'nrg2a', 'nrg2b', 'nrg3b', 'erbb2', 'erbb3a', 'erbb3b', 'erbb4b']

In [None]:
# UMAP panels (3 per row) for the genes listed above, with the colour scale fixed to [0, 1].
# show=False is passed so the figure can be written to PDF by the call that follows.
sc.pl.umap(adata, color=neuregulins,
           vmin=0., vmax=1., show=False, cmap='Blues', ncols=3)
pl.general._save_figure("neuregulins_expression_umap.pdf", dpi=300)

In [None]:
# Dot plot of the gene panel grouped by 'timepoint'. With show=False the axes are returned,
# so the tick labels of the main panel can be restyled before the figure is saved.
ax = sc.pl.dotplot(adata, neuregulins, 'timepoint', cmap='Blues', show=False)
main_ax = ax['mainplot_ax']

# x tick labels in italics, both axes at font size 12
main_ax.set_xticks(main_ax.get_xticks(), main_ax.get_xticklabels(), fontstyle='italic', fontsize=12)
main_ax.set_yticks(main_ax.get_yticks(), main_ax.get_yticklabels(), fontsize=12)

pl.general._save_figure("neuregulins_expression_timepoint_heatmap.pdf")

In [None]:
# Dot plot of the gene panel grouped by 'celltype_manual'. With show=False the axes are
# returned, so the tick labels of the main panel can be restyled before the figure is saved.
ax = sc.pl.dotplot(adata, neuregulins, 'celltype_manual', cmap='Blues', show=False)
main_ax = ax['mainplot_ax']

# x tick labels in italics, both axes at font size 12
main_ax.set_xticks(main_ax.get_xticks(), main_ax.get_xticklabels(), fontstyle='italic', fontsize=12)
main_ax.set_yticks(main_ax.get_yticks(), main_ax.get_yticklabels(), fontsize=12)

pl.general._save_figure("neuregulins_expression_clusters_heatmap.pdf")

-------------

## Save adata

In [None]:
# Fix error when saving filtered rank gene names: remove every adata.uns entry whose key
# starts with a "rank_genes_..._filtered" pattern before the object is written to disk.
import re

filtered_key_pattern = re.compile("rank_genes_.*_filtered")
# Collect the keys first so adata.uns is not modified while it is being iterated.
stale_keys = [uns_key for uns_key in adata.uns.keys() if filtered_key_pattern.match(uns_key)]
for stale_key in stale_keys:
    del adata.uns[stale_key]

In [None]:
# Show the AnnData summary after the filtered rank-gene entries were removed.
adata

In [None]:
# Remove the 'cmap' entry logged under plot_embedding -> 'run_2' -> 'kwargs' before saving.
# NOTE(review): presumably the logged colormap cannot be written to h5ad — confirm.
# The lookup tolerates missing keys, so the cell can be re-run (or run on an AnnData whose
# log looks different) without raising KeyError. When the entry exists, the effect is the
# same as the original `del`.
sctoolbox_log = adata.uns.get('sctoolbox', {}).get('log', {})
run_2_kwargs = sctoolbox_log.get('plot_embedding', {}).get('run_2', {}).get('kwargs', {})
if 'cmap' in run_2_kwargs:
    del run_2_kwargs['cmap']

In [None]:
# Write the annotated object to anndata_5.h5ad.
utils.adata.save_h5ad(adata, "anndata_5.h5ad")

## Prepare for cellxgene

In [None]:
# add timepoints_ordered for cellxgene
# Every timepoint gets a zero-padded running number as prefix ("tp01_<name>", "tp02_<name>", ...).
# The numbering follows the order in which the timepoints are returned by unique().
# NOTE(review): confirm that this order is the intended chronological order.
# The numbers come from enumerate(), so any number of timepoints is labelled; with a fixed
# range of 10 numbers, additional timepoints would stay unmapped. For up to 10 timepoints
# the labels are unchanged.
times = list(adata.obs['timepoint'].unique())
time_points = {timepoint: f'tp{number:02d}_{timepoint}' for number, timepoint in enumerate(times, start=1)}
adata.obs['timepoints_ordered'] = pd.Categorical(adata.obs['timepoint'].map(time_points))

In [None]:
# Build the cellxgene version of the object. inplace=False is passed and the returned
# object is bound to `adata`, so from here on `adata` refers to the cellxgene copy.
# keep_*/rename_* are left at None; the pca, umap and tsne embeddings are requested.
adata = utils.adata.prepare_for_cellxgene(adata, keep_obs=None, keep_var=None,
                                          rename_obs=None, rename_var=None,
                                          embedding_names=['pca', 'umap', 'tsne'], cmap='viridis', inplace=False)

In [None]:
# Write the cellxgene-ready object to its own file.
utils.adata.save_h5ad(adata, "anndata_5_cellxgene.h5ad")