# Annotation, marker genes and cell type assignment

## Settings 

In [None]:
# Sample label; forwarded as `sample` to the custom cell type annotation
sample = "all"

# Gene annotation
GTF_PATH = "test_data/hg38_genes.gtf" # path to the genes GTF file used to annotate regions to genes
ENSEMBL_DATASET = "auto" # This is used for replacing Ensembl gene IDs with gene names if possible
                         # Use "auto" to infer the species from the Ensembl ID, or give the dataset prefix:
                         # first letter of all words but the complete last word, e.g. hsapiens, mmusculus
                         # NOTE(review): not referenced by any cell visible in this notebook - confirm it is still needed

# Cell type annotation
SPECIES = "Hs" # Currently only Hs (Homo Sapiens) and Mm (Mus Musculus) are available for custom cell type annotation
SCSA_SPECIES = "human" # Currently only human (Homo Sapiens) and mouse (Mus Musculus) are available for SCSA
                      # None if using a custom DB file

# Optional custom database for SCSA (leave as None to not use a custom database)
SCSA_CUSTOM_DB_PATH = None # e.g. "/mnt/workspace/mkessle/projects/marker_repo/dbs/mouse/panglao_all_mandatory_2020"

# Path to panglao markers
PANGLAO_DB_PATH = "" # path to the panglao marker database (passed as db_path to the custom annotation)

# The clustering resolutions which will be annotated
clustering_cols = "leiden_0.4"  # It is also possible to add more than one column by using a list of columns

MODIFY_CELL_TYPES = False # If True you get the opportunity to manually modify the automatically annotated cell types

# Save figures
save_figs = True

-------------

## Loading packages and setup

In [None]:
import sctoolbox.utilities as utils
import sctoolbox.tools as tools
import sctoolbox.plotting as pl
import pandas as pd
import episcanpy as epi
import scanpy as sc
from matplotlib import pyplot as plt
import os

# Read the settings for this notebook from config.yaml (entry with key "05")
utils.settings_from_config("config.yaml", key="05")
# NOTE(review): anndata_dir is not referenced by any other cell visible in this notebook - confirm it is still needed
anndata_dir = "pipeline_output/annotation"

## Check files 

In [None]:
# Report for every configured input path whether the file exists.
files = [GTF_PATH, SCSA_CUSTOM_DB_PATH, PANGLAO_DB_PATH]
for file in files:
    # Optional inputs (e.g. the custom SCSA database) are None when unused;
    # skip them instead of reporting "None does not exist".
    if file is None:
        continue
    if os.path.isfile(file):
        print(f"{file} exists.")
    else:
        print(f"Please make sure that your input is correct. {file} does not exist.")

## Load anndata

In [None]:
# Load the input AnnData object (presumably the output of the previous notebook - confirm) and show its summary
adata = utils.load_h5ad("anndata_4.h5ad")
display(adata)

In [None]:
# Show the per-feature table (.var) before regions are annotated to genes
adata.var

## Check columns

In [None]:
# A single column name may be given as a plain string; wrap it in a list so
# that all following cells can iterate over clustering_cols.
# isinstance() is used instead of an exact type comparison so that str
# subclasses are wrapped as well. Empty values are left untouched.
if clustering_cols and isinstance(clustering_cols, str):
    clustering_cols = [clustering_cols]

In [None]:
# Check the requested clustering columns against adata.obs
# (presumably reports/raises if a column is missing - see sctoolbox documentation)
utils.check_columns(adata.obs, clustering_cols)

-------------

## Annotate regions to genes 

In [None]:
# Annotate the regions of adata with genes from the GTF file. The annotation is
# written into adata itself (inplace=True); the 'gene_name' column of adata.var
# used further below is presumably added by this call.
tools.annotate_adata(
    adata,
    GTF_PATH,
    config=None,
    best=True,
    threads=6,
    coordinate_cols=None,
    temp_dir="tmp",
    inplace=True,
)

In [None]:
# Show the first rows of .var to inspect the result of the annotation step
adata.var.head()

--------------

## Find marker peaks

In [None]:
# Work on an independent copy so that the marker peak ranking does not modify adata
adata_peaks = adata.copy()

In [None]:
# Identify markers per cluster (adjust group fraction and fold change to filter genes)
# Rank marker peaks for every clustering column.
# Adjust the group fractions and the fold change below to filter the markers.
for column in clustering_cols:
    tools.run_rank_genes(
        adata_peaks,
        column,
        min_in_group_fraction=0.25,
        min_fold_change=0.5,
        max_out_group_fraction=0.8,
    )

In [None]:
# Plot dotplot of markers
# Dotplot of the filtered marker peaks, one plot per clustering column
for column in clustering_cols:
    # Respect the save_figs setting: only request a PDF when saving is enabled.
    save_kwargs = {"save": f"marker_peaks_dotplot_{column}.pdf"} if save_figs else {}
    _ = pl.rank_genes_plot(adata_peaks, key=f"rank_genes_{column}_filtered", n_genes=15,
                           measure="accessibility",
                           **save_kwargs)

In [None]:
# Write marker peaks to table
# Write the filtered marker peaks of every clustering column to an Excel file.
# Note: marker_tables is overwritten in each iteration, so after the loop it
# only holds the result for the last clustering column.
for column in clustering_cols:
    marker_tables = tools.get_rank_genes_tables(
        adata_peaks,
        key=f"rank_genes_{column}_filtered",
        var_columns=["gene_name"],
        out_group_fractions=True,
        save_excel=f"rank_peaks_{column}.xlsx",
    )

------------

## Find marker genes

In [None]:
# Remove peaks without annotation and replace names
# Remove peaks without annotation and replace peak names by gene names.
# Slicing an AnnData object returns a view; the explicit copy() creates an
# independent object so the in-place edits of .var below operate on real data
# and cannot leak into (or be lost with) the view of `adata`.
adata_genes = adata[:, ~adata.var['gene_name'].isnull()].copy()

adata_genes.var.reset_index(inplace=True)  # move the peak names from the index into a regular column
adata_genes.var.set_index('gene_name', inplace=True)  # set genes as index
adata_genes.var.index = adata_genes.var.index.astype('str')  # to avoid index being categorical
adata_genes.var_names_make_unique(join='_')  # gene names may occur more than once; make them unique

In [None]:
# Compare the number of features (second dimension of the AnnData shape) before and after the gene filter
print(f"Number of peaks before filtering: {adata.shape[1]}")
print(f"Number of peaks annotated to genes: {adata_genes.shape[1]}")

In [None]:
# Identify markers per cluster (adjust group fraction and fold change to filter genes)
# Rank marker genes for every clustering column.
# Adjust the group fractions and the fold change below to filter the markers.
for column in clustering_cols:
    tools.run_rank_genes(
        adata_genes,
        column,
        min_in_group_fraction=0.25,
        min_fold_change=0.5,
        max_out_group_fraction=0.8,
    )

In [None]:
# Plot dotplot of markers
# Dotplot of the filtered marker genes, one plot per clustering column
for column in clustering_cols:
    # Respect the save_figs setting: only request a PDF when saving is enabled.
    save_kwargs = {"save": f"marker_genes_dotplot_{column}.pdf"} if save_figs else {}
    _ = pl.rank_genes_plot(adata_genes, key=f"rank_genes_{column}_filtered", n_genes=15,
                           measure="accessibility",
                           **save_kwargs)

In [None]:
# Write marker peaks to table
# Write the filtered marker genes of every clustering column to an Excel file.
# Note: marker_tables is overwritten in each iteration, so after the loop it
# only holds the result for the last clustering column.
for column in clustering_cols:
    marker_tables = tools.get_rank_genes_tables(
        adata_genes,
        key=f"rank_genes_{column}_filtered",
        out_group_fractions=True,
        save_excel=f"rank_genes_{column}.xlsx",
    )

-------------

## Custom script cell type annotation

In [None]:
# Directory handed to the annotation helpers below (output_path / annotation_dir); "." is the current working directory
annotation_dir = "."

In [None]:
# Run the custom cell type annotation once per clustering column.
# The predicted cell types are stored under "cell_types_<column>".
for column in clustering_cols:
    tools.annot_ct(adata=adata, genes_adata=adata_genes, output_path=annotation_dir,
                   db_path=f"{PANGLAO_DB_PATH}", cluster_path=None, cluster_column=column,
                   # Use the key under which the marker genes of adata_genes are
                   # referenced by the other cells of this notebook ("rank_genes_<column>");
                   # no cell creates a "rank_genes_groups_<column>" entry.
                   rank_genes_column=f"rank_genes_{column}", sample=sample,
                   ct_column=f"cell_types_{column}", tissue="all", db="panglao", inplace=True, species=SPECIES)

In [None]:
# UMAP colored by the assigned cell types and by the underlying clustering, for each clustering column
for column in clustering_cols:
    sc.pl.umap(adata, color=[f'cell_types_{column}', f'{column}'], wspace=0.5)

In [None]:
# Display the annotation tables (n=5) found in annotation_dir for each clustering column
for column in clustering_cols:
    print(f"Tables of cell type annotation with clustering {column}")
    tools.show_tables(
        annotation_dir=annotation_dir,
        n=5,
        clustering_column=column,
    )

### Modify wrong cluster annotations by selecting other recommendations

In [None]:
# Optional manual correction of the automatic annotation (enabled via MODIFY_CELL_TYPES)
if MODIFY_CELL_TYPES:
    # Pick the cell type column of the obs table you want to modify
    # as well as the corresponding clustering column
    cell_type_column = "cell_types_leiden_0.4"
    clustering_column = "leiden_0.4"

    tools.modify_ct(
        adata=adata,
        annotation_dir=annotation_dir,
        clustering_column=clustering_column,
        cell_type_column=cell_type_column,
        inplace=True,
    )

---------

## SCSA cell type annotation

In [None]:
# Run SCSA on the gene-level object for each clustering column, copy the
# prediction into adata.obs and plot it next to the clustering.
for column in clustering_cols:
    scsa_column = f'SCSA_pred_celltype_{column}'

    tools.run_scsa(
        adata_genes,
        key=f'rank_genes_{column}',
        column_added=scsa_column,
        species=SCSA_SPECIES,
        fc=1.5,
        pvalue=0.01,
        tissue='All',
        user_db=SCSA_CUSTOM_DB_PATH,
    )

    # Transfer the prediction from the gene-level object to the main object
    adata.obs[scsa_column] = adata_genes.obs[scsa_column]
    sc.pl.umap(adata, color=[column, scsa_column], wspace=0.5)

--------------

## Save adata object

In [None]:
# Save the annotated AnnData object as anndata_5.h5ad
utils.save_h5ad(adata, "anndata_5.h5ad")