# Annotation

## Settings 

In [None]:
# Sample selection; passed as `sample` to the custom cell type annotation below
sample = "all"
# Gene annotation
GTF_PATH = "/mnt/flatfiles/organisms/new_organism/homo_sapiens/104/homo_sapiens.104.genes.gtf"
ENSEMBL_DATASET = "auto" # This is used for replacing Ensembl gene IDs with gene names if possible
                         # Use "auto" to infer the species from the Ensembl ID or
                         # use the first letter of all words but the complete last word, e.g. hsapiens, mmusculus
                         # NOTE(review): not referenced by any cell visible in this notebook - confirm it is still needed

# Cell type annotation
SPECIES = "Hs" # Currently only Hs (Homo Sapiens) and Mm (Mus Musculus) are available for custom cell type annotation
SCSA_SPECIES = None # Currently only human (Homo Sapiens) and mouse (Mus Musculus) are available for SCSA
                    # None if using a custom DB file
# Optional custom database for SCSA
# NOTE(review): this path contains "mouse" while GTF_PATH and SPECIES are human - confirm the intended database
SCSA_CUSTOM_DB_PATH = "/mnt/workspace/mkessle/projects/marker_repo/dbs/mouse/panglao_all_mandatory_2020"
# Path to panglao markers
PANGLAO_DB_PATH = "/mnt/workspace/jdetlef/panglao_markers"

# The clustering resolutions which will be annotated
RESOLUTIONS = "leiden_0.4" # Set to None (the Python value, not the string "None") to keep the clustering
                           # resolution(s) from the previous notebook (read from adata.uns['infoprocess']['leiden_res'])
                           # You can also choose a column which contains the clustering information in the obs table
                           # It is also possible to add more than one column (resolutions) by using a list of columns

MODIFY_CELL_TYPES = False # If True, you get the opportunity to manually modify the automatically annotated cell types

# Save figures
# NOTE(review): saving is still a TODO in the plotting cells, so this flag currently has no visible effect
save_figs = True

## Loading packages and setup

In [None]:
import sctoolbox.utilities as utils
import sctoolbox.annotation as annot
import sctoolbox.custom_celltype_annotation as ctannot
import sctoolbox.utilities as utils
from sctoolbox.celltype_annotation import run_scsa
import pandas as pd
import episcanpy as epi
import scanpy as sc
from matplotlib import pyplot as plt
import apybiomart
import os
import shutil


# Load the sctoolbox settings for this notebook from config.yaml (entry "05").
utils.settings_from_config("config.yaml", key="05")

# Directory that receives the cell type annotation output.
# The annotation cells (annot_ct, show_tables, modify_ct) refer to this path as
# `annotation_dir`; previously only `anndata_dir` was defined, so those cells
# failed with a NameError. `anndata_dir` is kept as an alias for compatibility.
annotation_dir = "pipeline_output/annotation"
anndata_dir = annotation_dir

## Check files 

In [None]:
# Report, for every configured input path, whether it points to an existing file.
files = [GTF_PATH, SCSA_CUSTOM_DB_PATH, PANGLAO_DB_PATH]
for path in files:
    if not os.path.isfile(path):
        print(f"Please make sure that your input is correct. {path} does not exist.")
        continue
    print(f"{path} exists.")

## Load anndata

In [None]:
# Read the input AnnData object (presumably written by the previous pipeline step)
# and show its summary.
input_h5ad = "anndata_4.h5ad"
adata = utils.load_h5ad(input_h5ad)
display(adata)

In [None]:
# Normalise RESOLUTIONS to a list of obs column names.
# - empty / None: fall back to the resolution(s) stored in the AnnData object
# - single string: wrap it in a list so the loops below can iterate over it
# - anything else (e.g. a list of columns): used as given
if not RESOLUTIONS:
    RESOLUTIONS = adata.uns['infoprocess']['leiden_res']
elif isinstance(RESOLUTIONS, str):
    RESOLUTIONS = [RESOLUTIONS]

## Check columns

In [None]:
# Tell the user, for each requested clustering column, whether adata.obs contains it.
for column in RESOLUTIONS:
    is_present = column in adata.obs
    message = (
        f"The obs table contains the column {column}."
        if is_present
        else f"{column} was not found in the obs table. Please pick a valid column instead."
    )
    print(message)

In [None]:
# Show the current var table for inspection before the gene annotation below.
adata.var

## Gene annotation 

In [None]:
# Annotate the features of `adata` with genes from the GTF file. The call is made
# with inplace=True, and the 'gene_name' column of adata.var is displayed afterwards.
annotation_options = dict(
    config=None,
    best=True,
    threads=6,
    coordinate_cols=None,
    temp_dir="",
    verbose=True,
    inplace=True,
)
annot.annotate_adata(adata, GTF_PATH, **annotation_options)

adata.var['gene_name']

### Remove features without a gene name (NaN)

In [None]:
# Keep only the features that were assigned a gene name (non-NaN 'gene_name').
assigned_features = adata.var['gene_name'].dropna()
# Slicing an AnnData object returns a view. Later cells write to `adata`
# (cell type columns in .obs), so materialise the subset with .copy() to avoid
# implicit view-to-copy conversions and the accompanying warnings.
adata = adata[:, assigned_features.index].copy()
adata.var['gene_name']

## Rank peaks

In [None]:
# Independent copy used for the peak ranking below, so the ranking is not run on `adata` itself.
peaks_adata = adata.copy()

In [None]:
# peaks_adata.uns['log1p']['base'] = None

In [None]:
# Rank features (peaks) per cluster for every clustering column; each result is
# stored under its own key 'rank_features_groups_<column>'.
for column in RESOLUTIONS:
    print(f'Ranking peaks for clusters using obs column {column}')
    groupby = f'{column}'
    result_key = f'rank_features_groups_{column}'
    epi.tl.rank_features(
        peaks_adata,
        groupby,
        omic='ATAC',
        key_added=result_key,
        n_features=100,
        use_raw=False,
    )

print('Done')

## Rank genes
Replace peaks with gene names

In [None]:
# Build a gene-indexed copy of the peak data: the var index (peaks) is replaced by
# the assigned gene names. The statements below mutate .var in place, so their
# order matters.
genes_adata = peaks_adata.copy()
# Move the peak identifiers out of the index into a regular column
# (named 'index' when the var index has no name).
genes_adata.var.reset_index(inplace=True)  # remove peaks from index and save them in the column ['index']
genes_adata.var.set_index('gene_name',inplace=True)  # set genes as index
genes_adata.var.index = genes_adata.var.index.astype('str')  # to avoid index being categorical
# Several peaks can map to the same gene; duplicates get a '_'-joined suffix.
genes_adata.var_names_make_unique(join='_')
# Store the current state as .raw (the ranking calls below use use_raw=False).
genes_adata.raw = genes_adata

In [None]:
# Rank genes per cluster for every clustering column; each result is stored
# under its own key 'rank_genes_groups_<column>'.
for column in RESOLUTIONS:
    print(f'Ranking genes groups for clusters using obs column {column}')
    sc.tl.rank_genes_groups(
        genes_adata,
        groupby=f'{column}',
        use_raw=False,
        key_added=f'rank_genes_groups_{column}',
    )

print('Done')

### Plot ranked genes

In [None]:
# Plot the ranked gene groups of every clustering column.
for column in RESOLUTIONS:
    print(f'Plotting ranked genes groups for clustering column {column}')
    plot_key = f'rank_genes_groups_{column!s}'
    sc.pl.rank_genes_groups(genes_adata, key=plot_key, show=False)
    # TODO: when save_figs is set, save the figure before showing it, e.g. as
    #       '<figure dir>/ranked_genes_groups_leiden_<resolution>.png'
    #       (an output directory for figures is not defined in this notebook yet).
    plt.show()

In [None]:
# Draw a matrixplot of the top 10 ranked genes for every clustering column.
for column in RESOLUTIONS:
    print(f'Plotting matrixplot for clustering column {column}')
    plot_key = f'rank_genes_groups_{column!s}'
    sc.pl.rank_genes_groups_matrixplot(
        genes_adata,
        standard_scale='var',
        n_genes=10,
        key=plot_key,
        show=False,
    )
    # TODO: when save_figs is set, save the figure before showing it, e.g. as
    #       '<figure dir>/ranked_genes_groups_matrixplot_leiden_<resolution>.png'
    #       (an output directory for figures is not defined in this notebook yet).
    plt.show()

## Cell type annotation

In [None]:
# Run the custom (panglao-based) cell type annotation once per clustering column.
# The predicted cell types are written to the obs column 'cell_types_<column>'.
for column in RESOLUTIONS:
    ctannot.annot_ct(
        adata=adata,
        genes_adata=genes_adata,
        output_path=annotation_dir,
        db_path=f"{PANGLAO_DB_PATH}",
        cluster_path=None,
        cluster_column=f"{column}",
        rank_genes_column=f"rank_genes_groups_{column}",
        sample=sample,
        ct_column=f"cell_types_{column}",
        tissue="all",
        db="panglao",
        inplace=True,
        species=SPECIES,
    )

In [None]:
# UMAP per clustering column: annotated cell types next to the clustering itself.
for column in RESOLUTIONS:
    umap_colors = [f'cell_types_{column}', f'{column}']
    sc.pl.umap(adata, color=umap_colors, wspace=0.5)

In [None]:
# Show the annotation tables (n=5) of every clustering column.
for clustering in RESOLUTIONS:
    print(f"Tables of cell type annotation with clustering {clustering}")
    ctannot.show_tables(
        annotation_dir=annotation_dir,
        n=5,
        clustering_column=clustering,
    )

### Modify wrong cluster annotations by selecting other recommendations

In [None]:
# Optional manual correction of the automatic annotation (only if MODIFY_CELL_TYPES is set).
if MODIFY_CELL_TYPES:
    # Pick the clustering column of the obs table and the matching cell type
    # column that should be modified.
    clustering_column = "leiden_0.4"
    cell_type_column = "cell_types_leiden_0.4"

    ctannot.modify_ct(
        adata=adata,
        annotation_dir=annotation_dir,
        clustering_column=clustering_column,
        cell_type_column=cell_type_column,
        inplace=True,
    )

### SCSA cell type annotation

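Strip everything after the first underscore from the gene names (this removes the `_<n>` suffixes added by `var_names_make_unique(join='_')`)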

In [None]:
# Keep only the part of each var name before the first underscore.
# Note that this makes the var names non-unique again.
stripped_names = [name.partition('_')[0] for name in genes_adata.var.index]
genes_adata.var.index = pd.Index(stripped_names)

Rank new gene names

In [None]:
# Re-run the gene ranking on the stripped gene names; results are stored under
# separate keys 'rank_genes_groups_SCSA_<column>' for the SCSA step.
for column in RESOLUTIONS:
    print(f'Ranking genes groups for clusters using obs column {column}')
    sc.tl.rank_genes_groups(
        genes_adata,
        groupby=f'{column}',
        use_raw=False,
        key_added=f'rank_genes_groups_SCSA_{column}',
    )

print('Done')

In [None]:
# Run SCSA once per clustering column, copy the predicted cell types from
# genes_adata to adata and plot them next to the custom annotation.
for column in RESOLUTIONS:
    scsa_column = f'SCSA_pred_celltype_{column}'
    run_scsa(genes_adata,
             gene_column=None,
             gene_symbol='auto',
             key=f'rank_genes_groups_SCSA_{column}',
             column_added=scsa_column,
             inplace=True,
             python_path=None,
             # Use the value from the settings cell instead of a hard-coded None,
             # so changing SCSA_SPECIES there actually takes effect.
             species=SCSA_SPECIES,
             fc=1.5,
             pvalue=0.01,
             tissue='All',
             user_db=SCSA_CUSTOM_DB_PATH,
             )
    # The prediction is written to genes_adata.obs; mirror it into adata.obs.
    adata.obs[scsa_column] = genes_adata.obs[scsa_column]
    sc.pl.umap(adata, color=[scsa_column, f'cell_types_{column}'], wspace=0.5)


## Save notebook and adata object

In [None]:
# Write the annotated AnnData object as the output of this notebook.
output_h5ad = "anndata_5.h5ad"
utils.save_h5ad(adata, output_h5ad)