# Annotation

## Settings 

In [None]:
# Path related settings (these should be the same as for the previous notebook)
output_dir = '/mnt/workspace/mkessle/ext442'  # processing/output directory handed to the path handling object
test = 'ext442'  # name of the run (sample/experiment) handed to the path handling object

# Gene annotation
GTF_PATH = "/mnt/flatfiles/organisms/new_organism/mus_musculus/104/mus_musculus.104.genes.gtf"
ENSEMBL_DATASET = "mmusculus_gene_ensembl" # This is used for replacing Ensembl gene IDs with gene names if possible
                                           # use "hsapiens_gene_ensembl" for Homo sapiens
# Cell type annotation
SPECIES = "Mm" # Currently only Hs (Homo sapiens) and Mm (Mus musculus) are available for custom cell type annotation
SCSA_SPECIES = "mouse" # Currently only human (Homo sapiens) and mouse (Mus musculus) are available for SCSA
PANGLAO_DB_PATH = "/mnt/workspace/jdetlef/sc-atac-framework/ctannot/panglao_markers"
RESOLUTIONS = "leiden_0.4" # Set to None (the Python value, not the string "None") to keep the selection of the
                           # clustering resolution from the previous notebook.
                           # You can also choose a column of the obs table which contains the clustering information.
                           # It is also possible to pass more than one column (resolution) by using a list of columns.

In [None]:
RESOLUTIONS

In [None]:
%load_ext autoreload
%autoreload 2

## Loading packages and setup

In [None]:
import sctoolbox.atac_tree as sub_tree
import sctoolbox.annotation as annot
import sctoolbox.custom_celltype_annotation as ctannot
from sctoolbox.celltype_annotation import run_scsa
import episcanpy as epi
import scanpy as sc
from matplotlib import pyplot as plt
from pybiomart import Dataset

## Setup path handling object 

In [None]:
# Create the path handling object used throughout this notebook
tree = sub_tree.ATAC_tree()
# Set the processing/output directory
tree.processing_dir = output_dir
# Set the name of the run (sample/experiment identifier)
tree.run = test

In [None]:
tree.gtf_path = GTF_PATH

## Load anndata

In [None]:
# Location of the AnnData file to load, as provided by the path handling object
clustering_output = tree.clustering_anndata
# Load the AnnData object and display its var table
adata = epi.read_h5ad(clustering_output)
adata.var

In [None]:
# Normalise RESOLUTIONS to a collection of obs column names.
#
# * a single column name (str) is wrapped in a list,
# * a list of column names is used as given,
# * None / an empty value falls back to the selection stored in
#   adata.uns['infoprocess']['leiden_res'].
#
# The string "None" is treated like None: it is truthy and would otherwise be
# used as an obs column name by every later cell.
if isinstance(RESOLUTIONS, str) and RESOLUTIONS == "None":
    RESOLUTIONS = None

if not RESOLUTIONS:
    RESOLUTIONS = adata.uns['infoprocess']['leiden_res']
elif isinstance(RESOLUTIONS, str):
    RESOLUTIONS = [RESOLUTIONS]

In [None]:
# FIX FOR THE EXT442
# Cast the peak coordinate columns of the var table to plain str / int dtypes.
for coord_column, target_dtype in (('peak_chr', str), ('peak_start', int), ('peak_end', int)):
    adata.var[coord_column] = adata.var[coord_column].astype(target_dtype)

In [None]:
adata.var

## Gene annotation 

In [None]:
# Annotate the features of adata with the genes of the GTF file.
# inplace=True is passed, so no return value is kept here; the following cells
# read the result from adata.var['gene_name'].
# NOTE(review): the exact meaning of config/best/coordinate_cols/temp_dir is
# defined by sctoolbox.annotation.annotate_adata - confirm there before changing them.
annot.annotate_adata(adata,
                   GTF_PATH,
                   config=None,
                   best=True,
                   threads=6,
                   coordinate_cols=None,
                   temp_dir="",
                   verbose=True,
                   inplace=True)

In [None]:
adata.var['gene_name']

In [None]:
# Keep only the features that have a gene assigned (rows whose gene_name is not NaN).
# dropna() returns a new Series, so assigned_features is independent of adata.var.
assigned_features = adata.var['gene_name'].dropna()
assigned_features

### Replace ensembl gene IDs with gene names if possible

In [None]:
# Connect to the Ensembl BioMart dataset selected in the settings cell
dataset = Dataset(name=ENSEMBL_DATASET,
                  host='http://www.ensembl.org')

# Fetch the gene ID / gene name pairs; the next cell reads them from the
# 'Gene stable ID' and 'Gene name' columns of the returned table
df = dataset.query(attributes=['ensembl_gene_id', 'external_gene_name'])

In [None]:
# Lookup table: Ensembl gene ID -> gene name
ens_dict = dict(zip(df['Gene stable ID'], df['Gene name']))

In [None]:
# Translate Ensembl gene IDs among the assigned features into gene names.
#
# A value is replaced when it is a key of ens_dict and the mapped value is a
# string (IDs without a gene name are left untouched). The ID is looked up
# directly instead of testing for the "ENSG" prefix: that prefix only matches
# human IDs, so nothing was replaced for other datasets (mouse gene IDs start
# with "ENSMUSG").
replacements = {
    gene: ens_dict[gene]
    for gene in assigned_features.unique()
    if isinstance(ens_dict.get(gene), str)
}


def _to_gene_name(gene):
    """Return the gene name of a replaceable Ensembl ID, otherwise the value unchanged."""
    return replacements.get(gene, gene)


# Number of features (not unique IDs) whose value gets replaced
count = int(assigned_features.isin(list(replacements)).sum())

if replacements:
    # One pass over the values instead of one Series.replace() call per feature
    assigned_features = assigned_features.astype(object).map(_to_gene_name)
    # assigned_features is a copy created by dropna(), so the new names have to
    # be written back to the var table; otherwise the gene index built from
    # adata.var['gene_name'] later on still contains the Ensembl IDs.
    adata.var['gene_name'] = adata.var['gene_name'].astype(object).map(_to_gene_name)

print(f'{count} ensembl gene ids have been replaced with gene names')

In [None]:
assigned_features

In [None]:
# Restrict adata to the features that have a gene assigned
assigned_adata = adata[:, assigned_features.index]
assigned_adata

## Rank peaks

In [None]:
peaks_adata = assigned_adata.copy()

In [None]:
# Reset the stored log1p base to None before ranking.
# NOTE(review): presumably a workaround for the "KeyError: 'base'" raised by the
# scanpy ranking functions after an AnnData object was reloaded from disk - confirm.
# This line itself fails with a KeyError if uns has no 'log1p' entry.
peaks_adata.uns['log1p']['base'] = None

In [None]:
# Rank the peaks of the clusters once per selected clustering column.
# The result of each run is stored under its own key 'rank_features_groups_<column>'.
for column in RESOLUTIONS:
    groupby_column = f'{column}'
    result_key = f'rank_features_groups_{column}'
    print(f'Ranking peaks for clusters using obs column {column}')
    epi.tl.rank_features(
        peaks_adata,
        groupby_column,
        omic='ATAC',
        key_added=result_key,
        n_features=100,
        use_raw=False,
    )

print('Done')

## Rank genes
Replace the peak names in the var index with the assigned gene names

In [None]:
# Build a gene-indexed copy of the peak-level object
genes_adata = peaks_adata.copy()
genes_adata.var.reset_index(inplace=True)  # remove peaks from index and save them in the column ['index']
genes_adata.var.set_index('gene_name',inplace=True)  # set genes as index
genes_adata.var.index = genes_adata.var.index.astype('str')  # to avoid index being categorical
# Several peaks can be assigned to the same gene; make the resulting duplicate names unique
genes_adata.var_names_make_unique(join='_')
# Keep the current state of the object in .raw
genes_adata.raw = genes_adata

In [None]:
# Rank the genes of the clusters once per selected clustering column.
# The result of each run is stored under its own key 'rank_genes_groups_<column>'.
for column in RESOLUTIONS:
    groupby_column = f'{column}'
    result_key = f'rank_genes_groups_{column}'
    print(f'Ranking genes groups for clusters using obs column {column}')
    sc.tl.rank_genes_groups(
        genes_adata,
        groupby=groupby_column,
        use_raw=False,
        key_added=result_key,
    )

print('Done')

### Plot ranked genes

In [None]:
# Show the ranked genes of every selected clustering column.
for column in RESOLUTIONS:
    print(f'Plotting ranked genes groups for clustering column {column}')
    sc.pl.rank_genes_groups(genes_adata, key=f'rank_genes_groups_{column}', show=False)
    # TODO: optionally save the figure before showing it, e.g.
    # if save_figs:
    #    plt.savefig(f'{OUTPUT_FIGS}/ranked_genes_groups_leiden_{res}.png')
    plt.show()

In [None]:
# Show a matrixplot of the top 10 ranked genes for every selected clustering column.
for column in RESOLUTIONS:
    print(f'Plotting matrixplot for clustering column {column}')
    sc.pl.rank_genes_groups_matrixplot(
        genes_adata,
        standard_scale='var',
        n_genes=10,
        key=f'rank_genes_groups_{column}',
        show=False,
    )
    # TODO: optionally save the figure before showing it, e.g.
    # if save_figs:
    #    plt.savefig(f'{OUTPUT_FIGS}/ranked_genes_groups_matrixplot_leiden_{res}.png')
    plt.show()

## Cell type annotation

In [None]:
# Run the custom cell type annotation once per selected clustering column.
# The ranked genes are read from 'rank_genes_groups_<column>' and the result is
# requested in the obs column 'cell_types_<column>'.
for column in RESOLUTIONS:
    ranked_genes_key = f"rank_genes_groups_{column}"
    cell_type_key = f"cell_types_{column}"
    ctannot.annot_ct(
        adata=adata,
        genes_adata=genes_adata,
        output_path=tree.annotation_dir,
        db_path=f"{PANGLAO_DB_PATH}",
        cluster_path=None,
        cluster_column=f"{column}",
        rank_genes_column=ranked_genes_key,
        sample=tree.run,
        ct_column=cell_type_key,
        tissue="all",
        db="panglao",
        inplace=True,
        species=SPECIES,
    )

In [None]:
adata.obs

In [None]:
# Plot the annotated cell types next to the clustering they were derived from.
for column in RESOLUTIONS:
    umap_panels = [f'cell_types_{column}', f'{column}']
    sc.pl.umap(adata, color=umap_panels, wspace=0.5)

In [None]:
# Display the annotation tables stored in the annotation directory.
# NOTE(review): resolution=0.2 and clustering_column="leiden" are hard-coded, while the
# settings cell selects "leiden_0.4" - presumably these should match the annotated
# clustering column; confirm against ctannot.show_tables.
ctannot.show_tables(annotation_dir=tree.annotation_dir, resolution=0.2, clustering_column="leiden")

### Modify wrong cluster annotations by selecting other recommendations

In [None]:
change_annotation = input('Do you want to change the annotation? answer with yes or no: ')

In [None]:
# Pick the clustering column whose cell type annotation you want to modify.
# It defaults to the first entry of RESOLUTIONS, so it follows the settings cell
# instead of a hard-coded "leiden_0.4"; use another entry of RESOLUTIONS to
# modify a different clustering.
clustering_column = str(RESOLUTIONS[0])
# The annotation cell above requests its result in "cell_types_<clustering column>"
cell_type_column = f"cell_types_{clustering_column}"

# Accept the answer regardless of capitalisation or surrounding whitespace
if change_annotation.strip().lower() == 'yes':
    ctannot.modify_ct(adata=adata, annotation_dir=tree.annotation_dir, clustering_column=clustering_column,
                      cell_type_column=cell_type_column, inplace=True)

### SCSA cell type annotation

In [None]:
# Run the SCSA cell type annotation once per selected clustering column, copy the
# predicted cell types over to adata and plot them next to the custom annotation.
for column in RESOLUTIONS:
    ranked_genes_key = f'rank_genes_groups_{column}'
    scsa_column = f'SCSA_pred_celltype_{column}'

    run_scsa(
        genes_adata,
        gene_column=None,
        gene_symbol='auto',
        key=ranked_genes_key,
        column_added=scsa_column,
        inplace=True,
        python_path=None,
        scsa_path=None,
        wholedb_path=None,
        species=SCSA_SPECIES,
        fc=1.5,
        pvalue=0.01,
        tissue='All',
        celltype='normal',
        user_db=None,
        z_score='best',
    )

    # The prediction is added to genes_adata; transfer it to the main object
    adata.obs[scsa_column] = genes_adata.obs[scsa_column]
    sc.pl.umap(adata, color=[scsa_column, f'cell_types_{column}'], wspace=0.5)


## Save notebook and adata object

In [None]:
# Copy this notebook into the annotation directory.
import os
import shutil
# The notebook is looked up in the current working directory under the file name
# given below, so both must match where and how this notebook is actually stored.
repo_path = os.getcwd()
notebook_name = '5_annotation.ipynb'
notebook_path = os.path.join(repo_path, notebook_name)
notebook_copy = os.path.join(tree.annotation_dir , notebook_name)
# copyfile overwrites an existing copy in the annotation directory
shutil.copyfile(notebook_path, notebook_copy)

In [None]:
# Write the annotated AnnData object to the path provided by the path handling object
adata_output = tree.annotation_anndata
adata.write(filename=adata_output)