# Annotation

## Settings 

In [None]:
# Path related settings (these should be the same as for the previous notebook)
# Processing/output directory; assigned to `tree.processing_dir` further down.
output_dir = '/mnt/workspace/jdetlef/ext_ana/processed/'
# Name of the sample/run to annotate; assigned to `tree.run` further down.
test = 'Esophagus'

# Gene annotation
# GTF file handed to `annot.annotate_adata` and registered as `tree.gtf_path`.
# NOTE(review): this is a mus_musculus GTF, while the BioMart lookup further down
# queries 'hsapiens_gene_ensembl' -- confirm which species is intended.
GTF_PATH = "/mnt/workspace/jdetlef/ext_ana/mus_musculus.104.genes.gtf"
# Cell type annotation
# Passed to `ctannot.annot_ct` as `db_path` (presumably the PanglaoDB marker lists).
PANGLAO_DB_PATH = "/mnt/workspace/jdetlef/sc-atac-framework/ctannot/panglao_markers"

# IPython autoreload, mode 2: modules are reloaded before each cell is executed,
# so edits to the imported packages are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Loading packages and setup

In [None]:
import sctoolbox.atac_tree as sub_tree
import sctoolbox.annotation as annot
import sctoolbox.custom_celltype_annotation as ctannot
import episcanpy as epi
import scanpy as sc
from matplotlib import pyplot as plt
from pybiomart import Dataset

## Setup path handling object 

In [None]:
# Create the path-handling object; later cells read their input/output locations
# (clustering_anndata, annotation_dir, annotation_anndata) from it.
tree = sub_tree.ATAC_tree()
# Set the processing/output directory.
tree.processing_dir = output_dir
# Select the sample/experiment (run) to work on.
tree.run = test

In [None]:
# Register the gene annotation (GTF) file on the path-handling object.
tree.gtf_path = GTF_PATH

## Load anndata

In [None]:
# Load the AnnData file whose location is provided by the path-handling object
# (`tree.clustering_anndata`) and display its per-feature table.
clustering_output = tree.clustering_anndata
adata = epi.read_h5ad(clustering_output)
adata.var

In [None]:
#adata.var.index = adata.var.index.str.replace('_',' ')
    

In [None]:
# FIX FOR THE EXT442
# Cast the peak coordinate columns to plain dtypes:
# chromosome name as string, start and end position as integers.
for coordinate_column, target_dtype in (
    ('peak_chr', str),
    ('peak_start', int),
    ('peak_end', int),
):
    adata.var[coordinate_column] = adata.var[coordinate_column].astype(target_dtype)

In [None]:
# Display the per-feature table again to inspect the result of the dtype casts.
adata.var

## Gene annotation 

In [None]:
# Annotate the features of `adata` with genes from the GTF file.
# With inplace=True the annotation is written into `adata` itself; later cells
# read the resulting `adata.var['gene_name']` column.
annotation_options = {
    'config': None,
    'best': True,
    'threads': 6,
    'coordinate_cols': None,
    'temp_dir': "",
    'verbose': True,
    'inplace': True,
}
annot.annotate_adata(adata, GTF_PATH, **annotation_options)

In [None]:
# Display the gene name assigned to each feature by the annotation step.
adata.var['gene_name']

In [None]:
# Keep only the features that received a gene name (missing values dropped).
# `dropna()` returns a new Series, so later changes to `assigned_features`
# do not modify `adata.var['gene_name']`.
assigned_features = adata.var['gene_name'].dropna()
assigned_features

In [None]:
# Query Ensembl BioMart for the table of Ensembl gene IDs and gene names.
# NOTE(review): this queries the human dataset ('hsapiens_gene_ensembl') while
# GTF_PATH points to a mus_musculus GTF and annot_ct is called with species='Mm'
# -- confirm which species is intended.
dataset = Dataset(name='hsapiens_gene_ensembl',
                  host='http://www.ensembl.org')

# The returned frame is read below via its 'Gene stable ID' / 'Gene name' columns.
df = dataset.query(attributes=['ensembl_gene_id', 'external_gene_name'])

In [None]:
# Lookup table: Ensembl gene ID -> gene name (the last entry wins on duplicate IDs).
id_name_pairs = df[['Gene stable ID', 'Gene name']].itertuples(index=False, name=None)
ens_dict = dict(id_name_pairs)

In [None]:
# Translate Ensembl gene IDs (entries starting with "ENSG") in `assigned_features`
# into gene names using `ens_dict`.
#
# This is done in one vectorised pass instead of calling `Series.replace` on the
# whole Series once per matching entry while iterating over that same Series.
#
# NOTE(review): `assigned_features` is a separate Series (result of `dropna()`),
# so the translated names are NOT written back to `adata.var['gene_name']`; the
# cells below only use `assigned_features.index`. Confirm whether the names
# should be propagated into `adata.var`.
# NOTE(review): the "ENSG" prefix does not match mouse IDs ("ENSMUSG...") --
# verify against the GTF that is actually used.
gene_ids = assigned_features.astype(object)
gene_symbols = gene_ids.map(ens_dict)

# Only rewrite entries that are Ensembl IDs AND have a usable (string) gene name;
# IDs missing from `ens_dict` or mapped to a non-string (e.g. NaN) stay unchanged.
has_symbol = gene_symbols.map(lambda symbol: isinstance(symbol, str)).astype(bool)
replaceable = gene_ids.str.startswith("ENSG", na=False) & has_symbol

assigned_features = gene_ids.where(~replaceable, gene_symbols)

# Number of entries whose ID was replaced by a gene name.
count = int(replaceable.sum())
print(f'{count} ensembl gene ids have been replaced with gene names')

In [None]:
# Display the gene names after the Ensembl ID replacement step.
assigned_features

In [None]:
# Restrict the data to the features that have a gene name assigned
# (selected by the index of `assigned_features`) and display the result.
assigned_adata = adata[:, assigned_features.index]
assigned_adata

In [None]:
# Work on an independent copy, so the ranking results and `uns` changes made
# below are stored on `peaks_adata` and not on `adata`.
peaks_adata = assigned_adata.copy()

## Rank peaks

In [None]:
# Display the 'clustering' column of the per-cell table.
adata.obs['clustering']


In [None]:
# Display the unstructured annotations (`uns`) stored with the AnnData object.
adata.uns

In [None]:
# Make sure uns['log1p'] carries a 'base' entry (set to None) before ranking.
# `setdefault` avoids a KeyError when the object has no 'log1p' entry at all.
# NOTE(review): presumably a workaround for the ranking functions expecting
# uns['log1p']['base'] on objects reloaded from .h5ad -- confirm.
peaks_adata.uns.setdefault('log1p', {})['base'] = None

In [None]:
# TODO save ALL clustering resolutions in infoprocess!
# Workaround: hard-code the resolutions 0.1 ... 0.9 as strings; they are used
# below as suffixes of the 'leiden_<resolution>' column names.
resolutions = [f'0.{tenth}' for tenth in range(1, 10)]

In [None]:
# Rank the peaks per leiden cluster, once for every clustering resolution.
# Each result is stored under its own key ('rank_features_groups_<resolution>').
for resolution in resolutions:
    cluster_column = f'leiden_{resolution}'
    result_key = f'rank_features_groups_{resolution}'
    print(f'Ranking peaks for leiden clusters with resolution: {resolution}')
    epi.tl.rank_features(
        peaks_adata,
        cluster_column,
        omic='ATAC',
        key_added=result_key,
        n_features=100,
        use_raw=False,
    )

print('Done')

## Rank genes
Replace the peak identifiers in the feature index with their assigned gene names.

In [None]:
# Build a gene-indexed copy of the peak data: same matrix, but the feature
# index holds gene names instead of peak identifiers.
genes_adata = peaks_adata.copy()
genes_adata.var.reset_index(inplace=True)  # remove peaks from index and save them in the column ['index']
genes_adata.var.set_index('gene_name',inplace=True)  # set genes as index
genes_adata.var.index = genes_adata.var.index.astype('str')  # to avoid index being categorical
# Several peaks can map to the same gene; duplicated names get a '_<n>' suffix.
genes_adata.var_names_make_unique(join='_')
# Store the current state as `.raw` (the ranking below is run with use_raw=False).
genes_adata.raw = genes_adata

In [None]:
# Rank the genes per leiden cluster, once for every clustering resolution.
# Each result is stored under its own key ('rank_genes_groups_<resolution>').
for resolution in resolutions:
    print(f'Ranking genes groups for leiden clusters with resolution: {resolution}')
    sc.tl.rank_genes_groups(
        genes_adata,
        groupby=f'leiden_{resolution}',
        use_raw=False,
        key_added=f'rank_genes_groups_{resolution}',
    )

print('Done')

### Plot ranked genes

In [None]:
# Plot the ranked genes per cluster for every clustering resolution.
for resolution in resolutions:
    print(f'Plotting ranked genes groups for leiden with resolution: {resolution}')
    plot_key = f'rank_genes_groups_{resolution}'
    # show=False leaves the figure open until plt.show() below
    # (a save step is still to be added, see TODO).
    sc.pl.rank_genes_groups(genes_adata, key=plot_key, show=False)
    # TODO
    # if save_figs:
    #    plt.savefig(f'{OUTPUT_FIGS}/ranked_genes_groups_leiden_{resolution}.png')
    plt.show()

In [None]:
# Draw a matrix plot of the top 10 ranked genes per cluster for every resolution.
for resolution in resolutions:
    print(f'Plotting matrixplot for leiden with resolution: {resolution}')
    plot_key = f'rank_genes_groups_{resolution}'
    sc.pl.rank_genes_groups_matrixplot(
        genes_adata,
        standard_scale='var',
        n_genes=10,
        key=plot_key,
        show=False,
    )
    # TODO
    # if save_figs:
    #    plt.savefig(f'{OUTPUT_FIGS}/ranked_genes_groups_matrixplot_leiden_{resolution}.png')
    plt.show()

In [None]:
# Run the cell type annotation for every clustering resolution.
# For each resolution, the matching leiden column and gene ranking are passed
# in, and the cell type column is named 'cell_types_<resolution>'.
for resolution in resolutions:
    ctannot.annot_ct(
        adata=adata,
        genes_adata=genes_adata,
        output_path=tree.annotation_dir,
        db_path=f"{PANGLAO_DB_PATH}",
        cluster_path=None,
        cluster_column=f"leiden_{resolution}",
        rank_genes_column=f"rank_genes_groups_{resolution}",
        sample=tree.run,
        ct_column=f"cell_types_{resolution}",
        tissue="all",
        db="panglao",
        inplace=True,
        species='Mm',
    )

In [None]:
# Display the per-cell table to inspect the columns added by the annotation.
adata.obs

In [None]:
# For every resolution, show the UMAP coloured by the assigned cell types
# next to the same UMAP coloured by the leiden clusters.
for resolution in resolutions:
    panel_columns = [f'cell_types_{resolution}', f'leiden_{resolution}']
    sc.pl.umap(adata, color=panel_columns, wspace=0.5)

In [None]:
# Presumably shows the annotation tables stored in `tree.annotation_dir` for the
# leiden clustering at resolution 0.1.
# NOTE(review): the resolution is hard-coded here and passed as a float, whereas
# `resolutions` holds strings -- adjust it to the resolution of interest.
ctannot.show_tables(annotation_dir=tree.annotation_dir, resolution=0.1, clustering_column="leiden")

## Modify wrong cluster annotations by selecting other recommendations

In [None]:
# Ask interactively whether the automatic annotation should be modified;
# the answer is checked for 'yes' in the next cell.
change_annotation = input('Do you want to change the annotation? answer with yes or no:')

In [None]:
# Modify the annotation only when the user explicitly agreed.
# The answer is normalised (surrounding whitespace removed, lower-cased) so that
# inputs such as "Yes" or "yes " are not silently treated as "no".
if change_annotation.strip().lower() == 'yes':
    ctannot.modify_ct(
        adata=adata,
        resolutions=resolutions,
        annotation_dir=tree.annotation_dir,
        clustering_column="leiden",
        cell_type_column="cell_types",
        inplace=True,
    )

In [None]:
# Compare the modified annotation with the automatic one at resolution 0.1.
# 'cell_types_mod_0.1' is presumably only created by `ctannot.modify_ct`, i.e.
# when the question above was answered with "yes". Only columns that exist in
# `adata.obs` are plotted, so the cell does not fail when no modification was made.
umap_columns = [
    column
    for column in ('cell_types_mod_0.1', 'cell_types_0.1')
    if column in adata.obs.columns
]
if umap_columns:
    sc.pl.umap(adata, color=umap_columns, wspace=0.5)
else:
    print('No cell type annotation columns for resolution 0.1 found in adata.obs')

In [None]:
# Write the annotated AnnData object to the location provided by the
# path-handling object (`tree.annotation_anndata`).
adata_output = tree.annotation_anndata
adata.write(filename=adata_output)

In [None]:
import os
import shutil

# Keep a copy of this notebook next to the annotation results.
# The notebook is looked up in the current working directory.
notebook_name = '5_annotation.ipynb'
repo_path = os.getcwd()
notebook_path, notebook_copy = (
    os.path.join(folder, notebook_name)
    for folder in (repo_path, tree.annotation_dir)
)
shutil.copyfile(notebook_path, notebook_copy)