# Annotation

## Settings 

In [2]:
# Path related settings (these should be the same as for the previous notebook)
output_dir = '/mnt/workspace/jdetlef/ext_ana/processed'  # assigned to tree.processing_dir below
test = 'all'  # assigned to tree.run below (sample/experiment name)

# Gene annotation
GTF_PATH = "/mnt/flatfiles/organisms/new_organism/homo_sapiens/104/homo_sapiens.104.genes.gtf"
ENSEMBL_DATASET = "auto" # This is used for replacing Ensembl gene IDs with gene names if possible.
                         # Use "auto" to infer the species from the Ensembl ID, or give the dataset prefix:
                         # first letter of every word plus the complete last word, e.g. hsapiens, mmusculus.

# Cell type annotation
SPECIES = "Hs" # Currently only Hs (Homo sapiens) and Mm (Mus musculus) are available for custom cell type annotation
SCSA_SPECIES = None # Currently only human (Homo sapiens) and mouse (Mus musculus) are available for SCSA.
                    # Use None when a custom DB file is given.
# Optional custom database for SCSA
# NOTE(review): this path contains "mouse" while GTF_PATH and SPECIES above are human - confirm this is intended.
SCSA_CUSTOM_DB_PATH = "/mnt/workspace/mkessle/projects/marker_repo/dbs/mouse/panglao_all_mandatory_2020"
# Path to panglao markers
PANGLAO_DB_PATH = "/mnt/workspace/jdetlef/panglao_markers"

# The clustering resolutions which will be annotated
RESOLUTIONS = "leiden_0.4" # Set to None (the Python value, not the string "None") to keep the clustering
                           # resolution selected in the previous notebook.
                           # Otherwise give the name of an obs column that contains the clustering information,
                           # or a list of such column names to annotate several resolutions.

MODIFY_CELL_TYPES = False # If True you get the opportunity to manually modify the automatically annotated cell types

# Save figures
# NOTE(review): in this notebook the flag only appears in TODO comments of the plotting cells - confirm whether
# figure saving is still to be implemented.
save_figs = True

## Loading packages and setup

In [2]:
import sctoolbox.atac_tree as sub_tree
import sctoolbox.annotation as annot
import sctoolbox.custom_celltype_annotation as ctannot
import sctoolbox.utilities as utils
from sctoolbox.celltype_annotation import run_scsa
import pandas as pd
import episcanpy as epi
import scanpy as sc
from matplotlib import pyplot as plt
import apybiomart
import os
import shutil

## Check files 

In [3]:
# Report for every configured input path whether it points to an existing file.
# A missing input is only reported here; nothing is raised.
files = [GTF_PATH, SCSA_CUSTOM_DB_PATH, PANGLAO_DB_PATH]
for file in files:
    if not os.path.isfile(file):
        print(f"Please make sure that your input is correct. {file} does not exist.")
        continue
    print(f"{file} exists.")

/mnt/flatfiles/organisms/new_organism/homo_sapiens/104/homo_sapiens.104.genes.gtf exists.
/mnt/workspace/mkessle/projects/marker_repo/dbs/mouse/panglao_all_mandatory_2020 exists.
/mnt/workspace/jdetlef/panglao_markers exists.


## Setup path handling object 

In [4]:
# Create the path handling object for the ATAC workflow
tree = sub_tree.ATAC_tree()
# Set the processing/output directory (same directory as in the previous notebook).
# NOTE(review): each of the two assignments below appears to print a directory status line
# (see the cell output), so they presumably have side effects and their order should be kept - confirm.
tree.processing_dir = output_dir
# Select the sample/experiment (run) inside the processing directory
tree.run = test

all directories existing
all directories existing


In [5]:
# Store the configured GTF file on the path handling object
tree.gtf_path = GTF_PATH

## Load anndata

In [6]:
# Read the AnnData file that the path handling object exposes as `clustering_anndata`
# and display its .var table.
adata = epi.read_h5ad(tree.clustering_anndata)
adata.var

Unnamed: 0,peak_chr,peak_start,peak_end,n_cells_by_counts,mean_counts,log1p_mean_counts,pct_dropout_by_counts,total_counts,log1p_total_counts,n_cells,commonness,highly_variable
chr1:10005-10731,chr1,10005,10731,277,0.025136,0.024825,97.486388,277.0,5.627621,277,410.0,True
chr1:28730-29439,chr1,28730,29439,15,0.001361,0.001360,99.863884,15.0,2.772589,15,22.0,True
chr1:91328-91482,chr1,91328,91482,4,0.000363,0.000363,99.963702,4.0,1.609438,4,6.0,False
chr1:139088-139266,chr1,139088,139266,2,0.000181,0.000181,99.981851,2.0,1.098612,2,3.0,False
chr1:180710-182007,chr1,180710,182007,386,0.035027,0.034428,96.497278,386.0,5.958425,386,568.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...
chr9:138203969-138204112,chr9,138203969,138204112,6,0.000544,0.000544,99.945554,6.0,1.945910,6,7.0,True
chr9:138209358-138209542,chr9,138209358,138209542,6,0.000544,0.000544,99.945554,6.0,1.945910,6,8.0,True
chr9:138213762-138214121,chr9,138213762,138214121,34,0.003085,0.003081,99.691470,34.0,3.555348,34,43.0,True
chr9:138215567-138215868,chr9,138215567,138215868,1,0.000091,0.000091,99.990926,1.0,0.693147,1,3.0,False


In [7]:
# Normalise RESOLUTIONS to a collection of obs column names that the cells below can iterate over.
if not RESOLUTIONS:
    # Nothing configured: fall back to the value stored by the previous notebook.
    # NOTE(review): assumes uns['infoprocess']['leiden_res'] holds obs column name(s) - confirm.
    RESOLUTIONS = adata.uns['infoprocess']['leiden_res']

# A single column name (configured or loaded from uns) is wrapped into a list; otherwise the
# loops below would iterate over its characters. isinstance also accepts str subclasses,
# which an exact type comparison would miss.
if isinstance(RESOLUTIONS, str):
    RESOLUTIONS = [RESOLUTIONS]

## Check columns

In [8]:
# Tell the user for each requested clustering column whether it exists in adata.obs.
for column in RESOLUTIONS:
    if column not in adata.obs:
        print(f"{column} was not found in the obs table. Please pick a valid column instead.")
        continue
    print(f"The obs table contains the column {column}.")

The obs table contains the column leiden_0.4.


In [9]:
# FIX FOR THE EXT442
adata.var['peak_chr'] = adata.var['peak_chr'].astype(str)
adata.var['peak_start'] = adata.var['peak_start'].astype(int)
adata.var['peak_end'] = adata.var['peak_end'].astype(int)

In [10]:
# Display the .var table again to inspect the peak coordinate columns
adata.var

Unnamed: 0,peak_chr,peak_start,peak_end,n_cells_by_counts,mean_counts,log1p_mean_counts,pct_dropout_by_counts,total_counts,log1p_total_counts,n_cells,commonness,highly_variable
chr1:10005-10731,chr1,10005,10731,277,0.025136,0.024825,97.486388,277.0,5.627621,277,410.0,True
chr1:28730-29439,chr1,28730,29439,15,0.001361,0.001360,99.863884,15.0,2.772589,15,22.0,True
chr1:91328-91482,chr1,91328,91482,4,0.000363,0.000363,99.963702,4.0,1.609438,4,6.0,False
chr1:139088-139266,chr1,139088,139266,2,0.000181,0.000181,99.981851,2.0,1.098612,2,3.0,False
chr1:180710-182007,chr1,180710,182007,386,0.035027,0.034428,96.497278,386.0,5.958425,386,568.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...
chr9:138203969-138204112,chr9,138203969,138204112,6,0.000544,0.000544,99.945554,6.0,1.945910,6,7.0,True
chr9:138209358-138209542,chr9,138209358,138209542,6,0.000544,0.000544,99.945554,6.0,1.945910,6,8.0,True
chr9:138213762-138214121,chr9,138213762,138214121,34,0.003085,0.003081,99.691470,34.0,3.555348,34,43.0,True
chr9:138215567-138215868,chr9,138215567,138215868,1,0.000091,0.000091,99.990926,1.0,0.693147,1,3.0,False


## Gene annotation 

In [11]:
# Annotate the peaks in adata.var with the genes from the GTF file. With inplace=True the
# result is written into adata; the 'gene_name' column is displayed afterwards.
# NOTE(review): the recorded output of this cell ends with "[Errno 2] No such file or directory: ''",
# which presumably stems from the empty temp_dir - confirm and pass a real directory if needed.
annot.annotate_adata(adata,
                   GTF_PATH,
                   config=None,
                   best=True,
                   threads=6,
                   coordinate_cols=None,
                   temp_dir="",
                   verbose=True,
                   inplace=True)

# Peaks without an assigned gene show up as NaN in this column
adata.var['gene_name']

Setting up annotation configuration...
Config dictionary: {'queries': [{'distance': [10000, 1000], 'feature_anchor': ['start'], 'feature': ['gene'], 'name': 'promoters'}], 'priority': True, 'show_attributes': ['all'], 'output_by_query': False}
Setting up genomic regions to annotate...
Index(['peak_chr', 'peak_start', 'peak_end'], dtype='object')
Preparing gtf file for annotation...
- Reading gtf with Tabix
- Index of gtf not found - trying to index gtf
- Reading gtf with Tabix
Done preparing gtf!
Annotating regions...
Formatting annotations...
Finished annotation of features! The results are found in the .var table.
[Errno 2] No such file or directory: ''


chr1:10005-10731                DDX11L1
chr1:28730-29439            MIR1302-2HG
chr1:91328-91482                    NaN
chr1:139088-139266                  NaN
chr1:180710-182007             DDX11L17
                               ...     
chr9:138203969-138204112            NaN
chr9:138209358-138209542            NaN
chr9:138213762-138214121        FAM157B
chr9:138215567-138215868        FAM157B
chr9:138232071-138232356            NaN
Name: gene_name, Length: 166597, dtype: category
Categories (20839, object): ['5S_rRNA', '5_8S_rRNA', '7SK', 'A1BG-AS1', ..., 'ZYG11B', 'ZYX', 'ZZZ3', 'hsa-mir-1253']

### Remove NaN

In [12]:
# Keep only the peaks that were assigned a gene name.
assigned_features = adata.var['gene_name'].dropna()
# Subsetting an AnnData object returns a view. Later cells write into this object
# (cell type columns in .obs), so materialise the subset with .copy() instead of
# relying on an implicit view-to-copy conversion on first write.
adata = adata[:, assigned_features.index].copy()
adata.var['gene_name']

chr1:10005-10731                DDX11L1
chr1:28730-29439            MIR1302-2HG
chr1:180710-182007             DDX11L17
chr1:191767-191866            MIR6859-2
chr1:199219-200319               WASH9P
                               ...     
chr9:137877669-137877793        CACNA1B
chr9:138147665-138147786         TUBBP5
chr9:138150009-138150225         TUBBP5
chr9:138213762-138214121        FAM157B
chr9:138215567-138215868        FAM157B
Name: gene_name, Length: 37653, dtype: category
Categories (20839, object): ['5S_rRNA', '5_8S_rRNA', '7SK', 'A1BG-AS1', ..., 'ZYG11B', 'ZYX', 'ZZZ3', 'hsa-mir-1253']

## Unify genes column

In [13]:
# Unify the 'gene_name' column (Ensembl gene IDs are replaced by gene names where possible,
# see ENSEMBL_DATASET in the settings). With inplace=False the result is returned as
# `assigned_adata`, which the following cells build on.
# NOTE(review): the recorded run of this cell failed with
# "_BiomartException: No internet connection available!", so this step apparently needs
# network access to Biomart - run it from a machine with internet access.
assigned_adata = utils.unify_genes_column(
    adata,
    column="gene_name",
    unified_column="gene_name",
    species=ENSEMBL_DATASET,
    inplace=False
)

_BiomartException: No internet connection available!

## Rank peaks

In [None]:
# Work on a copy so that the peak ranking results do not end up in assigned_adata
peaks_adata = assigned_adata.copy()

In [None]:
# Reset the stored log1p base before ranking.
# NOTE(review): presumably the usual workaround for the KeyError: 'base' that scanpy's ranking
# raises on objects read back from h5ad - confirm.
# Only touch the entry when it exists, so objects without a 'log1p' record do not raise a KeyError here.
if 'log1p' in peaks_adata.uns:
    peaks_adata.uns['log1p']['base'] = None

In [None]:
# Rank the peaks of every cluster, once per requested clustering column.
# Results are stored under 'rank_features_groups_<column>'.
for column in RESOLUTIONS:
    print(f'Ranking peaks for clusters using obs column {column}')
    ranking_key = f'rank_features_groups_{column}'
    epi.tl.rank_features(
        peaks_adata,
        f'{column}',
        omic='ATAC',
        key_added=ranking_key,
        n_features=100,
        use_raw=False,
    )

print('Done')

## Rank genes
Replace peaks with gene names

In [None]:
# Build a gene-indexed copy of the peak-level object. The statements below modify the
# .var table step by step and depend on this exact order.
genes_adata = peaks_adata.copy()
genes_adata.var.reset_index(inplace=True)  # move the peak names out of the index into a regular column (named 'index' if the index is unnamed)
genes_adata.var.set_index('gene_name',inplace=True)  # set genes as index
genes_adata.var.index = genes_adata.var.index.astype('str')  # to avoid index being categorical
# Several peaks can map to the same gene; duplicates get a '_<n>' suffix appended
genes_adata.var_names_make_unique(join='_')
# Keep the current state of the object in .raw as well
genes_adata.raw = genes_adata

In [None]:
# Rank the genes of every cluster, once per requested clustering column.
# Results are stored under 'rank_genes_groups_<column>'.
for column in RESOLUTIONS:
    print(f'Ranking genes groups for clusters using obs column {column}')
    ranking_key = f'rank_genes_groups_{column}'
    sc.tl.rank_genes_groups(
        genes_adata,
        groupby=f'{column}',
        use_raw=False,
        key_added=ranking_key,
    )

print('Done')

### Plot ranked genes

In [None]:
# Show the ranked genes of every cluster for each clustering column.
for column in RESOLUTIONS:
    print(f'Plotting ranked genes groups for clustering column {column}')
    plot_key = f'rank_genes_groups_{column}'
    # show=False so that the figure is still available before plt.show()
    sc.pl.rank_genes_groups(
        genes_adata,
        key=plot_key,
        show=False,
    )
    # TODO: save the figure here when save_figs is set
    #       (an output directory for figures still has to be defined), e.g.
    #       plt.savefig(f'{OUTPUT_FIGS}/ranked_genes_groups_leiden_{res}.png')
    plt.show()

In [None]:
# Show a matrixplot of the top 10 ranked genes per cluster for each clustering column.
for column in RESOLUTIONS:
    print(f'Plotting matrixplot for clustering column {column}')
    plot_key = f'rank_genes_groups_{column}'
    # show=False so that the figure is still available before plt.show()
    sc.pl.rank_genes_groups_matrixplot(
        genes_adata,
        standard_scale='var',
        n_genes=10,
        key=plot_key,
        show=False,
    )
    # TODO: save the figure here when save_figs is set
    #       (an output directory for figures still has to be defined), e.g.
    #       plt.savefig(f'{OUTPUT_FIGS}/ranked_genes_groups_matrixplot_leiden_{res}.png')
    plt.show()

## Cell type annotation

In [None]:
# Run the custom (panglao based) cell type annotation for every clustering column.
# With inplace=True the annotation is written into adata under 'cell_types_<column>'.
for column in RESOLUTIONS:
    ctannot.annot_ct(
        adata=adata,
        genes_adata=genes_adata,
        output_path=tree.annotation_dir,
        db_path=f"{PANGLAO_DB_PATH}",
        cluster_path=None,
        cluster_column=f"{column}",
        rank_genes_column=f"rank_genes_groups_{column}",
        sample=tree.run,
        ct_column=f"cell_types_{column}",
        tissue="all",
        db="panglao",
        inplace=True,
        species=SPECIES,
    )

In [None]:
# UMAPs coloured by the annotated cell types and by the underlying clustering
for column in RESOLUTIONS:
    umap_colors = [f'cell_types_{column}', f'{column}']
    sc.pl.umap(adata, color=umap_colors, wspace=0.5)

In [None]:
# Show the annotation tables (n=5) for every clustering column
for column in RESOLUTIONS:
    print(f"Tables of cell type annotation with clustering {column}")
    ctannot.show_tables(
        annotation_dir=tree.annotation_dir,
        n=5,
        clustering_column=column,
    )

### Modify wrong cluster annotations by selecting other recommendations

In [None]:
if MODIFY_CELL_TYPES:
    # Pick the clustering column whose annotation should be modified. It defaults to the
    # first entry of RESOLUTIONS so that it cannot go stale when the settings change;
    # replace it with another obs column name to modify a different resolution.
    clustering_column = str(RESOLUTIONS[0])
    # The matching cell type column follows the 'cell_types_<column>' naming used by the annotation above
    cell_type_column = f"cell_types_{clustering_column}"

    ctannot.modify_ct(adata=adata, annotation_dir=tree.annotation_dir, clustering_column=clustering_column,
                      cell_type_column=cell_type_column, inplace=True)

### SCSA cell type annotation

Remove the underscore suffixes (added above by `var_names_make_unique(join='_')`) from the gene names

In [None]:
# Undo the '_<n>' suffixes that var_names_make_unique(join='_') appended to duplicated genes.
# Only a trailing underscore followed by digits is removed; splitting at the first underscore
# would also truncate real gene symbols that contain one (e.g. '5_8S_rRNA' would become '5').
genes_adata.var.index = genes_adata.var.index.str.replace(r'_\d+$', '', regex=True)

Rank the genes again using the cleaned gene names

In [None]:
# Rank the genes per cluster again, this time stored under 'rank_genes_groups_SCSA_<column>'.
for column in RESOLUTIONS:
    print(f'Ranking genes groups for clusters using obs column {column}')
    scsa_ranking_key = f'rank_genes_groups_SCSA_{column}'
    sc.tl.rank_genes_groups(
        genes_adata,
        groupby=f'{column}',
        use_raw=False,
        key_added=scsa_ranking_key,
    )

print('Done')

In [None]:
# Predict cell types with SCSA for every clustering column, copy the prediction to the
# peak-level adata and plot it next to the custom cell type annotation.
for column in RESOLUTIONS:
    run_scsa(genes_adata,
             gene_column=None,
             gene_symbol='auto',
             key=f'rank_genes_groups_SCSA_{column}',
             column_added=f'SCSA_pred_celltype_{column}',
             inplace=True,
             python_path=None,
             scsa_path=None,
             wholedb_path=None,
             species=SCSA_SPECIES,  # use the configured species instead of a hard-coded None
             fc=1.5,
             pvalue=0.01,
             tissue='All',
             celltype='normal',
             user_db=SCSA_CUSTOM_DB_PATH,
             z_score='best',
             )
    # run_scsa writes into genes_adata (inplace=True); mirror the column on adata for plotting and saving
    adata.obs[f'SCSA_pred_celltype_{column}'] = genes_adata.obs[f'SCSA_pred_celltype_{column}']
    sc.pl.umap(adata, color=[f'SCSA_pred_celltype_{column}', f'cell_types_{column}'], wspace=0.5)


## Save notebook and adata object

In [None]:
# Copy this notebook from the current working directory into the annotation directory
# so that the executed version is stored next to the results.
notebook_name = '05_annotation.ipynb'
repo_path = os.getcwd()
notebook_path, notebook_copy = (
    os.path.join(base_dir, notebook_name)
    for base_dir in (repo_path, tree.annotation_dir)
)
shutil.copyfile(notebook_path, notebook_copy)

In [None]:
# Write the annotated AnnData object to the path that the path handling object
# exposes as `annotation_anndata`.
adata.write(filename=tree.annotation_anndata)