# Annotation

## Settings 

In [1]:
# Path related settings (these should be the same as for the previous notebook).
# `output_dir` becomes the processing directory and `test` the run name of the
# path handling object created further below.
output_dir = '/mnt/workspace/mkessle/ext442'
test = 'ext442'

# Gene annotation: GTF file used to assign genes to peaks
GTF_PATH = "/mnt/flatfiles/organisms/new_organism/mus_musculus/104/mus_musculus.104.genes.gtf"
ENSEMBL_DATASET = "mmusculus_gene_ensembl" # Biomart dataset used for replacing Ensembl gene IDs with gene names if possible
                                           # use "hsapiens_gene_ensembl" for Homo sapiens
# Cell type annotation
SPECIES = "Mm" # Currently only Hs (Homo sapiens) and Mm (Mus musculus) are available for custom cell type annotation
SCSA_SPECIES = "mouse" # Currently only human (Homo sapiens) and mouse (Mus musculus) are available for SCSA
PANGLAO_DB_PATH = "/mnt/workspace/jdetlef/sc-atac-framework/ctannot/panglao_markers"
RESOLUTIONS = None # Choose None to keep the clustering resolution(s) selected in the previous notebook
                   # (read from adata.uns['infoprocess']['leiden_res'] after loading the data).
                   # You can also choose a list of clustering resolutions that have been calculated and saved.

## Loading packages and setup

In [2]:
import sctoolbox.atac_tree as sub_tree
import sctoolbox.annotation as annot
import sctoolbox.custom_celltype_annotation as ctannot
from sctoolbox.celltype_annotation import run_scsa
import episcanpy as epi
import scanpy as sc
from matplotlib import pyplot as plt
from pybiomart import Dataset

## Setup path handling object 

In [3]:
# Create the path handling object; all input/output locations used below are
# read from its attributes (clustering_anndata, annotation_dir, annotation_anndata).
tree = sub_tree.ATAC_tree()
# set processing/output directory (must be set before the run name)
tree.processing_dir = output_dir
# set the sample/experiment name
tree.run = test

all directories existing
all directories existing


In [4]:
# Register the GTF file with the path handling object
tree.gtf_path = GTF_PATH

## Load anndata

In [5]:
# Load the AnnData object written by the clustering notebook and show its
# peak (.var) table.
clustering_output = tree.clustering_anndata
adata = epi.read_h5ad(clustering_output)
adata.var

Unnamed: 0,peak_chr,peak_start,peak_end,peak_id,peak_score,peak_strand,feature,gene_start,gene_end,gene_strand,...,region_length,n_cells_by_counts,mean_counts,log1p_mean_counts,pct_dropout_by_counts,total_counts,log1p_total_counts,n_cells,commonness,highly_variable
annotated_peaks_0_chr1:3050427-3050801,chr1,3050427,3050801,peak_1,.,.,,,,,...,374,187,0.003213,0.003208,99.678672,187.0,5.236442,187,274.0,True
annotated_peaks_100000_chr4:129518038-129518515,chr4,129518038,129518515,peak_100001,.,.,gene,129513066.0,129517740.0,-,...,477,4464,0.076706,0.073907,92.329370,4464.0,8.404024,4464,7932.0,False
annotated_peaks_100001_chr4:129518849-129519134,chr4,129518849,129519134,peak_100002,.,.,gene,129513066.0,129517740.0,-,...,285,1068,0.018352,0.018185,98.164822,1068.0,6.974479,1068,1788.0,True
annotated_peaks_100002_chr4:129527627-129527883,chr4,129527627,129527883,peak_100003,.,.,gene,129519870.0,129534858.0,-,...,256,137,0.002354,0.002351,99.764589,137.0,4.927254,137,289.0,True
annotated_peaks_100003_chr4:129533947-129534180,chr4,129533947,129534180,peak_100004,.,.,gene,129519870.0,129534858.0,-,...,233,536,0.009210,0.009168,99.078974,536.0,6.285998,536,1040.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
annotated_peaks_99999_chr4:129517580-129517846,chr4,129517580,129517846,peak_100000,.,.,gene,129513066.0,129517740.0,-,...,266,1156,0.019864,0.019669,98.013609,1156.0,7.053586,1156,2128.0,True
annotated_peaks_9999_chr1:192477325-192477572,chr1,192477325,192477572,peak_10000,.,.,gene,192179018.0,192453531.0,-,...,247,295,0.005069,0.005056,99.493092,295.0,5.690360,295,521.0,True
annotated_peaks_999_chr1:37026210-37026612,chr1,37026210,37026612,peak_1000,.,.,gene,37065676.0,37226694.0,+,...,402,300,0.005155,0.005142,99.484501,300.0,5.707110,300,507.0,True
annotated_peaks_99_chr1:7849890-7850111,chr1,7849890,7850111,peak_100,.,.,gene,7799084.0,7800789.0,-,...,221,163,0.002801,0.002797,99.719912,163.0,5.099866,163,229.0,True


In [None]:
# Fall back to the resolution(s) stored with the data when none were configured.
# NOTE(review): every later cell iterates `for res in RESOLUTIONS`, so this assumes
# the stored 'leiden_res' entry is a list-like and not a single number - confirm.
if not RESOLUTIONS:
    RESOLUTIONS = adata.uns['infoprocess']['leiden_res']

In [7]:
# FIX FOR THE EXT442
adata.var['peak_chr'] = adata.var['peak_chr'].astype(str)
adata.var['peak_start'] = adata.var['peak_start'].astype(int)
adata.var['peak_end'] = adata.var['peak_end'].astype(int)

In [8]:
# Inspect the peak table after the dtype fix
adata.var

Unnamed: 0,peak_chr,peak_start,peak_end,peak_id,peak_score,peak_strand,feature,gene_start,gene_end,gene_strand,...,region_length,n_cells_by_counts,mean_counts,log1p_mean_counts,pct_dropout_by_counts,total_counts,log1p_total_counts,n_cells,commonness,highly_variable
annotated_peaks_0_chr1:3050427-3050801,chr1,3050427,3050801,peak_1,.,.,,,,,...,374,187,0.003213,0.003208,99.678672,187.0,5.236442,187,274.0,True
annotated_peaks_100000_chr4:129518038-129518515,chr4,129518038,129518515,peak_100001,.,.,gene,129513066.0,129517740.0,-,...,477,4464,0.076706,0.073907,92.329370,4464.0,8.404024,4464,7932.0,False
annotated_peaks_100001_chr4:129518849-129519134,chr4,129518849,129519134,peak_100002,.,.,gene,129513066.0,129517740.0,-,...,285,1068,0.018352,0.018185,98.164822,1068.0,6.974479,1068,1788.0,True
annotated_peaks_100002_chr4:129527627-129527883,chr4,129527627,129527883,peak_100003,.,.,gene,129519870.0,129534858.0,-,...,256,137,0.002354,0.002351,99.764589,137.0,4.927254,137,289.0,True
annotated_peaks_100003_chr4:129533947-129534180,chr4,129533947,129534180,peak_100004,.,.,gene,129519870.0,129534858.0,-,...,233,536,0.009210,0.009168,99.078974,536.0,6.285998,536,1040.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
annotated_peaks_99999_chr4:129517580-129517846,chr4,129517580,129517846,peak_100000,.,.,gene,129513066.0,129517740.0,-,...,266,1156,0.019864,0.019669,98.013609,1156.0,7.053586,1156,2128.0,True
annotated_peaks_9999_chr1:192477325-192477572,chr1,192477325,192477572,peak_10000,.,.,gene,192179018.0,192453531.0,-,...,247,295,0.005069,0.005056,99.493092,295.0,5.690360,295,521.0,True
annotated_peaks_999_chr1:37026210-37026612,chr1,37026210,37026612,peak_1000,.,.,gene,37065676.0,37226694.0,+,...,402,300,0.005155,0.005142,99.484501,300.0,5.707110,300,507.0,True
annotated_peaks_99_chr1:7849890-7850111,chr1,7849890,7850111,peak_100,.,.,gene,7799084.0,7800789.0,-,...,221,163,0.002801,0.002797,99.719912,163.0,5.099866,163,229.0,True


## Gene annotation 

In [9]:
# Annotate every peak in adata.var with features from the GTF file. With
# inplace=True the results are written to the .var table of `adata`
# (the log of this cell reports that existing columns are overwritten).
# NOTE(review): the recorded output of this cell ends with
# "[Errno 2] No such file or directory: ''", which presumably stems from
# temp_dir="" - confirm and pass an existing directory if so.
annot.annotate_adata(adata,
                   GTF_PATH,
                   config=None,
                   best=True,
                   threads=6,
                   coordinate_cols=None,
                   temp_dir="",
                   verbose=True,
                   inplace=True)

Setting up annotation configuration...
Config dictionary: {'queries': [{'distance': [10000, 1000], 'feature_anchor': ['start'], 'feature': ['gene'], 'name': 'promoters'}], 'priority': True, 'show_attributes': ['all'], 'output_by_query': False}
Setting up genomic regions to annotate...
Index(['peak_chr', 'peak_start', 'peak_end'], dtype='object')
Preparing gtf file for annotation...
- Reading gtf with Tabix
- Index of gtf not found - trying to index gtf
- Reading gtf with Tabix
Done preparing gtf!
Annotating regions...
Formatting annotations...
These columns will be overwritten by the annotation
Finished annotation of features! The results are found in the .var table.
[Errno 2] No such file or directory: ''


In [10]:
# Gene assigned to each peak (NaN where no gene was assigned)
adata.var['gene_name']

annotated_peaks_0_chr1:3050427-3050801                 NaN
annotated_peaks_100000_chr4:129518038-129518515    Ccdc28b
annotated_peaks_100001_chr4:129518849-129519134    Ccdc28b
annotated_peaks_100002_chr4:129527627-129527883        NaN
annotated_peaks_100003_chr4:129533947-129534180      Txlna
                                                    ...   
annotated_peaks_99999_chr4:129517580-129517846     Ccdc28b
annotated_peaks_9999_chr1:192477325-192477572          NaN
annotated_peaks_999_chr1:37026210-37026612             NaN
annotated_peaks_99_chr1:7849890-7850111                NaN
annotated_peaks_9_chr1:3191548-3191937                 NaN
Name: gene_name, Length: 148517, dtype: category
Categories (25960, object): ['0610006L08Rik', '0610009B22Rik', '0610009E02Rik', '0610009L18Rik', ..., 'mmu-mir-7676-1', 'mmu-mir-7679', 'n-R5s162', 'n-R5s41']

In [None]:
# Keep only the peaks that were assigned to a gene
assigned_features = adata.var['gene_name'].dropna()
assigned_features

### Replace ensembl gene IDs with gene names if possible

In [None]:
# Query Ensembl biomart for the ID -> name table of the configured dataset.
# Requires network access to the Ensembl host.
dataset = Dataset(name=ENSEMBL_DATASET,
                  host='http://www.ensembl.org')

# The result columns are read below as 'Gene stable ID' and 'Gene name'
df = dataset.query(attributes=['ensembl_gene_id', 'external_gene_name'])

In [None]:
# Lookup table: Ensembl gene ID -> gene name. Values are not guaranteed to be
# strings, which is why the replacement step checks the value type.
ens_dict = {
    stable_id: gene_name
    for stable_id, gene_name in zip(df['Gene stable ID'], df['Gene name'])
}

In [None]:
# Replace Ensembl gene IDs in `assigned_features` with gene names from `ens_dict`.
#
# Changes compared to the previous row-by-row loop:
# - IDs are recognised by the generic Ensembl gene pattern "ENS<species>G<digits>"
#   instead of the human-only prefix "ENSG", so mouse IDs ("ENSMUSG...") are
#   replaced as well.
# - The lookup is vectorized instead of calling Series.replace(inplace=True)
#   once per row while iterating over the same Series.
#
# `assigned_features` becomes a plain string Series (no longer categorical);
# its index is unchanged. `count` is the number of peaks whose ID was replaced.
#
# NOTE(review): later cells only use `assigned_features.index` and read
# 'gene_name' from the .var table, so the replaced names are not written back
# to `adata.var` - confirm whether that is intended.
gene_labels = assigned_features.astype(str)
looked_up = gene_labels.map(ens_dict)  # NaN for labels that are not keys of ens_dict

# Only replace when biomart provides a real (string) name for the ID
has_name = looked_up.map(lambda value: isinstance(value, str)).astype(bool)
is_ensembl_id = gene_labels.str.match(r"ENS[A-Z]*G\d+").astype(bool)
replaceable = is_ensembl_id & has_name

assigned_features = gene_labels.where(~replaceable, looked_up)
count = int(replaceable.sum())
print(f'{count} ensembl gene ids have been replaced with gene names')

In [None]:
# Inspect the gene labels after the ID replacement
assigned_features

In [None]:
# Restrict the data to peaks that have a gene assigned (selection by peak index)
assigned_adata = adata[:, assigned_features.index]
assigned_adata

## Rank peaks

In [None]:
# Work on an independent copy so that the ranking results do not end up in `adata`
peaks_adata = assigned_adata.copy()

In [None]:
# NOTE(review): presumably a workaround so that the ranking functions below do not
# fail on the stored log1p base after reloading the object from disk - confirm.
# This line expects an existing 'log1p' entry in .uns.
peaks_adata.uns['log1p']['base'] = None

In [None]:
# Rank the peaks per cluster for every selected leiden resolution; each result
# is stored under its own `key_added` name.
rank_features_options = dict(omic='ATAC', n_features=100, use_raw=False)
for res in RESOLUTIONS:
    print(f'Ranking peaks for leiden clusters with resolution: {res}')
    epi.tl.rank_features(
        peaks_adata,
        f'leiden_{res}',
        key_added=f'rank_features_groups_{res}',
        **rank_features_options,
    )

print('Done')

## Rank genes
Replace peaks with gene names

In [None]:
# Build a copy of the data whose variables are named by gene instead of by peak.
genes_adata = peaks_adata.copy()
genes_adata.var.reset_index(inplace=True)  # remove peaks from index and save them in the column ['index']
genes_adata.var.set_index('gene_name',inplace=True)  # set genes as index
genes_adata.var.index = genes_adata.var.index.astype('str')  # to avoid index being categorical
# Several peaks can share one gene, so duplicate names get a '_<n>' suffix
genes_adata.var_names_make_unique(join='_')
# Keep the gene-indexed object available as .raw as well
genes_adata.raw = genes_adata

In [None]:
# Rank genes per cluster for every selected leiden resolution; each result is
# stored under its own `key_added` name, which the plotting and annotation
# cells below refer to.
for res in RESOLUTIONS:
    print(f'Ranking genes groups for leiden clusters with resolution: {res}')
    sc.tl.rank_genes_groups(
        genes_adata,
        groupby=f'leiden_{res}',
        use_raw=False,
        key_added=f'rank_genes_groups_{res}',
    )

print('Done')

### Plot ranked genes

In [None]:
# One ranked-genes overview plot per resolution
for res in RESOLUTIONS:
    print(f'Plotting ranked genes groups for leiden with resolution: {res}')
    plot_key = f'rank_genes_groups_{res}'
    # show=False leaves the figure open until plt.show() so it could be saved first
    sc.pl.rank_genes_groups(
        genes_adata,
        key=plot_key,
        show=False,
    )
    # TODO
    # if save_figs:
    #    plt.savefig(f'{OUTPUT_FIGS}/ranked_genes_groups_leiden_{res}.png')
    plt.show()

In [None]:
# One matrixplot of the top 10 ranked genes per cluster for every resolution
matrixplot_options = dict(standard_scale='var', n_genes=10, show=False)
for res in RESOLUTIONS:
    print(f'Plotting matrixplot for leiden with resolution: {res}')
    plot_key = f'rank_genes_groups_{res}'
    sc.pl.rank_genes_groups_matrixplot(genes_adata, key=plot_key, **matrixplot_options)
    # TODO
    # if save_figs:
    #    plt.savefig(f'{OUTPUT_FIGS}/ranked_genes_groups_matrixplot_leiden_{res}.png')
    plt.show()

## Cell type annotation

In [None]:
# Custom cell type annotation against the panglao marker database, once per
# resolution. With inplace=True the predicted cell types are added to `adata`
# in the column given by `ct_column`.
for res in RESOLUTIONS:
    ctannot.annot_ct(
        adata=adata,
        genes_adata=genes_adata,
        output_path=tree.annotation_dir,
        db_path=f"{PANGLAO_DB_PATH}",
        cluster_path=None,
        cluster_column=f"leiden_{res}",
        rank_genes_column=f"rank_genes_groups_{res}",
        sample=tree.run,
        ct_column=f"cell_types_{res}",
        tissue="all",
        db="panglao",
        inplace=True,
        species=SPECIES,
    )

In [None]:
# Inspect the cell table, which should now contain the cell type columns
adata.obs

In [None]:
# Show predicted cell types next to the leiden clusters for every resolution
for res in RESOLUTIONS:
    umap_colors = [f'cell_types_{res}', f'leiden_{res}']
    sc.pl.umap(adata, color=umap_colors, wspace=0.5)

In [None]:
# Show the annotation tables written to the annotation directory.
# NOTE(review): the resolution is hard-coded to 0.1 while every other cell uses
# RESOLUTIONS - confirm that 0.1 is among the annotated resolutions, or adapt it.
ctannot.show_tables(annotation_dir=tree.annotation_dir, resolution=0.1, clustering_column="leiden")

### Modify wrong cluster annotations by selecting other recommendations

In [None]:
# Interactive prompt: this cell blocks until an answer is typed, so the notebook
# cannot run unattended past this point.
change_annotation = input('Do you want to change the annotation? answer with yes or no:')

In [None]:
# Let the user pick alternative cell type recommendations for the clusters.
# The answer is compared case-insensitively and without surrounding whitespace,
# so inputs such as "Yes" or "yes " are no longer silently treated as "no".
if change_annotation.strip().lower() == 'yes':
    ctannot.modify_ct(adata=adata, resolutions=RESOLUTIONS, annotation_dir=tree.annotation_dir, clustering_column="leiden", cell_type_column="cell_types", inplace=True)

### SCSA cell type annotation

In [None]:
# SCSA based cell type prediction for every resolution.
# Settings that do not depend on the resolution are collected once.
scsa_settings = dict(
    gene_column=None,
    gene_symbol='auto',
    inplace=True,
    python_path=None,
    scsa_path=None,
    wholedb_path=None,
    species=SCSA_SPECIES,
    fc=1.5,
    pvalue=0.01,
    tissue='All',
    celltype='normal',
    user_db=None,
    z_score='best',
)
for res in RESOLUTIONS:
    scsa_column = f'SCSA_pred_celltype_{res}'
    run_scsa(
        genes_adata,
        key=f'rank_genes_groups_{res}',
        column_added=scsa_column,
        **scsa_settings,
    )
    # Copy the prediction from the gene-indexed object over to the main object
    adata.obs[scsa_column] = genes_adata.obs[scsa_column]
    # Compare the SCSA prediction with the custom annotation
    sc.pl.umap(adata, color=[scsa_column, f'cell_types_{res}'], wspace=0.5)


## Save notebook and adata object

In [None]:
# Copy this notebook next to the results for provenance.
# NOTE(review): this assumes the kernel's working directory contains the notebook
# and that the file is named '5_annotation.ipynb' - copyfile fails otherwise.
import os
import shutil
repo_path = os.getcwd()
notebook_name = '5_annotation.ipynb'
notebook_path = os.path.join(repo_path, notebook_name)
notebook_copy = os.path.join(tree.annotation_dir , notebook_name)
shutil.copyfile(notebook_path, notebook_copy)

In [None]:
# Write the annotated AnnData object to the path provided by the path handling object
adata_output = tree.annotation_anndata
adata.write(filename=adata_output)