# Marker Repo - annotation

In this notebook, clustered h5ad files can be annotated using the MarkerRepo or SCSA.

## Loading packages

In [None]:
import markerrepo.marker_repo as mr
import markerrepo.wrappers as wrap
import markerrepo.annotation as annot
import markerrepo.parsing as pars
import markerrepo.utils as utl
import auto_annotation as auto
import scanpy as sc
import os
import matplotlib.pyplot as plt
import pandas as pd
import inspect
import re
import math

%load_ext autoreload
%autoreload 2

## Settings

Specify the path of the cloned repository and the path of the h5ad file that is going to be annotated.

In [None]:
# Path of the cloned marker repository.
# NOTE(review): both paths are absolute and machine-specific — adjust them to
# the local environment before running the notebook.
repo_path = "/mnt/workspace_stud/allstud/wp2/annotate_by_marker_and_features-sort"
# Path of the clustered h5ad file that is going to be annotated.
h5ad_path = "/mnt/workspace_stud/allstud/wp1/data/2024_01_28/esophagus_muscularis_SM-A8CPH.h5ad"

Load anndata and list all possible settings.

In [None]:
# Read the h5ad file into an AnnData object; every later cell works on `adata`.
adata = sc.read_h5ad(h5ad_path)
# List the possible settings for this repository and dataset, so that the
# values in the settings cell below can be chosen.
annot.list_possible_settings(repo_path, adata=adata)

Enter general annotation settings.

In [None]:
# --- Organism ---------------------------------------------------------------
# Taxonomy ID or organism name, e.g. "human" or 9606.
organism = "human"

# --- Gene ranking -----------------------------------------------------------
# Name under which the ranked genes groups are stored, e.g. "rank_genes_groups".
# Use None if no ranking has been performed yet.
# NOTE(review): the previous comment located this in the .obs table; scanpy's
# rank_genes_groups stores its result in .uns — confirm what is expected here.
rank_genes_column = None

# --- Gene identifiers -------------------------------------------------------
# .var column that holds the gene symbols or Ensembl IDs to use for the
# annotation. Use None if the .var index already contains them.
genes_column = None

# --- Clustering -------------------------------------------------------------
# Clustering columns to annotate, collected from adata by
# auto.get_clustering_column_list (presumably .obs columns such as "leiden"
# or "louvain" — confirm against the helper).
clustering_column_list = auto.get_clustering_column_list(adata=adata)

# Whether the .var index holds Ensembl IDs (True) or gene symbols (False).
# The value is taken from mr.check_ensembl(adata) instead of being set by hand.
ensembl = mr.check_ensembl(adata)

# --- Output -----------------------------------------------------------------
# Name of the column that receives the final cell type annotation.
# With None, all annotation columns are kept.
celltype_column_name = None

# Delete the created marker lists after the annotation (True) or keep them (False).
delete_lists = True

# --- Marker selection -------------------------------------------------------
# Search term per marker table column, used to select the markers.
# NOTE(review): the create_multiple_marker_lists description further down in
# this notebook speaks of lists of search terms as values, while plain strings
# are given here — confirm that both forms are accepted.
column_specific_terms = {
    "Source": "panglao",
    "Organism name": "human",
}

# One dictionary of create_marker_lists parameters per marker list to create.
mr_parameters = [
    {
        "style": "two_column",
        "file_name": "two_column",
    },
]

Validate general annotation settings and the mr_parameters.

In [None]:
# Check the general settings and the entries of mr_parameters before any
# marker list is created.
# NOTE(review): the whole list of clustering columns is handed to the
# parameter named `clustering_column` — confirm that a list is accepted there.
auto.auto_validate_settings(
    settings=mr_parameters,
    repo_path=repo_path,
    adata=adata,
    organism=organism,
    rank_genes_column=rank_genes_column,
    genes_column=genes_column,
    clustering_column=clustering_column_list,
    ensembl=ensembl,
    column_specific_terms=column_specific_terms,
)

## Create suitable marker list(s)

<details>
    <summary>Click here to see/collapse the function description</summary>
    <p><b>Function Call:</b> create_multiple_marker_lists</p>
    <p>This function calls 'create_marker_lists' with multiple parameter sets to create marker lists. It iterates over each dictionary within a list, using its contents to call 'create_marker_lists'. Default values are assigned for any parameters missing from a dictionary, but these can be overridden by individual dictionary entries.</p>
    <p><b>Parameters (excerpt):</b></p>
    <ul>
        <li><b>settings:</b> list of dict, default [{}] - A list of dictionaries where each dictionary contains parameters for a single call to 'create_marker_lists'. Keys should match the parameter names of 'create_marker_lists', and values are the desired values for those parameters.</li>
        <li><b>style:</b> str, default "score" - Determines the style of the marker lists. Available options include "two_column", "score", "ui", and "panglao".</li>
        <li><b>force_homology:</b> bool, default False - If set to True, the function will attempt to create marker lists via homology, even if marker lists for the given organism already exist.</li>
        <li><b>show_lists:</b> bool, default True - If True, the function displays the marker lists of the query post-creation.</li>
        <li><b>column_specific_terms:</b> dict, default None - A dictionary with column names as keys and lists of search terms as values. If provided, this overrides the 'col_to_search' and 'search_terms' parameters.</li>
        <li><b>adata:</b> AnnData, default None - If provided, the function adds the marker list IDs to the .uns table of the AnnData object.</li>
    </ul>
    <p><b>Returns:</b></p>
    <ul>
        <li><b>list of str:</b> A list of all paths to the created marker lists.</li>
    </ul>
</details>


The paths of the created marker lists are stored in the <b>marker_lists</b> variable. They serve as input for the actual cell type annotation in the following cells.

In [None]:
# Create one marker list per entry of mr_parameters. The returned paths are
# kept in `marker_lists` and reused by the annotation and clean-up cells.
marker_lists = wrap.create_multiple_marker_lists(
    settings=mr_parameters,
    repo_path=repo_path,
    organism=organism,
    ensembl=ensembl,
    column_specific_terms=column_specific_terms,
    show_lists=True,  # display the created marker lists
    adata=adata,      # marker list IDs are added to adata.uns
)

## Annotate adata using the created list(s)

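Run the annotation with the created marker lists for the clustering columns in <b>clustering_column_list</b> and store the results in <b>annotation_results</b>.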

In [None]:
# Run the MarkerRepo and SCSA annotation on the clustering columns in
# clustering_column_list, with tables, plots and verbose output switched off.
# NOTE(review): rank_genes_column and celltype_column_name are fixed to None
# here, so the values chosen in the settings cell are not used by this call —
# confirm that this is intended.
annotation_results = auto.auto_run_annotation(
    adata,
    marker_repo=True,
    SCSA=True,
    marker_lists=marker_lists,
    mr_obs="mr",
    scsa_obs="scsa",
    rank_genes_column=None,
    clustering_column_list=clustering_column_list,
    reference_obs=None,
    keep_all=False,
    verbose=False,
    show_ct_tables=False,
    show_plots=False,
    ignore_overwrite=True,
    celltype_column_name=None,
)

In [None]:
# Show the UMAPs for the annotation results of the clustering columns
# (presumably one per clustering column — confirm against the helper).
auto.show_umap_collection(annotation_results, clustering_column_list)

In [None]:
# Build a comparison table from the annotation results.
# NOTE(review): `compared_result_df` is not read by any later cell in this
# notebook — display it or drop the assignment if it is not needed.
compared_result_df = auto.create_compare_df(annotation_results)

In [None]:
# Look up an entry of the annotation results; the first two arguments are
# presumably the clustering name and the cluster number — confirm.
# NOTE(review): "lovain_1" looks like a misspelling of "louvain_1" — check it
# against the names in clustering_column_list before relying on the result.
auto.find_cluster("lovain_1", 2, annotation_results)

In [None]:
# Read the entry stored under "best_cluster" in adata.uns and convert it to a
# string (presumably written by one of the preceding auto_annotation steps —
# confirm).
# NOTE(review): `best_cluter` is a misspelling of `best_cluster`; the
# run_annotation cell below reads this name, so both must be renamed together.
best_cluter = str(adata.uns["best_cluster"])
# Display the selected clustering as the cell output.
best_cluter

In [None]:
# Re-run the MarkerRepo annotation (SCSA switched off) on the clustering
# selected above, this time with plots and cell type tables shown.
# NOTE(review): the reference column "ontology label" is hard-coded — confirm
# that it exists in adata.obs for the dataset at hand.
reference_best = wrap.run_annotation(
    adata,
    SCSA=False,
    marker_lists=marker_lists,
    reference_obs="ontology label",
    show_comparison=False,
    clustering_column=best_cluter,
    rank_genes_column=rank_genes_column,
    ignore_overwrite=True,
    verbose=False,
    show_plots=True,
    show_ct_tables=True,
    celltype_column_name=celltype_column_name,
)

Delete created marker lists.

In [None]:
# Remove the marker list files created earlier in this notebook, but only if
# `delete_lists` was set to True in the settings cell.
if delete_lists:
    mr.delete_files(marker_lists)