In [None]:
from sctoolbox.utilities import bgcolor

# Marker genes and differentially expressed genes between groups
<hr style="border:2px solid black">

## Get marker genes per cluster

The marker genes of a cluster are defined as the genes that are differentially expressed in that cluster compared to all other cells/clusters. These genes are typically used to assign the cluster to a specific cell type.

The gene ranking uses the `rank_genes_groups()` function from scanpy. See the detailed documentation [here](https://scanpy.readthedocs.io/en/stable/generated/scanpy.tl.rank_genes_groups.html).

---------

## Loading packages

In [None]:
import re

import numpy as np
import pandas as pd
import scanpy as sc

import sctoolbox.utilities as utils
import sctoolbox.tools as tools
import sctoolbox.plotting as pl

pd.set_option('display.max_columns', None)  #no limit to the number of columns shown
utils.settings_from_config("config.yaml", key="05")

## Loading adata

In [None]:
# Load the anndata object this notebook works on and show its summary.
# NOTE(review): "anndata_4.h5ad" is presumably the output of the preceding notebook — confirm.
adata = utils.load_h5ad("anndata_4.h5ad")
display(adata)

<hr style="border:2px solid black">

<h1><center>⬐ Fill in input data here ⬎</center></h1>

In [None]:
%bgcolor PowderBlue

# Column names of clustering columns; marker genes are computed for each of them in turn.
# NOTE(review): presumably these are columns of adata.obs — confirm.
clustering_cols = ["leiden_0.1", "leiden_0.2", "leiden_0.3", "leiden_0.4", "leiden_0.5",
                   "leiden_0.6", "leiden_0.7", "leiden_0.8", "leiden_0.9" ]

# Marker genes

# Method for gene ranking (passed to tools.run_rank_genes as ranking_method)
ranking_method = "t-test"
# Top X genes to be reported, None for all genes
n_genes = None

# Marker gene filter

# Minimum fraction of cells in a group that must express a gene for it to be considered as a marker gene
min_in_group_fraction = 0.25
# Minimum fold change (+/-) for a gene to be considered as a marker gene
# NOTE(review): the unit (fold change vs. log fold change) is not visible here — confirm.
min_fold_change = 0.5
# Maximum fraction of cells in other groups that may express a gene for it to still be considered as a marker gene
max_out_group_fraction = 0.8

# Plotting

# Number of genes passed to the marker gene dotplot (n_genes of pl.rank_genes_plot)
n_genes_dotplot = 15

<hr style="border:2px solid black">

## Automatic markers per cluster using rank_genes_groups

In [None]:
# Marker gene tables, keyed by clustering column; filled by the loop in the next cell.
marker_tables = {}

In [None]:
# Filter and ranking options are identical for every clustering, so collect them once.
rank_genes_kwargs = dict(
    min_in_group_fraction=min_in_group_fraction,
    min_fold_change=min_fold_change,
    max_out_group_fraction=max_out_group_fraction,
    n_genes=n_genes,
    ranking_method=ranking_method,
)

for clustering in clustering_cols:

    # Key read by the plot and table calls below.
    # NOTE(review): presumably written to adata.uns by run_rank_genes — confirm.
    filtered_key = f"rank_genes_{clustering}_filtered"

    # Identify markers per cluster (adjust group fraction and fold change to filter genes)
    tools.run_rank_genes(adata, clustering, **rank_genes_kwargs)

    # Plot dotplot of markers
    _ = pl.rank_genes_plot(
        adata,
        key=filtered_key,
        n_genes=n_genes_dotplot,
        save=f"marker_genes_dotplot_{clustering}.pdf",
    )

    # Write marker genes to an Excel table and keep the tables for the expression plots
    marker_tables[clustering] = tools.get_rank_genes_tables(
        adata,
        out_group_fractions=True,
        key=filtered_key,
        save_excel=f"cluster_marker_genes_{clustering}.xlsx",
    )

---------

# Plot expression

<hr style="border:2px solid black">

<h1><center>⬐ Fill in input data here ⬎</center></h1>

In [None]:
%bgcolor PowderBlue

# Column names of clustering columns whose marker genes are plotted.
# Each entry is looked up in marker_tables, so it must also be listed in clustering_cols.
marker_table_cols = ["leiden_0.1", "leiden_0.2", "leiden_0.3", "leiden_0.4", "leiden_0.5",
                     "leiden_0.6", "leiden_0.7", "leiden_0.8", "leiden_0.9" ]
# Number of marker genes (first rows of each cluster's marker table) plotted per cluster
n_marker = 5
# Number of plot columns (passed as ncols to the embedding plot)
n_cols = 5

# List of genes which should be plotted in addition to the cluster markers (leave empty to skip)
custom_gene_list = []

# Embedding used for the expression plots; supported values are "umap" and "tsne"
embedding = "umap"

<hr style="border:2px solid black">

In [None]:
# Resolve the scanpy plotting function that matches the chosen embedding.
if embedding not in ("umap", "tsne"):
    raise ValueError("Invalid embedding set.")

# sc.pl.umap for "umap", sc.pl.tsne for "tsne"
pl_method = getattr(sc.pl, embedding)

## Plot custom gene list

In [None]:
# Only plot when the user supplied genes; an empty list skips this cell.
if custom_gene_list:
    custom_plot_kwargs = dict(color=custom_gene_list, cmap=pl.sc_colormap(), ncols=n_cols)
    pl_method(adata, **custom_plot_kwargs)

## Plot cluster markers

In [None]:
# Plot the first `n_marker` marker genes of every cluster on the chosen embedding,
# one figure per cluster and clustering column.
for cluster_col in marker_table_cols:

    # Marker tables only exist for the columns processed by the ranking loop above.
    if cluster_col not in marker_tables:
        raise KeyError(f"No marker table available for '{cluster_col}'. "
                       f"Available clusterings: {list(marker_tables)}")

    for cluster, table in marker_tables[cluster_col].items():
        marker = list(table["names"][:n_marker])

        # A cluster may have no genes left after filtering; an empty `color` list
        # cannot be laid out into panels by the embedding plot, so skip it.
        if not marker:
            print(f"No marker genes for cluster {cluster} of '{cluster_col}' - skipping plot.")
            continue

        # One title per panel, in the same order as the genes
        title = [f"Cluster_{cluster} - {gene}" for gene in marker]
        pl_method(adata, color=marker, title=title, cmap=pl.sc_colormap(), ncols=n_cols)

-------------

# DEG between conditions

## Run DESeq2 between conditions/clusters

In [None]:
# Normalize raw counts across cells
normalized = sc.pp.normalize_total(adata, layer="raw", inplace=False)  # returns a dict
raw_norm = normalized["X"]

# Round up to whole counts and store them as integers.
# NOTE(review): presumably because DESeq2 expects integer counts — confirm.
# scipy sparse matrices provide .ceil(); dense numpy arrays do not, so fall back to np.ceil.
raw_norm = raw_norm.ceil() if hasattr(raw_norm, "ceil") else np.ceil(raw_norm)
adata.layers["raw_norm"] = raw_norm.astype(int)

In [None]:
%bgcolor PowderBlue

# Adjust which columns to use for DEseq2
# NOTE(review): both are presumably column names of adata.obs — confirm.
# Passed to tools.run_deseq2 as the second argument
sample_col = "sample"
# Passed to tools.run_deseq2 as the third argument
condition_col = "chamber"

In [None]:
# Run DEseq2 on the rounded, normalized counts in the "raw_norm" layer,
# using the sample and condition columns chosen in the cell above.
deseq_table = tools.run_deseq2(adata, sample_col, condition_col, layer="raw_norm")

In [None]:
# Preview the first 10 entries of the DEseq2 result instead of printing the whole table
deseq_table.head(10)

--------------

## Save adata

In [None]:
# Fix error when saving filtered rank gene names:
# remove every "rank_genes_*_filtered" entry from adata.uns before writing the file.
# Keys are collected first so that adata.uns is not modified while it is being iterated.
filtered_rank_keys = [key for key in adata.uns.keys()
                      if re.match("rank_genes_.*_filtered", key)]
for key in filtered_rank_keys:
    del adata.uns[key]

In [None]:
# Show the adata object as it will be saved (filtered rank_genes entries were removed above)
adata

In [None]:
# Save the adata object of this notebook as "anndata_5.h5ad"
utils.save_h5ad(adata, "anndata_5.h5ad")