In [None]:
from sctoolbox.utils.jupyter import bgcolor, _compare_version

# Change the background color of the input cells.
# NOTE(review): `select` looks like a list of cell positions — keep it in sync
# with the notebook when cells are added or removed; confirm against bgcolor().
bgcolor("PowderBlue", select=[2, 4, 6, 8, 17, 21])

# File name of this notebook; handed to the version check below.
nb_name = "group_markers.ipynb"

# NOTE(review): presumably compares the version of this notebook with the
# installed sctoolbox — confirm against sctoolbox.utils.jupyter.
_compare_version(nb_name)

# Marker computation and analysis
<hr style="border:2px solid black"> </hr>

## 1 - Description

**Note: Requires the dataset to be split into groups/clusters. This can be achieved using a clustering notebook e.g. `04_clustering.ipynb` for RNA.**

**Move this notebook into the notebook folder (e.g. `rna_analysis/notebooks/`) of the respective analysis before using it!**

An important part of most analyses is the identification and subsequent interpretation of changes between predefined groups. Most commonly, groups are based on experimental conditions (e.g. healthy vs. ill) with the aim of identifying differential features (e.g. genes) that explain the underlying mechanisms. With the increased resolution of single-cell data another approach became viable, namely the identification of cell types. However, before cell types can be assigned (see `annotation.ipynb`), markers have to be identified, which follows the same concept of identifying group-specific differences.

This notebook is aimed at computing lists of potential markers, e.g. marker genes for RNA and marker peaks for ATAC, based on the selected groups and reviewing their performance through visualization. Two different methods are provided for the identification of group markers:

- The [rank_genes_groups()](https://scanpy.readthedocs.io/en/stable/generated/scanpy.tl.rank_genes_groups.html) method from scanpy.
- [DESeq2](https://doi.org/doi:10.18129/B9.bioc.DESeq2), a method originally intended for bulk data, which is run on pseudobulks created from the given groups.

---------

## 2 - Loading packages

In [None]:
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.backends.backend_pdf

import sctoolbox
import sctoolbox.utils as utils
import sctoolbox.tools as tools
import sctoolbox.plotting as pl
from sctoolbox import settings

---------

## 3 - Loading adata

<h1><center>⬐ Fill in input data here ⬎</center></h1>

In [None]:
# sctoolbox settings (all paths are given relative to this notebook)
# NOTE(review): the role of each directory/file is inferred from the setting
# names — confirm against the sctoolbox settings documentation.
settings.adata_input_dir = "../adatas/"  # where input AnnData files are looked up
settings.adata_output_dir = "../adatas/"  # where AnnData files are written
settings.figure_dir = "../figures/markers/"  # target for the saved plots
settings.table_dir = "../tables/markers/"  # target for the saved tables
settings.log_file = "../logs/marker_log.txt"
settings.overwrite_log = True  # presumably replaces an existing log file instead of appending

# File name of the AnnData object to analyse (loaded in the next cell).
anndata_file = "anndata_4_2D.h5ad"

___

In [None]:
# Read the AnnData object selected in the settings cell.
adata = utils.adata.load_h5ad(anndata_file)

# Preview the object itself and its per-cell annotation table; rows are
# truncated while all columns stay visible.
with pd.option_context("display.max.rows", 5, "display.max.columns", None):
    for preview in (adata, adata.obs):
        display(preview)

---------

## 4 - Select grouping
<hr style="border:2px solid black"> </hr>

Compute markers for all selected groupings and choose the optimal one based on the visualization. Ideally, markers should only be found within one group, causing the plot to show a "stair"-like pattern. The identified markers are typically used to assign cell types in downstream analysis.

<h1><center>⬐ Fill in input data here ⬎</center></h1>

In [None]:
# Column names of clustering columns
# Add all columns of interest, e.g. "leiden_0.2", "leiden_0.3", "leiden_0.4", "leiden_0.5", "leiden_0.6", "leiden_0.7", "leiden_0.8", "leiden_0.9"
clustering_cols = ["clustering"]

# --- Marker features ---

# Method for feature ranking
ranking_method = "t-test"
# Top n features to be reported, None for all features
top_n = None

# --- Marker feature filter ---

# Minimum fraction of cells in a group that must express a feature for it to be considered a marker
min_in_group_fraction = 0.25
# Minimum foldchange (+/-) to be considered as a marker
min_fold_change = 0.5
# Maximum fraction of cells in the other groups that may express a feature for it to still be considered a marker
max_out_group_fraction = 0.8

# --- Plotting ---
n_features_markerplot = 5  # number of features to show per group
marker_style = "dots"  # Either `dots` or `heatmap`.

________

In [None]:
for clustering in clustering_cols:
    # Each grouping gets its own result key, so the rankings of several
    # groupings can be kept side by side in adata.
    rank_key = f"rank_feature_{clustering}"

    # Identify markers per cluster (adjust group fraction and fold change to filter features)
    tools.marker_genes.run_rank_genes(
        adata,
        clustering,
        key_added=rank_key,
        ranking_method=ranking_method,
        n_genes=top_n,
        min_fold_change=min_fold_change,
        min_in_group_fraction=min_in_group_fraction,
        max_out_group_fraction=max_out_group_fraction
    )

    # Plot the filtered markers of this grouping; one file per grouping.
    marker_plot_file = f"marker_{marker_style}_{clustering}.pdf"
    _ = pl.marker_genes.rank_genes_plot(
        adata,
        key=f"{rank_key}_filtered",
        style=marker_style,
        n_genes=n_features_markerplot,
        title=clustering,
        save=marker_plot_file
    )

<h1><center>⬐ Fill in input data here ⬎</center></h1>

In [None]:
# Choose the final clustering (grouping).
# It has to be one of `clustering_cols`: the following cells read the ranking
# stored under `rank_feature_<final_clustering>_filtered`, which is only
# computed for the columns listed there.
final_clustering = "clustering"

In [None]:
# Key under which the filtered ranking of the chosen grouping is stored.
final_marker_key = f"rank_feature_{final_clustering}_filtered"

# Save the markers of the final grouping in both available plot styles.
for m_style in ("dots", "heatmap"):
    _ = pl.marker_genes.rank_genes_plot(
        adata,
        key=final_marker_key,
        style=m_style,
        n_genes=n_features_markerplot,
        title=final_clustering,
        save=f"marker_{m_style}_{final_clustering}.pdf"
    )

# Write marker genes to table
marker_table = tools.marker_genes.get_rank_genes_tables(
    adata,
    key=final_marker_key,
    out_group_fractions=True,
    save_excel=f"marker_feature_{final_clustering}.xlsx"
)

---------

## 5 - Feature accumulation
<hr style="border:2px solid black"> </hr>

This section shows the location of cells within the embedding that express features. Features are either the top markers of the respective group or a user-provided list of features.

<h1><center>⬐ Fill in input data here ⬎</center></h1>

In [None]:
# Number of top markers per group to be plotted
n_marker = 5

# List of features additionally shown in the embedding.
# Leave empty to skip all custom feature plots of section 5.1.
custom_feat_list = []
# Name of the feature group; if empty, a name is derived from the feature names.
fname = ""

# planet plot
# Set the data which should be displayed on the x- and y-axis (adata.obs column names).
# The planet plot is only drawn if both are set and custom_feat_list is not empty.
# `y` is also used as the grouping of the per-group embedding plots.
x = "phase"
y = final_clustering
log_scale=False

# embedding
embedding = "umap"  # Either umap or tsne
style = "hexbin"  # Either 'dots' to show all cells or 'hexbin' or 'density' for density approximations
binarize_threshold = 0.0  # threshold of a features expression that a cell must have to be shown in the binarized plot.
# Percentile threshold of a features expression that a cell must have to be shown in the binarized plot.
# If this is set it takes precedence over binarize_threshold.
binarize_threshold_percentile = None

______

### 5.1 - Plot custom feature list

Show where the selected features are expressed in the embedding.

In [None]:
# Skipped when no custom features were provided.
if custom_feat_list:
    custom_expression_file = f"{embedding}_custom_list_expression.pdf"

    pl.embedding.plot_embedding(
        adata=adata,
        method=embedding,
        style=style,
        color=custom_feat_list,
        ncols=3,
        save=custom_expression_file
    )

Show the aggregated expression of the selected features.

In [None]:
# Skipped when no custom features were provided.
if custom_feat_list:
    # Fall back to a name built from the feature names if none was given.
    joined_features = ", ".join(custom_feat_list)
    aggregate_name = fname or f"{joined_features} mean expression"

    pl.embedding.agg_feature_embedding(
        adata=adata,
        features=custom_feat_list,
        fname=aggregate_name,
        save=f"{embedding}_custom_list_aggr_expression.pdf"
    )

In [None]:
# Custom features next to the groups of `y`; skipped without custom features.
if custom_feat_list:
    per_group_file = f"{embedding}_custom_list_with_groups.pdf"

    _ = pl.embedding.feature_per_group(
        adata=adata,
        x=custom_feat_list,
        y=y,
        method=embedding,
        style=style,
        save=per_group_file
    )

In [None]:
# Binarized variant of the per-group plot; skipped without custom features.
if custom_feat_list:
    binarized_file = f"{embedding}_custom_list_with_groups_binarized.pdf"

    _ = pl.embedding.feature_per_group(
        adata=adata,
        x=custom_feat_list,
        y=y,
        method=embedding,
        style=style,
        binarize_threshold=binarize_threshold,
        binarize_percentile_threshold=binarize_threshold_percentile,
        save=binarized_file
    )

Show the expression of features split by x- and y-axis (e.g. clustering vs. cell-cycle). Each combination is further divided into a center dot showing the aggregation of the surrounding features.

**Note: can display up to 6 surrounding features. Will use the first 6 if more are selected.**

In [None]:
# Needs custom features as well as both axis columns.
if custom_feat_list and x and y:
    # Only the first six features are shown around each center dot.
    planet_features = custom_feat_list[:6]

    plot_vars = pl.planet_plot.planet_plot_anndata_preprocess(
        adata=adata,
        x_col=x,
        y_col=y,
        genes=planet_features
    )

    pl.planet_plot.planet_plot_render(
        plot_vars,
        x_col=x,
        y_col=y,
        mode="planet",
        planet_columns=planet_features,
        use_log_scale=log_scale,
        color_schema="viridis"
    )

---------

### 5.2 - Plot cluster marker
Show the expression of the top n markers next to their respective group. In the best case a marker is exclusively expressed within the cells that are assigned to the respective group.

In [None]:
# Filtered ranking of the final grouping, computed in section 4.
cluster_marker_key = f"rank_feature_{final_clustering}_filtered"
cluster_marker_file = f"{embedding}_clustering_{y}_top_{n_marker}_markers.pdf"

_ = pl.embedding.feature_per_group(
    adata=adata,
    y=y,
    marker_key=cluster_marker_key,
    top_n=n_marker,
    method=embedding,
    style=style,
    save=cluster_marker_file
)

Show the binarized expression of the top n markers next to their respective group. This plot highlights where markers have a minimum expression level, making it easier to identify clusters with a high expression of the marker.

In [None]:
# Filtered ranking of the final grouping, computed in section 4.
cluster_marker_key = f"rank_feature_{final_clustering}_filtered"
binarized_marker_file = f"{embedding}_clustering_{y}_top_{n_marker}_markers_binarized.pdf"

_ = pl.embedding.feature_per_group(
    adata=adata,
    y=y,
    marker_key=cluster_marker_key,
    top_n=n_marker,
    method=embedding,
    style=style,
    binarize_threshold=binarize_threshold,
    binarize_percentile_threshold=binarize_threshold_percentile,
    save=binarized_marker_file
)

---------

## 6 - Condition related markers
Until this point markers were computed to identify differences between the previously created cluster groups. However, most experiments provide additional information (e.g. ill vs. healthy, wild-type vs. mutant, etc.). This section aims to use these conditions to identify changes within the above selected clusters.

In [None]:
# Show the per-cell annotation (truncated rows, all columns) as a reference
# for choosing the condition column in the next cell.
with pd.option_context("display.max.rows", 5, "display.max.columns", None):
    display(adata.obs)

<h1><center>⬐ Fill in input data here ⬎</center></h1>

In [None]:
# Both variables expect a column name from above (adata.obs)
# The data is split by `clustering_col`; within each split the groups of
# `condition_col` are compared against each other.
clustering_col = final_clustering  # the clustering e.g. the one chosen in the prior section
condition_col = "meta c19_severity"  # the condition used to find changes within each cluster

____

The dataset is split on each cluster and a ranking is performed on each individual subset.

In [None]:
# Split adata by cluster, then rank the features between the conditions
# within each cluster. Result: {cluster: AnnData subset holding its ranking}.
adata_cond = {}

# unique() yields the clusters in order of first appearance, which keeps the
# printed log and the order of the later plots/tables reproducible between
# runs (iterating a set gives no such guarantee).
for cluster in adata.obs[clustering_col].unique():
    print(f"Cluster {cluster}")
    adata_sub = adata[adata.obs[clustering_col] == cluster]

    # Check if sample count is sufficient: conditions represented by a single
    # cell within this cluster are dropped.
    value_counts = adata_sub.obs[condition_col].value_counts()
    insufficient_size = value_counts.index[value_counts == 1].tolist()
    if insufficient_size:
        print(f"Removed conditions due to insufficent size {insufficient_size}")
        adata_sub = adata_sub[~adata_sub.obs[condition_col].isin(insufficient_size)]

    # A comparison needs at least two remaining conditions.
    if len(set(adata_sub.obs[condition_col])) < 2:
        print(f"Skipped Cluster {cluster}")
        continue

    # Up to here adata_sub is a view of adata. The ranking is stored inside the
    # object, so detach it explicitly instead of relying on anndata's implicit
    # copy-on-modification of views (which also emits a warning).
    adata_sub = adata_sub.copy()

    tools.marker_genes.run_rank_genes(
        adata_sub,
        condition_col,
        min_in_group_fraction=min_in_group_fraction,
        min_fold_change=min_fold_change,
        max_out_group_fraction=max_out_group_fraction,
        n_genes=top_n,
        ranking_method=ranking_method,
        # Set explicitly: the plot and table cells of this section read the
        # results from "rank_genes_<condition_col>_filtered".
        key_added=f"rank_genes_{condition_col}"
    )

    adata_cond[cluster] = adata_sub

One dotplot per group (cluster) showing the top differential features.

In [None]:
# Key under which the filtered condition ranking is stored in each subset.
condition_rank_key = f"rank_genes_{condition_col}_filtered"

for cluster_name, cluster_adata in adata_cond.items():
    # One marker plot per cluster, titled with the cluster name.
    _ = pl.marker_genes.rank_genes_plot(
        cluster_adata,
        key=condition_rank_key,
        style=marker_style,
        n_genes=n_features_markerplot,
        title=cluster_name,
        save=f"{marker_style}_{condition_col}_cluster_{cluster_name}.pdf"
    )

In [None]:
# write marker tables
for key, value in adata_cond.items():
    # Write marker genes to table
    deg_table = tools.marker_genes.get_rank_genes_tables(
        value,
        out_group_fractions=True,
        key=f"rank_genes_{condition_col}_filtered",
        save_excel=f"{condition_col}_cluster_{key}.xlsx"
    )

---------

 ## 7 - Run DEseq2 between conditions/samples

 <h1><center>⬐ Fill in input data here ⬎</center></h1>

In [None]:
# Adjust which columns to use for DEseq2
sample_col = "sample"
condition_col = "meta c19_severity" 

___

In [None]:
# Normalize raw counts across cells
d = sc.pp.normalize_total(adata, layer="raw", inplace=False) # returns a dict
adata.layers["raw_norm"] = d["X"]
adata.layers["raw_norm"] = adata.layers["raw_norm"].ceil().astype(int)

In [None]:
# Run DEseq2 (disabled by default; uncomment the line below to execute)
#deseq_table = tools.marker_genes.run_deseq2(adata, sample_col, condition_col, layer="raw_norm")

The DEseq2 result table (below) shows the top 10 differential features sorted by the p-value of the first two conditions.

| Column | Description |
|--------|-------------|
|`[condition]_mean`|The mean expression of the respective feature within the given condition.|
|`[condition 1]/[condition 2]_log2FoldChange`|Log2 of the quotient of the two conditions' mean expression. Values `>0` can be interpreted as "more expression in 'condition 1'" and `<0` as "more expression in 'condition 2'".|
|`[condition 1]/[condition 2]_pvalue`|The p-value of the test for differential expression of a feature between the two conditions. Closer to zero = better. `<=0.05` is typically considered significant.|
|`[condition 1]/[condition 2]_padj`|The p-value (see above) adjusted for the number of tests (here features).|
|`[sample][condition]`|The mean feature expression within the respective sample and condition.|
|`baseMean`|The mean expression over each sample split by condition (see above).|

In [None]:
#with pd.option_context("display.max.rows", 10, "display.max.columns", None):
    #display(deseq_table.head(10))
#print(f"{len(deseq_table)} rows x {len(deseq_table.columns)} columns")

In [None]:
#deseq_table.to_excel(f"{sctoolbox.settings.table_dir}/DEseq_{sample_col}_vs_{condition_col}.xlsx")

---------

## 8 - Save adata

In [None]:
# Store the AnnData object including the results added by this notebook.
# NOTE(review): presumably written into settings.adata_output_dir — confirm.
utils.adata.save_h5ad(adata, "anndata_marker.h5ad")