In [1]:
from sctoolbox.utils.jupyter import bgcolor

# Gene Set Enrichment Analysis (GSEA)
<hr style="border:2px solid black">

## 1 - Description

**Note: You need to have run the marker gene notebook before using the GSEA notebook**

The main function of this notebook is to get the enriched GO pathways per cluster. For this we use the marker genes of each cluster as the input gene list.  
This notebook uses [enrichr](https://maayanlab.cloud/Enrichr/) which is implemented in [GSEApy](https://github.com/zqfang/GSEApy).  

`The enrichr module enable you perform gene set enrichment analysis using Enrichr API. Enrichr is open source and freely available online at: http://amp.pharm.mssm.edu/Enrichr . It runs very fast.` - GSEApy docu

---

## 2 - Setup

In [None]:
from pathlib import Path

import gseapy as gp
import matplotlib.pyplot as plt
import pandas as pd
import tqdm

import sctoolbox.tools as tools
import sctoolbox.utils as utils
from sctoolbox import settings

---

## 3 - General Input

<h1><center>⬐ Fill in input data here ⬎</center></h1>

In [2]:
%bgcolor PowderBlue

# sctoolbox settings
# NOTE(review): presumably the directories sctoolbox reads anndata files from
# and writes them to - confirm against the sctoolbox settings documentation.
settings.adata_input_dir = "../adatas/"
settings.adata_output_dir = "../adatas/"
# Used in section 7 to build the output path of the dotplot PDFs.
settings.figure_dir = "../figures/GSEA/"
settings.log_file = "../logs/GSEA_log.txt"
# File name handed to utils.adata.load_h5ad in section 4.
last_notebook_adata = "anndata_0A.h5ad"

# Passed to gp.get_library_name (section 5) and to
# tools.gsea.enrichr_marker_genes (section 6).
organism = "human"

# key for marker table in adata.uns
marker_key = "rank_genes_leiden_0.1_filtered"
# Forwarded to tools.gsea.enrichr_marker_genes as `pvals_adj_tresh`.
# NOTE(review): presumably the adjusted p-value cutoff applied to the marker
# genes - confirm against the sctoolbox documentation.
pvals_adj_tresh = 0.05

NameError: name 'settings' is not defined

---

## 4 - Load anndata

In [None]:
# Load the anndata file named by `last_notebook_adata` in the input cell.
adata = utils.adata.load_h5ad(last_notebook_adata)

# Show the object plus its obs/var tables, limited to 5 rows but with all
# columns. The canonical option names are used here: spellings such as
# "display.max.rows" only resolve through pandas' regex fallback and would
# raise an OptionError as soon as the pattern matched more than one option.
with pd.option_context("display.max_rows", 5, "display.max_columns", None):
    display(adata)
    display(adata.obs)
    display(adata.var)

---

## 5 - Select library

**List of available libraries**

In [None]:
# Library names reported by gseapy for the selected organism,
# restricted to those whose name starts with "GO".
[
    library
    for library in gp.get_library_name(organism)
    if library.startswith("GO")
]

<h1><center>⬐ Fill in input data here ⬎</center></h1>

In [None]:
%bgcolor PowderBlue

# All three values below are forwarded to tools.gsea.enrichr_marker_genes
# in section 6 (as `library_name`, `gene_sets` and `background`).

# Choose public library to use
library_name = "GO_Biological_Process_2023"

# If custom gene sets and background should be used set here.
# The public library will be ignored if custom_gene_set is given.
# To use a custom background for the public gene set library only set custom_background.
custom_gene_set = None     # {"Pathway 1": ["Gene1", "Gene2", ...], ...}
custom_background = None   # ["Gene1", "Gene2", ...]

---

## 6 - Run enrichr

In [None]:
# Hand the marker table (looked up via `marker_key`) and the library / custom
# gene set choices to sctoolbox. The result is kept in `combined`, which the
# plotting section filters by its "UP_DW" column.
combined = tools.gsea.enrichr_marker_genes(
    adata,
    marker_key=marker_key,
    organism=organism,
    pvals_adj_tresh=pvals_adj_tresh,
    gene_sets=custom_gene_set,
    background=custom_background,
    library_name=library_name,
)

---

## 7 - Plotting

**Dotplot**  
The dotplot shows all pathways as dots per cluster.  
The size of the dot indicates the fraction of genes in the cluster that match the pathway.  
The color of the dot indicates the adjusted p-value.

<h1><center>⬐ Fill in input data here ⬎</center></h1>

In [None]:
%bgcolor PowderBlue

# Dotplot
# These values are passed unchanged to gp.dotplot in the plotting cell
# (as `figsize`, `top_term` and `size`).
# NOTE(review): figsize is presumably (width, height) in inches as in
# matplotlib - confirm against the gseapy dotplot documentation.
figsize = (8, 20) # Set figure size for dotplot
top_term = 10     # Number of pathways shown per cluster

size = 5          # Size scale for dots

---

In [None]:
# Draw one dotplot per regulation direction and save each one as a PDF
# inside settings.figure_dir.
for reg in ["UP", "DOWN"]:
    # Keep only the enrichment results of the current direction.
    comb = combined[combined["UP_DW"] == reg]
    if comb.empty:
        # No rows for this direction, so there is nothing to plot.
        continue

    ax = gp.dotplot(comb,
                    figsize=figsize,
                    x='Cluster',
                    title=f"Top {top_term} {reg} regulated Pathways per Cluster",
                    cmap=plt.cm.autumn_r,
                    size=size,
                    show_ring=True,
                    top_term=top_term,
                    xticklabels_rot=45)
    ax.set_xlabel("")

    # Lay out and save the figure that owns the returned axes instead of
    # relying on pyplot's implicit "current figure", which is only the same
    # object if the plotting function registered its figure with pyplot.
    fig = ax.figure
    fig.tight_layout()

    # Join directory and file name with pathlib so a trailing slash in
    # settings.figure_dir does not produce a doubled separator.
    out_file = Path(settings.figure_dir) / f"GSEA_dotplot_top{top_term}_{reg}_pathways_per_cluster.pdf"
    fig.savefig(out_file, dpi=300)