In [6]:
from sctoolbox.utils.jupyter import bgcolor

# Gene Set Enrichment Analysis (GSEA)
<hr style="border:2px solid black"> </hr>

## 1 - Description

**Note: You need to have run the marker gene notebook before using the GSEA notebook**

The main function of this notebook is to get the enriched GO pathways per cluster. For this we use the marker genes as the gene set input.  
This notebook uses [enrichr](https://maayanlab.cloud/Enrichr/) which is implemented in [GSEApy](https://github.com/zqfang/GSEApy).    

`The enrichr module enable you perform gene set enrichment analysis using Enrichr API. Enrichr is open source and freely available online at: http://amp.pharm.mssm.edu/Enrichr . It runs very fast.` - GSEApy documentation

---

## 2 - Setup

In [7]:
import sctoolbox.utils as utils
import sctoolbox.tools as tools
from sctoolbox import settings

import pandas as pd
import gseapy as gp
import tqdm
import matplotlib.pyplot as plt

---

## 3 - General Input

In [10]:
%bgcolor PowderBlue

# sctoolbox settings
# Directory the input AnnData file is loaded from
settings.adata_input_dir = "../rna_analysis/adatas/"
# NOTE(review): presumably the directory AnnData files are written to; nothing is saved in this notebook - confirm
settings.adata_output_dir = "../rna_analysis/adatas/"
# Directory the dot plot PDFs are saved into (used to build the savefig path in the plotting section)
settings.figure_dir = "../figures/GSEA/"
# NOTE(review): presumably the file sctoolbox writes its log messages to - confirm
settings.log_file = "../logs/GSEA_log.txt"
# File name of the AnnData object to load in the "Load anndata" section
last_notebook_adata = "anndata_5.h5ad"

# Organism passed to the gseapy library lookups and to the enrichr run
organism = "human"

# key for marker table in adata.uns
marker_key = "rank_genes_leiden_0.1_filtered" 
# Threshold handed to enrichr_marker_genes; presumably an adjusted p-value cutoff - confirm
pvals_adj_tresh = 0.05



---

## 4 - Load anndata

In [11]:
# Load the AnnData file named by `last_notebook_adata`.
adata = utils.adata.load_h5ad(last_notebook_adata)

# Preview the object together with its obs and var tables: at most 5 rows,
# but every column. The canonical pandas option names are used here; the
# dotted spellings ("display.max.rows") only resolve through pandas' regex
# option matching, which can become ambiguous when new options are added.
with pd.option_context("display.max_rows", 5, "display.max_columns", None):
    display(adata)
    display(adata.obs)
    display(adata.var)

[INFO] The adata object was loaded from: ../rna_analysis/adatas/anndata_5.h5ad


AnnData object with n_obs × n_vars = 2769 × 21142
    obs: 'orig.ident', 'chamber', 'donor', 'batch', 'sample', 'celltype', 'total_counts', 'log1p_total_counts', 'total_counts_is_ribo', 'log1p_total_counts_is_ribo', 'pct_counts_is_ribo', 'total_counts_is_mito', 'log1p_total_counts_is_mito', 'pct_counts_is_mito', 'total_counts_is_gender', 'log1p_total_counts_is_gender', 'pct_counts_is_gender', 'doublet_score', 'predicted_doublet', 'predicted_sex', 'n_genes', 'log1p_n_genes', 'S_score', 'G2M_score', 'phase', 'leiden', 'LISI_score_X_pca', 'LISI_score_X_umap', 'leiden_0.1', 'leiden_0.2', 'leiden_0.3', 'leiden_0.4', 'leiden_0.5', 'leiden_0.6', 'leiden_0.7', 'leiden_0.8', 'leiden_0.9', 'clustering'
    var: 'is_ribo', 'is_mito', 'is_gender', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'batch_colors', 'clustering_colors', 'hvg', 'lei

Unnamed: 0,orig.ident,chamber,donor,batch,sample,celltype,total_counts,log1p_total_counts,total_counts_is_ribo,log1p_total_counts_is_ribo,pct_counts_is_ribo,total_counts_is_mito,log1p_total_counts_is_mito,pct_counts_is_mito,total_counts_is_gender,log1p_total_counts_is_gender,pct_counts_is_gender,doublet_score,predicted_doublet,predicted_sex,n_genes,log1p_n_genes,S_score,G2M_score,phase,leiden,LISI_score_X_pca,LISI_score_X_umap,leiden_0.1,leiden_0.2,leiden_0.3,leiden_0.4,leiden_0.5,leiden_0.6,leiden_0.7,leiden_0.8,leiden_0.9,clustering
NF2_RV_AGGTCTAGTAACTGCT,NF2_RV,B,2,1,2B,END,729.0,6.593045,2.0,1.098612,0.274348,4.0,1.609438,0.548697,16.0,2.833213,2.194787,0.021459,False,Female,572,6.350886,-0.061678,0.082235,G2M,6,1.116329,1.014324,2,2,3,3,3,3,3,3,3,3
NF3_RV_CATTTCACAAGAGTTA,NF3_RV,B,3,1,3B,vCM,961.0,6.869014,4.0,1.609438,0.416233,5.0,1.791759,0.520291,18.0,2.944439,1.873049,0.078167,False,Male,857,6.754604,-0.108512,-0.065094,G1,0,1.490767,1.974460,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NF3_RV_CGAAGGAGTATCCCAA,NF3_RV,B,3,1,3B,vCM,846.0,6.741701,3.0,1.386294,0.354610,4.0,1.609438,0.472813,18.0,2.944439,2.127660,0.060674,False,Male,766,6.642487,-0.051448,0.036180,G2M,3,1.993282,1.824108,1,1,1,1,1,1,1,1,1,1
NF2_RV_GAAATGAGTCGACGCT,NF2_RV,B,2,1,2B,FB,788.0,6.670766,6.0,1.945910,0.761421,4.0,1.609438,0.507614,28.0,3.367296,3.553300,0.023464,False,Female,610,6.415097,-0.065677,-0.083300,G1,1,1.986206,1.968581,1,1,2,2,2,2,2,2,2,2


Unnamed: 0_level_0,is_ribo,is_mito,is_gender,n_cells_by_counts,mean_counts,log1p_mean_counts,pct_dropout_by_counts,total_counts,log1p_total_counts,highly_variable,means,dispersions,dispersions_norm,mean,std
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
AL627309.1,False,False,False,19,0.007945,0.007914,99.313832,22.0,3.135494,False,0.007644,0.192276,-0.148830,0.005069,0.062276
AC114498.1,False,False,False,1,0.000361,0.000361,99.963886,1.0,0.693147,False,0.000286,-0.231932,-1.950036,0.000211,0.011096
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LINC01069,False,False,False,1,0.000361,0.000361,99.963886,1.0,0.693147,False,0.000302,-0.178531,-1.723294,0.000220,0.011552
GOLGA8Q,False,False,False,1,0.000361,0.000361,99.963886,1.0,0.693147,False,0.000299,-0.190068,-1.772278,0.000218,0.011452


---

## 5 - Select library

In [13]:
# List the gene set libraries gseapy reports for `organism` whose name starts with "GO"
list(filter(lambda name: name.startswith("GO"), gp.get_library_name(organism)))

['GO_Biological_Process_2013',
 'GO_Biological_Process_2015',
 'GO_Biological_Process_2017',
 'GO_Biological_Process_2017b',
 'GO_Biological_Process_2018',
 'GO_Biological_Process_2021',
 'GO_Biological_Process_2023',
 'GO_Cellular_Component_2013',
 'GO_Cellular_Component_2015',
 'GO_Cellular_Component_2017',
 'GO_Cellular_Component_2017b',
 'GO_Cellular_Component_2018',
 'GO_Cellular_Component_2021',
 'GO_Cellular_Component_2023',
 'GO_Molecular_Function_2013',
 'GO_Molecular_Function_2015',
 'GO_Molecular_Function_2017',
 'GO_Molecular_Function_2017b',
 'GO_Molecular_Function_2018',
 'GO_Molecular_Function_2021',
 'GO_Molecular_Function_2023']

In [14]:
%bgcolor PowderBlue

# Name of the gene set library to fetch with gp.get_library; pick one of the names listed above
library_name = "GO_Biological_Process_2023"

In [15]:
# Fetch the selected gene set library; the result is used as a mapping whose
# values are collections of genes.
go_mf = gp.get_library(organism=organism, name=library_name)
# Background gene universe: the union of the genes of every entry in the library
flat_list = {gene for term_genes in go_mf.values() for gene in term_genes}

---

## 7 - Run enrichr

In [None]:
# Run enrichr on the marker genes stored under `marker_key`, using the
# downloaded library as gene sets and all of its genes as background.
# The organism keyword must reference `organism` (defined in the general
# input cell); the former spelling `organsim` raised a NameError.
combined = tools.gsea.enrichr_marker_genes(
    adata,
    marker_key=marker_key,
    gene_sets=go_mf,
    organism=organism,
    background=flat_list,
    pvals_adj_tresh=pvals_adj_tresh,
)

---

## 8 - Plotting

In [None]:
%bgcolor PowderBlue

# Dotplot
# Figure size passed to gp.dotplot as `figsize`
figsize = (8, 20)
# Passed to gp.dotplot as `top_term`; also used in the plot title and the output file name
top_term = 10

# Passed to gp.dotplot as `size`
size = 5

In [None]:
# Draw and save one dot plot per regulation direction found in `combined`.
for reg in ["UP", "DOWN"]:
    comb = combined[combined["UP_DW"] == reg]
    if comb.empty:
        # No rows for this direction, so there is nothing to plot.
        continue

    ax = gp.dotplot(comb,
                    figsize=figsize,
                    x='Cluster',
                    title=f"Top {top_term} {reg} regulated Pathways per Cluster",
                    cmap=plt.cm.autumn_r,
                    size=size,
                    show_ring=True,
                    top_term=top_term,
                    xticklabels_rot=45
                    )
    ax.set_xlabel("")

    # Operate on the figure that owns the returned axes rather than on
    # pyplot's implicit "current figure", so the saved file is always the
    # dot plot that was just drawn.
    fig = ax.figure
    fig.tight_layout()
    fig.savefig(f"{settings.figure_dir}/GSEA_dotplot_top{top_term}_{reg}_pathways_per_cluster.pdf", dpi=300)

---