In [1]:
import gseapy as gp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df = pd.read_csv(
    '../diffexp/significant_degs.skin.tissue.sat1.tsv',
    sep = '\t'
)
df['regulation'] = np.select(
    [df.avg_log2FC < 0, df.avg_log2FC > 0],
    ['down', 'up']
)
df

Unnamed: 0,p_val,avg_log2FC,pct.1,pct.2,p_val_adj,significant,gene,regulation
0,1.934653e-190,1.157765,0.920,0.663,3.894456e-186,True,CREM,up
1,1.336488e-156,1.244570,0.909,0.712,2.690351e-152,True,SAT1,up
2,6.995350e-144,1.168116,0.657,0.298,1.408164e-139,True,ZNF331,up
3,6.945343e-143,1.015987,0.830,0.583,1.398098e-138,True,SELENOK,up
4,1.515644e-135,0.838064,0.981,0.963,3.050992e-131,True,BTG1,up
...,...,...,...,...,...,...,...,...
159,2.494065e-19,-0.704236,0.372,0.464,5.020552e-15,True,MT-ATP8,down
160,2.979304e-17,0.702524,0.725,0.655,5.997339e-13,True,HSP90AA1,up
161,1.975323e-15,0.532631,0.572,0.466,3.976326e-11,True,HSPE1,up
162,2.570882e-11,0.561183,0.330,0.272,5.175185e-07,True,MAGEH1,up


In [6]:
gene_set = 'MSigDB_Hallmark_2020'
for regulation, regulation_group in df.groupby('regulation'):
    results = gp.enrichr(
        gene_list = regulation_group.gene,
        gene_sets = gene_set,
        organism = 'Human',
        no_plot = True,
        outdir = None
    )
    ax = gp.dotplot(
        results.res2d,
        top_term = 20,
        title = f'{regulation}regulated genes',
        ofname = f'../plots/over_representation_{regulation}_{gene_set}.pdf'
    )

In [9]:
gene_set = 'MSigDB_Hallmark_2020'
for regulation, regulation_group in df.groupby('regulation'):
    rnk = regulation_group.loc[:, ['gene', 'avg_log2FC']].copy()
    rnk.index = rnk.gene.str.upper()
    results = gp.prerank(
        rnk = rnk.sort_values(by = 'avg_log2FC').loc[:, ['avg_log2FC']],
        gene_sets = gene_set,
        min_size = 1,
        max_size = 1000,
        permutation_num = 1000, # reduce number to speed up testing
        outdir = None, # don't write to disk
        seed = 6,
        verbose = True, # see what's going on behind the scenes
        no_plot = True
    )
    ax = gp.dotplot(
        results.res2d,
        column = 'FDR q-val',
        top_term = 20,
        title = f'{regulation}regulation',
        ofname = f'../plots/gsea_{regulation}_{gene_set}.pdf'
    )

2023-03-22 17:04:26,893 [INFO] Parsing data files for GSEA.............................
2023-03-22 17:04:26,896 [INFO] Enrichr library gene sets already downloaded in: /users/daniel.malzl/.cache/gseapy, use local file
2023-03-22 17:04:26,902 [INFO] 0032 gene_sets have been filtered out when max_size=1000 and min_size=1
2023-03-22 17:04:26,903 [INFO] 0018 gene_sets used for further statistical testing.....
2023-03-22 17:04:26,905 [INFO] Start to run GSEA...Might take a while..................
2023-03-22 17:04:26,931 [INFO] Congratulations. GSEApy runs successfully................



ValueError: Warning: No enrich terms when cutoff = 0.05

In [8]:
results.res2d

Unnamed: 0,Name,Term,ES,NES,NOM p-val,FDR q-val,FWER p-val,Tag %,Gene %,Lead_genes
0,prerank,Interferon Alpha Response,-0.931034,-1.666816,0.02139,0.080008,0.07,2/2,16.13%,IFITM1;TXNIP
1,prerank,Oxidative Phosphorylation,0.52,1.453338,0.114355,0.357795,0.252,6/6,58.06%,ATP5PF;UQCR10;BAX;ATP5ME;NDUFS7;UQCR11
2,prerank,Apical Junction,-0.793103,-1.391863,0.126582,0.220023,0.324,2/2,29.03%,ACTB;EVL
3,prerank,Pperoxisome,0.966667,1.256992,0.124169,0.712887,0.677,1/1,6.45%,GSTK1
4,prerank,Myc Targets V1,0.933333,1.224203,0.171367,0.576507,0.747,1/1,9.68%,LSM7
5,prerank,Interferon Gamma Response,-0.9,-1.192973,0.234004,0.663324,0.823,1/1,16.13%,TXNIP
6,prerank,Bile Acid Metabolism,0.551724,0.986988,0.478936,1.0,0.977,2/2,48.39%,GSTK1;LCK
7,prerank,DNA Repair,0.733333,0.963351,0.567391,0.956858,0.98,1/1,29.03%,DAD1
8,prerank,Inflammatory Response,-0.497252,-0.902086,0.629344,1.0,0.999,1/2,12.90%,IFITM1
9,prerank,p53 Pathway,-0.471084,-0.835418,0.721831,1.0,0.999,1/2,16.13%,TXNIP
