In [None]:
import gseapy as gp
import pandas as pd
import os

When loading gene sets, I used CSV files with the following format (the first column is the row index; the `gene_symbol` column holds the genes of the set):

| index | gene_symbol |
| ----- | ----------- |
| 0     | ABCE1       |
| 1     | NEAT1       |
| 2     | ALB         |

In [None]:
# Build {set_name: [gene symbols]} from one CSV file per gene set.
gene_sets = {}
for file in ["path/to/geneset.csv"]: #can run on multiple genesets at a time
    # First CSV column is the row index; the genes live in the "gene_symbol" column.
    df = pd.read_csv(file, header=0, index_col=0)
    # Strip only the final extension: split(".")[0] would cut a dotted file name
    # (e.g. a versioned gene-set file) down to its first segment.
    stem = os.path.splitext(os.path.basename(file))[0]
    set_name = stem.replace("Homo_sapiens_", "") # my genesets had prefixes on them, can remove replace()
    # Blank cells are read as NaN; drop them so the gene list only holds real symbols.
    gene_sets[set_name] = df["gene_symbol"].dropna().tolist()

### Option #1: Run pre-ranked GSEA on a `.rnk` file whose genes are already ranked in descending order

In [None]:
# Load the pre-ranked gene list; header=None because the .rnk file has no header row.
rnk_df = pd.read_csv("../ranking_expression.rnk", header=None, sep="\t")

# Sanity check: first rows, then the dtypes pandas inferred for each column.
for preview in (rnk_df.head(), rnk_df.dtypes):
    print(preview)

# Pre-ranked enrichment of the ranked list against the gene sets loaded above.
pre_res = gp.prerank(gene_sets=gene_sets, rnk=rnk_df)

### Option #2: Run full GSEA, similar to the GSEA CLI provided by the Broad Institute

In [None]:
# Parse the .cls annotation file; the three returned fields are unpacked into
# the two phenotype labels and the class vector that the cells below use.
cls_fields = gp.parser.gsea_cls_parser("annotations.cls")
phenoA, phenoB, class_vector = cls_fields
print(class_vector)

# header=2: the third line of the .gct file supplies the column names.
gene_exp = pd.read_csv("../expression_matrix.gct", header=2, sep="\t")
gene_exp.head()

In [None]:
# Report the two phenotype labels returned by the .cls parser.
print("positively correlated: ", phenoA) #phenotype of interest
print("negatively correlated: ", phenoB) #control

positively correlated:  Low
negtively correlated:  High


In [None]:
# Build the GSEA object WITHOUT running it. gp.gsea() executes the analysis
# before it returns, so the previous gp.gsea(...) followed by .run() carried out
# the whole 100000-permutation analysis twice, and the first pass ran before the
# phenotype labels below had been assigned. The class constructor takes the
# class vector under the keyword `classes`.
gs_res = gp.GSEA(data=gene_exp,
         gene_sets=gene_sets,
         classes=class_vector,
         permutation_type='phenotype',
         permutation_num=100000,
         outdir="GSEA",
         method='s2n') # signal-to-noise ranking metric; author's note: strong where n>=3/condition
# Explicit phenotype labels: "Low" is the phenotype of interest, "High" the control.
gs_res.pheno_pos = "Low"
gs_res.pheno_neg = "High"
# Single execution of the analysis.
gs_res.run()

