## Run Gene Set Enrichment Analysis (GSEA) Enrichr using gseapy for each broad and specific cell type's individual age-associated latent factors

In [1]:
!date

Fri Jun 21 15:38:28 EDT 2024


#### import libraries

In [6]:
from pickle import load as pkl_load
from re import match
from time import sleep

import gseapy
from gseapy.enrichr import Enrichr
from pandas import read_csv, concat, DataFrame

#### set notebook variables

In [34]:
# analysis parameters
project = 'aging_phase2'
latent_type = 'all'

# constants and variables
DEBUG = True
ALPHA = 0.05    # threshold applied to the FDR / adjusted p-values below
PAUSE_AMT = 2   # seconds passed to sleep() between enrichment requests
marker_sets = [
    'MSigDB_Hallmark_2020',
    'GO_Biological_Process_2023',
    'GO_Cellular_Component_2023',
    'GO_Molecular_Function_2023',
    'KEGG_2019_Human',
]

# directories
wrk_dir = '/labshare/raph/datasets/adrd_neuro/brain_aging/phase2'
results_dir = f'{wrk_dir}/results'
figures_dir = f'{wrk_dir}/figures'

# in files
assoc_file = f'{results_dir}/{project}.latent.age_glm.csv'
loadings_pickle = f'{results_dir}/{project}.latent.loadings.pkl'

# out files
results_file = f'{figures_dir}/{project}.cell_type_latents.{latent_type}.gsea_enrichr.csv'

In [38]:
# echo the resolved input and output paths, one per line, when debugging
if DEBUG:
    print(assoc_file, loadings_pickle, results_file, sep='\n')

/labshare/raph/datasets/adrd_neuro/brain_aging/phase2/results/aging_phase2.latent.age_glm.csv
/labshare/raph/datasets/adrd_neuro/brain_aging/phase2/results/aging_phase2.latent.loadings.pkl
/labshare/raph/datasets/adrd_neuro/brain_aging/phase2/figures/aging_phase2.cell_type_latents.all.gsea_enrichr.csv


#### functions

In [4]:
def find_enrichment(name: str, genes: list, sets,
                    verbose: bool=False, alpha: float=0.05) -> DataFrame:
    """Run an Enrichr query for one gene list and tag the results with its factor name.

    Args:
        name: label written to the 'factor' column of every returned row.
        genes: gene symbols to test for enrichment.
        sets: Enrichr library name(s) forwarded to gseapy as `gene_sets`.
        verbose: when True, print the result shapes and display the rows whose
            'Adjusted P-value' is at or below `alpha`.
        alpha: significance threshold used only for the verbose summary; the
            returned table is never filtered.

    Returns:
        The full (unfiltered) Enrichr results table with an added 'factor'
        column. An empty table with the same columns is returned when `genes`
        is empty, so no request is made for an empty query.
    """
    if len(genes) == 0:
        # nothing to query; hand back an empty frame that concatenates cleanly
        return DataFrame(columns=['Gene_set', 'Term', 'Overlap', 'P-value',
                                  'Adjusted P-value', 'Old P-value',
                                  'Old Adjusted P-value', 'Odds Ratio',
                                  'Combined Score', 'Genes', 'factor'])
    # NOTE(review): cutoff is believed to affect only gseapy's plotting output,
    # not the returned table -- confirm against the gseapy documentation
    enr_res = gseapy.enrichr(gene_list=genes,
                             organism='Human',
                             gene_sets=sets,
                             cutoff=0.5)
    enrichments = enr_res.results
    enrichments['factor'] = name
    if verbose:
        print(f'full {sets} results shape{enrichments.shape}')
        sig = enrichments.loc[enrichments['Adjusted P-value'] <= alpha]
        print(f'significant {sets} results shape{sig.shape}')
        display(sig)
    return enrichments

### load input data

#### load the latent factor age associations

In [5]:
# read the latent factor age association table; the first CSV column is the index
age_glm_df = read_csv(assoc_file, index_col=0)
print(f'shape of age_glm_df is {age_glm_df.shape}')
# build a '<cell_type>:<feature>' key for each latent factor
age_glm_df['key_name'] = age_glm_df['cell_type'] + ':' + age_glm_df['feature']
if DEBUG:
    display(age_glm_df.sample(4))
    print(f'age_glm_df has {age_glm_df["key_name"].nunique()} keys')

shape of age_glm_df is (491, 9)


Unnamed: 0,feature,coef,stderr,z,p-value,type,cell_type,model_type,fdr_bh,key_name
0,NMF_0,-16.021782,12.974903,-1.234829,0.2168943,specific,InN-5,nmf,0.34133,InN-5:NMF_0
1,ICA_1,-2.226917,4.883931,-0.455968,0.6484129,specific,ExN-6,ica,0.786101,ExN-6:ICA_1
4,NMF_4,34.70334,6.49461,5.343407,9.121578e-08,specific,InN-10,nmf,1e-06,InN-10:NMF_4
0,NMF_0,29.282517,5.909427,4.955221,7.224811e-07,broad,Micro,nmf,6e-06,Micro:NMF_0


age_glm_df has 491 keys


### load the latent factors' feature loadings

In [11]:
# deserialize the saved loadings; only unpickle files from a trusted source
with open(loadings_pickle, mode='rb') as loadings_handle:
    feature_loadings = pkl_load(loadings_handle)
print(f'loadings_pickle has {len(feature_loadings)} entries')

loadings_pickle has 491 entries


### subset the latent factors to only those with a statistically significant age association

In [14]:
# keep only the latent factors whose BH-corrected age association passes ALPHA
age_glm_df = age_glm_df[age_glm_df['fdr_bh'] <= ALPHA]
print(f'shape of age_glm_df is {age_glm_df.shape}')
if DEBUG:
    display(age_glm_df.sample(4))
    display(age_glm_df['model_type'].value_counts())

shape of age_glm_df is (196, 10)


Unnamed: 0,feature,coef,stderr,z,p-value,type,cell_type,model_type,fdr_bh,key_name
0,PCA_0,1.015366,0.160244,6.336367,2.35246e-10,specific,Astro-1,pca,4.687992e-09,Astro-1:PCA_0
1,NMF_1,32.594619,11.110218,2.933752,0.003348921,specific,ExN-14,nmf,0.01174514,ExN-14:NMF_1
5,ICA_5,-18.999449,4.604183,-4.126563,3.682254e-05,specific,ExN-15,ica,0.0002259983,ExN-15:ICA_5
0,PCA_0,1.334802,0.374338,3.565768,0.0003627919,specific,ExN-14,pca,0.001746381,ExN-14:PCA_0


nmf    82
ica    66
pca    48
Name: model_type, dtype: int64

### build the gene sets to use per age-associated latent factor

In [26]:
%%time
latent_features = {}
# regex pattern for match ATAC peaks naming format
pattern = r'^chr.*:.*-.*$'
for factor in age_glm_df.key_name.unique():
    loading = feature_loadings.get(factor)
    age_features = []
    for feature, weight in loading.items():
        # # # only add genes not ATAC peaks, very large and slow otherwise
        if not match(pattern, feature):
            age_features.append(feature)
    if len(age_features) > 0:
        latent_features[factor] = list(set(age_features))
    if DEBUG:
        print(f'{factor} has {len(age_features)} genes')
print(f'latent_features has {len(latent_features)} keys')

OD-0:PCA_0 has 2 genes
OD-0:NMF_0 has 37 genes
OD-0:NMF_2 has 57 genes
OD-0:ICA_1 has 146 genes
OD-0:ICA_2 has 107 genes
ExN-8:PCA_1 has 87 genes
ExN-8:NMF_1 has 140 genes
ExN-8:NMF_2 has 121 genes
ExN-8:NMF_3 has 137 genes
ExN-8:ICA_0 has 119 genes
ExN-8:ICA_4 has 121 genes
ExN-14:PCA_0 has 0 genes
ExN-14:NMF_1 has 0 genes
ExN-14:NMF_3 has 14 genes
ExN-14:ICA_0 has 80 genes
ExN-14:ICA_3 has 52 genes
OPC-4:PCA_0 has 0 genes
OPC-4:NMF_1 has 0 genes
OPC-4:NMF_5 has 29 genes
OPC-4:ICA_0 has 127 genes
OPC-4:ICA_3 has 121 genes
Micro-3:PCA_0 has 2 genes
Micro-3:PCA_1 has 239 genes
Micro-3:NMF_0 has 63 genes
Micro-3:NMF_1 has 3 genes
Micro-3:NMF_2 has 417 genes
Micro-3:NMF_4 has 209 genes
Micro-3:ICA_3 has 199 genes
Micro-3:ICA_4 has 233 genes
Micro-3:ICA_5 has 345 genes
ExN-9:PCA_0 has 0 genes
ExN-9:NMF_4 has 1 genes
ExN-9:ICA_5 has 144 genes
InN-5:PCA_1 has 161 genes
InN-5:PCA_2 has 167 genes
InN-5:PCA_4 has 156 genes
InN-5:NMF_1 has 182 genes
InN-5:NMF_2 has 151 genes
InN-5:ICA_0 has 146 

### run the GSEA Enrichr

#### if debugging see available GSEA libraries

In [27]:
# gseapy is imported with the other libraries at the top of the notebook, so the
# enrichment function no longer depends on this debug-only cell having run
if DEBUG:
    gene_set_names = gseapy.get_library_name(organism='Human')
    print(gene_set_names)
    # confirm every requested library is actually offered before the long run
    missing_sets = sorted(set(marker_sets) - set(gene_set_names))
    if missing_sets:
        print(f'WARNING: marker sets not available in Enrichr: {missing_sets}')

['ARCHS4_Cell-lines', 'ARCHS4_IDG_Coexp', 'ARCHS4_Kinases_Coexp', 'ARCHS4_TFs_Coexp', 'ARCHS4_Tissues', 'Achilles_fitness_decrease', 'Achilles_fitness_increase', 'Aging_Perturbations_from_GEO_down', 'Aging_Perturbations_from_GEO_up', 'Allen_Brain_Atlas_10x_scRNA_2021', 'Allen_Brain_Atlas_down', 'Allen_Brain_Atlas_up', 'Azimuth_2023', 'Azimuth_Cell_Types_2021', 'BioCarta_2013', 'BioCarta_2015', 'BioCarta_2016', 'BioPlanet_2019', 'BioPlex_2017', 'CCLE_Proteomics_2020', 'CORUM', 'COVID-19_Related_Gene_Sets', 'COVID-19_Related_Gene_Sets_2021', 'Cancer_Cell_Line_Encyclopedia', 'CellMarker_2024', 'CellMarker_Augmented_2021', 'ChEA_2013', 'ChEA_2015', 'ChEA_2016', 'ChEA_2022', 'Chromosome_Location', 'Chromosome_Location_hg19', 'ClinVar_2019', 'DSigDB', 'Data_Acquisition_Method_Most_Popular_Genes', 'DepMap_WG_CRISPR_Screens_Broad_CellLines_2019', 'DepMap_WG_CRISPR_Screens_Sanger_CellLines_2019', 'Descartes_Cell_Types_and_Tissue_2021', 'Diabetes_Perturbations_GEO_2022', 'DisGeNET', 'Disease_Per

In [28]:
%%time
results = []
for latent, gene_list in latent_features.items():
    print(f'\n########### {latent} ###########')
    for gene_set in marker_sets:
        print(f'\n+++++++++++ {gene_set} +++++++++++')
        results.append(find_enrichment(latent, list(gene_list), gene_set, verbose=False))
        sleep(PAUSE_AMT)


########### OD-0:PCA_0 ###########

+++++++++++ MSigDB_Hallmark_2020 +++++++++++

+++++++++++ GO_Biological_Process_2023 +++++++++++

+++++++++++ GO_Cellular_Component_2023 +++++++++++

+++++++++++ GO_Molecular_Function_2023 +++++++++++

+++++++++++ KEGG_2019_Human +++++++++++

########### OD-0:NMF_0 ###########

+++++++++++ MSigDB_Hallmark_2020 +++++++++++

+++++++++++ GO_Biological_Process_2023 +++++++++++

+++++++++++ GO_Cellular_Component_2023 +++++++++++

+++++++++++ GO_Molecular_Function_2023 +++++++++++

+++++++++++ KEGG_2019_Human +++++++++++

########### OD-0:NMF_2 ###########

+++++++++++ MSigDB_Hallmark_2020 +++++++++++

+++++++++++ GO_Biological_Process_2023 +++++++++++

+++++++++++ GO_Cellular_Component_2023 +++++++++++

+++++++++++ GO_Molecular_Function_2023 +++++++++++

+++++++++++ KEGG_2019_Human +++++++++++

########### OD-0:ICA_1 ###########

+++++++++++ MSigDB_Hallmark_2020 +++++++++++

+++++++++++ GO_Biological_Process_2023 +++++++++++

+++++++++++ GO_Cellular_Comp

#### convert full enrichment results into combined data frame

In [29]:
# stack the per-factor, per-library tables row-wise; each table keeps its own
# index, so index values repeat in the combined frame
results_df = concat(results, axis=0)
print(f'full results shape {results_df.shape}')
if DEBUG:
    display(results_df.sample(5))
    display(results_df['Gene_set'].value_counts())

full results shape (130635, 11)


Unnamed: 0,Gene_set,Term,Overlap,P-value,Adjusted P-value,Old P-value,Old Adjusted P-value,Odds Ratio,Combined Score,Genes,factor
42,GO_Cellular_Component_2023,Endocytic Vesicle Membrane (GO:0030666),3/159,0.240247,0.698393,0,0,1.797711,2.563692,RAB35;CYBB;CD36,InN:ICA_5
23,GO_Cellular_Component_2023,Golgi Medial Cisterna (GO:0005797),2/10,0.020764,0.200715,0,0,10.733516,41.587551,YIPF1;ST3GAL1,Micro:PCA_1
344,GO_Biological_Process_2023,Monoatomic Cation Transmembrane Transport (GO:...,2/281,0.310144,0.425697,0,0,1.805073,2.11323,SLC12A1;CACNA1G,ExN-14:ICA_0
729,GO_Biological_Process_2023,Peptidyl-Serine Phosphorylation (GO:0018105),2/158,0.585534,0.768413,0,0,1.021263,0.546612,PRKCQ;STK32B,InN:NMF_3
485,GO_Biological_Process_2023,Positive Regulation Of T Cell Mediated Cytotox...,1/33,0.546366,0.999988,0,0,1.290718,0.780194,FADD,ExN:NMF_2


GO_Biological_Process_2023    86237
GO_Molecular_Function_2023    18389
KEGG_2019_Human               11508
GO_Cellular_Component_2023    11341
MSigDB_Hallmark_2020           3160
Name: Gene_set, dtype: int64

### clean up the gene set names and prefix each term with its gene set

In [30]:
# strip the library prefix and version suffixes from the gene set names;
# regex=False forces literal replacement regardless of the pandas default
gene_set_labels = results_df['Gene_set']
for token in ('GO_', '_2020', '_2023', '_2019_Human'):
    gene_set_labels = gene_set_labels.str.replace(token, '', regex=False)
results_df['Gene_set'] = gene_set_labels
# prefix each term with its gene set, skipping terms that already carry the
# prefix so re-running this cell does not prefix them twice; the list is
# assigned positionally, so the repeated index values are not a problem
results_df['Term'] = [
    term if term.startswith(f'{gene_set}: ') else f'{gene_set}: {term}'
    for gene_set, term in zip(results_df['Gene_set'], results_df['Term'])
]
print(f'shape of GSEA post Term naming cleanup {results_df.shape}')
if DEBUG:
    display(results_df.sample(5))

shape of GSEA post Term naming cleanup (130635, 11)


Unnamed: 0,Gene_set,Term,Overlap,P-value,Adjusted P-value,Old P-value,Old Adjusted P-value,Odds Ratio,Combined Score,Genes,factor
457,Biological_Process,Biological_Process: Cytoskeleton Organization ...,1/111,0.296135,0.356914,0,0,2.907185,3.537865,DIAPH1,Micro-3:NMF_0
289,Biological_Process,Biological_Process: Membrane Organization (GO:...,1/167,0.279153,0.323433,0,0,3.138079,4.004169,REEP1,InN-10:ICA_0
347,Biological_Process,Biological_Process: Synapse Organization (GO:0...,2/131,0.714387,0.999996,0,0,0.799434,0.268874,MUSK;MYOT,ExN-7:NMF_2
29,Biological_Process,Biological_Process: Cellular Response To Inter...,1/10,0.022768,0.179748,0,0,49.246914,186.270813,IL15RA,ExN-6:NMF_2
49,KEGG,KEGG: Pancreatic secretion,2/98,0.164003,0.517119,0,0,2.819064,5.096495,ADCY3;ADCY2,Astro-1:ICA_4


#### how many are statistically significant

In [31]:
# keep the enrichments whose adjusted p-value passes ALPHA
sig_results = results_df[results_df['Adjusted P-value'] <= ALPHA]
print(f'{sig_results.shape[0]} terms were detected')
# head(20) returns every row when fewer than 20 exist, so no size check is needed
display(sig_results.sort_values('Odds Ratio', ascending=False).head(20))

949 terms were detected


Unnamed: 0,Gene_set,Term,Overlap,P-value,Adjusted P-value,Old P-value,Old Adjusted P-value,Odds Ratio,Combined Score,Genes,factor
0,Biological_Process,Biological_Process: Resolution Of Recombinatio...,1/5,0.00025,0.00255,0,0,19995.0,165840.178437,GEN1,Micro:PCA_0
0,Molecular_Function,Molecular_Function: 5'-Flap Endonuclease Activ...,1/5,0.00025,0.0009,0,0,19995.0,165840.178437,GEN1,Micro:PCA_0
1,Molecular_Function,Molecular_Function: Flap Endonuclease Activity...,1/6,0.0003,0.0009,0,0,19994.0,162186.518632,GEN1,Micro:PCA_0
2,Molecular_Function,Molecular_Function: Crossover Junction DNA End...,1/7,0.00035,0.0009,0,0,19993.0,159096.448716,GEN1,Micro:PCA_0
3,Molecular_Function,Molecular_Function: Endodeoxyribonuclease Acti...,1/8,0.0004,0.0009,0,0,19992.0,156418.911417,GEN1,Micro:PCA_0
4,Molecular_Function,"Molecular_Function: DNA Endonuclease Activity,...",1/9,0.00045,0.0009,0,0,19991.0,154056.46924,GEN1,Micro:PCA_0
0,Biological_Process,Biological_Process: Columnar/Cuboidal Epitheli...,1/10,0.0005,0.00765,0,0,19990.0,151942.590868,ROS1,ExN:NMF_3
0,Molecular_Function,Molecular_Function: cAMP-dependent Protein Kin...,1/10,0.0005,0.0012,0,0,19990.0,151942.590868,PKIA,Astro:PCA_0
2,Biological_Process,Biological_Process: Positive Regulation Of Spi...,1/11,0.00055,0.00255,0,0,19989.0,150029.821029,GEN1,Micro:PCA_0
1,Biological_Process,Biological_Process: Positive Regulation Of Mit...,1/11,0.00055,0.00255,0,0,19989.0,150029.821029,GEN1,Micro:PCA_0


### save the GSEA Enrichments detected

In [35]:
# write sig_results to results_file as CSV; the frame's index is written as the first column
sig_results.to_csv(results_file)

In [None]:
!date