## Run Gene Set Enrichment Analysis (GSEA) Enrichr using gseapy for the graph partitions of the age associated features for the broad and specific cell-types

In [None]:
# shell escape: print the current date and time
!date

#### import libraries

In [None]:
from pandas import read_csv, concat, DataFrame, pivot
from gseapy.enrichr import Enrichr
from json import load as json_load
from igraph import Graph
from time import sleep
import statsmodels.stats.multitest as smm
from numpy import log10
from math import ceil
from re import match
import matplotlib.pyplot as plt
from matplotlib.pyplot import rc_context

%matplotlib inline
# for white background of figures (only for docs rendering)
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
%config InlineBackend.figure_format='retina'

#### set notebook variables

In [None]:
# parameters
# project prefix used when building the input/output file names below
project = 'aging_phase2'

# directories
# NOTE(review): absolute, site-specific path; must be edited to run elsewhere
wrk_dir = '/labshare/raph/datasets/adrd_neuro/brain_aging/phase2'
results_dir = f'{wrk_dir}/results'
figures_dir = f'{wrk_dir}/figures'

# in files
# JSON holding the partitioned factors; note it is read from figures_dir
part_file = f'{figures_dir}/{project}.association.partitioned_factors.json'

# out files
figure_file = f'{figures_dir}/{project}.features.gsea_enrichr.png'
# NOTE(review): this CSV is placed under figures_dir, not results_dir;
# confirm that is intended
results_file = f'{figures_dir}/{project}.features.gsea_enrichr.csv'

# constants and variables
# when True, cells display extra samples/counts and the Enrichr library list
DEBUG = True
# cell-type categories whose age result files are loaded
categories = ['broad', 'specific']
# won't use ATAC here only genes
# modalities = ['GEX', 'ATAC']
MODALITY = 'GEX'
# regression label that appears in the input file names
REGRESSION_TYPE = 'glm_tweedie'
# Enrichr gene-set libraries queried for every partition
marker_sets = ['MSigDB_Hallmark_2020',
               'GO_Biological_Process_2023', 
               'GO_Cellular_Component_2023',
               'GO_Molecular_Function_2023', 
               'KEGG_2019_Human']
# seconds to sleep between consecutive Enrichr queries
PAUSE_AMT = 2
# dpi used when rendering and saving the figures
dpi_value = 50

### load age associated features

In [None]:
# read the '*_fdr_filtered.age.csv' results of every category and stack them,
# tagging each row with the category it was loaded for
results = []
for category in categories:
    print(category)
    category_path = (f'{results_dir}/{project}.{MODALITY}.{category}.'
                     f'{REGRESSION_TYPE}_fdr_filtered.age.csv')
    results.append(read_csv(category_path).assign(category=category))
age_glm_df = concat(results)
print(f'shape of all age associated features {age_glm_df.shape}')
if DEBUG:
    # a random peek at the rows, then the row counts per category and tissue
    display(age_glm_df.sample(4))
    for column in ('category', 'tissue'):
        display(age_glm_df[column].value_counts())

### load the partitions

In [None]:
# parse the partition JSON file into a python object
with open(part_file, 'r') as part_handle:
    partitions = json_load(part_handle)
print(f'length of partitions is {len(partitions)}')

#### extract the partition groups and cell types

In [None]:
# key every partition as 'Aging-<partition index>' and keep the unique
# prefixes (text before the first ':') of its member entries
age_parts = {
    f'Aging-{part_index}': list({member.split(':')[0] for member in members})
    for part_index, members in partitions.items()
}
print(f'age_parts length is {len(age_parts)}')
if DEBUG:
    display(age_parts)

### resolve the cell-types to their age associated features

In [None]:
# for every partition, collect the union of the age associated features of
# all cell-types that belong to it
part_features = {}
for partition, cell_types in age_parts.items():
    print(partition, cell_types)
    # one membership test over the tissue column covers every cell-type of
    # the partition, which is the same as the union of the per-cell-type sets
    in_partition = age_glm_df.tissue.isin(cell_types)
    # always a set, even when no cell-type of the partition has any rows
    age_features = set(age_glm_df.loc[in_partition, 'feature'])
    part_features[partition] = age_features
    print(f'{partition} has {len(age_features)} features')

### run the GSEA Enrichr

#### if debugging see available GSEA libraries

In [None]:
# only when debugging: print the Enrichr gene-set library names for Human
if DEBUG:
    # NOTE: this conditional import is the only statement that binds the
    # package name `gseapy` at notebook level; it is skipped when DEBUG is False
    import gseapy
    gene_set_names = gseapy.get_library_name(organism='Human')
    print(gene_set_names)

#### utility functions for accessing and scoring GSEA Enrichr

In [None]:
def find_enrichment(name: str, genes: list, sets,
                    verbose: bool=False) -> DataFrame:
    """
    Query Enrichr through gseapy for one gene list.

    Args:
        name: label written into the added 'factor' column of the results.
        genes: gene identifiers passed to Enrichr as `gene_list`.
        sets: gene-set library name(s) passed to Enrichr as `gene_sets`.
        verbose: when True, print the shapes of the full and significant
            results and display the rows with 'Adjusted P-value' <= 0.05.

    Returns:
        The Enrichr results table with the 'factor' column added.
    """
    # imported locally so the function works no matter which earlier cells
    # did (or did not) bind the `gseapy` package name
    import gseapy

    enr_res = gseapy.enrichr(gene_list=genes,
                             organism='Human',
                             gene_sets=sets,
                             cutoff=0.5)
    # tag every row with the factor/partition the gene list came from
    enr_res.results['factor'] = name
    if verbose:
        print(f'full {sets} results shape{enr_res.results.shape}')
        sig = enr_res.results.loc[enr_res.results['Adjusted P-value'] <= 0.05]
        print(f'significant {sets} results shape{sig.shape}')
        display(sig)
    return enr_res.results

In [None]:
# query every marker set for every partition, collecting the result tables
results = []
for partition, gene_list in part_features.items():
    print(f'\n########### {partition} ###########')
    # convert once per partition instead of once per marker set
    partition_genes = list(gene_list)
    if not partition_genes:
        # nothing to test: skip rather than send Enrichr an empty gene list
        print(f'{partition} has no features, skipping')
        continue
    for gene_set in marker_sets:
        print(f'\n+++++++++++ {gene_set} +++++++++++')
        results.append(find_enrichment(partition, partition_genes, gene_set,
                                       verbose=False))
        # pause between consecutive Enrichr requests
        sleep(PAUSE_AMT)

#### convert full enrichment results into combined data frame

In [None]:
# stack the per-partition, per-library result tables into a single frame
results_df = concat(results)
print(f'full results shape {results_df.shape}')
if DEBUG:
    # random peek at the rows, then the row count per gene-set library
    display(results_df.sample(5), results_df['Gene_set'].value_counts())

### clean-up the GO term entity

In [None]:
# strip the 'GO_' prefix and the version suffixes from the library names,
# e.g. 'GO_Biological_Process_2023' -> 'Biological_Process'
for token in ('GO_', '_2020', '_2023', '_2019_Human'):
    # regex=False: the tokens are literal text, and the default value of
    # `regex` is not the same across pandas versions
    results_df['Gene_set'] = results_df['Gene_set'].str.replace(token, '',
                                                                regex=False)
# prefix every term with its cleaned library name
# NOTE: re-running this cell prefixes Term again; rebuild results_df first
results_df['Term'] = results_df['Gene_set'] + ': ' + results_df['Term']
print(f'shape of GSEA post Term naming cleanup {results_df.shape}')
if DEBUG:
    display(results_df.sample(5))

#### how many are statistically significant

In [None]:
# keep the terms whose adjusted p-value is at or below the threshold
alpha = 0.05
sig_results = results_df.loc[results_df['Adjusted P-value'] <= alpha]
print(f'{sig_results.shape[0]} terms were detected')
# show up to 20 terms with the largest Odds Ratio; head(20) simply returns
# every row when fewer than 20 exist
display(sig_results.sort_values('Odds Ratio', ascending=False).head(20))

### save the GSEA Enrichments detected

In [None]:
# write the significant enrichments to CSV (default to_csv options, so the
# frame's index is written as the first column)
sig_results.to_csv(results_file)

### reshape the dataframe from long to wide

In [None]:

# alternative value to pivot on, kept for reference:
# results_df['log10_pvalue'] = -log10(results_df['P-value'])
# and then pivot with values='log10_pvalue'

# rows whose term is significant for at least one factor
sig_term_rows = results_df.loc[results_df['Term'].isin(sig_results['Term'])]
# spread the Odds Ratio into a Term x factor table
w_df = (sig_term_rows
        .pivot(index=['Term'], columns=['factor'], values='Odds Ratio')
        .round(2)              # set precision
        .dropna(how='all'))    # drop rows that are all null
print(f'shape of wide reformated results {w_df.shape}')
if DEBUG:
    display(w_df)

### visualize the reformatted data as a heatmap

In [None]:
from seaborn import heatmap

# figure height grows with the number of terms and is never below 9
n_terms = w_df.shape[0]
height = 9 + ceil(n_terms / 5) if n_terms > 9 else 9
print(height)
with rc_context({'figure.figsize': (11, height), 'figure.dpi': dpi_value}):
    plt.style.use('seaborn-v0_8-bright')
    heatmap(w_df, cmap='Purples', linewidths=0.05, linecolor='grey')
    plt.title('GSEA Enrichr for latent age factors (Odds Ratio)')
    # save before show so the rendered figure is what lands in figure_file
    plt.savefig(figure_file, bbox_inches='tight', pad_inches=1,
                transparent=True, dpi=dpi_value)
    plt.show()

### visualize as clustered heatmap

In [None]:
from seaborn import clustermap

# fill the missing
w_df = w_df.fillna(0)

# `height` is the figure height computed in the heatmap cell
with rc_context({'figure.dpi': dpi_value}):
    plt.style.use('seaborn-v0_8-bright')
    # clustermap builds its own figure from its `figsize` argument, so the
    # size has to be passed here; a 'figure.figsize' rc setting is not used.
    # Clustering is switched off on an axis with fewer than two entries,
    # where there is nothing to cluster.
    grid = clustermap(w_df, cmap='Purples', cbar_pos=None,
                      linecolor='grey', linewidths=0.05,
                      figsize=(11, height),
                      row_cluster=w_df.shape[0] > 1,
                      col_cluster=w_df.shape[1] > 1)
    # rotate the column labels on the heatmap axes explicitly instead of
    # relying on whichever axes happens to be current
    plt.setp(grid.ax_heatmap.get_xticklabels(), rotation=90)
    # the clustered view is only shown, not saved
    plt.show()

In [None]:
# shell escape: print the current date and time
!date