**Purpose**

* Determine the cohorts and mutational signatures in which low $\Delta$ context scores are enriched relative to expectation
* Determine the gene sets for each cohort that have significant enrichment among genes appearing in high context effect variant sets

# Setup

In [1]:
import pandas as pd
import numpy as np
import glob

In [2]:
# Show every column when a DataFrame is displayed (no column truncation)
pd.options.display.max_columns = None

## Files - read

Enrichment of high context effect variants

In [3]:
# Enrichment-test table at the 5% quantile threshold; read by the enriched-scores loop
thresholds_5_brca_ucec = (
    "../../data/4_tcga_analysis/"
    "scores_enriched_on_5quantile_cohortBRCA_UCEC_TCGA_AC0bg_ConNegCP3.tsv"
)

GSEA outputs for each cohort

In [4]:
# Observed ORA results: one file per cohort label (BRCA, UCEC, BRCA+UCEC)
gsea_brca_5q_filename = (
    "../../data/4_tcga_analysis/"
    "gseapy_ora_kegg_go_cohortBRCA_AC0_5q_CG_GA.tsv"
)
gsea_ucec_5q_filename = (
    "../../data/4_tcga_analysis/"
    "gseapy_ora_kegg_go_cohortUCEC_AC0_5q_CT_GA.tsv"
)
gsea_brca_ucec_5q_filename = (
    "../../data/4_tcga_analysis/"
    "gseapy_ora_kegg_go_cohortBRCA_UCEC_AC0_5q_CG_GA_CT.tsv"
)

# Random-sampling ORA results matching each observed file above
gsea_brca_5q_sampled_filename = (
    "../../data/4_tcga_analysis/"
    "gseapy_sampled_ora_kegg_go_cohortBRCA_n100_AC0_5q_CG_GA.tsv"
)
gsea_ucec_5q_sampled_filename = (
    "../../data/4_tcga_analysis/"
    "gseapy_sampled_ora_kegg_go_cohortUCEC_n100_AC0_5q_CT_GA.tsv"
)
gsea_brca_ucec_5q_sampled_filename = (
    "../../data/4_tcga_analysis/"
    "gseapy_sampled_ora_kegg_go_cohortBRCA_UCEC_n100_AC0_5q_CG_GA_CT.tsv"
)


## Files - write

In [5]:
# Destination of the combined per-cohort ORA summary table written at the end of the notebook
gsea_summary_filename = (
    "../../data/4_tcga_analysis/"
    "gseapy_ora_summary_kegg_go_cohortBRCA_UCEC_n100_AC0_5q_CT_CG_GA.tsv"
)

# Enriched scores in TCGA cohorts

Loop through threshold files and print enriched results

In [9]:
# Quantile labels and the threshold file that belongs to each (paired by position)
quantiles = [5]
qfiles = [thresholds_5_brca_ucec]

# Significance cutoff: 0.05 divided by 9*3
# NOTE(review): presumably a multiple-testing correction over 9 contexts x 3 cohort groups -- confirm
pvalue_cutoff = 0.05/(9*3)

# Test-statistic direction: keep rows where the observed count exceeds the expected count
direction_filter = "cohort_number_in > cohort_expected_in"

# The query text does not change between files, so assemble it once
enriched_query = "test_pv < @pvalue_cutoff &" + direction_filter

for q, qfile in zip(quantiles, qfiles):
    tcga_quantile_df = pd.read_csv(qfile, sep="\t")
    print("Quantile:", q)
    print(tcga_quantile_df.query(enriched_query))
    

Quantile: 5
          cohort       group context  threshold_quantile  threshold_value  \
3           BRCA  SNVContext     C>G                0.05        -0.699761   
7           BRCA  SNVContext     G>A                0.05        -1.709938   
13          UCEC  SNVContext     C>T                0.05        -1.572423   
16          UCEC  SNVContext     G>A                0.05        -1.709938   
21  grouped_TCGA  SNVContext     C>G                0.05        -0.699761   
22  grouped_TCGA  SNVContext     C>T                0.05        -1.572423   
25  grouped_TCGA  SNVContext     G>A                0.05        -1.709938   

   comparison  cohort_number_in  cohort_number_out  cohort_expected_in  \
3       below               134                911                  53   
7       below               197               2556                 138   
13      below               958              11759                 652   
16      below               643              10396                 555   
2

# Pathway enrichment

* Load the ORA results per cohort, per threshold
* Load the corresponding random-sampling results (filtered to the adjusted p-value threshold)
* Mark observed terms that are also significant in the random sampling
* Combine into a single data frame and save

In [10]:
# Adjusted p-value cutoff applied to both the observed and the random-sampling ORA tables
adj_pv_threshold: float = 0.05

In [11]:
# Cohort label -> observed ORA result file
gsea_file_dict = {
    "BRCA_5q": gsea_brca_5q_filename,
    "UCEC_5q": gsea_ucec_5q_filename,
    "BRCA_UCEC_5q": gsea_brca_ucec_5q_filename,
}

In [12]:
# Cohort label -> random-sampling ORA result file (same keys as gsea_file_dict)
gsea_sampled_file_dict = {
    "BRCA_5q": gsea_brca_5q_sampled_filename,
    "UCEC_5q": gsea_ucec_5q_sampled_filename,
    "BRCA_UCEC_5q": gsea_brca_ucec_5q_sampled_filename,
}

In [13]:
# Collect one annotated ORA table per cohort label; they are concatenated afterwards
gsea_dfs = []

for label, gs_filename in gsea_file_dict.items():

    print(label)
    gsea_df = pd.read_csv(gs_filename, sep="\t")
    gsea_sampled_df = pd.read_csv(gsea_sampled_file_dict[label], sep="\t")

    # Observed terms passing the adjusted p-value threshold
    gsea_filter_df = gsea_df.query("`Adjusted P-value` < @adj_pv_threshold")
    print(gsea_filter_df.shape)

    # Random-sampling rows passing the same threshold, counted per term
    gsea_sampled_filter = gsea_sampled_df.query("`Adjusted P-value` < @adj_pv_threshold")
    print(gsea_sampled_filter.shape)
    sampling_count = gsea_sampled_filter.value_counts("Term")
    sampling_terms = sampling_count.index.to_list()

    # Observed significant terms that are never significant in the random sampling
    gsea_notSampled = gsea_filter_df[~gsea_filter_df["Term"].isin(sampling_terms)]
    print(gsea_notSampled.shape)
    print(gsea_notSampled.sort_values("Adjusted P-value")[["Term","Overlap",
                                                           "Adjusted P-value","Odds Ratio"]])

    # Annotate the full observed table with its cohort label and, for each
    # significant observed term, the number of significant random-sampling rows
    # with the same term.
    # - Only sampling rows below the threshold are counted, and only observed
    #   rows below the threshold are marked; every other row keeps 0.
    gsea_df["cohort"] = label
    is_significant = gsea_df["Adjusted P-value"] < adj_pv_threshold
    # map() looks each Term up in the per-term counts; unmatched terms become NaN
    sampled_hits = gsea_df["Term"].map(sampling_count)
    gsea_df["number_sampled_in_random"] = (
        sampled_hits.where(is_significant, 0)  # non-significant rows are not marked
                    .fillna(0)                 # significant rows with no sampling match
                    .astype("int64")
    )
    gsea_dfs.append(gsea_df)

BRCA_5q
(2, 10)
(9, 8)
(2, 10)
                                                  Term Overlap  \
224  Somatic Recombination Of Immunoglobulin Genes ...     4/9   
225                     Isotype Switching (GO:0045190)    4/11   

     Adjusted P-value  Odds Ratio  
224          0.014072   49.491824  
225          0.017963   35.347709  
UCEC_5q
(2, 10)
(191, 8)
(2, 10)
                                 Term Overlap  Adjusted P-value  Odds Ratio
0    Herpes simplex virus 1 infection  75/498      2.780892e-09    2.627076
296  DNA Damage Response (GO:0006974)  49/384      2.022286e-02    2.132688
BRCA_UCEC_5q
(14, 10)
(345, 8)
(13, 10)
                                                  Term Overlap  \
0                     Herpes simplex virus 1 infection  89/498   
1                             Homologous recombination   12/41   
301             Nuclear Pore Organization (GO:0006999)    8/14   
302                   DNA Damage Response (GO:0006974)  57/384   
303     Protein Alpha-1,2-Deman

Combine data frames

In [14]:
# Stack the per-cohort tables row-wise and renumber the rows from 0
gsea_summary_df = pd.concat(gsea_dfs, ignore_index=True)
gsea_summary_df.head()

Unnamed: 0,Gene_set,Term,Overlap,P-value,Adjusted P-value,Old P-value,Old Adjusted P-value,Odds Ratio,Combined Score,Genes,cohort,number_sampled_in_random
0,KEGG_2021_Human,Protein processing in endoplasmic reticulum,8/171,0.006622,0.958501,0,0,3.050291,15.304373,SEC61A1;SEC24A;MAN1A2;UBE4B;DNAJC5G;STT3B;UGGT...,BRCA_5q,0
1,KEGG_2021_Human,Non-homologous end-joining,2/13,0.017927,0.958501,0,0,11.174432,44.937142,RAD50;LIG4,BRCA_5q,0
2,KEGG_2021_Human,Ubiquitin mediated proteolysis,6/140,0.025962,0.958501,0,0,2.769318,10.111073,HERC2;UBR5;UBE4B;BRCA1;BTRC;CBL,BRCA_5q,0
3,KEGG_2021_Human,Homologous recombination,3/41,0.028089,0.958501,0,0,4.860584,17.363799,RAD50;RAD51C;BRCA1,BRCA_5q,0
4,KEGG_2021_Human,PPAR signaling pathway,4/74,0.031491,0.958501,0,0,3.52345,12.184308,HMGCS1;ACOX1;EHHADH;PLIN2,BRCA_5q,0


In [16]:
# The eight rows with the smallest adjusted p-values across all cohort labels
gsea_summary_df.sort_values("Adjusted P-value").head(8)

Unnamed: 0,Gene_set,Term,Overlap,P-value,Adjusted P-value,Old P-value,Old Adjusted P-value,Odds Ratio,Combined Score,Genes,cohort,number_sampled_in_random
5956,KEGG_2021_Human,Herpes simplex virus 1 infection,89/498,5.323906e-12,1.602496e-09,0,0,2.456442,63.766331,ZNF254;ZNF573;IFNA5;ZNF253;ZNF492;ITGB3;TBK1;Z...,BRCA_UCEC_5q,0
2029,KEGG_2021_Human,Herpes simplex virus 1 infection,75/498,9.394906e-12,2.780892e-09,0,0,2.627076,66.703695,ZNF254;ZNF573;ZNF492;ITGB3;TBK1;ZNF729;ZNF529;...,UCEC_5q,0
5957,KEGG_2021_Human,Homologous recombination,12/41,9.141805e-05,0.01375842,0,0,4.549486,42.310528,POLD3;TOP3B;BRIP1;RAD50;RAD51C;XRCC2;RPA2;ATM;...,BRCA_UCEC_5q,0
224,GO_Biological_Process_2023,Somatic Recombination Of Immunoglobulin Genes ...,4/9,7.796097e-06,0.01407195,0,0,49.491824,582.117258,MSH6;SANBR;RNF8;LIG4,BRCA_5q,0
225,GO_Biological_Process_2023,Isotype Switching (GO:0045190),4/11,1.990371e-05,0.0179631,0,0,35.347709,382.624963,MSH6;SANBR;RNF8;LIG4,BRCA_5q,0
6257,GO_Biological_Process_2023,Nuclear Pore Organization (GO:0006999),8/14,4.534526e-06,0.01810183,0,0,14.642686,180.16053,NDC1;NUP107;SEH1L;NUP133;TPR;NUP153;NUP98;RTN4,BRCA_UCEC_5q,0
2325,GO_Biological_Process_2023,DNA Damage Response (GO:0006974),49/384,5.569501e-06,0.02022286,0,0,2.132688,25.801692,TOP2A;WDR48;BRCA1;FMN2;INSL6;NIPBL;KMT5B;MACRO...,UCEC_5q,0
6259,GO_Biological_Process_2023,"Protein Alpha-1,2-Demannosylation (GO:0036508)",8/16,1.667552e-05,0.02218956,0,0,10.980815,120.806196,EDEM3;RNF139;MAN1A2;EDEM2;MAN1C1;UGGT2;UGGT1;M...,BRCA_UCEC_5q,0


Save combined tables:

In [17]:
# Write the combined table as tab-separated text, without the row index
gsea_summary_df.to_csv(gsea_summary_filename, sep="\t", index=False)