Notebook to send jobs to the Ubelix HPC cluster at the University of Bern

In [None]:
import sys
import os
import logging
import glob
import pickle
import json
from pathlib import Path
import rpy2.robjects as ro
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

%load_ext rpy2.ipython

# Write log records of level INFO and above to example.log.
logging.basicConfig(filename='example.log', 
                    encoding='utf-8', level=logging.INFO)
logger = logging.getLogger()
logger.setLevel(logging.INFO)
# Echo log records to stdout as well.
# NOTE: every re-run of this cell attaches one more StreamHandler to the root
# logger, so messages get printed once per execution of the cell.
logger.addHandler(logging.StreamHandler(stream=sys.stdout))

# Root folder of all data sets (relative to the notebook's working directory);
# the commented-out line is an alternative absolute location.
datapath = Path("../data")
#datapath = Path("/storage/homefs/pd21v747/datanew")

# Make the project modules in ../scripts importable from this notebook.
modpath = Path("../scripts")
sys.path.append(os.path.relpath(modpath))

from misc import Timer, pickler, open_table

In [None]:
# Tissue site -> data set code
sites = {
    "liver": "LIHC",
    "thyroid": "THCA",
    "lung": "LUAD",
    "kidney": "KIRC",
    "colorectal": "COAD",
    "breast": "BRCA",
    "prostate": "PRAD",
}

# One (initially empty) metadata record per data set code
datasets = {code: {} for code in sites.values()}

for s, code in sites.items():
    # Count table of this data set: <datapath>/<site>/<code>/<code>.csv
    f = Path(f"{datapath}/{s}/{code}/{code}.csv")
    df = pd.read_csv(f, index_col=0)
    n_genes = len(df)
    # Half the number of columns is stored as the number of patients
    n_patients = len(df.columns)//2
    datasets[code].update(genes=n_genes, site=s, datapath=f,
                          outpath=f.parent, patients=n_patients)
    print(f"{s:<10}", n_genes, n_patients)

# Pretty names used when labelling outlier-removal methods and DEA methods
cleanout = {
    "jk": "ReBoost",
    "pcah": "rPCA",
    "none": "None",
}
cleandea = {
    "edger": "edgeR QLF",
    "edgerlrt": "edgeR LRT",
    "deseq2": "DESeq2",
}

# Differential expression analysis

## Define ground truth

Define ground truth DEGs for a given FDR, logFC cutoff as the intersection of DEGs from all three DEA tests (Wald, LRT, QLF)

In [None]:
from DEA import run_dea_on_full_data
from process import find_ground_truth

# Methods and thresholds handed to the ground-truth helpers below.
DEAs = ["edgerlrt", "edger", "deseq2"]
FDRs = [0.1,0.05,0.01,0.001]
logFCs = [0, 1] # formal lfc threshold in edger or deseq2
logFCs_post = [0,0.5,1,1.5,2] # post hoc thresholds

# Called with overwrite=False and the formal thresholds above.
run_dea_on_full_data(datasets, DEAs, overwrite = False, lfcs = logFCs)

# Two passes: lfc_test=0 combined with all post hoc thresholds, then lfc_test=1
# combined with the post hoc threshold 1 only. `datasets` is rebound to the
# value returned by each call.
datasets = find_ground_truth(datasets, DEAs, FDRs, logFCs_post, lfc_test = 0)
datasets = find_ground_truth(datasets, DEAs, FDRs, [1], lfc_test = 1)

## Send batch jobs for selected data set

In [None]:
# Data set whose jobs are sent/processed by the following cells.
selected_data = "LIHC"
outpath = datasets[selected_data]["outpath"]
# The last path component of the output folder doubles as the output name.
outname = outpath.name
# Echo both values as the cell output.
outpath, outname

In [None]:
from ubelix import run_multi_batch

script_path = Path("../scripts/send_batch.sh")
DEA_methods = ["edgerqlf"]#,"edgerlrt", "deseq2"] # finish edgerqlf jobs before sending other jobs
outlier_methods = ["none"]#, "pcah", "jk"] # only use none for p2 and p3
all_N = [3,7,15]#,4,5,6,7,8,9,10,12,15]
n_cohorts = 100

assert outname in str(outpath)


def _outlier_kwargs(detailed):
    """Return the per-method outlier-removal options of a parameter set.

    With ``detailed=False`` every method maps to an empty option dict (the
    form used by parameter set p3); otherwise the jk and pcah options below
    are filled in (the form used by p1 and p2).
    """
    if not detailed:
        return {"none": {}, "jk": {}, "pcah": {}}
    return {
        "none": {},
        "jk": {
            "FDR": 0.01,
            "overwrite": False, # overwrite existing jk tab
            "max_removed_frac": 0.5, # fraction of patients; after 1st iteration, don't jackknife bottom frac patients
            "efficient": True,
            "cols_to_keep": ["FDR"],
            "cleanup": True # remove individual jk tabs and iterations after merger
        },
        "pcah": {"k": 2}
    }


def _dea_kwargs(design, lfc=None):
    """Return the per-method DEA options for one design.

    ``lfc`` is only added to the options when it is not None. Keys are
    inserted in the same order as in the hand-written dictionaries this
    helper replaces ("lfc" right before "design").
    """
    per_method = {
        "edgerqlf": {"filter_expr": False, "cols_to_keep": ["logFC","logCPM","FDR"]},
        "edgerlrt": {"filter_expr": False, "cols_to_keep": ["logFC","logCPM","FDR"], "test":"lrt"},
        "deseq2": {"cols_to_keep": ["logFC","logCPM","FDR"]}
    }
    for options in per_method.values():
        if lfc is not None:
            options["lfc"] = lfc
        options["design"] = design
    return per_method


def _make_config(param_set, design, lfc=None, detailed_outlier_kwargs=True):
    """Assemble the configuration dictionary of one parameter set.

    Reads ``outpath``, ``outname``, ``DEA_methods`` and ``outlier_methods``
    from the notebook namespace at call time; the two method lists are stored
    by reference, exactly as in a dict literal.
    """
    return {
        "param_set": param_set, # id for this set of parameters
        "overwrite": False, # overwrite existing tabs
        "data": str(outpath) + "/" + outname + ".csv",
        "outpath": str(outpath),
        "outname": outname,
        "DEA_methods": DEA_methods,
        "outlier_methods": outlier_methods,
        "outlier_kwargs": _outlier_kwargs(detailed_outlier_kwargs),
        "DEA_kwargs": _dea_kwargs(design, lfc)
    }


# no formal lfc threshold, paired design
config_params_1 = _make_config("p1", "paired")

# lfc = 1 threshold
config_params_2 = _make_config("p2", "paired", lfc=1)

# lfc = 1 threshold, unpaired
config_params_3 = _make_config("p3", "unpaired", lfc=1, detailed_outlier_kwargs=False)

import subprocess as sp

# Ask the scheduler for this user's queue and look for jobs of the batch script.
# A membership test is used because `str.find(...) > 0` would miss a match
# located at the very start of the output (index 0).
output = sp.getoutput('squeue -u pd21v747')
jobs_running = "send_bat" in output

# mode = "send jobs" # does not work in newer version of slurm
# https://harvardmed.atlassian.net/wiki/spaces/O2/pages/1586793613/Troubleshooting+Slurm+Jobs#Jobs-fail-with-the-message%3A-Unable-to-satisfy-CPU-bind-request
# workaround: use mode=just testing, copy the command and send job from terminal in submit node; wd must be notebooks folder, must load environment beforehand

#mode = "test main terminal"
mode = "just testing"

# Set do_nothing to True to skip the submission step entirely.
do_nothing = False
# Parameter set handed to run_multi_batch.
config_params = config_params_3

# Only submit when no batch job of this kind is already listed in the queue.
if not jobs_running and not do_nothing:
    run_multi_batch(config_params, all_N, n_cohorts, script_path, mode = mode)
elif jobs_running:
    print("Jobs running")

In [None]:
!squeue -u pd21v747

## Process jobs

In [None]:
# Settings for parameter set "p1".
# NOTE: all seven assignments of this first group are overwritten by the
# uncommented "p3" group further down; only the last group is in effect after
# the cell has run. Comment/uncomment the groups to switch parameter sets.
DEAs = ["edgerqlf"]#, "edgerlrt", "deseq2"]
outlier_methods = ["none", "pcah", "jk"]
FDRs = [0.1,0.05,0.01,0.001]
logFCs = [0, 0.5, 1, 1.5, 2]
all_N = [3]#,4,5,6,7,8,9,10,12,15]
lfc_test = 0
param_set = "p1"

# Settings for parameter set "p2" (disabled)
# DEAs = ["edgerqlf"]#"deseq2", "edgerlrt"]
# outlier_methods = ["none"]#, "pcah", "jk"]
# FDRs = [0.1,0.05,0.01,0.001]
# logFCs = [1]
# all_N = [3]#,4,5,6,7,8,9,10,12,15]
# lfc_test = 1
# param_set = "p2"

# Settings for parameter set "p3" (the group that is in effect)
DEAs = ["edgerqlf"]#, "edgerlrt", "deseq2"]
outlier_methods = ["none"]
FDRs = [0.05] #[0.1,0.05,0.01,0.001]
logFCs = [1]
all_N = [3,7,15]
lfc_test = 1
param_set = "p3"

# Parameter sets whose results are merged in the "Merge processed data sets" cell.
param_sets = ["p1","p2","p3"]

In [None]:
from process import process_pipeline
from misc import profile_func
import pstats

# Arguments for process_pipeline, taken from the selection and parameter cells above.
kwargs = {"outpath":outpath, "outname":outname, "all_N": all_N, "DEAs":DEAs, "outlier_methods": outlier_methods, 
          "FDRs":FDRs, "logFCs":logFCs, "lfc_test": lfc_test, "param_set":param_set, "overwrite": 1, "overwrite_merged": 1, "n_cohorts": 10}

# Set to False to skip processing when re-running the notebook.
do_process = True
if do_process:
    # presumably profile_func runs process_pipeline(**kwargs) under a profiler
    # and returns something pstats.Stats accepts - confirm in misc.py
    prof = profile_func(process_pipeline, kwargs)
    # Report the 50 entries with the largest cumulative time.
    stats = pstats.Stats(prof).strip_dirs().sort_stats("cumtime")
    stats.print_stats(50)

In [None]:
from misc import open_table
# Open one merged table for a quick look; the file name pins cohort size N3,
# the logFC table, outlier method "none", edgerqlf and parameter set p2.
site = "liver"
f=f"../data/{site}/{sites[site]}/{sites[site]}_N3/all.logFC.none.edgerqlf.p2.feather"
tab = open_table(f)
# Show the table as the cell output.
tab

In [None]:
# ## Multi process
# import process, sys, importlib
# importlib.reload(sys.modules["process"])

# from process import process_pipeline

# for data in datasets:
#     #if data == "PRAD": continue
#     print(data)
    
#     outpath = datasets[data]["outpath"]
#     outname = outpath.split("/")[-1]
#     kwargs = {"outpath":outpath, "outname":outname, "all_N": all_N, "DEAs":DEAs, "outlier_methods": outlier_methods, 
#               "FDRs":FDRs, "logFCs":logFCs, "lfc_test": lfc_test, "param_set":param_set, "overwrite": 0, "overwrite_merged": 0, "n_cohorts": 100}
#     process_pipeline(**kwargs)

### Merge processed data sets

In [None]:
def load_results(path, site, name, param_set):
    """Unpickle and return ``<path>/<site>/<name>/results.<param_set>.txt``."""
    folder = f"{path}/{site}/{name}"
    with open(f"{folder}/results.{param_set}.txt", mode="rb") as handle:
        payload = pickle.load(handle)
    return payload
    
# Copy the stored results of every parameter set into the dataset registry.
for s, code in sites.items():

    #if s not in ["colorectal","prostate","breast","thyroid","liver"]: continue

    for ps in param_sets:
        res = load_results(datapath, s, code, ps)
        # Create the per-parameter-set record on first use, then merge into it
        datasets[code].setdefault(ps, {}).update(res)

# don't forget to load ground truth
first_dataset = list(datasets)[0]
assert "truth_stats" in datasets[first_dataset]

# Persist the merged registry
datasetsfile = Path(datapath, "multi/datasets.txt")
pickler(datasets, datasetsfile)

In [None]:
# Create tidy dataframe

def tidy_df(param_set,logFCs):
    """Collect the summary statistics of one parameter set in a tidy table.

    Reads the notebook-level ``datasets``, ``all_N``, ``outlier_methods``,
    ``DEAs`` and ``FDRs`` and builds one row per combination of data set,
    cohort size, outlier method, DEA method, FDR and logFC, with one column
    per statistic. The table is written to
    ``<datapath>/multi/combined_td.<param_set>.csv`` and returned.

    Parameters
    ----------
    param_set : key of the parameter set inside ``datasets[<data>]``
    logFCs : logFC thresholds to include

    Raises
    ------
    KeyError
        If a statistic is missing for a data set whose name does not contain
        "syn_hom". The message names the failing combination and the original
        lookup error is chained as the cause.
    """
    import itertools  # function-scope import keeps this cell self-contained

    quantity = ["median_rep", "median_deg", "median_rep_adj", "median_deg_adj",
                "median_mcc", "median_prec", "median_rec", "median_mcc_adj", "median_prec_adj", "median_rec_adj"]
    iterables = [datasets,all_N,outlier_methods,DEAs,FDRs,logFCs,quantity]
    multi_cols = pd.MultiIndex.from_product(iterables, names=["Data", "N", "Out", "DEA", "FDR", "logFC", "Val"])
    combined = pd.DataFrame(columns=multi_cols)

    # id connecting samples from same dataset and cohort size, used for
    # paired-sample testing pre vs. post outlier removal.
    # NOTE(review): ids are collected in loop order and attached positionally
    # to the unstacked table below - confirm both orders line up.
    ids = []
    combos = itertools.product(enumerate(datasets), enumerate(all_N),
                               outlier_methods, DEAs, FDRs, logFCs)
    for (i, d), (j, N), out, dea, fdr, logFC in combos:
        ids.append(i*len(all_N)+j)
        for quant in quantity:
            # Keep the try body to the lookup so unrelated errors are not masked.
            try:
                value = datasets[d][param_set][N][out][dea][quant][fdr][logFC]
            except KeyError as err:
                # Missing entries are tolerated for "syn_hom" data sets.
                # NOTE(review): the outlier methods used in this notebook are
                # lower-case ("none"), so `out != "None"` is always true here -
                # confirm whether "none" was intended.
                if "syn_hom" in d and out != "None":
                    continue
                raise KeyError(
                    f"missing {quant!r} for data={d!r}, param_set={param_set!r}, N={N!r}, "
                    f"out={out!r}, dea={dea!r}, fdr={fdr!r}, logFC={logFC!r}"
                ) from err
            combined.loc[0, (d,N,out,dea,fdr,logFC,quant)] = value

    combined_td = combined.unstack().unstack(level="Val").reset_index(level=["Data","N","Out","DEA","FDR","logFC"], drop=False)
    combined_td["id"] = ids
    combined_td.reset_index(drop=True, inplace=True)

    # For outlier_method == "none", no adjustment is necessary, hence copy unadjusted values
    none_rows = combined_td["Out"] == "none"
    for metric in ("deg", "rep", "mcc", "prec", "rec"):
        combined_td.loc[none_rows, f"median_{metric}_adj"] = combined_td.loc[none_rows, f"median_{metric}"]

    # Replace internal method ids by their pretty names
    for raw, pretty in cleanout.items():
        combined_td.loc[combined_td["Out"] == raw, "Out"] = pretty
    for raw, pretty in cleandea.items():
        combined_td.loc[combined_td["DEA"] == raw, "DEA"] = pretty
    combined_td.to_csv(Path(datapath, f"multi/combined_td.{param_set}.csv"))
    return combined_td

In [None]:
# outlier_methods = ["none","jk","pcah"]
# combined_td = tidy_df("p1",[0,1])
# outlier_methods = ["none"]
# combined_td2 = tidy_df("p2",[1])
# combined_td.head()

In [None]:
# Tidy table for parameter set "p3", restricted to the logFC threshold 1;
# shown as the cell output.
combined_td3 = tidy_df("p3",[1])
combined_td3

## Inspect results

In [None]:
from misc import get_grid_size

# Columns plotted on the x and y axis of every panel.
x = "median_deg"
y = "median_rep"

# Keep the rows with logFC == 1 and DEA == "edgerqlf".
a  = combined_td3[combined_td3["logFC"]==1]
a = a[a["DEA"]=="edgerqlf"]
# One scatter panel per column used for colouring.
hues = ["Data","logFC","DEA","Out","N","FDR"]
grid = get_grid_size(len(hues), k=0, fill=True)
# NOTE(review): grid[0] is passed as the number of rows but scales the figure
# width (and grid[1] the height) - confirm this is intended.
fig, ax = plt.subplots(grid[0],grid[1],figsize=(6*grid[0],3*grid[1]), sharex=True,sharey=True)
ax=ax.flatten()
# NOTE: `cube` is not used anywhere else in this cell.
cube = sns.cubehelix_palette(as_cmap=False, n_colors=5)

for i, hue in enumerate(hues):
    sns.scatterplot(data=a, x=x, y=y, hue=hue, ax=ax[i])
# NOTE(review): the panels share their y axis, so inverting each one in turn
# may toggle the shared direction repeatedly - verify the final orientation.
for axx in ax: axx.invert_yaxis()
fig.tight_layout()

# Enrichment analysis

## Ground truth definition

Run GSEA on ground truth logFC estimates

In [None]:
from enrichment import prepare_gsea, run_gseapy_libraries, convert_ids_biomart, find_conv_table_all, clean_tab

# Gene set libraries used for the enrichment analyses; the commented entries are disabled.
libraries = [
  "GO_Biological_Process_2021",
  "KEGG_2021_Human"
  # "MSigDB_Oncogenic_Signatures",
  # "Cancer_Cell_Line_Encyclopedia",
  # "GO_Molecular_Function_2021",
  # "GeneSigDB",
  # "GO_Cellular_Component_2021",
  # "MSigDB_Hallmark_2020",
  # "MSigDB_Computational",
  # "WikiPathway_2021_Human"
]

# Reload the dataset registry pickled by the "Merge processed data sets" cell.
datasetsfile = Path(datapath, "multi/datasets.txt")
with open(datasetsfile, "rb") as f:
    datasets = pickle.load(f)
    
# The KIRC_syn_hom entry is dropped for the enrichment analysis.
if "KIRC_syn_hom" in datasets:
    del datasets["KIRC_syn_hom"]

# Create table for converting ENSG to Gene Symbols and Entrez IDs
conv_file = Path(datapath, "multi/conv_table.csv")
find_conv_table_all(datasets, conv_file, overwrite=False)
conv_table = pd.read_csv(conv_file, index_col=0)
# Show the first rows as the cell output.
conv_table.head()

### GSEApy

In [None]:
from enrichment import prepare_gsea, run_gseapy_libraries, convert_ids_biomart, find_conv_table_all, clean_tab
from process import signal_to_noise
from DEA import normalize_counts
            
# When False the loop below only updates outpath/gseapath and runs nothing.
overwrite_all_gsea = True

for k, data in enumerate(datasets):
    
    # Set before the guard, so outpath/gseapath always refer to the last data
    # set once the loop has finished (a later cell reads gseapath).
    outpath = datasets[data]["outpath"]
    gseapath = f"{outpath}/gsea"   
    
    if not overwrite_all_gsea: continue

    #os.system(f"mkdir {gseapath}")
    # Table read from truth_lfc.csv in the data set's output folder.
    tab = pd.read_csv(f"{outpath}/truth_lfc.csv", index_col=0)

    # Add a "|S2N|" column for the genes present in both the counts and the table.
    counts = pd.read_csv(datasets[data]["datapath"], index_col=0)
    counts = normalize_counts(counts)
    tab.loc[counts.index.intersection(tab.index), "|S2N|"] = signal_to_noise(counts)

    # Keep only genes listed in the conversion table and attach their symbol.
    tab_cleaned = tab.loc[tab.index.intersection(conv_table.index)]
    tab_cleaned["Symbol"] = conv_table.loc[tab_cleaned.index,"Symbol"]
    logging.info(f"{data}\nOriginal tab: {len(tab)} genes\nCleaned tab: {len(tab_cleaned)} genes\n")
    
    # Run the libraries twice, once per ranking column.
    with Timer(name="context manager"):
        run_gseapy_libraries(tab_cleaned, gseapath, libraries, overwrite_all_gsea, permutation_num=1000, save_full_results=True,
                        ranking="logFC", min_size=15, max_size=500)
        run_gseapy_libraries(tab_cleaned, gseapath, libraries, overwrite_all_gsea, permutation_num=1000, save_full_results=True,
                            ranking="|S2N|", min_size=15, max_size=500)

In [None]:
# Inspect results

# Pickled result for the second library in `libraries`. `gseapath` is whatever
# the most recently executed cell left behind.
gsea_out = f"{gseapath}/gseapy.{libraries[1]}.txt"
with open(gsea_out, "rb") as fp:
    gsea_results = pickle.load(fp)
    
print(gsea_out)

In [None]:
import gseapy

# Gene sets ordered by FDR q-value, most significant first.
terms = gsea_results.res2d.sort_values(by="FDR q-val")
display(gsea_results.res2d.head(10))
# Positional access: `terms["Term"][0]` would look up the row LABELLED 0, i.e.
# the first row of the unsorted table, not the first row after sorting.
top_term = terms["Term"].iloc[0]
print(f"Total gene sets tested: {len(terms)}")
print(f"Significant gene sets at 5% FDR: {len(terms[terms['FDR q-val']<0.05])}\n")
# Enrichment plot of the top-ranked term.
gseapy.gseaplot(rank_metric=gsea_results.ranking, term=top_term, **gsea_results.results[top_term])

In [None]:
# Make results dictionary

FDR = 0.05
rankings = ["logFC", "|S2N|"]

# data set -> "truth" -> "gseapy" -> ranking -> library -> table of terms
gsea_dict = {
    data: {
        "truth": {
            "gseapy": {
                rnk: {lib: {} for lib in libraries}
                for rnk in rankings
            }
        }
    }
    for data in datasets
}

for data in datasets:

    outpath = datasets[data]["outpath"]
    gseapath = f"{outpath}/gsea"

    for rnk in rankings:

        # Report files of the |S2N| ranking carry an "_s2n" suffix
        if rnk == "logFC":
            suffix = ""
        else:
            suffix = "_s2n"

        for library in libraries:

            reportfile = f"{gseapath}/gseapy{suffix}.{library}.feather"
            terms = open_table(reportfile).sort_values(by="FDR")
            gsea_dict[data]["truth"]["gseapy"][rnk][library] = terms
            significant = terms[terms['FDR']<FDR]
            print(data, library, rnk, len(terms), len(significant))

### clusterProfiler ORA

In [None]:
from enrichment import run_clusterORA, convert_ensg

# Passed to run_clusterORA as its `overwrite` argument.
overwrite_clusterORA = False
FDRs = [0.05]
logFCs = [0,1] # formal lfc threshold

# Extra keyword arguments for run_clusterORA. NOTE: "FDRs"/"logFCs" repeat the
# values of the two lists above; keep them in sync when editing.
ORA_kwargs = {"FDRs": [0.05], "logFCs": [0,1], "minGSSize": 15, "maxGSSize": 500,
               "use_internal_data": True, "internal_data_path": "../data/clusterORA/KEGG_DATA.RData"}

for k, data in enumerate(datasets):
    print(data)
    
    outpath = datasets[data]["outpath"]
    gseapath = f"{outpath}/gsea"
    #os.system(f"mkdir {gseapath}")
    # Universe: the row labels of the data set's table (only the first column
    # of the CSV is read), converted with target="Entrez".
    universe = pd.read_csv(f"{outpath}/{data}.csv", usecols=["Unnamed: 0"], index_col=0)
    universe = list(convert_ensg(universe,conv_table,target="Entrez").index)

    for fdr in FDRs:
        for logFC in logFCs:
            # Ground-truth gene list for this FDR/logFC, converted the same way.
            degs = pd.read_csv(f"{outpath}/truth.fdr{fdr}.post_lfc{logFC}.lfc{logFC}.csv", usecols=["Unnamed: 0"], index_col=0)
            degs = list(convert_ensg(degs,conv_table,target="Entrez").index)
    
            # Runs with a formal logFC threshold get the "_lfc" tag in their prefix.
            s = "_lfc" if logFC > 0 else ""
            prefix = f"{gseapath}/clusterORA{s}.fdr{fdr}.post_lfc{logFC}.lfc{logFC}"
            run_clusterORA(degs, universe, file_id="", go_ont="BP", prefix=prefix, overwrite=overwrite_clusterORA, **ORA_kwargs)

In [None]:
# Add the clusterORA ground-truth tables to gsea_dict, one entry per method
# name (which encodes the FDR and logFC thresholds) and library.
for data in datasets:
    
    outpath = datasets[data]["outpath"]
    gseapath = f"{outpath}/gsea"
        
    for fdr in FDRs:
        for logFC in logFCs:
            
            s = "_lfc" if logFC > 0 else ""
            method = f"clusterORA{s}.fdr{fdr}.post_lfc{logFC}.lfc{logFC}"
            gsea_dict[data]["truth"][method] = {lib: None for lib in libraries}
            
            for library in libraries:
                #if "KEGG" not in library: continue
                    
                reportfile = f"{gseapath}/{method}.{library}.feather"         
                terms = open_table(reportfile).sort_values(by="FDR")
                
                ## for KEGG: switch term ID and description for index
                # NOTE: terms.index[0] raises IndexError for an empty table.
                if terms.index[0].startswith("hsa") and "Term" in terms:
                    terms["Term ID"] = terms.index
                    terms = terms.set_index("Term")
                
                gsea_dict[data]["truth"][method][library] = terms
                # Number of terms in total and below the FDR cutoff.
                print(data, method, library, len(terms), len(terms[terms['FDR']<FDR]))

In [None]:
# For KEGG ORA, set term name as index instead of term ID.
# The feather files are rewritten in place; files that already have a
# "Term ID" column or a "Term" index are left untouched.

for site in sites:
    for fdr in FDRs:
        for lfc in logFCs:
            s = "_lfc" if lfc > 0 else ""
            method = f"clusterORA{s}.fdr{fdr}.post_lfc{lfc}.lfc{lfc}"
            f = f"../data/{site}/{sites[site]}/gsea/{method}.KEGG_2021_Human.feather"
            t=open_table(f)
            if "Term ID" in t or t.index.name == "Term": continue
            # Keep the old index as a column, then index by term name.
            t["Term ID"] = t.index
            t.set_index("Term",inplace=True)
            # The index is turned back into a regular column before writing.
            t = t.reset_index()
            t.to_feather(f)
            display(t)

### Common terms

We will define an additional, common ground truth, restricted to terms that are shared between GSEApy, clusterProfiler ORA and all data sets.

In [None]:
def _intersect_terms(common, gseapy_terms, ora_terms):
    """Intersect the running set of common terms with one GSEApy/ORA pair.

    ``common`` is None until the first pair has been seen; an empty result is
    therefore kept as "no common terms" instead of being mistaken for "not
    started yet".
    """
    shared = gseapy_terms.intersection(ora_terms)
    if common is None:
        return shared
    return common.intersection(shared)


def get_common_terms(results=None):
    """Return the KEGG and GO BP terms shared by all data sets and methods.

    For every data set, the index of the GSEApy ("logFC" ranking) table is
    intersected with the index of the clusterORA tables for lfc 0 and 1
    (FDR 0.05), and the result is intersected across data sets.

    Parameters
    ----------
    results : dict, optional
        Nested results dictionary laid out like the notebook-level
        ``gsea_dict``, which is used when this is omitted.

    Returns
    -------
    (common_kegg, common_gobp)
        Two collections of terms (the result of the index intersections), or
        two empty sets when ``results`` contains no data set.
    """
    if results is None:
        results = gsea_dict

    common_kegg, common_gobp = None, None
    for data in results:
        truth = results[data]["truth"]
        # The GSEApy tables do not depend on the lfc threshold
        gseapy_kegg = truth["gseapy"]["logFC"]["KEGG_2021_Human"].index
        gseapy_gobp = truth["gseapy"]["logFC"]["GO_Biological_Process_2021"].index

        for lfc in [0,1]:
            s = "_lfc" if lfc == 1 else ""
            ora = truth[f"clusterORA{s}.fdr0.05.post_lfc{lfc}.lfc{lfc}"]

            # KEGG
            common_kegg = _intersect_terms(common_kegg, gseapy_kegg,
                                           set(ora["KEGG_2021_Human"].index))
            # GO BP
            common_gobp = _intersect_terms(common_gobp, gseapy_gobp,
                                           ora["GO_Biological_Process_2021"].index)

    if common_kegg is None:
        common_kegg = set()
    if common_gobp is None:
        common_gobp = set()
    return common_kegg, common_gobp
    
# Pickle files caching the common terms. NOTE: the paths are hard-coded and do
# not follow `datapath`.
file_gobp = Path("../data/multi/common_gobp.txt")
file_kegg = Path("../data/multi/common_kegg.txt")
# Recompute (and store) the common terms when either cache file is missing,
# otherwise load both from disk. Delete the files to force a recomputation.
if (not file_gobp.is_file() or not file_kegg.is_file()):
    common_kegg, common_gobp = get_common_terms()
    pickler(common_gobp, file_gobp)
    pickler(common_kegg, file_kegg)
else:
    with open(file_gobp, "rb") as f:
        common_gobp = pickle.load(f)
    with open(file_kegg, "rb") as f:
        common_kegg = pickle.load(f)
        
print("KEGG Terms:", len(common_kegg))
print("GO BP Terms:", len(common_gobp))

As we have restricted the number of terms (hypotheses), we need to recalculate the adjusted p-values for appropriate FDR control.

In [None]:
from statsmodels.stats.multitest import multipletests

# Set to False to skip this cell's work; when True every report file touched
# below is rewritten in place with an added "FDR.common" column.
redo_common_fdr = True
FDRs = [0.05]
logFCs = [0,1]
rankings = ["logFC", "|S2N|"]

if redo_common_fdr:

    for data in datasets:

        outpath = datasets[data]["outpath"]
        gseapath = f"{outpath}/gsea"
        
        for library in libraries:
            
            # Any library other than GO BP falls back to the KEGG term list.
            common = common_gobp if library == "GO_Biological_Process_2021" else common_kegg
    
            ### GSEA
            for rnk in rankings:
                suffix = "" if rnk == "logFC" else "_s2n"
                reportfile = f"{gseapath}/gseapy{suffix}.{library}.feather"
                terms = open_table(reportfile).sort_values(by="FDR")
                # Terms outside the common set keep NaN in the new column.
                terms["FDR.common"] = np.full(len(terms),np.nan)
                # Benjamini-Hochberg adjustment over the common terms only;
                # element [1] of the result holds the adjusted p-values.
                terms.loc[common, "FDR.common"] = multipletests(terms.loc[common, "NOM p-val"], method="fdr_bh")[1]
                terms.reset_index().to_feather(reportfile)


            ### ORA
            for fdr in FDRs:
                for logFC in logFCs:
                    s = "_lfc" if logFC > 0 else ""
                    method = f"clusterORA{s}.fdr{fdr}.post_lfc{logFC}.lfc{logFC}"

                    reportfile = f"{gseapath}/{method}.{library}.feather"         
                    terms = open_table(reportfile).sort_values(by="FDR")
                    #assert terms.index.duplicated().sum() == 0
                    terms["FDR.common"] = np.full(len(terms),np.nan)
                    
                    # Tables indexed by "hsa..." ids are re-indexed by term name
                    # so the common terms can be used as row labels.
                    if terms.index[0].startswith("hsa"):
                        terms["hsa"] = terms.index
                        terms.set_index("Term", inplace=True)
                    
                    ix = common #if library == "GO_Biological_Process_2021" else terms[terms["Term"].isin(common)].index
                    print(data,library,fdr,logFC,method)
                    # Same adjustment as for GSEA, based on the "pvalue" column.
                    terms.loc[ix, "FDR.common"] = multipletests(terms.loc[ix, "pvalue"], method="fdr_bh")[1]
                    terms.reset_index().to_feather(reportfile)

In [None]:
# Enrichment method ids; the clusterORA ids encode the FDR and logFC thresholds.
gsea_methods = ['gseapy', 'gseapy_s2n',
               'clusterORA.fdr0.05.post_lfc0.lfc0',
               'clusterORA_lfc.fdr0.05.post_lfc1.lfc1']


from process import get_n_gsea_truth
FDRs=[0.05]
gsea_param_set = "p1"
# Nested placeholder: data -> outlier method -> DEA -> enrichment method ->
# library -> "truth"/"truth_common" -> FDR -> None. Data sets with "syn" in
# their name are left out.
gsea_truth = {data: {out: {dea: {gsea: {lib: {"truth"+mode: {fdr: None for fdr in FDRs} for mode in ["","_common"]} for lib in libraries} for gsea in gsea_methods} for dea in DEA_methods} for out in outlier_methods} for data in datasets if "syn" not in data}

# NOTE: this loop runs over ALL data sets, so a data set with "syn" in its name
# would raise a KeyError on gsea_truth[data].
for data in datasets:
    print(data)
    outpath_d = datasets[data]["outpath"]
    # Last path component of the output folder.
    outname_d = str(outpath_d).split("/")[-1]
    gsea_truth[data] = get_n_gsea_truth(gsea_truth[data], outpath_d, outname_d, all_N, DEA_methods, outlier_methods, gsea_methods, libraries, FDRs, gsea_param_set, overwrite=False)


In [None]:
# Only the keys (data set codes) of gsea_datasets are used in this cell.
gsea_datasets = {sites[s]: {} for s in sites}

DEAs = DEA_methods
FDRs = [0.05]
all_N=[3]#,5,7,9,15]
    
# "truth" and "truth_common"
quantity = ["truth"]
quantity += [q+"_common" for q in quantity]

# One-row frame with one column per combination; it is reshaped below.
iterables = [gsea_datasets,outlier_methods,DEAs,gsea_methods,libraries,FDRs,quantity]
multi_cols = pd.MultiIndex.from_product(iterables, names=["Data", "Out", "DEA", "Enrichment", "Library", "FDR", "Val"])
gsea_truth_df = pd.DataFrame(columns=multi_cols)

# NOTE: `ids` is never filled or used in this cell.
ids = [] # id connecting samples from same dataset and cohort size, used for paired-sample testing pre vs. post outlier removal
for d in gsea_datasets:
    for out in outlier_methods:
        for dea in DEAs:
            for gsea in gsea_methods:
                #if dea != "edger" and gsea_methods == "gsea_s2n": continue
                for fdr in FDRs:
                    for lib in libraries:
                        for quant in quantity:
                            #print(d,N,out,dea,fdr)
                            col = (d,out,dea,gsea,lib,fdr,quant)
                            # For "s2n" methods the values are always read from
                            # the edgerqlf entry, whatever the DEA method is.
                            if dea != "edgerqlf" and "s2n" in gsea: dea_source = "edgerqlf"
                            else: dea_source = dea
                            gsea_truth_df.loc[0,col] = gsea_truth[d][out][dea_source][gsea][lib][quant][fdr]
                            
# Reshape: one row per combination, one column per entry of `quantity`.
gsea_truth_df = gsea_truth_df.unstack().unstack(level="Val").reset_index(level=["Data","Out","DEA","Enrichment","Library","FDR"], drop=False)
gsea_truth_df.reset_index(drop=True, inplace=True)

# For outlier_method == "none", no adjustment is necessary, hence copy unadjusted values
#none_ix = combined_gsea_td[combined_gsea_td["Out"] == "none"].index # avoid setting with copy warning
#combined_gsea_td.loc[none_ix,"median_terms_adj"] = combined_gsea_td.loc[none_ix,"median_terms"]
#combined_gsea_td.loc[none_ix,"median_rep_adj"] = combined_gsea_td.loc[none_ix,"median_rep"]

# Replace internal method ids by their pretty names.
for clean in cleanout:
    gsea_truth_df.loc[(gsea_truth_df[gsea_truth_df["Out"] == clean]).index, "Out"] = cleanout[clean]
for clean in cleandea:
    gsea_truth_df.loc[(gsea_truth_df[gsea_truth_df["DEA"] == clean]).index, "DEA"] = cleandea[clean]
gsea_truth_df.to_csv(f"../data/multi/gsea_truth_df.csv")
gsea_truth_df.head()

## Send batch jobs

In [None]:
# Data set whose GSEA jobs are sent/processed by the following cells.
selected_data = "LIHC"
outpath = datasets[selected_data]["outpath"]
# The last path component of the output folder doubles as the output name.
outname = outpath.name
# Echo both values as the cell output.
outpath, outname

In [None]:
from ubelix import run_gsea_batch
    
gsea_script_path = "../scripts/send_gsea_batch.sh"

# Job grid handed to run_gsea_batch.
n_cohorts = 2
all_N = [3]#,5,5,7,9,15]
gsea_methods = ["clusterORA_lfc","clusterORA","gseapy"] #"gseapy_s2n"
outlier_methods = ["none"]#,"jk"] #########,"pcah"]
DEA_methods = ["edgerqlf"]#["deseq2","edgerlrt"] ## reminder: do only edgerqlf for gsea S2N
libraries = ['GO_Biological_Process_2021', 'KEGG_2021_Human']

# Configuration handed to run_gsea_batch; "gsea_kwargs" holds one option dict
# per enrichment method id.
gsea_config_params = {
    
    "gsea_param_set": "p1", # id for this set of parameters
    "dea_param_set": "p1",
    "dea_param_set_lfc": "p2",
    
    "overwrite": False, # overwrite existing tabs
    "data": str(outpath) + "/" + outname + ".csv",
    "outpath": str(outpath),
    "outname": outname,
    
    "DEA_methods": DEA_methods,
    "outlier_methods": outlier_methods,
    "gsea_methods": gsea_methods,
    "libraries": libraries,
    
    "gsea_kwargs":  {
                    "gseapy_s2n": {"permutation_num": 200, "save_full_results": False, "threads": 4, "ranking":"|S2N|", "min_size": 15, "max_size": 500},
                    "gseapy": {"permutation_num": 200, "save_full_results": False, "threads": 4, "ranking":"logFC", "min_size": 15, "max_size": 500},
                    "clusterORA": {"FDRs": [0.05], "logFCs": [0], "go_ont": "BP", "ranking":"n/a", "minGSSize": 15, "maxGSSize": 500, # KIRC/COAD use minGSSize 10, no problem since we use intersection with GSEA terms afterwards
                                   "use_internal_data": True, "internal_data_path": "../data/clusterORA/KEGG_DATA.RData"},
                    "clusterORA_lfc": {"FDRs": [0.05], "logFCs": [1], "go_ont": "BP", "ranking":"n/a", "minGSSize": 15, "maxGSSize": 500,
                                   "use_internal_data": True, "internal_data_path": "../data/clusterORA/KEGG_DATA.RData"}
                    } 
}


# NOTE: the last uncommented assignment wins, so the mode in effect is
# "just testing"; comment that line out to use "send jobs".
mode = "send jobs"
#mode = "test main"
mode = "just testing"
sleep_seconds = 0

run_gsea_batch(gsea_config_params, all_N, n_cohorts, libraries, gsea_script_path=gsea_script_path, mode=mode, sleep_seconds=sleep_seconds)

In [None]:
!squeue -u pd21v747

In [None]:
# Open the enrichment table of a single cohort for a quick look.
data = "LIHC"
N = 3
library = libraries[0]
cohort = 1
dea = "edgerqlf"
out = "none"
gsea_method = "clusterORA_lfc.fdr0.05.post_lfc1.lfc1"
param_set = "p1"
site = datasets[data]["site"]
# The cohort number is zero-padded to four digits in the folder name.
p=f"../data/{site}/{data}/{data}_N{N}/{data}_N{N}_{cohort:04}/gsea/{gsea_method}.{library}.{dea}.{out}.{param_set}.feather"
t=open_table(p)
# Number of terms with FDR below 0.05, then the table sorted by FDR as cell output.
print(len(t[t["FDR"]<0.05]))
t.sort_values(by="FDR")

## Process jobs

In [None]:
import sys, importlib
# Pick up edits made to process.py since it was first imported.
importlib.reload(sys.modules["process"])

from process import gsea_process_pipeline
gFDRs = [0.05]
gsea_methods = ["clusterORA_lfc.fdr0.05.post_lfc1.lfc1", "gseapy","clusterORA.fdr0.05.post_lfc0.lfc0"] #gseapy_s2n, "clusterORA.fdr0.05.post_lfc1.lfc0"
DEA_methods = ["edgerqlf"]#["deseq2", "edgerlrt"]
all_N = [3]#,5,7,9,15] 

import subprocess as sp
# Look for GSEA batch jobs in this user's queue. A membership test is used
# because `str.find(...) > 0` would miss a match at the very start of the
# output (index 0).
output = sp.getoutput('squeue -u pd21v747')
jobs_running = "send_gse" in output

# Only process once no GSEA batch job is listed in the queue any more.
if not jobs_running:
    gsea_process_pipeline(outpath, outname, all_N, DEA_methods, outlier_methods, gsea_methods, libraries, 
                          gFDRs, "p1", overwrite=1, overwrite_merged=1, n_cohorts=50, calculate_common=1)
    # when adding new method: overwrite merged and calculate common only with new method, then only overwrite with all methods

## Merge processed data sets

In [None]:
gsea_datasets = {sites[s]: {} for s in sites}

def load_gsea_results(path, site, name):
    """Unpickle and return ``<path>/<site>/<name>/gsea_results.txt``."""
    folder = f"{path}/{site}/{name}"
    with open(f"{folder}/gsea_results.txt", mode="rb") as handle:
        payload = pickle.load(handle)
    return payload
    
# Fill the record of every data set with its site and its stored GSEA results.
for s, code in sites.items():
    #if s not in ["breast","colorectal","kidney","lung","liver"]: continue
    entry = gsea_datasets[code]
    entry["site"] = s
    res = load_gsea_results(datapath, s, code)
    entry.update(res)

# NOTE(review): absolute, user-specific output path that does not follow
# `datapath` - confirm this is the intended location.
gsea_datasetsfile = "/storage/homefs/pd21v747/datanew/multi/gsea_datasets.txt"
pickler(gsea_datasets, gsea_datasetsfile)

In [None]:
# Create tidy dataframe

DEAs = DEA_methods
FDRs = [0.05]
all_N=[3]#,5,7,9,15]

    
# Each statistic is collected twice: as is and with the "_common" suffix.
quantity = ["median_rep", "median_terms",
            "median_mcc", "median_prec", "median_rec"]
quantity += [q+"_common" for q in quantity]

# One-row frame with one column per combination; it is reshaped below.
iterables = [gsea_datasets,all_N,outlier_methods,DEAs,gsea_methods,libraries,FDRs,quantity]
multi_cols = pd.MultiIndex.from_product(iterables, names=["Data", "N", "Out", "DEA", "Enrichment", "Library", "FDR", "Val"])
combined = pd.DataFrame(columns=multi_cols)

ids = [] # id connecting samples from same dataset and cohort size, used for paired-sample testing pre vs. post outlier removal
for i, d in enumerate(gsea_datasets):
    for j, N in enumerate(all_N):
        # One id per (data set, cohort size) pair.
        k = i*len(all_N)+j 
        for out in outlier_methods:
            for dea in DEAs:
                for gsea in gsea_methods:
                    #if dea != "edger" and gsea_methods == "gsea_s2n": continue
                    for fdr in FDRs:
                        for lib in libraries:
                            for quant in quantity:
                                #print(d,N,out,dea,fdr)
                                col = (d,N,out,dea,gsea,lib,fdr,quant)
                                combined.loc[0,col] = gsea_datasets[d][N][out][dea][gsea][lib][quant][fdr]
                                # Append the id once per combination.
                                if quant == "median_rep": ids.append(k)
                            
# Reshape: one row per combination, one column per entry of `quantity`.
# NOTE(review): ids were collected in loop order and are attached positionally
# to the reshaped table - confirm both orders line up.
combined_gsea_td = combined.unstack().unstack(level="Val").reset_index(level=["Data","N","Out","DEA","Enrichment","Library","FDR"], drop=False)
combined_gsea_td["id"] = ids
combined_gsea_td.reset_index(drop=True, inplace=True)

# For outlier_method == "none", no adjustment is necessary, hence copy unadjusted values
#none_ix = combined_gsea_td[combined_gsea_td["Out"] == "none"].index # avoid setting with copy warning
#combined_gsea_td.loc[none_ix,"median_terms_adj"] = combined_gsea_td.loc[none_ix,"median_terms"]
#combined_gsea_td.loc[none_ix,"median_rep_adj"] = combined_gsea_td.loc[none_ix,"median_rep"]

# Replace internal method ids by their pretty names.
for clean in cleanout:
    combined_gsea_td.loc[(combined_gsea_td[combined_gsea_td["Out"] == clean]).index, "Out"] = cleanout[clean]
for clean in cleandea:
    combined_gsea_td.loc[(combined_gsea_td[combined_gsea_td["DEA"] == clean]).index, "DEA"] = cleandea[clean]
combined_gsea_td.to_csv(f"../data/multi/combined_gsea_td.csv")
combined_gsea_td.head()

## Inspect results

In [None]:
from misc import get_grid_size

# Columns plotted on the x and y axis of every panel.
x = "median_terms_common"
y = "median_rep_common"

a  = combined_gsea_td
#a = a[a["Library"]=="KEGG_2021_Human"]
# One scatter panel per column used for colouring.
hues = ["Data","Library","DEA","Out","N", "Enrichment"]
grid = get_grid_size(len(hues), k=0, fill=True)
# NOTE(review): grid[0] is passed as the number of rows but scales the figure
# width (and grid[1] the height) - confirm this is intended.
fig, ax = plt.subplots(grid[0],grid[1],figsize=(6*grid[0],3*grid[1]), sharex=True,sharey=True)
ax=ax.flatten()

for i, hue in enumerate(hues):
    sns.scatterplot(data=a, x=x, y=y, hue=hue, ax=ax[i])    
    # NOTE(review): the panels share their y axis, so inverting it once per
    # panel may toggle the shared direction repeatedly - verify the result.
    ax[i].invert_yaxis()
fig.tight_layout()

# Misc.

In [None]:
from process import delete_redundant_slurmfiles

# Acts on the data set currently selected through outpath/outname and on the
# cohort sizes currently held in all_N.
delete_redundant_slurmfiles(outpath, outname, all_N)            

In [None]:
import sys, importlib
# Re-import the project modules after editing their source files. Each module
# must already have been imported, otherwise the sys.modules lookup raises a
# KeyError. Names bound earlier via `from <module> import ...` keep pointing
# to the old objects until those import statements are run again.
importlib.reload(sys.modules["misc"])
importlib.reload(sys.modules["DEA"])
importlib.reload(sys.modules["enrichment"])
importlib.reload(sys.modules["process"])