Notebook to send jobs to the Ubelix HPC cluster at the University of Bern

In [None]:
import sys
import os
import logging
import glob
import pickle
import json
from pathlib import Path
import rpy2.robjects as ro
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

%load_ext rpy2.ipython

# Log to example.log and echo the same records to stdout.
logging.basicConfig(filename='example.log', 
                    encoding='utf-8', level=logging.INFO)
logger = logging.getLogger()
logger.setLevel(logging.INFO)
# Notebook cells get re-executed: attach the stdout handler only once,
# otherwise every re-run stacks another handler and each record is printed
# one more time. The handler is recognised by its name.
if not any(h.get_name() == "notebook-stdout" for h in logger.handlers):
    stdout_handler = logging.StreamHandler(stream=sys.stdout)
    stdout_handler.set_name("notebook-stdout")
    logger.addHandler(stdout_handler)

# Root directory of the input data, relative to the notebook.
datapath = Path("../data")
# Alternative data location (presumably on the cluster -- confirm before use):
#datapath = Path("/storage/homefs/pd21v747/datanew")

# Directory holding the project's helper modules (DEA, process, ubelix, misc).
modpath = Path("../scripts")
scripts_dir = os.path.relpath(modpath)
# Same idempotence concern as above: do not grow sys.path on every re-run.
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)

#from misc import Timer, pickler, open_table

In [None]:
# Tissue site -> cohort code. Only liver is active; the remaining entries are
# kept here so they can be re-enabled:
#   "thyroid": "THCA",
#   "lung": "LUAD",
#   "kidney": "KIRC",
#   "colorectal": "COAD",
#   "breast": "BRCA",
#   "prostate": "PRAD"
sites = {"liver": "LIHC"}

# One (initially empty) metadata record per cohort code.
datasets = {code: {} for code in sites.values()}

# Each cohort's table lives at <datapath>/<site>/<code>/<code>.csv.
for s, code in sites.items():
    f = datapath / s / code / f"{code}.csv"
    df = pd.read_csv(f, index_col=0)
    datasets[code].update({
        "genes": len(df),                      # number of rows in the table
        "site": s,
        "datapath": f,
        "outpath": f.parent,                   # results go next to the input file
        # NOTE(review): halving the column count presumably reflects two
        # samples per patient -- confirm against the data layout.
        "patients": len(df.columns) // 2,
    })
    print(f"{s:<10}", datasets[code]["genes"], datasets[code]["patients"])

# Pretty names: outlier-removal method id -> display label.
cleanout = dict(jk="ReBoost", pcah="rPCA", none="None")
# Pretty names: DEA method id -> display label.
# NOTE(review): the key "edger" differs from the "edgerqlf" id used in the
# DEA/job cells of this notebook -- confirm which id the consumers look up.
cleandea = dict(edger="edgeR QLF", edgerlrt="edgeR LRT", deseq2="DESeq2")

# Differential expression analysis

## Define ground truth

For a given FDR and logFC cutoff, define the ground-truth DEGs as the intersection of the DEGs found by all three DEA tests (Wald, LRT, QLF).

In [None]:
from DEA import run_dea_on_full_data
from process import find_ground_truth

# DEA method ids, FDR cutoffs and logFC thresholds used for the ground truth.
DEAs = ["edgerlrt", "edgerqlf", "deseq2"]
FDRs = [0.1, 0.05, 0.01, 0.001]
logFCs = [0, 1]                      # formal lfc threshold in edger or deseq2
logFCs_post = [0, 0.5, 1, 1.5, 2]    # post hoc thresholds

# Run every DEA method on the full data sets, once per formal threshold.
run_dea_on_full_data(datasets, DEAs, lfcs=logFCs, overwrite=False)

# Ground truth is derived twice: all post hoc cutoffs for the lfc_test = 0
# results, then only the cutoff 1 for the lfc_test = 1 results.
for post_hoc_cutoffs, formal_lfc in ((logFCs_post, 0), ([1], 1)):
    datasets = find_ground_truth(datasets, DEAs, FDRs, post_hoc_cutoffs,
                                 lfc_test=formal_lfc)

## Send batch jobs for selected data set

In [None]:
# Choose the cohort whose jobs are submitted, then derive the output
# directory and the base name (the directory's last path component).
selected_data = "LIHC"
selected_entry = datasets[selected_data]
outpath = selected_entry["outpath"]
outname = outpath.name
(outpath, outname)  # shown as the cell output

In [None]:
from ubelix import run_multi_batch

# Batch script handed to run_multi_batch.
script_path = Path("../scripts/send_batch.sh")
# Finish edgerqlf jobs before sending other jobs ("edgerlrt", "deseq2").
DEA_methods = ["edgerqlf"]
# Only use "none" for p2.
outlier_methods = ["none", "pcah", "jk"]
# Cohort sizes to run; full list: 3,4,5,6,7,8,9,10,12,15.
all_N = [3]
# Number of cohorts per size; full run: 100.
n_cohorts = 2

assert outname in str(outpath)


def _build_config(param_set, lfc=None):
    """Return a fresh job configuration dict for one parameter set.

    param_set -- id stored under "param_set" (e.g. "p1").
    lfc       -- when not None, added as "lfc" to every DEA method's kwargs;
                 when None, no "lfc" key is written at all.
    """
    dea_kwargs = {
        "edgerqlf": {"filter_expr": False,
                     "cols_to_keep": ["logFC", "logCPM", "FDR"]},
        "edgerlrt": {"filter_expr": False,
                     "cols_to_keep": ["logFC", "logCPM", "FDR"],
                     "test": "lrt"},
        "deseq2": {"cols_to_keep": ["logFC", "logCPM", "FDR"]},
    }
    if lfc is not None:
        for method_kwargs in dea_kwargs.values():
            method_kwargs["lfc"] = lfc

    return {
        "param_set": param_set,  # id for this set of parameters
        "overwrite": False,      # overwrite existing tabs
        "data": f"{outpath}/{outname}.csv",
        "outpath": str(outpath),
        "outname": outname,
        "DEA_methods": DEA_methods,
        "outlier_methods": outlier_methods,
        "outlier_kwargs": {
            "none": {},
            "jk": {
                "FDR": 0.01,
                "overwrite": False,       # overwrite existing jk tab
                # fraction of patients; after 1st iteration, don't jackknife
                # bottom frac patients
                "max_removed_frac": 0.5,
                "efficient": True,
                "cols_to_keep": ["FDR"],
                # remove individual jk tabs and iterations after merger
                "cleanup": True,
            },
            "pcah": {"k": 2},
        },
        "DEA_kwargs": dea_kwargs,
    }


# p1: no formal lfc threshold passed to the DEA methods.
config_params_1 = _build_config("p1")

# p2: lfc = 1 threshold
config_params_2 = _build_config("p2", lfc=1)
import subprocess as sp

# Query the scheduler for this user's jobs and look for the batch script's
# job name. "send_bat" is presumably "send_batch.sh" as truncated by squeue's
# name column -- confirm against the actual squeue output.
output = sp.getoutput('squeue -u pd21v747')
# Membership test instead of `find(...) > 0`: find() returns 0 for a match at
# the very start of the output, which the old comparison counted as "absent".
jobs_running = "send_bat" in output

# Mode string forwarded to run_multi_batch; alternatives kept for switching.
mode = "send jobs"
#mode = "test main terminal"
#mode = "just testing"
do_nothing = False  # set True to skip submission without editing the logic
config_params = config_params_2

# Never submit while earlier batch jobs are still listed by squeue.
if not jobs_running and not do_nothing:
    run_multi_batch(config_params, all_N, n_cohorts, script_path, mode = mode)
elif jobs_running:
    print("Jobs running")

In [None]:
# Shell escape: list the scheduler queue entries for user pd21v747 (the same
# squeue query the job-submission cell uses to detect running jobs).
!squeue -u pd21v747

## Process jobs

In [None]:
# Settings for processing the complete "p1" results (lfc_test = 0).
# Defined for reference; not the active selection below.
settings_p1 = {
    "DEAs": ["edgerqlf", "edgerlrt", "deseq2"],
    "outlier_methods": ["none", "pcah", "jk"],
    "FDRs": [0.1, 0.05, 0.01, 0.001],
    "logFCs": [0, 0.5, 1, 1.5, 2],
    "all_N": [3, 4, 5, 6, 7, 8, 9, 10, 12, 15],
    "lfc_test": 0,
    "param_set": "p1",
}

# Reduced settings for the "p2" results (lfc_test = 1).
# Not yet enabled here: DEAs "deseq2", "edgerlrt"; outlier methods "pcah",
# "jk"; cohort sizes 4,5,6,7,8,9,10,12,15.
settings_p2 = {
    "DEAs": ["edgerqlf"],
    "outlier_methods": ["none"],
    "FDRs": [0.1, 0.05, 0.01, 0.001],
    "logFCs": [1],
    "all_N": [3],
    "lfc_test": 1,
    "param_set": "p2",
}

# The p2 settings are the ones in effect for the following cells.
active_settings = settings_p2
DEAs = active_settings["DEAs"]
outlier_methods = active_settings["outlier_methods"]
FDRs = active_settings["FDRs"]
logFCs = active_settings["logFCs"]
all_N = active_settings["all_N"]
lfc_test = active_settings["lfc_test"]
param_set = active_settings["param_set"]

# Ids of all parameter sets.
param_sets = ["p1", "p2"]

In [None]:
from process import process_pipeline
from misc import profile_func
import pstats

# Keyword arguments forwarded to process_pipeline via profile_func.
kwargs = dict(
    outpath=outpath,
    outname=outname,
    all_N=all_N,
    DEAs=DEAs,
    outlier_methods=outlier_methods,
    FDRs=FDRs,
    logFCs=logFCs,
    lfc_test=lfc_test,
    param_set=param_set,
    overwrite=1,
    overwrite_merged=1,
    n_cohorts=2,
)

# Switch: set to False to skip processing when re-running the notebook.
do_process = True
if do_process:
    # Run the pipeline under the profiler, then report the 50 entries with
    # the largest cumulative time (file paths stripped from the names).
    prof = profile_func(process_pipeline, kwargs)
    stats = pstats.Stats(prof)
    stats.strip_dirs()
    stats.sort_stats("cumtime")
    stats.print_stats(50)

In [None]:
from misc import open_table

# Load one merged result table for a quick look: liver cohort, N = 3,
# logFC values, no outlier removal, edgerqlf, parameter set p2.
site = "liver"
f = f"../data/{site}/{sites[site]}/{sites[site]}_N3/all.logFC.none.edgerqlf.p2.feather"
tab = open_table(f)
tab  # shown as the cell output

In [None]:
import importlib
import sys

# Pick up edits to the helper modules without restarting the kernel.
# Both modules must already have been imported (they are looked up in
# sys.modules).
importlib.reload(sys.modules["misc"])
importlib.reload(sys.modules["process"])