# Bootstrap estimation of DEG reliability

**Scenario:**

- We have an RNA-Seq data set with a relatively small number N of replicates and wish to perform a DEG analysis.

**Observation 1:**

- SNF2 data set has very high precision for small N (~3)
- LMAB data set has very bad precision even for N much larger than 15
- Most other data sets we tested are between these two extremes.

**Goal:**

Can we bootstrap the replicates to predict how "well-behaved" the data set is? Concretely, we calculate a metric for each bootstrap sample (e.g. the Kullback–Leibler divergence) and check whether the metric correlates with the precision.

- What is the minimum N?
- What is the minimum number of bootstrap trials?

In [None]:
import os
import sys
import glob
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
from pathlib import Path

datapath = Path("../data")

pd.set_option('display.max_rows', 50)

modpath = Path("../scripts")
sys.path.append(os.path.abspath(modpath))

from misc import open_table

# Select data

In [None]:
# Data sets analysed with the "paired" design: site key -> data set name
cancer_sites_paired = dict(
    liver="LIHC",
    thyroid="THCA",
    lung="LUAD",
    lung2="LUSC",
    kidney="KIRC",
    colorectal="COAD",
    breast="BRCA",
    prostate="PRAD",
)

# Misc unpaired only datasets added in revision
misc_unpaired = dict(
    GSETB="LWPL",
    yeast="SNF2",
)

# Data sets analysed with a custom design that controls for confounders
misc_custom_design = dict(
    GSEPN="GIPF",
    breast_basher="BASHER",  # Basal vs HER2+
    breast_basluma="BASLUMA",  # Basal vs Luminal A
    breast_baslumb="BASLUMB",  # Basal vs Luminal B
    breast_herluma="HERLUMA",  # HER2 vs Luminal A
    # breast_herlumb="HERLUMB",  # HER2 vs Luminal B (currently excluded)
    breast_lumab="LUMAB",  # LumA vs LumB
)

# One row per family of data sets: (registry, paramset, design).
# The row order fixes the key order of `sites`.
_site_groups = (
    (cancer_sites_paired, "p2", "paired"),
    (misc_unpaired, "p3", "unpaired"),
    (misc_custom_design, "p2c", "custom"),
)

# Full registry: site key -> settings used by the rest of the notebook
sites = {}
for _group, _group_paramset, _group_design in _site_groups:
    for _name, _dataset in _group.items():
        # Only the custom-design data sets have confounders; every site gets
        # its own fresh list so entries never share a mutable object
        if _group is not misc_custom_design:
            _site_confounders = None
        elif _name == "GSEPN":
            _site_confounders = ["age", "ever_smoked", "Sex"]
        else:
            _site_confounders = ["TumorPurity", "days_to_birth"]

        sites[_name] = {
            "data": _dataset,
            "confounders": _site_confounders,
            "paramset": _group_paramset,
            "design": _group_design,
            "lfc": 1,
        }

In [None]:
# Reset the optional per-site state before selecting a data set
meta, metafile, confounders = None, None, None

# Pick the data set and pull its settings from the `sites` registry
selected_site = "yeast"
site_cfg = sites[selected_site]
data = site_cfg["data"]
design = site_cfg["design"]
lfc = site_cfg["lfc"]
paramset = site_cfg["paramset"]

print("Data:", data, "\nDesign:", design, "\nFormal fold change threshold:", lfc)

dffile = f"../data/{selected_site}/{data}/{data}.csv"
confounders = site_cfg["confounders"]
# A meta table is only looked for when confounders are configured for the site
if confounders:
    metafile = dffile.replace(".csv",".meta.csv")

# Load the full, unsubsampled dataset
df = pd.read_csv(dffile, index_col=0)
# Re-derive the data set name from the file name (path and ".csv" stripped)
data = dffile.split(".csv")[0].rsplit("/", 1)[-1]

# Optional control experiment: randomly permute the column labels of the
# table, then put the columns back into sorted label order
shuffle = False
if shuffle:
    permuted_labels = list(df.columns.values)
    np.random.shuffle(permuted_labels)
    df.columns = permuted_labels
    df = df[sorted(df.columns)]

# Load the meta data with the confounders to control for, if available
if metafile:
    # Keep only the rows matching the columns of the count table, in that order
    meta = pd.read_csv(metafile, index_col=0).loc[df.columns]
    if confounders:
        # Reduce to the confounder columns plus the condition label and store
        # the reduced table next to the original meta file
        meta = meta[[*confounders, "Condition"]]
        metafile = metafile.replace(".csv",".sub.csv")
        meta.index.name = "Sample"
        meta.to_csv(metafile)

    display(meta.head())
    print(meta.shape)

display(df.head())
print(df.shape)

# Calculate bootstrap trials

We will re-use the already subsampled cohorts from the main experiment.

In [None]:
# Select subsample size
N = 10

# Select one of the 100 cohorts already subsampled
cohort = 1

method = "deseq2"  # one of ["deseq2", "edgerqlf", "edgerlrt"]

# Keyword arguments for each DEA method (unpacked into run_dea by
# bootstrap_data). The two edgeR variants use identical settings but keep
# separate dicts so one can be tuned without affecting the other.
edgerqlf_kwargs = {
    "filter_expr": True,
    "cols_to_keep": ["logFC", "logCPM", "FDR"],
    "lfc": lfc,
    "design": design,
    "check_gof": False,
    "verbose": False,
}
edgerlrt_kwargs = {
    "filter_expr": True,
    "cols_to_keep": ["logFC", "logCPM", "FDR"],
    "lfc": lfc,
    "design": design,
    "check_gof": False,
    "verbose": False,
}
deseq2_kwargs = {
    "cols_to_keep": ["logFC", "logCPM", "FDR"],
    "lfc": lfc,
    "design": design,
}

# Directory of the selected cohort; the cohort index is zero-padded to four
# digits. (A stray double slash after the site directory was removed.)
cohort_path = f"../data/{selected_site}/{data}/{data}_N{N}/{data}_N{N}_{cohort:04}"
cohort_path

In [None]:
# Locate the file the trial results are concatenated to. The number of trials
# stored so far is encoded in the file name ("trials<k>").
results = None  # stays None until a file with at least one trial is loaded
results_file = f"{cohort_path}/tab.bagged.trials*.{method}.{paramset}.csv"
matched_files = glob.glob(results_file)
if matched_files:
    results_file = matched_files[0]
    print(results_file)
    existing_trials = int(results_file.split(".trials")[1].split(".")[0])
    if existing_trials > 0:
        results = open_table(results_file)
else:
    print("Initializing resultsfile")
    results_file = results_file.replace('trials*','trials0')
    # Create the empty placeholder directly instead of spawning a shell with
    # an unquoted path; raises FileNotFoundError if the cohort directory is
    # missing instead of failing silently
    Path(results_file).touch()
    existing_trials = 0

# File name the results will get once one more trial has been added
results_file_p1 = re.sub(r"trials(\d+)", lambda m: f"trials{int(m.group(1)) + 1}", results_file)

results_file_p1

In [None]:
from DEA import run_dea

def open_bootstrap_results(cohort_path, method, paramset):
    """Load the accumulated bootstrap results of one cohort, if any exist.

    Result files are named
    ``tab.bagged.trials<k>.<method>.<paramset>.csv`` inside ``cohort_path``,
    where ``<k>`` is the number of trials stored in the file. If several such
    files are present (e.g. after an interrupted run), the one with the
    highest trial count is used.

    Parameters
    ----------
    cohort_path : str
        Directory of the cohort.
    method : str
        DEA method name used in the file name.
    paramset : str
        Parameter set identifier used in the file name.

    Returns
    -------
    tuple
        ``(results, results_file, existing_trials)``. When a file with at
        least one trial exists, ``results`` is whatever ``open_table`` returns
        for it. Otherwise ``results`` is ``None``, ``results_file`` is the
        ``trials0`` file name and ``existing_trials`` is ``0``.
    """
    pattern = f"{cohort_path}/tab.bagged.trials*.{method}.{paramset}.csv"

    best_file, best_trials = None, 0
    for candidate in glob.glob(pattern):
        # Parse the count from the file name only, so a ".trials" fragment in
        # the directory part of the path cannot confuse the parsing
        found = re.search(r"\.trials(\d+)\.", os.path.basename(candidate))
        if found is None:
            continue
        candidate_trials = int(found.group(1))
        if candidate_trials > best_trials:
            best_file, best_trials = candidate, candidate_trials

    if best_file is not None:
        return open_table(best_file), best_file, best_trials

    # Nothing usable: either no file at all or only an empty "trials0" placeholder
    print("No bootstrap results file found")
    results_file = f"{cohort_path}/tab.bagged.trials0.{method}.{paramset}.csv"
    return None, results_file, 0

def bootstrap_data(df, design, method, paramset, cohort_path, trials, meta=None):
    """Run bootstrap DEA trials on ``df`` until ``trials`` trials are stored.

    Every trial draws ``N`` columns with replacement from the first half of
    ``df.columns`` and ``N`` from the second half, runs ``run_dea`` on the
    resampled table and appends the trial's result table to the accumulated
    results. After each trial the accumulated table is written to a file whose
    name encodes the number of stored trials, and the previous file is removed.

    Besides its arguments the function reads the notebook globals ``N``,
    ``edgerqlf_kwargs``, ``edgerlrt_kwargs`` and ``deseq2_kwargs``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose columns are resampled.
    design : str
        ``"paired"``, ``"unpaired"`` or ``"custom"``. For ``"custom"`` the
        matching rows of ``meta`` are written to ``../data/test/design.csv``.
    method : str
        One of ``"edgerqlf"``, ``"edgerlrt"``, ``"deseq2"``.
    paramset : str
        Parameter set identifier used in the results file name.
    cohort_path : str
        Directory holding the results file of the cohort.
    trials : int
        Total number of trials wanted, including those already stored.
    meta : pandas.DataFrame, optional
        Meta table indexed by the column names of ``df``; required for the
        ``"custom"`` design.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``method`` is unknown, or ``design`` is ``"custom"`` without ``meta``.
    """
    method_kwargs = {
        "edgerqlf": edgerqlf_kwargs,
        "edgerlrt": edgerlrt_kwargs,
        "deseq2": deseq2_kwargs,
    }
    if method not in method_kwargs:
        # Without this check no DEA would run and the stale temporary output
        # of an earlier run would be read back as a new "trial"
        raise ValueError(f"Unknown method {method!r}, expected one of {sorted(method_kwargs)}")
    if design == "custom" and meta is None:
        raise ValueError("design 'custom' requires a meta table")

    # verbose defaults to False here. Merging into a single dict avoids
    # passing "verbose" twice, which raises TypeError when the per-method
    # kwargs already contain it.
    dea_kwargs = {"verbose": False, **method_kwargs[method]}

    # If exists, read results file where trial results will be concatenated to
    results, results_file, existing_trials = open_bootstrap_results(cohort_path, method, paramset)
    if results is None:
        print("Initializing resultsfile")
        # Create the empty "trials0" placeholder without spawning a shell
        Path(results_file).touch()
        existing_trials = 0

    if existing_trials >= trials:
        print(f"Already have {existing_trials} trials, returning")
        return

    # The columns are treated as two equally sized groups: first half and
    # second half (presumably the two conditions — confirm against the data).
    # NOTE(review): both groups are resampled independently; if column i and
    # column reps+i form a pair under the "paired" design, this breaks the
    # pairing — confirm this is intended.
    reps = len(df.columns) // 2
    first_group = df.columns[:reps]
    second_group = df.columns[reps:]

    # Store dea output file in temporary file that will be overwritten in next trial
    outfile_dea = "../data/test/bagg.tmp.csv"

    for trial in range(existing_trials + 1, trials + 1):
        print(f"Running trial {trial}")
        bs = list(np.random.choice(first_group, N)) + list(np.random.choice(second_group, N))
        df_bag = df[bs]

        # Sampling with replacement repeats column names; appending the
        # position makes every resampled column name unique
        unique_names = [col + str(i) for i, col in enumerate(df_bag.columns)]

        if design == "custom":
            # Work on a real copy so the caller's meta table is never modified
            meta_sub = meta.loc[df_bag.columns].copy()
            meta_sub.index = unique_names
            # NOTE(review): this file's path is not passed to run_dea; the
            # per-method kwargs carry their own "design" entry. Confirm that
            # run_dea picks this file up.
            meta_sub.to_csv("../data/test/design.csv")

        df_bag.columns = unique_names

        run_dea(df_bag, outfile_dea, method, True, **dea_kwargs)
        trial_results = open_table(outfile_dea)

        if results is None:
            results = trial_results
        else:
            results = pd.concat([results, trial_results])

        # Increment file name trials, write the new file, drop the old one
        results_file_p1 = re.sub(r"trials(\d+)", lambda m: f"trials{int(m.group(1)) + 1}", results_file)
        results.to_csv(results_file_p1)
        Path(results_file).unlink(missing_ok=True)
        results_file = results_file_p1

In [None]:
# Run bootstrap trials for the selected cohort until 15 trials are stored;
# trials already present in the results file are not repeated
bootstrap_data(df, design, method, paramset, cohort_path, trials=15, meta=meta)

In [None]:
# Read the accumulated bootstrap results back from disk
results, _, trials = open_bootstrap_results(cohort_path, method, paramset)
if results is None:
    # No trial stored yet: trials is 0, so the rows-per-trial ratio is undefined
    print(trials, 0)
else:
    # Number of stored trials and number of result rows per trial
    print(trials, len(results) // trials)
results

In [None]:
# Reload the already-imported DEA module and re-bind run_dea to the reloaded
# version
import sys, importlib
importlib.reload(sys.modules["DEA"])
from DEA import run_dea