# Bootstrap estimation of DEG reliability

**Scenario:**

- We have an RNA-Seq data set with a relatively small number of replicates N and wish to perform DEG analysis.

**Observation 1:**

- SNF2 data set has very high precision for small N (~3)
- LMAB data set has very bad precision even for N much larger than 15
- Most other data sets we tested are between these two extremes.

**Goal:**

Can we bootstrap the replicates to predict how "well-behaved" the data set is? Concretely, we calculate a metric for each bootstrap sample (e.g. the Kullback–Leibler divergence) and check whether the metric correlates with the precision.

- What is the minimum N?
- What is the minimum number of bootstrap trials?

In [None]:
import os
import sys
import glob
import re
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
from pathlib import Path

# Root folder of the data sets (the cells below spell out "../data" in their
# f-strings rather than using this variable)
datapath = Path("../data")

# Show at most 50 rows when a DataFrame is displayed
pd.set_option('display.max_rows', 50)

# Make the project's helper scripts importable from this notebook
modpath = Path("../scripts")
sys.path.append(os.path.abspath(modpath))

from misc import open_table

# Select data

In [None]:
# Site folder name -> data set code, grouped by the experimental design that
# is assigned to each group further down.
cancer_sites_paired = {
    "liver": "LIHC",
    "thyroid": "THCA",
    "lung": "LUAD",
    "lung2": "LUSC",
    "kidney": "KIRC",
    "colorectal": "COAD",
    "breast": "BRCA",
    "prostate": "PRAD",
}

# Misc unpaired only datasets added in revision
misc_unpaired = {
    "GSETB": "LWPL",
    "yeast": "SNF2",
}

misc_custom_design = {
    "GSEPN": "GIPF",
    "breast_basher": "BASHER",    # Basal vs HER2+
    "breast_basluma": "BASLUMA",  # Basal vs Luminal A
    "breast_baslumb": "BASLUMB",  # Basal vs Luminal B
    "breast_herluma": "HERLUMA",  # HER2 vs Luminal A
    # "breast_herlumb": "HERLUMB",  # HER2 vs Luminal B (disabled)
    "breast_lumab": "LUMAB",      # LumA vs LumB
}

# One settings dict per site; every site starts without confounders.
sites = {
    name: {"data": code, "confounders": None}
    for name, code in (cancer_sites_paired | misc_unpaired | misc_custom_design).items()
}

# Only the custom-design sites carry confounders.
for site in misc_custom_design:
    sites[site]["confounders"] = (
        ["age", "ever_smoked", "Sex"]
        if site == "GSEPN"
        else ["TumorPurity", "days_to_birth"]
    )

# (group, parameter set, design) applied to every site of the group;
# the fold-change threshold is 1 for all of them.
_group_settings = (
    (misc_custom_design, "p2c", "custom"),
    (cancer_sites_paired, "p2", "paired"),
    (misc_unpaired, "p3", "unpaired"),
)
for _group, _group_paramset, _group_design in _group_settings:
    for site in _group:
        sites[site].update(paramset=_group_paramset, design=_group_design, lfc=1)

In [None]:
# Reset the optional meta-data state; these stay None when the selected site
# has no confounders configured.
meta = None
metafile = None
confounders = None

# Pick the site to analyse and pull its settings from `sites`
selected_site = "breast"
data = sites[selected_site]["data"]
design = sites[selected_site]["design"]
lfc = sites[selected_site]["lfc"]
paramset = sites[selected_site]["paramset"]

print("Data:", data, "\nDesign:", design, "\nFormal fold change threshold:", lfc)

dffile = f"../data/{selected_site}/{data}/{data}.csv"
confounders = sites[selected_site]["confounders"]
# A meta file is only looked up for sites that define confounders
if confounders:
    metafile = dffile.replace(".csv",".meta.csv")

# Re-derive the data set name from the file name (file stem of `dffile`)
data = dffile.split(".csv")[0].split("/")[-1]

# Load the full, unsubsampled dataset (later reduced to cohort)
df = pd.read_csv(dffile, index_col=0)

# Optional control experiment: randomly reassign the column labels and then
# put the columns back into sorted label order. The columns are the ones
# matched against the "Sample" index of the meta table below, so this
# permutes sample labels rather than genes.
shuffle = False
if shuffle:
    ix = list(df.columns.values)
    np.random.shuffle(ix)
    df.columns = ix
    df = df[sorted(df.columns)]

# Load meta data with the confounders to control for, if available
if metafile:
    meta = pd.read_csv(metafile, index_col=0)
    # Align the meta rows with the column order of the count table
    meta = meta.loc[df.columns]
    if confounders:
        # Keep only the confounders plus the condition column and write the
        # reduced table next to the original as "<name>.meta.sub.csv"
        meta = meta[confounders + ["Condition"]]
        metafile = metafile.replace(".csv",".sub.csv")
        #meta.rename({condition_col: "Condition"}, inplace=True, axis=1)
        meta.index.name = "Sample"
        meta.to_csv(metafile)#, index=False)
    
    display(meta.head())
    print(meta.shape)
    
display(df.head())
print(df.shape)

# Calculate bootstrap trials

We will re-use the already subsampled cohorts from the main experiment.

In [None]:
# Select subsample size
N = 10

# Select one of the 100 cohorts already subsampled
cohort = 1

method = "deseq2" #  one in ["deseq2", "edgerqlf", "edgerlrt"]

# Load subsampled cohort DEA results
# (the cohort number is zero-padded to four digits in the folder name)
cohort_path = f"../data/{selected_site}/{data}/{data}_N{N}/{data}_N{N}_{cohort:04}"
cohortfile = f"{cohort_path}/tab.none.{method}.{paramset}"
tab_sub = open_table(cohortfile)

# Load DEA method truth
truthfile = f"../data/{selected_site}/{data}/{data}.{method}.lfc{lfc}.csv"
tab_truth = open_table(truthfile)

# Keyword arguments forwarded to `run_dea`, one dict per DEA method.
# NOTE(review): both edgeR dicts carry a "verbose" key — a call that also
# passes verbose= explicitly next to **kwargs must drop one of the two,
# otherwise Python raises a duplicate-keyword TypeError.
edgerqlf_kwargs = {"filter_expr": True, "cols_to_keep": ["logFC","logCPM","FDR"], "lfc": lfc, "design": design,
                   "check_gof": False, "verbose": False}
edgerlrt_kwargs = {"filter_expr": True, "cols_to_keep": ["logFC","logCPM","FDR"], "lfc": lfc, "design": design,
                   "check_gof": False, "verbose": False}
deseq2_kwargs = {"cols_to_keep": ["logFC","logCPM","FDR"],"lfc": lfc, "design": design}


def open_df_cohort(cohort_path, full_df=None):
    """Return the columns of the full table that belong to one cohort.

    The cohort's column names are read from the ``samples_i`` entry of
    ``<cohort_path>/config.json``.

    Args:
        cohort_path: Directory that contains the cohort's ``config.json``.
        full_df: Table to select the columns from. When omitted, the
            module-level ``df`` is used (the previous behaviour).

    Returns:
        The selected columns, in the order listed in the config file.

    Raises:
        FileNotFoundError: If ``config.json`` does not exist.
        KeyError: If the config has no ``samples_i`` entry or a listed
            column is missing from the table.
    """
    # The config is only read, so open it read-only; "r+" would needlessly
    # require write permission on the file.
    with open(f"{cohort_path}/config.json", "r", encoding="utf-8") as f:
        cohort_samples = json.load(f)["samples_i"]

    table = df if full_df is None else full_df
    return table[cohort_samples]

# Restrict the full table to the columns listed for the selected cohort
df_sub = open_df_cohort(cohort_path)
# Echo the cohort folder as the cell output
cohort_path

In [None]:
from DEA import run_dea

def open_bootstrap_results(cohort_path, method, paramset):
    """Load the accumulated bootstrap results of a cohort, if any exist.

    Result files are named
    ``<cohort_path>/tab.bagged.trials<K>.<method>.<paramset>.csv`` where
    ``K`` is the number of trials stored in the file.

    Args:
        cohort_path: Cohort directory to search.
        method: DEA method name used in the file name.
        paramset: Parameter-set label used in the file name.

    Returns:
        A tuple ``(results, results_file, trials)``. When a file with at
        least one trial exists, ``results`` is that file opened with
        ``open_table``. Otherwise ``results`` is ``None``, ``trials`` is 0
        and ``results_file`` is the ``trials0`` file name for this cohort.
    """
    pattern = f"{cohort_path}/tab.bagged.trials*.{method}.{paramset}.csv"

    # Several files can match (e.g. leftovers of an interrupted run), so pick
    # the one holding the most trials instead of an arbitrary first match.
    # The count is parsed from the file name only, so directory names that
    # happen to contain ".trials" cannot confuse the parsing.
    best_file, best_trials = None, 0
    for candidate in glob.glob(pattern):
        found = re.search(r"\.trials(\d+)\.", os.path.basename(candidate))
        if found is None:
            continue
        n_trials = int(found.group(1))
        if n_trials > best_trials:
            best_file, best_trials = candidate, n_trials

    if best_file is not None:
        return open_table(best_file), best_file, best_trials

    print("No bootstrap results file found")
    results_file = f"{cohort_path}/tab.bagged.trials0.{method}.{paramset}.csv"
    return None, results_file, 0

def bootstrap_data(df, design, method, paramset, cohort_path, trials, meta=None):
    """Run bootstrap DEA trials on a cohort and accumulate them on disk.

    Columns of ``df`` are resampled with replacement, separately within the
    first and the second half of the columns, ``run_dea`` is executed on the
    resampled table and the trial's result table is appended to the
    accumulated results. After every trial the accumulated table is written
    to ``<cohort_path>/tab.bagged.trials<K>.<method>.<paramset>.csv`` (``K``
    = number of trials so far) and the previous file is removed, so an
    interrupted run can be resumed.

    Args:
        df: Table to resample; its first half of columns is treated as one
            group and the second half as the other.
        design: ``"paired"``, ``"unpaired"`` or ``"custom"``.
        method: ``"deseq2"``, ``"edgerqlf"`` or ``"edgerlrt"``.
        paramset: Parameter-set label used in the results file name.
        cohort_path: Cohort directory holding the results file.
        trials: Total number of trials wanted, including existing ones.
        meta: Meta table indexed by the column names of ``df``; required
            for the ``"custom"`` design.

    Returns:
        None. Results are written to disk.

    Raises:
        ValueError: For an unknown ``method`` or ``design``, or when the
            custom design is requested without ``meta``.
    """
    # NOTE(review): both halves are resampled independently, also for the
    # "paired" design. If run_dea pairs column i with column reps+i, the
    # pairing is not preserved by the resampling — confirm this is intended.
    method_kwargs = {
        "edgerqlf": edgerqlf_kwargs,
        "edgerlrt": edgerlrt_kwargs,
        "deseq2": deseq2_kwargs,
    }
    if method not in method_kwargs:
        raise ValueError(f"Unknown DEA method: {method!r}")
    if design not in ("custom", "paired", "unpaired"):
        raise ValueError(f"Unknown design: {design!r}")
    if design == "custom" and meta is None:
        raise ValueError("The custom design requires a meta table")

    # If exists, read results file where trial results will be concatenated to
    results, results_file, existing_trials = open_bootstrap_results(cohort_path, method, paramset)
    if results is None:
        print("Initializing resultsfile")
        Path(results_file).touch()
        existing_trials = 0

    if existing_trials >= trials:
        print(f"Already have {existing_trials} trials, returning")
        return

    # Each group is resampled to its own size, taken from the table itself
    # rather than from a notebook-level variable.
    reps = len(df.columns) // 2
    first_group, second_group = df.columns[:reps], df.columns[reps:]

    # Scratch files that are overwritten in every trial
    os.makedirs("../data/test", exist_ok=True)
    outfile_dea = "../data/test/bagg.tmp.csv"
    design_file = "../data/test/design.csv"

    for trial in range(existing_trials + 1, trials + 1):
        print(f"Running trial {trial}")
        bs = list(np.random.choice(first_group, reps)) + list(np.random.choice(second_group, reps))
        # Resampling with replacement repeats names; append the position so
        # every column (and meta row) label is unique.
        unique_names = [f"{col}{i}" for i, col in enumerate(bs)]

        df_bag = df[bs].copy()
        df_bag.columns = unique_names

        if design == "custom":
            meta_sub = meta.loc[bs].copy()
            meta_sub.index = unique_names
            meta_sub.to_csv(design_file)
            design_sub = design_file
        else:
            design_sub = design

        # Work on a copy so the shared kwargs dicts are not mutated, and drop
        # "verbose" because it is passed explicitly below (passing it twice
        # raises a duplicate-keyword TypeError).
        kwargs = {**method_kwargs[method], "design": design_sub}
        kwargs.pop("verbose", None)
        run_dea(df_bag, outfile_dea, method, True, verbose=False, **kwargs)

        trial_results = open_table(outfile_dea)
        results = trial_results if results is None else pd.concat([results, trial_results])

        # Save the accumulated table under the new trial count, then drop
        # the previous file. The name is built directly so digits elsewhere
        # in the path are never touched.
        new_results_file = f"{cohort_path}/tab.bagged.trials{trial}.{method}.{paramset}.csv"
        results.to_csv(new_results_file)
        if results_file != new_results_file and os.path.exists(results_file):
            os.remove(results_file)
        results_file = new_results_file

In [None]:
# Bring the selected cohort up to 25 bootstrap trials
bootstrap_data(df_sub, design, method, paramset, cohort_path, trials=25, meta=meta)

In [None]:
# Reload the accumulated bootstrap table and its trial count from disk
results, _, trials = open_bootstrap_results(cohort_path, method, paramset)
# Number of trials and number of rows per trial
print(trials, len(results) // trials)
results

# Inspect results

## Divergence

Calculate Kullback-Leibler divergence between bootstrap samples, subsampled cohort, and ground truth

In [None]:
from misc import get_kl_div

# Two panels: logFC densities (left) and KL divergences (right)
fig, ax = plt.subplots(1,2,figsize=(10,5))

def get_kls_lists(tab_truth, tab_sub, results, trials, ax=None):
    """Compute per-trial KL divergences of the bootstrap logFC values.

    ``results`` is split into ``trials`` consecutive row chunks. For every
    chunk, ``get_kl_div`` is evaluated on its ``logFC`` column against the
    ``logFC`` column of ``tab_sub`` and of ``tab_truth``, using 50 bin edges
    spanning -4 to 4.

    Args:
        tab_truth: Table with a ``logFC`` column (ground truth).
        tab_sub: Table with a ``logFC`` column (subsampled cohort).
        results: Concatenated bootstrap results with a ``logFC`` column.
        trials: Number of chunks to split ``results`` into.
        ax: Optional axes; when given, a grey KDE of every chunk is drawn.

    Returns:
        Tuple ``(kls_truth, kls_sub)`` of two lists with one value per trial.
    """
    edges = np.linspace(-4, 4, 50)
    vs_truth = []
    vs_cohort = []
    for k in range(trials):
        n_rows = len(results)
        lo = k * n_rows // trials
        hi = (k + 1) * n_rows // trials
        bag_lfc = results.iloc[lo:hi]["logFC"]
        if ax:
            # Only the first curve is labelled so the legend has one entry
            sns.kdeplot(bag_lfc, color="grey", ax=ax, label=None if k else "Bootstrapped")
        vs_cohort.append(get_kl_div(tab_sub["logFC"], bag_lfc, bins=edges))
        vs_truth.append(get_kl_div(tab_truth["logFC"], bag_lfc, bins=edges))
    return vs_truth, vs_cohort

# Per-trial KL values; also draws the bootstrap densities on the left panel
kls_truth, kls_sub = get_kls_lists(tab_truth, tab_sub, results, trials, ax=ax[0])

# Overlay the truth and the cohort logFC densities on the same panel
sns.kdeplot(tab_truth["logFC"], alpha=1, ax=ax[0], label="Truth",color="cyan")
sns.kdeplot(tab_sub["logFC"], alpha=1, ls="--", color="red",ax=ax[0],label=f"N{N} Cohort {cohort}")

# One row per trial, one column per reference
kls_df = pd.DataFrame(np.array([kls_truth,kls_sub]).T, columns=["Truth", "Cohort"])

def kls_box(kls_df, ax):
    """Draw a box plot of ``kls_df`` with the individual points on top.

    Args:
        kls_df: Table of KL divergences to plot.
        ax: Axes to draw on.
    """
    layers = (
        (sns.boxplot, {}),
        (sns.stripplot, {"color": "black"}),
    )
    for draw, options in layers:
        draw(kls_df, ax=ax, **options)
    ax.set(ylabel="KL Divergence")

# Right panel: per-trial KL divergences for both references
kls_box(kls_df, ax=ax[1])
# Reference line: get_kl_div between the truth and the cohort logFC values
kl_truth_vs_sub = get_kl_div(tab_truth["logFC"], tab_sub["logFC"], bins=np.linspace(-4,4,50))
# Legend label typo fixed ("Truh" -> "Truth")
ax[1].axhline(kl_truth_vs_sub, label="Truth vs Cohort", color="green")

for a in ax:
    a.legend(loc="best")

fig.suptitle(f"Data: {data} | Trials: {trials}")
fig.tight_layout()
figpath = f"../figures/boot.kl.{data}.N{N}.pdf"
# Make sure the output folder exists before saving
os.makedirs(os.path.dirname(figpath), exist_ok=True)
fig.savefig(figpath)
print(figpath)

# Multidata plots

In [None]:
# KL box plot per data set, one panel per site
selected_sites = ["yeast", "GSETB","GSEPN","breast_lumab", "breast_herluma","breast_basher","breast_basluma","breast_baslumb","thyroid","lung","breast","prostate","kidney","colorectal","liver","lung2"]
N = 10
cohort = 1

cols = 4
rows = int(np.ceil(len(selected_sites) / cols))
fig, axes = plt.subplots(rows, cols, figsize=(cols*5,rows*5), sharey=True)
axes = axes.flatten()

for ax, site in zip(axes, selected_sites):
    data_site = sites[site]["data"]
    paramset_site = sites[site]["paramset"]
    # Use the site's own fold-change threshold for the truth file rather
    # than the one of the site selected earlier in the notebook
    lfc_site = sites[site]["lfc"]
    cohort_path_site = f"../data/{site}/{data_site}/{data_site}_N{N}/{data_site}_N{N}_{cohort:04}"
    results_site, _, trials_site = open_bootstrap_results(cohort_path_site, method, paramset_site)
    if results_site is None:
        # Nothing to plot for this site; leave its panel empty
        print(f"{site}: no bootstrap results found, skipping")
        continue
    print(site, trials_site, len(results_site) // trials_site)

    # Cohort and truth tables are loaded once and reused for the box plot
    # and for the reference line
    cohortfile_site = f"{cohort_path_site}/tab.none.{method}.{paramset_site}"
    tab_sub_site = open_table(cohortfile_site)
    truthfile_site = f"../data/{site}/{data_site}/{data_site}.{method}.lfc{lfc_site}.csv"
    tab_truth_site = open_table(truthfile_site)
    kls_truth_site, kls_sub_site = get_kls_lists(tab_truth_site, tab_sub_site, results_site, trials_site)
    kls_df_site = pd.DataFrame(np.array([kls_truth_site,kls_sub_site]).T, columns=["Truth", "Cohort"])

    kls_box(kls_df_site, ax)
    ax.set(title=f"Data: {data_site} N{N} | Trials: {trials_site}")

    # Reference line: get_kl_div between the truth and the cohort logFC values
    kl_truth_vs_sub_site = get_kl_div(tab_truth_site["logFC"], tab_sub_site["logFC"], bins=np.linspace(-4,4,50))
    ax.axhline(kl_truth_vs_sub_site, label="Truth vs Cohort", color="green")

# Hide panels of the grid that have no site assigned
for unused_ax in axes[len(selected_sites):]:
    unused_ax.set_visible(False)

In [None]:
# Collect the per-trial KL divergences of all selected sites in one long table
N = 10
cohort = 1

ksl_df_all_sites = []

for site in selected_sites:
    data_site = sites[site]["data"]
    paramset_site = sites[site]["paramset"]
    # Use the site's own fold-change threshold for the truth file rather
    # than the one of the site selected earlier in the notebook
    lfc_site = sites[site]["lfc"]
    cohort_path_site = f"../data/{site}/{data_site}/{data_site}_N{N}/{data_site}_N{N}_{cohort:04}"
    results_site, _, trials_site = open_bootstrap_results(cohort_path_site, method, paramset_site)
    if results_site is None:
        # Without bootstrap results there is nothing to aggregate
        print(f"{site}: no bootstrap results found, skipping")
        continue
    print(site, trials_site, len(results_site) // trials_site)

    cohortfile_site = f"{cohort_path_site}/tab.none.{method}.{paramset_site}"
    tab_sub_site = open_table(cohortfile_site)
    truthfile_site = f"../data/{site}/{data_site}/{data_site}.{method}.lfc{lfc_site}.csv"
    tab_truth_site = open_table(truthfile_site)
    kls_truth_site, kls_sub_site = get_kls_lists(tab_truth_site, tab_sub_site, results_site, trials_site)
    kls_df_site = pd.DataFrame(np.array([kls_truth_site,kls_sub_site]).T, columns=["Truth", "Cohort"])
    # Long format: one row per (trial, reference) with the KL value
    kls_df_site = kls_df_site.melt(var_name='Reference', value_name='KL')
    kls_df_site["Data"] = data_site
    ksl_df_all_sites.append(kls_df_site)

ksl_df_all_sites = pd.concat(ksl_df_all_sites)

# Replace some data set codes by alternative names
alt_data_names = {"LWPL":"GATB", "LUMAB":"LMAB","HERLUMA":"HRLA","BASHER":"BSHR","BASLUMA":"BSLA","BASLUMB":"BSLB"}
ksl_df_all_sites.replace({"Data": alt_data_names}, inplace=True)

In [None]:
# Box plot of the per-trial KL values of every data set for one reference
fig, ax = plt.subplots(1, 1, figsize=(10,4))
reference = "Cohort"
df_ref = ksl_df_all_sites[ksl_df_all_sites["Reference"]==reference]
sns.boxplot(data=df_ref, x="Data", y="KL", ax=ax)

In [None]:
# Load the combined summary table
combined_all = pd.read_csv("../data/multi/combined_all.csv", index_col=0)
# Drop the rows flagged in "isSynthetic" (assumes a boolean column — TODO confirm)
combined_all = combined_all[~combined_all["isSynthetic"]]

In [None]:
# One row per data set. The unique values are taken in order of first
# appearance so the row order (and with it the legend order of the plots
# built from `dfm`) is the same on every run; iterating a set of strings
# does not guarantee that.
dfm = pd.DataFrame(index=combined_all["Data"].unique())

# Median precision per data set for every replicate number
for N_ in [3,4,5,6,7,8,9,10,12,15]:
    selection = (
        (combined_all["N"] == N_)
        & (combined_all["DEA"] == "DESeq2 Wald")
        & (combined_all["logFC"] == 1)
        & (combined_all["lfc_mode"] == "formal")
    )
    # Build a new indexed frame instead of mutating the filtered selection
    c = combined_all[selection].set_index("Data")
    dfm[f"Prec_N{N_}"] = c["median_prec"]

# Median KL divergence per data set for both references
for ref in ["Truth","Cohort"]:
    df_ref = ksl_df_all_sites[ksl_df_all_sites["Reference"]==ref]
    dfm[f"KL_{ref}"] = df_ref.groupby("Data")["KL"].median()

In [None]:
import scipy.stats as stats

# Median precision against median KL divergence, one panel per replicate number
reference = "Cohort"
kl_col = f"KL_{reference}"
df_ref = ksl_df_all_sites[ksl_df_all_sites["Reference"]==reference]

fig, ax = plt.subplots(1, 2, figsize=(10,4),sharex=True)

for a, n_rep in zip(ax, (3, 10)):
    prec_col = f"Prec_N{n_rep}"

    # Markers per data set plus a regression line (its own markers have size 0)
    sns.scatterplot(data=dfm, y=prec_col, x=kl_col, hue=dfm.index, style=dfm.index, s=200, ax=a)
    sns.regplot(data=dfm, y=prec_col, x=kl_col, scatter_kws={'s':0}, ax=a)

    # Annotate the panel with the Pearson correlation
    r_val, p_val = stats.pearsonr(dfm[kl_col], dfm[prec_col])
    r2_val = r_val ** 2
    a.text(0.05, 0.05, f"r = {r_val:.2f}\nr² = {r2_val:.2f}\np = {p_val:.3g}",
           transform=a.transAxes, fontsize=10, verticalalignment='bottom')

    a.set(ylabel=f"Median Precision (N={n_rep})")
    a.set(xlabel=("Median KL Divergence"))
    a.set(ylim=(-0.05,1.05))

# Keep a single legend, placed next to the right panel
ax[0].legend().remove()
ax[1].legend(bbox_to_anchor=(1,1.06))
fig.suptitle("KL of 25 Bootstrap trials relative to 1 Cohort (N=10)")

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def running_mean(x):
    """Plot the mean of the first i values of ``x`` for every i.

    The overall mean of ``x`` is added as a dashed horizontal line.

    Args:
        x: Sequence of numeric values, e.g. one estimate per bootstrap trial.
    """
    count = len(x)
    trial_numbers = np.arange(1, count + 1)
    cumulative_avg = np.cumsum(x) / trial_numbers

    plt.figure(figsize=(8, 5))
    plt.plot(trial_numbers, cumulative_avg, label="Running Mean", color='b')
    plt.axhline(y=np.mean(x), color='r', linestyle='--', label="Final Mean")

    plt.xlabel("Bootstrap Trial Number (i)")
    plt.ylabel("Mean of First i Trials")
    plt.title("Running Mean of Bootstrap Estimates")
    plt.legend()
    plt.grid()
    plt.show()

# Running mean of the KL values of one data set.
# NOTE(review): `df_ref` is whatever an earlier cell bound last — confirm it
# holds the intended reference before reading the plot.
d=df_ref
d=d[d["Data"]=="LMAB"]

running_mean(d["KL"])

# Misc.

In [None]:
# Re-import the already loaded `misc` module and re-bind `run_dea`
import sys, importlib
importlib.reload(sys.modules["misc"])
from DEA import run_dea