# Bootstrap estimation of DEG reliability

**Scenario:**

- We have an RNA-Seq data set with a relatively small number of replicates (N) and wish to perform DEG analysis.

**Observation 1:**

- SNF2 data set has very high precision for small N (~3)
- LMAB data set has very bad precision even for N much larger than 15
- Most other data sets we tested are between these two extremes.

**Goal:**

Can we bootstrap the replicates to predict how "well-behaved" the data set is? Concretely, we calculate a metric for each bootstrap sample (e.g. the Kullback–Leibler divergence) and check whether the metric correlates with the precision.

- What is the minimum N?
- What is the minimum number of bootstrap trials?

In [None]:
import os
import sys
import glob
import re
import json
import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
from pathlib import Path

datapath = Path("../data")

pd.set_option('display.max_rows', 50)

modpath = Path("../scripts")
sys.path.append(os.path.abspath(modpath))

from misc import open_table

# when using UBELIX on-demand
os.environ['R_HOME'] = '/storage/homefs/pd21v747/.conda/rna-rep/lib/R/'

# SLURM_CPUS_PER_TASK is only defined inside a SLURM allocation; fall back to
# the local CPU count (or 1) so the notebook also starts outside the cluster
# instead of failing with "int() argument must be ... not 'NoneType'".
cores = int(os.environ.get("SLURM_CPUS_PER_TASK") or os.cpu_count() or 1)
print("Cores:", cores)

# Select data

In [None]:
# Site name (directory) -> data set identifier, grouped by experimental design.
cancer_sites_paired = {
    "liver": "LIHC",
    "thyroid": "THCA",
    "lung": "LUAD",
    "lung2": "LUSC",
    "kidney": "KIRC",
    "colorectal": "COAD",
    "breast": "BRCA",
    "prostate": "PRAD",
}

# Misc unpaired only datasets added in revision
misc_unpaired = {
    "GSETB": "LWPL",
    "yeast": "SNF2",
}

misc_custom_design = {
    "GSEPN": "GIPF",
    "breast_basher": "BASHER",    # Basal vs HER2+
    "breast_basluma": "BASLUMA",  # Basal vs Luminal A
    "breast_baslumb": "BASLUMB",  # Basal vs Luminal B
    "breast_herluma": "HERLUMA",  # HER2 vs Luminal A
    "breast_herlumb": "HERLUMB",  # HER2 vs Luminal B
    "breast_lumab": "LUMAB",      # LumA vs LumB
}

# One settings dict per site; confounders stay None unless the design is custom.
sites = {
    name: {"data": code, "confounders": None}
    for name, code in (cancer_sites_paired | misc_unpaired | misc_custom_design).items()
}

# (sites of the group, parameter set, design) -- every group uses lfc = 1.
design_groups = (
    (misc_custom_design, "p2c", "custom"),
    (cancer_sites_paired, "p2", "paired"),
    (misc_unpaired, "p3", "unpaired"),
)

for group_sites, group_paramset, group_design in design_groups:
    for site in group_sites:
        if group_design == "custom":
            # Confounders to control for in the custom design matrices
            if site in ["GSEPN"]:
                sites[site]["confounders"] = ["age", "ever_smoked", "Sex"]
            else:
                sites[site]["confounders"] = ["TumorPurity", "days_to_birth"]
        sites[site]["paramset"] = group_paramset
        sites[site]["design"] = group_design
        sites[site]["lfc"] = 1

# Data set identifier -> short name used in the long-format tables
alt_data_names = {
    "LWPL": "GATB",
    "LUMAB": "LMAB",
    "HERLUMA": "HRLA",
    "HERLUMB": "HRLB",
    "BASHER": "BSHR",
    "BASLUMA": "BSLA",
    "BASLUMB": "BSLB",
}

In [None]:
def perpare(selected_site,verbose=False):
    """Load the count table of one site and, if it has confounders, its metadata.

    The site settings are looked up in the module-level ``sites`` dict. When
    confounders are configured, the ``.meta.csv`` file next to the count table
    is read, reordered to the count table's columns, reduced to the confounder
    columns plus "Condition" and written back as ``.meta.sub.csv``.

    Parameters
    ----------
    selected_site : key of ``sites``.
    verbose : if True, display the head and shape of the loaded tables
        (uses the notebook's ``display``).

    Returns
    -------
    df, meta, data, design, lfc, paramset
        ``meta`` is None for sites without confounders.
    """
    site_cfg = sites[selected_site]
    data_name = site_cfg["data"]
    design = site_cfg["design"]
    lfc = site_cfg["lfc"]
    paramset = site_cfg["paramset"]

    print("Data:", data_name, "\nDesign:", design, "\nFormal fold change threshold:", lfc)

    dffile = f"../data/{selected_site}/{data_name}/{data_name}.csv"
    confounders = site_cfg["confounders"]
    metafile = dffile.replace(".csv",".meta.csv") if confounders else None

    # Data set name as encoded in the file name
    data = dffile.split(".csv")[0].split("/")[-1]

    # Full, unsubsampled data set (reduced to a cohort later on)
    df = pd.read_csv(dffile, index_col=0)

    # Optional control experiment: shuffle the column labels
    shuffle = False
    if shuffle:
        shuffled_labels = list(df.columns.values)
        np.random.shuffle(shuffled_labels)
        df.columns = shuffled_labels
        df = df[sorted(df.columns)]

    meta = None
    if metafile:
        # A metafile is only set when confounders exist, so the subset below
        # is always written whenever metadata is loaded.
        meta = pd.read_csv(metafile, index_col=0).loc[df.columns]
        meta = meta[confounders + ["Condition"]]
        metafile = metafile.replace(".csv",".sub.csv")
        meta.index.name = "Sample"
        meta.to_csv(metafile)

        if verbose:
            display(meta.head())
            print(meta.shape)

    if verbose:
        display(df.head())
        print(df.shape)

    return df, meta, data, design, lfc, paramset

# Load the site used by the single-cohort cells below; verbose=True also
# displays the head and shape of the loaded tables.
selected_site = "GSEPN"
df, meta, data, design, lfc, paramset = perpare(selected_site=selected_site,verbose=True)

# Run bootstrap trials

We will re-use the already subsampled cohorts from the main experiment.

In [None]:
from DEA import run_dea

def prepare2(df,selected_site,data,method,lfc,design,N,cohort,paramset):
    """Collect everything needed to bootstrap one already-subsampled cohort.

    Parameters
    ----------
    df : full count table; the cohort's columns are selected from it.
    selected_site, data : site directory and data set name used to build paths.
    method, paramset : DEA method and parameter set encoded in the file names.
    lfc : fold-change threshold encoded in the truth file name.
    design : accepted for signature compatibility; not used here.
    N, cohort : cohort size and cohort number (zero-padded to 4 digits).

    Returns
    -------
    df_sub, cohort_path, tab_truth, tab_sub
        Counts restricted to the cohort samples, the cohort directory, the
        truth table and the cohort's own DEA table.
    """
    cohort_path = f"../data/{selected_site}/{data}/{data}_N{N}/{data}_N{N}_{cohort:04}"

    # DEA result of the subsampled cohort
    tab_sub = open_table(f"{cohort_path}/tab.none.{method}.{paramset}")

    # DEA result on the full data set, used as the method's truth
    tab_truth = open_table(f"../data/{selected_site}/{data}/{data}.{method}.lfc{lfc}.csv")

    # The config is only read, so open it read-only ("r+" additionally
    # required write permission on the file).
    with open(f"{cohort_path}/config.json", "r") as f:
        cohort_samples = json.load(f)["samples_i"]
    df_sub = df[cohort_samples]

    print(cohort_path)
    return df_sub, cohort_path, tab_truth, tab_sub


def open_bootstrap_results(cohort_path, method, paramset, return_df=True):
    """Locate (and optionally load) the bagged bootstrap results of a cohort.

    Result files are named ``tab.bagged.trials<K>.<method>.<paramset>.csv``
    where ``K`` is the number of trials stored in the file.

    Parameters
    ----------
    cohort_path : cohort directory.
    method, paramset : DEA method and parameter set encoded in the file name.
    return_df : if False, never load the table (only path and trial count).

    Returns
    -------
    (results, results_file, existing_trials)
        ``results`` is None when nothing is loaded. When no file exists, the
        returned path is the ``trials0`` file name and ``existing_trials`` is 0.
    """
    # Escape the directory so glob metacharacters in the path are literal.
    pattern = f"{glob.escape(str(cohort_path))}/tab.bagged.trials*.{method}.{paramset}.csv"
    trials_in_name = re.compile(r"\.trials(\d+)\.")

    candidates = []
    for path in glob.glob(pattern):
        # Parse the file name only, so directory names cannot confuse the match.
        found = trials_in_name.search(os.path.basename(path))
        if found:
            candidates.append((int(found.group(1)), path))

    if candidates:
        # An interrupted run can leave two files behind (old and incremented);
        # glob order is arbitrary, so explicitly prefer the most complete one.
        existing_trials, results_file = max(candidates, key=lambda item: item[0])
        if return_df and existing_trials > 0:
            return open_table(results_file), results_file, existing_trials
        return None, results_file, existing_trials

    print("No bootstrap results file found")
    results_file = f"{cohort_path}/tab.bagged.trials0.{method}.{paramset}.csv"
    return None, results_file, 0

def _append_log(logfile, message):
    """Append one line to the plain-text log file."""
    with open(logfile, "a") as handle:
        handle.write(f"{message}\n")


def _dea_kwargs(method, lfc, design_sub):
    """Build the method-specific keyword arguments forwarded to ``run_dea``."""
    cols_to_keep = ["logFC", "logCPM", "FDR"]
    if method in ("edgerqlf", "edgerlrt"):
        # No "verbose" key here: ``run_dea`` is already called with an explicit
        # verbose=False, and passing the same keyword twice raises TypeError.
        return {"filter_expr": True, "cols_to_keep": cols_to_keep, "lfc": lfc,
                "design": design_sub, "check_gof": False}
    if method == "deseq2":
        return {"cols_to_keep": cols_to_keep, "lfc": lfc, "design": design_sub}
    raise ValueError(f"Unknown DEA method: {method!r}")


def bootstrap_data(df, N, lfc, design, method, paramset, cohort_path, trials, meta=None, logfile=None):
    """Run bootstrap DEA trials for one cohort and accumulate them in one file.

    For every missing trial, N columns are drawn with replacement from the
    first half of ``df``'s columns and N from the second half, the DEA is run
    on the resampled table, and the result is appended to the cohort's bagged
    results file, whose name carries the number of stored trials.

    Parameters
    ----------
    df : count table of the cohort; the first half of the columns is treated
        as one condition and the second half as the other.
    N : number of samples drawn per condition.
    lfc : fold-change threshold passed to the DEA.
    design : "paired", "unpaired" or "custom" ("custom" requires ``meta``).
    method : "edgerqlf", "edgerlrt" or "deseq2".
    paramset : parameter set name encoded in the results file name.
    cohort_path : cohort directory holding the bagged results file.
    trials : total number of trials wanted.
    meta : sample metadata indexed by sample name (custom design only).
    logfile : optional text file receiving progress lines.

    Returns
    -------
    "returned_early" if enough trials already exist, "failed" if a trial could
    not be computed within the allowed attempts (trials finished so far stay
    on disk), otherwise None.

    Raises
    ------
    ValueError
        For an unknown ``method`` or ``design``, or a custom design without
        ``meta``.
    """
    if design not in ("custom", "paired", "unpaired"):
        raise ValueError(f"Unknown design: {design!r}")
    if design == "custom" and meta is None:
        raise ValueError("design='custom' requires the sample metadata (meta)")
    _dea_kwargs(method, lfc, design)  # fail fast on an unknown method

    # If it exists, read the results file the trial results are appended to
    results, results_file, existing_trials = open_bootstrap_results(cohort_path, method, paramset)

    if results is None:
        print("Initializing resultsfile")
        results_file = results_file.replace('trials*','trials0')
        # Create the (empty) file without truncating an existing one
        with open(results_file, "a"):
            pass
        existing_trials = 0

    if existing_trials >= trials:
        print(f"Already have {existing_trials} trials, returning")
        return "returned_early"

    # Temporary files are named after the cohort to avoid conflicts between
    # parallel workers; they are overwritten in every trial.
    cohort_name = cohort_path.split('/')[-1]
    outfile_dea = f"../data/tmp/bagg.tmp.{cohort_name}.csv"
    design_file = f"../data/tmp/design.{cohort_name}.csv"

    reps = len(df.columns) // 2
    max_attempts = 5  # the DEA can fail for small N

    for trial in range(existing_trials+1, trials+1):

        np.random.seed(trial)  # in case of multiprocessing

        # Remove the previous trial's output so it can never be read again
        # and stored as if it were this trial's result.
        try:
            os.remove(outfile_dea)
        except FileNotFoundError:
            pass

        for attempt in range(1, max_attempts + 1):
            bootstrap_samples_N = np.random.choice(df.columns[:reps], N)
            bootstrap_samples_T = np.random.choice(df.columns[reps:], N)
            bs = list(bootstrap_samples_N)+list(bootstrap_samples_T)
            print(f"Running trial: {trial}, samples: {bs}")

            # Sampling with replacement repeats names; a positional suffix
            # makes every column (and metadata row) unique.
            unique_names = [f"{col}{i}" for i, col in enumerate(bs)]
            df_bag = df[bs].copy()
            df_bag.columns = unique_names

            if design == "custom":
                meta_sub = meta.loc[bs].copy()
                meta_sub.index = unique_names
                meta_sub.to_csv(design_file)
                design_sub = design_file
            else:
                design_sub = design

            try:
                run_dea(df_bag, outfile_dea, method, True, verbose=False,
                        **_dea_kwargs(method, lfc, design_sub))
            except Exception as err:
                print(f"Trial {trial}: attempt {attempt}/{max_attempts} failed ({err!r})")
                continue

            if attempt > 2 and logfile is not None:
                _append_log(logfile, f"{cohort_path} N{N} {paramset} attempts: {attempt}")
            break
        else:
            # Every attempt failed: stop here instead of appending stale data.
            message = f"{cohort_path} N{N} {paramset} trial {trial} failed after {max_attempts} attempts"
            print(message)
            if logfile is not None:
                _append_log(logfile, message)
            return "failed"

        trial_results = open_table(outfile_dea)
        results = trial_results if results is None else pd.concat([results, trial_results])

        # Increment the trial count in the file name (not in the directory
        # part), save the new bagged result, then remove the old file.
        folder, filename = os.path.split(results_file)
        filename = re.sub(r"trials(\d+)", lambda m: f"trials{int(m.group(1)) + 1}", filename)
        results_file_p1 = os.path.join(folder, filename)
        results.to_csv(results_file_p1)
        try:
            os.remove(results_file)
        except FileNotFoundError:
            pass
        results_file = results_file_p1

    if logfile is not None:
        now = datetime.datetime.now()
        _append_log(logfile, f"{cohort_path} N{N} {paramset} trials: {trials} {now}")

In [None]:
# Run single cohort

# Select subsample size (samples per condition)
N = 5

# Select one of the 100 cohorts already subsampled
cohort = 9

method = "deseq2" #  one in ["deseq2", "edgerqlf", "edgerlrt"]

# Load the cohort's counts together with its DEA table and the truth table
df_sub, cohort_path, tab_truth, tab_sub = prepare2(df,selected_site,data,method,lfc,design,N,cohort,paramset)

# Run bootstrap trials until the cohort's results file holds 25 of them
bootstrap_data(df_sub, N, lfc, design, method, paramset, cohort_path, trials=25, meta=meta)

In [None]:
# Run multiple cohorts in parallel

import multiprocessing as mp

cohorts = 10  # cohorts 1..cohorts are processed
all_N = [5]#,10]
trials = 25  # bootstrap trials wanted per cohort
method = "deseq2" #  one in ["deseq2", "edgerqlf", "edgerlrt"]
logfile = "../data/multi/log.bag.txt"
# SLURM_CPUS_PER_TASK is only defined inside a SLURM allocation; fall back to
# the local CPU count (or 1) instead of crashing on int(None).
cores = int(os.environ.get("SLURM_CPUS_PER_TASK") or os.cpu_count() or 1)
print("Cores:", cores)

def bootstrap_multi(df,selected_site,data,method,lfc,design,N,paramset,trials,meta,logfile,cohort):
    """Bootstrap a single cohort; the worker function of the process pool.

    ``cohort`` is the last parameter so the shared arguments can be packed
    once and the cohort number appended per task. Nothing is loaded when the
    cohort's results file already holds at least ``trials`` trials.
    """
    cohort_dir = f"../data/{selected_site}/{data}/{data}_N{N}/{data}_N{N}_{cohort:04}"

    # Only the trial count is needed here, so skip loading the table
    existing_trials = open_bootstrap_results(cohort_dir, method, paramset, return_df=False)[2]
    if existing_trials >= trials:
        print(f"{trials} already exist, returning...")
        return

    prepared = prepare2(df,selected_site,data,method,lfc,design,N,cohort,paramset)
    df_sub, cohort_path = prepared[0], prepared[1]
    bootstrap_data(df_sub, N, lfc, design, method, paramset, cohort_path, trials=trials, meta=meta, logfile=logfile)
      
# Only the site named in the filter below is processed; edit it to run others.
for selected_site in sites:
    if selected_site != "GSEPN": continue
    df, meta, data, design, lfc, paramset = perpare(selected_site=selected_site)
    for N in all_N:
        # One task per cohort (1..cohorts), spread over `cores` worker processes
        with mp.Pool(processes=cores) as pool:
            # Arguments shared by all tasks; the cohort number is appended per task
            args = df,selected_site,data,method,lfc,design,N,paramset,trials,meta,logfile
            pool.starmap(bootstrap_multi, [(*args, i) for i in range(1,1+cohorts)])

# Clean up
# NOTE(review): this removes every file in ../data/tmp, not only the files of
# this run -- confirm that no other job uses the directory concurrently.
os.system(f"rm ../data/tmp/*")

In [None]:
# Reload the bagged results of the cohort selected above. Note that this
# rebinds `trials` to the number of trials stored in the file.
results, _, trials = open_bootstrap_results(cohort_path, method, paramset)
# Rows per trial
genes = len(results) // trials
print(genes)
# Sanity check: every trial has as many rows as the cohort's count table
assert genes == len(df_sub)
results

# Inspect results

## Divergence

Calculate Kullback-Leibler divergence between bootstrap samples, subsampled cohort, and ground truth

In [None]:
from misc import get_kl_div
from scipy.stats import spearmanr

def kls_box(kls_df, ax):
    """Draw a box plot of ``kls_df`` on ``ax`` with the points overlaid in black."""
    sns.boxplot(kls_df, ax=ax)
    sns.stripplot(kls_df, color="black", ax=ax)
    ax.set_ylabel("KL Divergence")

def get_kls_lists(tab_truth, tab_sub, results, trials, ax=None):
    """Compute the KL divergence of each bootstrap trial against both references.

    ``results`` holds the trials stacked on top of each other; it is cut into
    ``trials`` consecutive slices of (nearly) equal length. For each slice,
    ``get_kl_div`` is evaluated on the logFC values against the cohort table
    and against the truth table, using 50 bin edges between -4 and 4. If
    ``ax`` is given, each slice's logFC density is also drawn on it.

    Returns
    -------
    (kls_truth, kls_sub) : two lists with one value per trial.
    """
    n_rows = len(results)
    kls_truth = []
    kls_sub = []

    for trial in range(trials):
        start = trial * n_rows // trials
        stop = (trial + 1) * n_rows // trials
        bag = results.iloc[start:stop]["logFC"]

        if ax:
            # Label only the first curve so the legend has a single entry
            curve_label = "Bootstrapped" if trial==0 else None
            sns.kdeplot(bag,color="grey", ax=ax, label=curve_label)

        kls_sub.append(get_kl_div(tab_sub["logFC"], bag, bins=np.linspace(-4,4,50)))
        kls_truth.append(get_kl_div(tab_truth["logFC"], bag, bins=np.linspace(-4,4,50)))

    return kls_truth, kls_sub

def get_spearman_lists(tab_truth, tab_sub, results, trials):
    """Spearman correlation of each bootstrap trial's logFC with both references.

    ``results`` holds the trials stacked on top of each other and is cut into
    ``trials`` slices of equal length. Values are paired by position, so the
    reference tables must list the same rows in the same order as each trial.

    Returns
    -------
    (spearman_truth, spearman_sub) : two lists with one coefficient per trial,
    or two empty lists when ``results`` cannot be split into ``trials`` equal
    parts (including ``trials <= 0`` and empty ``results``).
    """
    n_rows = len(results)

    # The split is only meaningful if the rows divide evenly among the trials
    # (the row count must be a multiple of `trials`, not of rows-per-trial).
    if trials <= 0 or n_rows == 0 or n_rows % trials != 0:
        print("Unequal lengths, Spearman not calculated")
        return [], []

    genes = n_rows // trials
    tab_sub_rank = tab_sub["logFC"].rank()
    tab_truth_rank = tab_truth["logFC"].rank()

    spearman_sub, spearman_truth = [], []
    for trial in range(trials):
        bag_rank = results.iloc[trial * genes:(trial + 1) * genes]["logFC"].rank()
        spearman_sub.append(spearmanr(tab_sub_rank, bag_rank).statistic)
        spearman_truth.append(spearmanr(tab_truth_rank, bag_rank).statistic)
    return spearman_truth, spearman_sub

def results_logfc_wide(results, trials):
    """Reshape the stacked logFC values into one row per gene, one column per trial.

    ``results`` holds ``trials`` blocks of equal length stacked on top of each
    other; the index of the first block labels the rows of the wide table.
    """
    logfc = results["logFC"]
    n_genes = len(logfc) // trials
    # Each block of n_genes consecutive values becomes one column
    per_trial = np.array(logfc).reshape(trials, n_genes).T
    return pd.DataFrame(per_trial, index=logfc.index[:n_genes])

In [None]:
fig, ax = plt.subplots(1,2,figsize=(10,5))

# Left panel: logFC density of every bootstrap trial (grey), truth and cohort
kls_truth, kls_sub = get_kls_lists(tab_truth, tab_sub, results, trials, ax=ax[0])

sns.kdeplot(tab_truth["logFC"], alpha=1, ax=ax[0], label="Truth",color="cyan")
sns.kdeplot(tab_sub["logFC"], alpha=1, ls="--", color="red",ax=ax[0],label=f"N{N} Cohort {cohort}")

# Right panel: per-trial KL divergence against each reference
kls_df = pd.DataFrame(np.array([kls_truth,kls_sub]).T, columns=["Truth", "Cohort"])

kls_box(kls_df, ax=ax[1])
# Horizontal line: KL divergence between the truth and the cohort itself
kl_truth_vs_sub = get_kl_div(tab_truth["logFC"], tab_sub["logFC"], bins=np.linspace(-4,4,50))
ax[1].axhline(kl_truth_vs_sub, label="Truth vs Cohort", color="green")  # label typo "Truh" fixed

for a in ax:
    a.legend(loc="best")

fig.suptitle(f"Data: {data} | Trials: {trials}")
fig.tight_layout()
figpath = f"../figures/boot.kl.{data}.N{N}.pdf"
#fig.savefig(figpath)
#print(figpath)

# Multidata plots

In [None]:
# selected_sites = ["yeast", "GSETB","GSEPN","breast_lumab", "breast_herluma","breast_herlumb","breast_basher","breast_basluma","breast_baslumb","thyroid","lung","breast","prostate","kidney","colorectal","liver","lung2"]
# N = 5
# cohort = 1
# cols = 4
# rows = len(selected_sites) / cols
# rows = int(np.ceil(rows))
# fig, axes = plt.subplots(rows, cols, figsize=(cols*5,rows*5), sharey=True)
# axes = axes.flatten()

# for ax, site in zip(axes, selected_sites):
#     data_site = sites[site]['data']
#     paramset_site = sites[site]["paramset"]
#     cohort_path_site = f"../data/{site}/{data_site}/{data_site}_N{N}/{data_site}_N{N}_{cohort:04}"
#     results_site, _, trials_site = open_bootstrap_results(cohort_path_site, method, paramset_site)
#     if results_site is None: continue
#     print(site, trials_site, len(results_site) // trials_site)

#     cohortfile_site = f"{cohort_path_site}/tab.none.{method}.{paramset_site}"
#     tab_sub_site = open_table(cohortfile_site)
#     truthfile_site = f"../data/{site}/{data_site}/{data_site}.{method}.lfc{lfc}.csv"
#     tab_truth_site = open_table(truthfile_site)
#     kls_truth_site, kls_sub_site = get_kls_lists(tab_truth_site, tab_sub_site, results_site, trials_site)
#     kls_df_site = pd.DataFrame(np.array([kls_truth_site,kls_sub_site]).T, columns=["Truth", "Cohort"])
    
#     kls_box(kls_df_site, ax)
#     ax.set(title=f"Data: {data_site} N{N} | Trials: {trials_site}")

#     truthfile_site = f"../data/{site}/{data_site}/{data_site}.{method}.lfc{lfc}.csv"
#     tab_truth_site = open_table(truthfile_site)

#     cohortfile_site = f"{cohort_path_site}/tab.none.{method}.{sites[site]["paramset"]}"
#     tab_sub_site = open_table(cohortfile_site)

#     kl_truth_vs_sub_site = get_kl_div(tab_truth_site["logFC"], tab_sub_site["logFC"], bins=np.linspace(-4,4,50))
#     ax.axhline(kl_truth_vs_sub_site, label="Truh vs Cohort", color="green")

In [None]:
def load_long_df(all_N, cohorts):
    """Collect the per-trial KL and Spearman metrics of all sites in long format.

    For every site in ``sites``, every N in ``all_N`` and every cohort in
    ``cohorts`` that has bootstrap results on disk, the metrics are computed
    against the truth table and against the cohort's own DEA table. The DEA
    method is taken from the notebook-level ``method`` variable.

    Returns
    -------
    DataFrame with the columns Reference ("Truth"/"Cohort"), KL, Data, N,
    Cohort and Spearman; data set names are mapped through ``alt_data_names``.
    The frame is empty when no bootstrap results are found.
    """
    columns = ["Reference", "KL", "Data", "N", "Cohort", "Spearman"]
    frames = []

    for site, site_cfg in sites.items():
        data_site = site_cfg["data"]
        paramset_site = site_cfg["paramset"]
        # Use the site's own threshold rather than whatever global `lfc` was
        # left behind by the last prepared site.
        truthfile_site = f"../data/{site}/{data_site}/{data_site}.{method}.lfc{site_cfg['lfc']}.csv"
        tab_truth_site = None  # loaded once per site, and only if needed

        for N in all_N:
            for cohort in cohorts:
                cohort_path_site = f"../data/{site}/{data_site}/{data_site}_N{N}/{data_site}_N{N}_{cohort:04}"
                results_site, _, trials_site = open_bootstrap_results(cohort_path_site, method, paramset_site)
                if results_site is None:
                    continue
                print(site, N, "cohort", cohort, "trials:", trials_site, len(results_site) // trials_site)

                tab_sub_site = open_table(f"{cohort_path_site}/tab.none.{method}.{paramset_site}")
                if tab_truth_site is None:
                    tab_truth_site = open_table(truthfile_site)

                # Compute metrics
                kls_truth_site, kls_sub_site = get_kls_lists(tab_truth_site, tab_sub_site, results_site, trials_site)
                spear_truth_site, spear_sub_site = get_spearman_lists(tab_truth_site, tab_sub_site, results_site, trials_site)

                kls_df_site = pd.DataFrame(np.array([kls_truth_site,kls_sub_site]).T, columns=["Truth", "Cohort"])
                spear_df_site = pd.DataFrame(np.array([spear_truth_site,spear_sub_site]).T, columns=["Truth", "Cohort"])
                kls_df_site = kls_df_site.melt(var_name='Reference', value_name='KL')
                spear_df_site = spear_df_site.melt(var_name='Reference', value_name='Spearman')
                kls_df_site["Data"] = data_site
                kls_df_site["N"] = N
                kls_df_site["Cohort"] = cohort
                # Both melted frames share the same row order, so the Spearman
                # column lines up; it is NaN when no Spearman was calculated.
                kls_df_site = pd.concat([kls_df_site, spear_df_site["Spearman"]], axis=1)
                frames.append(kls_df_site)

    if not frames:
        # pd.concat raises on an empty list; return an empty table instead
        return pd.DataFrame(columns=columns)

    df_long = pd.concat(frames)
    df_long["Data"] = df_long["Data"].replace(alt_data_names)
    return df_long

# reload=True recomputes the long table from the bootstrap files on disk;
# reload=False reads the previously saved CSV instead.
reload = True
all_N = [5]#,10]
cohorts = list(range(1,11))

if reload:
    print("Reloading df")
    df_long = load_long_df(all_N, cohorts)
    # Saving is disabled; enable the next line to refresh the cached CSV
    #df_long.to_csv("../data/multi/df_boot_long.csv")
else:
    df_long = pd.read_csv("../data/multi/df_boot_long.csv", index_col=0)
    print("Loaded saved df")

In [None]:
# site = "GSEPN"
# data_site = sites[site]["data"]
# paramset_site = sites[site]["paramset"]
# cohort_path_site = f"../data/{site}/{data_site}/{data_site}_N{N}/{data_site}_N{N}_{cohort:04}"
# cohortfile_site = f"{cohort_path_site}/tab.none.{method}.{paramset_site}"
# tab_sub_site = open_table(cohortfile_site)
# print(len(tab_sub_site))
# results_site, _, trials_site = open_bootstrap_results(cohort_path_site, method, paramset_site)
# len(results_site) // trials_site

In [None]:
# Check all for duplicates
# Identical metric rows within one cohort are counted as duplicated trials.
reference = "Cohort"
all_N = [5]

# Set to True to delete the results file of every cohort with duplicates
rm_duplicates = False

for site in sites:
    data_site = sites[site]["data"]
    # df_long stores the short display names, the files use the original names
    data_alt = alt_data_names[data_site] if data_site in alt_data_names else data_site

    paramset_site = sites[site]["paramset"]
    
    for N in all_N:
        for cohort in cohorts:

            cohort_path_site = f"../data/{site}/{data_site}/{data_site}_N{N}/{data_site}_N{N}_{cohort:04}"
            cohortfile_site = f"{cohort_path_site}/tab.none.{method}.{paramset_site}"
            # NOTE(review): tab_sub_site is not used below -- presumably a leftover
            tab_sub_site = open_table(cohortfile_site)
            _, results_file_site, _ = open_bootstrap_results(cohort_path_site, method, paramset_site, return_df=False)
        
            # Metric rows of this data set / reference / N / cohort
            k = df_long
            k = k[(k["Data"]==data_alt) & (k["Reference"]==reference) & (k["N"]==N) & (k["Cohort"]==cohort)]
            dupes = k.duplicated().sum()
            if dupes > 0:
                print(data_site, f"N{N:<2} Cohort {cohort}:", dupes)
                print(results_file_site)
                if rm_duplicates:
                    os.system(f"rm {results_file_site}")

In [None]:
# Check individual for duplicates
site = "GSEPN"
cohort = 3
N = 5

data_site = sites[site]['data']
paramset_site = sites[site]["paramset"]

# df_long stores the short display names, while the files on disk use the
# original data set name -- keep the two apart so the path below stays valid
# for renamed data sets.
data_alt = alt_data_names.get(data_site, data_site)
k = df_long
k = k[(k["Data"]==data_alt) & (k["Reference"]==reference) & (k["N"]==N)]
print(data_alt, f"N{N:<2}", "Duplicates:", k.duplicated().sum())
cohort_path_site = f"../data/{site}/{data_site}/{data_site}_N{N}/{data_site}_N{N}_{cohort:04}"
results_site, _, trials_site = open_bootstrap_results(cohort_path_site, method, paramset_site)
# Reshape with the trial count of the file just loaded, not a global `trials`
results_logfc_wide(results_site, trials_site)

In [None]:
# KL divergence per data set for a single reference, cohort size and cohort
reference = "Cohort"
N = 5
cohort = 5

fig, ax = plt.subplots(1, 1, figsize=(10,4))
k = df_long
df_ref = k[(k["Reference"]==reference) & (k["N"]==N) & (k["Cohort"]==cohort)]
sns.boxplot(data=df_ref, x="Data", y="KL", ax=ax)

## Metrics

Create wide df with different metrics (precision, KL, spearman)

In [None]:
# Load the combined results table and keep only the rows not flagged as synthetic
combined_all = pd.read_csv("../data/multi/combined_all.csv", index_col=0)
combined_all = combined_all[~combined_all["isSynthetic"]]

In [None]:
all_N = [5,10]
# Wide table: one row per data set, one column per metric
dfm = pd.DataFrame(index = list(set(combined_all["Data"])))
for N_ in all_N:
    c = combined_all
    c = c[(c["N"]==N_)&(c["DEA"]=="DESeq2 Wald")&(c["logFC"]==1)&(c["lfc_mode"]=="formal")]
    # set_index returns a new frame; the in-place variant would modify a
    # filtered slice of combined_all and trigger SettingWithCopyWarning.
    c = c.set_index("Data")
    dfm[f"Prec_N{N_}"] = c["median_prec"]

k = df_long
for ref in ["Truth","Cohort"]:
    for N in all_N:
        df_ref = k[(k["Reference"]==ref) & (k["N"]==N)]
        # Group once and reuse for all four summary columns
        by_data = df_ref.groupby("Data")
        dfm[f"KL_{ref}_N{N}_mean"] = by_data["KL"].mean()
        dfm[f"KL_{ref}_N{N}_std"] = by_data["KL"].std()
        dfm[f"Spear_{ref}_N{N}_mean"] = by_data["Spearman"].mean()
        dfm[f"Spear_{ref}_N{N}_std"] = by_data["Spearman"].std()

In [None]:
# KL divergence per cohort for one data set.
# Note: this rebinds the notebook-level `data` variable.
data = "GIPF"
reference = "Cohort"
N = 5
k = df_long
k = k[(k["Data"]==data) & (k["Reference"]==reference) & (k["N"]==N)]
sns.boxplot(data=k, x="Cohort",y="KL")
plt.title(f"{data} N{N}")

In [None]:
import scipy.stats as stats

reference = "Cohort"
metric = "mean"

fig, ax = plt.subplots(1, 2, figsize=(10,4),sharex=False)
# `ksl_df_all_sites` is not defined anywhere in this notebook; the long-format
# metrics table is `df_long`.
df_ref = df_long[df_long["Reference"]==reference]
sns.scatterplot(data=dfm, y="Prec_N5", x=f"KL_{reference}_N5_{metric}", hue=dfm.index, style=dfm.index, s=200, ax=ax[0])
sns.regplot(data=dfm, y="Prec_N5", x=f"KL_{reference}_N5_{metric}", ax=ax[0], scatter_kws={'s':0})
    
sns.scatterplot(data=dfm, y="Prec_N10", x=f"KL_{reference}_N10_{metric}", hue=dfm.index, style=dfm.index, s=200, ax=ax[1])
sns.regplot(data=dfm, y="Prec_N10", x=f"KL_{reference}_N10_{metric}", scatter_kws={'s':0}, ax=ax[1])

for N, a in zip([5,10], ax):
    # Correlate each panel's own x and y columns (the N=5 panel previously
    # reported the correlation with the N=10 KL column).
    r_val, p_val = stats.pearsonr(dfm[f"KL_{reference}_N{N}_{metric}"], dfm[f"Prec_N{N}"])
    r2_val = r_val ** 2
    a.text(0.05, 0.05, f"r = {r_val:.2f}\nr² = {r2_val:.2f}\np = {p_val:.3g}", 
           transform=a.transAxes, fontsize=10, verticalalignment='bottom')

# Both panels share the same legend entries; keep only the right-hand one
ax[0].legend().remove()
ax[1].legend(bbox_to_anchor=(1,1.06))
ax[0].set(ylabel="Median Precision (N=5)")
ax[1].set(ylabel="Median Precision (N=10)")
for a in ax:
    a.set(xlabel=(f"{metric.split('_')[-1].capitalize()} KL Divergence"))
    a.set(ylim=(-0.05,1.05))
fig.suptitle("KL of 25 Bootstrap trials relative to 1 Cohort")

In [None]:
all_N = [5,10]
cohorts = list(range(1,11))

site = "GSEPN"
N = 5
cohort = 1

data_site = sites[site]['data']
paramset_site = sites[site]["paramset"]
cohort_path_site = f"../data/{site}/{data_site}/{data_site}_N{N}/{data_site}_N{N}_{cohort:04}"
results_site, _, trials_site = open_bootstrap_results(cohort_path_site, method, paramset_site)

cohortfile_site = f"{cohort_path_site}/tab.none.{method}.{paramset_site}"
tab_sub_site = open_table(cohortfile_site)
truthfile_site = f"../data/{site}/{data_site}/{data_site}.{method}.lfc{lfc}.csv"
tab_truth_site = open_table(truthfile_site)

# Use the trial count of the file just loaded; the global `trials` belongs to
# whichever cohort was loaded in an earlier cell.
genes = len(results_site)//trials_site
print("Genes:", genes)

# Every index label must occur exactly once per trial
assert np.all(results_site.index.value_counts() == trials_site)
assert len(tab_sub_site) == genes

In [None]:
from itertools import combinations
from scipy.stats import spearmanr

def get_spearman(aa,bb):
    """Spearman correlation of two Series over their shared, non-missing labels.

    Missing values are dropped from each Series first; only index labels
    present in both remain. Returns the result object of
    ``scipy.stats.spearmanr`` (statistic and p-value).
    """
    aa_valid, bb_valid = aa.dropna(), bb.dropna()
    shared = aa_valid.index.intersection(bb_valid.index)
    ranks_a = aa_valid.loc[shared].rank()
    ranks_b = bb_valid.loc[shared].rank()
    return spearmanr(ranks_a, ranks_b)
    
def get_pairwise_spearman(results_wide):
    """Spearman correlation for every pair of columns of ``results_wide``.

    Returns a DataFrame with one row per column pair and the columns
    "Statistic" and "pval".
    """
    rows = [
        np.array(get_spearman(results_wide[left], results_wide[right]))
        for left, right in combinations(results_wide.columns, 2)
    ]
    return pd.DataFrame(rows, columns=["Statistic","pval"])

In [None]:
# Reshape with the trial count of the loaded file, not the global `trials`
results_wide = results_logfc_wide(results_site, trials_site)
spearmans = get_pairwise_spearman(results_wide)

fig, ax = plt.subplots(1,2,figsize=(8,4))
# Left: distribution of the pairwise correlations between trials
spearmans["Statistic"].hist(ax=ax[0])
# Single quotes inside the f-string: reusing the outer quote type is a syntax
# error before Python 3.12.
ax[0].set_title(f"Mean spearman: {spearmans['Statistic'].mean():.2f}")

# Right: rank-rank scatter of the first two trials
a=results_wide[0]
b=results_wide[1]
ax[1].scatter(a.rank(), b.rank(), alpha=0.05)
ax[1].set_title(f"Spearman: {get_spearman(a, b).statistic:.2f}")

In [None]:
def get_iqr(tab, cutoff, CI=0.5):
    """Per-row median and central quantile band, with rows straddling the cutoff.

    Parameters
    ----------
    tab : DataFrame with one row per gene and one column per bootstrap trial.
    cutoff : absolute logFC threshold; previously overwritten with 1 inside
        the function, it is now honoured.
    CI : width of the central band as a fraction (0.5 gives the 25%-75% band;
        not really a confidence interval).

    Returns
    -------
    up, low : upper and lower quantile per row, ordered like ``mean_lfc``.
    crossing : medians of the rows whose band contains +cutoff or -cutoff.
    cross_ind : positions of those rows in the sorted order.
    mean_pass_cutoff : medians whose absolute value exceeds ``cutoff``.
    x : ``range`` over the number of rows.
    mean_lfc : row medians sorted in descending order (the name is historic).
    """
    mean_lfc = tab.median(axis=1).sort_values(ascending=False)
    x = range(len(mean_lfc))

    up_lim = (1+CI)/2
    low_lim = (1-CI)/2

    up = tab.quantile(up_lim,axis=1).loc[mean_lfc.index]
    low = tab.quantile(low_lim,axis=1).loc[mean_lfc.index]

    # A row "crosses" when its band contains one of the two thresholds
    straddles_neg = (up > -cutoff) & (low < -cutoff)
    straddles_pos = (up > cutoff) & (low < cutoff)
    crossing = mean_lfc[straddles_neg | straddles_pos]
    cross_ind = np.array(x)[mean_lfc.index.isin(crossing.index)]

    mean_pass_cutoff = mean_lfc[mean_lfc.abs()>cutoff]

    return up, low, crossing, cross_ind, mean_pass_cutoff, x, mean_lfc

In [None]:
# Genes ranked by their bootstrap summary logFC, with the central quantile band
fig, ax = plt.subplots(1,1,figsize=(20,10))
cutoff = 1
CI = 0.5
up, low, crossing, cross_ind, mean_pass_cutoff, x, mean_lfc = get_iqr(results_wide, cutoff=cutoff, CI=CI)
    
# NOTE(review): get_iqr computes the per-gene median, while the labels below
# say "Mean" -- confirm which wording is intended.
#ax.plot(x, mean_lfc,lw=4,label="Mean")
ax.scatter(x, mean_lfc,label="Mean")
# Highlight the genes whose band contains one of the cutoff lines
ax.scatter(cross_ind, crossing, label=f"# Crossing: {len(crossing)} ({len(crossing)/len(x):.2%})")

#ax.scatter(x, tab_truth_site.loc[mean_lfc.index, "logFC"], label=f"Truth")
#ax.scatter(x, tab_sub_site.loc[mean_lfc.index, "logFC"], label=f"Original cohort")

ax.axhline(cutoff,ls="--",c="red",label=f"Cutoff = {cutoff}")
ax.axhline(-cutoff,ls="--",c="red")
ax.fill_between(x, up, low, edgecolor="none",color="grey",alpha=0.5,label=f"{CI:.0%} IQR")
#ax.fill_between(cross_ind, up.iloc[cross_ind], low.iloc[cross_ind], edgecolor="none",color="pink",alpha=0.5,label="Crossing")
ax.set_xlabel("Gene Rank")
ax.set_ylabel(r"log$_2$FC")
# NOTE(review): the title uses the global `trials`, not `trials_site` of the
# cohort plotted here -- verify they agree.
ax.set_title(f"Bootstrapped with {trials} trials")
ax.legend(framealpha=1,title=f"{sites[site]['data']} N{N} {method}\n|Mean|>{cutoff} = {len(mean_pass_cutoff)}",title_fontsize=16)
#ax.set_xlim(8360,8400)
# The figure is written to disk on every run of this cell
figpath = f"../figures/iqr.boot.{data_site}.N{N}.cohort{cohort}.png"
fig.savefig(figpath)
print(figpath)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from matplotlib.colors import Normalize

# Predict the N=10 precision from the mean and the std of the KL divergence
X1 = dfm[f"KL_{reference}_N10_mean"]
X2 = dfm[f"KL_{reference}_N10_std"]
y = dfm["Prec_N10"]

# Fit model
X = np.column_stack((X1, X2))
model = LinearRegression().fit(X, y)

# Generate grid for contour plot
X1_grid, X2_grid = np.meshgrid(np.linspace(0, .07, 100), np.linspace(0, .07, 100))
y_pred = model.intercept_ + model.coef_[0] * X1_grid + model.coef_[1] * X2_grid

# Create figure
fig, ax = plt.subplots(figsize=(8,6))

import matplotlib.cm as cm
import matplotlib.colors as mcolors

# Shared colour scale so observed points and predicted surface are comparable
norm = mcolors.Normalize(vmin=0, vmax=1)

contour = ax.contourf(X1_grid, X2_grid, y_pred, levels=40, cmap="inferno", norm=norm)
sc = ax.scatter(X1, X2, c=y, cmap="inferno", edgecolors='k', norm=norm)

# Color bar
# A single colour bar on this figure's axes. The extra bar that used to be
# created first was attached to `a`, a stale variable from an earlier cell.
cbar = fig.colorbar(contour, ax=ax)
cbar.set_label("Predicted y")

# Labels and title
ax.set_xlabel("KL mean")
ax.set_ylabel("KL Std")
ax.set_title("Multiple Linear Regression - Contour Plot")
plt.show()


# How many trials?

In [None]:
from math import comb

def comb_with_replace(n):
    """Number of ways to draw n items from n with replacement, ignoring order."""
    pool_size = 2 * n - 1
    return comb(pool_size, n)

# Number of distinct bootstrap samples for cohort sizes 1..10 (log scale)
for i in range(1,11):
    y = comb_with_replace(i)
    plt.scatter(i,y,color="cornflowerblue")
    # Write the count next to the point (at twice its value on the y axis)
    plt.text(i,2*y,y,ha="center",va="center")
plt.yscale("log")
plt.xlabel("Cohort Size (N)")
plt.ylabel("Combinations")

In [None]:
from itertools import combinations_with_replacement

# Cross-check the formula by enumeration for 5 items drawn 5 times
a = [0,1,2,3,4]
print(len(list(combinations_with_replacement(a,5))))
# Print only the first combination as an example
for c in combinations_with_replacement(a,5):
    print(c)
    break

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def running_mean(x):
    """Plot the mean of the first i values of ``x`` for i = 1..len(x).

    The overall mean is drawn as a dashed red reference line.
    """
    n_values = len(x)
    trial_numbers = np.arange(1, n_values + 1)
    cumulative_mean = np.cumsum(x) / trial_numbers

    plt.figure(figsize=(8, 5))
    plt.plot(trial_numbers, cumulative_mean, label="Running Mean", color='b')
    plt.axhline(y=np.mean(x), color='r', linestyle='--', label="Final Mean")
    plt.xlabel("Bootstrap Trial Number (i)")
    plt.ylabel("Mean of First i Trials")
    plt.title("Running Mean of Bootstrap Estimates")
    plt.legend()
    plt.grid()
    plt.show()

# Running mean of the KL values of one data set.
# NOTE(review): `df_ref` is whatever an earlier cell last assigned to it --
# confirm it holds the intended reference / N selection before running.
d=df_ref
d=d[d["Data"]=="LMAB"]

running_mean(d["KL"])

# Misc.

In [None]:
import sys, importlib
importlib.reload(sys.modules["misc"])
from DEA import run_dea