## DEA comparison with Li et al.

https://genomebiology.biomedcentral.com/articles/10.1186/s13059-022-02648-4

Li et al. used a 1% FDR threshold with edgeR QLF in an unpaired design.

In [None]:
import sys
import os
import logging
import glob
import pickle
import json
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import random
from enum import Enum
import plot_utils
# Plotting defaults from the project's plot_utils module (presumably the NPG and
# JCO colour palettes plus a matplotlib style initialiser -- confirm in plot_utils).
npg = plot_utils.npg_palette(); jco = plot_utils.jco_palette(); colors=plot_utils.matplotlib_init()

# Root logger: INFO and above go to example.log, and the extra StreamHandler
# mirrors every record to stdout so it also shows up in the notebook output.
logging.basicConfig(filename='example.log', 
                    encoding='utf-8', level=logging.INFO)
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler(stream=sys.stdout))

# Root directory of the input data. This is an absolute, machine-specific path;
# the commented-out line is the repository-relative alternative.
#datapath = Path("../data")
datapath = "/storage/homefs/pd21v747/datanew"

# Put the scripts directory on sys.path so that project modules imported by the
# following cells can be found (presumably misc and DEA live there -- confirm).
modpath = Path("../scripts")
sys.path.append(os.path.relpath(modpath))

from misc import Timer, pickler, open_table

In [None]:
from DEA import run_dea

class PermuteMode(Enum):
    """How sample labels are rearranged before a differential expression run."""

    # Leave the column order as loaded.
    NONE = 1
    # Shuffle all columns freely (handled by permute_df).
    RANDOM = 2
    # Shuffle while keeping both fake groups half control / half treatment
    # (handled by permute_balanced).
    BALANCED = 3
    
# Tissue/site name -> dataset identifier used to build the input file path.
sites = {
    "thyroid": "THCA",
    "lung": "LUAD",
    "kidney": "KIRC",
    "colorectal": "COAD",
    "breast": "BRCA",
    "liver": "LIHC",
    "prostate": "PRAD",
    "synthetic": "KIRC_syn_hom",
}

def subsample_df(df_full, N):
    """Randomly draw N matched column pairs from a two-condition table.

    The table is treated as two equally sized halves: the first
    ``len(df_full.columns) // 2`` columns belong to one condition, and column
    ``i`` of that half is paired with column ``i + replicates`` of the other.

    Args:
        df_full: DataFrame whose columns are ordered as described above.
        N: Number of pairs to draw, without replacement.

    Returns:
        DataFrame with ``2 * N`` columns: the chosen first-half columns in
        ascending positional order, followed by their partners in the same
        order.

    Raises:
        ValueError: If N is larger than the number of available pairs
            (raised by ``np.random.choice``).
    """
    replicates = len(df_full.columns) // 2
    # Every pair is eligible, position 0 included. Starting the range at 1
    # would silently exclude the first pair and make N == replicates fail.
    chosen = sorted(np.random.choice(range(replicates), N, replace=False))
    chosen += [pos + replicates for pos in chosen]
    return df_full.iloc[:, chosen]

def permute_df(df):
    """Return df with its columns in a random order.

    Column contents stay attached to their labels; only the order changes.
    The shuffle uses the stdlib ``random`` module, so ``random.seed``
    controls reproducibility.
    """
    shuffled_labels = list(df.columns)
    random.shuffle(shuffled_labels)
    new_order = pd.Index(shuffled_labels)
    return df[new_order]

# quick and dirty implementation
def permute_balanced(df):
    """Permute columns so each fake condition is half control, half treatment.

    The first half of the columns is taken as the control group and the
    second half as the treatment group. A quarter of all samples (the leading
    control columns) trade places with randomly chosen treatment columns, and
    each resulting fake group is then shuffled internally.

    Args:
        df: DataFrame with control columns first and treatment columns second.

    Returns:
        DataFrame with the same columns in balanced, permuted order.

    Raises:
        Exception: If the number of replicates per condition is odd, or if
            the result does not have an imbalance ratio of exactly 0.5.
    """
    n_rep = len(df.columns) // 2

    if n_rep % 2:
        raise Exception("Must have even number of samples")

    half = n_rep // 2

    # Positions 0..half-1 (controls) swap with `half` random treatment positions.
    order = list(range(2 * n_rep))
    picked_treat = np.random.choice(range(n_rep, 2 * n_rep), half, replace=False)
    for ctrl_pos, treat_pos in enumerate(picked_treat):
        order[ctrl_pos], order[treat_pos] = order[treat_pos], order[ctrl_pos]

    df = df[list(df.columns.values[order])]

    # Shuffle inside each fake group: fake controls first, then fake treatments.
    fake_control = permute_df(df.iloc[:, :n_rep])
    fake_treatment = permute_df(df.iloc[:, n_rep:])
    df = pd.concat([fake_control, fake_treatment], axis=1)

    # Sanity check on the construction above.
    if get_imbalance_ratio(df) != 0.5:
        display(df)
        raise Exception("Balancing failed")
    return df

def get_imbalance_ratio(df):
    """Return the fraction of first-half columns whose name starts with "N".

    The first ``len(df.columns) // 2`` columns form the (fake) control group;
    names beginning with "N" are counted as true control samples.
    """
    group_size = len(df.columns) // 2
    fake_control = df.columns[:group_size]
    true_controls = sum(name.startswith("N") for name in fake_control)
    return true_controls / group_size

def run_dea_here(df, dea):
    """Run one differential expression analysis on df and return its table.

    The result is written by ``run_dea`` to a fixed scratch CSV (always
    overwritten) and read back from there. The design is "GSE91061" when the
    module-level flag ``GSE91061`` is truthy, otherwise "unpaired".

    Args:
        df: Count table handed to ``run_dea``.
        dea: One of "edgerqlf", "edgerlrt" or "deseq2".

    Returns:
        DataFrame read from the scratch CSV, first column used as index.

    Raises:
        KeyError: If ``dea`` is not one of the supported method names.
    """
    outfile = "/storage/homefs/pd21v747/datanew/test/dea_test.csv"

    # Method-specific keyword arguments forwarded to run_dea.
    kept_columns = ["logFC","logCPM","FDR"]
    method_options = {
        "edgerqlf": dict(filter_expr=False, cols_to_keep=kept_columns),
        "edgerlrt": dict(filter_expr=False, cols_to_keep=kept_columns, test="lrt"),
        "deseq2": dict(cols_to_keep=kept_columns),
    }

    design = "GSE91061" if GSE91061 else "unpaired"
    options = method_options[dea]

    run_dea(df, outfile, dea, overwrite=True, design=design, lfc=0, **options)
    return pd.read_csv(outfile, index_col=0)

In [None]:
# Tissue to analyse; `data` is the matching dataset identifier from `sites`.
site = "prostate"
data = sites[site]

# NOTE(review): permute and subsample are not read in the visible cells, and N
# is reassigned by later loops -- presumably leftovers; confirm before removing.
permute = PermuteMode.RANDOM
subsample = True
N = 6

# Path of the table for the selected site; the commented-out line switches the
# notebook to the GSE91061 table instead.
f = datapath + f"/{site}/{data}/{data}.csv"
#f = "/storage/homefs/pd21v747/datanew/GSE91061/GSE91061_BMS038109Sample.hg19KnownGene.raw.csv"

# True when the GSE91061 table is selected; run_dea_here reads this flag to
# choose the design passed to run_dea.
GSE91061 = "GSE91061" in f

# The first CSV column becomes the row index, the first row the column names.
df_full = pd.read_csv(f, index_col=0, header=0)

print("Running on " + ("GSE91061" if GSE91061 else "TCGA") + " data")

if(GSE91061):
    # Reorder the columns so that every "_Pre_" sample comes first, followed by
    # the remaining columns (Index.difference returns them sorted by default).
    pre = list(df_full.columns)
    pre = [p for p in pre if "_Pre_" in p]
    on = df_full.columns.difference(set(pre))
    ix = pd.Index(pre+list(on))
    df_full = df_full[ix]

# Show the loaded table as the cell output.
df_full

# Balanced vs imbalanced permutations

In [None]:
# Count the genes edgeR QLF calls significant at `FDR` for every cohort size in
# `all_N` and every permutation mode, for the currently selected `site`.
# Results are cached in `sig_dict` (site -> mode -> N -> DEG count) and written
# back to disk whenever something new was computed.

# Falsy: reuse cached results. Truthy: recompute everything for this site.
overwrite = 0

FDR = 0.01
all_N = [4,8,16,32,"all"]

permute_modes = [PermuteMode.RANDOM, PermuteMode.BALANCED, PermuteMode.NONE]

sig_dict_path = "../data/misc/permuted_degs.txt"

if "sig_dict" not in globals():

    if os.path.isfile(sig_dict_path):
        # NOTE: unpickling can execute arbitrary code; only load cache files
        # that this notebook wrote itself.
        with open(sig_dict_path, "rb") as fh:
            sig_dict = pickle.load(fh)
        print("Loaded sig dict")

    else:
        sig_dict = {}

# Make sure a slot exists for every (mode, N) of this site, so a cache written
# with fewer modes or cohort sizes does not raise KeyError below.
site_results = sig_dict.setdefault(site, {})
for permute_mode in permute_modes:
    mode_results = site_results.setdefault(permute_mode, {})
    for N in all_N:
        mode_results.setdefault(N, None)

is_dirty = False

with Timer(name="context manager"):
    for N in all_N:

        # One subsample per cohort size, shared by all permutation modes.
        df = df_full if (N == "all") else subsample_df(df_full, N)

        for permute_mode in permute_modes:

            if not overwrite and sig_dict[site][permute_mode][N] is not None:
                print("Skipped existing calculation: ", site, N, permute_mode)
                continue

            is_dirty = True

            if permute_mode == PermuteMode.BALANCED:

                # permute_balanced needs an even number of replicates per
                # condition. The patient is dropped from a local copy only, so
                # the other modes still see the full subsample.
                df_even = df
                if len(df_even.columns)//2 % 2 != 0:
                    drop_patient = df_even.columns[0]
                    # Assumes paired columns differ only in a leading N/T --
                    # TODO confirm. Only the first "N" is replaced, so an N
                    # later in the sample name is left alone.
                    df_even = df_even.drop([drop_patient, drop_patient.replace("N","T",1)], axis=1)
                    print("Dropping one patient, now df has", len(df_even.columns), "cols")

                df_perm = permute_balanced(df_even)

            elif permute_mode == PermuteMode.RANDOM:
                df_perm = permute_df(df)

            else:
                # PermuteMode.NONE: analyse the true labels. Without this
                # branch the frame left over from the previous mode was reused.
                df_perm = df

            res = run_dea_here(df_perm, "edgerqlf")

            sig = res[res["FDR"]<FDR]
            sig_dict[site][permute_mode][N] = len(sig)
            print(N, sig_dict[site][permute_mode][N])

if is_dirty:
    pickler(sig_dict, sig_dict_path)
    print("Saving sig dict")

In [None]:
# Scatter the cached DEG counts of every site against cohort size, with one
# marker shape per permutation mode, and save the figure as a PDF.
if "sig_dict" not in globals():
    # Relative path, the same file the permutation cell saves to. A leading
    # "/" would resolve against the filesystem root instead.
    with open("../data/misc/permuted_degs.txt", "rb") as fh:
        sig_dict = pickle.load(fh)
        print("Loaded sig dict")


fig, ax = plt.subplots(1,1,figsize=(8,5))

# The loop names are prefixed with plot_ so the notebook globals `site`,
# `permute_mode` and `N` used by the computation cells are not overwritten.
for i, plot_site in enumerate(sig_dict):

    # Wrap around rather than fail when there are more sites than colours.
    color = npg[i % len(npg)]

    for plot_mode in permute_modes:

        # The unpermuted analysis is not part of this figure.
        if plot_mode == PermuteMode.NONE:
            continue

        marker = "o" if plot_mode == PermuteMode.BALANCED else "^"
        counts = sig_dict[plot_site].get(plot_mode, {})

        # One legend entry per (site, mode), so both marker shapes are explained.
        label = f"{plot_site} ({plot_mode.name.lower()})"

        for k, plot_N in enumerate(all_N):

            n_degs = counts.get(plot_N)
            if n_degs is None:
                # Not computed yet for this site / mode / cohort size.
                continue

            ax.scatter(k, n_degs, color=color, label=label, marker=marker)
            label = None  # only the first plotted point carries the legend entry

ax.set(xlabel="Cohort size", ylabel="DEGs", title="edgeR QLF unpaired, 1% FDR")
ax.set_xticks(range(len(all_N)))
ax.set_xticklabels(all_N)
ax.legend()
# NOTE: a count of 0 cannot be drawn on a log axis and will not appear.
ax.set_yscale("log")

figpath = "../figures/permuted_degs.pdf"
fig.tight_layout()
fig.savefig(figpath)

# DEGs vs imbalance ratio

In [None]:
# Key under which the results of this experiment are stored.
# NOTE(review): df_full is not reloaded in this cell -- make sure the table that
# is currently loaded belongs to this site; TODO confirm.
site = "breast"

# Falsy: keep trials that are already stored. Truthy: recompute them.
overwrite = 0

# FDR cutoff below which a gene is counted as a DEG.
FDR = 0.01
# Cohort sizes to subsample.
all_N = [32]
# Number of random permutations per cohort size.
trials = 10

class ImbalanceContainer:
    """Result of one permutation trial: a DEG count and its imbalance ratio."""

    def __init__(self, degs, imbalance_ratio):
        # Number of genes called significant in the trial.
        self.degs = degs
        # Imbalance ratio of the permuted table used in the trial.
        self.imbalance_ratio = imbalance_ratio

    def __repr__(self):
        degs, ratio = self.degs, self.imbalance_ratio
        return f"DEGs: {degs} | Imbalance ratio: {ratio}"

# Run `trials` random label permutations per cohort size for the current
# `site`, recording for each the number of DEGs at `FDR` and the imbalance
# ratio of the permuted table. Results are cached in `imba_dict`
# (site -> N -> trial -> ImbalanceContainer) and saved when anything changed.
imba_dict_path = "../data/misc/imba_dict.txt"

if "imba_dict" not in globals():

    if os.path.isfile(imba_dict_path):
        # NOTE: unpickling can execute arbitrary code; only load cache files
        # that this notebook wrote itself.
        with open(imba_dict_path, "rb") as fh:
            imba_dict = pickle.load(fh)
        print("Loaded imba dict")

    else:
        imba_dict = {}

imba_dict.setdefault(site, {})

# Give every stored site a slot for each requested N and trial. The loop
# variable is not called `site`, so the site selected for this run is kept,
# and existing trial results are never replaced.
for known_site in imba_dict:
    for N in all_N:
        trial_slots = imba_dict[known_site].setdefault(N, {})
        for trial in range(trials):
            trial_slots.setdefault(trial, None)

is_dirty = False

with Timer(name="context manager"):
    for N in all_N:

        # One subsample per cohort size; every trial permutes the same subsample.
        df = df_full if (N == "all") else subsample_df(df_full, N)

        for trial in range(trials):

            if not overwrite and imba_dict[site][N][trial] is not None:
                print("Skipped existing calculation: ", site, N, trial)
                continue

            is_dirty = True

            df_perm = permute_df(df)
            res = run_dea_here(df_perm, "edgerqlf")

            sig = res[res["FDR"]<FDR]
            imba_dict[site][N][trial] = ImbalanceContainer(degs=len(sig),imbalance_ratio=get_imbalance_ratio(df_perm))
            print(N, trial, imba_dict[site][N][trial])

if is_dirty:
    pickler(imba_dict, imba_dict_path)
    print("Saving imba dict")

In [None]:
# Scatter the DEG count of every stored permutation trial against its imbalance
# ratio, one colour per site, and save the figure as a PDF.
if "imba_dict" not in globals():
    # NOTE: unpickling needs ImbalanceContainer to be defined in this session.
    with open("../data/misc/imba_dict.txt", "rb") as fh:
        imba_dict = pickle.load(fh)
        print("Loaded imba dict")


fig, ax = plt.subplots(1,1,figsize=(8,5))

# The loop names are prefixed with plot_ so the notebook globals `site`, `N`
# and `trial` used by the computation cell are not overwritten.
for i, plot_site in enumerate(imba_dict):

    # Wrap around rather than fail when there are more sites than colours.
    color = npg[i % len(npg)]

    # One legend entry per site, attached to its first plotted point.
    label = plot_site

    for plot_N in all_N:

        print(plot_N)

        for imba in imba_dict[plot_site].get(plot_N, {}).values():

            if imba is None:
                # Trial slot exists but has not been computed yet.
                continue

            ax.scatter(imba.imbalance_ratio, imba.degs, color=color, label=label)
            label = None

ax.set(xlabel="Imbalance ratio", ylabel="DEGs", title="edgeR QLF unpaired, 1% FDR")
ax.legend()
# NOTE: a count of 0 cannot be drawn on a log axis and will not appear.
ax.set_yscale("log")

figpath = "../figures/degs_vs_imbalance_ratio.pdf"
fig.tight_layout()
fig.savefig(figpath)