In [None]:
%matplotlib inline
import sys 
import os
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import random
import pickle
from pathlib import Path
from itertools import product

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

modpath = "../scripts"
sys.path.append(os.path.abspath(modpath))

from misc import pickler, open_table
import plot_utils

In [None]:
# Analysis grid: DEA methods, outlier-removal strategies and cohort sizes.
DEAs = ["edgerlrt", "edger", "deseq2"]
outlier_methods = ["none", "jk", "pcah"]
all_N = [*range(3, 11), 12, 15]

# Display names for figure titles/legends, keyed by the internal identifiers above.
prdea = dict(zip(DEAs, ["edgeR LRT", "edgeR QLF", "DESeq2 Wald"]))
prout = dict(zip(outlier_methods, ["None", "Jackknife", "rPCA"]))

In [None]:
# Exercise the project's Timer helper as a context manager around an empty body.
from misc import Timer
with Timer(name="context manager"):
    pass

# Effect size inflation vs FDR 

Reviewer 1 comment: 

"It seems counterintuitive that, in small cohorts, the effect estimates are inflated (Fig. 5), but this doesn’t inflate the false positive rate (Fig. 2). Can you explain this? Is it that the effects are high variance but still unbiased? Perhaps you can find a way to simultaneously visualize effect sizes and whether they eventually lead to false positives."

In [None]:
from misc import open_table, get_grid_size
from process import gene_rep

datapath = "/storage/homefs/pd21v747/datanew"

def gene_reps_all_N(all_N,site,data,out,dea,FDR,logFC,param_set):
    """Run ``gene_rep`` on the combined FDR/logFC tables of every cohort size.

    For each N in ``all_N`` the ``all.FDR`` and ``all.logFC`` tables are read
    from ``{datapath}/{site}/{data}/{data}_N{N}/`` and handed to ``gene_rep``
    together with the ``FDR`` and ``logFC`` thresholds (``normalize=True``).

    Returns a dict mapping each N to the value returned by ``gene_rep``.
    """
    reps_by_N = dict.fromkeys(all_N)
    for n_samples in all_N:
        fdr_table = open_table(f"{datapath}/{site}/{data}/{data}_N{n_samples}/all.FDR.{out}.{dea}.{param_set}")
        lfc_table = open_table(f"{datapath}/{site}/{data}/{data}_N{n_samples}/all.logFC.{out}.{dea}.{param_set}")
        reps_by_N[n_samples] = gene_rep(fdr_table, logFC_tab=lfc_table, FDR=FDR, logFC=logFC, normalize=True)
    return reps_by_N


def get_truth_logFC(param_set, n_cohorts):
    """Compare per-cohort logFC estimates with the full-data ("truth") logFC.

    Besides its arguments, the function reads these notebook globals at call
    time: ``datasets``, ``DEAs``, ``all_N_sub``, ``out``, ``FDR`` and ``logFC``.

    Parameters
    ----------
    param_set : str
        Suffix of the per-cohort result tables. "p1" selects the ``lfc0``
        truth table, any other value the ``lfc1`` table.
    n_cohorts : int
        Maximum number of cohort directories (in sorted order) used per N.

    Returns
    -------
    dict
        ``result[data][dea]`` contains
        * one entry per N in ``all_N_sub`` with the lists "RMSE" (one value
          per cohort), "logFC" (cohort estimates) and "Error" (truth minus
          cohort estimate);
        * "small_N_deg_logFC" and "small_N_deg_logFC_truth": cohort and truth
          logFC of the genes called DE in the cohorts of the first N in
          ``all_N_sub``. Both lists are filled cohort by cohort, so they have
          equal length and matching order;
        * "truth": the truth logFC Series.
        Only "deseq2" and "edgerlrt" are processed; entries of other DEAs in
        ``DEAs`` keep their empty lists.
    """
    logFC_test = 0 if param_set == "p1" else 1

    truth_logFC = {
        data: {
            dea: {N: {"RMSE": [], "logFC": [], "Error": []} for N in all_N_sub}
                 | {"small_N_deg_logFC": [], "small_N_deg_logFC_truth": []}
            for dea in DEAs
        }
        for data in datasets if "syn_hom" not in data
    }

    for data in truth_logFC:

        print(data)

        for dea in ["deseq2", "edgerlrt"]:

            path = datasets[data]['datapath'].split(f'/{data}.csv')[0]
            truth_file = f"{path}/{data}.{dea}.lfc{logFC_test}.csv"
            truth_df = pd.read_csv(truth_file, index_col=0, usecols=["Unnamed: 0","logFC"])
            logFC_truth = truth_df["logFC"]
            results = truth_logFC[data][dea]
            results["truth"] = logFC_truth

            for N in all_N_sub:

                path_N = f"{path}/{data}_N{N}"
                # Close the directory iterator deterministically.
                with os.scandir(path_N) as entries:
                    cohorts = sorted(entry.path for entry in entries if entry.is_dir())

                for cohort in cohorts[:n_cohorts]:

                    tab_cohort = open_table(f"{cohort}/tab.{out}.{dea}.{param_set}")

                    # Intersect per cohort: a gene missing from a later cohort
                    # would otherwise make .loc fail.
                    common = tab_cohort.index.intersection(logFC_truth.index)
                    tab_cohort = tab_cohort.loc[common]

                    if N == all_N_sub[0]:
                        DEG = tab_cohort[(tab_cohort["FDR"]<FDR) & (tab_cohort["logFC"].abs()>logFC)]
                        # Record estimate and truth together for every cohort so
                        # the two lists stay aligned (previously only the last
                        # cohort's DEGs were looked up in the truth table).
                        results["small_N_deg_logFC"] += DEG["logFC"].values.tolist()
                        results["small_N_deg_logFC_truth"] += logFC_truth.loc[DEG.index].values.tolist()

                    logFC_cohort = tab_cohort["logFC"]
                    error = logFC_truth.loc[common].values.flatten() - logFC_cohort.values
                    results[N]["RMSE"].append(np.sqrt(np.mean(error**2)))
                    results[N]["Error"] += error.tolist()
                    results[N]["logFC"] += logFC_cohort.tolist()

    return truth_logFC

# don't forget: deseq2 does not show logCPM but counts

In [None]:
datasetsfile = "/storage/homefs/pd21v747/datanew/multi/datasets.txt"
# The file holds a pickled object (despite the .txt suffix): read the bytes, then unpickle.
with open(datasetsfile, mode="rb") as f:
    datasets = pickle.loads(f.read())

In [None]:
# Selection of the result tables to inspect: dataset, DEA method, outlier method, cohort size.
data = "BRCA"
dea = "deseq2"
out = "none"
N = 3

# Note: the FC estimate is not affected by this choice
paramset = "p1" # no FC cutoff
#paramset = "p2" # formal

site = datasets[data]["site"]
# Combined logFC table for this cohort size (later cells index its columns by cohort number).
p = f"/storage/homefs/pd21v747/datanew/{site}/{data}/{data}_N{N}/all.logFC.{out}.{dea}.{paramset}"
tab = open_table(p)

# Matching combined FDR table.
p = f"/storage/homefs/pd21v747/datanew/{site}/{data}/{data}_N{N}/all.FDR.{out}.{dea}.{paramset}"
tab_fdr = open_table(p)

# NOTE: despite its name, mean_lfc holds the row-wise MEDIAN of tab, sorted in descending order.
mean_lfc = tab.median(axis=1).sort_values(ascending=False)
# Rank positions matching the sorted order of mean_lfc.
x = range(len(mean_lfc))

print(len(tab))
tab.head()

In [None]:
# Central quantile band of the per-cohort logFC (not really a confidence interval)
CI = 0.5
low_lim, up_lim = (1-CI)/2, (1+CI)/2
cutoff = 1

# Band limits per gene, in the same order as mean_lfc
gene_order = mean_lfc.index
up = tab.quantile(up_lim,axis=1).loc[gene_order]
low = tab.quantile(low_lim,axis=1).loc[gene_order]

# Genes whose band contains -cutoff or +cutoff
straddles_neg = (up>-cutoff) & (low<-cutoff)
straddles_pos = (up>cutoff) & (low<cutoff)
crossing = mean_lfc[straddles_neg | straddles_pos]
cross_ind = np.array(x)[mean_lfc.index.isin(crossing.index)]

# Genes whose central value itself exceeds the cutoff in absolute value
mean_pass_cutoff = mean_lfc[mean_lfc.abs()>cutoff]

In [None]:
# Genes ranked by their median logFC across cohorts, with the central quantile band.
# mean_lfc is computed with tab.median(...), so the labels say "Median" rather than "Mean".
fig, ax = plt.subplots(1,1,figsize=(20,10))
ax.scatter(x, mean_lfc,label="Median")
ax.scatter(cross_ind, crossing, label=f"# Crossing: {len(crossing)} ({len(crossing)/len(x):.2%})")
ax.axhline(cutoff,ls="--",c="red",label=f"Cutoff = {cutoff}")
ax.axhline(-cutoff,ls="--",c="red")
# The band spans the central CI fraction of the cohorts; it is the IQR only for CI = 0.5.
ax.fill_between(x, up, low, edgecolor="none",color="grey",alpha=0.5,label=f"Central {CI:.0%} quantile range")
ax.set_xlabel("Gene Rank")
ax.set_ylabel(r"log$_2$FC")
ax.set_title(f"|Median|>{cutoff} = {len(mean_pass_cutoff)}")
ax.legend(framealpha=1,title=f"{data} N{N} {prdea[dea]} Out: {prout[out]}",title_fontsize=16);

In [None]:
# Classify every gene of one cohort as TP/FP/FN/TN, using the across-cohort median
# logFC as reference, and plot the cohort's logFC estimates against gene rank.
fig, ax = plt.subplots(1,1,figsize=(16,8))

fdr_thresh = 0.05
cohort = 2

## Post hoc thresholding
tab_cohort =  tab.iloc[:,cohort-1].loc[mean_lfc.index]
# Missing FDR values are treated as "not significant".
tab_cohort_fdr =  tab_fdr.iloc[:,cohort-1].loc[mean_lfc.index].fillna(1)

x_arr = np.array(x)
# Reference: |median logFC| reaches the cutoff. Call: significant AND |cohort logFC| reaches it.
truth_pos = (mean_lfc.abs() >= cutoff).values
called_pos = ((tab_cohort.abs() >= cutoff) & (tab_cohort_fdr < fdr_thresh)).values

# The four classes are built from two boolean vectors, so every gene falls into exactly
# one of them. Previously, significant genes with |logFC| below the cutoff were in none.
mask = called_pos & ~truth_pos
tab_cohort_FP = tab_cohort[mask]
x_FP = x_arr[mask]

mask = called_pos & truth_pos
tab_cohort_TP = tab_cohort[mask]
x_TP = x_arr[mask]

mask = ~called_pos & truth_pos
tab_cohort_FN = tab_cohort[mask]
x_FN = x_arr[mask]

mask = ~called_pos & ~truth_pos
tab_cohort_TN = tab_cohort[mask]
x_TN = x_arr[mask]

assert len(x_TP) + len(x_FP) + len(x_FN) + len(x_TN) == len(x)

ax.scatter(x_FP, tab_cohort_FP,label=f"FP: {len(x_FP)} ({len(x_FP)/len(x):.2%})", zorder=2)
ax.scatter(x_TP, tab_cohort_TP,label=f"TP: {len(x_TP)} ({len(x_TP)/len(x):.2%})", zorder=2)
ax.scatter(x_FN, tab_cohort_FN,label=f"FN: {len(x_FN)} ({len(x_FN)/len(x):.2%})", zorder=2)
# TN points are invisible (alpha=0); they are plotted only for their legend entry.
ax.scatter(x_TN, tab_cohort_TN,label=f"TN: {len(x_TN)} ({len(x_TN)/len(x):.2%})", zorder=2, alpha=0)
# mean_lfc holds the row-wise median of tab (one column per cohort).
ax.scatter(x, mean_lfc,label=f"Median of {tab.shape[1]} cohorts", zorder=3, color="black")

ax.axhline(cutoff,ls="--",c="black",label=f"Cutoff = {cutoff}", zorder=9)
ax.axhline(-cutoff,ls="--",c="black", zorder=9)

bbox=dict(boxstyle="round", fc="1",ec="0.8")
# Guard against empty classes: report NaN instead of raising ZeroDivisionError.
n_called = len(x_TP) + len(x_FP)
n_truth = len(x_TP) + len(x_FN)
prec = len(x_TP) / n_called if n_called else np.nan
rec = len(x_TP) / n_truth if n_truth else np.nan
metrics = f"Precision: {prec:.2f}\nRecall: {rec:.2f}"
ax.text(0.5, 0.95, metrics, ha='center', va='top', transform=ax.transAxes, bbox=bbox, fontsize=18, weight="bold")

ax.set_xlabel("Gene Rank")
ax.set_ylabel(r"log$_2$FC")
ax.set_title(f"Cohort {cohort} | abs(Median) > {cutoff} = {len(mean_pass_cutoff)}", fontsize=20)
ax.legend(framealpha=1,title=f"{data} N{N} {prdea[dea]} Out: {prout[out]}",title_fontsize=16);

In [None]:
# Overlay two histograms for the TP genes: the cohort's logFC and the across-cohort value.
hist_style = dict(bins=10, alpha=0.5)
tab_cohort_TP.hist(**hist_style)
mean_lfc.loc[tab_cohort_TP.index].hist(**hist_style)

In [None]:
# Load the full-dataset DEA table produced with the lfc0 setting.
site = datasets[data]["site"]
lfc_test = 0
p = f"/storage/homefs/pd21v747/datanew/{site}/{data}/{data}.{dea}.lfc{lfc_test}"
tab_gt = open_table(p)
print(len(tab_gt))
tab_gt.head()

In [None]:
# logFC from tab_gt for the genes it shares with mean_lfc, kept in mean_lfc's rank order.
a = tab_gt.loc[mean_lfc.index.intersection(tab_gt.index), "logFC"]

# Both series used to carry the same copy-pasted label and no legend was drawn;
# label them by what they actually hold.
fig, ax = plt.subplots()
ax.scatter(range(len(a)), a, label="Ground truth (tab_gt)", zorder=3)
ax.scatter(x, mean_lfc, label=f"Median of {tab.shape[1]} cohorts", zorder=4)
ax.set(xlabel="Gene Rank", ylabel=r"log$_2$FC")
ax.legend();

## Heteroskedasticity

In [None]:
# Distribution of |logFC| across cohorts for a single gene (row i), with mean and +/- 1 SD.
i = 1
abs_row = tab.abs().iloc[i]
m, s = abs_row.mean(), abs_row.std()
abs_row.hist()
plt.axvline(m, color="black")
plt.axvline(m-s, color="red")
plt.axvline(m+s, color="red")

In [None]:
# Median |logFC| per gene (ascending) with the central CI quantile band as error bars.
CI = 0.5
up_lim = (1+CI)/2
low_lim = (1-CI)/2

cutoff = 1

abs_lfc = tab.abs()
# NOTE: this rebinds mean_lfc to the ascending median of |logFC|.
mean_lfc = abs_lfc.median(axis=1).sort_values(ascending=True)

up = abs_lfc.quantile(up_lim,axis=1).loc[mean_lfc.index]
low = abs_lfc.quantile(low_lim,axis=1).loc[mean_lfc.index]
# errorbar() interprets a (2, N) yerr as distances below/above each point, not as
# absolute positions, so convert the quantiles to offsets from the median.
spread = np.clip(np.array([(mean_lfc - low).values, (up - mean_lfc).values]), 0, None)

fig, ax= plt.subplots(1,1,figsize=(7,5))
ax.errorbar(range(len(mean_lfc)), mean_lfc, yerr=spread,zorder=1, label=f"{CI:.0%} CI")
ax.scatter(range(len(mean_lfc)), mean_lfc,color="black",zorder=2, label="Median of 100 cohorts")
ax.set(ylabel=r"|log$_2$FC|",xlabel="Gene rank", title=f"{data} N{N} {prdea[dea]}")
ax.axhline(cutoff,ls="--",color="red",label=f"Threshold = {cutoff}")
# Mark the first rank whose median exceeds the threshold. Skip the marker when no
# gene does (np.argmax would otherwise return 0 and mark the first gene).
above_cutoff = (mean_lfc > cutoff).values
if above_cutoff.any():
    ax.axvline(np.argmax(above_cutoff),ls="--",color="grey")

ax.legend(loc="upper left")

fig.tight_layout()
figpath = f"../figures/heteroskedasticity.png"
fig.savefig(figpath)

In [None]:
# Print the current working directory (shell command via IPython's "!" syntax).
!pwd

# logFC vs Precision

In [None]:
# Selection of the result tables for the logFC-vs-precision analysis.
data = "THCA"
dea = "edgerlrt"
out = "none"
N = 3

# Note: the FC estimate is not affected by this choice
#paramset = "p1" # no FC cutoff
paramset = "p2" # formal

site = datasets[data]["site"]
p = f"/storage/homefs/pd21v747/datanew/{site}/{data}/{data}_N{N}/all.logFC.{out}.{dea}.{paramset}"
tab_lfc = open_table(p)

p = f"/storage/homefs/pd21v747/datanew/{site}/{data}/{data}_N{N}/all.FDR.{out}.{dea}.{paramset}"
tab_fdr = open_table(p)

# Inspect the table loaded in this cell (the previous code showed the stale `tab`
# left over from the earlier BRCA cell).
print(len(tab_lfc))
tab_lfc.head()

In [None]:
# Load the full-dataset table matching the parameter set (lfc1 for "p2", lfc0 otherwise).
site = datasets[data]["site"]
lfc_test = 1 if paramset == 'p2' else 0
p = f"/storage/homefs/pd21v747/datanew/{site}/{data}/{data}.{dea}.lfc{lfc_test}"
tab_gt = open_table(p)

# Restrict the per-cohort tables to the genes of tab_gt, in tab_gt's row order.
gt_genes = tab_gt.index
tab_lfc, tab_fdr = tab_lfc.loc[gt_genes], tab_fdr.loc[gt_genes]

print(len(tab_gt))
tab_gt.head()

In [None]:
# Genes of tab_gt that are significant at the chosen FDR level.
fdr = 0.05
is_truth_deg = tab_gt["FDR"]<fdr
deg_truth = tab_gt[is_truth_deg]
len(deg_truth)

In [None]:
# Binarise the per-cohort FDR table: 1 where FDR < fdr, 0 where FDR >= fdr.
# Missing values satisfy neither condition and stay NaN.
is_sig = tab_fdr<fdr
not_sig = tab_fdr>=fdr
deg_cohorts = tab_fdr.mask(is_sig,1).mask(not_sig,0)

In [None]:
# Flag the ground-truth DEGs. Using `FDR < fdr` directly keeps genes with a missing FDR
# at 0; the previous `where(FDR >= fdr, 1)` marked NaN FDR as significant, which
# disagreed with the definition of deg_truth.
tab_gt["sig"] = (tab_gt["FDR"] < fdr).astype("int64")
tab_gt["sig"].sum()

In [None]:
from process import get_array_metrics_numba

# Compare the ground-truth labels with the per-cohort call matrix; the helper returns three
# values, unpacked here as mcc, prec and rec.
mcc, prec, rec = get_array_metrics_numba(tab_gt["sig"].values, deg_cohorts.values)

In [None]:
# x: row-wise sum of the 0/1 call matrix (number of cohorts calling the gene); y: logFC from tab_gt.
plt.scatter(deg_cohorts.sum(axis=1), tab_gt["logFC"])

In [None]:
# Same call as in the cell that imports get_array_metrics_numba; re-running it refreshes mcc/prec/rec.
mcc, prec, rec = get_array_metrics_numba(tab_gt["sig"].values, deg_cohorts.values)

In [None]:
# Order genes by ground-truth logFC and bring the call matrix into the same row order.
tab_gt = tab_gt.sort_values(by="logFC")
deg_cohorts = deg_cohorts.loc[tab_gt.index]
# 20 equally spaced bin edges; the range is padded by 0.1 on both sides so that the
# minimum and maximum logFC fall inside the outer bins.
bins = np.linspace(tab_gt["logFC"].min()-0.1, tab_gt["logFC"].max()+0.1, 20)
tab_gt['logFC_bin'] = pd.cut(tab_gt['logFC'], bins)

# Metrics per logFC bin; only the second returned value (prec) is kept, keyed by the bin interval.
prec_dict = {}
for bin_val, bin_group in tab_gt.groupby('logFC_bin'):
    mcc, prec, rec = get_array_metrics_numba(bin_group["sig"].values, deg_cohorts.loc[bin_group.index].values)
    prec_dict[bin_val] = prec

In [None]:
# For every logFC bin: recompute the metrics, remember the bin midpoint and the
# NaN-ignoring median of prec, print a short summary and add a histogram of prec.
mid = []
precs = []
for bin_val, bin_group in tab_gt.groupby('logFC_bin'):
    calls_in_bin = deg_cohorts.loc[bin_group.index].values
    mcc, prec, rec = get_array_metrics_numba(bin_group["sig"].values, calls_in_bin)
    median_prec = np.nanmedian(prec)
    mid.append(bin_val.mid)
    precs.append(median_prec)

    print(bin_val)
    print("Median prec:", median_prec)
    print("Genes:",len(bin_group),"| NaN:", np.isnan(prec).sum())
    plt.hist(prec, label=bin_val)
    print("\n")

plt.legend()

In [None]:
# Median precision per logFC bin, plotted against the bin midpoints collected in mid/precs.
plt.figure(figsize=(10,6))
plt.scatter(mid,precs)
plt.xlabel("logFC (binned)")
plt.ylabel("Median precision")

## Inflation

In [None]:
from misc import open_table, get_grid_size
from process import gene_rep

datapath = "/storage/homefs/pd21v747/datanew"

def gene_reps_all_N(all_N,site,data,out,dea,FDR,logFC,param_set):
    """Run ``gene_rep`` on the combined FDR/logFC tables of every cohort size.

    For each N in ``all_N`` the ``all.FDR`` and ``all.logFC`` tables are read
    from ``{datapath}/{site}/{data}/{data}_N{N}/`` and handed to ``gene_rep``
    together with the ``FDR`` and ``logFC`` thresholds (``normalize=True``).

    Returns a dict mapping each N to the value returned by ``gene_rep``.
    """
    reps_by_N = dict.fromkeys(all_N)
    for n_samples in all_N:
        fdr_table = open_table(f"{datapath}/{site}/{data}/{data}_N{n_samples}/all.FDR.{out}.{dea}.{param_set}")
        lfc_table = open_table(f"{datapath}/{site}/{data}/{data}_N{n_samples}/all.logFC.{out}.{dea}.{param_set}")
        reps_by_N[n_samples] = gene_rep(fdr_table, logFC_tab=lfc_table, FDR=FDR, logFC=logFC, normalize=True)
    return reps_by_N


def get_truth_logFC(param_set, n_cohorts):
    """Compare per-cohort logFC estimates with the full-data ("truth") logFC.

    Besides its arguments, the function reads these notebook globals at call
    time: ``datasets``, ``DEAs``, ``all_N_sub``, ``out``, ``FDR`` and ``logFC``.

    Parameters
    ----------
    param_set : str
        Suffix of the per-cohort result tables. "p1" selects the ``lfc0``
        truth table, any other value the ``lfc1`` table.
    n_cohorts : int
        Maximum number of cohort directories (in sorted order) used per N.

    Returns
    -------
    dict
        ``result[data][dea]`` contains
        * one entry per N in ``all_N_sub`` with the lists "RMSE" (one value
          per cohort), "logFC" (cohort estimates) and "Error" (truth minus
          cohort estimate);
        * "small_N_deg_logFC" and "small_N_deg_logFC_truth": cohort and truth
          logFC of the genes called DE in the cohorts of the first N in
          ``all_N_sub``. Both lists are filled cohort by cohort, so they have
          equal length and matching order;
        * "truth": the truth logFC Series.
        Only "deseq2" and "edgerlrt" are processed; entries of other DEAs in
        ``DEAs`` keep their empty lists.
    """
    logFC_test = 0 if param_set == "p1" else 1

    truth_logFC = {
        data: {
            dea: {N: {"RMSE": [], "logFC": [], "Error": []} for N in all_N_sub}
                 | {"small_N_deg_logFC": [], "small_N_deg_logFC_truth": []}
            for dea in DEAs
        }
        for data in datasets if "syn_hom" not in data
    }

    for data in truth_logFC:

        print(data)

        for dea in ["deseq2", "edgerlrt"]:

            path = datasets[data]['datapath'].split(f'/{data}.csv')[0]
            truth_file = f"{path}/{data}.{dea}.lfc{logFC_test}.csv"
            truth_df = pd.read_csv(truth_file, index_col=0, usecols=["Unnamed: 0","logFC"])
            logFC_truth = truth_df["logFC"]
            results = truth_logFC[data][dea]
            results["truth"] = logFC_truth

            for N in all_N_sub:

                path_N = f"{path}/{data}_N{N}"
                # Close the directory iterator deterministically.
                with os.scandir(path_N) as entries:
                    cohorts = sorted(entry.path for entry in entries if entry.is_dir())

                for cohort in cohorts[:n_cohorts]:

                    tab_cohort = open_table(f"{cohort}/tab.{out}.{dea}.{param_set}")

                    # Intersect per cohort: a gene missing from a later cohort
                    # would otherwise make .loc fail.
                    common = tab_cohort.index.intersection(logFC_truth.index)
                    tab_cohort = tab_cohort.loc[common]

                    if N == all_N_sub[0]:
                        DEG = tab_cohort[(tab_cohort["FDR"]<FDR) & (tab_cohort["logFC"].abs()>logFC)]
                        # Record estimate and truth together for every cohort so
                        # the two lists stay aligned (previously only the last
                        # cohort's DEGs were looked up in the truth table).
                        results["small_N_deg_logFC"] += DEG["logFC"].values.tolist()
                        results["small_N_deg_logFC_truth"] += logFC_truth.loc[DEG.index].values.tolist()

                    logFC_cohort = tab_cohort["logFC"]
                    error = logFC_truth.loc[common].values.flatten() - logFC_cohort.values
                    results[N]["RMSE"].append(np.sqrt(np.mean(error**2)))
                    results[N]["Error"] += error.tolist()
                    results[N]["logFC"] += logFC_cohort.tolist()

    return truth_logFC

# don't forget: deseq2 does not show logCPM but counts

In [None]:
# Settings for the inflation analysis. get_truth_logFC reads all_N_sub, FDR, logFC and out
# as globals, so they have to be assigned before the call at the end of this cell.
all_N_sub = [3,9,15]
FDR = 0.01
logFC = 1
out = "none" # outlier method
n_cohorts = 10
# NOTE(review): `overwrite` is not read by any code visible in this notebook.
overwrite = False
paramset = "p1"

truth_logFC = get_truth_logFC(paramset, n_cohorts)

In [None]:
print(truth_logFC["THCA"]["edgerlrt"].keys())

# Show both lengths: only the last expression of a cell is displayed, so the first
# len(...) used to be silently discarded.
(len(truth_logFC["THCA"]["edgerlrt"]["small_N_deg_logFC"]),
 len(truth_logFC["THCA"]["edgerlrt"]["small_N_deg_logFC_truth"]))

# Unpaired designs

In [None]:
# FutureWarnings are silenced again here (the first cell of the notebook applies the same filter).
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

combined_td2 = pd.read_csv("../data/multi/combined_td.p2.csv", index_col=0) # Paired formal lfc 1
combined_td3 = pd.read_csv("../data/multi/combined_td.p3.csv", index_col=0) # Unpaired formal lfc 1

# Rename the method identifier "edgerqlf" in the unpaired table to the display name "edgeR QLF".
combined_td3["DEA"] = combined_td3["DEA"].str.replace("edgerqlf","edgeR QLF")

In [None]:
# Keep only the paired results for N in {3, 7, 15} at FDR 0.05.
keep_rows = (combined_td2["N"].isin([3,7,15]))&(combined_td2["FDR"]==0.05)
combined_td2 = combined_td2[keep_rows]

len(combined_td2),len(combined_td3)

In [None]:
# Grid of box plots: one row per metric, one column per DEA method, paired vs unpaired design.
sns.set_style("whitegrid")

# assign() returns new frames, so the filtered slice combined_td2 is not modified in
# place (in-place column assignment on a slice can raise SettingWithCopyWarning).
combined_td2 = combined_td2.assign(Design="Paired")
combined_td3 = combined_td3.assign(Design="Unpaired")

df = pd.concat([combined_td2, combined_td3])

fig, ax = plt.subplots(4,3,figsize=(14,14), sharex=True, sharey="row")

methods = ["edgeR QLF", "edgeR LRT", "DESeq2"]
pretty_metric = {"median_prec": "Precision",
                "median_deg": "#DEGs",
                "median_rec": "Recall",
                "median_mcc": "MCC"}

for i, met in enumerate(["median_prec","median_rec","median_mcc","median_deg"]):
    for j, method in enumerate(methods):
        df_sub = df[df["DEA"]==method]
        sns.boxplot(data=df_sub, x="N", y=met,hue="Design", ax=ax[i][j])
        # Keep a single legend, in the top-left panel.
        if not (i == j == 0): ax[i][j].legend().remove()
        sns.stripplot(data=df_sub, x="N", y=met,hue="Design",dodge=True, color="black", ax=ax[i][j], legend=False)
        ax[i][j].set(ylabel=pretty_metric[met])
        if i < 3: ax[i][j].set(xlabel="")
        if j > 0: ax[i][j].set(ylabel="")
        # Separate method name and threshold (the title used to read e.g. "edgeR QLF|lfc|>1").
        if i == 0: ax[i][j].set_title(f"{method} (|lfc|>1)")

fig.tight_layout()
figpath = f"../figures/paired_vs_unpaired_lfc1.png"
fig.savefig(figpath)

## Test cases

In [None]:
test = "deseq2"
site = "liver"
data = "LIHC"
N = 3
cohort = 1

# Results from sent batch jobs
f3 = f"/storage/homefs/pd21v747/RNASeqReplicability/data/{site}/{data}/{data}_N{N}/{data}_N{N}_{cohort:04}/tab.none.{test}.p3.feather" # unpaired
f2 = f"/storage/homefs/pd21v747/RNASeqReplicability/data/{site}/{data}/{data}_N{N}/{data}_N{N}_{cohort:04}/tab.none.{test}.p2.feather" # paired
tab_unpaired, tab_paired = open_table(f3), open_table(f2)

# Unpaired and paired results should yield different results:
# count the genes with identical logFC / FDR in both tables.
unpaired_aligned = tab_unpaired.loc[tab_paired.index]
n_genes = len(tab_paired)
print((tab_paired["logFC"]==unpaired_aligned["logFC"]).sum(), n_genes)
(tab_paired["FDR"]==unpaired_aligned["FDR"]).sum(), n_genes

In [None]:
# Re-run DEA here

from DEA import run_dea
import json

config = f"/storage/homefs/pd21v747/RNASeqReplicability/data/{site}/{data}/{data}_N{N}/{data}_N{N}_{cohort:04}/config.json"

# The cohort's config lists the sample columns that belong to it.
with open(config, "r") as f:
    j = json.load(f)
samples_i = j["samples_i"]

df_cohort = pd.read_csv(f"/storage/homefs/pd21v747/RNASeqReplicability/data/{site}/{data}/{data}.csv", index_col=0)[samples_i]


def _dea_kwargs(design):
    """Keyword arguments for run_dea; `filter_expr` is only passed for non-DESeq2 tests."""
    shared = {"cols_to_keep": ["logFC","logCPM","FDR"], "lfc": 1, "design": design}
    return shared if test == "deseq2" else {"filter_expr": False, **shared}


outfile = "../data/test/unpaired.csv"
kwargs = _dea_kwargs("unpaired")
run_dea(df_cohort, outfile, method=test, overwrite=True, **kwargs)

outfile_paired = "../data/test/paired.csv"
kwargs = _dea_kwargs("paired")
run_dea(df_cohort, outfile_paired, method=test, overwrite=True, **kwargs)

In [None]:
# Read the re-run results and count the genes whose FDR is identical in both designs.
df_unpaired, df_paired = (pd.read_csv(result_file, index_col=0) for result_file in (outfile, outfile_paired))
same_fdr = df_paired["FDR"] == df_unpaired.loc[df_paired.index, "FDR"]
same_fdr.sum(), len(df_paired)

In [None]:
# The re-run FDR values must agree (within np.allclose tolerance) with the batch-job tables.
if test != "deseq2":
    assert np.allclose(df_paired["FDR"], tab_paired.loc[df_paired.index]["FDR"])
    assert np.allclose(df_unpaired["FDR"], tab_unpaired.loc[df_unpaired.index]["FDR"])
else:
    # For deseq2 only rows with a non-missing FDR in the re-run are compared
    # (np.allclose treats NaN as unequal by default).
    ix = df_paired[~df_paired["FDR"].isna()].index
    assert np.allclose(df_paired.loc[ix, "FDR"], tab_paired.loc[ix]["FDR"])
    ix = df_unpaired[~df_unpaired["FDR"].isna()].index
    assert np.allclose(df_unpaired.loc[ix, "FDR"], tab_unpaired.loc[ix]["FDR"])

In [None]:
# Number of genes below the FDR threshold: re-run vs batch result, paired first, then unpaired.
fdr = 0.05
for results in (df_paired, tab_paired, df_unpaired, tab_unpaired):
    print(len(results[results["FDR"]<fdr]))

# Non-TCGA Data

**Tuberculosis**

https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6008327/

https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3492754/

https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE107995

- Compare TB vs LTBI/control
- GSE107991 Berry London: test set (n=54) 
- GSE107992 Berry South Africa: validation set (n=51)
- Leicester: active TB (n=53), recent close contacts (n=108)
  - Close contacts: IGRA negative (n=50), IGRA positive (n=49), TB active (n=9)
- LTBI outliers: cluster with active TB

In [None]:
# Read the series-matrix metadata (first 27 lines skipped), use the first column as row
# labels and transpose so that every metadata field becomes a column.
meta = (
    pd.read_csv("../data/GSETB/GSE107995_series_matrix.tsv", sep="\t", on_bad_lines='warn', header=None, skiprows=27)
    .set_index(0)
    .T
)
# Drop the first character of every field name.
meta.columns = [field[1:] for field in meta.columns]
meta = meta.set_index("Sample_title")

In [None]:
# Build a tidy covariate table from the "Sample_characteristics_ch1" columns of meta.
# Work on a copy so that renaming and adding columns does not touch (a slice of) meta.
covariants = meta["Sample_characteristics_ch1"].copy()
# Cells look like "key: value"; the keys of the last row become the column names.
covariants.columns = covariants.iloc[-1,:].str.split(":").str[0].str.strip()
covariants.columns.name = ""
covariants.index.name = "Sample"
# Keep only the value part of every string cell.
covariants = covariants.map(lambda x: x.split(":")[1].strip() if isinstance(x, str) else x)
# The series name is the part of the sample title before "_Sample".
covariants["Series"] = covariants.index.str.split("_Sample").str[0]
covariants = covariants.reset_index()
covariants.head()

In [None]:
# Number of samples per series.
covariants["Series"].value_counts()

In [None]:
# Number of samples per value of "group" within each series.
covariants.groupby("Series")["group"].value_counts()

In [None]:
# Raw counts of the Leicester series, restricted to protein-coding genes; gene name and
# biotype are moved into the row index.
counts = pd.read_csv("../data/GSETB/GSE107994_Raw_counts_Leicester_with_progressor_longitudinal.csv", index_col=0)
counts = counts[counts["Gene_biotype"] == "protein_coding"]
counts = counts.set_index(['Gene_name', 'Gene_biotype'], append=True)

cov = covariants[covariants['Series'] == 'Leicester_with_progressor_longitudinal']

# The count columns must list the same samples, in the same order, as the covariate rows.
# np.array_equal also copes with a length mismatch, where `(a == b).all()` breaks
# because the comparison collapses to a plain bool.
assert np.array_equal(counts.columns.values, cov["Sample"].values), \
    "count columns and covariate samples differ"

# Attach all covariates to the columns as a MultiIndex.
multi_columns = pd.MultiIndex.from_frame(cov)
counts.columns = multi_columns
counts.head()

In [None]:
# Only the last expression of a cell is rendered, so display every summary explicitly
# (display() is provided by the IPython kernel).
for level in ('group', 'smear_result', 'outlier'):
    display(counts.columns.get_level_values(level).value_counts())

In [None]:
# Split the samples by group and keep the first N columns of each, with N the size of
# the smaller group, so that both groups contribute equally many samples.
group_labels = counts.columns.get_level_values("group")
control = counts.iloc[:, group_labels=="Control"]
active_TB = counts.iloc[:, group_labels=="Active_TB"]
N = min(control.shape[1], active_TB.shape[1])
final = pd.concat([control.iloc[:,:N], active_TB.iloc[:,:N]], axis=1)

In [None]:
# Only the last expression of a cell is rendered, so display both summaries explicitly
# (display() is provided by the IPython kernel).
for level in ('group', 'smear_result'):
    display(final.columns.get_level_values(level).value_counts())

In [None]:
# Flatten the indexes before export: rows keep only the "Genes" level, columns become "<Sample>_<group>".
final.index = final.index.get_level_values("Genes")
final.columns = final.columns.get_level_values("Sample").astype(str) + "_" + final.columns.get_level_values("group").astype(str)
# Written under the dataset name "LWPL", which later cells use to select results.
final.to_csv("../data/GSETB/LWPL/LWPL.csv")
final.head()

In [None]:
# Full-dataset edgeR LRT result for LWPL; show the genes with FDR below 0.05.
deg = pd.read_csv("../data/GSETB/LWPL/LWPL.edgerlrt.lfc0.csv", index_col=0)
is_significant = deg["FDR"]<0.05
deg[is_significant]

## Results

In [None]:
# Reload the unpaired results table from disk and apply the same DEA display-name fix as before.
combined_td3 = pd.read_csv("../data/multi/combined_td.p3.csv", index_col=0) # Unpaired formal lfc 1
combined_td3["DEA"] = combined_td3["DEA"].str.replace("edgerqlf","edgeR QLF")


In [None]:
# Rows of the LWPL dataset evaluated at FDR 0.05 and logFC 1.
is_lwpl = combined_td3["Data"]=="LWPL"
at_thresholds = (combined_td3["FDR"]==0.05)&(combined_td3["logFC"]==1)
d = combined_td3[is_lwpl & at_thresholds]

In [None]:
# Table of truth DEGs for LWPL; its row count is reported in the figure title below.
truth = pd.read_csv("../data/GSETB/LWPL/truth.fdr0.05.post_lfc1.lfc1.csv", index_col=0)

In [None]:
# One bar plot per metric (2x2 grid), grouped by cohort size N and coloured by DEA method.
sns.set_style("whitegrid")

fig, ax = plt.subplots(2,2,figsize=(10,10))
ax=ax.flatten()
metric_columns = ["median_prec","median_rec","median_mcc","median_deg"]
for i, met in enumerate(metric_columns):
    panel = ax[i]
    sns.barplot(data=d, x="N",y=met, hue="DEA", ax=panel)
    # Only the last panel keeps its legend.
    if i < 3:
        panel.legend().remove()

fig.suptitle(f"Control vs active TB, |lfc|>1 (formal), 5% FDR, 100 cohorts, truth DEGs: {len(truth)}")
fig.tight_layout()
figpath = f"../figures/LWPL_metrics.png"
fig.savefig(figpath)

In [None]:
# Reload the project modules so that edits to their source take effect without restarting the kernel.
# The sys.modules lookups raise KeyError if "misc" or "DEA" has not been imported yet.
import sys, importlib
importlib.reload(sys.modules["misc"])
importlib.reload(sys.modules["DEA"])

# Re-import so that the name run_dea points at the reloaded module's function.
from DEA import run_dea