# Synthetic data

Generated with the seqgendiff R package: https://dcgerard.github.io/seqgendiff/index.html

In [None]:
import os, sys
import pandas as pd
from pathlib import Path

# Make the project's helper scripts (one directory up, in scripts/) importable
# from this notebook.
modpath = Path("..") / "scripts"
scripts_dir = os.path.relpath(modpath)
sys.path.append(scripts_dir)

# Load the rpy2 IPython extension; the cells below that start with %%R depend on it.
%load_ext rpy2.ipython

In [None]:
%%R

# Run the R profile stored one directory up, then show the library search
# paths that are in effect afterwards.
# NOTE(review): presumably the profile configures the library paths — confirm.
source("../.Rprofile")
.libPaths()

In [None]:
%%R

library(seqgendiff)
library(sva)
library(DESeq2)

In [None]:
# Passed to thin_2group(prop_null = ...) in the R cell below and reused later
# to compute the expected number of non-null genes.
prop_null = 0.9 # proportion of null genes

In [None]:
%%R

# Read the BRCA count table; the first CSV column is used as row names.
# The path is relative to the notebook's working directory, like every other
# path in this notebook, instead of an absolute, user-specific location.
# NOTE(review): assumes ../data is the repository's data directory — confirm.
counts <- read.csv("../data/breast/BRCA/BRCA.csv", row.names=1)

# Subset to first 50 normal tissue samples
# NOTE(review): assumes the first half of the columns are the normal samples
# and the second half the tumour samples — confirm against the data layout.
n_keep <- 50
N <- ncol(counts)/2
stopifnot(N >= n_keep)
counts <- counts[, seq_len(n_keep)]
ncol(counts)

In [None]:
%%R -i prop_null

# Add a simulated two-group signal to the real counts with
# seqgendiff::thin_2group. prop_null comes from the Python namespace (-i);
# the signal is generated by stats::rnorm with mean 0 and sd 0.8.
# NOTE(review): no set.seed() call is visible in this notebook, so the
# simulated data presumably differ between runs — confirm.
signal_args <- list(mean = 0, sd = 0.8)
thout <- thin_2group(
    mat = data.matrix(counts),
    prop_null = prop_null,
    signal_fun = stats::rnorm,
    signal_params = signal_args
)

# Disabled: surrogate variable estimation (sva) on the log2 thinned counts.
# X <- cbind(thout$design_obs, thout$designmat)
# Y <- log2(thout$mat + 0.5)
# n_sv <- num.sv(dat = Y, mod = X)
# svout <- sva(dat = Y, mod = X, n.sv = n_sv)

In [None]:
%%R

# Ground truth
# Show the first 10 rows of the coefficient matrix returned by thin_2group.
# NOTE(review): presumably one simulated effect per gene, zero for null
# genes — confirm against the seqgendiff documentation.
head(thout$coefmat, 10)

In [None]:
%%R

# Reorder the thinned count matrix so that all samples with design value 0
# come first, followed by all samples with design value 1, then save it.
designmat <- thout$designmat
class0 <- which(designmat == 0)
class1 <- which(designmat == 1)

# drop = FALSE keeps a matrix even if a group contains a single sample.
c0 <- thout$mat[, class0, drop = FALSE]
c1 <- thout$mat[, class1, drop = FALSE]
thout_mat <- cbind(c0, c1)

# write.csv() always writes the header and ignores `col.names` with a warning,
# so that argument is not passed.
write.csv(thout_mat, "../data/test/thout.csv", row.names = TRUE, quote = FALSE)

## Differential expression test

In [None]:
# Load the thinned counts written by the R cell above; the first CSV column
# becomes the index.
thout = pd.read_csv("../data/test/thout.csv", index_col=0)

# Label the first half of the columns N1..N<N> and the second half T1..T<N>.
# NOTE(review): assumes both simulated groups have the same size — confirm.
N = thout.shape[1] // 2
labels = [f"{prefix}{i}" for prefix in ("N", "T") for i in range(1, N + 1)]
thout.columns = labels
thout.head()

In [None]:
from DEA import run_dea

# Keyword options forwarded to run_dea for the "edgerqlf" method.
edgerqlf_kwargs = dict(
    filter_expr=False,
    cols_to_keep=["logFC", "logCPM", "FDR"],
    lfc=0,
    design="unpaired",
    check_gof=False,
    verbose=False,
)

# Results table is written here and read back in the next cell.
outfile = "../data/test/thout.qlf.csv"

# NOTE(review): the meaning of the fourth positional argument (True) is
# defined by run_dea and is not visible here — confirm.
run_dea(thout, outfile, "edgerqlf", True, **edgerqlf_kwargs)

In [None]:
# Read the differential expression results back in.
tab = pd.read_csv(outfile, index_col=0)

# Display: (expected number of non-null genes, number of genes with
# FDR < 0.05, total number of genes).
n_genes = len(tab)
n_significant = int((tab["FDR"] < 0.05).sum())
n_genes * (1 - prop_null), n_significant, n_genes

# Semi-synthetic data

Create semi-synthetic data by merging real data and subsampling DEGs and non-DEGs.

In [None]:
import io
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Root directory of the per-site data folders, relative to the notebook.
datapath = Path("..") / "data"

In [None]:
# 8 cancer types for main results.
# Maps the site folder name to the cohort code used for the sub-folder and
# the count file name.
sites = dict(
    liver="LIHC",
    thyroid="THCA",
    lung="LUAD",
    lung2="LUSC",
    kidney="KIRC",
    colorectal="COAD",
    breast="BRCA",
    prostate="PRAD",
)

In [None]:
# Number of leading columns taken from each half of a site's count table.
# Sites whose half-size is smaller than this are skipped.
n_replicates = 50
replicate_labels = [f"N{i}" for i in range(1, n_replicates + 1)] + [
    f"T{i}" for i in range(1, n_replicates + 1)
]

counts_list = []
for s, code in sites.items():
    site_dir = Path(datapath) / s / code
    f = site_dir / f"{code}.csv"
    counts = pd.read_csv(f, index_col=0)

    # NOTE(review): assumes the first half of the columns are the normal
    # samples and the second half the tumour samples — confirm.
    N = len(counts.columns) // 2
    if N < n_replicates:
        print(f"{s} has < {n_replicates} replicates, skipping")
        continue

    # Keep the first n_replicates columns of each half and relabel them
    # N1..N<n>, T1..T<n> so that all sites share the same column names.
    keep = list(range(n_replicates)) + list(range(N, N + n_replicates))
    counts = counts.iloc[:, keep]
    counts.columns = replicate_labels

    # Flag the rows listed in the site's ground-truth table as DEGs.
    f = site_dir / "truth.fdr0.01.post_lfc1.lfc1.csv"
    truth = pd.read_csv(f, index_col=0)
    counts["isDEG"] = False
    counts.loc[truth.index, "isDEG"] = True

    counts_list.append(counts)

# Stack all sites row-wise; the original row labels are replaced by a fresh
# 0..n-1 index.
counts = pd.concat(counts_list, axis=0, ignore_index=True)