In [None]:
# All relative paths used in this file are resolved against this project directory.
setwd('/srv/scratch/nsabell/mpra-v2')
# NOTE(review): this only creates a variable named `lib.loc`; it does not change
# where library() searches. The library() calls that need the user library pass
# lib.loc = "~/R" explicitly as an argument.
lib.loc='~/R'
library(tidyverse)
library(GGally)
library(RColorBrewer)
library(viridis)
library(ggrepel)
library(data.table)
library(ggpubr)
library(patchwork,lib.loc = "~/R")
library(ggridges)
library(epitools, lib.loc="~/R")
library(ggExtra)
library(naniar, lib.loc="~/R")
library(pROC)
library(fs)

In [None]:
# ---- Load coordinate conversion table and MPRA summary statistics ----

# De-duplicated coordinate conversion table (carries the hg38_* / hg19_* columns
# that the ADASTRA merges key on).
hgConvTable <- unique(fread("sumstats-new/hgConversionTable.txt",
                            header = TRUE, stringsAsFactors = FALSE))

# Per-variant MPRA statistics. Alternative model outputs are kept for reference.
mpraStats <- read.table("sumstats-new/1KG_novaSeq_DESeq2_Love.txt", header = TRUE, stringsAsFactors = FALSE)
#mpraStats = read.table("sumstats-new/1KG_novaSeq_DESeq2_contrast.txt", header = T, stringsAsFactors = F)
#mpraStats = read.table("sumstats-new/1KG_novaSeq_DESeq2_interaction.txt", header = T, stringsAsFactors = F)

# The allelic metric is attached purely by row position. `$<-` on a data.frame
# silently recycles a shorter vector whose length divides nrow(), which would
# misalign every value, so require matching row counts up front.
# NOTE(review): row *order* is also assumed to match mpraStats -- confirm.
alleleTransform <- fread("allelicMetric.txt", sep = "\t", stringsAsFactors = FALSE)
stopifnot(nrow(alleleTransform) == nrow(mpraStats))
mpraStats$alleleTransform <- alleleTransform$V2

# Keep a fixed set of columns (selected by position) and only rows flagged "Base".
mpraStats.tmp <- mpraStats[, c(3, 4, 5, 6, 7, 8, 9, 23, 27, 29, 33, 34)]
mpraStats.tmp <- subset(mpraStats.tmp, haploFlag == "Base")

In [None]:
#' Per-category odds ratios against the unannotated (NA) background
#'
#' For every distinct non-NA value of `categories`, cross-tabulates `labels`
#' against "row belongs to this category" vs. "row has no category (NA)", adds
#' a pseudocount of 1 to every cell, and passes the table to `oddsratio()`
#' (with `correction = TRUE`). The values kept are the second row of the
#' returned `measure` and `p.value` matrices, i.e. the second label level
#' (alphabetical order for non-factor labels).
#'
#' @param labels Vector of outcome labels, one per observation. NA labels are
#'   ignored when the table is built.
#' @param categories Vector of the same length as `labels`; NA marks the
#'   background observations every category is compared against.
#' @param verbose If TRUE (the default, matching the previous behaviour) the
#'   contingency table and the `oddsratio()` result are printed per category.
#' @return A data.frame with one row per category and the columns `label`,
#'   `or`, `lower`, `upper`, `midp.exact`, `fisher.exact`, `chi.square` and
#'   `fisher.exact.bh` (Benjamini-Hochberg adjusted `fisher.exact`).
computeOddsRatios <- function(labels, categories, verbose = TRUE) {

    stopifnot(length(labels) == length(categories))

    # Zero-row template: fixes the column layout and is what is returned
    # (plus the BH column) when there is no category to test.
    emptyResult <- data.frame("label" = character(),
                              "or" = numeric(),
                              "lower" = numeric(),
                              "upper" = numeric(),
                              "midp.exact" = numeric(),
                              "fisher.exact" = numeric(),
                              "chi.square" = numeric(), stringsAsFactors = FALSE)

    isBackground <- is.na(categories)
    uniqCategories <- as.character(unique(categories[!isBackground]))

    # Use one fixed set of label levels for every category so the row order of
    # the table (and therefore the meaning of "row 2") cannot vary by subset.
    labelLevels <- if (is.factor(labels)) {
        levels(labels)
    } else {
        sort(unique(labels[!is.na(labels)]))
    }
    if (length(uniqCategories) > 0 && length(labelLevels) < 2) {
        stop("computeOddsRatios: `labels` needs at least two distinct non-NA values",
             call. = FALSE)
    }
    labelFactor <- factor(labels, levels = labelLevels)

    perCategory <- lapply(uniqCategories, function(categ) {

        idx <- which(categories == categ | isBackground)

        # Explicit FALSE/TRUE levels keep both columns even when there are no
        # background rows at all; the pseudocount then fills the empty cells.
        inBackground <- factor(isBackground[idx], levels = c(FALSE, TRUE))
        testTable <- table(labelFactor[idx], inBackground, dnn = NULL) + 1

        # Computed once; the same object is printed and used for the estimates.
        testOddsRatio <- suppressWarnings(oddsratio(testTable, correction = TRUE))
        if (verbose) {
            print(testTable)
            print(testOddsRatio)
        }

        estimates <- testOddsRatio$measure[2, ]
        pvals <- testOddsRatio$p.value[2, ]

        # unname() stops the matrix column name from becoming a row name.
        data.frame("label" = categ,
                   "or" = unname(estimates[1]),
                   "lower" = unname(estimates[2]),
                   "upper" = unname(estimates[3]),
                   "midp.exact" = unname(pvals[1]),
                   "fisher.exact" = unname(pvals[2]),
                   "chi.square" = unname(pvals[3]), stringsAsFactors = FALSE)
    })

    # Bind once instead of growing the data.frame inside the loop.
    oddsRatios <- do.call(rbind, c(list(emptyResult), perCategory))
    rownames(oddsRatios) <- NULL

    oddsRatios$fisher.exact.bh <- p.adjust(oddsRatios$fisher.exact, method = "BH")
    oddsRatios
}

## ADASTRA 2020 ASB 

In [None]:
# Read the ADASTRA table, split the comma-separated `TF-ASBs` field into one
# row per entry, and drop rows whose entry is empty.
adastra <- fread("GM12878-ADASTRA.tsv", header = TRUE, sep = "\t", stringsAsFactors = FALSE) %>%
    separate_rows(`TF-ASBs`, sep = ",") %>%
    subset(`TF-ASBs` != "")

# Keep rows whose `Cell type-ASBs` field mentions GM12878, then strip the
# "_HUMAN" text from the TF identifiers.
adastra <- adastra[grepl("GM12878", adastra$`Cell type-ASBs`), ]
adastra$`TF-ASBs` <- gsub("_HUMAN", "", adastra$`TF-ASBs`)

# Attach the conversion-table columns by matching ADASTRA coordinates to the
# hg38 columns of the conversion table.
adastra.id <- merge(
    adastra, hgConvTable,
    by.x = c("Chromosome", "Position"),
    by.y = c("hg38_chrom", "hg38_pos")
)

# Left join: every MPRA variant is kept; variants without an ADASTRA match get
# NA in all ADASTRA columns.
mpra.adastra.all <- merge(
    mpraStats.tmp, adastra.id,
    by.x = c("chrom", "pos", "ref", "alt"),
    by.y = c("hg19_chrom", "hg19_pos", "Ref", "Alt"),
    all.x = TRUE
)

# Inner join: only MPRA variants that have an ADASTRA match.
mpra.adastra <- merge(
    mpraStats.tmp, adastra.id,
    by.x = c("chrom", "pos", "ref", "alt"),
    by.y = c("hg19_chrom", "hg19_pos", "Ref", "Alt")
)

# Alt-minus-Ref effect size, and whether its sign agrees with the sign of the
# MPRA allelic log2 fold change.
mpra.adastra$deltaEffect <- with(
    mpra.adastra,
    `GM12878 (female B-cells lymphoblastoid cell line)_Effect_Size_Alt` -
        `GM12878 (female B-cells lymphoblastoid cell line)_Effect_Size_Ref`
)
mpra.adastra$concord <- with(
    mpra.adastra,
    ifelse(sign(deltaEffect) == sign(log2FoldChange_allele), "Concordant", "Discordant")
)

In [None]:
# Category of each row: the TF with allele-specific binding, or NA for variants
# without an ADASTRA match (these form the background in computeOddsRatios).
categs <- mpra.adastra.all$`TF-ASBs`
hasAsb <- !is.na(categs)

# MPRA hit: significant for both the allelic and the expression test.
mpraHit <- mpra.adastra.all$padj_allele <= 5e-2 & mpra.adastra.all$padj_expr <= 5e-2

# ASB filter. `>= 0` is TRUE for every non-missing, non-negative FDR, so it is
# effectively a placeholder threshold for annotated rows.
asbPass <- mpra.adastra.all$`GM12878 (female B-cells lymphoblastoid cell line)_FDR_Ref` >= 0 |
           mpra.adastra.all$`GM12878 (female B-cells lymphoblastoid cell line)_FDR_Alt` >= 0

# The FDR columns are NA for unannotated rows (left join). Applying the ASB
# filter to them turned every background MPRA hit into NA (TRUE & NA), which
# table() then dropped, so the background "hit" cell contained only the
# pseudocount and every odds ratio was inflated. The filter is therefore
# applied to annotated rows only.
hitList <- ifelse(mpraHit & (!hasAsb | asbPass), "hit", "nonhit")

oddsRatios <- computeOddsRatios(hitList, categs)

In [None]:
# Panel A: point-and-interval plot of log2 odds ratios for the categories whose
# BH-adjusted Fisher p-value is below 0.05, ordered by log2(or), with a red
# reference line at 0.
pA <- oddsRatios %>%
    subset(fisher.exact.bh < 5e-2) %>%
    ggplot(aes(x = log2(or), y = reorder(label, log2(or)))) +
        geom_pointrange(aes(xmin = log2(lower), xmax = log2(upper)), stat = "identity") +
        geom_vline(aes(xintercept = 0), color = "red") +
        theme_pubr(base_size = 15) +
        theme(axis.title.y = element_blank()) +
        labs(x = "log2(odds ratio)", y = "")

In [None]:
# Variants shown in panel B.
# NOTE(review): padj_expr uses 5e-6 here but 5e-2 where hitList is built --
# confirm the stricter cut-off is intended.
# The `>= 0` FDR conditions only remove rows where both FDR values are missing.
mpra.adastra.subset <- mpra.adastra %>% subset(padj_expr <= 5e-6 &
                                               padj_allele <= 5e-2 &
                                               (`GM12878 (female B-cells lymphoblastoid cell line)_FDR_Ref` >= 0 |
                                                `GM12878 (female B-cells lymphoblastoid cell line)_FDR_Alt` >= 0))

# Top of panel B: one point per variant. A variant can appear once per TF, so
# rows are collapsed per variant; the x value is the largest deltaEffect among
# its rows and `tf` lists the TFs involved (used only by the optional labels).
# (An earlier per-TF coloured scatter that was assigned to p1 and immediately
# overwritten by this plot has been removed.)
p1 <- mpra.adastra.subset %>%
            group_by(chrom, pos, ref, alt, log2FoldChange_allele) %>%
            summarize("maxEffect" = max(deltaEffect),
                      "tf" = paste0(unique(`TF-ASBs`), collapse = ",\n")) %>%
            ungroup() %>%
            ggplot(aes(y = log2FoldChange_allele, x = maxEffect)) +
                #geom_text_repel(aes(label=tf, vjust=0), position = "jitter") +
                geom_point(size = 3) +
                theme_pubr(base_size = 15) +
                theme(legend.position = "none") +
                xlab("ADASTRA Effect Size") + ylab("MPRA Allelic Effect Size")

# Bottom of panel B: per TF, number of concordant (blue, to the right) and
# discordant (red, to the left) variants, ordered by their total.
p2 <- mpra.adastra.subset %>%
            group_by(`TF-ASBs`, concord) %>%
            summarize("count" = n()) %>%
            ungroup() %>%
            spread(concord, count, fill = 0) %>%
            ggplot(aes(y = reorder(`TF-ASBs`, Concordant + Discordant),
                       yend = reorder(`TF-ASBs`, Concordant + Discordant))) +
                geom_segment(aes(x = -1 * Discordant, xend = 0), size = 9, color = "red") +
                geom_segment(aes(x = 0, xend = Concordant), size = 9, color = "blue") +
                geom_vline(xintercept = 0) +
                theme_pubr(base_size = 15) +
                xlab("Number of Variants") + ylab("")

# Stack the two plots vertically (patchwork).
pB <- p1 / p2

In [None]:
# Write the combined figure (panel A beside panel B) to a 15 x 12 inch PDF.
options(repr.plot.width = 15, repr.plot.height = 12)
pdf("Figure2-adastra.pdf", width = 15, height = 12, useDingbats = FALSE)
# Explicit print(): a bare plot expression is only drawn when R auto-prints
# top-level values, so under source() the PDF would otherwise be left empty.
print(pA + pB)
dev.off()