PTEN_composite
PTEN fill-in abundance DMS and integration with yeast activity scores
PTEN composite analysis
Kenneth Matreyek 6/19/2020
rm(list = ls())
knitr::opts_chunk$set(echo = TRUE)
set.seed(1234)
if(!require(reshape)){install.packages("reshape")}
## Loading required package: reshape
library(reshape)
if(!require(ggrepel)){install.packages("ggrepel")}
## Loading required package: ggrepel
## Loading required package: ggplot2
library(ggrepel)
if(!require(uwot)){install.packages("uwot")}
## Loading required package: uwot
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:reshape':
##
## expand
library(uwot)
if(!require(scales)){install.packages("scales")}
## Loading required package: scales
library(scales)
if(!require(MASS)){install.packages("MASS")}
## Loading required package: MASS
library(MASS)
if(!requireNamespace("BiocManager", quietly = TRUE)){install.packages("BiocManager")}
if(!require(ggtree)){install.packages(BiocManager::install("ggtree"))}
## Loading required package: ggtree
## ggtree v3.0.2 For help: https://yulab-smu.top/treedata-book/
##
## If you use ggtree in published research, please cite the most appropriate paper(s):
##
## 1. Guangchuang Yu. Using ggtree to visualize data on tree-like structures. Current Protocols in Bioinformatics, 2020, 69:e96. doi:10.1002/cpbi.96
## 2. Guangchuang Yu, Tommy Tsan-Yuk Lam, Huachen Zhu, Yi Guan. Two methods for mapping and visualizing associated data on phylogeny using ggtree. Molecular Biology and Evolution 2018, 35(12):3041-3043. doi:10.1093/molbev/msy194
## 3. Guangchuang Yu, David Smith, Huachen Zhu, Yi Guan, Tommy Tsan-Yuk Lam. ggtree: an R package for visualization and annotation of phylogenetic trees with their covariates and other associated data. Methods in Ecology and Evolution 2017, 8(1):28-36. doi:10.1111/2041-210X.12628
##
## Attaching package: 'ggtree'
## The following object is masked from 'package:Matrix':
##
## expand
## The following object is masked from 'package:reshape':
##
## expand
library(ggtree)
if(!require(reldist)){install.packages("reldist")}
## Loading required package: reldist
## reldist: Relative Distribution Methods
## Version 1.6-6 created on 2016-10-07.
## copyright (c) 2003, Mark S. Handcock, University of California-Los Angeles
## For citation information, type citation("reldist").
## Type help(package="reldist") to get started.
library(reldist)
if(!require(tidyverse)){install.packages("tidyverse")}
## Loading required package: tidyverse
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ tibble 3.1.4 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 2.0.1 ✓ forcats 0.5.1
## ✓ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x readr::col_factor() masks scales::col_factor()
## x purrr::discard() masks scales::discard()
## x tidyr::expand() masks ggtree::expand(), Matrix::expand(), reshape::expand()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x tidyr::pack() masks Matrix::pack()
## x dplyr::rename() masks reshape::rename()
## x dplyr::select() masks MASS::select()
## x tidyr::unpack() masks Matrix::unpack()
library(tidyverse)
#if(!require(phytools)){install.packages("phytools")}
#library(phytools)
theme_new <- theme_set(theme_bw())
theme_new <- theme_update(panel.background = element_blank(),
panel.border = element_blank(),
panel.grid.major = element_line(colour = "grey95"),
panel.grid.minor = element_blank())
to_single_notation <- function(arg1){
if(toupper(arg1) == "ALA"){return("A")}
if(toupper(arg1) == "CYS"){return("C")}
if(toupper(arg1) == "ASP"){return("D")}
if(toupper(arg1) == "GLU"){return("E")}
if(toupper(arg1) == "PHE"){return("F")}
if(toupper(arg1) == "GLY"){return("G")}
if(toupper(arg1) == "HIS"){return("H")}
if(toupper(arg1) == "ILE"){return("I")}
if(toupper(arg1) == "LYS"){return("K")}
if(toupper(arg1) == "LEU"){return("L")}
if(toupper(arg1) == "MET"){return("M")}
if(toupper(arg1) == "ASN"){return("N")}
if(toupper(arg1) == "PRO"){return("P")}
if(toupper(arg1) == "GLN"){return("Q")}
if(toupper(arg1) == "ARG"){return("R")}
if(toupper(arg1) == "SER"){return("S")}
if(toupper(arg1) == "THR"){return("T")}
if(toupper(arg1) == "VAL"){return("V")}
if(toupper(arg1) == "TRP"){return("W")}
if(toupper(arg1) == "TYR"){return("Y")}
if(toupper(arg1) == "TER"){return("X")}
}
Looking at the total number of barcodes in this secondary library
plasmid1 <- read.delim(file = "enrich2_output/Plasmid_1.tsv", sep = "\t"); colnames(plasmid1) <- c("barcode","count1")
plasmid2 <- read.delim(file = "enrich2_output/Plasmid_2.tsv", sep = "\t"); colnames(plasmid2) <- c("barcode","count2")
plasmid <- merge(plasmid1, plasmid2, by = "barcode", all = T)
plasmid[is.na(plasmid)] <- 0
plasmid$total <- plasmid$count1 + plasmid$count2
plasmid_barcodes <- plasmid %>% filter(total >= 20)
This is the scoring portion
pten_seq <- "MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV"
map <- read.delim(file = "input_datatables/PTEN_variant_barcode_subassembly.tsv", sep = "\t", header= TRUE, stringsAsFactors = FALSE)
map$aa_pos_1 <- 0
map$aa_pos_2 <- 0
map$aa_pos_3 <- 0
map$aa_pos_4 <- 0
for(x in 1:nrow(map)){
map$aa_pos_1[x] <- as.double(gsub("[^0-9]","",unlist(strsplit(unlist(strsplit(as.character(map$value[x]), " \\(p"))[2],")"))[1]))
map$aa_pos_2[x] <- as.double(gsub("[^0-9]","",unlist(strsplit(unlist(strsplit(as.character(map$value[x]), " \\(p"))[3],")"))[1]))
map$aa_pos_3[x] <- as.double(gsub("[^0-9]","",unlist(strsplit(unlist(strsplit(as.character(map$value[x]), " \\(p"))[4],")"))[1]))
map$aa_pos_4[x] <- as.double(gsub("[^0-9]","",unlist(strsplit(unlist(strsplit(as.character(map$value[x]), " \\(p"))[5],")"))[1]))
}
## Get all unique amino acid change positions from above
map$change1 <- 0
map$change2 <- 0
map$change3 <- 0
for(x in 1:nrow(map)){
map$change1[x] <- unique(c(map$aa_pos_1[x], map$aa_pos_2[x], map$aa_pos_3[x], map$aa_pos_4[x])[c(map$aa_pos_1[x], map$aa_pos_2[x], map$aa_pos_3[x], map$aa_pos_4[x]) != ""])[1]
map$change2[x] <- unique(c(map$aa_pos_1[x], map$aa_pos_2[x], map$aa_pos_3[x], map$aa_pos_4[x])[c(map$aa_pos_1[x], map$aa_pos_2[x], map$aa_pos_3[x], map$aa_pos_4[x]) != ""])[2]
map$change3[x] <- unique(c(map$aa_pos_1[x], map$aa_pos_2[x], map$aa_pos_3[x], map$aa_pos_4[x])[c(map$aa_pos_1[x], map$aa_pos_2[x], map$aa_pos_3[x], map$aa_pos_4[x]) != ""])[3]
if(is.na(map$change1[x] & !is.na(map$change2[x]))){map$change1[x] <- map$change2[x]}
if(is.na(map$change2[x] & map$change1[x] & !is.na(map$change3[x]))){map$change1[x] <- map$change3[x]}
}
## Find the number of mutations for each variant
map$mut_number <- 0
for(x in 1:nrow(map)){
map$mut_number[x] <- 3-sum(is.na(map[x,c("change1","change2","change3")]))
}
##
map_singles <- subset(map, mut_number == 1)
map_wt <- subset(map, value == "_wt" & mut_number == 0)
map_syn <- subset(map, !is.na(value) & value != "_wt" & mut_number == 0)
map_singles$start_aa3 <- "NA"
map_singles$end_aa3 <- "NA"
tempy <- subset(map_singles, aa_pos_1 == 1)
for(x in 1:nrow(map_syn)){
map_syn$position1[x] <- as.integer(as.numeric(substr(unlist(strsplit(x = as.character(map_syn$value[x]), split = "\\(p."))[1],3,
nchar(unlist(strsplit(x = as.character(map_syn$value[x]), split = "\\(p."))[1])-4))/3 - 1/6) + 1
map_syn$position2[x] <- as.integer(as.numeric(substr(unlist(strsplit(x = as.character(map_syn$value[x]), split = "\\(p."))[2],3,
nchar(unlist(strsplit(x = as.character(map_syn$value[x]), split = "\\(p."))[1])-4))/3 - 1/6) + 1
map_syn$position3[x] <- as.integer(as.numeric(substr(unlist(strsplit(x = as.character(map_syn$value[x]), split = "\\(p."))[3],3,
nchar(unlist(strsplit(x = as.character(map_syn$value[x]), split = "\\(p."))[1])-4))/3 - 1/6) + 1
map_syn$position4[x] <- as.integer(as.numeric(substr(unlist(strsplit(x = as.character(map_syn$value[x]), split = "\\(p."))[4],3,
nchar(unlist(strsplit(x = as.character(map_syn$value[x]), split = "\\(p."))[1])-4))/3 - 1/6) + 1
map_syn$change1[x] <- unique(c(map_syn$position1[x], map_syn$position2[x], map_syn$position3[x], map_syn$position4[x])[c(map_syn$position1[x], map_syn$position2[x], map_syn$position3[x], map_syn$position4[x]) != ""])[1]
map_syn$change2[x] <- unique(c(map_syn$position1[x], map_syn$position2[x], map_syn$position3[x], map_syn$position4[x])[c(map_syn$position1[x], map_syn$position2[x], map_syn$position3[x], map_syn$position4[x]) != ""])[2]
map_syn$change3[x] <- unique(c(map_syn$position1[x], map_syn$position2[x], map_syn$position3[x], map_syn$position4[x])[c(map_syn$position1[x], map_syn$position2[x], map_syn$position3[x], map_syn$position4[x]) != ""])[3]
map_syn$start[x] <- substr(pten_seq,map_syn$change1[x],map_syn$change1[x])
map_syn$end[x] <- map_syn$start[x]
map_syn$variant[x] <- paste(map_syn$start[x],map_syn$change1[x],map_syn$end[x],sep="")
}
for(x in 1:nrow(map_singles)){
map_singles$start_aa3[x] <- as.character(gsub("[0-9]","",substr(unlist(strsplit(x = as.character(map_singles$value[x]), split = "\\(p."))[2],1,4)))
if(map_singles$start_aa3[x] == "=), "){
map_singles$start_aa3[x] <- as.character(gsub("[0-9]","",substr(unlist(strsplit(x = as.character(map_singles$value[x]), split = "\\(p."))[3],1,4)))
}
if(map_singles$start_aa3[x] == "=), "){
map_singles$start_aa3[x] <- as.character(gsub("[0-9]","",substr(unlist(strsplit(x = as.character(map_singles$value[x]), split = "\\(p."))[4],1,4)))
}
map_singles$end_aa3[x] <- substr(unlist(strsplit(x = as.character(map_singles$value[x]), split = ")"))[1],nchar(unlist(strsplit(x = as.character(map_singles$value[x]), split = ")"))[1]) - 2,nchar(unlist(strsplit(x = as.character(map_singles$value[x]), split = ")"))[1]))
if(map_singles$end_aa3[x] == "p.="){
map_singles$end_aa3[x] <- substr(unlist(strsplit(x = as.character(map_singles$value[x]), split = ")"))[2],nchar(unlist(strsplit(x = as.character(map_singles$value[x]), split = ")"))[2]) - 2,nchar(unlist(strsplit(x = as.character(map_singles$value[x]), split = ")"))[2]))
}
if(map_singles$end_aa3[x] == "p.="){
map_singles$end_aa3[x] <- substr(unlist(strsplit(x = as.character(map_singles$value[x]), split = ")"))[3],nchar(unlist(strsplit(x = as.character(map_singles$value[x]), split = ")"))[3]) - 2,nchar(unlist(strsplit(x = as.character(map_singles$value[x]), split = ")"))[3]))
}
}
for(x in 1:nrow(map_singles)){
map_singles$position[x] <- map_singles$aa_pos_1[x]
if(map_singles$position[x] == "" | is.na(map_singles$position[x])){
map_singles$position[x] <- map_singles$aa_pos_2[x]
}
}
map_singles$start <- "NA"
map_singles$end <- "NA"
map_singles$variant <- "NA"
for(x in 1:nrow(map_singles)){
map_singles$start[x] <- to_single_notation(map_singles$start_aa3[x])
map_singles$end[x] <- to_single_notation(map_singles$end_aa3[x])
map_singles$variant[x] <- paste(map_singles$start[x],map_singles$position[x],map_singles$end[x], sep="")
}
map_nonsense <- subset(map_singles, end == "X")
map_missense <- subset(map_singles, end != "X")
map_syn$position <- map_syn$position1
map_wt$class <- "wt"
map_wt$position <- 0
map_wt$start <- "Z"
map_wt$end <- "Z"
map_syn$class <- "syn"
map_nonsense$class <- "nonsense"
map_missense$class <- "missense"
map_wt$variant <- "wt"
PTEN_library_variant_map <- rbind(map_wt[,c("barcode","variant","position","start","end","class")], map_syn[,c("barcode","variant","position","start","end","class")],map_nonsense[,c("barcode","variant","position","start","end","class")],map_missense[,c("barcode","variant","position","start","end","class")])
write.csv(file = "output_datatables/PTEN_library_variant_map.csv", PTEN_library_variant_map, row.names = FALSE, quote = FALSE)
PTEN_subassembly_maps <- read.csv(file = "input_datatables/PTEN_subassembly_maps.csv", header = T, stringsAsFactors = F)
original <- subset(PTEN_subassembly_maps, library == "original")
fillin <- subset(PTEN_subassembly_maps, library == "fillin")
length(unique(original$variant))
## [1] 5192
length(unique(fillin$variant))
## [1] 1255
original_variants <- data.frame("variant" = unique(original$variant), "library" = "original")
fillin_variants <- data.frame("variant" = unique(fillin$variant), "library" = "fillin")
combined_variants <- merge(original_variants, fillin_variants, by = "variant", all = T)
combined_variants <- combined_variants[!(combined_variants$variant == "wt"),]
combined_variants$library <- "none"
for(x in 1:nrow(combined_variants)){
if(!is.na(combined_variants$library.x[x]) & is.na(combined_variants$library.y[x])){combined_variants$library[x] <- "original"}
if(is.na(combined_variants$library.x[x]) & !is.na(combined_variants$library.y[x])){combined_variants$library[x] <- "fillin"}
if(!is.na(combined_variants$library.x[x]) & !is.na(combined_variants$library.y[x])){combined_variants$library[x] <- "both"}
}
combined_variants$position <- gsub("[A-Z]", "", combined_variants$variant)
combined_variants$start <- substr(gsub("[^A-Z]", "", combined_variants$variant),1,1)
combined_variants$end <- substr(gsub("[^A-Z]", "", combined_variants$variant),2,2)
combined_variants$position <- as.numeric(combined_variants$position)
combined_variants$end <- factor(combined_variants$end, levels = c("G","S","T","C","A","M","W","Y","F","L","I","V","D","E","N","Q","H","K","R","P","X"))
combined_variants$library <- factor(combined_variants$library, levels = c("fillin","original","both"))
PTEN_library_heatmap <- ggplot() + geom_tile(data = combined_variants, aes(x = position, y = end, fill = library)) + scale_x_continuous(expand = c(0,0)) + scale_fill_manual(values = c("brown3","cornflowerblue","black")) + theme(panel.background = element_rect("grey80"), panel.grid.major = element_blank())
ggsave(fil = "Plots/PTEN_library_heatmap.pdf", PTEN_library_heatmap, height = 3, width = 15)
PTEN_library_heatmap
combined_variants$count <- 1
combined_variants2 <- combined_variants %>% group_by(position,library) %>% summarize(count = sum(count))
## `summarise()` has grouped output by 'position'. You can override using the `.groups` argument.
#combined_variants2$library <- factor(combined_variants2$library, levels = c("fillin","original","both"))
PTEN_library_bargraph <- ggplot() + geom_bar(data = combined_variants2, aes(x = position, y = count, fill = library), stat = "identity") + scale_fill_manual(values = c("brown3","cornflowerblue","black")) + scale_x_continuous(expand = c(0,0)) + scale_y_continuous(expand = c(0,0))
ggsave(fil = "plots/PTEN_library_bargraph.pdf", PTEN_library_bargraph, height = 3, width = 15)
PTEN_library_bargraph
e1s1_1a1 <- read.delim(file = "enrich2_output/Rep1_Bin1_PCR1_Seq1.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s1_1a2 <- read.delim(file = "enrich2_output/Rep1_Bin1_PCR1_Seq2.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s1_1a3 <- read.delim(file = "enrich2_output/Rep1_Bin1_PCR1_Seq3.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s1_1a <- merge(e1s1_1a1, e1s1_1a2, by = "X", all = T); e1s1_1a <- merge(e1s1_1a, e1s1_1a3, by = "X", all = T); e1s1_1a[is.na(e1s1_1a)] <- 0
e1s1_1a$count <- rowSums(e1s1_1a[,2:4]); e1s1_1a <- e1s1_1a[,c("X","count")]
e1s1_1b1 <- read.delim(file = "enrich2_output/Rep1_Bin1_PCR2_Seq1.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s1_1b2 <- read.delim(file = "enrich2_output/Rep1_Bin1_PCR2_Seq1.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s1_1b3 <- read.delim(file = "enrich2_output/Rep1_Bin1_PCR2_Seq1.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s1_1b <- merge(e1s1_1b1, e1s1_1b2, by = "X", all = T); e1s1_1b <- merge(e1s1_1b, e1s1_1b3, by = "X", all = T); e1s1_1b[is.na(e1s1_1b)] <- 0
e1s1_1b$count <- rowSums(e1s1_1b[,2:4]); e1s1_1b <- e1s1_1b[,c("X","count")]
e1s1_2a1 <- read.delim(file = "enrich2_output/Rep1_Bin2_PCR1_Seq1.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s1_2a2 <- read.delim(file = "enrich2_output/Rep1_Bin2_PCR1_Seq2.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s1_2a3 <- read.delim(file = "enrich2_output/Rep1_Bin2_PCR1_Seq3.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s1_2a <- merge(e1s1_2a1, e1s1_2a2, by = "X", all = T); e1s1_2a <- merge(e1s1_2a, e1s1_2a3, by = "X", all = T); e1s1_2a[is.na(e1s1_2a)] <- 0
e1s1_2a$count <- rowSums(e1s1_2a[,2:4]); e1s1_2a <- e1s1_2a[,c("X","count")]
e1s1_2b1 <- read.delim(file = "enrich2_output/Rep1_Bin2_PCR2_Seq1.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s1_2b2 <- read.delim(file = "enrich2_output/Rep1_Bin2_PCR2_Seq2.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s1_2b3 <- read.delim(file = "enrich2_output/Rep1_Bin2_PCR2_Seq3.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s1_2b <- merge(e1s1_2b1, e1s1_2b2, by = "X", all = T); e1s1_2b <- merge(e1s1_2b, e1s1_2b3, by = "X", all = T); e1s1_2b[is.na(e1s1_2b)] <- 0
e1s1_2b$count <- rowSums(e1s1_2b[,2:4]); e1s1_2b <- e1s1_2b[,c("X","count")]
e1s1_3a1 <- read.delim(file = "enrich2_output/Rep1_Bin3_PCR1_Seq1.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s1_3a2 <- read.delim(file = "enrich2_output/Rep1_Bin3_PCR1_Seq2.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s1_3a3 <- read.delim(file = "enrich2_output/Rep1_Bin3_PCR1_Seq3.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s1_3a <- merge(e1s1_3a1, e1s1_3a2, by = "X", all = T); e1s1_3a <- merge(e1s1_3a, e1s1_3a3, by = "X", all = T); e1s1_3a[is.na(e1s1_3a)] <- 0
e1s1_3a$count <- rowSums(e1s1_3a[,2:4]); e1s1_3a <- e1s1_3a[,c("X","count")]
e1s1_3b1 <- read.delim(file = "enrich2_output/Rep1_Bin3_PCR2_Seq1.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s1_3b2 <- read.delim(file = "enrich2_output/Rep1_Bin3_PCR2_Seq2.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s1_3b3 <- read.delim(file = "enrich2_output/Rep1_Bin3_PCR2_Seq3.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s1_3b <- merge(e1s1_3b1, e1s1_3b2, by = "X", all = T); e1s1_3b <- merge(e1s1_3b, e1s1_3b3, by = "X", all = T); e1s1_3b[is.na(e1s1_3b)] <- 0
e1s1_3b$count <- rowSums(e1s1_3b[,2:4]); e1s1_3b <- e1s1_3b[,c("X","count")]
e1s1_4a1 <- read.delim(file = "enrich2_output/Rep1_Bin4_PCR1_Seq1.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s1_4a2 <- read.delim(file = "enrich2_output/Rep1_Bin4_PCR1_Seq2.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s1_4a3 <- read.delim(file = "enrich2_output/Rep1_Bin4_PCR1_Seq3.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s1_4a <- merge(e1s1_4a1, e1s1_4a2, by = "X", all = T); e1s1_4a <- merge(e1s1_4a, e1s1_4a3, by = "X", all = T); e1s1_4a[is.na(e1s1_4a)] <- 0
e1s1_4a$count <- rowSums(e1s1_4a[,2:4]); e1s1_4a <- e1s1_4a[,c("X","count")]
e1s1_4b1 <- read.delim(file = "enrich2_output/Rep1_Bin4_PCR2_Seq1.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s1_4b2 <- read.delim(file = "enrich2_output/Rep1_Bin4_PCR2_Seq2.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s1_4b3 <- read.delim(file = "enrich2_output/Rep1_Bin4_PCR2_Seq3.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s1_4b <- merge(e1s1_4b1, e1s1_4b2, by = "X", all = T); e1s1_4b <- merge(e1s1_4b, e1s1_4b3, by = "X", all = T); e1s1_4b[is.na(e1s1_4b)] <- 0
e1s1_4b$count <- rowSums(e1s1_4b[,2:4]); e1s1_4b <- e1s1_4b[,c("X","count")]
e1s1 <- merge(e1s1_1a, e1s1_1b, by = "X", all = TRUE)
e1s1 <- merge(e1s1, e1s1_2a, by = "X", all = TRUE)
e1s1 <- merge(e1s1, e1s1_2b, by = "X", all = TRUE)
## Warning in merge.data.frame(e1s1, e1s1_2b, by = "X", all = TRUE): column names
## 'count.x', 'count.y' are duplicated in the result
e1s1 <- merge(e1s1, e1s1_3a, by = "X", all = TRUE)
## Warning in merge.data.frame(e1s1, e1s1_3a, by = "X", all = TRUE): column names
## 'count.x', 'count.y' are duplicated in the result
e1s1 <- merge(e1s1, e1s1_3b, by = "X", all = TRUE)
## Warning in merge.data.frame(e1s1, e1s1_3b, by = "X", all = TRUE): column names
## 'count.x', 'count.y', 'count.x', 'count.y' are duplicated in the result
e1s1 <- merge(e1s1, e1s1_4a, by = "X", all = TRUE)
## Warning in merge.data.frame(e1s1, e1s1_4a, by = "X", all = TRUE): column names
## 'count.x', 'count.y', 'count.x', 'count.y' are duplicated in the result
e1s1 <- merge(e1s1, e1s1_4b, by = "X", all = TRUE)
## Warning in merge.data.frame(e1s1, e1s1_4b, by = "X", all = TRUE): column names
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y' are duplicated
## in the result
colnames(e1s1) <- c("barcode","b1a","b1b","b2a","b2b","b3a","b3b","b4a","b4b")
e1s1 <- merge(e1s1, PTEN_library_variant_map, by = "barcode", all = FALSE)
e1s1[is.na(e1s1)] <- 0
e1s1 <- e1s1 %>% group_by(variant) %>% summarize(b1a = sum(b1a), b1b = sum(b1b), b2a = sum(b2a), b2b = sum(b2b), b3a = sum(b3a), b3b = sum(b3b), b4a = sum(b4a), b4b = sum(b4b), class = unique(class))
e1s1$b1 <- e1s1$b1a + e1s1$b1b; e1s1$b2 <- e1s1$b2a + e1s1$b2b;e1s1$b3 <- e1s1$b3a + e1s1$b3b; e1s1$b4 <- e1s1$b4a + e1s1$b4b
ggplot() + geom_point(data = e1s1, aes(x = b1a, y = b1b)) + scale_x_log10() + scale_y_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
ggplot() + geom_point(data = e1s1, aes(x = b2a, y = b2b)) + scale_x_log10() + scale_y_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
ggplot() + geom_point(data = e1s1, aes(x = b3a, y = b3b)) + scale_x_log10() + scale_y_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
ggplot() + geom_point(data = e1s1, aes(x = b4a, y = b4b)) + scale_x_log10() + scale_y_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
e1s1$b1 <- e1s1$b1 / sum(e1s1$b1, na.rm = TRUE)
e1s1$b2 <- e1s1$b2 / sum(e1s1$b2, na.rm = TRUE)
e1s1$b3 <- e1s1$b3 / sum(e1s1$b3, na.rm = TRUE)
e1s1$b4 <- e1s1$b4 / sum(e1s1$b4, na.rm = TRUE)
e1s1$mean_freq <- (e1s1$b1 + e1s1$b2 + e1s1$b3 + e1s1$b4) / 4
frequency_filter <- 1e-5 * 10^(1/4)
e1s1 <- subset(e1s1, mean_freq > 1e-5 * 10^(1/4))
e1s1 <- subset(e1s1, mean_freq != 0)
e1s1$weighted_ave <- (e1s1$b1 * 0 + e1s1$b2 * (1/3) + e1s1$b3 * (2/3) + e1s1$b4) /
rowSums(e1s1[,c("b1","b2","b3","b4")])
write.csv(file = "output_datatables/e1s1_weighted_ave.csv", e1s1, row.names = FALSE, quote = FALSE)
e1s2_1a1 <- read.delim(file = "enrich2_output/Rep2_Bin1_PCR1_Seq1.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s2_1a2 <- read.delim(file = "enrich2_output/Rep2_Bin1_PCR1_Seq2.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s2_1a3 <- read.delim(file = "enrich2_output/Rep2_Bin1_PCR1_Seq3.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s2_1a <- merge(e1s2_1a1, e1s2_1a2, by = "X", all = T); e1s2_1a <- merge(e1s2_1a, e1s2_1a3, by = "X", all = T); e1s2_1a[is.na(e1s2_1a)] <- 0
e1s2_1a$count <- rowSums(e1s2_1a[,2:4]); e1s2_1a <- e1s2_1a[,c("X","count")]
e1s2_1b1 <- read.delim(file = "enrich2_output/Rep2_Bin1_PCR2_Seq1.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s2_1b2 <- read.delim(file = "enrich2_output/Rep2_Bin1_PCR2_Seq1.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s2_1b3 <- read.delim(file = "enrich2_output/Rep2_Bin1_PCR2_Seq1.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s2_1b <- merge(e1s2_1b1, e1s2_1b2, by = "X", all = T); e1s2_1b <- merge(e1s2_1b, e1s2_1b3, by = "X", all = T); e1s2_1b[is.na(e1s2_1b)] <- 0
e1s2_1b$count <- rowSums(e1s2_1b[,2:4]); e1s2_1b <- e1s2_1b[,c("X","count")]
e1s2_2a1 <- read.delim(file = "enrich2_output/Rep2_Bin2_PCR1_Seq1.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s2_2a2 <- read.delim(file = "enrich2_output/Rep2_Bin2_PCR1_Seq2.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s2_2a3 <- read.delim(file = "enrich2_output/Rep2_Bin2_PCR1_Seq3.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s2_2a <- merge(e1s2_2a1, e1s2_2a2, by = "X", all = T); e1s2_2a <- merge(e1s2_2a, e1s2_2a3, by = "X", all = T); e1s2_2a[is.na(e1s2_2a)] <- 0
e1s2_2a$count <- rowSums(e1s2_2a[,2:4]); e1s2_2a <- e1s2_2a[,c("X","count")]
e1s2_2b1 <- read.delim(file = "enrich2_output/Rep2_Bin2_PCR2_Seq1.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s2_2b2 <- read.delim(file = "enrich2_output/Rep2_Bin2_PCR2_Seq2.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s2_2b3 <- read.delim(file = "enrich2_output/Rep2_Bin2_PCR2_Seq3.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s2_2b <- merge(e1s2_2b1, e1s2_2b2, by = "X", all = T); e1s2_2b <- merge(e1s2_2b, e1s2_2b3, by = "X", all = T); e1s2_2b[is.na(e1s2_2b)] <- 0
e1s2_2b$count <- rowSums(e1s2_2b[,2:4]); e1s2_2b <- e1s2_2b[,c("X","count")]
e1s2_3a1 <- read.delim(file = "enrich2_output/Rep2_Bin3_PCR1_Seq1.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s2_3a2 <- read.delim(file = "enrich2_output/Rep2_Bin3_PCR1_Seq2.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s2_3a3 <- read.delim(file = "enrich2_output/Rep2_Bin3_PCR1_Seq3.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s2_3a <- merge(e1s2_3a1, e1s2_3a2, by = "X", all = T); e1s2_3a <- merge(e1s2_3a, e1s2_3a3, by = "X", all = T); e1s2_3a[is.na(e1s2_3a)] <- 0
e1s2_3a$count <- rowSums(e1s2_3a[,2:4]); e1s2_3a <- e1s2_3a[,c("X","count")]
e1s2_3b1 <- read.delim(file = "enrich2_output/Rep2_Bin3_PCR2_Seq1.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s2_3b2 <- read.delim(file = "enrich2_output/Rep2_Bin3_PCR2_Seq2.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s2_3b3 <- read.delim(file = "enrich2_output/Rep2_Bin3_PCR2_Seq3.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s2_3b <- merge(e1s2_3b1, e1s2_3b2, by = "X", all = T); e1s2_3b <- merge(e1s2_3b, e1s2_3b3, by = "X", all = T); e1s2_3b[is.na(e1s2_3b)] <- 0
e1s2_3b$count <- rowSums(e1s2_3b[,2:4]); e1s2_3b <- e1s2_3b[,c("X","count")]
e1s2_4a1 <- read.delim(file = "enrich2_output/Rep2_Bin4_PCR1_Seq1.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s2_4a2 <- read.delim(file = "enrich2_output/Rep2_Bin4_PCR1_Seq2.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s2_4a3 <- read.delim(file = "enrich2_output/Rep2_Bin4_PCR1_Seq3.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s2_4a <- merge(e1s2_4a1, e1s2_4a2, by = "X", all = T); e1s2_4a <- merge(e1s2_4a, e1s2_4a3, by = "X", all = T); e1s2_4a[is.na(e1s2_4a)] <- 0
e1s2_4a$count <- rowSums(e1s2_4a[,2:4]); e1s2_4a <- e1s2_4a[,c("X","count")]
e1s2_4b1 <- read.delim(file = "enrich2_output/Rep2_Bin4_PCR2_Seq1.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s2_4b2 <- read.delim(file = "enrich2_output/Rep2_Bin4_PCR2_Seq2.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s2_4b3 <- read.delim(file = "enrich2_output/Rep2_Bin4_PCR2_Seq3.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s2_4b <- merge(e1s2_4b1, e1s2_4b2, by = "X", all = T); e1s2_4b <- merge(e1s2_4b, e1s2_4b3, by = "X", all = T); e1s2_4b[is.na(e1s2_4b)] <- 0
e1s2_4b$count <- rowSums(e1s2_4b[,2:4]); e1s2_4b <- e1s2_4b[,c("X","count")]
e1s2 <- merge(e1s2_1a, e1s2_1b, by = "X", all = TRUE)
e1s2 <- merge(e1s2, e1s2_2a, by = "X", all = TRUE)
e1s2 <- merge(e1s2, e1s2_2b, by = "X", all = TRUE)
## Warning in merge.data.frame(e1s2, e1s2_2b, by = "X", all = TRUE): column names
## 'count.x', 'count.y' are duplicated in the result
e1s2 <- merge(e1s2, e1s2_3a, by = "X", all = TRUE)
## Warning in merge.data.frame(e1s2, e1s2_3a, by = "X", all = TRUE): column names
## 'count.x', 'count.y' are duplicated in the result
e1s2 <- merge(e1s2, e1s2_3b, by = "X", all = TRUE)
## Warning in merge.data.frame(e1s2, e1s2_3b, by = "X", all = TRUE): column names
## 'count.x', 'count.y', 'count.x', 'count.y' are duplicated in the result
e1s2 <- merge(e1s2, e1s2_4a, by = "X", all = TRUE)
## Warning in merge.data.frame(e1s2, e1s2_4a, by = "X", all = TRUE): column names
## 'count.x', 'count.y', 'count.x', 'count.y' are duplicated in the result
e1s2 <- merge(e1s2, e1s2_4b, by = "X", all = TRUE)
## Warning in merge.data.frame(e1s2, e1s2_4b, by = "X", all = TRUE): column names
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y' are duplicated
## in the result
colnames(e1s2) <- c("barcode","b1a","b1b","b2a","b2b","b3a","b3b","b4a","b4b")
e1s2 <- merge(e1s2, PTEN_library_variant_map, by = "barcode", all = FALSE)
e1s2[is.na(e1s2)] <- 0
e1s2 <- e1s2 %>% group_by(variant) %>% summarize(b1a = sum(b1a), b1b = sum(b1b), b2a = sum(b2a), b2b = sum(b2b), b3a = sum(b3a), b3b = sum(b3b), b4a = sum(b4a), b4b = sum(b4b), class = unique(class))
e1s2$b1 <- e1s2$b1a + e1s2$b1b; e1s2$b2 <- e1s2$b2a + e1s2$b2b;e1s2$b3 <- e1s2$b3a + e1s2$b3b; e1s2$b4 <- e1s2$b4a + e1s2$b4b
ggplot() + geom_point(data = e1s2, aes(x = b1a, y = b1b)) + scale_x_log10() + scale_y_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
ggplot() + geom_point(data = e1s2, aes(x = b2a, y = b2b)) + scale_x_log10() + scale_y_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
ggplot() + geom_point(data = e1s2, aes(x = b3a, y = b3b)) + scale_x_log10() + scale_y_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
ggplot() + geom_point(data = e1s2, aes(x = b4a, y = b4b)) + scale_x_log10() + scale_y_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
e1s2$b1 <- e1s2$b1 / sum(e1s2$b1, na.rm = TRUE)
e1s2$b2 <- e1s2$b2 / sum(e1s2$b2, na.rm = TRUE)
e1s2$b3 <- e1s2$b3 / sum(e1s2$b3, na.rm = TRUE)
e1s2$b4 <- e1s2$b4 / sum(e1s2$b4, na.rm = TRUE)
e1s2$mean_freq <- (e1s2$b1 + e1s2$b2 + e1s2$b3 + e1s2$b4) / 4
e1s2 <- subset(e1s2, mean_freq > 1e-5 * 10^(1/4))
e1s2 <- subset(e1s2, mean_freq != 0)
e1s2$weighted_ave <- (e1s2$b1 * 0 + e1s2$b2 * (1/3) + e1s2$b3 * (2/3) + e1s2$b4) /
rowSums(e1s2[,c("b1","b2","b3","b4")])
write.csv(file = "Output_datatables/e1s2_weighted_ave.csv", e1s2, row.names = FALSE, quote = FALSE)
e1s3_1a1 <- read.delim(file = "enrich2_output/Rep3_Bin1_PCR1_Seq1.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s3_1a2 <- read.delim(file = "enrich2_output/Rep3_Bin1_PCR1_Seq2.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s3_1a3 <- read.delim(file = "enrich2_output/Rep3_Bin1_PCR1_Seq3.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s3_1a <- merge(e1s3_1a1, e1s3_1a2, by = "X", all = T); e1s3_1a <- merge(e1s3_1a, e1s3_1a3, by = "X", all = T); e1s3_1a[is.na(e1s3_1a)] <- 0
e1s3_1a$count <- rowSums(e1s3_1a[,2:4]); e1s3_1a <- e1s3_1a[,c("X","count")]
e1s3_1b1 <- read.delim(file = "enrich2_output/Rep3_Bin1_PCR2_Seq1.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s3_1b2 <- read.delim(file = "enrich2_output/Rep3_Bin1_PCR2_Seq1.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s3_1b3 <- read.delim(file = "enrich2_output/Rep3_Bin1_PCR2_Seq1.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s3_1b <- merge(e1s3_1b1, e1s3_1b2, by = "X", all = T); e1s3_1b <- merge(e1s3_1b, e1s3_1b3, by = "X", all = T); e1s3_1b[is.na(e1s3_1b)] <- 0
e1s3_1b$count <- rowSums(e1s3_1b[,2:4]); e1s3_1b <- e1s3_1b[,c("X","count")]
e1s3_2a1 <- read.delim(file = "enrich2_output/Rep3_Bin2_PCR1_Seq1.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s3_2a2 <- read.delim(file = "enrich2_output/Rep3_Bin2_PCR1_Seq2.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s3_2a3 <- read.delim(file = "enrich2_output/Rep3_Bin2_PCR1_Seq3.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s3_2a <- merge(e1s3_2a1, e1s3_2a2, by = "X", all = T); e1s3_2a <- merge(e1s3_2a, e1s3_2a3, by = "X", all = T); e1s3_2a[is.na(e1s3_2a)] <- 0
e1s3_2a$count <- rowSums(e1s3_2a[,2:4]); e1s3_2a <- e1s3_2a[,c("X","count")]
e1s3_2b1 <- read.delim(file = "enrich2_output/Rep3_Bin2_PCR2_Seq1.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s3_2b2 <- read.delim(file = "enrich2_output/Rep3_Bin2_PCR2_Seq2.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s3_2b3 <- read.delim(file = "enrich2_output/Rep3_Bin2_PCR2_Seq3.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s3_2b <- merge(e1s3_2b1, e1s3_2b2, by = "X", all = T); e1s3_2b <- merge(e1s3_2b, e1s3_2b3, by = "X", all = T); e1s3_2b[is.na(e1s3_2b)] <- 0
e1s3_2b$count <- rowSums(e1s3_2b[,2:4]); e1s3_2b <- e1s3_2b[,c("X","count")]
e1s3_3a1 <- read.delim(file = "enrich2_output/Rep3_Bin3_PCR1_Seq1.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s3_3a2 <- read.delim(file = "enrich2_output/Rep3_Bin3_PCR1_Seq2.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s3_3a3 <- read.delim(file = "enrich2_output/Rep3_Bin3_PCR1_Seq3.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s3_3a <- merge(e1s3_3a1, e1s3_3a2, by = "X", all = T); e1s3_3a <- merge(e1s3_3a, e1s3_3a3, by = "X", all = T); e1s3_3a[is.na(e1s3_3a)] <- 0
e1s3_3a$count <- rowSums(e1s3_3a[,2:4]); e1s3_3a <- e1s3_3a[,c("X","count")]
e1s3_3b1 <- read.delim(file = "enrich2_output/Rep3_Bin3_PCR2_Seq1.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s3_3b2 <- read.delim(file = "enrich2_output/Rep3_Bin3_PCR2_Seq2.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s3_3b3 <- read.delim(file = "enrich2_output/Rep3_Bin3_PCR2_Seq3.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s3_3b <- merge(e1s3_3b1, e1s3_3b2, by = "X", all = T); e1s3_3b <- merge(e1s3_3b, e1s3_3b3, by = "X", all = T); e1s3_3b[is.na(e1s3_3b)] <- 0
e1s3_3b$count <- rowSums(e1s3_3b[,2:4]); e1s3_3b <- e1s3_3b[,c("X","count")]
e1s3_4a1 <- read.delim(file = "enrich2_output/Rep3_Bin4_PCR1_Seq1.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s3_4a2 <- read.delim(file = "enrich2_output/Rep3_Bin4_PCR1_Seq2.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s3_4a3 <- read.delim(file = "enrich2_output/Rep3_Bin4_PCR1_Seq3.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s3_4a <- merge(e1s3_4a1, e1s3_4a2, by = "X", all = T); e1s3_4a <- merge(e1s3_4a, e1s3_4a3, by = "X", all = T); e1s3_4a[is.na(e1s3_4a)] <- 0
e1s3_4a$count <- rowSums(e1s3_4a[,2:4]); e1s3_4a <- e1s3_4a[,c("X","count")]
e1s3_4b1 <- read.delim(file = "enrich2_output/Rep3_Bin4_PCR2_Seq1.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s3_4b2 <- read.delim(file = "enrich2_output/Rep3_Bin4_PCR2_Seq2.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s3_4b3 <- read.delim(file = "enrich2_output/Rep3_Bin4_PCR2_Seq3.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s3_4b <- merge(e1s3_4b1, e1s3_4b2, by = "X", all = T); e1s3_4b <- merge(e1s3_4b, e1s3_4b3, by = "X", all = T); e1s3_4b[is.na(e1s3_4b)] <- 0
e1s3_4b$count <- rowSums(e1s3_4b[,2:4]); e1s3_4b <- e1s3_4b[,c("X","count")]
e1s3 <- merge(e1s3_1a, e1s3_1b, by = "X", all = TRUE)
e1s3 <- merge(e1s3, e1s3_2a, by = "X", all = TRUE)
e1s3 <- merge(e1s3, e1s3_2b, by = "X", all = TRUE)
## Warning in merge.data.frame(e1s3, e1s3_2b, by = "X", all = TRUE): column names
## 'count.x', 'count.y' are duplicated in the result
e1s3 <- merge(e1s3, e1s3_3a, by = "X", all = TRUE)
## Warning in merge.data.frame(e1s3, e1s3_3a, by = "X", all = TRUE): column names
## 'count.x', 'count.y' are duplicated in the result
e1s3 <- merge(e1s3, e1s3_3b, by = "X", all = TRUE)
## Warning in merge.data.frame(e1s3, e1s3_3b, by = "X", all = TRUE): column names
## 'count.x', 'count.y', 'count.x', 'count.y' are duplicated in the result
e1s3 <- merge(e1s3, e1s3_4a, by = "X", all = TRUE)
## Warning in merge.data.frame(e1s3, e1s3_4a, by = "X", all = TRUE): column names
## 'count.x', 'count.y', 'count.x', 'count.y' are duplicated in the result
e1s3 <- merge(e1s3, e1s3_4b, by = "X", all = TRUE)
## Warning in merge.data.frame(e1s3, e1s3_4b, by = "X", all = TRUE): column names
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y' are duplicated
## in the result
colnames(e1s3) <- c("barcode","b1a","b1b","b2a","b2b","b3a","b3b","b4a","b4b")
e1s3 <- merge(e1s3, PTEN_library_variant_map, by = "barcode", all = FALSE)
e1s3[is.na(e1s3)] <- 0
e1s3 <- e1s3 %>% group_by(variant) %>% summarize(b1a = sum(b1a), b1b = sum(b1b), b2a = sum(b2a), b2b = sum(b2b), b3a = sum(b3a), b3b = sum(b3b), b4a = sum(b4a), b4b = sum(b4b), class = unique(class))
e1s3$b1 <- e1s3$b1a + e1s3$b1b; e1s3$b2 <- e1s3$b2a + e1s3$b2b;e1s3$b3 <- e1s3$b3a + e1s3$b3b; e1s3$b4 <- e1s3$b4a + e1s3$b4b
ggplot() + geom_point(data = e1s3, aes(x = b1a, y = b1b)) + scale_x_log10() + scale_y_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
ggplot() + geom_point(data = e1s3, aes(x = b2a, y = b2b)) + scale_x_log10() + scale_y_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
ggplot() + geom_point(data = e1s3, aes(x = b3a, y = b3b)) + scale_x_log10() + scale_y_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
ggplot() + geom_point(data = e1s3, aes(x = b4a, y = b4b)) + scale_x_log10() + scale_y_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
e1s3$b1 <- e1s3$b1 / sum(e1s3$b1, na.rm = TRUE)
e1s3$b2 <- e1s3$b2 / sum(e1s3$b2, na.rm = TRUE)
e1s3$b3 <- e1s3$b3 / sum(e1s3$b3, na.rm = TRUE)
e1s3$b4 <- e1s3$b4 / sum(e1s3$b4, na.rm = TRUE)
e1s3$mean_freq <- (e1s3$b1 + e1s3$b2 + e1s3$b3 + e1s3$b4) / 4
e1s3 <- subset(e1s3, mean_freq > 1e-5 * 10^(1/4))
e1s3 <- subset(e1s3, mean_freq != 0)
e1s3$weighted_ave <- (e1s3$b1 * 0 + e1s3$b2 * (1/3) + e1s3$b3 * (2/3) + e1s3$b4) /
rowSums(e1s3[,c("b1","b2","b3","b4")])
write.csv(file = "output_datatables/e1s3_weighted_ave.csv", e1s3, row.names = FALSE, quote = FALSE)
e1s4_1a1 <- read.delim(file = "enrich2_output/Rep4_Bin1_PCR1_Seq1.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s4_1a2 <- read.delim(file = "enrich2_output/Rep4_Bin1_PCR1_Seq2.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s4_1a <- merge(e1s4_1a1, e1s4_1a2, by = "X", all = T); e1s4_1a[is.na(e1s4_1a)] <- 0; e1s4_1a$count <- e1s4_1a$count.x + e1s4_1a$count.y; e1s4_1a <- e1s4_1a[,c("X","count")]
e1s4_1b1 <- read.delim(file = "enrich2_output/Rep4_Bin1_PCR2_Seq1.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s4_1b2 <- read.delim(file = "enrich2_output/Rep4_Bin1_PCR2_Seq2.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s4_1b <- merge(e1s4_1b1, e1s4_1b2, by = "X", all = T); e1s4_1b[is.na(e1s4_1b)] <- 0; e1s4_1b$count <- e1s4_1b$count.x + e1s4_1b$count.y; e1s4_1b <- e1s4_1b[,c("X","count")]
e1s4_2a1 <- read.delim(file = "enrich2_output/Rep4_Bin2_PCR1_Seq1.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s4_2a2 <- read.delim(file = "enrich2_output/Rep4_Bin2_PCR1_Seq2.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s4_2a <- merge(e1s4_2a1, e1s4_2a2, by = "X", all = T); e1s4_2a[is.na(e1s4_2a)] <- 0; e1s4_2a$count <- e1s4_2a$count.x + e1s4_2a$count.y; e1s4_2a <- e1s4_2a[,c("X","count")]
e1s4_2b1 <- read.delim(file = "enrich2_output/Rep4_Bin2_PCR2_Seq1.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s4_2b2 <- read.delim(file = "enrich2_output/Rep4_Bin2_PCR2_Seq2.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s4_2b <- merge(e1s4_2b1, e1s4_2b2, by = "X", all = T); e1s4_2b[is.na(e1s4_2b)] <- 0; e1s4_2b$count <- e1s4_2b$count.x + e1s4_2b$count.y; e1s4_2b <- e1s4_2b[,c("X","count")]
e1s4_3a1 <- read.delim(file = "enrich2_output/Rep4_Bin3_PCR1_Seq1.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s4_3a2 <- read.delim(file = "enrich2_output/Rep4_Bin3_PCR1_Seq2.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s4_3a <- merge(e1s4_3a1, e1s4_3a2, by = "X", all = T); e1s4_3a[is.na(e1s4_3a)] <- 0; e1s4_3a$count <- e1s4_3a$count.x + e1s4_3a$count.y; e1s4_3a <- e1s4_3a[,c("X","count")]
e1s4_3b1 <- read.delim(file = "enrich2_output/Rep4_Bin3_PCR2_Seq1.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s4_3b2 <- read.delim(file = "enrich2_output/Rep4_Bin3_PCR2_Seq2.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s4_3b <- merge(e1s4_3b1, e1s4_3b2, by = "X", all = T); e1s4_3b[is.na(e1s4_3b)] <- 0; e1s4_3b$count <- e1s4_3b$count.x + e1s4_3b$count.y; e1s4_3b <- e1s4_3b[,c("X","count")]
e1s4_4a1 <- read.delim(file = "enrich2_output/Rep4_Bin4_PCR1_Seq1.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s4_4a2 <- read.delim(file = "enrich2_output/Rep4_Bin4_PCR1_Seq2.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s4_4a <- merge(e1s4_4a1, e1s4_4a2, by = "X", all = T); e1s4_4a[is.na(e1s4_4a)] <- 0; e1s4_4a$count <- e1s4_4a$count.x + e1s4_4a$count.y; e1s4_4a <- e1s4_4a[,c("X","count")]
e1s4_4b1 <- read.delim(file = "enrich2_output/Rep4_Bin4_PCR2_Seq1.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s4_4b2 <- read.delim(file = "enrich2_output/Rep4_Bin4_PCR2_Seq2.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e1s4_4b <- merge(e1s4_4b1, e1s4_4b2, by = "X", all = T); e1s4_4b[is.na(e1s4_4b)] <- 0; e1s4_4b$count <- e1s4_4b$count.x + e1s4_4b$count.y; e1s4_4b <- e1s4_4b[,c("X","count")]
e1s4 <- merge(e1s4_1a, e1s4_1b, by = "X", all = TRUE)
e1s4 <- merge(e1s4, e1s4_2a, by = "X", all = TRUE)
e1s4 <- merge(e1s4, e1s4_2b, by = "X", all = TRUE)
## Warning in merge.data.frame(e1s4, e1s4_2b, by = "X", all = TRUE): column names
## 'count.x', 'count.y' are duplicated in the result
e1s4 <- merge(e1s4, e1s4_3a, by = "X", all = TRUE)
## Warning in merge.data.frame(e1s4, e1s4_3a, by = "X", all = TRUE): column names
## 'count.x', 'count.y' are duplicated in the result
e1s4 <- merge(e1s4, e1s4_3b, by = "X", all = TRUE)
## Warning in merge.data.frame(e1s4, e1s4_3b, by = "X", all = TRUE): column names
## 'count.x', 'count.y', 'count.x', 'count.y' are duplicated in the result
e1s4 <- merge(e1s4, e1s4_4a, by = "X", all = TRUE)
## Warning in merge.data.frame(e1s4, e1s4_4a, by = "X", all = TRUE): column names
## 'count.x', 'count.y', 'count.x', 'count.y' are duplicated in the result
e1s4 <- merge(e1s4, e1s4_4b, by = "X", all = TRUE)
## Warning in merge.data.frame(e1s4, e1s4_4b, by = "X", all = TRUE): column names
## 'count.x', 'count.y', 'count.x', 'count.y', 'count.x', 'count.y' are duplicated
## in the result
colnames(e1s4) <- c("barcode","b1a","b1b","b2a","b2b","b3a","b3b","b4a","b4b")
e1s4 <- merge(e1s4, PTEN_library_variant_map, by = "barcode", all = FALSE)
e1s4[is.na(e1s4)] <- 0
e1s4 <- e1s4 %>% group_by(variant) %>% summarize(b1a = sum(b1a), b1b = sum(b1b), b2a = sum(b2a), b2b = sum(b2b), b3a = sum(b3a), b3b = sum(b3b), b4a = sum(b4a), b4b = sum(b4b), class = unique(class))
e1s4$b1 <- e1s4$b1a + e1s4$b1b; e1s4$b2 <- e1s4$b2a + e1s4$b2b;e1s4$b3 <- e1s4$b3a + e1s4$b3b; e1s4$b4 <- e1s4$b4a + e1s4$b4b
ggplot() + geom_point(data = e1s4, aes(x = b1a, y = b1b)) + scale_x_log10() + scale_y_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
ggplot() + geom_point(data = e1s4, aes(x = b2a, y = b2b)) + scale_x_log10() + scale_y_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
ggplot() + geom_point(data = e1s4, aes(x = b3a, y = b3b)) + scale_x_log10() + scale_y_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
ggplot() + geom_point(data = e1s4, aes(x = b4a, y = b4b)) + scale_x_log10() + scale_y_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
e1s4$b1 <- e1s4$b1 / sum(e1s4$b1, na.rm = TRUE)
e1s4$b2 <- e1s4$b2 / sum(e1s4$b2, na.rm = TRUE)
e1s4$b3 <- e1s4$b3 / sum(e1s4$b3, na.rm = TRUE)
e1s4$b4 <- e1s4$b4 / sum(e1s4$b4, na.rm = TRUE)
e1s4$mean_freq <- (e1s4$b1 + e1s4$b2 + e1s4$b3 + e1s4$b4) / 4
e1s4 <- subset(e1s4, mean_freq > 1e-5 * 10^(1/4))
e1s4 <- subset(e1s4, mean_freq != 0)
e1s4$weighted_ave <- (e1s4$b1 * 0 + e1s4$b2 * (1/3) + e1s4$b3 * (2/3) + e1s4$b4) /
rowSums(e1s4[,c("b1","b2","b3","b4")])
write.csv(file = "output_datatables/e1s4_weighted_ave.csv", e1s4, row.names = FALSE, quote = FALSE)
e3s1_1a <- read.delim(file = "enrich2_output/Rep5_Bin1.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e3s1_2a <- read.delim(file = "enrich2_output/Rep5_Bin2.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e3s1_3a <- read.delim(file = "enrich2_output/Rep5_Bin3.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e3s1_4a <- read.delim(file = "enrich2_output/Rep5_Bin4.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e3s1 <- merge(e3s1_1a, e3s1_2a, by = "X", all = TRUE)
e3s1 <- merge(e3s1, e3s1_3a, by = "X", all = TRUE)
e3s1 <- merge(e3s1, e3s1_4a, by = "X", all = TRUE)
## Warning in merge.data.frame(e3s1, e3s1_4a, by = "X", all = TRUE): column names
## 'count.x', 'count.y' are duplicated in the result
colnames(e3s1) <- c("barcode","b1","b2","b3","b4")
e3s1 <- merge(e3s1, PTEN_library_variant_map, by = "barcode", all = FALSE)
e3s1[is.na(e3s1)] <- 0
e3s1 <- e3s1 %>% group_by(variant) %>% summarize(b1 = sum(b1), b2 = sum(b2), b3 = sum(b3), b4 = sum(b4), class = unique(class))
e3s1$b1 <- e3s1$b1 / sum(e3s1$b1, na.rm = TRUE)
e3s1$b2 <- e3s1$b2 / sum(e3s1$b2, na.rm = TRUE)
e3s1$b3 <- e3s1$b3 / sum(e3s1$b3, na.rm = TRUE)
e3s1$b4 <- e3s1$b4 / sum(e3s1$b4, na.rm = TRUE)
e3s1$mean_freq <- (e3s1$b1 + e3s1$b2 + e3s1$b3 + e3s1$b4) / 4
e3s1 <- subset(e3s1, mean_freq > 1e-5 * 10^(1/4))
e3s1 <- subset(e3s1, mean_freq != 0)
e3s1$weighted_ave <- (e3s1$b1 * 0 + e3s1$b2 * (1/3) + e3s1$b3 * (2/3) + e3s1$b4) /
rowSums(e3s1[,c("b1","b2","b3","b4")])
write.csv(file = "output_datatables/e3s1_weighted_ave.csv", e3s1, row.names = FALSE, quote = FALSE)
e3s2_1a <- read.delim(file = "enrich2_output/Rep6_Bin1.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e3s2_2a <- read.delim(file = "enrich2_output/Rep6_Bin2.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e3s2_3a <- read.delim(file = "enrich2_output/Rep6_Bin3.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e3s2_4a <- read.delim(file = "enrich2_output/Rep6_Bin4.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e3s2 <- merge(e3s2_1a, e3s2_2a, by = "X", all = TRUE)
e3s2 <- merge(e3s2, e3s2_3a, by = "X", all = TRUE)
e3s2 <- merge(e3s2, e3s2_4a, by = "X", all = TRUE)
## Warning in merge.data.frame(e3s2, e3s2_4a, by = "X", all = TRUE): column names
## 'count.x', 'count.y' are duplicated in the result
colnames(e3s2) <- c("barcode","b1","b2","b3","b4")
e3s2 <- merge(e3s2, PTEN_library_variant_map, by = "barcode", all = FALSE)
e3s2[is.na(e3s2)] <- 0
e3s2 <- e3s2 %>% group_by(variant) %>% summarize(b1 = sum(b1), b2 = sum(b2), b3 = sum(b3), b4 = sum(b4), class = unique(class))
e3s2$b1 <- e3s2$b1 / sum(e3s2$b1, na.rm = TRUE)
e3s2$b2 <- e3s2$b2 / sum(e3s2$b2, na.rm = TRUE)
e3s2$b3 <- e3s2$b3 / sum(e3s2$b3, na.rm = TRUE)
e3s2$b4 <- e3s2$b4 / sum(e3s2$b4, na.rm = TRUE)
e3s2$mean_freq <- (e3s2$b1 + e3s2$b2 + e3s2$b3 + e3s2$b4) / 4
e3s2 <- subset(e3s2, mean_freq > 1e-5 * 10^(1/4))
e3s2 <- subset(e3s2, mean_freq != 0)
e3s2$weighted_ave <- (e3s2$b1 * 0 + e3s2$b2 * (1/3) + e3s2$b3 * (2/3) + e3s2$b4) /
rowSums(e3s2[,c("b1","b2","b3","b4")])
write.csv(file = "output_datatables/e3s2_weighted_ave.csv", e3s2, row.names = FALSE, quote = FALSE)
e3s3_1a <- read.delim(file = "enrich2_output/Rep7_Bin1.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e3s3_2a <- read.delim(file = "enrich2_output/Rep7_Bin2.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e3s3_3a <- read.delim(file = "enrich2_output/Rep7_Bin3.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e3s3_4a <- read.delim(file = "enrich2_output/Rep7_Bin4.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
e3s3 <- merge(e3s3_1a, e3s3_2a, by = "X", all = TRUE)
e3s3 <- merge(e3s3, e3s3_3a, by = "X", all = TRUE)
e3s3 <- merge(e3s3, e3s3_4a, by = "X", all = TRUE)
## Warning in merge.data.frame(e3s3, e3s3_4a, by = "X", all = TRUE): column names
## 'count.x', 'count.y' are duplicated in the result
colnames(e3s3) <- c("barcode","b1","b2","b3","b4")
e3s3 <- merge(e3s3, PTEN_library_variant_map, by = "barcode", all = FALSE)
e3s3[is.na(e3s3)] <- 0
e3s3 <- e3s3 %>% group_by(variant) %>% summarize(b1 = sum(b1), b2 = sum(b2), b3 = sum(b3), b4 = sum(b4), class = unique(class))
e3s3$b1 <- e3s3$b1 / sum(e3s3$b1, na.rm = TRUE)
e3s3$b2 <- e3s3$b2 / sum(e3s3$b2, na.rm = TRUE)
e3s3$b3 <- e3s3$b3 / sum(e3s3$b3, na.rm = TRUE)
e3s3$b4 <- e3s3$b4 / sum(e3s3$b4, na.rm = TRUE)
e3s3$mean_freq <- (e3s3$b1 + e3s3$b2 + e3s3$b3 + e3s3$b4) / 4
e3s3 <- subset(e3s3, mean_freq > 1e-5 * 10^(1/4))
e3s3 <- subset(e3s3, mean_freq != 0)
e3s3$weighted_ave <- (e3s3$b1 * 0 + e3s3$b2 * (1/3) + e3s3$b3 * (2/3) + e3s3$b4) /
rowSums(e3s3[,c("b1","b2","b3","b4")])
write.csv(file = "output_datatables/e3s3_weighted_ave.csv", e3s3, row.names = FALSE, quote = FALSE)
e1s1 <- read.csv(file = "output_datatables/e1s1_weighted_ave.csv", header = T, stringsAsFactors = F)
e1s2 <- read.csv(file = "output_datatables/e1s2_weighted_ave.csv", header = T, stringsAsFactors = F)
e1s3 <- read.csv(file = "output_datatables/e1s3_weighted_ave.csv", header = T, stringsAsFactors = F)
e1s4 <- read.csv(file = "output_datatables/e1s4_weighted_ave.csv", header = T, stringsAsFactors = F)
e3s1 <- read.csv(file = "output_datatables/e3s1_weighted_ave.csv", header = T, stringsAsFactors = F)
e3s2 <- read.csv(file = "output_datatables/e3s2_weighted_ave.csv", header = T, stringsAsFactors = F)
e3s3 <- read.csv(file = "output_datatables/e3s3_weighted_ave.csv", header = T, stringsAsFactors = F)
e1s1$position <- as.numeric(gsub("[A-Z]", "", e1s1$variant))
## Warning: NAs introduced by coercion
e1s1$score1 <- (e1s1$weighted_ave - median(subset(e1s1, position > 50 & position < 350 & class == "nonsense")$weighted_ave, na.rm = TRUE)) /
(median(subset(e1s1, class == "wt")$weighted_ave, na.rm = TRUE) - median(subset(e1s1, position > 50 & position < 350 & class == "nonsense")$weighted_ave, na.rm = TRUE))
e1s2$position <- as.numeric(gsub("[A-Z]", "", e1s2$variant))
## Warning: NAs introduced by coercion
e1s2$score2 <- (e1s2$weighted_ave - median(subset(e1s2, position > 50 & position < 350 & class == "nonsense")$weighted_ave, na.rm = TRUE)) /
(median(subset(e1s2, class == "wt")$weighted_ave, na.rm = TRUE) - median(subset(e1s2, position > 50 & position < 350 & class == "nonsense")$weighted_ave, na.rm = TRUE))
e1s3$position <- as.numeric(gsub("[A-Z]", "", e1s3$variant))
## Warning: NAs introduced by coercion
e1s3$score3 <- (e1s3$weighted_ave - median(subset(e1s3, position > 50 & position < 350 & class == "nonsense")$weighted_ave, na.rm = TRUE)) /
(median(subset(e1s3, class == "wt")$weighted_ave, na.rm = TRUE) - median(subset(e1s3, position > 50 & position < 350 & class == "nonsense")$weighted_ave, na.rm = TRUE))
e1s4$position <- as.numeric(gsub("[A-Z]", "", e1s4$variant))
## Warning: NAs introduced by coercion
e1s4$score4 <- (e1s4$weighted_ave - median(subset(e1s4, position > 50 & position < 350 & class == "nonsense")$weighted_ave, na.rm = TRUE)) /
(median(subset(e1s4, class == "wt")$weighted_ave, na.rm = TRUE) - median(subset(e1s4, position > 50 & position < 350 & class == "nonsense")$weighted_ave, na.rm = TRUE))
e3s1$position <- as.numeric(gsub("[A-Z]", "", e3s1$variant))
## Warning: NAs introduced by coercion
e3s1$score5 <- (e3s1$weighted_ave - median(subset(e3s1, position > 50 & position < 350 & class == "nonsense")$weighted_ave, na.rm = TRUE)) /
(median(subset(e3s1, class == "wt")$weighted_ave, na.rm = TRUE) - median(subset(e3s1, position > 50 & position < 350 & class == "nonsense")$weighted_ave, na.rm = TRUE))
e3s2$position <- as.numeric(gsub("[A-Z]", "", e3s2$variant))
## Warning: NAs introduced by coercion
e3s2$score6 <- (e3s2$weighted_ave - median(subset(e3s2, position > 50 & position < 350 & class == "nonsense")$weighted_ave, na.rm = TRUE)) /
(median(subset(e3s2, class == "wt")$weighted_ave, na.rm = TRUE) - median(subset(e3s2, position > 50 & position < 350 & class == "nonsense")$weighted_ave, na.rm = TRUE))
e3s3$position <- as.numeric(gsub("[A-Z]", "", e3s3$variant))
## Warning: NAs introduced by coercion
e3s3$score7 <- (e3s3$weighted_ave - median(subset(e3s3, position > 50 & position < 350 & class == "nonsense")$weighted_ave, na.rm = TRUE)) /
(median(subset(e3s3, class == "wt")$weighted_ave, na.rm = TRUE) - median(subset(e3s3, position > 50 & position < 350 & class == "nonsense")$weighted_ave, na.rm = TRUE))
replicates <- merge(e1s1[,c("variant","score1")], e1s2[,c("variant","score2")], by = "variant", all = TRUE)
replicates <- merge(replicates, e1s3[,c("variant","score3")], by = "variant", all = TRUE)
replicates <- merge(replicates, e1s4[,c("variant","score4")], by = "variant", all = TRUE)
replicates <- merge(replicates, e3s1[,c("variant","score5")], by = "variant", all = TRUE)
replicates <- merge(replicates, e3s2[,c("variant","score6")], by = "variant", all = TRUE)
replicates <- merge(replicates, e3s3[,c("variant","score7")], by = "variant", all = TRUE)
colnames(replicates) <- c("variant","e1s1","e1s2","e1s3","e1s4","e3s1","e3s2","e3s3")
replicates$average <- rowMeans(replicates[,c("e1s1","e1s2","e1s3","e1s4","e3s1","e3s2","e3s3")], na.rm = TRUE)
replicates$count <- rowSums(cbind(!is.na(replicates$e1s1),!is.na(replicates$e1s2),!is.na(replicates$e1s3),!is.na(replicates$e1s4),!is.na(replicates$e3s1),!is.na(replicates$e3s2),!is.na(replicates$e3s3)))
replicates$sd <- rowMeans(cbind(abs(replicates$average-replicates$e1s1), abs(replicates$average-replicates$e1s2)
, abs(replicates$average-replicates$e1s3), abs(replicates$average-replicates$e1s4),
abs(replicates$average-replicates$e3s1),abs(replicates$average-replicates$e3s2),abs(replicates$average-replicates$e3s3)), na.rm = T)
replicates$class <- "single"
replicates$start <- substr(gsub("[0-9]","",replicates$variant), 1,1)
replicates$end <- substr(gsub("[0-9]","",replicates$variant), 2,2)
replicates$position <- gsub("[^0-9]","",replicates$variant)
replicates[replicates$end == "X","class"] <- "nonsense"
replicates[replicates$class == "single","class"] <- "missense"
replicates[replicates$start == replicates$end,"class"] <- "synonymous"
replicates[replicates$variant == "wt","start"] <- "Z"
replicates[replicates$variant == "wt","position"] <- "0"
replicates[replicates$variant == "wt","end"] <- "Z"
## Let's try to put in some abundance classifications here
replicates$se <- replicates$sd / sqrt(replicates$count)
replicates$lower_ci <- replicates$average - qnorm(0.975) * replicates$se
replicates$upper_ci <- replicates$average + qnorm(0.975) * replicates$se
replicates["average" == "NaN"] <- NA
synonymous_lowest_5percent <- quantile(subset(replicates, class == "synonymous")$average, 0.05, na.rm = TRUE)
synonymous_median <- quantile(subset(replicates, class == "synonymous")$average, 0.5, na.rm = TRUE)
replicate_filter <- 4
passing_replicate_filter <- subset(replicates, count >= replicate_filter)
passing_replicate_filter$abundance_class <- NA
for (x in 1:nrow(passing_replicate_filter)){
if(is.na(passing_replicate_filter$average[x])){passing_replicate_filter$abundance_class[x] <- "unknown"}
if(passing_replicate_filter$average[x] < synonymous_lowest_5percent & passing_replicate_filter$upper_ci[x] >= synonymous_lowest_5percent){passing_replicate_filter$abundance_class[x] <- "possibly_low"}
if(passing_replicate_filter$average[x] < synonymous_lowest_5percent & passing_replicate_filter$upper_ci[x] < synonymous_lowest_5percent){passing_replicate_filter$abundance_class[x] <- "low"}
if(passing_replicate_filter$average[x] > synonymous_lowest_5percent & passing_replicate_filter$lower_ci[x] < synonymous_lowest_5percent){passing_replicate_filter$abundance_class[x] <- "possibly_wt-like"}
if(passing_replicate_filter$average[x] > synonymous_lowest_5percent & passing_replicate_filter$lower_ci[x] >= synonymous_lowest_5percent){passing_replicate_filter$abundance_class[x] <- "wt-like"}
}
replicates <- merge(replicates, passing_replicate_filter[,c("variant","abundance_class")], by = "variant", all.x = T)
write.csv(file = "Output_datatables/PTEN_fillin_vampseq_combined.csv", replicates, row.names = FALSE, quote = FALSE)
pten_fillin_vampseq <- ggplot() +
geom_density(data = subset(passing_replicate_filter, class == "missense"), aes(x = average), alpha = 0.4) +
geom_density(data = subset(passing_replicate_filter, class == "nonsense" & (position > 50 | position < 350)), aes(x = average), fill = "blue", alpha = 0.4) +
geom_density(data = subset(passing_replicate_filter, class == "synonymous"), aes(x = average), fill = "red", alpha = 0.4) +
geom_vline(xintercept = subset(passing_replicate_filter, variant == "wt")$average)
pten_fillin_vampseq
ggsave(file = "Plots/PTEN_fillin_VAMPseq_densityplot.pdf", pten_fillin_vampseq, height = 3, width = 5)
pten_fillin_vampseq <- ggplot() +
geom_histogram(data = subset(passing_replicate_filter, class == "missense"), aes(x = average), alpha = 0.4) +
geom_histogram(data = subset(passing_replicate_filter, class == "nonsense" & (position > 50 | position < 350)), aes(x = average), fill = "blue", alpha = 0.4) +
geom_histogram(data = subset(passing_replicate_filter, class == "synonymous"), aes(x = average), fill = "red", alpha = 0.4) +
geom_vline(xintercept = subset(passing_replicate_filter, variant == "wt")$average)
pten_fillin_vampseq
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggsave(file = "Plots/PTEN_fillin_VAMPseq_histogram.pdf", pten_fillin_vampseq, height = 3, width = 5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
This is now the analysis portion
Import all relevant datasets
## Import the original dataset with relevant columns
original <- read.delim(file = "input_datatables/PTEN_abundance_original.tsv", header = TRUE, stringsAsFactors = FALSE)
colnames(original)[colnames(original) %in% c("abundance_class","score","sd","expts","se","lower_ci","upper_ci","egfp_geomean")] <- c("abundance_class_orig","score_orig","sd_orig","count_orig","se_orig","lower_ci_orig","upper_ci_orig","egfp_geomean")
# Rename the score for the original dataset
colnames(original)[colnames(original) == "score"] <- "score_orig"
## Import the fill-in dataset
replicates <- read.csv(file = "output_datatables/PTEN_fillin_vampseq_combined.csv", header = TRUE, stringsAsFactors = FALSE)
colnames(replicates)[colnames(replicates) %in% c("e1s1","e1s2","e1s3","e1s4","e3s1","e3s2","e3s3","average","count","sd","se","lower_ci","upper_ci","abundance_class")] <- c("score9","score10","score11","score12","score13","score14","score15","score_fillin","count_fillin","sd_fillin","se_fillin","lower_ci_fillin","upper_ci_fillin","abundance_class_fillin")
combined <- merge(
replicates[,c("variant","score9","score10","score11","score12","score13","score14","score15","score_fillin","count_fillin","sd_fillin","se_fillin","lower_ci_fillin","upper_ci_fillin","abundance_class_fillin")],
original[,c("variant","position","start","end","class","snv","score_orig","abundance_class_orig","sd_orig","count_orig","se_orig","lower_ci_orig","upper_ci_orig","egfp_geomean","score1","score2","score3","score4","score5","score6","score7","score8")],
by = "variant", all = T)
combined <- combined[,c("variant","position","start","end","class","snv","egfp_geomean","score_orig","abundance_class_orig","score_fillin","score1","score2","score3","score4","score5","score6","score7","score8","score9","score10","score11","score12","score13","score14","score15","sd_orig","count_orig","se_orig","lower_ci_orig","upper_ci_orig","sd_fillin","count_fillin","se_fillin","lower_ci_fillin","upper_ci_fillin")]
First paragraph of results
total_possible <- (402*21) # Includes all missense, nonsense, and synonymous
paste("PTEN variants scored in the first paper:",nrow(subset(combined, !is.na(score_orig)))-2)
## [1] "PTEN variants scored in the first paper: 4407"
paste("PTEN missense variants scored in the first paper:",nrow(subset(combined, class == "missense" & !is.na(score_orig)))-2)
## [1] "PTEN missense variants scored in the first paper: 4110"
paste("Fraction of possible variants scored in the first paper:",round((nrow(subset(combined, !is.na(score_orig)))-2)/total_possible,2))
## [1] "Fraction of possible variants scored in the first paper: 0.52"
paste("Number of positions where 18 or more missense variants were scored in the first paper:", sum(data.frame(table((combined %>% filter(class == "missense", !is.na(score_orig) & variant != "M1V"))$position))$Freq >= 17))
## [1] "Number of positions where 18 or more missense variants were scored in the first paper: 50"
paste("There were",nrow(subset(replicates, class == "missense")),"unique PTEN missense variants associated with a barcode in the secondary library.")
## [1] "There were 4186 unique PTEN missense variants associated with a barcode in the secondary library."
Compare the well-scored variants for the first and second datasets
combined_5plus <- combined %>% filter(count_orig >= 5 & count_fillin >= 5)
paste("There were",nrow(combined_5plus),"variants that were observed in both experiments in 5 or more replicates, and the there was a Pearson's correlation coefficient (r) of",round(cor(combined_5plus$score_orig, combined_5plus$score_fillin),2),"for the scores of the variants from the two different libraries.")
## [1] "There were 272 variants that were observed in both experiments in 5 or more replicates, and the there was a Pearson's correlation coefficient (r) of 0.84 for the scores of the variants from the two different libraries."
lm_orig_fillin <- lm(combined_5plus$score_fillin ~ combined_5plus$score_orig)
paste("For a linear model fit to the overlapping data, the slope was",round(lm_orig_fillin$coefficients[2],2),"and the intercept was",round(lm_orig_fillin$coefficients[1],2))
## [1] "For a linear model fit to the overlapping data, the slope was 0.89 and the intercept was 0.11"
Experiment_score_correlation_plot.pdf <- ggplot() +
theme_bw() +
theme(panel.grid.major = element_blank()) +
scale_x_continuous(breaks = seq(0,1.2,0.2)) +
scale_y_continuous(breaks = seq(0,1.2,0.2)) +
xlab("Score from original experiment") +
ylab("Score from second experiment") +
geom_abline(slope = 1, intercept = 0, alpha = 0.2, size = 10) +
geom_abline(slope = lm_orig_fillin$coefficients[2], intercept = lm_orig_fillin$coefficients[1], alpha = 0.2, size = 10, color = "blue") +
geom_point(data = combined_5plus, aes(x = score_orig, y = score_fillin), alpha = 0.5) +
geom_text(aes(x = 0, y = 1.2, label = paste("n=",nrow(combined_5plus))))
Experiment_score_correlation_plot.pdf
ggsave(file = "plots/Experiment_score_correlation_plot.pdf", Experiment_score_correlation_plot.pdf, height = 50, width = 80, units = "mm")
paste("Pearson's r:",round(cor(combined_5plus$score_orig, combined_5plus$score_fillin, method = "pearson"),2))
## [1] "Pearson's r: 0.84"
paste("Percent of variance explained:",round(cor(combined_5plus$score_orig, combined_5plus$score_fillin, method = "pearson")^2,2))
## [1] "Percent of variance explained: 0.7"
paste("Percent of variance unexplained:",round(1-cor(combined_5plus$score_orig, combined_5plus$score_fillin)^2,2))
## [1] "Percent of variance unexplained: 0.3"
Using all of the available data to ascribe consensus abundance scores
combined$score_total <- rowMeans(combined[c("score1","score2","score3","score4","score5","score6","score7","score8","score9","score10","score11","score12","score13","score14","score15")], na.rm = T)
combined$count_total <- rowSums(!is.na(combined[c("score1","score2","score3","score4","score5","score6","score7","score8","score9","score10","score11","score12","score13","score14","score15")]))
combined$sd_total <- rowMeans(cbind(abs(combined$score_total-combined$score1), abs(combined$score_total-combined$score2), abs(combined$score_total-combined$score3), abs(combined$score_total-combined$score4), abs(combined$score_total-combined$score5), abs(combined$score_total-combined$score6), abs(combined$score_total-combined$score7), abs(combined$score_total-combined$score8), abs(combined$score_total-combined$score9), abs(combined$score_total-combined$score10), abs(combined$score_total-combined$score11), abs(combined$score_total-combined$score12),abs(combined$score_total-combined$score13),abs(combined$score_total-combined$score14),abs(combined$score_total-combined$score15)), na.rm = T)
combined$se_total <- combined$sd_total / sqrt(combined$count_total)
combined$lower_ci_total <- combined$score_total - qnorm(0.975) * combined$se_total
combined$upper_ci_total <- combined$score_total + qnorm(0.975) * combined$se_total
Creating a schematic that will help in describing the replicate filtering scheme in a supplementary figure
schematic_dataframe <- data.frame("variant" = c("A120A","F257F","K322K","A79X","C250X","D331X"),
"class" = c("syn","syn","syn","nonsense","nonsense","nonsense"),
"score" = c(0.8,0.9,0.5,0.1,0.6,0.2),
"linked" = c(7,8,3,4,2,5),
"randomized" = c(2,4,8,5,7,3))
schematic_colorscale <- c("syn" = "red","nonsense" = "blue")
Schematic_1 <- ggplot(data = schematic_dataframe, aes(x = score, fill = class, color = class)) +
theme(legend.position = "none", panel.grid.major = element_blank(), axis.ticks.y = element_blank(), axis.text.y = element_blank()) +
xlab("Abundance score") + ylab("Variant count") +
scale_x_continuous(limits = c(0,1), expand = c(0,0), breaks = c(0,0.5,1)) + scale_y_continuous(expand = c(0,0.1)) +
scale_fill_manual(values = schematic_colorscale) + scale_color_manual(values = schematic_colorscale) +
geom_hline(yintercept = 0) + geom_vline(xintercept = 0) +
geom_point(aes(y = 0), size = 4) +
geom_density(aes(color = class), alpha = 0.2, adjust = 1.5)
Schematic_1
ggsave(file = "Plots/Schematic_1.pdf", Schematic_1, height = 4, width = 6, units = "cm")
Schematic_2 <- ggplot(data = subset(schematic_dataframe, linked >= 4), aes(x = score, fill = class, color = class)) +
theme(legend.position = "none", panel.grid.major = element_blank(), axis.ticks.y = element_blank(), axis.text.y = element_blank()) +
xlab("Abundance score") + ylab("Variant count") +
scale_x_continuous(limits = c(0,1), expand = c(0,0), breaks = c(0,0.5,1)) + scale_y_continuous(expand = c(0,0.1)) +
scale_fill_manual(values = schematic_colorscale) + scale_color_manual(values = schematic_colorscale) +
geom_hline(yintercept = 0) + geom_vline(xintercept = 0) +
geom_point(aes(y = 0), size = 4) +
geom_density(aes(color = class), alpha = 0.2, adjust = 1.5)
Schematic_2
ggsave(file = "Plots/Schematic_2.pdf", Schematic_2, height = 4, width = 6, units = "cm")
Schematic_3 <- ggplot(data = subset(schematic_dataframe, randomized >= 4), aes(x = score, fill = class, color = class)) +
theme(legend.position = "none", panel.grid.major = element_blank(), axis.ticks.y = element_blank(), axis.text.y = element_blank()) +
xlab("Abundance score") + ylab("Variant count") +
scale_x_continuous(limits = c(0,1), expand = c(0,0), breaks = c(0,0.5,1)) + scale_y_continuous(expand = c(0,0.1)) +
scale_fill_manual(values = schematic_colorscale) + scale_color_manual(values = schematic_colorscale) +
geom_hline(yintercept = 0) + geom_vline(xintercept = 0) +
geom_point(aes(y = 0), size = 4) +
geom_density(aes(color = class), alpha = 0.2, adjust = 1.5)
Schematic_3
ggsave(file = "Plots/Schematic_3.pdf", Schematic_3, height = 4, width = 6, units = "cm")
Lets optimize replicate number cutoff based on controls (synonymous and nonsense)
summary_frame<- data.frame("cutoff_inclusive" = seq(1,15))
resampling_number <-100
for(y in 1:resampling_number){
observed_frame <- data.frame("cutoff_inclusive" = seq(1,15))
observed_frame$syn_value <- 0
observed_frame$called_correctly <- 0
observed_frame$called_incorrectly <- 0
shuffled_frame <- data.frame("cutoff_inclusive" = seq(1,15))
shuffled_frame$syn_value <- 0
shuffled_frame$called_correctly <- 0
scorable_syn <- nrow(subset(combined, class == "synonymous" & !is.na(score_total)))
scorable_syn_subset <- subset(combined, class == "synonymous" & !is.na(score_total))
scorable_syn_subset$shuffled_score_total <- sample(scorable_syn_subset$score_total, replace = F)
scorable_nonsense <- nrow(subset(combined, class == "nonsense" & !is.na(score_total) & position < 350 & position > 50))
nonsense_with_scores <- subset(combined, class == "nonsense" & !is.na(score_total))
nonsense_with_scores$shuffled_upperci <- sample(nonsense_with_scores$upper_ci_total, replace = F)
for(x in 1:nrow(observed_frame)){
temp_subset <- subset(combined, count_total >= observed_frame$cutoff_inclusive[x] & !is.na(score_total))
temp_subset_syn <- subset(scorable_syn_subset, class == "synonymous" & count_total >= observed_frame$cutoff_inclusive[x] & !is.na(score_total))
temp_subset_syn_threshold <- quantile(temp_subset_syn$score_total, 0.05)
temp_subset_nonsense <- subset(nonsense_with_scores, count_total >= observed_frame$cutoff_inclusive[x] & !is.na(score_total) & position < 350 & position > 50)
called_correctly <- sum(temp_subset_nonsense$upper_ci_total < temp_subset_syn_threshold, na.rm = T)
called_incorrectly <- sum(temp_subset_nonsense$upper_ci_total >= temp_subset_syn_threshold, na.rm = T)
observed_frame$syn_value[x] <- temp_subset_syn_threshold
observed_frame$called_correctly[x] <- called_correctly
observed_frame$called_incorrectly[x] <- called_incorrectly
temp_subset_syn_threshold_shuffled <- quantile(temp_subset_syn$shuffled_score_total, 0.05)
random_correctly <- sum(temp_subset_nonsense$shuffled_upperci < temp_subset_syn_threshold_shuffled, na.rm = T)
random_incorrectly <- sum(temp_subset_nonsense$shuffled_upperci >= temp_subset_syn_threshold_shuffled, na.rm = T)
shuffled_frame$syn_value[x] <- temp_subset_syn_threshold_shuffled
shuffled_frame$called_correctly[x] <- random_correctly
}
summary_frame <- cbind(summary_frame, observed_frame$called_correctly - shuffled_frame$called_correctly)
}
summary_frame$average_correct <- rowMeans(summary_frame[,seq(2,resampling_number+1)])
summary_frame_melted <- melt(summary_frame[,seq(1,resampling_number+1)], id = "cutoff_inclusive")
observed_frame$pct_correct_of_total_nonsense <- observed_frame$called_correctly / (observed_frame$called_correctly + observed_frame$called_incorrectly) * 100
correct_and_incorrectly_scored_plot <- ggplot() +
theme_bw() +
theme(panel.grid.major.x = element_blank()) +
labs(x = "Minimum variant replicate\ncount for inclusion", y = "Correctly(green) or\nincorrectly(red) scored") +
scale_x_continuous(limits = c(0,15), breaks = c(0,5,10,15)) +
geom_hline(yintercept = 0, alpha = 0.5) +
geom_line(data = observed_frame, aes(x = cutoff_inclusive, y = called_correctly), alpha = 0.5, width = 0.2, color = "dark green") +
geom_point(data = observed_frame, aes(x = cutoff_inclusive, y = called_correctly), alpha = 0.5, width = 0.2, color = "dark green") +
geom_text(data = observed_frame, aes(x = cutoff_inclusive, y = called_correctly+10, label = called_correctly), alpha = 1, color = "dark green", size = 2.5) +
geom_line(data = observed_frame, aes(x = cutoff_inclusive, y = called_incorrectly), alpha = 0.5, width = 0.2, color = "magenta") +
geom_point(data = observed_frame, aes(x = cutoff_inclusive, y = called_incorrectly), alpha = 0.5, width = 0.2, color = "magenta") +
geom_text(data = observed_frame, aes(x = cutoff_inclusive, y = called_incorrectly-15, label = called_incorrectly), alpha = 1, color = "magenta", size = 2.5)
correct_and_incorrectly_scored_plot
ggsave(file = "Plots/Correct_and_incorrectly_scored_plot.pdf", correct_and_incorrectly_scored_plot, height = 50, width = 80, units = "mm")
replicate_cutoff_nonsense_syn_plot <- ggplot() +
theme_bw() +
theme(panel.grid.major.x = element_blank()) +
scale_x_continuous(limits = c(0,15), breaks = c(0,5,10,15)) +
labs(x = "Minimum variant replicate\ncount for inclusion", y = "Nonsense variants correctly\nscored over random") +
geom_jitter(data = summary_frame_melted, aes(x = cutoff_inclusive, y = value), alpha = 0.1, width = 0.2) +
geom_boxplot(data = summary_frame_melted, aes(x = cutoff_inclusive, y = value, group = cutoff_inclusive), alpha = 0.2, fill = "red")
replicate_cutoff_nonsense_syn_plot
ggsave(file = "Plots/Replicate_cutoff_nonsense_syn_plot.pdf", replicate_cutoff_nonsense_syn_plot, height = 60, width = 80, units = "mm")
Using the optimal replicate filter value obtained from the above graph to actually filter the data
combined$score_abundance <- NA
for(x in 1:nrow(combined)){
if(combined$count_total[x] >= 4){combined$score_abundance[x] <- combined$score_total[x]}
}
synonymous_lowest_5percent <- quantile(subset(combined, class == "synonymous")$score_abundance, 0.05, na.rm = TRUE)
synonymous_median <- quantile(subset(combined, class == "synonymous")$score_abundance, 0.5, na.rm = TRUE)
combined_for_plotting_classes <- rbind(
combined %>% filter(class == "missense") %>% mutate(plotting_group = "missense"),
combined %>% filter(class == "synonymous") %>% mutate(plotting_group = "synonymous"),
combined %>% filter(class == "nonsense" & position < 350 & position > 50) %>% mutate(plotting_group = "nonsense"))
combined_for_plotting_classes$plotting_group = factor(combined_for_plotting_classes$plotting_group, levels = c("synonymous","nonsense","missense"))
syn_non_mis_hist_n_frame <- data.frame("n" = c(nrow(subset(combined_for_plotting_classes, plotting_group == "synonymous" & !is.na(score_abundance))),
nrow(subset(combined_for_plotting_classes, plotting_group == "nonsense" & !is.na(score_abundance))),
nrow(subset(combined_for_plotting_classes, plotting_group == "missense" & !is.na(score_abundance)))),
"plotting_group" = c("synonymous","nonsense","missense"), "y" = c(15,20,300))
Synonymous_nonsense_missense_histograms_plot <- ggplot() +
theme_bw() +
theme(panel.grid.major.y = element_blank()) +
xlab("Abundance score") +
geom_hline(yintercept = 0) +
geom_histogram(data = combined_for_plotting_classes, aes(x = score_abundance), fill = "grey50", alpha = 0.5, color = "black") +
geom_text(data = syn_non_mis_hist_n_frame, aes(x = 0.4, y = y, label = paste("n=",n)), hjust = 0.5) +
facet_grid(rows = vars(plotting_group), scales = "free_y")
Synonymous_nonsense_missense_histograms_plot
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3436 rows containing non-finite values (stat_bin).
ggsave(file = "Plots/Synonymous_nonsense_missense_histograms_plot.pdf", Synonymous_nonsense_missense_histograms_plot, height = 50, width = 80, units = "mm")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3436 rows containing non-finite values (stat_bin).
paste("We now have abundance scores for",
nrow(subset(combined, class == "synonymous" & !is.na(score_abundance))),"synonymous variants")
## [1] "We now have abundance scores for 144 synonymous variants"
paste("We now have abundance scores for",
nrow(subset(combined, class == "nonsense" & !is.na(score_abundance))),"nonsense variants")
## [1] "We now have abundance scores for 160 nonsense variants"
paste("We now have abundance scores for",
nrow(subset(combined, class == "missense" & !is.na(score_abundance))),"missense variants")
## [1] "We now have abundance scores for 4387 missense variants"
To see if the updated dataset is higher in quality, assess how the coefficient of variation changed for variants observed in both datasets
combined_for_plotting_classes_4plus_orig <- combined_for_plotting_classes %>% filter(count_orig >= 4 & score_orig >= -0.2 & score_total >= -0.2)
combined_for_plotting_classes_4plus_composite <- combined_for_plotting_classes %>% filter(count_total >= 4 & score_orig >= -0.2 & score_total >= -0.2)
combined_for_plotting_classes_4plus_orig$cv_orig <- combined_for_plotting_classes_4plus_orig$sd_orig / combined_for_plotting_classes_4plus_orig$score_orig
combined_for_plotting_classes_4plus_composite$cv_total <- combined_for_plotting_classes_4plus_composite$sd_total / combined_for_plotting_classes_4plus_composite$score_total
combined_for_plotting_classes_4plus_orig[combined_for_plotting_classes_4plus_orig$cv_orig < - 0.1,"cv_orig"] <- -0.2
combined_for_plotting_classes_4plus_composite[combined_for_plotting_classes_4plus_composite$cv_total < - 0.1,"cv_total"] <- -0.2
combined_for_plotting_classes_4plus_orig[combined_for_plotting_classes_4plus_orig$cv_orig > 1.5,"cv_orig"] <- 1.7
combined_for_plotting_classes_4plus_composite[combined_for_plotting_classes_4plus_composite$cv_total > 1.5,"cv_total"] <- 1.7
paste("fraction of original missense variants with CV < 0.5:",round(nrow(subset(combined_for_plotting_classes_4plus_orig, cv_orig > 0 & cv_orig < 0.5)) / nrow(combined_for_plotting_classes_4plus_orig),2))
## [1] "fraction of original missense variants with CV < 0.5: 0.76"
paste("fraction of total missense variants with CV < 0.5:",round(nrow(subset(combined_for_plotting_classes_4plus_composite, cv_total > 0 & cv_total < 0.5)) / nrow(combined_for_plotting_classes_4plus_composite),2))
## [1] "fraction of total missense variants with CV < 0.5: 0.78"
paste("fraction of original missense variants with CV > 1.5:",round(nrow(subset(combined_for_plotting_classes_4plus_orig, cv_orig > 0 & cv_orig > 1.5)) / nrow(combined_for_plotting_classes_4plus_orig),2))
## [1] "fraction of original missense variants with CV > 1.5: 0.02"
paste("fraction of total missense variants with CV > 1.5:",round(nrow(subset(combined_for_plotting_classes_4plus_composite, cv_total > 0 & cv_total > 1.5)) / nrow(combined_for_plotting_classes_4plus_composite),2))
## [1] "fraction of total missense variants with CV > 1.5: 0.01"
Coefficient_of_variation_plot <-
ggplot() + theme_bw() +
scale_x_continuous(limits = c(-0.25,1.8)) +
geom_histogram(data = combined_for_plotting_classes_4plus_orig, aes(x = cv_orig), fill = "blue", alpha = 0.5, color = "blue", binwidth = 0.05) +
geom_histogram(data = combined_for_plotting_classes_4plus_composite, aes(x = cv_total), fill = "red", alpha = 0.5, color = "red", binwidth = 0.05) +
geom_vline(xintercept = 0.5, linetype = 2, alpha = 0.4) + geom_vline(xintercept = 0, linetype = 2, alpha = 0.4) +
ylab("Number of variants") + xlab("Coefficient of variation")
Coefficient_of_variation_plot
## Warning: Removed 2 rows containing missing values (geom_bar).
## Warning: Removed 2 rows containing missing values (geom_bar).
ggsave(file = "Plots/Coefficient_of_variation_plot.pdf", Coefficient_of_variation_plot, height = 40, width = 60, units = "mm")
## Warning: Removed 2 rows containing missing values (geom_bar).
## Warning: Removed 2 rows containing missing values (geom_bar).
cv_dataframe <- data.frame(dataset = c("Original","Composite"),
cv_below_pt5 = c(nrow(subset(combined_for_plotting_classes_4plus_orig, cv_orig > 0 & cv_orig < 0.5)),
nrow(subset(combined_for_plotting_classes_4plus_composite, cv_total > 0 & cv_total < 0.5))))
cv_dataframe$dataset <- factor(cv_dataframe$dataset, levels = c("Original","Composite"))
low_cv_bargraph <- ggplot() + theme(panel.grid.major.x = element_blank()) + labs(x = NULL, y = "Number of variants") +
scale_fill_manual(values = c("Original" = "blue", "Composite" = "red")) +
geom_bar(data = cv_dataframe, aes(x = dataset, y = cv_below_pt5, fill = dataset), stat = "identity", alpha = 0.75, color = "black")
low_cv_bargraph
ggsave(file = "Plots/Low_CV_bargraph.pdf", low_cv_bargraph, height = 30, width = 60, units = "mm")
paste("There was a",round((cv_dataframe$cv_below_pt5[2] - cv_dataframe$cv_below_pt5[1]) / cv_dataframe$cv_below_pt5[1] * 100,1),"percent increase in the number of variants with a CV less than 0.5")
## [1] "There was a 12.5 percent increase in the number of variants with a CV less than 0.5"
lm_composite_abundance <- lm(combined_for_plotting_classes$score_abundance ~ log10(combined_for_plotting_classes$egfp_geomean))
round(cor(combined_for_plotting_classes$score_abundance, combined_for_plotting_classes$egfp_geomean, use = "complete", method = "spearman")^2,2)
## [1] 0.93
lm_orig_abundance <- lm(combined_for_plotting_classes$score_orig ~ log10(combined_for_plotting_classes$egfp_geomean))
round(cor(combined_for_plotting_classes$score_orig, combined_for_plotting_classes$egfp_geomean, use = "complete", method = "spearman")^2,2)
## [1] 0.93
ggplot() + labs(x = "Log10 of geometric mean of MFI", y = "Abundance score") +
geom_abline(slope = lm_composite_abundance$coefficients[2], intercept = lm_composite_abundance$coefficients[1], alpha = 0.25, size = 2, color = "red") +
geom_point(data = combined_for_plotting_classes_4plus_composite, aes(x = log10(egfp_geomean), y = score_abundance), color = "red", alpha = 0.5) +
geom_abline(slope = lm_orig_abundance$coefficients[2], intercept = lm_orig_abundance$coefficients[1], alpha = 0.25, size = 2, color = "blue") +
geom_point(data = combined_for_plotting_classes_4plus_orig, aes(x = log10(egfp_geomean), y = score_orig), color = "blue", alpha = 0.5)
## Warning: Removed 3907 rows containing missing values (geom_point).
## Warning: Removed 3559 rows containing missing values (geom_point).
round(cor(combined_for_plotting_classes$score_orig, combined_for_plotting_classes$egfp_geomean, use = "complete", method = "spearman")^2,2)
## [1] 0.93
round(cor(combined_for_plotting_classes$score_abundance, combined_for_plotting_classes$egfp_geomean, use = "complete", method = "spearman")^2,2)
## [1] 0.93
Scores_vs_individually_assessed_variants_plot <- ggplot() + labs(x = "Mean fluorescence intensity of\nindividually tested variants", y = "Abundance score") +
geom_point(data = combined_for_plotting_classes_4plus_orig, aes(x = egfp_geomean, y = score_orig), color = "blue", alpha = 0.5) +
geom_point(data = combined_for_plotting_classes_4plus_composite, aes(x = egfp_geomean, y = score_abundance), color = "red", alpha = 0.5) +
geom_text(aes(x = 1.6, y = 0.2, label = paste("Spearman's rho^2, original:",round(cor(combined_for_plotting_classes$score_orig, combined_for_plotting_classes$egfp_geomean, use = "complete", method = "spearman")^2,2))), hjust = 1, color = "blue", size = 2.75) +
geom_text(aes(x = 1.6, y = 0, label = paste("Spearman's rho^2, composite:",round(cor(combined_for_plotting_classes$score_abundance, combined_for_plotting_classes$egfp_geomean, use = "complete", method = "spearman")^2,2))), hjust = 1, color = "red", size = 2.75)
Scores_vs_individually_assessed_variants_plot
## Warning: Removed 3559 rows containing missing values (geom_point).
## Warning: Removed 3907 rows containing missing values (geom_point).
ggsave(file = "Plots/Scores_vs_individually_assessed_variants_plot.pdf", Scores_vs_individually_assessed_variants_plot, height = 40, width = 70, units = "mm")
## Warning: Removed 3559 rows containing missing values (geom_point).
## Warning: Removed 3907 rows containing missing values (geom_point).
Ascribe abundance classes based on the full dataset
synonymous_lowest_5percent <- quantile(subset(combined, class == "synonymous")$score_abundance, 0.05, na.rm = TRUE)
synonymous_median <- quantile(subset(combined, class == "synonymous")$score_abundance, 0.5, na.rm = TRUE)
combined$abundance_class <- NA
for (x in 1:nrow(combined)){
if(is.na(combined$score_abundance[x]) | is.na(combined$upper_ci_total[x]) | is.na(combined$lower_ci_total[x])){combined$abundance_class[x] <- "unknown"} else{
if(combined$score_abundance[x] < synonymous_lowest_5percent){combined$abundance_class[x] <- "possibly_low"}
if(combined$upper_ci_total[x] < synonymous_lowest_5percent){combined$abundance_class[x] <- "low"}
if(combined$score_abundance[x] > synonymous_lowest_5percent){combined$abundance_class[x] <- "possibly_wt-like"}
if(combined$lower_ci_total[x] > synonymous_lowest_5percent){combined$abundance_class[x] <- "wt-like"}
}
}
## Export the initial data
PTEN_abundance_table <- subset(combined, abundance_class %in% c("low","possibly_low","wt-like","possibly_wt-like"))[,c("variant","position","start","end","abundance_class")]
PTEN_abundance_table$position <- as.numeric(PTEN_abundance_table$position)
PTEN_abundance_table <- PTEN_abundance_table[order(PTEN_abundance_table$position),]
write.csv(file = "output_datatables/PTEN_abundance_table.csv", row.names = F, PTEN_abundance_table, quote = F)
A summary of key stats from the composite abundance dataset
unassigned_so_far <- subset(combined, !is.na(score_abundance) & !(class %in% c("synonymous","nonsense","missense")))
## Looks like a few synonymous variants were not classified that. Go back and do that now.
for(x in 1:nrow(combined)){
if(is.na(combined$start[x])){
combined$start[x] <- substr(gsub("[^A-Za-z]", "", combined$variant[x]),1,1)
combined$end[x] <- substr(gsub("[^A-Za-z]", "", combined$variant[x]),2,2)
combined$position[x] <- gsub("[^0-9]", "", combined$variant[x])
if(combined$start[x] == combined$end[x]){combined$class[x] <- "synonymous"}
}
}
combined <- combined %>% filter(position != "")
paste("Number of variants that were not scored before but now have scores",nrow(combined %>% filter(is.na(score_orig) & !is.na(score_abundance))))
## [1] "Number of variants that were not scored before but now have scores 764"
paste("Number of variants that were originally scored but given new data with the fill-in experiments:",nrow(combined %>% filter(!is.na(score_orig)) %>% filter((!is.na(score9) | !is.na(score10) | !is.na(score11) | !is.na(score12) | !is.na(score13) | !is.na(score14) | !is.na(score15))) %>% filter(count_total >= 4)))
## [1] "Number of variants that were originally scored but given new data with the fill-in experiments: 3351"
paste("Number of total PTEN variants currently scored:",round(nrow(subset(combined, !is.na(score_abundance))),2))
## [1] "Number of total PTEN variants currently scored: 4721"
paste("Number of synonymous PTEN variants currently scored:",round(nrow(subset(combined, !is.na(score_abundance) & class == "synonymous")),2))
## [1] "Number of synonymous PTEN variants currently scored: 174"
paste("Number of nonsense PTEN variants currently scored:",round(nrow(subset(combined, !is.na(score_abundance) & class == "nonsense")),2))
## [1] "Number of nonsense PTEN variants currently scored: 160"
paste("Number of missense PTEN variants currently scored:",round(nrow(subset(combined, !is.na(score_abundance) & class == "missense")),2))
## [1] "Number of missense PTEN variants currently scored: 4387"
paste("Number of variants that were not scored before but got a score after the second dataset:",round(nrow(subset(combined, is.na(score_orig) & !is.na(score_abundance))),2))
## [1] "Number of variants that were not scored before but got a score after the second dataset: 764"
paste("Number of variants that received adjusted scores:",round(nrow(subset(combined, !is.na(score_orig) & score_orig != score_abundance)),2))
## [1] "Number of variants that received adjusted scores: 3904"
paste("Number of low abundance PTEN variants intepreted in total:",round(nrow(subset(combined, abundance_class %in% c("low"))),2))
## [1] "Number of low abundance PTEN variants intepreted in total: 1423"
paste("Number of WT-like abundance PTEN variants intepreted in total:",round(nrow(subset(combined, abundance_class %in% c("wt-like"))),2))
## [1] "Number of WT-like abundance PTEN variants intepreted in total: 1738"
paste("The number of additional low abundance variants identified in the combined dataset:",nrow(subset(combined, abundance_class %in% c("low"))) - nrow(subset(combined, abundance_class_orig %in% c("low"))))
## [1] "The number of additional low abundance variants identified in the combined dataset: 163"
paste("The number of additional WT-like abundance variants identified in the combined dataset:",nrow(subset(combined, abundance_class %in% c("wt-like"))) - nrow(subset(combined, abundance_class_orig %in% c("wt-like"))))
## [1] "The number of additional WT-like abundance variants identified in the combined dataset: 161"
interpreted_subset <- subset(combined, abundance_class %in% c("low","wt-like"))
paste("Fraction of scored variants in the combined study considered low or WT-like:",round(nrow(subset(combined, abundance_class %in% c("low","wt-like"))) / nrow(subset(combined, abundance_class %in% c("low","wt-like","possibly_low","possibly_wt-like"))),2))
## [1] "Fraction of scored variants in the combined study considered low or WT-like: 0.67"
paste("Fraction of scored variants in the original study considered low or WT-like:",round(nrow(subset(combined, abundance_class_orig %in% c("low","wt-like")))/nrow(subset(combined, abundance_class_orig %in% c("low","wt-like","possibly_low","possibly_wt-like"))),2))
## [1] "Fraction of scored variants in the original study considered low or WT-like: 0.64"
paste("Number of positions where 18 or more missense variants were scored in the end:", sum(data.frame(table((combined %>% filter(class == "missense", !is.na(score_abundance) & variant != "M1V"))$position))$Freq >= 17))
## [1] "Number of positions where 18 or more missense variants were scored in the end: 61"
paste("Number of positions where all missense variants were scored in the end:", sum(data.frame(table((combined %>% filter(class == "missense", !is.na(score_abundance) & variant != "M1V"))$position))$Freq == 19))
## [1] "Number of positions where all missense variants were scored in the end: 22"
## Also figuring out how many variants enter each abundance class
abundance_class_summary <- merge(data.frame(table(combined$abundance_class_orig)),data.frame(table(combined$abundance_class)) %>% filter(Var1 != "unknown"), by = "Var1")
colnames(abundance_class_summary) <- c("abundance_class","Original","Composite")
abundance_class_summary_melt <- melt(abundance_class_summary, id = "abundance_class")
Abundance_class_original_composite <- ggplot() + theme(panel.grid.major.x = element_blank()) + labs(x = NULL, y = "Number of variants") +
scale_fill_manual(values = c("Original" = "blue", "Composite" = "red")) +
geom_bar(data = abundance_class_summary_melt, aes(x = abundance_class, y = value, fill = variable), stat = "identity", alpha = 0.75, position_dodge(), color = "black")
Abundance_class_original_composite
ggsave(file = "Plots/Abundance_class_original_composite.pdf", Abundance_class_original_composite, height = 40, width = 90, units = "mm")
write.table(file = "output_datatables/Composite_abundance_data.tsv", combined, sep = "\t", quote = F, row.names = F)
Updating the ClinVar assessments with the composite abundance dataset
Bringing in a more recent PTEN ClinVar dataset
## Add annotations for Clinvar
clinvar <- read.table(file = "input_datatables/20210415_PTEN_clinvar_missense_nonsense.tsv", header = TRUE, sep = "\t")
clinvar$Name <- as.character(clinvar$Name)
clinvar$position <- substr(clinvar$Name, nchar((clinvar$Name)) - 12, nchar((clinvar$Name)))
clinvar <- clinvar[grepl("\\(p.", clinvar$position),]
for (x in 1:nrow(clinvar)){clinvar$position[x] <- unlist(strsplit(clinvar$position[x],"\\(p."))[2]}
clinvar <- subset(clinvar, position != "")
clinvar <- subset(clinvar, nchar(position) < 20)
clinvar$start <- "NA"
clinvar$end <- "NA"
for (x in 1:nrow(clinvar)){
clinvar$start[x] <- substr(clinvar$position[x], 1, 3)
clinvar$end[x] <- substr(clinvar$position[x], nchar(toString(clinvar$position[x]))-4, nchar(toString(clinvar$position[x])))}
clinvar$start <- gsub("[^A-Za-z]", "", clinvar$start)
clinvar$end <- gsub("[^A-Za-z]", "", clinvar$end)
clinvar$position <- gsub("[^0-9]", "", clinvar$position)
for (x in 1:nrow(clinvar)){
clinvar$start[x] <- to_single_notation(clinvar$start[x])
clinvar$end[x] <- to_single_notation(clinvar$end[x])
}
clinvar$variant <- paste(clinvar$start, clinvar$position, clinvar$end, sep ="")
clinvar2 <- clinvar[c("variant","Condition.s.","Clinical.significance..Last.reviewed.")]
colnames(clinvar2) <- c("variant","clinvar_disease","clinvar_interpretation")
clinvar2$clinvar_pathog <- 0
clinvar2$clinvar_likely_pathog <- 0
clinvar2$clinvar_uncertain <- 0
clinvar2$clinvar_likely_benign <- 0
clinvar2$clinvar_benign <- 0
for(x in 1:nrow(clinvar2)){
if(grepl("Pathogenic",clinvar2$clinvar_interpretation[x]) == TRUE){
clinvar2$clinvar_pathog[x] <- 1}
if(grepl("Likely pathogenic",clinvar2$clinvar_interpretation[x]) == TRUE){
clinvar2$clinvar_likely_pathog[x] <- 1}
if(grepl("Uncertain",clinvar2$clinvar_interpretation[x]) == TRUE){
clinvar2$clinvar_uncertain[x] <- 1}
if(grepl("Conflicting",clinvar2$clinvar_interpretation[x]) == TRUE){
clinvar2$clinvar_uncertain[x] <- 1}
if(grepl("Likely benign",clinvar2$clinvar_interpretation[x]) == TRUE){
clinvar2$clinvar_likely_benign[x] <- 1}
if(grepl("Benign",clinvar2$clinvar_interpretation[x]) == TRUE){
clinvar2$clinvar_benign[x] <- 1}
}
## Manually fixing PTEN LP/P CLinVar conflicts / double-counts
pten_lp_p_conflicts <- clinvar2[grep("Pathogenic/Likely pathogenic",clinvar2$clinvar_interpretation),]
clinvar2[clinvar2$variant == "H61R","clinvar_likely_pathog"] <- 0 ## The 2017 Pathogenic Assertion by Herman Laboratory,Nationwide Children's Hospital is much better supported
clinvar2[clinvar2$variant == "Y68H","clinvar_likely_pathog"] <- 0 ## The 2017 Pathogenic Assertion by GeneDx gives good supporting evidence of pathogenicity assertion
clinvar2[clinvar2$variant == "L108P","clinvar_likely_pathog"] <- 0 ## The 2014 Pathogenic Assertion by GeneDx gives good supporting evidence of pathogenicity assertion
clinvar2[clinvar2$variant == "G127R","clinvar_likely_pathog"] <- 0 ## The Assertion provided by GeneDx gives good supporting evidence of pathogenicity
clinvar2[clinvar2$variant == "R130L","clinvar_likely_pathog"] <- 0 ## The germline assertions for R130L is Pathogenic, whereas somatic assertions are likely pathogenic (and I'm only considering germline for downstream analyses)
clinvar2[clinvar2$variant == "R130Q","clinvar_likely_pathog"] <- 0 ## The germline assertions for R130Q is Pathogenic, whereas somatic assertions are likely pathogenic (and I'm only considering germline for downstream analyses)
clinvar2[clinvar2$variant == "G132V","clinvar_likely_pathog"] <- 0 ## The assertions made by GeneDx and OMIM for pathogenic are much better supported
clinvar2[clinvar2$variant == "R173C","clinvar_likely_pathog"] <- 0 ## The assertions for pathogenic made by the clinical lab and GeneDx are well supproted
clinvar2[clinvar2$variant == "R173H","clinvar_likely_pathog"] <- 0 ## The germline assertions for R173H are pathogenic, so no conflict at germline level.
#### This subsection finds duplicate annotations, and removes an uninformative one
clinvar2_duplicates <- subset(data.frame(table(clinvar2$variant)), Freq > 1)
duplicate_clinvar_rows <- c()
for(x in 1:nrow(clinvar2_duplicates)){
duplicate_clinvar_rows <- append(duplicate_clinvar_rows,which(clinvar2_duplicates$Var1[x] == clinvar2$variant)[1])
}
clinvar2_single <- clinvar2[!(seq(1:nrow(clinvar2)) %in% duplicate_clinvar_rows),]
combined_clinvar <- merge(combined, clinvar2_single[,c("variant","clinvar_pathog","clinvar_likely_pathog","clinvar_uncertain","clinvar_likely_benign")], by = "variant", all.x = TRUE)
combined_clinvar[is.na(combined_clinvar$clinvar_pathog),"clinvar_pathog"] <- 0
combined_clinvar[is.na(combined_clinvar$clinvar_likely_pathog),"clinvar_likely_pathog"] <- 0
combined_clinvar[is.na(combined_clinvar$clinvar_uncertain),"clinvar_uncertain"] <- 0
combined_clinvar[is.na(combined_clinvar$clinvar_likely_benign),"clinvar_likely_benign"] <- 0
Bringing in and setting up new GnomAD data
pten_gnomad <- read.csv(file = "input_datatables/gnomAD_v2.1.1_(non-TOPMed)_ENSG00000171862_2021_04_20_16_18_13.csv", header = T, stringsAsFactors = F)
pten_gnomad <- subset(pten_gnomad, !(VEP.Annotation %in% c("splice_donor_variant","splice_donor")) & VEP.Annotation %in% c("missense_variant","stop_gained"))
pten_gnomad$variant <- substr(pten_gnomad$Protein.Consequence,3,15)
pten_gnomad$position <- gsub("[A-Za-z]","",pten_gnomad$variant)
for(x in 1:nrow(pten_gnomad)){
pten_gnomad$start[x] <- to_single_notation(substr(gsub("[0-9]","",pten_gnomad$variant[x]),1,3))
pten_gnomad$end[x] <- to_single_notation(substr(gsub("[0-9]","",pten_gnomad$variant[x]),4,6))
pten_gnomad$variant[x] <- paste(pten_gnomad$start[x],pten_gnomad$position[x],pten_gnomad$end[x],sep="")
}
colnames(pten_gnomad)[colnames(pten_gnomad) == "Allele.Count"] <- "gnomad_allele_count"
colnames(pten_gnomad)[colnames(pten_gnomad) == "Allele.Number"] <- "gnomad_allele_number"
pten_gnomad_variant <- pten_gnomad %>% group_by(variant) %>% summarize(gnomad_bravo_allele_count = sum(gnomad_allele_count), gnomad_bravo_allele_number = max(gnomad_allele_number), .groups = "drop")
pten_bravo <- read.csv(file = "input_datatables/PTEN_BRAVO_210420_ENSG00000171862.csv", header = T, stringsAsFactors = F)
pten_bravo <- pten_bravo[!grepl("frameshift",pten_bravo$Annotation),]
pten_bravo$type <- ""
for(x in 1:nrow(pten_bravo)){
pten_bravo$variant[x] <- substr(strsplit(pten_bravo$Consequence[x],";",1)[[1]][1],3,15)
pten_bravo$position[x] <- gsub("[A-Za-z]","",pten_bravo$variant[x])
pten_bravo$start[x] <- to_single_notation(substr(gsub("[0-9]","",pten_bravo$variant[x]),1,3))
pten_bravo$end[x] <- to_single_notation(substr(gsub("[0-9]","",pten_bravo$variant[x]),4,6))
pten_bravo$variant[x] <- paste(pten_bravo$start[x],pten_bravo$position[x],pten_bravo$end[x],sep="")
if(pten_bravo$start[x] == pten_bravo$end[x]){pten_bravo$type[x] <- "synonymous"}
}
colnames(pten_bravo)[colnames(pten_bravo) == "Het"] <- "bravo_allele_count"
pten_bravo$bravo_allele_number <- 132345 #TOPMed Freeze 8 on GRCh38
pten_bravo_variant <- pten_bravo %>% group_by(variant) %>% summarize(gnomad_bravo_allele_count = sum(bravo_allele_count), gnomad_bravo_allele_number = max(bravo_allele_number), .groups = "drop")
pten_gnomad_bravo_variant <- rbind(pten_gnomad_variant[,c("variant","gnomad_bravo_allele_count","gnomad_bravo_allele_number")], pten_bravo_variant[,c("variant","gnomad_bravo_allele_count","gnomad_bravo_allele_number")]) %>% group_by(variant) %>% summarize(gnomad_bravo_allele_count = sum(gnomad_bravo_allele_count), gnomad_bravo_allele_number = sum(gnomad_bravo_allele_number), .groups = "drop")
pten_gnomad_bravo_variant$gnomad_bravo_allele_number <- max(pten_gnomad_bravo_variant$gnomad_bravo_allele_number)
combined_clinvar_gnomad_bravo <- merge(combined_clinvar, pten_gnomad_bravo_variant[,c("variant","gnomad_bravo_allele_count","gnomad_bravo_allele_number")], all.x = T)
combined_clinvar_gnomad_bravo$above_gnomad_allele_count_threshold_for_cowdens <- NA
collective_pten_cowdens_allele_frequency <- 1/400000
for(x in 1:nrow(combined_clinvar_gnomad_bravo)){
if(!is.na(combined_clinvar_gnomad_bravo$gnomad_bravo_allele_count[x])){
if(combined_clinvar_gnomad_bravo$gnomad_bravo_allele_count[x] > qbinom(0.99, size = combined_clinvar_gnomad_bravo$gnomad_bravo_allele_number[x], collective_pten_cowdens_allele_frequency / 0.95)){
combined_clinvar_gnomad_bravo$above_gnomad_allele_count_threshold_for_cowdens[x] <- "above threshold"
if(combined_clinvar_gnomad_bravo$gnomad_bravo_allele_count[x] <= qbinom(0.99, size = combined_clinvar_gnomad_bravo$gnomad_bravo_allele_number[x], collective_pten_cowdens_allele_frequency / 0.95)){
combined_clinvar_gnomad_bravo$above_gnomad_allele_count_threshold_for_cowdens[x] <- "below threshold"
}
}
}
}
write.csv(file = "output_datatables/combined_clinvar_gnomad_bravo.csv", combined_clinvar_gnomad_bravo, quote = FALSE, row.names = FALSE)
This is where the new biological analyses with the ClinVar and GnomAD data begins
custom_colorscale <- c("low" = "#3366ff","possibly_low" = "#b3d9ff","possibly_wt-like" = "#ffcccc","wt-like" = "#F8766D","dominant_negative" = "yellow","high" = "brown","unknown" = "grey75")
combined_for_plotting_clinvar <- rbind(
combined_clinvar_gnomad_bravo %>% filter(snv == 1) %>% mutate(clinvar = "All SNV"),
combined_clinvar_gnomad_bravo %>% filter(clinvar_pathog == 1) %>% mutate(clinvar = "Pathog"),
combined_clinvar_gnomad_bravo %>% filter(clinvar_likely_pathog == 1) %>% mutate(clinvar = "Likely pathog"),
combined_clinvar_gnomad_bravo %>% filter(clinvar_uncertain == 1) %>% mutate(clinvar = "Uncertain"),
combined_clinvar_gnomad_bravo %>% filter(above_gnomad_allele_count_threshold_for_cowdens == "above threshold") %>% mutate(clinvar = "GnomAD inferred benign")
)
all_snv_table <- data.frame(table((subset(combined_clinvar_gnomad_bravo, snv == 1))$abundance_class))
colnames(all_snv_table) <- c("abundance_class","count")
all_snv_table$count_interpreted <- all_snv_table$count
all_snv_table[all_snv_table$abundance_class == "unknown","count_interpreted"] <- 0
all_snv_table[all_snv_table$abundance_class == "low","count_interpreted"] / sum(all_snv_table$count_interpreted)
## [1] 0.2741722
pathogenic_table <- data.frame(table((subset(combined_clinvar_gnomad_bravo, clinvar_pathog == 1))$abundance_class))
colnames(pathogenic_table) <- c("abundance_class","count")
pathogenic_table$count_interpreted <- pathogenic_table$count
pathogenic_table[pathogenic_table$abundance_class == "unknown","count_interpreted"] <- 0
pathogenic_table[pathogenic_table$abundance_class == "low","count_interpreted"] / sum(pathogenic_table$count_interpreted)
## [1] 0.7903226
likelypathog_table <- data.frame(table((subset(combined_clinvar_gnomad_bravo, clinvar_likely_pathog == 1))$abundance_class))
colnames(likelypathog_table) <- c("abundance_class","count")
likelypathog_table$count_interpreted <- likelypathog_table$count
likelypathog_table[likelypathog_table$abundance_class == "unknown","count_interpreted"] <- 0
likelypathog_table[likelypathog_table$abundance_class == "low","count_interpreted"] / sum(likelypathog_table$count_interpreted)
## [1] 0.6037736
plp_table <- data.frame(table((subset(combined_clinvar_gnomad_bravo, clinvar_likely_pathog == 1 | clinvar_pathog == 1))$abundance_class))
colnames(plp_table) <- c("abundance_class","count")
plp_table$count_interpreted <- plp_table$count
plp_table[plp_table$abundance_class == "unknown","count_interpreted"] <- 0
plp_table[plp_table$abundance_class == "low","count_interpreted"] / sum(plp_table$count_interpreted)
## [1] 0.7181818
combined_for_plotting_clinvar$clinvar <- factor(combined_for_plotting_clinvar$clinvar, levels = c("All SNV","Pathog","Likely pathog","Uncertain","GnomAD inferred benign"))
Clinvar_plots_supplementary <- ggplot() +
theme_bw() + theme(legend.position = "top") +
scale_fill_manual(values = custom_colorscale) + scale_color_manual(values = custom_colorscale) +
geom_histogram(data = subset(combined_for_plotting_clinvar, class == "missense"), aes(x = score_abundance, fill = abundance_class), binwidth = 0.1, alpha = 0.5, position="stack", color = "black") +
facet_grid(rows = vars(clinvar), scale = "free_y")
Clinvar_plots_supplementary
## Warning: Removed 1135 rows containing non-finite values (stat_bin).
ggsave(file = "plots/Clinvar_plots_supplementary.pdf", Clinvar_plots_supplementary, height = 140, width = 80, units = "mm")
## Warning: Removed 1135 rows containing non-finite values (stat_bin).
Printing out commands that can be used in Pymol (crystal structure 2d5r)
paste("select substrate, resi 1352","show spheres, substrate","color magenta, substrate", sep = ";")
## [1] "select substrate, resi 1352;show spheres, substrate;color magenta, substrate"
paste("select pathog_stable, not name c+n+o and resi", gsub(", ","+",toString(as.character(subset(combined_clinvar, class == "missense" & clinvar_pathog == 1 & snv == 1 & abundance_class == "wt-like")$position))),"; show spheres, pathog_stable; color red, pathog_stable")
## [1] "select pathog_stable, not name c+n+o and resi 24+93+12+130+14+15+47 ; show spheres, pathog_stable; color red, pathog_stable"
paste("select likely_pathog_stable, not name c+n+o and resi", gsub(", ","+",toString(as.character(subset(combined_clinvar, class == "missense" & clinvar_likely_pathog == 1 & snv == 1 & abundance_class == "wt-like")$position))),"; show spheres, likely_pathog_stable; color salmon, likely_pathog_stable")
## [1] "select likely_pathog_stable, not name c+n+o and resi 124+124+24+24+92+90+35+35+14+159+15+15+53 ; show spheres, likely_pathog_stable; color salmon, likely_pathog_stable"
## Rotate the pymol view
paste("rotate x, -90")
## [1] "rotate x, -90"
Look at clustering of well-represented positions
orig_positional_coverage <- combined %>% filter(!is.na(score_orig) & class == "missense") %>% count(position) %>% count(n)
## Storing counts in `nn`, as `n` already present in input
## ℹ Use `name = "new_name"` to pick a new name.
composite_positional_coverage <- combined %>% filter(!is.na(score_abundance) & class == "missense") %>% count(position) %>% count(n)
## Storing counts in `nn`, as `n` already present in input
## ℹ Use `name = "new_name"` to pick a new name.
positional_coverage_frame <- merge(orig_positional_coverage,composite_positional_coverage, by = "n"); colnames(positional_coverage_frame) <- c("coverage","orig","composite")
positional_coverage_frame_melted <- melt(positional_coverage_frame, "coverage")
ggplot() + scale_x_continuous(limits = c(0.5,19.5), expand = c(0,1), breaks = seq(1,19,2)) +
geom_bar(data = positional_coverage_frame_melted, aes(x = coverage, y = value, fill = variable), stat = "identity", position = "dodge", color = "black")
pten_missense <- subset(combined, class == "missense" & variant != "wt" & variant != "WT")
pten_seq <- "MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV"
pten_aa_num <- nchar(pten_seq)
pten_df <- data.frame("position" = seq(1,pten_aa_num,1),
"A"= rep(NA,pten_aa_num), "C"= rep(NA,pten_aa_num), "D"= rep(NA,pten_aa_num), "E"= rep(NA,pten_aa_num), "F"= rep(NA,pten_aa_num), "G"= rep(NA,pten_aa_num),
"H"= rep(NA,pten_aa_num), "I"= rep(NA,pten_aa_num), "K"= rep(NA,pten_aa_num), "L"= rep(NA,pten_aa_num), "M"= rep(NA,pten_aa_num), "N"= rep(NA,pten_aa_num),
"P"= rep(NA,pten_aa_num), "Q"= rep(NA,pten_aa_num), "R"= rep(NA,pten_aa_num), "S"= rep(NA,pten_aa_num), "T"= rep(NA,pten_aa_num), "V"= rep(NA,pten_aa_num),
"W"= rep(NA,pten_aa_num), "Y"= rep(NA,pten_aa_num))
for(y in 1:nchar(pten_seq)){pten_df[y,substr(pten_seq,y,y)] <- 1}
for(x in 1:nrow(pten_missense)){
pten_df[pten_missense$position[x],pten_missense$end[x]] <- pten_missense$score_abundance[x]}
pten_df <- subset(pten_df, !is.na(position))
abundance_normal_percentile_low <- median((subset(combined_clinvar_gnomad_bravo, class == "synonymous"))$score_total, na.rm = T)
abundance_normal_percentile_high <- quantile((subset(combined_clinvar_gnomad_bravo, class == "synonymous"))$score_total, 0.95, na.rm = T)
abundance_low_percentile <- median((subset(combined_clinvar_gnomad_bravo, class == "nonsense"))$score_total, na.rm = T) #quantile(normal_dist_fit_low$prob, 0.90)
pten_double3 <- pten_df[2:nrow(pten_df),]
pten_double3$fraction_below_wt <- 0
for(x in 1:nrow(pten_double3)){
pten_double3$fraction_below_wt[x] <- sum(na.omit(unlist(pten_double3[x,2:ncol(pten_double3)])) < abundance_normal_percentile_low) / length(na.omit(unlist(pten_double3[x,2:ncol(pten_double3)])))
}
pten_lower_threshold <- as.numeric(abundance_low_percentile)
pten_upper_threshold <- as.numeric(abundance_normal_percentile_high)
## Double hclust
pten_double <- pten_df[,2:21]
rownames(pten_double) <- pten_df[,1]
## Subsetting only for positions with 18 or more variants
pten_double$count <- NA
for(x in 1:nrow(pten_double)){
pten_double$count[x] <- sum(!is.na(pten_double[x,]))
}
pten_double <- subset(pten_double, count >= 18)
pten_double <- pten_double[,1:20]
pten_double2 <- pten_double
for(x in 1:nrow(pten_double2)){
for(y in 1:ncol(pten_double2)){
if(is.na(pten_double2[x,y]) == TRUE){
pten_double2[x,y] <- mean(unlist(pten_double2[x,]), na.rm = TRUE)
}
}
}
row.order <- hclust(dist(pten_double2))$order
col.order <- hclust(dist(t(pten_double2)))$order
pten_double_label_order <- rev(rownames(pten_double2[row.order, col.order]))
pten_double_variable_order <- colnames(pten_double2[row.order, col.order])
pten_double_variable_order <- c("G","S","T","C","A","M","W","Y","F","L","I","V","D","E","N","Q","H","K","R","P")
pten_double$label <- rownames(pten_double)
pten_double2_melt <- melt(pten_double, id = "label")
for(x in 1:nrow(pten_double2_melt)){
pten_double2_melt$value[x] <- as.numeric(as.character(pten_double2_melt$value[x]))
if(pten_double2_melt$value[x] <= pten_lower_threshold & !(is.na(pten_double2_melt$value[x]))){pten_double2_melt$value[x] <- pten_lower_threshold}
if(pten_double2_melt$value[x] >= pten_upper_threshold & !(is.na(pten_double2_melt$value[x]))){pten_double2_melt$value[x] <- pten_upper_threshold}
}
pten_double2_melt <- subset(pten_double2_melt, label != 1)
pten_double2_melt$label <- factor(pten_double2_melt$label, levels = pten_double_label_order)
pten_double2_melt$variable <- factor(pten_double2_melt$variable, levels = pten_double_variable_order)
pten_mutational_spectra <- ggplot() +
xlab(NULL) + ylab(NULL) + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
scale_fill_gradientn(colours = c("blue","white","red"), values = rescale(c(pten_lower_threshold,1,1.5)), limits=c(pten_lower_threshold,1.5)) +
geom_tile(data = pten_double2_melt, aes(x = label, y = variable, fill = value))
pten_mutational_spectra
ggsave(file = "plots/pten_abundance_mutational_spectra.pdf", pten_mutational_spectra, height = 80, width = 178*2, units = "mm")
write.csv(file = "output_datatables/pten_abundance_imputed.csv", pten_double2, quote = FALSE, row.names = FALSE)
library(ggdendro)
#convert cluster object to use with ggplot
dendr <- dendro_data(hclust(dist(pten_double2)), type="rectangle")
dendrogram_plot <- ggplot() +
geom_segment(data=segment(dendr), aes(x=x, y=y, xend=xend, yend=yend)) +
geom_text(data=NULL, aes(x=dendr$label$x, y=dendr$label$y, label=dendr$label$label, hjust=0, angle = 90), size=3) +
#geom_text(data=label(dendr), aes(x=x, y=y, label=label, hjust=0), size=3) +
scale_y_reverse(expand=c(0.2, 0)) +
theme(axis.line.y=element_blank(),
axis.ticks.y=element_blank(),
axis.text.y=element_blank(),
axis.title.y=element_blank(),
panel.background=element_rect(fill="white"),
panel.grid=element_blank())
ggsave(file = "plots/dendrogram_plot.pdf", dendrogram_plot, height = 80, width = 178*2, units = "mm")
Calculate gini coefficient to separate most mutation intolerant positions
gini_frame <- data.frame("position" = as.character(rownames(pten_double2)))
gini_frame$gini <- 0
for(x in 1:nrow(gini_frame)){gini_frame$gini[x] <- gini(unlist(pten_double2[x,]))}
paste("Minimum Gini coefficient:", round(min(gini_frame$gini, na.rm = T),2))
## [1] "Minimum Gini coefficient: 0.03"
paste("Maximum Gini coefficient:", round(max(gini_frame$gini, na.rm = T),2))
## [1] "Maximum Gini coefficient: 0.35"
Gini_coefficient_plot <- ggplot() +
theme_bw() +
scale_y_continuous(expand = c(0,0.1), breaks = c(0,5,10)) +
geom_hline(yintercept = 0, size = 1.5) +
geom_histogram(data = subset(gini_frame, gini <= 0.121), aes(x = gini), binwidth = 0.02, color = "black", fill = "blue", alpha = 0.8) +
geom_histogram(data = subset(gini_frame, gini > 0.121 & gini <= 0.24), aes(x = gini), binwidth = 0.02, color = "black", fill = "thistle2", alpha = 0.8) +
geom_histogram(data = subset(gini_frame, gini > 0.24), aes(x = gini), binwidth = 0.02, color = "black", fill = "red", alpha = 0.8)
Gini_coefficient_plot
ggsave(file = "plots/Gini_coefficient_plot.pdf", Gini_coefficient_plot, height = 25, width = 35, units = "mm")
paste("select mut_intolerant_abundance, resi",gsub(", ","+",toString(as.numeric(as.character((mutation_intolerant_positions <- gini_frame %>% filter(gini > 0.25))$position)))))
## [1] "select mut_intolerant_abundance, resi 67+173+181+243+249+251+253+325+326+343"
paste("select polar_mut_intolerant_abundance, resi 251+173+326+169")
## [1] "select polar_mut_intolerant_abundance, resi 251+173+326+169"
Combine Gini coefficient with clustered heatmap
gini_frame2 <- gini_frame
colnames(gini_frame2) <- c("label","value")
gini_frame2[gini_frame2$value <= 0.07191892, "value"] <- 0.072
gini_frame2[gini_frame2$value >= 0.07191892 & gini_frame2$value <= 0.121, "value"] <- 0.072
gini_frame2[gini_frame2$value > 0.24, "value"] <- 1.5
gini_frame2[gini_frame2$value > 0.121 & gini_frame2$value <= 0.24, "value"] <- 0.9
gini_frame2$variable <- "Gini"
pten_double2_melt_gini <- rbind(pten_double2_melt, gini_frame2)
## Frame for denoting WT resudiues
pten_residues <- data.frame("position" = seq(1,nchar(pten_seq)), "residue" = NA)
for(x in 1:nrow(pten_residues)){pten_residues$residue[x] <- substr(pten_seq,x,x)}
pten_residues2 <- subset(pten_residues, position %in% gini_frame2$label)
pten_residues2$position <- as.character(pten_residues2$position)
pten_residues2$position <- factor(pten_residues2$position, levels = pten_double_label_order)
pten_residues2$residue <- factor(pten_residues2$residue, levels = pten_double_variable_order)
pten_mutational_spectra <- ggplot() +
xlab(NULL) + ylab(NULL) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
scale_fill_gradientn(colours = c("blue","white","red"), values = rescale(c(pten_lower_threshold,1,1.5)), limits=c(pten_lower_threshold-0.1,1.5)) +
geom_tile(data = pten_double2_melt_gini, aes(x = label, y = variable, fill = value)) +
geom_point(data = pten_residues2, aes(x = position, y = residue)) +
geom_hline(yintercept = 20.5)
pten_mutational_spectra
ggsave(file = "plots/pten_abundance_mutational_spectra.pdf", pten_mutational_spectra, height = 65, width = 175*1.2, units = "mm")
combined_for_heatmap <- combined
combined_for_heatmap$position <- as.numeric(combined_for_heatmap$position)
combined_for_heatmap$end <- factor(combined_for_heatmap$end, levels = c("G","S","T","C","A","M","W","Y","F","L","I","V","D","E","N","Q","H","K","R","P","X"))
combined_for_heatmap[!is.na(combined_for_heatmap$score_orig) & combined_for_heatmap$score_orig < pten_lower_threshold,"score_orig"] <- pten_lower_threshold
combined_for_heatmap[!is.na(combined_for_heatmap$score_abundance) & combined_for_heatmap$score_abundance < pten_lower_threshold,"score_abundance"] <- pten_lower_threshold
PTEN_full_heatmap <- ggplot() + theme(panel.background = element_rect("grey80"), panel.grid.major = element_blank()) +
geom_tile(data = combined_for_heatmap, aes(x = position, y = end, fill = score_orig), width = 1) +
geom_tile(data = combined_for_heatmap, aes(x = position, y = end, fill = score_abundance), height = 0.6, width = 0.4) +
scale_x_continuous(expand = c(0,0)) +
scale_fill_gradientn(colours = c("blue","white","red"), values = rescale(c(pten_lower_threshold,1,2)), limits=c(pten_lower_threshold,2))
ggsave(fil = "Plots/PTEN_full_heatmap.pdf", PTEN_full_heatmap, height = 3, width = 15)
PTEN_full_heatmap
Comparisons of PTEN abundance and activity
Now let’s add in PTEN functional data
# Import the supplementary data from the Mighell et al AJHG paper
pten_func <- read.delim(file = "input_datatables/Mighell_PTEN_phosphatase_data.csv", sep = ",", header = TRUE, stringsAsFactors = FALSE)
pten_func <- pten_func[,c("Variant..one.letter.","Cum_score","Cum_SE","High_conf")]
colnames(pten_func) <- c("variant","func","func_se","high_conf")
pten_func$position <- gsub("[^0-9]","",pten_func$variant)
pten_func$variant <- gsub("\\*","X",pten_func$variant)
for(x in 1:nrow(pten_func)){
pten_func$start[x] <- substr(gsub("[^A-Z]","",pten_func$variant[x]),1,1)
pten_func$end[x] <- substr(gsub("[^A-Z]","",pten_func$variant[x]),2,2)
}
pten_func <- subset(pten_func, end != "" & position != 1)
pten_func$func4 <- -(pten_func$func / quantile((pten_func$func), 0.05, na.rm = TRUE)) + 1
pten_func$func5 <- ((pten_func$func4) - quantile((pten_func$func4), 0.05, na.rm = TRUE)) / (1 - quantile((pten_func$func4), 0.05, na.rm = TRUE))
pten_func$score_activity <- pten_func$func5
combined_abund_act <- merge(combined_clinvar, pten_func[,c("variant","score_activity","high_conf")], by = "variant", all.x = T)[c("variant","position","snv","class","score_abundance","abundance_class","score_activity","high_conf","clinvar_pathog","clinvar_likely_pathog","clinvar_uncertain")]
## Set some boundaries
abundance_syn_5th <- quantile(subset(combined_abund_act, class == "synonymous")$score_abundance,0.05,na.rm = T)
abundance_nonsense_5th <- quantile(subset(combined_abund_act, position > 50 & position < 350 & class == "nonsense")$score_abundance,0.95, na.rm = T)
pten_func_syn <- -1.11
pten_func_syn3 <- -(pten_func_syn / quantile((pten_func$func), 0.05, na.rm = TRUE)) + 1
pten_func_syn4 <- ((pten_func_syn3) - quantile((pten_func$func4), 0.05, na.rm = TRUE)) / (1 - quantile((pten_func$func4), 0.05, na.rm = TRUE))
activity_syn_5th <- pten_func_syn4
pten_func_nonsense <- -2.13
pten_func_nonsense3 <- -(pten_func_nonsense / quantile((pten_func$func), 0.05, na.rm = TRUE)) + 1
pten_func_nonsense4 <- ((pten_func_nonsense3) - quantile((pten_func$func4), 0.05, na.rm = TRUE)) / (1 - quantile((pten_func$func4), 0.05, na.rm = TRUE))
activity_nonsense_5th <- pten_func_nonsense4
count_abundance_scores <- nrow(subset(combined_abund_act, !is.na(score_abundance) & class == "missense"))
paste("Number of variants with abundance scores:", count_abundance_scores)
## [1] "Number of variants with abundance scores: 4388"
count_activity_scores <- nrow(subset(combined_abund_act, !is.na(score_activity) & class == "missense" & high_conf == T))
paste("Number of variants with activity scores:", count_activity_scores)
## [1] "Number of variants with activity scores: 6562"
Making a comparison of abundance and functional data
complete_abund_act <- subset(combined_abund_act, !is.na(score_abundance) & !(is.na(score_activity)))
complete_abund_act$abund_act_class <- "NA"
paste("Number of missense variants with abundance and activity scores:", nrow(complete_abund_act %>% filter(class == "missense")))
## [1] "Number of missense variants with abundance and activity scores: 4178"
for(x in 1:nrow(complete_abund_act)){
if(!(is.na(complete_abund_act$score_activity[x])) & !(is.na(complete_abund_act$score_abundance[x]))){
if((complete_abund_act$abundance_class[x] == "low") & (complete_abund_act$score_activity[x] > activity_syn_5th)){complete_abund_act$abund_act_class[x] <- "2_loss_of_abundance_only"}
if((complete_abund_act$abundance_class[x] == "wt-like") & (complete_abund_act$score_activity[x] < activity_nonsense_5th)){complete_abund_act$abund_act_class[x] <- "4_loss_of_activity_only"}
if((complete_abund_act$abundance_class[x] == "low") & (complete_abund_act$score_activity[x] < activity_nonsense_5th)){complete_abund_act$abund_act_class[x] <- "3_loss_of_both"}
if((complete_abund_act$abundance_class[x] == "wt-like") & (complete_abund_act$score_activity[x] > activity_syn_5th)){complete_abund_act$abund_act_class[x] <- "1_wt_like"}
}
}
complete_abund_act$abund_act_class <- factor(complete_abund_act$abund_act_class, levels = c("2_loss_of_abundance_only","4_loss_of_activity_only","3_loss_of_both","1_wt_like","NA"))
custom_colorscale2 <- c("2_loss_of_abundance_only" = "turquoise","4_loss_of_activity_only" = "orange","3_loss_of_both" = "purple","1_wt_like" = "dark green", "NA" = "grey75")
complete_abund_act_missense <- subset(complete_abund_act, class == "missense" & !is.na(score_abundance) & !(is.na(score_activity)))
complete_table <- data.frame(table(complete_abund_act_missense$position))
complete_table <- complete_table[order(complete_table$Freq, decreasing = T),]
loss_of_abundance_table <- data.frame(table(subset(complete_abund_act_missense, abund_act_class == "2_loss_of_abundance_only")$position))
loss_of_abundance_table <- loss_of_abundance_table[order(loss_of_abundance_table$Freq, decreasing = T),]
loss_of_activity_table <- data.frame(table(subset(complete_abund_act_missense, abund_act_class == "4_loss_of_activity_only")$position))
loss_of_activity_table <- loss_of_activity_table[order(loss_of_activity_table$Freq, decreasing = T),]
complete_table <- merge(complete_table, loss_of_abundance_table, by = "Var1", all.x = T)
complete_table <- merge(complete_table, loss_of_activity_table, by = "Var1", all.x = T)
colnames(complete_table) <- c("position","complete","loss_of_abundance","loss_of_activity")
complete_table[is.na(complete_table)] <- 0
loss_of_both_table <- data.frame(table(subset(complete_abund_act_missense, abund_act_class == "3_loss_of_both")$position))
loss_of_both_table <- loss_of_both_table[order(loss_of_both_table$Freq, decreasing = T),]
colnames(loss_of_both_table) <- c("position","loss_of_both")
complete_table <- merge(complete_table, loss_of_both_table, by = "position", all.x = T)
complete_table[is.na(complete_table)] <- 0
complete_table$percent_loss_of_abundance <- complete_table$loss_of_abundance / complete_table$complete * 100
complete_table$percent_loss_of_activity <- complete_table$loss_of_activity / complete_table$complete * 100
complete_table$percent_loss_of_both <- complete_table$loss_of_both / complete_table$complete * 100
low_abund_act_diff <- subset(complete_table, percent_loss_of_abundance >= 50 & complete >= 5)
high_abund_act_diff <- subset(complete_table, percent_loss_of_activity >= 50 & complete >= 5)
low_total <- subset(complete_table, percent_loss_of_both >= 50 & complete >= 5)
paste("select low_diff, resi",gsub(", ","+",toString(low_abund_act_diff$position)), "and not name c+n+o")
## [1] "select low_diff, resi 169+172+188+246+247+258+270+274+278+280+322+331+337+348+388+58+66 and not name c+n+o"
paste("select high_diff, resi",gsub(", ","+",toString(high_abund_act_diff$position)), "and not name c+n+o")
## [1] "select high_diff, resi 123+125+130+15+159+333+401+47+92 and not name c+n+o"
paste("select low, resi",gsub(", ","+",toString(low_total$position)), "and not name c+n+o")
## [1] "select low, resi 101+104+105+107+108+111+119+120+122+131+136+137+140+148+166+170+175+177+195+217+241+249+25+252+253+255+27+271+275+277+28+325+326+346+35+53+56+57+61+67+68+95 and not name c+n+o"
quadrant_plot <- ggplot() +
theme_classic() +
theme(legend.position = "none", panel.grid.major = element_line("grey95")) +
xlab("Abundance score") + ylab("Activity score") +
scale_x_continuous(breaks = c(0,1)) +
scale_y_continuous(breaks = c(0,1)) +
scale_color_manual(values = custom_colorscale2) +
geom_point(data = complete_abund_act_missense, aes(x = score_abundance, y = score_activity, color = abund_act_class), alpha = 0.2) +
geom_hline(yintercept = activity_syn_5th, linetype = 2, color = "red", alpha = 0.5) +
geom_hline(yintercept = activity_nonsense_5th, linetype = 2, color = "blue", alpha = 0.5) +
geom_vline(xintercept = quantile(subset(complete_abund_act_missense, abundance_class == "wt-like")$score_abundance,0.01, na.rm = T), linetype = 2, color = "red", alpha = 0.5) +
geom_vline(xintercept = quantile(subset(complete_abund_act_missense, abundance_class == "low")$score_abundance,0.99), linetype = 2, color = "blue", alpha = 0.5) +
geom_text(aes(x = -0.2, y = 1.6, label = paste("n=",nrow(subset(complete_abund_act_missense, abund_act_class == "2_loss_of_abundance_only")))), color = "turquoise", hjust = 0) +
geom_text(aes(x = -0.2, y = 1.45, label = paste("n=",nrow(subset(complete_abund_act_missense, abund_act_class == "NA" & score_activity > activity_syn_5th & score_abundance < quantile(subset(complete_abund_act_missense, abundance_class == "low")$score_abundance,0.99) )))), color = "grey50", hjust = 0) +
geom_text(aes(x = -0.2, y = -0.25, label = paste("n=",nrow(subset(complete_abund_act_missense, abund_act_class == "3_loss_of_both")))), color = "purple", hjust = 0) +
geom_text(aes(x = -0.2, y = -0.4, label = paste("n=",nrow(subset(complete_abund_act_missense, abund_act_class == "NA" & score_activity < activity_nonsense_5th & score_abundance < quantile(subset(complete_abund_act_missense, abundance_class == "low")$score_abundance,0.99) )))), color = "grey50", hjust = 0) +
geom_text(aes(x = 1.4, y = 1.6, label = paste("n=",nrow(subset(complete_abund_act_missense, abund_act_class == "1_wt_like")))), color = "dark green", hjust = 1) +
geom_text(aes(x = 1.4, y = 1.45, label = paste("n=",nrow(subset(complete_abund_act_missense, abund_act_class == "NA" & score_activity > activity_syn_5th & score_abundance < quantile(subset(complete_abund_act_missense, abundance_class == "wt-like")$score_abundance,0.01, na.rm = T) )))), color = "grey50", hjust = 1) +
geom_text(aes(x = 1.4, y = -0.25, label = paste("n=",nrow(subset(complete_abund_act_missense, abund_act_class == "4_loss_of_activity_only")))), color = "orange", hjust = 1) +
geom_text(aes(x = 1.4, y = -0.4, label = paste("n=",nrow(subset(complete_abund_act_missense, abund_act_class == "NA" & score_activity < activity_nonsense_5th & score_abundance < quantile(subset(complete_abund_act_missense, abundance_class == "wt-like")$score_abundance,0.01, na.rm = T) )))), color = "grey50", hjust = 1) +
geom_text(aes(x = 0.7, y = 1.45, label = paste("n=",nrow(subset(complete_abund_act_missense, abund_act_class == "NA" & score_activity > activity_syn_5th & score_abundance < quantile(subset(complete_abund_act_missense, abundance_class == "wt-like")$score_abundance,0.01, na.rm = T) & score_abundance > quantile(subset(complete_abund_act_missense, abundance_class == "low")$score_abundance,0.99) )))), color = "grey50", hjust = 0.5) +
geom_text(aes(x = 0.7, y = -0.4, label = paste("n=",nrow(subset(complete_abund_act_missense, abund_act_class == "NA" & score_activity < activity_nonsense_5th & score_abundance < quantile(subset(complete_abund_act_missense, abundance_class == "wt-like")$score_abundance,0.01, na.rm = T) & score_abundance > quantile(subset(complete_abund_act_missense, abundance_class == "low")$score_abundance,0.99) )))), color = "grey50", hjust = 0.5) +
geom_text(aes(x = -0.2, y = 0.6, label = paste("n=",nrow(subset(complete_abund_act_missense, abund_act_class == "NA" & score_activity > activity_nonsense_5th & score_activity < activity_syn_5th & score_abundance < quantile(subset(complete_abund_act_missense, abundance_class == "low")$score_abundance,0.99) )))), color = "grey50", hjust = 0) +
geom_text(aes(x = 1.4, y = 0.6, label = paste("n=",nrow(subset(complete_abund_act_missense, abund_act_class == "NA" & score_activity > activity_nonsense_5th & score_activity > activity_syn_5th & score_abundance > quantile(subset(complete_abund_act_missense, abundance_class == "wt-like")$score_abundance,0.01, na.rm = T) )))), color = "grey50", hjust = 1)
quadrant_plot
ggsave(file = "plots/quadrant_plot.pdf", quadrant_plot, height = 80, width = 80, units = "mm")
paste("Number of confidently assessed PTEN variants:",nrow(subset(complete_abund_act_missense, abund_act_class != "NA")))
## [1] "Number of confidently assessed PTEN variants: 2377"
paste("Fraction of confidently assessed PTEN variants that were WT-like for both:",nrow(subset(complete_abund_act_missense, abund_act_class == "1_wt_like"))/nrow(subset(complete_abund_act_missense, abund_act_class != "NA")))
## [1] "Fraction of confidently assessed PTEN variants that were WT-like for both: 0.511148506520825"
paste("Fraction of confidently assessed PTEN variants that were WT-like for both:",nrow(subset(complete_abund_act_missense, abund_act_class == "3_loss_of_both"))/nrow(subset(complete_abund_act_missense, abund_act_class != "NA")))
## [1] "Fraction of confidently assessed PTEN variants that were WT-like for both: 0.211190576356752"
paste("Fraction of confidently assessed PTEN variants that were WT-like for both:",nrow(subset(complete_abund_act_missense, abund_act_class == "2_loss_of_abundance_only"))/nrow(subset(complete_abund_act_missense, abund_act_class != "NA")))