/
CopelandDeletionsPca.Rmd
129 lines (107 loc) · 3.53 KB
/
CopelandDeletionsPca.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
---
title: "Reanalysis of Deletions PCA data from Copeland et al."
output:
workflowr::wflow_html:
toc: true
toc_float: yes
theme: journal
highlight: textmate
code_folding: hide
df_print: paged
---
```{r setup, echo=FALSE, include=FALSE}
knitr::opts_chunk$set(
autodep = TRUE,
cache = FALSE,
cache.lazy = FALSE,
dev = c("png", "pdf"),
echo = TRUE,
error = FALSE,
fig.align = "center",
fig.width = 8,
fig.asp = 0.618,
message = FALSE,
warning = FALSE
)
# Load tidyverse infrastructure packages
suppressPackageStartupMessages({
library(tidyverse)
library(here)
library(reshape)
})
# Set paths
src_dir <- here('code')
raw_dir <- here('1_Raw')
data_dir <- here('2_Derived')
plots_dir <- here("3_Results")
# set seed
reseed <- 42
set.seed(seed = reseed)
```
## 1: READ Authors-provided data
```{r load}
Pca = read.table(here(
raw_dir,
"CopelandDeletionsPca/PCA_vectors_c0toc2_01.txt"))
```
take only the third component (which is called component 2 in the file
because they start from zero) = see
"1Raw/CopelandDeletionsPca/ReadMe.txt" and derive bin coordinates
```{r derive-pc3-bins}
Pca = Pca[12807:nrow(Pca),]
for (i in 1:nrow(Pca))
{ # i = 1
FirstBin = unlist(strsplit(Pca$V1[i],','))[1]
Pca$FirstBinStart[i] = as.numeric(unlist(strsplit(FirstBin,':'))[1])
Pca$FirstBinEnd[i] = as.numeric(unlist(strsplit(FirstBin,':'))[2])
SecondBin = unlist(strsplit(Pca$V1[i],','))[2]
Pca$SecondBinStart[i] = as.numeric(unlist(strsplit(SecondBin,':'))[1])
Pca$SecondBinEnd[i] = as.numeric(unlist(strsplit(SecondBin,':'))[2])
}
summary(Pca$SecondBinStart - Pca$SecondBinEnd) # 207 is a step!
summary(Pca$FirstBinStart - Pca$FirstBinEnd) # 207 is a step!
Pca$FirstBinCenter = Pca$FirstBinStart + (Pca$FirstBinEnd - Pca$FirstBinStart)/2
Pca$SecondBinCenter = Pca$SecondBinStart + (Pca$SecondBinEnd - Pca$SecondBinStart)/2
```
## 2: plot PC3 and extract major arc. Major arc is from Ol (5721) till the end of mtDNA (16569) and a bit more (till Oh: 110)
```{r plt-pc3}
par(mfrow=c(2,1))
plot(Pca$FirstBinCenter,Pca$V2, pch = 20, cex = 0.5)
plot(Pca$SecondBinCenter,Pca$V2, pch = 20, cex = 0.5)
```
```{r plt-extract-major-arc}
Pca = Pca[Pca$FirstBinCenter > 5721 & Pca$SecondBinCenter > 5721,]
plot(Pca$FirstBinCenter,Pca$SecondBinCenter)
```
```{r plt-pc3-major-arc}
par(mfrow=c(2,1))
plot(Pca$FirstBinCenter,Pca$V2, pch = 20, cex = 0.5)
plot(Pca$SecondBinCenter,Pca$V2, pch = 20, cex = 0.5)
```
## 3: rounded to 1kb cells
```{r copeland-third-component-pca1}
Pca$FirstBinCenterRound = round(Pca$FirstBinCenter,-3) # till thousands
Pca$SecondBinCenterRound = round(Pca$SecondBinCenter,-3) # till thousands
Agg = aggregate(as.numeric(Pca$V2), by = list(Pca$FirstBinCenterRound,Pca$SecondBinCenterRound), FUN = mean)
names(Agg)=c('Start','End','Value')
head(Agg)
ggp1 <- ggplot(Agg, aes(Start, End)) + # Create heatmap with ggplot2
geom_tile(aes(fill = Value))
ggp1
```
## 4: original cells (PAPER)
```{r copeland-third-component-pca2}
Agg = aggregate(as.numeric(Pca$V2), by = list(Pca$FirstBinCenter,Pca$SecondBinCenter), FUN = mean)
names(Agg)=c('Start','End','Value')
ContactZone = Agg[Agg$Start >= 6000 & Agg$Start <= 9000 & Agg$End >= 13000 & Agg$Start <= 16000,]$Value
ggp2 <- ggplot(Agg, aes(Start, End)) + # Create heatmap with ggplot2
geom_tile(aes(fill = Value))
ggp2
```
## 5: Tests
```{r wilcox-test}
wilcox.test(ContactZone,Agg$Value)
```
```{r t-test}
t.test(ContactZone,Agg$Value)
```