analysis/CopelandDeletionsPca.Rmd

---
title: "Reanalysis of Deletions PCA data from Copeland et al."
output:
  workflowr::wflow_html:
    toc: true
    toc_float: yes
    theme: journal
    highlight: textmate
    code_folding: hide
    df_print: paged
---

```{r setup, echo=FALSE, include=FALSE}
knitr::opts_chunk$set(
  autodep        = TRUE,
  cache          = FALSE,
  cache.lazy     = FALSE,
  dev            = c("png", "pdf"),
  echo           = TRUE,
  error          = FALSE,
  fig.align      = "center",
  fig.width      = 8,
  fig.asp        = 0.618,
  message        = FALSE,
  warning        = FALSE
)
# Load tidyverse infrastructure packages
suppressPackageStartupMessages({
  library(tidyverse)
  library(here)
  library(reshape)
})

# Set paths
src_dir   <- here('code')
raw_dir   <- here('1_Raw')
data_dir  <- here('2_Derived')
plots_dir <- here("3_Results")

# set seed
reseed <- 42
set.seed(seed = reseed)
```

## 1: READ Authors-provided data

```{r load}
Pca = read.table(here(
  raw_dir, 
  "CopelandDeletionsPca/PCA_vectors_c0toc2_01.txt"))
```

take only the third component (which is called component 2 in the file
because they start from zero) = see
"1Raw/CopelandDeletionsPca/ReadMe.txt" and derive bin coordinates

```{r derive-pc3-bins}
Pca = Pca[12807:nrow(Pca),]
for (i in 1:nrow(Pca))
{ # i = 1
  FirstBin = unlist(strsplit(Pca$V1[i],','))[1]
  Pca$FirstBinStart[i] = as.numeric(unlist(strsplit(FirstBin,':'))[1])
  Pca$FirstBinEnd[i] = as.numeric(unlist(strsplit(FirstBin,':'))[2])
  
  SecondBin = unlist(strsplit(Pca$V1[i],','))[2]
  Pca$SecondBinStart[i] = as.numeric(unlist(strsplit(SecondBin,':'))[1])
  Pca$SecondBinEnd[i] = as.numeric(unlist(strsplit(SecondBin,':'))[2])
}

summary(Pca$SecondBinStart - Pca$SecondBinEnd) # 207 is a step!
summary(Pca$FirstBinStart - Pca$FirstBinEnd) # 207 is a step! 

Pca$FirstBinCenter = Pca$FirstBinStart + (Pca$FirstBinEnd - Pca$FirstBinStart)/2
Pca$SecondBinCenter = Pca$SecondBinStart + (Pca$SecondBinEnd - Pca$SecondBinStart)/2
```

## 2: plot PC3 and extract major arc. Major arc is from Ol (5721) till the end of mtDNA (16569) and a bit more (till Oh: 110)

```{r plt-pc3}
par(mfrow=c(2,1))
plot(Pca$FirstBinCenter,Pca$V2, pch = 20, cex = 0.5)
plot(Pca$SecondBinCenter,Pca$V2, pch = 20, cex = 0.5)
```

```{r plt-extract-major-arc}
Pca = Pca[Pca$FirstBinCenter > 5721 & Pca$SecondBinCenter > 5721,]
plot(Pca$FirstBinCenter,Pca$SecondBinCenter)
```

```{r plt-pc3-major-arc}
par(mfrow=c(2,1))
plot(Pca$FirstBinCenter,Pca$V2, pch = 20, cex = 0.5)
plot(Pca$SecondBinCenter,Pca$V2, pch = 20, cex = 0.5)
```

## 3: rounded to 1kb cells

```{r copeland-third-component-pca1}
Pca$FirstBinCenterRound  = round(Pca$FirstBinCenter,-3) # till thousands
Pca$SecondBinCenterRound = round(Pca$SecondBinCenter,-3) # till thousands
Agg = aggregate(as.numeric(Pca$V2), by = list(Pca$FirstBinCenterRound,Pca$SecondBinCenterRound), FUN = mean)
names(Agg)=c('Start','End','Value')
head(Agg)
ggp1 <- ggplot(Agg, aes(Start, End)) +                           # Create heatmap with ggplot2
  geom_tile(aes(fill = Value))
ggp1
```

## 4: original cells (PAPER)

```{r copeland-third-component-pca2}
Agg = aggregate(as.numeric(Pca$V2), by = list(Pca$FirstBinCenter,Pca$SecondBinCenter), FUN = mean)
names(Agg)=c('Start','End','Value')
ContactZone = Agg[Agg$Start >= 6000 & Agg$Start <= 9000 & Agg$End >= 13000 & Agg$Start <= 16000,]$Value

ggp2 <- ggplot(Agg, aes(Start, End)) +                           # Create heatmap with ggplot2
  geom_tile(aes(fill = Value))
ggp2
```

## 5: Tests

```{r wilcox-test}
wilcox.test(ContactZone,Agg$Value)
```

```{r t-test}
t.test(ContactZone,Agg$Value) 
```