-
Notifications
You must be signed in to change notification settings - Fork 0
/
README.Rmd
224 lines (193 loc) · 9.74 KB
/
README.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
---
title: "Simulation population-scale Allele Differential Expression from Metagenomic and metranscriptomic data"
author: "Amin Madoui"
date: "25 mars 2020"
output: rmarkdown::github_document
fig_width: 4
fig_height: 4
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
## Modelling genomic parameters
The depth of coverage of the genome is modelled by a negative binomial distribution. To learn the NB parameters, I use the sum of the read counts of the reference and alternative alleles generated by DiscoSNP++ ran on metaG of each sample. The allele frequency $freq=\frac{read\_count A}{read\_count A + read\_count B}$ distribution shows a U shape and is modelled by a Beta distribution. The distribution of the loci expression is a model admixture that cannot be estimated, we approximate the distribution by a gamma distribution.
```{r warning=FALSE }
library(fitdistrplus)
# Input: read counts file
# 1. fit the allele frequency by a beta distribution
# 2. fit the depth of coverage by a negative binomial
# 3. fit the loci expression level by a gamma distribution
# Return: the distribution parameters for the three fits
getNBparam <- function(file){
RC = read.table(file)
colnames(RC) = c("genome_RC_A", "genome_RC_B",
"transcript_RC_A","transcript_RC_B")
# 1. Genome coverage is fitted by a negative binomial distribution
fit_Gcoverage = fitdist(RC$genome_RC_A + RC$genome_RC_B,
method = "mme", distr="nbinom")
plot(fit_Gcoverage)
GenomeCov_NB_theta = fit_Gcoverage$estimate[1]
GenomeCov_NB_mu = fit_Gcoverage$estimate[2]
# 2. Genome variant frequency distribution is fitted by a beta distribution
fit_freq = fitdist( RC$genome_RC_A / (RC$genome_RC_A + RC$genome_RC_B),
method = "mme", distr="beta")
VarFreq_Beta_shape1 = fit_freq$estimate[1]
VarFreq_Beta_shape2 = fit_freq$estimate[2]
plot(fit_freq)
# 3. Loci expression level is fitted by a gamma distribituion
fit_Tcoverage = fitdist(RC$transcript_RC_A + RC$transcript_RC_B,
distr="gamma", method = "mme")
Expression_Gamma_shape = fit_Tcoverage$estimate[1]
Expression_Gamma_rate = fit_Tcoverage$estimate[2]
plot(fit_Tcoverage)
return (as.numeric(as.character(c(GenomeCov_NB_theta, GenomeCov_NB_mu,
VarFreq_Beta_shape1, VarFreq_Beta_shape2,
Expression_Gamma_shape, Expression_Gamma_rate ))))
}
```
## Learning genomic and transcriptomic parameters from real data
The input files format is the following, col 1: RC_genomic_allele_A, col 2: RC_genomic_allele_B, col 3: RC_transcriptomic_allele_A, col 4: RC_transcriptomic_allele_B. We estimate the distribution parameters from seven samples.
```{r warning = FALSE}
# One read count file per sample
files = list.files ("RC")
files = paste("RC/",files,sep="")
parameters = matrix(unlist(lapply(files,getNBparam)), ncol=6, byrow = TRUE)
colnames(parameters) = c("GenomeCov_NB_theta", "GenomeCov_NB_mu",
"VarFreq_Beta_shape1", "VarFreq_Beta_shape2",
"Expression_Gamma_shape", "Expression_Gamma_rate")
rownames(parameters) = c("S155","S158", "S178","S206","S208","S209","S210")
write.table(x = parameters, "genomic_distributions_parameters.txt", sep = "\t", quote = FALSE)
print (parameters)
```
## Estimation of the Negative Binomial parameter $\theta$ for any depth of coverage
We modelled the relationship between $\mu$ and $\theta$ by a linear model such as $\theta = \frac{p}{1-p} . \mu$. With $p$ the probabilty of a loci to belong to single-copy region of the genome
```{r}
library(ggplot2)
p = ggplot(data = as.data.frame(parameters),
aes( x = GenomeCov_NB_mu, y = GenomeCov_NB_theta))+
geom_point()+
geom_smooth(method = lm)+
xlab(expression(mu))+
ylab(expression(theta))+
theme_bw()
print(p)
# Predict the NB theta with mu by a linear model
fitParam = summary(lm(parameters[,1]~parameters[,2]))
print (fitParam)
mu2theta <- function (mu){
theta = fitParam$coefficients[1] + fitParam$coefficients[2] * mu
return (theta)
}
```
## Estimation of read counts for genome and transcriptome
The read A count follows a NB distribution of mean = coverage A (random value between 1 and the genome coverage) and size estimated from the data. The read count B follows a NB distribution of mean = coverage B = genome coverage - coverage A and size estimated from the data
```{r warning=FALSE}
test_RC_Noise <- function (genome_coverage, theta, fbeta1, fbeta2, gamshape, gamrate, transcript_sd){
# simulate random allele frequency given the beta distrib parameters (shape1 and shape2)
variant_frequency = rbeta(n = 1, shape1 = fbeta1, shape2 = fbeta2)
genomic_expect_cov_A = round( variant_frequency * genome_coverage , 0)
genomic_expect_cov_B = genome_coverage - genomic_expect_cov_A
## simulate genomic read counts for allele A and B
# given expected coverages and the NB distrib parameters
# first find theta
thetaA = round(mu2theta(genomic_expect_cov_A))
thetaB = round(mu2theta(genomic_expect_cov_B))
# then simulate
genomic_RC_A = rnbinom (n = 1, size = thetaA, mu = genomic_expect_cov_A)
genomic_RC_B = rnbinom (n = 1, size = thetaB, mu = genomic_expect_cov_B)
# replace NA by 0
if (is.na(genomic_RC_A)) genomic_RC_A = 0
if (is.na(genomic_RC_B)) genomic_RC_B = 0
# simulate allele expression level given the gamma distrib parameters (shape and rate)
gene_expression_Level = round (rgamma(1, gamshape, gamrate), 0)
transcriptomic_expect_Cov_A = round(variant_frequency * gene_expression_Level , 0)
transcriptomic_expect_Cov_B = gene_expression_Level - transcriptomic_expect_Cov_A
# simulate transcriptomic read counts for allele A and B
# given a poisson distribution
transcript_RC_A = rpois(n = 1, transcriptomic_expect_Cov_A )
transcript_RC_B = rpois(n = 1, transcriptomic_expect_Cov_B )
if (transcript_RC_A<0) transcript_RC_A = 0
if (transcript_RC_B<0) transcript_RC_B = 0
# Apply Fisher exact test
# only if the gene is expressed and the loci is heterozygous
observed_expression = transcript_RC_A + transcript_RC_B
if( (observed_expression>0 ) & genomic_RC_A !=0 & genomic_RC_B !=0 ){
Fisher_pvalue = fisher.test( matrix(c(genomic_RC_A,
genomic_RC_B,
transcript_RC_A,
transcript_RC_B),
nrow=2))$p.value
}
else{
Fisher_pvalue = "NA"
}
return(as.numeric(as.character(c(Fisher_pvalue, genomic_RC_A , genomic_RC_B,
transcript_RC_A, transcript_RC_B))))
}
```
# Simulation of genomic and transcriptomic read counts
I simulate read counts for genomic and transcriptomic for each sample and I apply the FET on each loci and the FDR.
```{r warning=FALSE}
n_test = 50000
results = c()
# To compare to psADE found in Oithona %ADE for q_value <0.05
oithona = c(0.21, 0.14, 0.33, 1.15, 0.27, 0.03, 0.43)
names(oithona) = rownames(parameters)
for (sample in rownames(parameters)){
FisherTest = matrix(c(rep(0, 5*n_test)), ncol=5)
for (j in 1:n_test){
FisherTest[j,] = test_RC_Noise(round(parameters[sample,"GenomeCov_NB_mu"],0), thetaMu,
parameters[sample,"VarFreq_Beta_shape1"],
parameters[sample,"VarFreq_Beta_shape2"],
parameters[sample,"Expression_Gamma_shape"],
parameters[sample,"Expression_Gamma_rate"],
parameters[sample,"Expression_variation_sd"]
)
}
# Writesimulation output
dir.create("simulated_data")
colnames(FisherTest) = c("Fisher_pvalue", "genomic_RC_A" , "genomic_RC_B",
"transcript_RC_A", "transcript_RC_B")
write.table(FisherTest, paste("simulated_data/",sample,".txt", sep = ""))
# Check if the genomic read count simulation is OK
print (sample)
genome_simulated_coverage = FisherTest[,2] + FisherTest[,3]
# expected distribution
theta = parameters[sample,"GenomeCov_NB_theta"]
mu = parameters[sample,"GenomeCov_NB_mu"]
genome_expected_coverage = rnbinom (n = n_test, size = theta, mu = mu)
plot(density(genome_simulated_coverage), col="red",
lwd=2, main = paste(sample," expected vs observed coverage distribution"))
lines (density(genome_expected_coverage), col="blue", lwd = 2, add = TRUE)
legend(1, 95, legend=c("Simulation", "Real data"),
col=c("red", "blue"), cex=0.8)
# remove tests with NA
FisherTest = as.numeric(as.character(FisherTest[!is.na(FisherTest[,1]),1]))
hist(FisherTest, breaks=100, freq = F)
# Apply FDR
FisherAdj = p.adjust(FisherTest, method = "fdr")
# output metrics
cutoff = 0.05
percentage_FET_positives = 100 * length(FisherAdj[FisherAdj<cutoff]) / length(FisherAdj)
count_FET_positives = length(FisherAdj[FisherAdj<cutoff])
n_loci_tested = length(FisherAdj)
# Results formatting
res = c(round(parameters[sample,"GenomeCov_NB_mu"],0),
cutoff,
percentage_FET_positives,
n_loci_tested,
count_FET_positives,oithona[sample],
oithona[sample]-percentage_FET_positives,
100*(oithona[sample]-percentage_FET_positives)/oithona[sample]
)
results = rbind(results, res)
}
colnames (results) = c( "Genome coverage", "q-value cutoff",
"%FET positives","Number of tested loci",
"counts","%FET real data",
"% true psADE in real data",
"% of detected psADE - noise")
rownames(results) = rownames(parameters)
write.table(results,"simulation_FET_results.txt", sep = "\t", quote=FALSE)
knitr::kable (results)
```