-
Notifications
You must be signed in to change notification settings - Fork 1
/
simDynamic.R
155 lines (135 loc) · 5.98 KB
/
simDynamic.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
##' A function to simulate dynamic spatial proteomics data using a bootstrap method
##'
##'
##' @title Generate a dynamic spatial proteomics experiment
##' @param object A instance of class `MSnSet` from which to generate a spatial
##' proteomics dataset.
##' @param subsample how many proteins to subsample to speed up analysis. Default is NULL.
##' @param knn_par the number of nearest neighbours to use in KNN classification to simulate
##' dataset.
##' Default is 10
##' @param fcol feature column to indicate markers. Default is "markers". Proteins
##' with unknown localisations must be encoded as "unknown".
##' @param numRep The total number of datasets to generate. Default is 6. An
##' integer must be provided
##' @param method The bootstrap method to use to simulate dataset. Default is "wild".
##' refer to BANDLE paper for more details.
##' @param batch Whether or not to include batch effects. Default is FALSE.
##' @param frac_perm whether or not to permute the fractions. Default is FALSE
##' @param nu parameter to generate residual inflated noise. Default is 2. See BANDLE
##' paper for more details
##' @param numDyn An integer number of protein to simulate dynamic transitions. Default is 20
##' @return returns simulate dynamic lopit datasets and the name of the relocalated protein.
##' @md
##'
##' @examples
##' library(pRolocdata)
##' data("tan2009r1")
##' set.seed(1)
##' tansim <- sim_dynamic(object = tan2009r1, numRep = 6L, numDyn = 100L)
##'
##'
##' @rdname bandle-sim
sim_dynamic <- function(object,
subsample = NULL,
knn_par = 10L,
fcol = "markers",
numRep = 6L,
method = "wild",
batch = FALSE,
frac_perm = FALSE,
nu = 2,
numDyn = 20L) {
# checks
stopifnot("object is not an instance of class MSnSet"=is(object, "MSnSet"))
stopifnot("number of Replicates is not valid, provide an integer"=is(numRep, "integer"))
stopifnot("number of Translocations is not valid, provide an integer"=is(numDyn, "integer"))
stopifnot("Knn parameers must be an integer"=is(knn_par, "integer"))
stopifnot("method is not valid"=method %in% c("wild", "rand", "const"))
stopifnot("batch is not valid"=batch %in% c(FALSE, "rand", "systematic"))
# generate marker and unknown MSnSets
lopitmarkerset <- markerMSnSet(object, fcol = fcol)
lopitunknownset <- unknownMSnSet(object, fcol = fcol)
# number of proteins that are unannotated
Nunk <- nrow(lopitunknownset)
# if subsampling start to subsample
if (!is.null(subsample)){
subs <- sample.int(Nunk, size = subsample)
object <- combine(lopitmarkerset, lopitunknownset[subs, ])
}
## Perform K-NN with K classification to get clustering
object <- knnClassification(object = object, k = knn_par, fcol = fcol)
# Get Mean of each organelle
orgM <- meanOrganelle(object = object, fcol = fcol)$M
## Use means as fitted function for regression and compute residuals
residual <- matrix(NA, nrow = nrow(object), ncol = ncol(object))
residual <- exprs(object) - orgM[fData(object)[, "knn"], ]
## Bootstrap residuals
numRep <- numRep # number of additional datasets
## storage for datasets
lopitrep <- list()
K <- length(getMarkerClasses(object, fcol = fcol))
# Methods for creating replicates, constant, random or wild boostrap
if (method == "const") {
nu <- nu
} else if (method == "rand") {
nu <- runif(nrow(object), 1, nu)
} else if (method == "wild") {
nu <- runif(K, 1, nu)[fData(object)$knn] # cluster specific residual inflated noise
}
# code to generate replicates
for (i in seq.int(numRep)) {
myobject <- object
idxBoot <- matrix(sample.int(ncol(myobject),
size = ncol(myobject) * nrow(myobject),
replace = TRUE),
nrow = nrow(myobject), ncol(myobject))
bootres <- matrix(NA, nrow = nrow(myobject), ncol(myobject))
for (k in seq.int(nrow(myobject))) {
bootres[k, ] <- residual[k, idxBoot[k, ]]
}
lopitrep[[i]] <- myobject
newExprs <- orgM[fData(myobject)$knn, ] + nu * bootres ## add boostraps
colnames(newExprs) <- colnames(myobject)
rownames(newExprs) <- rownames(myobject)
exprs(lopitrep[[i]]) <- newExprs
}
# should we had batch effects
if (batch == "rand") {
for (i in seq.int(numRep)){
batch_effect <- rgamma(1, 1, 1) # sample batch effect
g <- sample(c(seq.int(ncol(myobject))), size = 1) # sample which fraction to add the effect to
exprs(lopitrep[[i]])[, g] <- exprs(lopitrep[[i]])[, g] + batch_effect
}
} else if (batch == "systematic") {
g <- sample(c(seq.int(ncol(myobject))), size = 1) # sample which fraction to add the effect to
batch_effect <- rgamma(1, 1, 1) # sample batch effect
for (i in seq.int(numRep)) {
exprs(lopitrep[[i]])[, g] <- exprs(lopitrep[[i]])[, g] + batch_effect
}
}
# should we permute the fractions
if (frac_perm == TRUE) {
for (i in seq.int(numRep)){
permute <- sample.int(ncol(myobject))
exprs(lopitrep[[i]]) <- exprs(lopitrep[[i]])[, permute]
}
}
## simulate translocation events
mlopit <- unknownMSnSet(object)
perm1 <- sample.int(nrow(mlopit), size = numDyn, replace = TRUE)
perm1_names <- rownames(mlopit)[perm1]
## compute statistics for simulated replicates
replicateStats <- lapply(lopitrep, function(x) meanOrganelle(x, fcol = fcol))
# Simulate dynamics, make sure to use correct
for (i in perm1_names) {
K <- length(getMarkerClasses(object, fcol = fcol))
r <- sample.int(K, size = 1)
for (j in seq.int(numRep/2 + 1, numRep)) {
exprs(lopitrep[[j]])[i, ] <- replicateStats[[j]]$M[r, ] +
rnorm(ncol(object), mean = 0, sd = sqrt(replicateStats[[j]]$V[r, ]))
}
}
# return simulate replicates and the names of dynamic translocations
return(list(lopitrep = lopitrep, perm1_names = perm1_names))
}