/
saver.R
181 lines (171 loc) · 6.18 KB
/
saver.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
#' Runs SAVER
#'
#' Recovers expression using the SAVER method.
#'
#' The SAVER method starts by estimating the prior mean and variance for the
#' true expression level for each gene and cell. The prior mean is obtained
#' through predictions from a LASSO Poisson regression for each gene
#' implemented using the \code{glmnet} package. Then, the variance is estimated
#' through maximum likelihood assuming constant variance, Fano factor, or
#' coefficient of variation variance structure for each gene. The posterior
#' distribution is calculated and the posterior mean is reported as the SAVER
#' estimate.
#'
#' @param x An expression count matrix. The rows correspond to genes and
#' the columns correspond to cells. Can be sparse.
#'
#' @param do.fast Approximates the prediction step. Default is TRUE.
#'
#' @param ncores Number of cores to use. Default is 1.
#'
#' @param size.factor Vector of cell size normalization factors.
#' If \code{x} is already normalized or normalization is not desired, use
#' \code{size.factor = 1}. Default uses mean library size normalization.
#'
#' @param npred Number of genes for regression prediction. Selects the top
#' \code{npred} genes in terms of mean expression for regression prediction.
#' Default is all genes.
#'
#' @param pred.cells Indices of cells to perform regression prediction.
#' Default is all cells.
#'
#' @param pred.genes Indices of specific genes to perform regression
#' prediction. Overrides \code{npred}. Default is all genes.
#'
#' @param pred.genes.only Return expression levels of only \code{pred.genes}.
#' Default is FALSE (returns expression levels of all genes).
#'
#' @param null.model Whether to use mean gene expression as prediction.
#'
#' @param mu Matrix of prior means.
#'
#' @param estimates.only Only return SAVER estimates. Default is FALSE.
#'
#'
#' @return If `estimates.only = TRUE`, then a matrix of SAVER estimates.
#'
#' If `estimates.only = FALSE`, a list with the following components
#' \item{\code{estimate}}{Recovered (normalized) expression.}
#' \item{\code{se}}{Standard error of estimates.}
#' \item{\code{info}}{Information about dataset.}
#'
#' The \code{info} element is a list with the following components:
#' \item{\code{size.factor}}{Size factor used for normalization.}
#' \item{\code{maxcor}}{Maximum absolute correlation for each gene. 2 if not
#' calculated}
#' \item{\code{lambda.max}}{Smallest value of lambda which gives the null
#' model.}
#' \item{\code{lambda.min}}{Value of lambda from which the prediction model is
#' used}
#' \item{\code{sd.cv}}{Difference in the number of standard deviations in
#' deviance between the model with lowest cross-validation error and the null
#' model}
#' \item{\code{pred.time}}{Time taken to generate predictions.}
#' \item{\code{var.time}}{Time taken to estimate variance.}
#' \item{\code{maxcor}}{Maximum absolute correlation cutoff used to determine
#' if a gene should be predicted.}
#' \item{\code{lambda.coefs}}{Coefficients for estimating lambda with lowest
#' cross-validation error.}
#' \item{\code{total.time}}{Total time for SAVER estimation.}
#'
#' @examples
#' data("linnarsson")
#'
#' \dontrun{
#' system.time(linnarsson_saver <- saver(linnarsson, ncores = 12))
#' }
#'
#' # predictions for top 5 highly expressed genes
#' \dontrun{
#' saver2 <- saver(linnarsson, npred = 5)
#' }
#'
#' # predictions for certain genes
#' \dontrun{
#' genes <- c("Thy1", "Mbp", "Stim2", "Psmc6", "Rps19")
#' genes.ind <- which(rownames(linnarsson) %in% genes)
#' saver3 <- saver(linnarsson, pred.genes = genes.ind)
#' }
#'
#' # return only certain genes
#' \dontrun{
#' saver4 <- saver(linnarsson, pred.genes = genes.ind, pred.genes.only = TRUE)
#' }
#'
#' @import foreach methods
#'
#' @export
saver <- function(x, do.fast = TRUE, ncores = 1, size.factor = NULL,
npred = NULL, pred.cells = NULL, pred.genes = NULL,
pred.genes.only = FALSE, null.model = FALSE, mu = NULL,
estimates.only = FALSE) {
if (!is.null(mu)) {
mu <- check.mu(x, mu)
}
x <- clean.data(x)
np <- dim(x)
ngenes <- as.integer(np[1])
ncells <- as.integer(np[2])
message(ngenes, " genes, ", ncells, " cells")
# assign size factor
sf.out <- calc.size.factor(x, size.factor, ncells)
sf <- sf.out[[1]]
scale.sf <- sf.out[[2]]
gene.names <- rownames(x)
cell.names <- colnames(x)
# Set up parallel backend
cl.create <- FALSE
if (foreach::getDoParWorkers() > 1) {
if (ncores > 1 & foreach::getDoParWorkers() != ncores) {
message(paste("Parallel backend already registered and is inconsistent",
"with ncores."))
}
ncores <- getDoParWorkers()
} else {
if (ncores > 1) {
cl.create <- TRUE
cl <- parallel::makeCluster(ncores, outfile = "")
doParallel::registerDoParallel(cl)
on.exit({
parallel::stopCluster(cl)
foreach::registerDoSEQ()
})
}
}
# if prior means are provided
if (!is.null(mu)) {
out <- saver.fit.mean(x, ncores, sf, scale.sf, mu, ngenes = nrow(x),
ncells = ncol(x), gene.names, cell.names,
estimates.only)
} else if (null.model) {
out <- saver.fit.null(x, ncores, sf, scale.sf, ngenes = nrow(x),
ncells = ncol(x), gene.names, cell.names,
estimates.only)
} else {
# assign pred.cells and pred.genes
pred.cells <- get.pred.cells(pred.cells, ncells)
pred.genes <- get.pred.genes(x, pred.genes, npred, ngenes)
npred <- length(pred.genes)
good.genes <- which(Matrix::rowMeans(x) >= 0.1)
x.est <- t(as.matrix(log(sweep(x[good.genes, ] + 1, 2, sf, "/"))))
if (pred.genes.only) {
x <- x[pred.genes, , drop = FALSE]
pred.genes <- 1:nrow(x)
gene.names <- rownames(x)
cell.names <- colnames(x)
}
out <- saver.fit(x, x.est, do.fast, ncores, sf, scale.sf, pred.genes,
pred.cells, null.model, ngenes = nrow(x),
ncells = ncol(x), gene.names, cell.names,
estimates.only)
}
message("Done!")
message("Finish time: ", Sys.time())
message("Total time: ", format(out$info$total.time))
if (!estimates.only) {
class(out) <- "saver"
out
} else {
out$estimate
}
}