-
Notifications
You must be signed in to change notification settings - Fork 9
/
alevin-loadFry.R
452 lines (409 loc) · 16.8 KB
/
alevin-loadFry.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
#' Load in data from alevin-fry USA mode
#'
#' Enables easy loading of sparse data matrices provided by
#' alevin-fry USA mode.
#'
#' @param fryDir path to the output directory returned by
#' alevin-fry quant command. This directory should contain a
#' \code{metainfo.json}, and an alevin folder which contains
#' \code{quants_mat.mtx}, \code{quants_mat_cols.txt} and
#' \code{quants_mat_rows.txt}
#' @param outputFormat can be \emph{either} be a list that defines the
#' desired format of the output \code{SingleCellExperiment} object
#' \emph{or} a string that represents one of the pre-defined output
#' formats, which are "scRNA", "snRNA", "all", "scVelo", "velocity", "U+S+A" and "S+A".
#' See details for the explanations of the pre-defined formats and
#' how to define custom format.
#' @param nonzero whether to filter cells with non-zero expression
#' value across all genes (default \code{FALSE}).
#' If \code{TRUE}, this will filter based on all assays.
#' If a string vector of assay names, it will filter based
#' on the matching assays in the vector.
#' If not in USA mode, it must be TRUE/FALSE/counts.
#' @param quiet logical whether to display no messages
#'
#' @section Details about \code{loadFry}:
#' This function consumes the result folder returned by running
#' alevin-fry quant in unspliced, spliced, ambiguous (USA)
#' quantification mode, and returns a \code{SingleCellExperiment} object
#' that contains a final count for each gene within each cell. In
#' USA mode, alevin-fry quant returns a count matrix contains three
#' types of count for each feature (gene) within each sample (cell
#' or nucleus), which represent the spliced mRNA count of the gene (S),
#' the unspliced mRNA count of the gene (U), and the count of UMIs whose
#' splicing status is ambiguous for the gene (A). For each assay
#' defined by \code{outputFormat}, these three counts of a gene
#' within a cell will be summed to get the final count of the gene
#' according to the rule defined in the \code{outputFormat}. The
#' returned object will contains the desired assays defined by
#' \code{outputFormat}, with rownames as the barcode of samples and
#' colnames as the feature names.
#'
#' @section Details about the output format:
#' The \code{outputFormat} argument takes \emph{either} be a list that defines
#' the desired format of the output
#' \code{SingleCellExperiment} object \emph{or} a string that represents one of
#' the pre-defined output format.
#'
#' Currently the pre-defined formats
#' of the output \code{SingleCellExperiment} object are:
#' \describe{
#' \item{"scRNA":}{This format is recommended for single cell experiments.
#' It returns a \code{counts} assay that contains the S+A count of each gene in each cell,
#' and a \code{unspliced} assay that contains the U count of each gene in each cell.}
#' \item{"snRNA", "all" and "U+S+A":}{These three formats are the same.
#' They return a \code{counts} assay that contains the U+S+A count of each gene in
#' each cell without any extra layers. "snRNA" is recommended for single-nucleus
#' RNA-sequencing experiments. "raw" is recommended for mimicing CellRanger 7's behavior,
#' which returns this format for both single-cell and single-nucleus experiments.}
#' \item{"S+A":}{This format returns a \code{counts} assay that contains the S+A
#' count of each gene in each cell.}
#' \item{"raw":}{This format puts the three kinds of counts into three separate assays,
#' which are \code{unspliced}, \code{spliced} and \code{ambiguous}.}
#' \item{"velocity":}{This format contains two assays.
#' The \code{spliced} assay contains the S+A count of each gene in each cell.
#' The \code{unspliced} assay contains the U counts of each gene in each cell.}
#' \item{"scVelo":}{This format is for direct entry into velociraptor R package or
#' other scVelo downstream analysis pipeline for velocity
#' analysis in R with Bioconductor. It adds the expected
#' "S"-pliced assay and removes errors for size factors being
#' non-positive.}
#' }
#'
#' A custom output format can be defined using a list. Each element in the list
#' defines an assay in the output \code{SingleCellExperiment} object.
#' The name of an element in the list will be the name of the corresponding
#' assay in the output object. Each element in the list should be defined as
#' a vector that takes at least one of the three kinds of count, which are U, S and A.
#' See the provided toy example for defining a custom output format.
#'
#' @return A \code{SingleCellExperiment} object that contains one
#' or more assays. Each assay consists of a gene by cell count matrix.
#' The row names are feature names, and the column names are cell
#' barcodes
#'
#' @references
#'
#' alevin-fry publication:
#'
#' He, D., Zakeri, M., Sarkar, H. et al. "Alevin-fry unlocks rapid, accurate and
#' memory-frugal quantification of single-cell RNA-seq data."
#' Nature Methods 19, 316–322 (2022).
#' \url{https://doi.org/10.1038/s41592-022-01408-3}
#'
#' @examples
#'
#' # Get path for minimal example avelin-fry output dir
#' testdat <- fishpond:::readExampleFryData("fry-usa-basic")
#'
#' # This is exactly how the velocity format defined internally.
#' custom_velocity_format <- list("spliced"=c("S","A"), "unspliced"=c("U"))
#'
#' # Load alevin-fry gene quantification in velocity format
#' sce <- loadFry(fryDir=testdat$parent_dir, outputFormat=custom_velocity_format)
#' SummarizedExperiment::assayNames(sce)
#'
#' # Load the same data but use pre-defined, velociraptor R pckage desired format
#' scvelo_format <- "scVelo"
#'
#' scev <- loadFry(fryDir=testdat$parent_dir, outputFormat=scvelo_format, nonzero=TRUE)
#' SummarizedExperiment::assayNames(scev)
#'
#' @author Dongze He, with contributions from Steve Lianoglou, Wes Wilson
#'
#' @export
loadFry <- function(fryDir,
outputFormat = "scRNA",
nonzero = FALSE,
quiet = FALSE) {
# load in fry result
fry.raw <- load_fry_raw(fryDir, quiet)
meta.data <- fry.raw$meta.data
# if in usa.mode, sum up counts in different status according to which.counts
if (meta.data$usa.mode) {
# preparation
predefined.format <- list("scrna" = list("counts" = c("S", "A"), "unspliced" = c("U")),
"snrna" = list("counts" = c("U", "S", "A")),
"all" = list("counts" = c("U", "S", "A")),
"u+s+a" = list("counts" = c("U", "S", "A")),
"s+a" = list("counts" = c("S", "A")),
"velocity" = list("spliced" = c("S", "A"), "unspliced" = c("U")),
"scvelo" = list("counts" = c("S", "A"), "spliced" = c("S", "A"), "unspliced" = c("U")),
"raw" = list("spliced" = c("S"), "unspliced" = c("U"), "ambiguous" = c("A"))
)
valid.counts <- c("U", "S", "A")
# check outputFormat
if (is.character(outputFormat)) {
outputFormat = tolower(outputFormat)
# Check whether outputFormat is a predefined format
if (! (outputFormat %in% names(predefined.format))) {
stop("Provided outputFormat string is invalid. Please check the function description for the list of predifined format")
}
if (!quiet) {
message("Using pre-defined output format: ", outputFormat)
}
# get the assays
output.assays <- predefined.format[[outputFormat]]
} else if (is.list(outputFormat)) {
# check whether user-defined assays are all
for (assay.name in names(outputFormat)) {
if (is.null(outputFormat[[assay.name]])) {
stop(paste0("The provided assay '", assay.name, "' is empty. Please remove it"))
}
else if (!all(outputFormat[[assay.name]] %in% valid.counts)) {
stop("Please use U, S and A ONLY to indicate which counts will be considered to build a assay")
}
}
if (!all(unlist(outputFormat) %in% valid.counts)) {
stop("Please use U, S and A ONLY to indicate which counts will be considered to build a assay")
}
output.assays <- outputFormat
if (!quiet) {
message("Using user-defined output assays")
}
}
# If we are here, the output.assays is valid.
# then we check the assay names in nonzero
if (is.logical(nonzero)) {
if (nonzero) {
nonzero <- names(output.assays)
} else {
if (is.character(outputFormat) && outputFormat == "scvelo") {
nonzero <- c("counts")
} else {
nonzero <- c()
}
}
} else if (is.character(nonzero)) {
if (length(nonzero) > 0) {
for (idx in seq_along(nonzero)) {
if (!nonzero[idx] %in% names(output.assays)) {
warning(paste0("In the provided nonzero vector, '",
nonzero[idx],
"' is not one of the output assays, ignored"))
nonzero <- nonzero[-idx]
}
}
}
} else {
warning("Invalid nonzero, ignored")
nonzero <- c()
}
# assembly
alist <- vector(mode = "list", length = length(output.assays))
names(alist) <- names(output.assays)
ng <- meta.data$num.genes
rd <- list("S" = seq(1, ng), "U" = seq(ng + 1, 2 * ng),
"A" = seq(2 * ng + 1, 3 * ng))
# fill in each assay
for (assay.name in names(output.assays)) {
which.counts <- output.assays[[assay.name]]
alist[[assay.name]] <- fry.raw$count.mat[, rd[[which.counts[1]]], drop = FALSE]
if (length(which.counts) > 1) {
# build assay
for (wc in which.counts[-1]) {
alist[[assay.name]] <- alist[[assay.name]] + fry.raw$count.mat[, rd[[wc]], drop = FALSE]
}
}
alist[[assay.name]] <- t(alist[[assay.name]])
if (!quiet) {
message(paste(c(paste0("Building the '",
assay.name,
"' assay, which contains"),
which.counts),
collapse = " "))
}
}
} else {
# not in USA mode
if (!quiet) {
message("Not in USA mode, set assay name as \"counts\"")
}
if (is.logical(nonzero)) {
if (nonzero) {
nonzero <- c("counts")
} else {
nonzero <- c()
}
} else {
if (nonzero != "counts") {
message("Not in USA mode, nonzero must be TRUE/FALSE/counts")
message("Set nonzero as FALSE")
nonzero <- c()
} else {
nonzero <- c("counts")
}
}
# define output matrix
alist <- list(counts = t(fry.raw$count.mat))
}
if (!quiet) {
message("Constructing output SingleCellExperiment object")
}
# create SingleCellExperiment object
sce <- SingleCellExperiment(alist,
colData = fry.raw$barcodes,
rowData = fry.raw$gene.names)
# filter all zero cells in zero, one or multiple assays
for (assay.name in nonzero) {
sce <- sce[, colSums(assay(sce, assay.name)) > 0]
}
if (!quiet) {
message("Done")
}
sce
}
load_fry_raw <- function(fryDir, quiet = FALSE) {
# Check `fryDir` is legit
if (!quiet) {
message("locating quant file")
}
quant.file <- file.path(fryDir, "alevin", "quants_mat.mtx")
if (!file.exists(quant.file)) {
stop("The `fryDir` directory provided does not look like a directory generated from alevin-fry:\n",
sprintf("Missing quant file: %s", quant.file)
)
}
# Since alevin-fry 0.4.1, meta_info.json is changed to quant.json, we check both
# read in metadata
qfile <- file.path(fryDir, "quant.json")
if (!file.exists(qfile)) {
qfile <- file.path(fryDir, "meta_info.json")
}
# read in metadata
meta.info <- fromJSON(qfile)
ng <- meta.info$num_genes
usa.mode <- meta.info$usa_mode
if (!quiet) {
message("Reading meta data")
message(paste0("USA mode: ", usa.mode))
}
# if usa mode, each gene gets 3 rows, so the actual number of genes is ng/3
if (usa.mode) {
if (ng %% 3 != 0) {
stop("The number of quantified targets is not a multiple of 3")
}
ng <- as.integer(ng/3)
}
# read in count matrix, gene names, and barcodes
count.mat <- my_as_dgcmatrix(readMM(file = quant.file))
afg <- read.table(file.path(fryDir, "alevin", "quants_mat_cols.txt"),
strip.white = TRUE, header = FALSE, nrows = ng,
col.names = c("gene_ids"))
rownames(afg) <- afg$gene_ids
afc <- read.table(file.path(fryDir, "alevin", "quants_mat_rows.txt"),
strip.white = TRUE, header = FALSE,
col.names = c("barcodes"))
rownames(afc) <- afc$barcodes
if (!quiet) {
message(paste("Processing", ng, "genes", "and", nrow(count.mat), "barcodes"))
}
list(count.mat = count.mat, gene.names = afg, barcodes = afc, meta.data = list(num.genes = ng, usa.mode = usa.mode))
}
# Functions to help with running or creating test data
# none of these functions are exported on purpose
# Example alevin-fry quant dataset ---------------------------------------------
#
# These methods provide an orthologous way to create and read in test examples
# from alevin outputs than what is implemented in loadFry().
#
# The functions provide a mechanism that enables you to define the sample output
# in a plain text matrix format (data.frame) named `example-dat.csv` and
# place that in a new directory under extdata/alevin/example-quants.
#
# Look at the `extdata/alevin/example-quants/fry-usa-basic/example-dat.csv` for
# an example data file that created a sample `salmon alevin-fry quant` directory
# we can use to test `loadFry` against.
#
# Once we created the extdata/alevin/example-quants/fry-usa-basic directory
# and put the the example-dat.csv file in it, we then run the following command
# create all of the *.mtx and other files to make this look like an alevin
# output directoy.
#
# ```{r}
# devtools::load_all(".")
# writeExampleFryDat("fry-usa-basic")
# ```
#' Loads an example data matrix from a csv data from a top-level example
#' fry output directory.
#'
#' @noRd
#' @param example_name the name of the folder that holds the example data.
#' @return a list of primitive data types you can use to serialize a mock
#' output dir from a `salmon alevin-fry quant` run.
readExampleFryData <- function(example_name = "fry-usa-basic", usa = TRUE, ...) {
example_dir <- system.file("extdata", "alevin", "example-quants",
example_name, package = "fishpond",
mustWork = TRUE)
dat <- read.csv(file.path(example_dir, "example-dat.csv"),
strip.white = TRUE, comment.char = "#")
m <- as.matrix(dat)
dimnames(m) <- list(NULL, NULL)
if (usa) {
genes <- unique(sub("_.*$", "", colnames(dat)))
} else {
genes <- colnames(dat)
}
out <- list(
parent_dir = example_dir,
matrix = m,
barcodes = rownames(dat),
genes = colnames(dat))
if (usa) {
out$genes <- unique(sub("_.*$", "", out$genes))
out$usa <- list(
U = grep("_U$", colnames(dat)),
S = grep("_S$", colnames(dat)),
A = grep("_A$", colnames(dat)))
}
out
}
#' Serializes example fry output data from an `example-fry-dat.csv` as if
#' it were produced from an alevin-fry quant run
#'
#' @noRd
#' @param x the name of the top level directory in
#' `extdata/alevin/example-fry-USA-quants` or a list result from calling
#' `readExampleFryData`
writeExampleFryDat <- function(x = "fry-usa-basic", ...) {
if (is.character(x)) {
x <- readExampleFryData(x)
}
stopifnot(
is.list(x),
all(c("matrix", "genes", "barcodes", "parent_dir") %in% names(x)))
if (is.null(x$usa)) {
stop("Haven't kicked the tires on a non USA like output")
}
out.dir <- file.path(x$parent_dir, "alevin")
if (!dir.exists(out.dir)) {
stopifnot(dir.create(out.dir))
}
m <- my_as_dgcmatrix(Matrix::Matrix(x$matrix, sparse = TRUE))
Matrix::writeMM(m, file.path(out.dir, "quants_mat.mtx"))
write.table(
x$genes,
file = file.path(out.dir, "quants_mat_cols.txt"),
quote = FALSE,col.names = FALSE, row.names = FALSE, sep = "\t")
write.table(
x$barcodes,
file = file.path(out.dir, "quants_mat_rows.txt"),
quote = FALSE, col.names = FALSE, row.names = FALSE, sep = "\t")
# create metadata
meta_info = list()
meta_info[["alt_resolved_cell_numbers"]] = list()
meta_info[["cmd"]] = ""
meta_info[["dump_eq"]] = FALSE
meta_info[["num_genes"]] = length(x$genes) * ifelse(is.null(x$usa), 1, 3)
meta_info[["num_quantified_cells"]] = length(x$barcodes)
meta_info[["resolution_strategy"]] = "CellRangerLike"
meta_info[["usa_mode"]] = TRUE
write(
jsonlite::toJSON(meta_info, pretty=TRUE),
file = file.path(x$parent_dir, "meta_info.json"))
}
# added August 2022
# see https://cran.r-project.org/web/packages/Matrix/vignettes/Design-issues.pdf
my_as_dgcmatrix <- function(matrix) {
as(as(as(matrix, "dMatrix"), "generalMatrix"), "CsparseMatrix")
}