-
Notifications
You must be signed in to change notification settings - Fork 25
/
importMetaphlan.R
435 lines (407 loc) · 16.9 KB
/
importMetaphlan.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
#' Import Metaphlan results to \code{TreeSummarizedExperiment}
#
#' @param file a single \code{character} value defining the file
#' path of the Metaphlan file. The file must be in merged Metaphlan format.
#'
#' @param colData a DataFrame-like object that includes sample names in
#' rownames, or a single \code{character} value defining the file
#' path of the sample metadata file. The file must be in \code{tsv} format
#' (default: \code{colData = NULL}).
#'
#' @param sample_meta a DataFrame-like object that includes sample names in
#' rownames, or a single \code{character} value defining the file
#' path of the sample metadata file. The file must be in \code{tsv} format
#' (default: \code{sample_meta = NULL}).
#'
#' @param phy_tree a single \code{character} value defining the file
#' path of the phylogenetic tree.
#' (default: \code{phy_tree = NULL}).
#'
#' @param ... additional arguments:
#' \itemize{
#' \item \code{assay.type}: A single character value for naming
#' \code{\link[SummarizedExperiment:SummarizedExperiment-class]{assay}}
#' (default: \code{assay.type = "counts"})
#' \item \code{assay_name}: A single \code{character} value for specifying which
#' assay to use for calculation. (Please use \code{assay.type} instead.
#' At some point \code{assay_name} will be disabled.)
#' \item \code{removeTaxaPrefixes}: \code{TRUE} or \code{FALSE}: Should
#' taxonomic prefixes be removed? (default:
#' \code{removeTaxaPrefixes = FALSE})
#' \item \code{remove.suffix}: \code{TRUE} or \code{FALSE}: Should
#' suffixes of sample names be removed? Metaphlan pipeline adds suffixes
#' to sample names. Suffixes are formed from file names. By selecting
#' \code{remove.suffix = TRUE}, you can remove pattern from end of sample
#' names that is shared by all. (default: \code{remove.suffix = FALSE})
#' \item \code{set.ranks}: \code{TRUE} or \code{FALSE}: Should the columns
#' in the rowData that are treated as taxonomy ranks be updated according to
#' the ranks found in the imported data?
#' (default: \code{set.ranks = FALSE})
#' }
#'
#' @details
#' Import Metaphlan (versions 2, 3 and 4 supported) results.
#' Input must be in merged Metaphlan format.
#' (See
#' \href{https://github.com/biobakery/MetaPhlAn/wiki/MetaPhlAn-4#merging-tables}{
#' the Metaphlan documentation and \code{merge_metaphlan_tables} method.})
#' Data is imported so that data at the lowest rank is imported as a
#' \code{TreeSummarizedExperiment} object. Data at higher rank is imported as a
#' \code{SummarizedExperiment} objects which are stored to \code{altExp} of
#' \code{TreeSummarizedExperiment} object.
#'
#' Currently Metaphlan versions 2, 3, and 4 are supported.
#'
#' @return A
#' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
#' object
#'
#' @name importMetaPhlAn
#' @seealso
#' \code{\link[=importHUMAnN]{importHUMAnN}}
#' \code{\link[=makeTreeSEFromPhyloseq]{makeTreeSEFromPhyloseq}}
#' \code{\link[=makeTreeSEFromBiom]{makeTreeSEFromBiom}}
#' \code{\link[=makeTreeSEFromDADA2]{makeTreeSEFromDADA2}}
#' \code{\link[=importQIIME2]{importQIIME2}}
#' \code{\link[=importMothur]{importMothur}}
#'
#' @export
#' @author Leo Lahti and Tuomas Borman. Contact: \url{microbiome.github.io}
#'
#' @references
#' Beghini F, McIver LJ, Blanco-Míguez A, Dubois L, Asnicar F, Maharjan S, Mailyan A,
#' Manghi P, Scholz M, Thomas AM, Valles-Colomer M, Weingart G, Zhang Y, Zolfo M,
#' Huttenhower C, Franzosa EA, & Segata N (2021) Integrating taxonomic, functional,
#' and strain-level profiling of diverse microbial communities with bioBakery 3.
#' \emph{eLife}. 10:e65088. doi: 10.7554/eLife.65088
#'
#' @examples
#' # (Data is from tutorial
#' # https://github.com/biobakery/biobakery/wiki/metaphlan3#merge-outputs)
#'
#' # File path
#' file_path <- system.file("extdata", "merged_abundance_table.txt", package = "mia")
#' # Import data
#' tse <- importMetaPhlAn(file_path)
#' # Data at the lowest rank
#' tse
#' # Data at higher rank is stored in altExp
#' altExps(tse)
#' # Higher rank data is in SE format, for example, Phylum rank
#' altExp(tse, "Phylum")
#'
NULL
importMetaPhlAn <- function(
file, colData = sample_meta, sample_meta = NULL, phy_tree = NULL, ...){
################################ Input check ################################
if(!.is_non_empty_string(file)){
stop("'file' must be a single character value.",
call. = FALSE)
}
if (!file.exists(file)) {
stop(file, " does not exist", call. = FALSE)
}
if(!is.null(colData) &&
!(.is_non_empty_string(colData) || is.data.frame(colData) ||
is.matrix(colData) || is(colData, "DataFrame")) ){
stop("'colData' must be a single character value, DataFrame or NULL.",
call. = FALSE)
}
if(!is.null(phy_tree) && !.is_non_empty_string(phy_tree)){
stop("'phy_tree' must be a single character value or NULL.",
call. = FALSE)
}
############################## Input check end #############################
# Get rowdata columns. metaphlan v2 has ID column. Metaphlan > v2 has
# clade_name for taxonomy names. Some has taxonomy.
rowdata_col <- c("clade_name", "ID", "_id", "taxonomy")
# Read metaphlan data
data <- .read_metaphlan(file, rowdata_col, ...)
# Parse data into separate tables, which include data at certain taxonomy rank
tables <- .parse_metaphlan(data, ...)
# Create multiple SE objects at different rank from the data
available_ranks <- names(tables)
se_objects <- lapply(tables, function(x){
.create_se_from_metaphlan(
x, rowdata_col, returned.ranks = available_ranks, ...)
})
# Get the object with lowest rank
tse <- se_objects[[ length(se_objects) ]]
# Convert it to TreeSE so that it has altExp
tse <- as(tse, "TreeSummarizedExperiment")
# Remove it, so that it is not added multiple times
se_objects[[ length(se_objects) ]] <- NULL
# Add rest of the objects to altExp
if( length(se_objects) > 0 ){
for( rank in names(se_objects) ){
# Add SEs to altExp of tse, give name according to rank
altExp(tse, rank) <- se_objects[[rank]]
}
}
# Set taxonomy ranks using .set_taxonomy_ranks
.set_ranks_based_on_rowdata(tse,...)
# Load sample meta data if it is provided
if( !is.null(colData) ) {
tse <- .add_coldata(tse, colData)
}
# Load tree if it is provided
if (!is.null(phy_tree)) {
tree <- ape::read.tree(phy_tree)
rowTree(tse) <- tree
}
return(tse)
}
################################ HELP FUNCTIONS ################################
# Read Metaphlan file, catch error if it occurs
.read_metaphlan <- function(file, rowdata_col, remove.suffix = FALSE, ...){
################################ Input check ###############################
if(!.is_a_bool(remove.suffix)){
stop("'remove.suffix' must be TRUE or FALSE.", call. = FALSE)
}
############################## Input check end #############################
# Read the table. Catch error and give more informative message
table <- tryCatch(
{
read.table(file, header = TRUE, comment.char = "#", check.names = FALSE)
},
error = function(condition){
stop("Error while reading ", file,
"\nPlease check that the file is in merged Metaphlan file format.",
call. = FALSE)
}
)
# Check that file is in right format
if( .check_metaphlan(table, rowdata_col) ){
stop("Error while reading ", file,
"\nPlease check that the file is in merged Metaphlan file format.",
call. = FALSE)
}
# Remove possible suffix from the colnames if user has specified
if( remove.suffix ){
table <- .remove_suffix(table, rowdata_col)
}
return(table)
}
# Check that metaphlan file contains correct information
.check_metaphlan <- function(data, rowdata_col){
# Get rowData columns
rowdata_id <- unlist(lapply(rowdata_col, grep, colnames(data)))
rowdata_columns <- data[ , rowdata_id, drop = FALSE]
# Get columns that go to assay
assay_columns <- data[ , -rowdata_id, drop = FALSE]
# Initialize result
result <- TRUE
# Check rowdata column names that they contain right information, and check that
# rest of the columns represents abundances in samples.
# If these requirements are met, give FALSE. Otherwise, give TRUE.
if( length(rowdata_id) > 0 &&
all(unlist(lapply(assay_columns, is.numeric))) ){
result <- FALSE
}
return(result)
}
# Get metaphlan table as input and return multiple tables which each include data at
# certain taxonomy rank
.parse_metaphlan <- function(table, ...){
# ID in Metaphlan v2, > 2 clade_name
col <- colnames(table) %in% c("clade_name", "ID")
if( sum(col) != 1 ){
stop("Error in parsing Metaphlan file.", call. = FALSE)
}
# Get the lowest level of each row
levels <- lapply(table[ , col], FUN = .get_lowest_taxonomic_level)
# Convert list to vector
levels <- unlist(levels)
# Split table so that each individual table contains information only
# at specific rank
tables <- split(table, levels)
# Get the order
metaphlan_tax = c(TAXONOMY_RANKS, "strain")
indices <- match(metaphlan_tax, tolower(names(tables)))
# Remove NAs which occurs if rank is not included
indices <- indices[!is.na(indices)]
# Order tables
tables <- tables[indices]
return(tables)
}
# Get the lowest level of the string that contains multiple taxonomic levels with prefixes
# Output is single character that specifies the rank, e.g, "s" == "Species"
.get_lowest_taxonomic_level <- function(string){
# Get indices that specify location of rank prefixes
levels <- gregexpr("([kpcofgst]+)__", string)[[1]]
# Get the location of lowest rank
lowest_level_ind <- levels[length(levels)]
# Get the lowest rank that was found
lowest_level <- substr(string, start = lowest_level_ind, stop = lowest_level_ind)
# List all ranks and what prefix they correspond
ranks <- c(
Domain = "d", Kingdom = "k", Phylum = "p", Class = "c", Order = "o",
Family = "f", Genus = "g", Species = "s", Strain = "t")
# Convert prefix into full rank name
lowest_level <- names(ranks[ match(lowest_level, ranks) ])
return(lowest_level)
}
# Create SE object that include rowdata and assay, from the metaphlan table
.create_se_from_metaphlan <- function(
table, rowdata_col, assay.type = assay_name, assay_name = "counts",
...){
# Check assay.type
if( !.is_non_empty_character(assay.type) ){
stop("'assay.type' must be a non-empty character value.",
call. = FALSE)
}
# Get rowdata columns
rowdata_id <- unlist(lapply(rowdata_col, grep, colnames(table)))
# Get those columns that belong to rowData
rowdata <- table[, rowdata_id, drop = FALSE]
# Get those columns that belong to assay
assay <- table[, -rowdata_id, drop = FALSE]
# Parse taxonomic levels
taxonomy <- .parse_taxonomy(
rowdata[ , 1, drop = FALSE], sep = "\\|",
column_name = colnames(rowdata)[[1]], ...)
# Add parsed taxonomy level information to rowdata
rowdata <- cbind(taxonomy, rowdata)
# Ensure that rowData is DataFrame
rowdata <- DataFrame(rowdata, check.names = FALSE)
# Ensure that assay is matrix
assay <- as.matrix(assay)
# Create assays list and add assay with specific name
assays <- S4Vectors::SimpleList()
assays[[assay.type]] <- assay
# Create TreeSE
se <- TreeSummarizedExperiment(
assays = assays, rowData = rowdata)
# Add taxonomy labels
rownames(se) <- getTaxonomyLabels(se)
return(se)
}
# This function can be used to add colData to TreeSE. It checks that sample
# names match (full or partial) and adds the metadata to altExps also.
.add_coldata <- function(tse, coldata){
# If the coldata is character specifying the path
if( .is_non_empty_character(coldata) ){
coldata <- read.table(file = coldata, header = TRUE, sep = "\t")
rownames(coldata) <- coldata[, 1]
}
# Ensure that the coldata is DF
coldata <- DataFrame(coldata)
# Usually when metaphlan sample names are created, the workflow adds file
# name as a suffix. Because of that sample names do not match anymore
# necessarily. Try partial match if full match is not found.
if( all(rownames(coldata) %in% colnames(tse)) ){
sample_names <- rownames(coldata)
names(sample_names) <- sample_names
} else{
sample_names <- lapply(rownames(coldata), function(x){
x <- colnames(tse)[grep(x, colnames(tse))]
if( length(x) != 1 ){
x <- NULL
}
return(x)
})
names(sample_names) <- rownames(coldata)
sample_names <- unlist(sample_names)
}
# Check if all samples were found. (In partial match certain sample name
# is missing if one matching name was not found.). In this part, all
# colnames should be found if data sets are matching. (More samples in
# metadata is allowed.)
if( !(all(colnames(tse) %in% sample_names) &&
length(sample_names) == ncol(tse)) ){
warning(
"The sample names in 'colData' do not match with the data. ",
"The sample metadata is not added.", call. = FALSE
)
return(tse)
}
# Reorder sample metadata based on the data
sample_names <- sample_names[match(colnames(tse), sample_names)]
coldata <- coldata[names(sample_names), ]
# Give warning if partial match was used
if( !all(rownames(coldata) %in% colnames(tse)) ){
warning("Partial match was used to match sample names between ",
"'colData' and the data. Please check that they are correct.",
call. = FALSE
)
# Replace colnames with names from sample metadata. They are without
# additional suffix.
coldata[["colnames_data"]] <- colnames(tse)
coldata[["colnames_metadata"]] <- rownames(coldata)
colnames(tse) <- rownames(coldata)
}
# Add sample metadata
colData(tse) <- coldata
# If there are altExps, add the metadata also there
if( length(altExps(tse)) ){
altexps <- altExps(tse)
altexps <- lapply(altexps, function(x){
colnames(x) <- rownames(coldata)
colData(x) <- coldata
return(x)
})
altexps <- SimpleList(altexps)
altExps(tse) <- altexps
}
return(tse)
}
# In humann/metaphlan pipeline, suffix is added to colnames. The suffix comes
# from file names. This can cause problems when, e.g., taxonomy and pathway
# information is combined. Because their suffixes differ, the sample names
# differ. The purpose of the function is to remove those file names.
.remove_suffix <- function(
data, rowdata_col = c("clade_name", "ID", "_id", "taxonomy")){
# Get rowdata columns
rowdata_id <- unlist(lapply(rowdata_col, grep, colnames(data)))
# Get sample names
sample_names <- colnames(data)[-rowdata_id]
# Get the possible suffixes, i.e., words that are divided by underscores
# based on first sample name.
suffix <- strsplit(sample_names[[1]], "_")[[1]]
# Loop over these suffixes. From the end, add words one by one. Store those
# suffixes that match with all sample names. The result is suffix that is
# the longest.
shared <- NULL
for( i in length(suffix):1 ){
pattern <- paste0("_", paste(suffix[i:length(suffix)], collapse="_"))
if( all(grepl(pattern, sample_names)) ){
shared <- pattern
}
}
# Remove suffix from sample names
if( !is.null(shared) ){
sample_names <- gsub(shared, "", sample_names)
colnames(data)[-rowdata_id] <- sample_names
}
return(data)
}
# This function sets taxonomy ranks based on rowData of TreeSE. With this,
# user can automatically set ranks based on imported data.
.set_ranks_based_on_rowdata <- function(tse, set.ranks = FALSE, ...){
#
if( !.is_a_bool(set.ranks) ){
stop("'set.ranks' must be TRUE or FALSE.", call. = FALSE)
}
#
# If user do not want to set ranks
if( !set.ranks ){
return(NULL)
}
# Get ranks from rowData
ranks <- colnames(rowData(tse))
# Ranks must be character columns
is_char <- lapply(rowData(tse), function(x) is.character(x) || is.factor(x))
is_char <- unlist(is_char)
ranks <- ranks[ is_char ]
# rowData is empty, cannot set ranks
if( length(ranks) == 0 ){
warning(
"Ranks cannot be set. rowData(x) does not include columns ",
"specifying character values.", call. = FALSE)
return(NULL)
}
# Finally, set ranks and give message
temp <- setTaxonomyRanks(ranks)
message("TAXONOMY_RANKS set to: '", paste0(ranks, collapse = "', '"), "'")
return(NULL)
}