-
Notifications
You must be signed in to change notification settings - Fork 3
/
gather_files.R
177 lines (176 loc) · 6.71 KB
/
gather_files.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
#' Gather files
#'
#' Recursively find peak/picard files stored within subdirectories and import
#' them as a list of \link[GenomicRanges]{GRanges} objects.
#'
#' For "peaks.stringent" files called with
#' \href{https://github.com/FredHutch/SEACR}{SEACR}, column names will be
#' automatically added:
#' \itemize{
#' \item{total_signal}{Total signal contained within denoted coordinates.}
#' \item{max_signal}{Maximum bedgraph signal attained at any base pair
#' within denoted coordinates.}
#' \item{max_signal_region}{Region representing the farthest upstream
#' and farthest downstream bases within the denoted coordinates
#' that are represented by the maximum bedgraph signal.}
#' }
#'
#' @param dir Directory to search within.
#' @param type File type to search for. Options include:
#' \itemize{
#' \item{"<pattern>"}{Finds files matching an arbitrary regex pattern
#' specified by user.}
#' \item{"peaks.stringent"}{Finds files ending in "*.stringent.bed$"}
#' \item{"peaks.consensus"}{Finds files ending in "*.consensus.peaks.bed$"}
#' \item{"peaks.consensus.filtered"}{
#' Finds files ending in"*.consensus.peaks.filtered.awk.bed$"}
#' \item{"picard"}{Finds files ending in
#' "*.target.markdup.MarkDuplicates.metrics.txt$"}
#' }
#' @param nfcore_cutandrun Whether the files were generated by the
#' \href{https://nf-co.re/cutandrun}{nf-core/cutandrun} Nextflow pipeline.
#' If \code{TRUE}, can use the standardised folder structure to
#' automatically generate more descriptive file names with sample IDs.
#' @param return_paths Return only the file paths without actually reading them
#' in as \link[GenomicRanges]{GRanges}.
#' @param rbind_list Bind all objects into one.
#' @param verbose Print messages.
#' @inheritParams check_workers
#' @inheritParams BiocParallel::MulticoreParam
#' @returns A named list of \link[GenomicRanges]{GRanges} objects.
#'
#' @export
#' @importFrom stats setNames
#' @importFrom stringr str_split
#' @importFrom data.table fread := rbindlist
#' @importFrom GenomicRanges GRangesList
#' @examples
#' #### Make example files ####
#' save_paths <- EpiCompare::write_example_peaks()
#' dir <- unique(dirname(save_paths))
#' #### Gather/import files ####
#' peaks <- EpiCompare::gather_files(dir=dir,
#'                                   type="peaks.narrow",
#'                                   workers = 1)
gather_files <- function(dir,
                         type = "peaks.stringent",
                         nfcore_cutandrun = FALSE,
                         return_paths = FALSE,
                         rbind_list = FALSE,
                         workers = check_workers(),
                         verbose = TRUE){
    #### Validate workers ####
    workers <- check_workers(workers = workers)
    #### Map known type keywords to file-name regex patterns ####
    type_key <- c(
        ## peak files
        "peaks.stringent"="*.stringent.bed$",
        "peaks.consensus"="*.consensus.peaks.bed$",
        "peaks.consensus.filtered"="*.consensus.peaks.filtered.awk.bed$",
        "peaks.pooled"="pooledPeak",
        "peaks.narrow"="narrowPeak",
        "peaks.broad"="broadPeak",
        ## picard files
        # "picard"= "*.target.markdup.MarkDuplicates.metrics.txt$",
        "picard"= "^multiqc_picard*",
        ## multiQC files
        # "multiqc"=c("^meta_table"),
        "multiqc"="multiqc_general_stats.txt$|meta_table_ctrl.csv",
        ## trimgalore files
        "trimgalore"="*.fastq.gz_trimming_report.txt$",
        ## bowtie files
        "bowtie.stats"="*\\.stats$",
        "bowtie.idxstats"="*\\.idxstats$",
        "bowtie.target.stats"=".*\\.stats$",
        "bowtie.target.idxstats"=".*\\.idxstats$",
        ## bam files
        "bam"="*.bam$",
        "bam.dedup"="*.dedup.bam$",
        "bam.dedup.sorted"="*.dedup.sorted.bam$",
        "bam.dedup.sorted.target"="*.target.dedup.sorted.bam$"
    )
    #### Check for known file types ####
    ## Accept keywords case-insensitively (e.g. "Picard" == "picard");
    ## anything unrecognized is treated as a user-supplied regex pattern.
    if(all(tolower(type) %in% names(type_key))) {
        type <- tolower(type)
    }
    pattern <- if(all(type %in% names(type_key))) {
        type_key[type]
    } else {
        type
    }
    if(all(is.na(pattern))){
        stop("type must be at least one of:\n",
             paste("-",c(names(type_key),"<regex query>"), collapse = "\n"))
    }
    #### Search for files recursively ####
    messager("Searching for",type,"files...",v=verbose)
    paths <- list.files(path = dir,
                        pattern = paste(unname(pattern), collapse = "|"),
                        recursive = TRUE,
                        full.names = TRUE,
                        ignore.case = TRUE)
    #### Remove any R scripts ####
    ## Anchor the pattern to the end of the file name so that data files
    ## merely containing ".R" mid-name (e.g. "sample.R1.fastq.gz") survive.
    paths <- grep("\\.R$", paths, value = TRUE, invert = TRUE,
                  ignore.case = TRUE)
    #### Omit duplicate files ####
    ## nfcore creates duplicates of same peak files
    ## in different subfolders: "04_reporting" and "04_called_peaks".
    ## Omit one of these subfolders.
    ## NOTE: previously this checked `type=="peaks"`, which is never a valid
    ## type keyword, so the filter was dead code. Match any "peaks.*" type,
    ## vector-safe via any().
    if(isTRUE(nfcore_cutandrun) &&
       any(startsWith(tolower(type), "peaks"))){
        paths <- paths[!grepl("04_reporting",paths)]
    }
    #### Report files found ####
    if(length(paths)==0) {
        msg <- "0 matching files identified. Returning NULL."
        messager(msg,v=verbose)
        return(NULL)
    }
    messager(formatC(length(paths),big.mark = ","),
             "matching files identified.",v=verbose)
    #### Construct names ####
    paths <- gather_files_names(paths=paths,
                                type=type,
                                nfcore_cutandrun=nfcore_cutandrun)
    if(isTRUE(return_paths)){
        messager("Returning paths.",v=verbose)
        return(paths)
    }
    #### Import files ####
    ## Dispatch on file type; this branch logic assumes a single (scalar)
    ## `type` value, consistent with the searching step above.
    messager("Importing files.",v=verbose)
    files <- bpplapply(X = paths,
                       workers = workers,
                       FUN = function(x){
        if(type=="picard"){
            dat <- read_picard(path = x,
                               verbose = verbose)
        } else if(type=="multiqc"){
            dat <- read_multiqc(path = x,
                                verbose = verbose)
        } else if(startsWith(type,"bowtie")){
            dat <- read_bowtie(path = x,
                               verbose = verbose)
        } else if(type=="trimgalore"){
            dat <- read_trimgalore(path = x,
                                   verbose = verbose)
        } else if(startsWith(type,"bam")){
            dat <- read_bam(path = x,
                            verbose = verbose)
        } else {
            dat <- read_peaks(path = x,
                              type = type,
                              verbose = verbose)
        }
        return(dat)
    })
    #### rbind into one object ####
    if(isTRUE(rbind_list)){
        messager("Binding all files into one.",v=verbose)
        if(startsWith(type,"peaks")){
            files <- unlist(GenomicRanges::GRangesList(files))
        } else {
            files <- data.table::rbindlist(files,
                                           use.names = TRUE,
                                           idcol = "batch",
                                           fill = TRUE)
        }
    }
    #### Report ####
    messager(formatC(length(files),big.mark = ","),"files retrieved.",v=verbose)
    return(files)
}