Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Amplicon/Illumina/Workflow_Documentation/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

|Pipeline Version|Current Workflow Version (for respective pipeline version)|
|:---------------|:---------------------------------------------------------|
|*[GL-DPPD-7104-A.md](../Pipeline_GL-DPPD-7104_Versions/GL-DPPD-7104-A.md)|[1.0.1](SW_AmpIllumina-A)|
|*[GL-DPPD-7104-A.md](../Pipeline_GL-DPPD-7104_Versions/GL-DPPD-7104-A.md)|[1.1.0](SW_AmpIllumina-A)|

*Current GeneLab Pipeline/Workflow Implementation

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# Workflow change log

## [1.1.0](https://github.com/nasa/GeneLab_Data_Processing/tree/SW_AmpIllumina-A_1.1.0/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A)
- 18S option added
- will use the PR2 database for taxonomy provided by the DECIPHER developers here: http://www2.decipher.codes/Downloads.html
- option added to just concatenate reads instead of merging them (in dada2), which is useful in scenarios where primers such as 515-926 are used, as these can capture 18S sequences that are not expected to overlap

## [1.0.1](https://github.com/nasa/GeneLab_Data_Processing/tree/SW_AmpIllumina-A_1.0.1/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A)
- documentation links updated in config.yaml to match changes in host site
- added config file for multiqc to trim suffixes from sample names and not include paths in output report
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
############################################################################################
## Snakefile for GeneLab's Illumina amplicon workflow ##
## Developed by Michael D. Lee (Mike.Lee@nasa.gov) ##
## Version 1.0.1 ##
## Version 1.1.0 ##
############################################################################################

import os
Expand Down Expand Up @@ -145,14 +145,15 @@ if config["data_type"] == "PE":
filtered_R2_suffix = config["filtered_R2_suffix"],
final_outputs_dir = config["final_outputs_dir"],
target_region = config["target_region"],
output_prefix = config["output_prefix"]
output_prefix = config["output_prefix"],
concatenate_reads_only = config["concatenate_reads_only"]
log:
"R-processing.log"
benchmark:
"benchmarks/run_R-benchmarks.tsv"
shell:
"""
Rscript scripts/Illumina-PE-R-processing.R "{params.left_trunc}" "{params.right_trunc}" "{params.left_maxEE}" "{params.right_maxEE}" "{params.trim_primers}" "{sample_IDs_file}" "{params.trimmed_reads_dir}" "{params.filtered_reads_dir}" "{params.primer_trimmed_R1_suffix}" "{params.primer_trimmed_R2_suffix}" "{params.filtered_R1_suffix}" "{params.filtered_R2_suffix}" "{params.final_outputs_dir}" "{params.output_prefix}" "{params.target_region}" > {log} 2>&1
Rscript scripts/Illumina-PE-R-processing.R "{params.left_trunc}" "{params.right_trunc}" "{params.left_maxEE}" "{params.right_maxEE}" "{params.trim_primers}" "{sample_IDs_file}" "{params.trimmed_reads_dir}" "{params.filtered_reads_dir}" "{params.primer_trimmed_R1_suffix}" "{params.primer_trimmed_R2_suffix}" "{params.filtered_R1_suffix}" "{params.filtered_R2_suffix}" "{params.final_outputs_dir}" "{params.output_prefix}" "{params.target_region}" "{params.concatenate_reads_only}" > {log} 2>&1
"""

# if we did not trim the primers
Expand Down Expand Up @@ -187,14 +188,15 @@ if config["data_type"] == "PE":
filtered_R2_suffix = config["filtered_R2_suffix"],
final_outputs_dir = config["final_outputs_dir"],
target_region = config["target_region"],
output_prefix = config["output_prefix"]
output_prefix = config["output_prefix"],
concatenate_reads_only = config["concatenate_reads_only"]
log:
"R-processing.log"
benchmark:
"benchmarks/run_R-benchmarks.tsv"
shell:
"""
Rscript scripts/Illumina-PE-R-processing.R "{params.left_trunc}" "{params.right_trunc}" "{params.left_maxEE}" "{params.right_maxEE}" "{params.trim_primers}" "{sample_IDs_file}" "{params.raw_reads_dir}" "{params.filtered_reads_dir}" "{params.raw_R1_suffix}" "{params.raw_R2_suffix}" "{params.filtered_R1_suffix}" "{params.filtered_R2_suffix}" "{params.final_outputs_dir}" "{params.output_prefix}" "{params.target_region}" > {log} 2>&1
Rscript scripts/Illumina-PE-R-processing.R "{params.left_trunc}" "{params.right_trunc}" "{params.left_maxEE}" "{params.right_maxEE}" "{params.trim_primers}" "{sample_IDs_file}" "{params.raw_reads_dir}" "{params.filtered_reads_dir}" "{params.raw_R1_suffix}" "{params.raw_R2_suffix}" "{params.filtered_R1_suffix}" "{params.filtered_R2_suffix}" "{params.final_outputs_dir}" "{params.output_prefix}" "{params.target_region}" "{params.concatenate_reads_only}" > {log} 2>&1
"""


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,10 +57,22 @@ R_linked_primer:
discard_untrimmed:
"TRUE"

## target region (16S or ITS acceptable; determines which reference database is used for taxonomic classification)
## target region (accepted values: 16S, 18S, or ITS)
# this determines which reference database is used for taxonomic classification
# all are pulled from the pre-packaged DECIPHER downloads page here: http://www2.decipher.codes/Downloads.html
# 16S uses SILVA
# ITS uses UNITE
# 18S uses PR2
target_region:
"16S"

## concatenate only with dada2 instead of merging paired reads if TRUE
# this is typically used with primers such as 515-926, which capture 18S fragments that are typically too long to merge
# note that 16S and 18S should have been separated already prior to running this workflow
# this should likely be left as FALSE for any option other than "18S" above
concatenate_reads_only:
"FALSE"

## values to be passed to dada2's filterAndTrim() function:
left_trunc:
0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
## Developed by Michael D. Lee (Mike.Lee@nasa.gov) ##
##################################################################################

# as called from the associated Snakefile, this expects to be run as: Rscript full-R-processing.R <left_trunc> <right_trunc> <left_maxEE> <right_maxEE> <TRUE/FALSE - GL trimmed primers or not> <unique-sample-IDs-file> <starting_reads_dir_for_R> <filtered_reads_dir> <input_file_R1_suffix> <input_file_R2_suffix> <filtered_filename_R1_suffix> <filtered_filename_R2_suffix> <final_outputs_directory> <output_prefix> <target_region>
# as called from the associated Snakefile, this expects to be run as: Rscript full-R-processing.R <left_trunc> <right_trunc> <left_maxEE> <right_maxEE> <TRUE/FALSE - GL trimmed primers or not> <unique-sample-IDs-file> <starting_reads_dir_for_R> <filtered_reads_dir> <input_file_R1_suffix> <input_file_R2_suffix> <filtered_filename_R1_suffix> <filtered_filename_R2_suffix> <final_outputs_directory> <output_prefix> <target_region> <concatenate_reads_only>
# where <left_trim> and <right_trim> are the values to be passed to the truncLen parameter of dada2's filterAndTrim()
# and <left_maxEE> and <right_maxEE> are the values to be passed to the maxEE parameter of dada2's filterAndTrim()

Expand All @@ -29,6 +29,7 @@ if (length(args) < 15) {
suppressWarnings(final_outputs_dir <- args[13])
suppressWarnings(output_prefix <- args[14])
suppressWarnings(target_region <- args[15])
suppressWarnings(concatenate_reads_only <- args[16])

}

Expand All @@ -37,7 +38,13 @@ if ( is.na(left_trunc) || is.na(right_trunc) || is.na(left_maxEE) || is.na(right
}

if ( ! GL_trimmed_primers %in% c("TRUE", "FALSE") ) {
stop("The fifth positional argument needs to be 'TRUE' or 'FALSE' for whether or not GL trimmed primers on this dataset, see top of R script for more info.", call.=FALSE)
stop("The fifth positional argument needs to be 'TRUE' or 'FALSE' for whether or not GL trimmed primers on this dataset, see top of R script and config.yaml for more info.", call.=FALSE)
} else {
GL_trimmed_primers <- as.logical(GL_trimmed_primers)
}

# validating the 16th positional argument and converting it from the "TRUE"/"FALSE"
# string passed on the command line into a proper R logical
if ( ! concatenate_reads_only %in% c("TRUE", "FALSE") ) {
    stop("The sixteenth positional argument needs to be 'TRUE' or 'FALSE' for whether or not the mergePairs function of dada2 should just concatenate the reads on this dataset, see top of R script and config.yaml for more info.", call.=FALSE)
} else {
    # BUG FIX: this previously re-converted GL_trimmed_primers (a copy-paste from the
    # check above), leaving concatenate_reads_only as a character string downstream
    concatenate_reads_only <- as.logical(concatenate_reads_only)
}
Expand Down Expand Up @@ -92,8 +99,17 @@ reverse_errors <- learnErrors(reverse_filtered_reads, multithread=TRUE)
forward_seqs <- dada(forward_filtered_reads, err=forward_errors, pool="pseudo", multithread=TRUE)
reverse_seqs <- dada(reverse_filtered_reads, err=reverse_errors, pool="pseudo", multithread=TRUE)

# merging forward and reverse reads
merged_contigs <- mergePairs(forward_seqs, forward_filtered_reads, reverse_seqs, reverse_filtered_reads, verbose=TRUE)
# merging forward and reverse reads (just concatenating if that was specified)
# NOTE(review): concatenate_reads_only may still hold the "TRUE"/"FALSE" string from
# the command line here; R's if() coerces those strings to logicals, so this branch
# behaves the same either way — confirm it was converted with as.logical() upstream
if ( concatenate_reads_only ) {

    # end-to-end join of each read pair rather than overlap-based merging
    # (presumably dada2 inserts its default N spacer between the reads — TODO confirm)
    merged_contigs <- mergePairs(forward_seqs, forward_filtered_reads, reverse_seqs, reverse_filtered_reads, verbose=TRUE, justConcatenate=TRUE)

} else {

    # standard overlap-based merging of each denoised read pair
    merged_contigs <- mergePairs(forward_seqs, forward_filtered_reads, reverse_seqs, reverse_filtered_reads, verbose=TRUE)

}


# generating a sequence table that holds the counts of each sequence per sample
seqtab <- makeSequenceTable(merged_contigs)
Expand Down Expand Up @@ -159,10 +175,19 @@ if ( target_region == "16S" ) {
# removing downloaded file
file.remove("UNITE_v2020_February2020.RData")

ranks <- c("kingdom", "phylum", "class", "order", "family", "genus", "species")
ranks <- c("domain", "kingdom", "phylum", "class", "order", "family", "genus", "species")

} else if (target_region == "18S" ) {

} else {
download.file("http://www2.decipher.codes/Classification/TrainingSets/PR2_v4_13_March2021.RData", "PR2_v4_13_March2021.RData")
# loading reference taxonomy object
load("PR2_v4_13_March2021.RData")
# removing downloaded file
file.remove("PR2_v4_13_March2021.RData")

ranks <- c("kingdom", "division", "phylum", "class", "order", "family", "genus", "species")

} else {
cat("\n\n The requested target_region of ", target_region, " is not accepted.\n\n")
quit(status = 1)
}
Expand Down Expand Up @@ -201,7 +226,7 @@ asv_tab <- data.frame("ASV_ID"=asv_ids, asv_tab, check.names=FALSE)
write.table(asv_tab, paste0(final_outputs_dir, output_prefix, "counts.tsv"), sep="\t", quote=F, row.names=FALSE)

# making and writing out a taxonomy table:
# vector of desired ranks was created above in ITS/16S target_region if statement
# vector of desired ranks was created above in ITS/16S/18S target_region if statement

# creating table of taxonomy and setting any that are unclassified as "NA"
tax_tab <- t(sapply(tax_info, function(x) {
Expand All @@ -214,13 +239,18 @@ tax_tab <- t(sapply(tax_info, function(x) {
colnames(tax_tab) <- ranks
row.names(tax_tab) <- NULL

# need to add a column for domain if this is ITS
if (target_region == "ITS" ) {
# need to add a column for domain if this is ITS (due to how the reference taxonomy object is structured)
if ( target_region == "ITS" ) {
tax_tab <- data.frame("ASV_ID"=asv_ids, "domain"="Eukarya", tax_tab, check.names=FALSE)
} else {
tax_tab <- data.frame("ASV_ID"=asv_ids, tax_tab, check.names=FALSE)
}

# need to change "kingdom" to "domain" if this is 18S (due to how the reference taxonomy object is structured)
if ( target_region == "18S" ) {
colnames(tax_tab)[colnames(tax_tab) == "kingdom"] <- "domain"
}

write.table(tax_tab, paste0(final_outputs_dir, output_prefix, "taxonomy.tsv"), sep = "\t", quote=F, row.names=FALSE)

### generating and writing out biom file format ###
Expand Down
Loading