Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Amplicon/Illumina/Workflow_Documentation/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

|Pipeline Version|Current Workflow Version (for respective pipeline version)|
|:---------------|:---------------------------------------------------------|
|*[GL-DPPD-7104-A.md](../Pipeline_GL-DPPD-7104_Versions/GL-DPPD-7104-A.md)|[1.0.1](SW_AmpIllumina-A)|
|*[GL-DPPD-7104-A.md](../Pipeline_GL-DPPD-7104_Versions/GL-DPPD-7104-A.md)|[1.1.0](SW_AmpIllumina-A)|

*Current GeneLab Pipeline/Workflow Implementation

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# Workflow change log

## [1.1.0](https://github.com/nasa/GeneLab_Data_Processing/tree/SW_AmpIllumina-A_1.1.0/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A)
- 18S option added
- will use the PR2 database for taxonomy provided by the DECIPHER developers here: http://www2.decipher.codes/Downloads.html
- option added to just concatenate reads instead of merging them (in dada2), which is useful in scenarios where primers such as 515-926 are used, as these can capture 18S sequences that are not expected to overlap

## [1.0.1](https://github.com/nasa/GeneLab_Data_Processing/tree/SW_AmpIllumina-A_1.0.1/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A)
- documentation links updated in config.yaml to match changes in host site
- added config file for multiqc to trim suffixes from sample names and not include paths in output report
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
############################################################################################
## Snakefile for GeneLab's Illumina amplicon workflow ##
## Developed by Michael D. Lee (Mike.Lee@nasa.gov) ##
## Version 1.0.1 ##
## Version 1.1.0 ##
############################################################################################

import os
Expand Down Expand Up @@ -145,14 +145,15 @@ if config["data_type"] == "PE":
filtered_R2_suffix = config["filtered_R2_suffix"],
final_outputs_dir = config["final_outputs_dir"],
target_region = config["target_region"],
output_prefix = config["output_prefix"]
output_prefix = config["output_prefix"],
concatenate_reads_only = config["concatenate_reads_only"]
log:
"R-processing.log"
benchmark:
"benchmarks/run_R-benchmarks.tsv"
shell:
"""
Rscript scripts/Illumina-PE-R-processing.R "{params.left_trunc}" "{params.right_trunc}" "{params.left_maxEE}" "{params.right_maxEE}" "{params.trim_primers}" "{sample_IDs_file}" "{params.trimmed_reads_dir}" "{params.filtered_reads_dir}" "{params.primer_trimmed_R1_suffix}" "{params.primer_trimmed_R2_suffix}" "{params.filtered_R1_suffix}" "{params.filtered_R2_suffix}" "{params.final_outputs_dir}" "{params.output_prefix}" "{params.target_region}" > {log} 2>&1
Rscript scripts/Illumina-PE-R-processing.R "{params.left_trunc}" "{params.right_trunc}" "{params.left_maxEE}" "{params.right_maxEE}" "{params.trim_primers}" "{sample_IDs_file}" "{params.trimmed_reads_dir}" "{params.filtered_reads_dir}" "{params.primer_trimmed_R1_suffix}" "{params.primer_trimmed_R2_suffix}" "{params.filtered_R1_suffix}" "{params.filtered_R2_suffix}" "{params.final_outputs_dir}" "{params.output_prefix}" "{params.target_region}" "{params.concatenate_reads_only}" > {log} 2>&1
"""

# if we did not trim the primers
Expand Down Expand Up @@ -187,14 +188,15 @@ if config["data_type"] == "PE":
filtered_R2_suffix = config["filtered_R2_suffix"],
final_outputs_dir = config["final_outputs_dir"],
target_region = config["target_region"],
output_prefix = config["output_prefix"]
output_prefix = config["output_prefix"],
concatenate_reads_only = config["concatenate_reads_only"]
log:
"R-processing.log"
benchmark:
"benchmarks/run_R-benchmarks.tsv"
shell:
"""
Rscript scripts/Illumina-PE-R-processing.R "{params.left_trunc}" "{params.right_trunc}" "{params.left_maxEE}" "{params.right_maxEE}" "{params.trim_primers}" "{sample_IDs_file}" "{params.raw_reads_dir}" "{params.filtered_reads_dir}" "{params.raw_R1_suffix}" "{params.raw_R2_suffix}" "{params.filtered_R1_suffix}" "{params.filtered_R2_suffix}" "{params.final_outputs_dir}" "{params.output_prefix}" "{params.target_region}" > {log} 2>&1
Rscript scripts/Illumina-PE-R-processing.R "{params.left_trunc}" "{params.right_trunc}" "{params.left_maxEE}" "{params.right_maxEE}" "{params.trim_primers}" "{sample_IDs_file}" "{params.raw_reads_dir}" "{params.filtered_reads_dir}" "{params.raw_R1_suffix}" "{params.raw_R2_suffix}" "{params.filtered_R1_suffix}" "{params.filtered_R2_suffix}" "{params.final_outputs_dir}" "{params.output_prefix}" "{params.target_region}" "{params.concatenate_reads_only}" > {log} 2>&1
"""


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,10 +57,22 @@ R_linked_primer:
discard_untrimmed:
"TRUE"

## target region (16S or ITS acceptable; determines which reference database is used for taxonomic classification)
## target region (accepted values: 16S, 18S, or ITS)
# this determines which reference database is used for taxonomic classification
# all are pulled from the pre-packaged DECIPHER downloads page here: http://www2.decipher.codes/Downloads.html
# 16S uses SILVA
# ITS uses UNITE
# 18S uses PR2
target_region:
"16S"

## concatenate only with dada2 instead of merging paired reads if TRUE
# this is typically used with primers such as 515-926, which capture 18S fragments that are typically too long to merge
# note that 16S and 18S should have been separated already prior to running this workflow
# this should likely be left as FALSE for any option other than "18S" above
concatenate_reads_only:
"FALSE"

## values to be passed to dada2's filterAndTrim() function:
left_trunc:
0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
## Developed by Michael D. Lee (Mike.Lee@nasa.gov) ##
##################################################################################

# as called from the associated Snakefile, this expects to be run as: Rscript full-R-processing.R <left_trunc> <right_trunc> <left_maxEE> <right_maxEE> <TRUE/FALSE - GL trimmed primers or not> <unique-sample-IDs-file> <starting_reads_dir_for_R> <filtered_reads_dir> <input_file_R1_suffix> <input_file_R2_suffix> <filtered_filename_R1_suffix> <filtered_filename_R2_suffix> <final_outputs_directory> <output_prefix> <target_region>
# as called from the associated Snakefile, this expects to be run as: Rscript full-R-processing.R <left_trunc> <right_trunc> <left_maxEE> <right_maxEE> <TRUE/FALSE - GL trimmed primers or not> <unique-sample-IDs-file> <starting_reads_dir_for_R> <filtered_reads_dir> <input_file_R1_suffix> <input_file_R2_suffix> <filtered_filename_R1_suffix> <filtered_filename_R2_suffix> <final_outputs_directory> <output_prefix> <target_region> <concatenate_reads_only>
# where <left_trim> and <right_trim> are the values to be passed to the truncLen parameter of dada2's filterAndTrim()
# and <left_maxEE> and <right_maxEE> are the values to be passed to the maxEE parameter of dada2's filterAndTrim()

Expand All @@ -29,6 +29,7 @@ if (length(args) < 15) {
suppressWarnings(final_outputs_dir <- args[13])
suppressWarnings(output_prefix <- args[14])
suppressWarnings(target_region <- args[15])
suppressWarnings(concatenate_reads_only <- args[16])

}

Expand All @@ -37,7 +38,13 @@ if ( is.na(left_trunc) || is.na(right_trunc) || is.na(left_maxEE) || is.na(right
}

if ( ! GL_trimmed_primers %in% c("TRUE", "FALSE") ) {
stop("The fifth positional argument needs to be 'TRUE' or 'FALSE' for whether or not GL trimmed primers on this dataset, see top of R script for more info.", call.=FALSE)
stop("The fifth positional argument needs to be 'TRUE' or 'FALSE' for whether or not GL trimmed primers on this dataset, see top of R script and config.yaml for more info.", call.=FALSE)
} else {
GL_trimmed_primers <- as.logical(GL_trimmed_primers)
}

# validating the 16th positional argument and converting it from the "TRUE"/"FALSE"
# string passed on the command line into a proper R logical
if ( ! concatenate_reads_only %in% c("TRUE", "FALSE") ) {
    stop("The sixteenth positional argument needs to be 'TRUE' or 'FALSE' for whether or not the mergePairs function of dada2 should just concatenate the reads on this dataset, see top of R script and config.yaml for more info.", call.=FALSE)
} else {
    # BUG FIX: this previously re-converted GL_trimmed_primers (a copy-paste from the
    # check above), leaving concatenate_reads_only as a character string downstream
    concatenate_reads_only <- as.logical(concatenate_reads_only)
}
Expand Down Expand Up @@ -92,8 +99,17 @@ reverse_errors <- learnErrors(reverse_filtered_reads, multithread=TRUE)
forward_seqs <- dada(forward_filtered_reads, err=forward_errors, pool="pseudo", multithread=TRUE)
reverse_seqs <- dada(reverse_filtered_reads, err=reverse_errors, pool="pseudo", multithread=TRUE)

# merging forward and reverse reads
merged_contigs <- mergePairs(forward_seqs, forward_filtered_reads, reverse_seqs, reverse_filtered_reads, verbose=TRUE)
# merging forward and reverse reads (just concatenating if that was specified)
# NOTE(review): concatenate_reads_only may still hold the "TRUE"/"FALSE" string from
# the command line here; R's if() coerces those strings to logicals, so this branch
# behaves the same either way — confirm it was converted with as.logical() upstream
if ( concatenate_reads_only ) {

    # end-to-end join of each read pair rather than overlap-based merging
    # (presumably dada2 inserts its default N spacer between the reads — TODO confirm)
    merged_contigs <- mergePairs(forward_seqs, forward_filtered_reads, reverse_seqs, reverse_filtered_reads, verbose=TRUE, justConcatenate=TRUE)

} else {

    # standard overlap-based merging of each denoised read pair
    merged_contigs <- mergePairs(forward_seqs, forward_filtered_reads, reverse_seqs, reverse_filtered_reads, verbose=TRUE)

}


# generating a sequence table that holds the counts of each sequence per sample
seqtab <- makeSequenceTable(merged_contigs)
Expand Down Expand Up @@ -159,10 +175,19 @@ if ( target_region == "16S" ) {
# removing downloaded file
file.remove("UNITE_v2020_February2020.RData")

ranks <- c("kingdom", "phylum", "class", "order", "family", "genus", "species")
ranks <- c("domain", "kingdom", "phylum", "class", "order", "family", "genus", "species")

} else if (target_region == "18S" ) {

} else {
download.file("http://www2.decipher.codes/Classification/TrainingSets/PR2_v4_13_March2021.RData", "PR2_v4_13_March2021.RData")
# loading reference taxonomy object
load("PR2_v4_13_March2021.RData")
# removing downloaded file
file.remove("PR2_v4_13_March2021.RData")

ranks <- c("kingdom", "division", "phylum", "class", "order", "family", "genus", "species")

} else {
cat("\n\n The requested target_region of ", target_region, " is not accepted.\n\n")
quit(status = 1)
}
Expand Down Expand Up @@ -201,7 +226,7 @@ asv_tab <- data.frame("ASV_ID"=asv_ids, asv_tab, check.names=FALSE)
write.table(asv_tab, paste0(final_outputs_dir, output_prefix, "counts.tsv"), sep="\t", quote=F, row.names=FALSE)

# making and writing out a taxonomy table:
# vector of desired ranks was created above in ITS/16S target_region if statement
# vector of desired ranks was created above in ITS/16S/18S target_region if statement

# creating table of taxonomy and setting any that are unclassified as "NA"
tax_tab <- t(sapply(tax_info, function(x) {
Expand All @@ -214,13 +239,18 @@ tax_tab <- t(sapply(tax_info, function(x) {
colnames(tax_tab) <- ranks
row.names(tax_tab) <- NULL

# need to add a column for domain if this is ITS
if (target_region == "ITS" ) {
# need to add a column for domain if this is ITS (due to how the reference taxonomy object is structured)
if ( target_region == "ITS" ) {
tax_tab <- data.frame("ASV_ID"=asv_ids, "domain"="Eukarya", tax_tab, check.names=FALSE)
} else {
tax_tab <- data.frame("ASV_ID"=asv_ids, tax_tab, check.names=FALSE)
}

# need to change "kingdom" to "domain" if this is 18S (due to how the reference taxonomy object is structured)
if ( target_region == "18S" ) {
colnames(tax_tab)[colnames(tax_tab) == "kingdom"] <- "domain"
}

write.table(tax_tab, paste0(final_outputs_dir, output_prefix, "taxonomy.tsv"), sep = "\t", quote=F, row.names=FALSE)

### generating and writing out biom file format ###
Expand Down
Loading