From 97bd71db5ec682854bab9164bfc6ece20a57cdb9 Mon Sep 17 00:00:00 2001 From: AstrobioMike Date: Mon, 13 Nov 2023 22:37:10 -0800 Subject: [PATCH 1/3] updating amplicon workflow to version 1.1.0 - 18S option added - will use the PR2 database provided by the DECIPHER developers from here: http://www2.decipher.codes/Downloads.html - option to just concatenate reads instead of merging them has been added, which can be useful in scenarios like when primers such as 515-926 were used and captured 18S sequences that are not expected to overlap --- .../SW_AmpIllumina-A/workflow_code/Snakefile | 12 +- .../workflow_code/config.yaml | 14 +- .../scripts/Illumina-PE-R-processing.R | 48 ++++- .../scripts/Illumina-R-processing.R | 204 ------------------ .../scripts/Illumina-SE-R-processing.R | 15 ++ 5 files changed, 74 insertions(+), 219 deletions(-) delete mode 100644 Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A/workflow_code/scripts/Illumina-R-processing.R diff --git a/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A/workflow_code/Snakefile b/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A/workflow_code/Snakefile index a00f59ef7..78c9398d2 100644 --- a/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A/workflow_code/Snakefile +++ b/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A/workflow_code/Snakefile @@ -1,7 +1,7 @@ ############################################################################################ ## Snakefile for GeneLab's Illumina amplicon workflow ## ## Developed by Michael D. 
Lee (Mike.Lee@nasa.gov) ## -## Version 1.0.1 ## +## Version 1.1.0 ## ############################################################################################ import os @@ -145,14 +145,15 @@ if config["data_type"] == "PE": filtered_R2_suffix = config["filtered_R2_suffix"], final_outputs_dir = config["final_outputs_dir"], target_region = config["target_region"], - output_prefix = config["output_prefix"] + output_prefix = config["output_prefix"], + concatenate_reads_only = config["concatenate_reads_only"] log: "R-processing.log" benchmark: "benchmarks/run_R-benchmarks.tsv" shell: """ - Rscript scripts/Illumina-PE-R-processing.R "{params.left_trunc}" "{params.right_trunc}" "{params.left_maxEE}" "{params.right_maxEE}" "{params.trim_primers}" "{sample_IDs_file}" "{params.trimmed_reads_dir}" "{params.filtered_reads_dir}" "{params.primer_trimmed_R1_suffix}" "{params.primer_trimmed_R2_suffix}" "{params.filtered_R1_suffix}" "{params.filtered_R2_suffix}" "{params.final_outputs_dir}" "{params.output_prefix}" "{params.target_region}" > {log} 2>&1 + Rscript scripts/Illumina-PE-R-processing.R "{params.left_trunc}" "{params.right_trunc}" "{params.left_maxEE}" "{params.right_maxEE}" "{params.trim_primers}" "{sample_IDs_file}" "{params.trimmed_reads_dir}" "{params.filtered_reads_dir}" "{params.primer_trimmed_R1_suffix}" "{params.primer_trimmed_R2_suffix}" "{params.filtered_R1_suffix}" "{params.filtered_R2_suffix}" "{params.final_outputs_dir}" "{params.output_prefix}" "{params.target_region}" "{params.concatenate_reads_only}" > {log} 2>&1 """ # if we did not trim the primers @@ -187,14 +188,15 @@ if config["data_type"] == "PE": filtered_R2_suffix = config["filtered_R2_suffix"], final_outputs_dir = config["final_outputs_dir"], target_region = config["target_region"], - output_prefix = config["output_prefix"] + output_prefix = config["output_prefix"], + concatenate_reads_only = config["concatenate_reads_only"] log: "R-processing.log" benchmark: "benchmarks/run_R-benchmarks.tsv" 
shell: """ - Rscript scripts/Illumina-PE-R-processing.R "{params.left_trunc}" "{params.right_trunc}" "{params.left_maxEE}" "{params.right_maxEE}" "{params.trim_primers}" "{sample_IDs_file}" "{params.raw_reads_dir}" "{params.filtered_reads_dir}" "{params.raw_R1_suffix}" "{params.raw_R2_suffix}" "{params.filtered_R1_suffix}" "{params.filtered_R2_suffix}" "{params.final_outputs_dir}" "{params.output_prefix}" "{params.target_region}" > {log} 2>&1 + Rscript scripts/Illumina-PE-R-processing.R "{params.left_trunc}" "{params.right_trunc}" "{params.left_maxEE}" "{params.right_maxEE}" "{params.trim_primers}" "{sample_IDs_file}" "{params.raw_reads_dir}" "{params.filtered_reads_dir}" "{params.raw_R1_suffix}" "{params.raw_R2_suffix}" "{params.filtered_R1_suffix}" "{params.filtered_R2_suffix}" "{params.final_outputs_dir}" "{params.output_prefix}" "{params.target_region}" "{params.concatenate_reads_only}" > {log} 2>&1 """ diff --git a/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A/workflow_code/config.yaml b/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A/workflow_code/config.yaml index 920497129..056826b32 100644 --- a/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A/workflow_code/config.yaml +++ b/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A/workflow_code/config.yaml @@ -57,10 +57,22 @@ R_linked_primer: discard_untrimmed: "TRUE" -## target region (16S or ITS acceptable; determines which reference database is used for taxonomic classification) +## target region (16S, 18S, or ITS is acceptable) + # this determines which reference database is used for taxonomic classification + # all are pulled from the pre-packaged DECIPHER downloads page here: http://www2.decipher.codes/Downloads.html + # 16S uses SILVA + # ITS uses UNITE + # 18S uses PR2 target_region: "16S" +## concatenate only with dada2 instead of merging paired reads if TRUE + # this is typically used with primers like 515-926, that captured 18S fragments that are typically too 
long to merge + # note that 16S and 18S should have been separated already prior to running this workflow + # this should likely be left as FALSE for any option other than "18S" above +concatenate_reads_only: + "FALSE" + ## values to be passed to dada2's filterAndTrim() function: left_trunc: 0 diff --git a/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A/workflow_code/scripts/Illumina-PE-R-processing.R b/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A/workflow_code/scripts/Illumina-PE-R-processing.R index 1f539f299..d419d0500 100644 --- a/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A/workflow_code/scripts/Illumina-PE-R-processing.R +++ b/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A/workflow_code/scripts/Illumina-PE-R-processing.R @@ -3,7 +3,7 @@ ## Developed by Michael D. Lee (Mike.Lee@nasa.gov) ## ################################################################################## -# as called from the associated Snakefile, this expects to be run as: Rscript full-R-processing.R +# as called from the associated Snakefile, this expects to be run as: Rscript full-R-processing.R # where and are the values to be passed to the truncLen parameter of dada2's filterAndTrim() # and and are the values to be passed to the maxEE parameter of dada2's filterAndTrim() @@ -29,6 +29,7 @@ if (length(args) < 15) { suppressWarnings(final_outputs_dir <- args[13]) suppressWarnings(output_prefix <- args[14]) suppressWarnings(target_region <- args[15]) + suppressWarnings(concatenate_reads_only <- args[16]) } @@ -37,7 +38,13 @@ if ( is.na(left_trunc) || is.na(right_trunc) || is.na(left_maxEE) || is.na(right } if ( ! 
GL_trimmed_primers %in% c("TRUE", "FALSE") ) { - stop("The fifth positional argument needs to be 'TRUE' or 'FALSE' for whether or not GL trimmed primers on this dataset, see top of R script for more info.", call.=FALSE) + stop("The fifth positional argument needs to be 'TRUE' or 'FALSE' for whether or not GL trimmed primers on this dataset, see top of R script and config.yaml for more info.", call.=FALSE) +} else { + GL_trimmed_primers <- as.logical(GL_trimmed_primers) +} + +if ( ! concatenate_reads_only %in% c("TRUE", "FALSE") ) { + stop("The sixteenth positional argument needs to be 'TRUE' or 'FALSE' for whether or not the mergePairs function of dada2 should just concatenate the reads on this dataset, see top of R script and config.yaml for more info.", call.=FALSE) -} else { - GL_trimmed_primers <- as.logical(GL_trimmed_primers) -} +} else { + concatenate_reads_only <- as.logical(concatenate_reads_only) +} @@ -92,8 +99,17 @@ reverse_errors <- learnErrors(reverse_filtered_reads, multithread=TRUE) forward_seqs <- dada(forward_filtered_reads, err=forward_errors, pool="pseudo", multithread=TRUE) reverse_seqs <- dada(reverse_filtered_reads, err=reverse_errors, pool="pseudo", multithread=TRUE) - # merging forward and reverse reads -merged_contigs <- mergePairs(forward_seqs, forward_filtered_reads, reverse_seqs, reverse_filtered_reads, verbose=TRUE) + # merging forward and reverse reads (just concatenating if that was specified) +if ( concatenate_reads_only ) { + + merged_contigs <- mergePairs(forward_seqs, forward_filtered_reads, reverse_seqs, reverse_filtered_reads, verbose=TRUE, justConcatenate=TRUE) + +} else { + + merged_contigs <- mergePairs(forward_seqs, forward_filtered_reads, reverse_seqs, reverse_filtered_reads, verbose=TRUE) + +} + # generating a sequence table that holds the counts of each sequence per sample seqtab <- makeSequenceTable(merged_contigs) @@ -159,10 +175,19 @@ if ( target_region == "16S" ) { # removing downloaded file file.remove("UNITE_v2020_February2020.RData") - ranks <- c("kingdom", "phylum", "class", "order", 
"family", "genus", "species") + ranks <- c("domain", "kingdom", "phylum", "class", "order", "family", "genus", "species") + +} else if (target_region == "18S" ) { -} else { + download.file("http://www2.decipher.codes/Classification/TrainingSets/PR2_v4_13_March2021.RData", "PR2_v4_13_March2021.RData") + # loading reference taxonomy object + load("PR2_v4_13_March2021.RData") + # removing downloaded file + file.remove("PR2_v4_13_March2021.RData") + ranks <- c("kingdom", "division", "phylum", "class", "order", "family", "genus", "species") + +} else { cat("\n\n The requested target_region of ", target_region, " is not accepted.\n\n") quit(status = 1) } @@ -201,7 +226,7 @@ asv_tab <- data.frame("ASV_ID"=asv_ids, asv_tab, check.names=FALSE) write.table(asv_tab, paste0(final_outputs_dir, output_prefix, "counts.tsv"), sep="\t", quote=F, row.names=FALSE) # making and writing out a taxonomy table: - # vector of desired ranks was created above in ITS/16S target_region if statement + # vector of desired ranks was created above in ITS/16S/18S target_region if statement # creating table of taxonomy and setting any that are unclassified as "NA" tax_tab <- t(sapply(tax_info, function(x) { @@ -214,13 +239,18 @@ tax_tab <- t(sapply(tax_info, function(x) { colnames(tax_tab) <- ranks row.names(tax_tab) <- NULL -# need to add a column for domain if this is ITS -if (target_region == "ITS" ) { +# need to add a column for domain if this is ITS (due to how the reference taxonomy object is structured) +if ( target_region == "ITS" ) { tax_tab <- data.frame("ASV_ID"=asv_ids, "domain"="Eukarya", tax_tab, check.names=FALSE) } else { tax_tab <- data.frame("ASV_ID"=asv_ids, tax_tab, check.names=FALSE) } +# need to change "kingdom" to "domain" if this is 18S (due to how the reference taxonomy object is structured) +if ( target_region == "18S" ) { + colnames(tax_tab)[colnames(tax_tab) == "kingdom"] <- "domain" +} + write.table(tax_tab, paste0(final_outputs_dir, output_prefix, "taxonomy.tsv"), sep = 
"\t", quote=F, row.names=FALSE) ### generating and writing out biom file format ### diff --git a/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A/workflow_code/scripts/Illumina-R-processing.R b/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A/workflow_code/scripts/Illumina-R-processing.R deleted file mode 100644 index 47b29c201..000000000 --- a/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A/workflow_code/scripts/Illumina-R-processing.R +++ /dev/null @@ -1,204 +0,0 @@ -################################################################################## -## R processing script for Illumina amplicon data ## -## Developed by Michael D. Lee (Mike.Lee@nasa.gov) ## -################################################################################## - -# as called from the associated Snakefile, this expects to be run as: Rscript full-R-processing.R - # where and are the values to be passed to the truncLen parameter of dada2's filterAndTrim() - # and and are the values to be passed to the maxEE parameter of dada2's filterAndTrim() - -# checking at least 8 arguments provided, first 4 are integers, and setting variables used within R: -args <- commandArgs(trailingOnly = TRUE) - -if (length(args) < 15) { - stop("At least 15 positional arguments are required, see top of this R script for more info.", call.=FALSE) -} else { - suppressWarnings(left_trunc <- as.integer(args[1])) - suppressWarnings(right_trunc <- as.integer(args[2])) - suppressWarnings(left_maxEE <- as.integer(args[3])) - suppressWarnings(right_maxEE <- as.integer(args[4])) - - suppressWarnings(GL_trimmed_primers <- args[5]) - suppressWarnings(sample_IDs_file <- args[6]) - suppressWarnings(input_reads_dir <- args[7]) - suppressWarnings(filtered_reads_dir <- args[8]) - suppressWarnings(input_file_R1_suffix <- args[9]) - suppressWarnings(input_file_R2_suffix <- args[10]) - suppressWarnings(filtered_filename_R1_suffix <- args[11]) - suppressWarnings(filtered_filename_R2_suffix <- args[12]) - 
suppressWarnings(final_outputs_dir <- args[13]) - suppressWarnings(output_prefix <- args[14]) - suppressWarnings(target_region <- args[15]) - -} - -if ( is.na(left_trunc) || is.na(right_trunc) || is.na(left_maxEE) || is.na(right_maxEE) ) { - stop("All 4 first arguments must be integers, see top of R script for more info.", call.=FALSE) -} - -if ( ! GL_trimmed_primers %in% c("TRUE", "FALSE") ) { - stop("The fifth positional argument needs to be 'TRUE' or 'FALSE' for whether or not GL trimmed primers on this dataset, see top of R script for more info.", call.=FALSE) -} else { - GL_trimmed_primers <- as.logical(GL_trimmed_primers) -} - -# general procedure comes largely from these sources: - # https://benjjneb.github.io/dada2/tutorial.html - # https://astrobiomike.github.io/amplicon/dada2_workflow_ex - - # loading libraries -library(dada2); packageVersion("dada2") # 1.12.1 -library(DECIPHER); packageVersion("DECIPHER") # 2.12.0 -library(biomformat); packageVersion("biomformat") # 1.12.0 - - ### general processing ### - # reading in unique sample names into variable -sample.names <- scan(sample_IDs_file, what="character") - - # setting variables holding the paths to the forward and reverse primer-trimmed reads (or "raw" if primers were trimmed prior to submission to GeneLab) -forward_reads <- paste0(input_reads_dir, sample.names, input_file_R1_suffix) -reverse_reads <- paste0(input_reads_dir, sample.names, input_file_R2_suffix) - - # setting variables holding what will be the output paths of all forward and reverse filtered reads -forward_filtered_reads <- paste0(filtered_reads_dir, sample.names, filtered_filename_R1_suffix) -reverse_filtered_reads <- paste0(filtered_reads_dir, sample.names, filtered_filename_R2_suffix) - - # adding sample names to the vectors holding the filtered-reads' paths -names(forward_filtered_reads) <- sample.names -names(reverse_filtered_reads) <- sample.names - - # running filering step - # reads are written to the files specified in the 
variables, the "filtered_out" object holds the summary results within R -filtered_out <- filterAndTrim(fwd=forward_reads, forward_filtered_reads, reverse_reads, reverse_filtered_reads, truncLen=c(left_trunc,right_trunc), maxN=0, maxEE=c(left_maxEE,right_maxEE), truncQ=2, rm.phix=TRUE, compress=TRUE, multithread=TRUE) - - # making and writing out summary table that includes counts of filtered reads -if ( GL_trimmed_primers ) { - - filtered_count_summary_tab <- data.frame(sample=sample.names, cutadapt_trimmed=filtered_out[,1], dada2_filtered=filtered_out[,2]) - -} else { - - filtered_count_summary_tab <- data.frame(sample=sample.names, starting_reads=filtered_out[,1], dada2_filtered=filtered_out[,2]) - -} - -write.table(filtered_count_summary_tab, paste0(filtered_reads_dir, output_prefix, "filtered-read-counts.tsv"), sep="\t", quote=F, row.names=F) - - # learning errors step -forward_errors <- learnErrors(forward_filtered_reads, multithread=TRUE) -reverse_errors <- learnErrors(reverse_filtered_reads, multithread=TRUE) - - # inferring sequences -forward_seqs <- dada(forward_filtered_reads, err=forward_errors, pool="pseudo", multithread=TRUE) -reverse_seqs <- dada(reverse_filtered_reads, err=reverse_errors, pool="pseudo", multithread=TRUE) - - # merging forward and reverse reads -merged_contigs <- mergePairs(forward_seqs, forward_filtered_reads, reverse_seqs, reverse_filtered_reads, verbose=TRUE) - - # generating a sequence table that holds the counts of each sequence per sample -seqtab <- makeSequenceTable(merged_contigs) - - # removing putative chimeras -seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus", multithread=TRUE, verbose=TRUE) - - # checking what percentage of sequences were retained after chimera removal -sum(seqtab.nochim)/sum(seqtab) * 100 - - # making and writing out a summary table that includes read counts at all steps - # helper function -getN <- function(x) sum(getUniques(x)) - -if ( GL_trimmed_primers ) { - - raw_and_trimmed_read_counts 
<- read.table(paste0(input_reads_dir, output_prefix, "trimmed-read-counts.tsv"), header=T, sep="\t") - # reading in filtered read counts - filtered_read_counts <- read.table(paste0(filtered_reads_dir, output_prefix, "filtered-read-counts.tsv"), header=T, sep="\t") - - count_summary_tab <- data.frame(raw_and_trimmed_read_counts, dada2_filtered=filtered_read_counts[,3], - dada2_denoised_F=sapply(forward_seqs, getN), - dada2_denoised_R=sapply(reverse_seqs, getN), - dada2_merged=rowSums(seqtab), - dada2_chimera_removed=rowSums(seqtab.nochim), - final_perc_reads_retained=round(rowSums(seqtab.nochim)/raw_and_trimmed_read_counts$raw_reads * 100, 1), - row.names=NULL) - -} else { - - count_summary_tab <- data.frame(filtered_count_summary_tab, - dada2_denoised_F=sapply(forward_seqs, getN), - dada2_denoised_R=sapply(reverse_seqs, getN), - dada2_merged=rowSums(seqtab), - dada2_chimera_removed=rowSums(seqtab.nochim), - final_perc_reads_retained=round(rowSums(seqtab.nochim)/filtered_count_summary_tab$starting_reads * 100, 1), - row.names=NULL) - -} - -write.table(count_summary_tab, paste0(final_outputs_dir, output_prefix, "read-count-tracking.tsv"), sep = "\t", quote=F, row.names=F) - - ### assigning taxonomy ### - # creating a DNAStringSet object from the ASVs -dna <- DNAStringSet(getSequences(seqtab.nochim)) - - # downloading reference R taxonomy object (at some point this will be stored somewhere on GeneLab's server and we won't download it, but should leave the code here, just commented out) -cat("\n\n Downloading reference database...\n\n") -download.file("http://www2.decipher.codes/Classification/TrainingSets/SILVA_SSU_r138_2019.RData", "SILVA_SSU_r138_2019.RData") - # loading reference taxonomy object -load("SILVA_SSU_r138_2019.RData") - # removing downloaded file -file.remove("SILVA_SSU_r138_2019.RData") - - # classifying -cat("\n\n Assigning taxonomy...\n\n") -tax_info <- IdTaxa(dna, trainingSet, strand="both", processors=NULL) - - ### generating and writing out 
standard outputs ### - # giving our sequences more manageable names (e.g. ASV_1, ASV_2..., rather than the sequence itself) -asv_seqs <- colnames(seqtab.nochim) -asv_headers <- vector(dim(seqtab.nochim)[2], mode="character") - -for (i in 1:dim(seqtab.nochim)[2]) { - asv_headers[i] <- paste(">ASV", i, sep="_") -} - -cat("\n\n Making and writing outputs...\n\n") - # making and writing out a fasta of our final ASV seqs: -asv_fasta <- c(rbind(asv_headers, asv_seqs)) -write(asv_fasta, paste0(final_outputs_dir, output_prefix, "ASVs.fasta")) - - # making and writing out a count table: -asv_tab <- t(seqtab.nochim) -asv_ids <- sub(">", "", asv_headers) -row.names(asv_tab) <- NULL -asv_tab <- data.frame("ASV_ID"=asv_ids, asv_tab, check.names=FALSE) - -write.table(asv_tab, paste0(final_outputs_dir, output_prefix, "counts.tsv"), sep="\t", quote=F, row.names=FALSE) - - # making and writing out a taxonomy table: - # creating vector of desired ranks -ranks <- c("domain", "phylum", "class", "order", "family", "genus", "species") - - # creating table of taxonomy and setting any that are unclassified as "NA" -tax_tab <- t(sapply(tax_info, function(x) { - m <- match(ranks, x$rank) - taxa <- x$taxon[m] - taxa[startsWith(taxa, "unclassified_")] <- NA - taxa -})) - -colnames(tax_tab) <- ranks -row.names(tax_tab) <- NULL -tax_tab <- data.frame("ASV_ID"=asv_ids, tax_tab, check.names=FALSE) - -write.table(tax_tab, paste0(final_outputs_dir, output_prefix, "taxonomy.tsv"), sep = "\t", quote=F, row.names=FALSE) - - ### generating and writing out biom file format ### -biom_object <- make_biom(data=asv_tab, observation_metadata=tax_tab) -write_biom(biom_object, paste0(final_outputs_dir, output_prefix, "taxonomy-and-counts.biom")) - - # making a tsv of combined tax and counts -tax_and_count_tab <- merge(tax_tab, asv_tab) -write.table(tax_and_count_tab, paste0(final_outputs_dir, output_prefix, "taxonomy-and-counts.tsv"), sep="\t", quote=FALSE, row.names=FALSE) - -cat("\n\n Session info:\n\n") 
-sessionInfo() diff --git a/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A/workflow_code/scripts/Illumina-SE-R-processing.R b/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A/workflow_code/scripts/Illumina-SE-R-processing.R index 28aed8292..d53629b5c 100644 --- a/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A/workflow_code/scripts/Illumina-SE-R-processing.R +++ b/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A/workflow_code/scripts/Illumina-SE-R-processing.R @@ -145,6 +145,16 @@ if ( target_region == "16S" ) { ranks <- c("kingdom", "phylum", "class", "order", "family", "genus", "species") +} else if (target_region == "18S" ) { + + download.file("http://www2.decipher.codes/Classification/TrainingSets/PR2_v4_13_March2021.RData", "PR2_v4_13_March2021.RData") + # loading reference taxonomy object + load("PR2_v4_13_March2021.RData") + # removing downloaded file + file.remove("PR2_v4_13_March2021.RData") + + ranks <- c("kingdom", "division", "phylum", "class", "order", "family", "genus", "species") + } else { cat("\n\n The requested target_region of ", target_region, " is not accepted.\n\n") @@ -205,6 +215,11 @@ if (target_region == "ITS" ) { tax_tab <- data.frame("ASV_ID"=asv_ids, tax_tab, check.names=FALSE) } +# need to change "kingdom" to "domain" if this is 18S (due to how the reference taxonomy object is structured) +if ( target_region == "18S" ) { + colnames(tax_tab)[colnames(tax_tab) == "kingdom"] <- "domain" +} + write.table(tax_tab, paste0(final_outputs_dir, output_prefix, "taxonomy.tsv"), sep = "\t", quote=F, row.names=FALSE) ### generating and writing out biom file format ### From 98cb8ddabbf4802d9b77cfcddc65021c789d65b5 Mon Sep 17 00:00:00 2001 From: Mike Lee Date: Mon, 13 Nov 2023 22:40:28 -0800 Subject: [PATCH 2/3] Update amplicon illumina workflow documentation README.md - updating current workflow version listed --- Amplicon/Illumina/Workflow_Documentation/README.md | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/Amplicon/Illumina/Workflow_Documentation/README.md b/Amplicon/Illumina/Workflow_Documentation/README.md index 7fff1f2a7..63dcf8920 100644 --- a/Amplicon/Illumina/Workflow_Documentation/README.md +++ b/Amplicon/Illumina/Workflow_Documentation/README.md @@ -6,7 +6,7 @@ |Pipeline Version|Current Workflow Version (for respective pipeline version)| |:---------------|:---------------------------------------------------------| -|*[GL-DPPD-7104-A.md](../Pipeline_GL-DPPD-7104_Versions/GL-DPPD-7104-A.md)|[1.0.1](SW_AmpIllumina-A)| +|*[GL-DPPD-7104-A.md](../Pipeline_GL-DPPD-7104_Versions/GL-DPPD-7104-A.md)|[1.1.0](SW_AmpIllumina-A)| *Current GeneLab Pipeline/Workflow Implementation From aef7626e51e861c1a92271eb76429d959bd67149 Mon Sep 17 00:00:00 2001 From: Mike Lee Date: Mon, 13 Nov 2023 22:43:16 -0800 Subject: [PATCH 3/3] Updating Amplicon Illumina CHANGELOG.md --- .../Workflow_Documentation/SW_AmpIllumina-A/CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A/CHANGELOG.md b/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A/CHANGELOG.md index 17cfd44f6..5d17373c0 100644 --- a/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A/CHANGELOG.md +++ b/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A/CHANGELOG.md @@ -1,5 +1,10 @@ # Workflow change log +## [1.1.0](https://github.com/nasa/GeneLab_Data_Processing/tree/SW_AmpIllumina-A_1.1.0/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A) +- 18S option added + - will use the PR2 database for taxonomy provided by the DECIPHER developers here: http://www2.decipher.codes/Downloads.html + - option added to be able to just concatenate reads instead of merge them (in dada2), which is useful in scenarios like when primers such as 515-926 are used which can capture 18S sequences that are not expected to overlap + ## 
[1.0.1](https://github.com/nasa/GeneLab_Data_Processing/tree/SW_AmpIllumina-A_1.0.1/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A) - documentation links updated in config.yaml to match changes in host site - added config file for multiqc to trim suffixes from sample names and not include paths in output report