From 97bd71db5ec682854bab9164bfc6ece20a57cdb9 Mon Sep 17 00:00:00 2001 From: AstrobioMike Date: Mon, 13 Nov 2023 22:37:10 -0800 Subject: [PATCH 1/3] updating amplicon workflow to version 1.1.0 - 18S option added - will use the PR2 database provided by the DECIPHER developers from here: http://www2.decipher.codes/Downloads.html - option to just concatenate reads instead of merging them has been added, which can be useful in scenarios like when primers such as 515-926 were used and captured 18S sequences that are not expected to overlap --- .../SW_AmpIllumina-A/workflow_code/Snakefile | 12 +- .../workflow_code/config.yaml | 14 +- .../scripts/Illumina-PE-R-processing.R | 48 ++++- .../scripts/Illumina-R-processing.R | 204 ------------------ .../scripts/Illumina-SE-R-processing.R | 15 ++ 5 files changed, 74 insertions(+), 219 deletions(-) delete mode 100644 Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A/workflow_code/scripts/Illumina-R-processing.R diff --git a/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A/workflow_code/Snakefile b/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A/workflow_code/Snakefile index a00f59ef7..78c9398d2 100644 --- a/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A/workflow_code/Snakefile +++ b/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A/workflow_code/Snakefile @@ -1,7 +1,7 @@ ############################################################################################ ## Snakefile for GeneLab's Illumina amplicon workflow ## ## Developed by Michael D. 
Lee (Mike.Lee@nasa.gov) ## -## Version 1.0.1 ## +## Version 1.1.0 ## ############################################################################################ import os @@ -145,14 +145,15 @@ if config["data_type"] == "PE": filtered_R2_suffix = config["filtered_R2_suffix"], final_outputs_dir = config["final_outputs_dir"], target_region = config["target_region"], - output_prefix = config["output_prefix"] + output_prefix = config["output_prefix"], + concatenate_reads_only = config["concatenate_reads_only"] log: "R-processing.log" benchmark: "benchmarks/run_R-benchmarks.tsv" shell: """ - Rscript scripts/Illumina-PE-R-processing.R "{params.left_trunc}" "{params.right_trunc}" "{params.left_maxEE}" "{params.right_maxEE}" "{params.trim_primers}" "{sample_IDs_file}" "{params.trimmed_reads_dir}" "{params.filtered_reads_dir}" "{params.primer_trimmed_R1_suffix}" "{params.primer_trimmed_R2_suffix}" "{params.filtered_R1_suffix}" "{params.filtered_R2_suffix}" "{params.final_outputs_dir}" "{params.output_prefix}" "{params.target_region}" > {log} 2>&1 + Rscript scripts/Illumina-PE-R-processing.R "{params.left_trunc}" "{params.right_trunc}" "{params.left_maxEE}" "{params.right_maxEE}" "{params.trim_primers}" "{sample_IDs_file}" "{params.trimmed_reads_dir}" "{params.filtered_reads_dir}" "{params.primer_trimmed_R1_suffix}" "{params.primer_trimmed_R2_suffix}" "{params.filtered_R1_suffix}" "{params.filtered_R2_suffix}" "{params.final_outputs_dir}" "{params.output_prefix}" "{params.target_region}" "{params.concatenate_reads_only}" > {log} 2>&1 """ # if we did not trim the primers @@ -187,14 +188,15 @@ if config["data_type"] == "PE": filtered_R2_suffix = config["filtered_R2_suffix"], final_outputs_dir = config["final_outputs_dir"], target_region = config["target_region"], - output_prefix = config["output_prefix"] + output_prefix = config["output_prefix"], + concatenate_reads_only = config["concatenate_reads_only"] log: "R-processing.log" benchmark: "benchmarks/run_R-benchmarks.tsv" 
shell: """ - Rscript scripts/Illumina-PE-R-processing.R "{params.left_trunc}" "{params.right_trunc}" "{params.left_maxEE}" "{params.right_maxEE}" "{params.trim_primers}" "{sample_IDs_file}" "{params.raw_reads_dir}" "{params.filtered_reads_dir}" "{params.raw_R1_suffix}" "{params.raw_R2_suffix}" "{params.filtered_R1_suffix}" "{params.filtered_R2_suffix}" "{params.final_outputs_dir}" "{params.output_prefix}" "{params.target_region}" > {log} 2>&1 + Rscript scripts/Illumina-PE-R-processing.R "{params.left_trunc}" "{params.right_trunc}" "{params.left_maxEE}" "{params.right_maxEE}" "{params.trim_primers}" "{sample_IDs_file}" "{params.raw_reads_dir}" "{params.filtered_reads_dir}" "{params.raw_R1_suffix}" "{params.raw_R2_suffix}" "{params.filtered_R1_suffix}" "{params.filtered_R2_suffix}" "{params.final_outputs_dir}" "{params.output_prefix}" "{params.target_region}" "{params.concatenate_reads_only}" > {log} 2>&1 """ diff --git a/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A/workflow_code/config.yaml b/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A/workflow_code/config.yaml index 920497129..056826b32 100644 --- a/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A/workflow_code/config.yaml +++ b/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A/workflow_code/config.yaml @@ -57,10 +57,22 @@ R_linked_primer: discard_untrimmed: "TRUE" -## target region (16S or ITS acceptable; determines which reference database is used for taxonomic classification) +## target region (16S, 18S, or ITS is acceptable) + # this determines which reference database is used for taxonomic classification + # all are pulled from the pre-packaged DECIPHER downloads page here: http://www2.decipher.codes/Downloads.html + # 16S uses SILVA + # ITS uses UNITE + # 18S uses PR2 target_region: "16S" +## concatenate only with dada2 instead of merging paired reads if TRUE + # this is typically used with primers like 515-926, that captured 18S fragments that are typically too 
long to merge + # note that 16S and 18S should have been separated already prior to running this workflow + # this should likely be left as FALSE for any option other than "18S" above +concatenate_reads_only: + "FALSE" + ## values to be passed to dada2's filterAndTrim() function: left_trunc: 0 diff --git a/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A/workflow_code/scripts/Illumina-PE-R-processing.R b/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A/workflow_code/scripts/Illumina-PE-R-processing.R index 1f539f299..d419d0500 100644 --- a/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A/workflow_code/scripts/Illumina-PE-R-processing.R +++ b/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A/workflow_code/scripts/Illumina-PE-R-processing.R @@ -3,7 +3,7 @@ ## Developed by Michael D. Lee (Mike.Lee@nasa.gov) ## ################################################################################## -# as called from the associated Snakefile, this expects to be run as: Rscript full-R-processing.R +# as called from the associated Snakefile, this expects to be run as: Rscript full-R-processing.R # where and are the values to be passed to the truncLen parameter of dada2's filterAndTrim() # and and are the values to be passed to the maxEE parameter of dada2's filterAndTrim() @@ -29,6 +29,7 @@ if (length(args) < 15) { suppressWarnings(final_outputs_dir <- args[13]) suppressWarnings(output_prefix <- args[14]) suppressWarnings(target_region <- args[15]) + suppressWarnings(concatenate_reads_only <- args[16]) } @@ -37,7 +38,13 @@ if ( is.na(left_trunc) || is.na(right_trunc) || is.na(left_maxEE) || is.na(right } if ( ! 
GL_trimmed_primers %in% c("TRUE", "FALSE") ) { - stop("The fifth positional argument needs to be 'TRUE' or 'FALSE' for whether or not GL trimmed primers on this dataset, see top of R script for more info.", call.=FALSE) + stop("The fifth positional argument needs to be 'TRUE' or 'FALSE' for whether or not GL trimmed primers on this dataset, see top of R script and config.yaml for more info.", call.=FALSE) +} else { + GL_trimmed_primers <- as.logical(GL_trimmed_primers) +} + +if ( ! concatenate_reads_only %in% c("TRUE", "FALSE") ) { + stop("The sixteenth positional argument needs to be 'TRUE' or 'FALSE' for whether or not the mergePairs function of dada2 should just concatenate the reads on this dataset, see top of R script and config.yaml for more info.", call.=FALSE) -} else { - GL_trimmed_primers <- as.logical(GL_trimmed_primers) -} +} else { + concatenate_reads_only <- as.logical(concatenate_reads_only) +} @@ -92,8 +99,17 @@ reverse_errors <- learnErrors(reverse_filtered_reads, multithread=TRUE) forward_seqs <- dada(forward_filtered_reads, err=forward_errors, pool="pseudo", multithread=TRUE) reverse_seqs <- dada(reverse_filtered_reads, err=reverse_errors, pool="pseudo", multithread=TRUE) - # merging forward and reverse reads -merged_contigs <- mergePairs(forward_seqs, forward_filtered_reads, reverse_seqs, reverse_filtered_reads, verbose=TRUE) + # merging forward and reverse reads (just concatenating if that was specified) +if ( concatenate_reads_only ) { + + merged_contigs <- mergePairs(forward_seqs, forward_filtered_reads, reverse_seqs, reverse_filtered_reads, verbose=TRUE, justConcatenate=TRUE) + +} else { + + merged_contigs <- mergePairs(forward_seqs, forward_filtered_reads, reverse_seqs, reverse_filtered_reads, verbose=TRUE) + +} + # generating a sequence table that holds the counts of each sequence per sample seqtab <- makeSequenceTable(merged_contigs) @@ -159,10 +175,19 @@ if ( target_region == "16S" ) { # removing downloaded file file.remove("UNITE_v2020_February2020.RData") - ranks <- c("kingdom", "phylum", "class", "order", 
"family", "genus", "species") + ranks <- c("domain", "kingdom", "phylum", "class", "order", "family", "genus", "species") + +} else if (target_region == "18S" ) { -} else { + download.file("http://www2.decipher.codes/Classification/TrainingSets/PR2_v4_13_March2021.RData", "PR2_v4_13_March2021.RData") + # loading reference taxonomy object + load("PR2_v4_13_March2021.RData") + # removing downloaded file + file.remove("PR2_v4_13_March2021.RData") + ranks <- c("kingdom", "division", "phylum", "class", "order", "family", "genus", "species") + +} else { cat("\n\n The requested target_region of ", target_region, " is not accepted.\n\n") quit(status = 1) } @@ -201,7 +226,7 @@ asv_tab <- data.frame("ASV_ID"=asv_ids, asv_tab, check.names=FALSE) write.table(asv_tab, paste0(final_outputs_dir, output_prefix, "counts.tsv"), sep="\t", quote=F, row.names=FALSE) # making and writing out a taxonomy table: - # vector of desired ranks was created above in ITS/16S target_region if statement + # vector of desired ranks was created above in ITS/16S/18S target_region if statement # creating table of taxonomy and setting any that are unclassified as "NA" tax_tab <- t(sapply(tax_info, function(x) { @@ -214,13 +239,18 @@ tax_tab <- t(sapply(tax_info, function(x) { colnames(tax_tab) <- ranks row.names(tax_tab) <- NULL -# need to add a column for domain if this is ITS -if (target_region == "ITS" ) { +# need to add a column for domain if this is ITS (due to how the reference taxonomy object is structured) +if ( target_region == "ITS" ) { tax_tab <- data.frame("ASV_ID"=asv_ids, "domain"="Eukarya", tax_tab, check.names=FALSE) } else { tax_tab <- data.frame("ASV_ID"=asv_ids, tax_tab, check.names=FALSE) } +# need to change "kingdom" to "domain" if this is 18S (due to how the reference taxonomy object is structured) +if ( target_region == "18S" ) { + colnames(tax_tab)[colnames(tax_tab) == "kingdom"] <- "domain" +} + write.table(tax_tab, paste0(final_outputs_dir, output_prefix, "taxonomy.tsv"), sep = 
"\t", quote=F, row.names=FALSE) ### generating and writing out biom file format ### diff --git a/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A/workflow_code/scripts/Illumina-R-processing.R b/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A/workflow_code/scripts/Illumina-R-processing.R deleted file mode 100644 index 47b29c201..000000000 --- a/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A/workflow_code/scripts/Illumina-R-processing.R +++ /dev/null @@ -1,204 +0,0 @@ -################################################################################## -## R processing script for Illumina amplicon data ## -## Developed by Michael D. Lee (Mike.Lee@nasa.gov) ## -################################################################################## - -# as called from the associated Snakefile, this expects to be run as: Rscript full-R-processing.R - # where and are the values to be passed to the truncLen parameter of dada2's filterAndTrim() - # and and are the values to be passed to the maxEE parameter of dada2's filterAndTrim() - -# checking at least 8 arguments provided, first 4 are integers, and setting variables used within R: -args <- commandArgs(trailingOnly = TRUE) - -if (length(args) < 15) { - stop("At least 15 positional arguments are required, see top of this R script for more info.", call.=FALSE) -} else { - suppressWarnings(left_trunc <- as.integer(args[1])) - suppressWarnings(right_trunc <- as.integer(args[2])) - suppressWarnings(left_maxEE <- as.integer(args[3])) - suppressWarnings(right_maxEE <- as.integer(args[4])) - - suppressWarnings(GL_trimmed_primers <- args[5]) - suppressWarnings(sample_IDs_file <- args[6]) - suppressWarnings(input_reads_dir <- args[7]) - suppressWarnings(filtered_reads_dir <- args[8]) - suppressWarnings(input_file_R1_suffix <- args[9]) - suppressWarnings(input_file_R2_suffix <- args[10]) - suppressWarnings(filtered_filename_R1_suffix <- args[11]) - suppressWarnings(filtered_filename_R2_suffix <- args[12]) - 
suppressWarnings(final_outputs_dir <- args[13]) - suppressWarnings(output_prefix <- args[14]) - suppressWarnings(target_region <- args[15]) - -} - -if ( is.na(left_trunc) || is.na(right_trunc) || is.na(left_maxEE) || is.na(right_maxEE) ) { - stop("All 4 first arguments must be integers, see top of R script for more info.", call.=FALSE) -} - -if ( ! GL_trimmed_primers %in% c("TRUE", "FALSE") ) { - stop("The fifth positional argument needs to be 'TRUE' or 'FALSE' for whether or not GL trimmed primers on this dataset, see top of R script for more info.", call.=FALSE) -} else { - GL_trimmed_primers <- as.logical(GL_trimmed_primers) -} - -# general procedure comes largely from these sources: - # https://benjjneb.github.io/dada2/tutorial.html - # https://astrobiomike.github.io/amplicon/dada2_workflow_ex - - # loading libraries -library(dada2); packageVersion("dada2") # 1.12.1 -library(DECIPHER); packageVersion("DECIPHER") # 2.12.0 -library(biomformat); packageVersion("biomformat") # 1.12.0 - - ### general processing ### - # reading in unique sample names into variable -sample.names <- scan(sample_IDs_file, what="character") - - # setting variables holding the paths to the forward and reverse primer-trimmed reads (or "raw" if primers were trimmed prior to submission to GeneLab) -forward_reads <- paste0(input_reads_dir, sample.names, input_file_R1_suffix) -reverse_reads <- paste0(input_reads_dir, sample.names, input_file_R2_suffix) - - # setting variables holding what will be the output paths of all forward and reverse filtered reads -forward_filtered_reads <- paste0(filtered_reads_dir, sample.names, filtered_filename_R1_suffix) -reverse_filtered_reads <- paste0(filtered_reads_dir, sample.names, filtered_filename_R2_suffix) - - # adding sample names to the vectors holding the filtered-reads' paths -names(forward_filtered_reads) <- sample.names -names(reverse_filtered_reads) <- sample.names - - # running filering step - # reads are written to the files specified in the 
variables, the "filtered_out" object holds the summary results within R -filtered_out <- filterAndTrim(fwd=forward_reads, forward_filtered_reads, reverse_reads, reverse_filtered_reads, truncLen=c(left_trunc,right_trunc), maxN=0, maxEE=c(left_maxEE,right_maxEE), truncQ=2, rm.phix=TRUE, compress=TRUE, multithread=TRUE) - - # making and writing out summary table that includes counts of filtered reads -if ( GL_trimmed_primers ) { - - filtered_count_summary_tab <- data.frame(sample=sample.names, cutadapt_trimmed=filtered_out[,1], dada2_filtered=filtered_out[,2]) - -} else { - - filtered_count_summary_tab <- data.frame(sample=sample.names, starting_reads=filtered_out[,1], dada2_filtered=filtered_out[,2]) - -} - -write.table(filtered_count_summary_tab, paste0(filtered_reads_dir, output_prefix, "filtered-read-counts.tsv"), sep="\t", quote=F, row.names=F) - - # learning errors step -forward_errors <- learnErrors(forward_filtered_reads, multithread=TRUE) -reverse_errors <- learnErrors(reverse_filtered_reads, multithread=TRUE) - - # inferring sequences -forward_seqs <- dada(forward_filtered_reads, err=forward_errors, pool="pseudo", multithread=TRUE) -reverse_seqs <- dada(reverse_filtered_reads, err=reverse_errors, pool="pseudo", multithread=TRUE) - - # merging forward and reverse reads -merged_contigs <- mergePairs(forward_seqs, forward_filtered_reads, reverse_seqs, reverse_filtered_reads, verbose=TRUE) - - # generating a sequence table that holds the counts of each sequence per sample -seqtab <- makeSequenceTable(merged_contigs) - - # removing putative chimeras -seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus", multithread=TRUE, verbose=TRUE) - - # checking what percentage of sequences were retained after chimera removal -sum(seqtab.nochim)/sum(seqtab) * 100 - - # making and writing out a summary table that includes read counts at all steps - # helper function -getN <- function(x) sum(getUniques(x)) - -if ( GL_trimmed_primers ) { - - raw_and_trimmed_read_counts 
<- read.table(paste0(input_reads_dir, output_prefix, "trimmed-read-counts.tsv"), header=T, sep="\t") - # reading in filtered read counts - filtered_read_counts <- read.table(paste0(filtered_reads_dir, output_prefix, "filtered-read-counts.tsv"), header=T, sep="\t") - - count_summary_tab <- data.frame(raw_and_trimmed_read_counts, dada2_filtered=filtered_read_counts[,3], - dada2_denoised_F=sapply(forward_seqs, getN), - dada2_denoised_R=sapply(reverse_seqs, getN), - dada2_merged=rowSums(seqtab), - dada2_chimera_removed=rowSums(seqtab.nochim), - final_perc_reads_retained=round(rowSums(seqtab.nochim)/raw_and_trimmed_read_counts$raw_reads * 100, 1), - row.names=NULL) - -} else { - - count_summary_tab <- data.frame(filtered_count_summary_tab, - dada2_denoised_F=sapply(forward_seqs, getN), - dada2_denoised_R=sapply(reverse_seqs, getN), - dada2_merged=rowSums(seqtab), - dada2_chimera_removed=rowSums(seqtab.nochim), - final_perc_reads_retained=round(rowSums(seqtab.nochim)/filtered_count_summary_tab$starting_reads * 100, 1), - row.names=NULL) - -} - -write.table(count_summary_tab, paste0(final_outputs_dir, output_prefix, "read-count-tracking.tsv"), sep = "\t", quote=F, row.names=F) - - ### assigning taxonomy ### - # creating a DNAStringSet object from the ASVs -dna <- DNAStringSet(getSequences(seqtab.nochim)) - - # downloading reference R taxonomy object (at some point this will be stored somewhere on GeneLab's server and we won't download it, but should leave the code here, just commented out) -cat("\n\n Downloading reference database...\n\n") -download.file("http://www2.decipher.codes/Classification/TrainingSets/SILVA_SSU_r138_2019.RData", "SILVA_SSU_r138_2019.RData") - # loading reference taxonomy object -load("SILVA_SSU_r138_2019.RData") - # removing downloaded file -file.remove("SILVA_SSU_r138_2019.RData") - - # classifying -cat("\n\n Assigning taxonomy...\n\n") -tax_info <- IdTaxa(dna, trainingSet, strand="both", processors=NULL) - - ### generating and writing out 
standard outputs ### - # giving our sequences more manageable names (e.g. ASV_1, ASV_2..., rather than the sequence itself) -asv_seqs <- colnames(seqtab.nochim) -asv_headers <- vector(dim(seqtab.nochim)[2], mode="character") - -for (i in 1:dim(seqtab.nochim)[2]) { - asv_headers[i] <- paste(">ASV", i, sep="_") -} - -cat("\n\n Making and writing outputs...\n\n") - # making and writing out a fasta of our final ASV seqs: -asv_fasta <- c(rbind(asv_headers, asv_seqs)) -write(asv_fasta, paste0(final_outputs_dir, output_prefix, "ASVs.fasta")) - - # making and writing out a count table: -asv_tab <- t(seqtab.nochim) -asv_ids <- sub(">", "", asv_headers) -row.names(asv_tab) <- NULL -asv_tab <- data.frame("ASV_ID"=asv_ids, asv_tab, check.names=FALSE) - -write.table(asv_tab, paste0(final_outputs_dir, output_prefix, "counts.tsv"), sep="\t", quote=F, row.names=FALSE) - - # making and writing out a taxonomy table: - # creating vector of desired ranks -ranks <- c("domain", "phylum", "class", "order", "family", "genus", "species") - - # creating table of taxonomy and setting any that are unclassified as "NA" -tax_tab <- t(sapply(tax_info, function(x) { - m <- match(ranks, x$rank) - taxa <- x$taxon[m] - taxa[startsWith(taxa, "unclassified_")] <- NA - taxa -})) - -colnames(tax_tab) <- ranks -row.names(tax_tab) <- NULL -tax_tab <- data.frame("ASV_ID"=asv_ids, tax_tab, check.names=FALSE) - -write.table(tax_tab, paste0(final_outputs_dir, output_prefix, "taxonomy.tsv"), sep = "\t", quote=F, row.names=FALSE) - - ### generating and writing out biom file format ### -biom_object <- make_biom(data=asv_tab, observation_metadata=tax_tab) -write_biom(biom_object, paste0(final_outputs_dir, output_prefix, "taxonomy-and-counts.biom")) - - # making a tsv of combined tax and counts -tax_and_count_tab <- merge(tax_tab, asv_tab) -write.table(tax_and_count_tab, paste0(final_outputs_dir, output_prefix, "taxonomy-and-counts.tsv"), sep="\t", quote=FALSE, row.names=FALSE) - -cat("\n\n Session info:\n\n") 
-sessionInfo() diff --git a/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A/workflow_code/scripts/Illumina-SE-R-processing.R b/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A/workflow_code/scripts/Illumina-SE-R-processing.R index 28aed8292..d53629b5c 100644 --- a/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A/workflow_code/scripts/Illumina-SE-R-processing.R +++ b/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A/workflow_code/scripts/Illumina-SE-R-processing.R @@ -145,6 +145,16 @@ if ( target_region == "16S" ) { ranks <- c("kingdom", "phylum", "class", "order", "family", "genus", "species") +} else if (target_region == "18S" ) { + + download.file("http://www2.decipher.codes/Classification/TrainingSets/PR2_v4_13_March2021.RData", "PR2_v4_13_March2021.RData") + # loading reference taxonomy object + load("PR2_v4_13_March2021.RData") + # removing downloaded file + file.remove("PR2_v4_13_March2021.RData") + + ranks <- c("kingdom", "division", "phylum", "class", "order", "family", "genus", "species") + } else { cat("\n\n The requested target_region of ", target_region, " is not accepted.\n\n") @@ -205,6 +215,11 @@ if (target_region == "ITS" ) { tax_tab <- data.frame("ASV_ID"=asv_ids, tax_tab, check.names=FALSE) } +# need to change "kingdom" to "domain" if this is 18S (due to how the reference taxonomy object is structured) +if ( target_region == "18S" ) { + colnames(tax_tab)[colnames(tax_tab) == "kingdom"] <- "domain" +} + write.table(tax_tab, paste0(final_outputs_dir, output_prefix, "taxonomy.tsv"), sep = "\t", quote=F, row.names=FALSE) ### generating and writing out biom file format ### From 98cb8ddabbf4802d9b77cfcddc65021c789d65b5 Mon Sep 17 00:00:00 2001 From: Mike Lee Date: Mon, 13 Nov 2023 22:40:28 -0800 Subject: [PATCH 2/3] Update amplicon illumina workflow documentation README.md - updating current workflow version listed --- Amplicon/Illumina/Workflow_Documentation/README.md | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/Amplicon/Illumina/Workflow_Documentation/README.md b/Amplicon/Illumina/Workflow_Documentation/README.md index 7fff1f2a7..63dcf8920 100644 --- a/Amplicon/Illumina/Workflow_Documentation/README.md +++ b/Amplicon/Illumina/Workflow_Documentation/README.md @@ -6,7 +6,7 @@ |Pipeline Version|Current Workflow Version (for respective pipeline version)| |:---------------|:---------------------------------------------------------| -|*[GL-DPPD-7104-A.md](../Pipeline_GL-DPPD-7104_Versions/GL-DPPD-7104-A.md)|[1.0.1](SW_AmpIllumina-A)| +|*[GL-DPPD-7104-A.md](../Pipeline_GL-DPPD-7104_Versions/GL-DPPD-7104-A.md)|[1.1.0](SW_AmpIllumina-A)| *Current GeneLab Pipeline/Workflow Implementation From aef7626e51e861c1a92271eb76429d959bd67149 Mon Sep 17 00:00:00 2001 From: Mike Lee Date: Mon, 13 Nov 2023 22:43:16 -0800 Subject: [PATCH 3/3] Updating Amplicon Illumina CHANGELOG.md --- .../Workflow_Documentation/SW_AmpIllumina-A/CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A/CHANGELOG.md b/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A/CHANGELOG.md index 17cfd44f6..5d17373c0 100644 --- a/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A/CHANGELOG.md +++ b/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A/CHANGELOG.md @@ -1,5 +1,10 @@ # Workflow change log +## [1.1.0](https://github.com/nasa/GeneLab_Data_Processing/tree/SW_AmpIllumina-A_1.1.0/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A) +- 18S option added + - will use the PR2 database for taxonomy provided by the DECIPHER developers here: http://www2.decipher.codes/Downloads.html + - option added to be able to just concatenate reads instead of merge them (in dada2), which is useful in scenarios like when primers such as 515-926 are used which can capture 18S sequences that are not expected to overlap + ## 
[1.0.1](https://github.com/nasa/GeneLab_Data_Processing/tree/SW_AmpIllumina-A_1.0.1/Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-A) - documentation links updated in config.yaml to match changes in host site - added config file for multiqc to trim suffixes from sample names and not include paths in output report