diff --git a/README.md b/README.md index cce36108..2038154c 100644 --- a/README.md +++ b/README.md @@ -254,6 +254,12 @@ SQK-RPB004_barcode03.bam unclassified.bam ``` +#### Demuxing mapped reads + +If the input data files contain mapping data, this information can be preserved in the output files. To do this, you must use the `--no-trim` option. Trimming the barcodes will invalidate any mapping information that may be contained in the input files, and therefore the application will exclude any mapping information if `--no-trim` is not specified. + +It is also possible to get the demux application to sort and index any output bam files that contain mapped reads. To enable this use the `--sort-bam` option. If you use this option then you must also use the `--no-trim` option, as trimming will prevent any mapping information from being included in the output files. Index files (.bai extension) will only be created for bam files that contain mapped reads and were sorted. Note that for large datasets sorting the output files may take a few minutes. + #### Using a sample sheet Dorado is able to use a sample sheet to restrict the barcode classifications to only those present, and to apply aliases to the detected classifications. This is enabled by passing the path to a sample sheet to the `--sample-sheet` argument when using the `basecaller` or `demux` commands. See [here](documentation/SampleSheets.md) for more information. diff --git a/dorado/cli/aligner.cpp b/dorado/cli/aligner.cpp index 0679c794..378d76d8 100644 --- a/dorado/cli/aligner.cpp +++ b/dorado/cli/aligner.cpp @@ -274,7 +274,7 @@ int aligner(int argc, char* argv[]) { [&](size_t progress) { tracker.update_post_processing_progress(static_cast(progress)); }, - writer_threads); + writer_threads, true); tracker.summarize(); diff --git a/dorado/cli/basecaller.cpp b/dorado/cli/basecaller.cpp index 037824fd..d593789a 100644 --- a/dorado/cli/basecaller.cpp +++ b/dorado/cli/basecaller.cpp @@ -326,7 +326,7 @@ void setup(std::vector args, [&](size_t progress) { tracker.update_post_processing_progress(static_cast(progress)); }, - thread_allocations.writer_threads); + thread_allocations.writer_threads, true); // Give the user a nice summary. tracker.summarize(); diff --git a/dorado/cli/demux.cpp b/dorado/cli/demux.cpp index 2d8f36f2..41fbbb28 100644 --- a/dorado/cli/demux.cpp +++ b/dorado/cli/demux.cpp @@ -107,7 +107,15 @@ int demuxer(int argc, char* argv[]) { .default_value(false) .implicit_value(true); parser.visible.add_argument("--no-trim") - .help("Skip barcode trimming. If option is not chosen, trimming is enabled.") + .help("Skip barcode trimming. If this option is not chosen, trimming is enabled. " + "Note that you should use this option if your input data is mapped and you " + "want to preserve the mapping in the output files, as trimming will result " + "in any mapping information from the input file(s) being discarded.") + .default_value(false) + .implicit_value(true); + parser.visible.add_argument("--sort-bam") + .help("Sort any BAM output files that contain mapped reads. Using this option " + "requires that the --no-trim option is also set.") .default_value(false) .implicit_value(true); parser.visible.add_argument("--barcode-arrangement") @@ -133,6 +141,13 @@ int demuxer(int argc, char* argv[]) { std::exit(1); } + auto no_trim(parser.visible.get("--no-trim")); + auto sort_bam(parser.visible.get("--sort-bam")); + if (sort_bam && !no_trim) { + spdlog::error("If --sort-bam is specified then --no-trim must also be specified."); + std::exit(1); + } + auto progress_stats_frequency(parser.hidden.get("progress_stats_frequency")); if (progress_stats_frequency > 0) { utils::EnsureInfoLoggingEnabled(static_cast(verbosity)); @@ -146,7 +161,6 @@ int demuxer(int argc, char* argv[]) { auto emit_summary = parser.visible.get("emit-summary"); auto threads(parser.visible.get("threads")); auto max_reads(parser.visible.get("max-reads")); - auto no_trim(parser.visible.get("--no-trim")); alignment::AlignmentProcessingItems processing_items{reads, recursive_input, output_dir, true}; if (!processing_items.initialise()) { @@ -287,9 +301,11 @@ int demuxer(int argc, char* argv[]) { // Finalise the files that were created. tracker.set_description("Sorting output files"); - demux_writer_ref.finalise_hts_files([&](size_t progress) { - tracker.update_post_processing_progress(static_cast(progress)); - }); + demux_writer_ref.finalise_hts_files( + [&](size_t progress) { + tracker.update_post_processing_progress(static_cast(progress)); + }, + sort_bam); tracker.summarize(); progress_stats.notify_stats_collector_completed(final_stats); diff --git a/dorado/cli/duplex.cpp b/dorado/cli/duplex.cpp index d8e2473d..27100dc5 100644 --- a/dorado/cli/duplex.cpp +++ b/dorado/cli/duplex.cpp @@ -559,7 +559,7 @@ int duplex(int argc, char* argv[]) { [&](size_t progress) { tracker.update_post_processing_progress(static_cast(progress)); }, - WRITER_THREADS); + WRITER_THREADS, true); tracker.summarize(); if (!dump_stats_file.empty()) { diff --git a/dorado/cli/trim.cpp b/dorado/cli/trim.cpp index f8d97857..83f0215c 100644 --- a/dorado/cli/trim.cpp +++ b/dorado/cli/trim.cpp @@ -185,7 +185,7 @@ int trim(int argc, char* argv[]) { [&](size_t progress) { tracker.update_post_processing_progress(static_cast(progress)); }, - trim_writer_threads); + trim_writer_threads, false); tracker.summarize(); spdlog::info("> finished adapter/primer trimming"); diff --git a/dorado/read_pipeline/BarcodeDemuxerNode.cpp b/dorado/read_pipeline/BarcodeDemuxerNode.cpp index 5b82b59b..409564f4 100644 --- a/dorado/read_pipeline/BarcodeDemuxerNode.cpp +++ b/dorado/read_pipeline/BarcodeDemuxerNode.cpp @@ -88,7 +88,8 @@ void BarcodeDemuxerNode::set_header(const sam_hdr_t* const header) { } void BarcodeDemuxerNode::finalise_hts_files( - const utils::HtsFile::ProgressCallback& progress_callback) { + const utils::HtsFile::ProgressCallback& progress_callback, + bool sort_if_mapped) { const size_t num_files = m_files.size(); size_t current_file_idx = 0; for (auto& [bc, hts_file] : m_files) { @@ -98,7 +99,7 @@ void BarcodeDemuxerNode::finalise_hts_files( const size_t total_progress = (current_file_idx * 100 + progress) / num_files; progress_callback(total_progress); }, - m_htslib_threads); + m_htslib_threads, sort_if_mapped); ++current_file_idx; } diff --git a/dorado/read_pipeline/BarcodeDemuxerNode.h b/dorado/read_pipeline/BarcodeDemuxerNode.h index d259b079..ab5d8157 100644 --- a/dorado/read_pipeline/BarcodeDemuxerNode.h +++ b/dorado/read_pipeline/BarcodeDemuxerNode.h @@ -37,7 +37,8 @@ class BarcodeDemuxerNode : public MessageSink { // Finalisation must occur before destruction of this node. // Note that this isn't safe to call until after this node has been terminated. - void finalise_hts_files(const utils::HtsFile::ProgressCallback& progress_callback); + void finalise_hts_files(const utils::HtsFile::ProgressCallback& progress_callback, + bool sort_bam); private: std::filesystem::path m_output_dir; diff --git a/dorado/utils/hts_file.cpp b/dorado/utils/hts_file.cpp index e040201f..16432b88 100644 --- a/dorado/utils/hts_file.cpp +++ b/dorado/utils/hts_file.cpp @@ -73,7 +73,9 @@ HtsFile::~HtsFile() { // in order to generate a map of sort coordinates to virtual file offsets. we can then jump around in the // file to write out the records in the sorted order. finally we can delete the unsorted file. // in case an error occurs, the unsorted file is left on disk, so users can recover their data. -void HtsFile::finalise(const ProgressCallback& progress_callback, int writer_threads) { +void HtsFile::finalise(const ProgressCallback& progress_callback, + int writer_threads, + bool sort_if_mapped) { assert(progress_callback); // Rough divisions of how far through we are at the start of each section. @@ -120,7 +122,7 @@ void HtsFile::finalise(const ProgressCallback& progress_callback, int writer_thr SamHdrPtr in_header(sam_hdr_read(in_file.get())); file_is_mapped = (sam_hdr_nref(in_header.get()) > 0); - if (file_is_mapped) { + if (file_is_mapped && sort_if_mapped) { // We only need to sort and index the file if contains mapped reads. HtsFilePtr out_file(hts_open(filepath.string().c_str(), "wb")); if (bgzf_mt(out_file->fp.bgzf, writer_threads, 128) < 0) { @@ -177,7 +179,7 @@ void HtsFile::finalise(const ProgressCallback& progress_callback, int writer_thr } } - if (file_is_mapped) { + if (file_is_mapped && sort_if_mapped) { progress_callback(percent_start_indexing); if (sam_index_build(filepath.string().c_str(), 0) < 0) { spdlog::error("Failed to build index for file {}", filepath.string()); diff --git a/dorado/utils/hts_file.h b/dorado/utils/hts_file.h index 8243522a..b47966f0 100644 --- a/dorado/utils/hts_file.h +++ b/dorado/utils/hts_file.h @@ -27,7 +27,9 @@ class HtsFile { int write(const bam1_t* record); bool finalise_is_noop() const { return m_finalise_is_noop; } - void finalise(const ProgressCallback& progress_callback, int writer_threads); + void finalise(const ProgressCallback& progress_callback, + int writer_threads, + bool sort_if_mapped); private: HtsFilePtr m_file; diff --git a/tests/BamWriterTest.cpp b/tests/BamWriterTest.cpp index d0d181dd..a024b099 100644 --- a/tests/BamWriterTest.cpp +++ b/tests/BamWriterTest.cpp @@ -44,7 +44,7 @@ class HtsWriterTestsFixture { auto& writer_ref = dynamic_cast(pipeline->get_node_ref(writer)); stats = writer_ref.sample_stats(); - hts_file.finalise([](size_t) { /* noop */ }, num_threads); + hts_file.finalise([](size_t) { /* noop */ }, num_threads, true); } stats::NamedStats stats; @@ -97,7 +97,7 @@ TEST_CASE("HtsWriterTest: Read and write FASTQ with tag", TEST_GROUP) { CHECK_THAT(bam_aux2Z(bam_aux_get(reader.record.get(), "st")), Equals("2023-06-22T07:17:48.308+00:00")); writer.write(reader.record.get()); - hts_file.finalise([](size_t) { /* noop */ }, 2); + hts_file.finalise([](size_t) { /* noop */ }, 2, false); } // Read temporary file to make sure tags were correctly set. diff --git a/tests/BarcodeDemuxerNodeTest.cpp b/tests/BarcodeDemuxerNodeTest.cpp index ab410610..421779a9 100644 --- a/tests/BarcodeDemuxerNodeTest.cpp +++ b/tests/BarcodeDemuxerNodeTest.cpp @@ -68,7 +68,7 @@ TEST_CASE("BarcodeDemuxerNode: check correct output files are created", TEST_GRO pipeline->terminate(DefaultFlushOptions()); - demux_writer_ref.finalise_hts_files([](size_t) { /* noop */ }); + demux_writer_ref.finalise_hts_files([](size_t) { /* noop */ }, true); const std::unordered_set expected_files = { "bc01.bam", "bc01.bam.bai", "bc02.bam", "bc02.bam.bai", "bc03.bam", "bc03.bam.bai",