From b26e9a67b4af43474545e7631a58ca58d9e0deed Mon Sep 17 00:00:00 2001
From: Kevin Dolan <Kevin.Dolan@nanoporetech.com>
Date: Tue, 26 Mar 2024 16:13:55 +0000
Subject: [PATCH 1/3] Added --sort-bam to dorado demux.

---
 dorado/cli/demux.cpp                        | 26 +++++++++++++++++----
 dorado/read_pipeline/BarcodeDemuxerNode.cpp |  5 ++--
 dorado/read_pipeline/BarcodeDemuxerNode.h   |  3 ++-
 dorado/utils/hts_file.cpp                   |  8 ++++---
 dorado/utils/hts_file.h                     |  4 +++-
 tests/BarcodeDemuxerNodeTest.cpp            |  2 +-
 6 files changed, 35 insertions(+), 13 deletions(-)
diff --git a/dorado/cli/demux.cpp b/dorado/cli/demux.cpp
index 2d8f36f2..41fbbb28 100644
--- a/dorado/cli/demux.cpp
+++ b/dorado/cli/demux.cpp
@@ -107,7 +107,15 @@ int demuxer(int argc, char* argv[]) {
             .default_value(false)
             .implicit_value(true);
     parser.visible.add_argument("--no-trim")
-            .help("Skip barcode trimming. If option is not chosen, trimming is enabled.")
+            .help("Skip barcode trimming. If this option is not chosen, trimming is enabled. "
+                  "Note that you should use this option if your input data is mapped and you "
+                  "want to preserve the mapping in the output files, as trimming will result "
+                  "in any mapping information from the input file(s) being discarded.")
+            .default_value(false)
+            .implicit_value(true);
+    parser.visible.add_argument("--sort-bam")
+            .help("Sort any BAM output files that contain mapped reads. Using this option "
+                  "requires that the --no-trim option is also set.")
             .default_value(false)
             .implicit_value(true);
     parser.visible.add_argument("--barcode-arrangement")
@@ -133,6 +141,13 @@ int demuxer(int argc, char* argv[]) {
         std::exit(1);
     }
 
+    auto no_trim(parser.visible.get<bool>("--no-trim"));
+    auto sort_bam(parser.visible.get<bool>("--sort-bam"));
+    if (sort_bam && !no_trim) {
+        spdlog::error("If --sort-bam is specified then --no-trim must also be specified.");
+        std::exit(1);
+    }
+
     auto progress_stats_frequency(parser.hidden.get<int>("progress_stats_frequency"));
     if (progress_stats_frequency > 0) {
         utils::EnsureInfoLoggingEnabled(static_cast<dorado::utils::VerboseLogLevel>(verbosity));
@@ -146,7 +161,6 @@ int demuxer(int argc, char* argv[]) {
     auto emit_summary = parser.visible.get<bool>("emit-summary");
     auto threads(parser.visible.get<int>("threads"));
     auto max_reads(parser.visible.get<int>("max-reads"));
-    auto no_trim(parser.visible.get<bool>("--no-trim"));
 
     alignment::AlignmentProcessingItems processing_items{reads, recursive_input, output_dir, true};
     if (!processing_items.initialise()) {
@@ -287,9 +301,11 @@ int demuxer(int argc, char* argv[]) {
 
     // Finalise the files that were created.
     tracker.set_description("Sorting output files");
-    demux_writer_ref.finalise_hts_files([&](size_t progress) {
-        tracker.update_post_processing_progress(static_cast<float>(progress));
-    });
+    demux_writer_ref.finalise_hts_files(
+            [&](size_t progress) {
+                tracker.update_post_processing_progress(static_cast<float>(progress));
+            },
+            sort_bam);
 
     tracker.summarize();
     progress_stats.notify_stats_collector_completed(final_stats);
diff --git a/dorado/read_pipeline/BarcodeDemuxerNode.cpp b/dorado/read_pipeline/BarcodeDemuxerNode.cpp
index 5b82b59b..409564f4 100644
--- a/dorado/read_pipeline/BarcodeDemuxerNode.cpp
+++ b/dorado/read_pipeline/BarcodeDemuxerNode.cpp
@@ -88,7 +88,8 @@ void BarcodeDemuxerNode::set_header(const sam_hdr_t* const header) {
 }
 
 void BarcodeDemuxerNode::finalise_hts_files(
-        const utils::HtsFile::ProgressCallback& progress_callback) {
+        const utils::HtsFile::ProgressCallback& progress_callback,
+        bool sort_if_mapped) {
     const size_t num_files = m_files.size();
     size_t current_file_idx = 0;
     for (auto& [bc, hts_file] : m_files) {
@@ -98,7 +99,7 @@ void BarcodeDemuxerNode::finalise_hts_files(
                     const size_t total_progress = (current_file_idx * 100 + progress) / num_files;
                     progress_callback(total_progress);
                 },
-                m_htslib_threads);
+                m_htslib_threads, sort_if_mapped);
         ++current_file_idx;
     }
 
diff --git a/dorado/read_pipeline/BarcodeDemuxerNode.h b/dorado/read_pipeline/BarcodeDemuxerNode.h
index d259b079..ab5d8157 100644
--- a/dorado/read_pipeline/BarcodeDemuxerNode.h
+++ b/dorado/read_pipeline/BarcodeDemuxerNode.h
@@ -37,7 +37,8 @@ class BarcodeDemuxerNode : public MessageSink {
 
     // Finalisation must occur before destruction of this node.
     // Note that this isn't safe to call until after this node has been terminated.
-    void finalise_hts_files(const utils::HtsFile::ProgressCallback& progress_callback);
+    void finalise_hts_files(const utils::HtsFile::ProgressCallback& progress_callback,
+                            bool sort_bam);
 
 private:
     std::filesystem::path m_output_dir;
diff --git a/dorado/utils/hts_file.cpp b/dorado/utils/hts_file.cpp
index e040201f..16432b88 100644
--- a/dorado/utils/hts_file.cpp
+++ b/dorado/utils/hts_file.cpp
@@ -73,7 +73,9 @@ HtsFile::~HtsFile() {
 // in order to generate a map of sort coordinates to virtual file offsets. we can then jump around in the
 // file to write out the records in the sorted order. finally we can delete the unsorted file.
 // in case an error occurs, the unsorted file is left on disk, so users can recover their data.
-void HtsFile::finalise(const ProgressCallback& progress_callback, int writer_threads) {
+void HtsFile::finalise(const ProgressCallback& progress_callback,
+                       int writer_threads,
+                       bool sort_if_mapped) {
     assert(progress_callback);
 
     // Rough divisions of how far through we are at the start of each section.
@@ -120,7 +122,7 @@ void HtsFile::finalise(const ProgressCallback& progress_callback, int writer_thr
 
         SamHdrPtr in_header(sam_hdr_read(in_file.get()));
         file_is_mapped = (sam_hdr_nref(in_header.get()) > 0);
-        if (file_is_mapped) {
+        if (file_is_mapped && sort_if_mapped) {
             // We only need to sort and index the file if contains mapped reads.
             HtsFilePtr out_file(hts_open(filepath.string().c_str(), "wb"));
             if (bgzf_mt(out_file->fp.bgzf, writer_threads, 128) < 0) {
@@ -177,7 +179,7 @@ void HtsFile::finalise(const ProgressCallback& progress_callback, int writer_thr
         }
     }
 
-    if (file_is_mapped) {
+    if (file_is_mapped && sort_if_mapped) {
         progress_callback(percent_start_indexing);
         if (sam_index_build(filepath.string().c_str(), 0) < 0) {
             spdlog::error("Failed to build index for file {}", filepath.string());
diff --git a/dorado/utils/hts_file.h b/dorado/utils/hts_file.h
index 8243522a..b033e629 100644
--- a/dorado/utils/hts_file.h
+++ b/dorado/utils/hts_file.h
@@ -27,7 +27,9 @@ class HtsFile {
     int write(const bam1_t* record);
 
     bool finalise_is_noop() const { return m_finalise_is_noop; }
-    void finalise(const ProgressCallback& progress_callback, int writer_threads);
+    void finalise(const ProgressCallback& progress_callback,
+                  int writer_threads,
+                  bool sort_if_mapped = true);
 
 private:
     HtsFilePtr m_file;
diff --git a/tests/BarcodeDemuxerNodeTest.cpp b/tests/BarcodeDemuxerNodeTest.cpp
index ab410610..421779a9 100644
--- a/tests/BarcodeDemuxerNodeTest.cpp
+++ b/tests/BarcodeDemuxerNodeTest.cpp
@@ -68,7 +68,7 @@ TEST_CASE("BarcodeDemuxerNode: check correct output files are created", TEST_GRO
 
         pipeline->terminate(DefaultFlushOptions());
 
-        demux_writer_ref.finalise_hts_files([](size_t) { /* noop */ });
+        demux_writer_ref.finalise_hts_files([](size_t) { /* noop */ }, true);
 
         const std::unordered_set<std::string> expected_files = {
                 "bc01.bam", "bc01.bam.bai", "bc02.bam", "bc02.bam.bai", "bc03.bam", "bc03.bam.bai",

From 740290e9b7b7e79d1d21bd9484adb92530a5db77 Mon Sep 17 00:00:00 2001
From: Kevin Dolan <Kevin.Dolan@nanoporetech.com>
Date: Tue, 26 Mar 2024 16:27:29 +0000
Subject: [PATCH 2/3] Updated readme file with documentation on the --no-trim
 and --sort-bam options from the demux app.

---
 README.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/README.md b/README.md
index cce36108..ee13c241 100644
--- a/README.md
+++ b/README.md
@@ -254,6 +254,12 @@ SQK-RPB004_barcode03.bam
 unclassified.bam
 ```
 
+#### Demuxing mapped reads
+
+If the input data files containing mapping data, this information can be preserved in the output files. To do this, you must use the `--no-trim` option. Trimming the barcodes will invalidate any mapping information that may be contained in the input files, and therefore the application will exclude any mapping information if `--no-trim` is not specified.
+
+It is also possible to get the demux application to sort and index any output bam files that contain mapped reads. To enable this use the `--sort-bam` option. If you use this option then you must also use the `--no-trim option`, as trimming will prevent any mapping information from being included in the output files. Index files (.bai extension) will only be created for bam files that contain mapped reads and were sorted. Note that for large datasets sorting the output files may take a few minutes.
+
 #### Using a sample sheet
 
 Dorado is able to use a sample sheet to restrict the barcode classifications to only those present, and to apply aliases to the detected classifications. This is enabled by passing the path to a sample sheet to the `--sample-sheet` argument when using the `basecaller` or `demux` commands. See [here](documentation/SampleSheets.md) for more information.

From b9e669aa3ed311920969291e2398aea9696097bf Mon Sep 17 00:00:00 2001
From: Kevin Dolan <Kevin.Dolan@nanoporetech.com>
Date: Tue, 26 Mar 2024 17:17:21 +0000
Subject: [PATCH 3/3] Fixed some comment typos, and removed default argument.

---
 README.md                 | 4 ++--
 dorado/cli/aligner.cpp    | 2 +-
 dorado/cli/basecaller.cpp | 2 +-
 dorado/cli/duplex.cpp     | 2 +-
 dorado/cli/trim.cpp       | 2 +-
 dorado/utils/hts_file.h   | 2 +-
 tests/BamWriterTest.cpp   | 4 ++--
 7 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index ee13c241..2038154c 100644
--- a/README.md
+++ b/README.md
@@ -256,9 +256,9 @@ unclassified.bam
 
 #### Demuxing mapped reads
 
-If the input data files containing mapping data, this information can be preserved in the output files. To do this, you must use the `--no-trim` option. Trimming the barcodes will invalidate any mapping information that may be contained in the input files, and therefore the application will exclude any mapping information if `--no-trim` is not specified.
+If the input data files contain mapping data, this information can be preserved in the output files. To do this, you must use the `--no-trim` option. Trimming the barcodes will invalidate any mapping information that may be contained in the input files, and therefore the application will exclude any mapping information if `--no-trim` is not specified.
 
-It is also possible to get the demux application to sort and index any output bam files that contain mapped reads. To enable this use the `--sort-bam` option. If you use this option then you must also use the `--no-trim option`, as trimming will prevent any mapping information from being included in the output files. Index files (.bai extension) will only be created for bam files that contain mapped reads and were sorted. Note that for large datasets sorting the output files may take a few minutes.
+It is also possible to get the demux application to sort and index any output bam files that contain mapped reads. To enable this use the `--sort-bam` option. If you use this option then you must also use the `--no-trim` option, as trimming will prevent any mapping information from being included in the output files. Index files (.bai extension) will only be created for bam files that contain mapped reads and were sorted. Note that for large datasets sorting the output files may take a few minutes.
 
 #### Using a sample sheet
 
diff --git a/dorado/cli/aligner.cpp b/dorado/cli/aligner.cpp
index 0679c794..378d76d8 100644
--- a/dorado/cli/aligner.cpp
+++ b/dorado/cli/aligner.cpp
@@ -274,7 +274,7 @@ int aligner(int argc, char* argv[]) {
                 [&](size_t progress) {
                     tracker.update_post_processing_progress(static_cast<float>(progress));
                 },
-                writer_threads);
+                writer_threads, true);
 
         tracker.summarize();
 
diff --git a/dorado/cli/basecaller.cpp b/dorado/cli/basecaller.cpp
index 32e5763e..8c0451d3 100644
--- a/dorado/cli/basecaller.cpp
+++ b/dorado/cli/basecaller.cpp
@@ -288,7 +288,7 @@ void setup(std::vector<std::string> args,
             [&](size_t progress) {
                 tracker.update_post_processing_progress(static_cast<float>(progress));
             },
-            thread_allocations.writer_threads);
+            thread_allocations.writer_threads, true);
 
     // Give the user a nice summary.
     tracker.summarize();
diff --git a/dorado/cli/duplex.cpp b/dorado/cli/duplex.cpp
index 5b875330..8c43a44d 100644
--- a/dorado/cli/duplex.cpp
+++ b/dorado/cli/duplex.cpp
@@ -560,7 +560,7 @@ int duplex(int argc, char* argv[]) {
                 [&](size_t progress) {
                     tracker.update_post_processing_progress(static_cast<float>(progress));
                 },
-                WRITER_THREADS);
+                WRITER_THREADS, true);
 
         tracker.summarize();
         if (!dump_stats_file.empty()) {
diff --git a/dorado/cli/trim.cpp b/dorado/cli/trim.cpp
index f8d97857..83f0215c 100644
--- a/dorado/cli/trim.cpp
+++ b/dorado/cli/trim.cpp
@@ -185,7 +185,7 @@ int trim(int argc, char* argv[]) {
             [&](size_t progress) {
                 tracker.update_post_processing_progress(static_cast<float>(progress));
             },
-            trim_writer_threads);
+            trim_writer_threads, false);
     tracker.summarize();
 
     spdlog::info("> finished adapter/primer trimming");
diff --git a/dorado/utils/hts_file.h b/dorado/utils/hts_file.h
index b033e629..b47966f0 100644
--- a/dorado/utils/hts_file.h
+++ b/dorado/utils/hts_file.h
@@ -29,7 +29,7 @@ class HtsFile {
     bool finalise_is_noop() const { return m_finalise_is_noop; }
     void finalise(const ProgressCallback& progress_callback,
                   int writer_threads,
-                  bool sort_if_mapped = true);
+                  bool sort_if_mapped);
 
 private:
     HtsFilePtr m_file;
diff --git a/tests/BamWriterTest.cpp b/tests/BamWriterTest.cpp
index d0d181dd..a024b099 100644
--- a/tests/BamWriterTest.cpp
+++ b/tests/BamWriterTest.cpp
@@ -44,7 +44,7 @@ class HtsWriterTestsFixture {
         auto& writer_ref = dynamic_cast<HtsWriter&>(pipeline->get_node_ref(writer));
         stats = writer_ref.sample_stats();
 
-        hts_file.finalise([](size_t) { /* noop */ }, num_threads);
+        hts_file.finalise([](size_t) { /* noop */ }, num_threads, true);
     }
 
     stats::NamedStats stats;
@@ -97,7 +97,7 @@ TEST_CASE("HtsWriterTest: Read and write FASTQ with tag", TEST_GROUP) {
         CHECK_THAT(bam_aux2Z(bam_aux_get(reader.record.get(), "st")),
                    Equals("2023-06-22T07:17:48.308+00:00"));
         writer.write(reader.record.get());
-        hts_file.finalise([](size_t) { /* noop */ }, 2);
+        hts_file.finalise([](size_t) { /* noop */ }, 2, false);
     }
 
     // Read temporary file to make sure tags were correctly set.