From 1446320f7db5352722933f06ce5764879de8f68f Mon Sep 17 00:00:00 2001
From: Ben Lawrence <ben.lawrence@nanoporetech.com>
Date: Thu, 15 Feb 2024 11:41:10 +0000
Subject: [PATCH 01/21] [DOR-526] Factor out 3rdparty libs into their own
 CMakeLists.txt

---
 CMakeLists.txt                 | 32 ++------------------------------
 dorado/3rdparty/CMakeLists.txt | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+), 30 deletions(-)
 create mode 100644 dorado/3rdparty/CMakeLists.txt

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2b389c83..56b07dee 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -116,36 +116,8 @@ if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
     endif()
 endif()
 
-# Bring in spdlog
-add_subdirectory(${DORADO_3RD_PARTY_SOURCE}/spdlog)
-# Avoid namespace clashes with static torch.
-target_compile_definitions(spdlog PUBLIC
-    "FMT_BEGIN_NAMESPACE=namespace fmt { inline namespace ont {"
-    "FMT_END_NAMESPACE=}}"
-)
-
-# ELZIP_DECOMPRESS_ONLY stops minizip from adding OpenSSL as a target, preventing use of three dylibs on osx.
-set(ELZIP_DECOMPRESS_ONLY ON)
-add_subdirectory(${DORADO_3RD_PARTY_SOURCE}/elzip)
-
-option(ONT_MM2_EXE "Build minimap2 exe" OFF)
-add_subdirectory(${DORADO_3RD_PARTY_SOURCE}/ont-minimap2 EXCLUDE_FROM_ALL)
-message(STATUS "build minimap2 exe: ${ONT_MM2_EXE}")
-message(STATUS "build minimap2 lib: ${ONT_MM2_LIB}")
-if (ONT_MM2_EXE)
-    install(TARGETS minimap2_exe)
-    add_custom_target(testing_deps ALL DEPENDS minimap2_exe)
-endif()
-# Disable warnings from minimap source
-disable_warnings(minimap2)
-
-set(BUILD_TESTING OFF)
-add_subdirectory(${DORADO_3RD_PARTY_SOURCE}/edlib EXCLUDE_FROM_ALL)
-# Disable warnings from edlib source
-disable_warnings(edlib)
-set(BUILD_TESTING ON)
-
-add_subdirectory(${DORADO_3RD_PARTY_SOURCE}/date EXCLUDE_FROM_ALL)
+# Bring in 3rdparty libs
+add_subdirectory(dorado/3rdparty)
 
 enable_testing()
 
diff --git a/dorado/3rdparty/CMakeLists.txt b/dorado/3rdparty/CMakeLists.txt
new file mode 100644
index 00000000..a4e2416a
--- /dev/null
+++ b/dorado/3rdparty/CMakeLists.txt
@@ -0,0 +1,33 @@
+# Bring in spdlog
+add_subdirectory(spdlog)
+# Avoid namespace clashes with static torch.
+target_compile_definitions(spdlog PUBLIC
+    "FMT_BEGIN_NAMESPACE=namespace fmt { inline namespace ont {"
+    "FMT_END_NAMESPACE=}}"
+)
+
+# ELZIP_DECOMPRESS_ONLY stops minizip from adding OpenSSL as a target, preventing use of three dylibs on osx.
+set(ELZIP_DECOMPRESS_ONLY ON)
+add_subdirectory(elzip)
+
+# minimap
+option(ONT_MM2_EXE "Build minimap2 exe" OFF)
+add_subdirectory(ont-minimap2 EXCLUDE_FROM_ALL)
+message(STATUS "build minimap2 exe: ${ONT_MM2_EXE}")
+message(STATUS "build minimap2 lib: ${ONT_MM2_LIB}")
+if (ONT_MM2_EXE)
+    install(TARGETS minimap2_exe)
+    add_custom_target(testing_deps ALL DEPENDS minimap2_exe)
+endif()
+# Disable warnings from minimap source
+disable_warnings(minimap2)
+
+# edlib
+set(BUILD_TESTING OFF)
+add_subdirectory(edlib EXCLUDE_FROM_ALL)
+# Disable warnings from edlib source
+disable_warnings(edlib)
+set(BUILD_TESTING ON)
+
+# date
+add_subdirectory(date EXCLUDE_FROM_ALL)

From bf85adb787c39ac7ec7d01510ce38ab55c1bbc20 Mon Sep 17 00:00:00 2001
From: Ben Lawrence <ben.lawrence@nanoporetech.com>
Date: Thu, 15 Feb 2024 11:46:22 +0000
Subject: [PATCH 02/21] [DOR-526] Add missing EXCLUDE_FROM_ALL from 3rdparty
 libs

These libs don't build anything extra, but worth being consistent in
case future versions do.
---
 dorado/3rdparty/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dorado/3rdparty/CMakeLists.txt b/dorado/3rdparty/CMakeLists.txt
index a4e2416a..3ddd5103 100644
--- a/dorado/3rdparty/CMakeLists.txt
+++ b/dorado/3rdparty/CMakeLists.txt
@@ -1,5 +1,5 @@
 # Bring in spdlog
-add_subdirectory(spdlog)
+add_subdirectory(spdlog EXCLUDE_FROM_ALL)
 # Avoid namespace clashes with static torch.
 target_compile_definitions(spdlog PUBLIC
     "FMT_BEGIN_NAMESPACE=namespace fmt { inline namespace ont {"
@@ -8,7 +8,7 @@ target_compile_definitions(spdlog PUBLIC
 
 # ELZIP_DECOMPRESS_ONLY stops minizip from adding OpenSSL as a target, preventing use of three dylibs on osx.
 set(ELZIP_DECOMPRESS_ONLY ON)
-add_subdirectory(elzip)
+add_subdirectory(elzip EXCLUDE_FROM_ALL)
 
 # minimap
 option(ONT_MM2_EXE "Build minimap2 exe" OFF)

From e3ef3edddf9c0ab7c36bc583e2fc26730eead345 Mon Sep 17 00:00:00 2001
From: Ben Lawrence <ben.lawrence@nanoporetech.com>
Date: Thu, 15 Feb 2024 12:18:03 +0000
Subject: [PATCH 03/21] [DOR-526] Neaten the split finder builder code

The indentation was killing me so replace some parts with functionally
identical code that clang-format doesn't try to mess with as much.
---
 dorado/splitter/DuplexReadSplitter.cpp | 107 +++++++++++--------------
 1 file changed, 48 insertions(+), 59 deletions(-)

diff --git a/dorado/splitter/DuplexReadSplitter.cpp b/dorado/splitter/DuplexReadSplitter.cpp
index 3165c654..93e27a3e 100644
--- a/dorado/splitter/DuplexReadSplitter.cpp
+++ b/dorado/splitter/DuplexReadSplitter.cpp
@@ -377,67 +377,56 @@ std::vector<SimplexReadPtr> DuplexReadSplitter::subreads(SimplexReadPtr read,
 std::vector<std::pair<std::string, DuplexReadSplitter::SplitFinderF>>
 DuplexReadSplitter::build_split_finders() const {
     std::vector<std::pair<std::string, SplitFinderF>> split_finders;
-    split_finders.push_back({"PORE_ADAPTER", [&](const ExtRead& read) {
-                                 return filter_ranges(read.possible_pore_regions, [&](PosRange r) {
-                                     return check_nearby_adapter(*read.read, r,
-                                                                 m_settings.adapter_edist);
-                                 });
-                             }});
+    split_finders.emplace_back(std::make_pair("PORE_ADAPTER", [&](const ExtRead& read) {
+        return filter_ranges(read.possible_pore_regions, [&](PosRange r) {
+            return check_nearby_adapter(*read.read, r, m_settings.adapter_edist);
+        });
+    }));
 
     if (!m_settings.simplex_mode) {
-        split_finders.push_back(
-                {"PORE_FLANK", [&](const ExtRead& read) {
-                     return merge_ranges(
-                             filter_ranges(read.possible_pore_regions,
-                                           [&](PosRange r) {
-                                               return check_flank_match(*read.read, r,
-                                                                        m_settings.flank_err);
-                                           }),
-                             m_settings.strand_end_flank + m_settings.strand_start_flank);
-                 }});
-
-        split_finders.push_back(
-                {"PORE_ALL", [&](const ExtRead& read) {
-                     return merge_ranges(
-                             filter_ranges(read.possible_pore_regions,
-                                           [&](PosRange r) {
-                                               return check_nearby_adapter(
-                                                              *read.read, r,
-                                                              m_settings.relaxed_adapter_edist) &&
-                                                      check_flank_match(
-                                                              *read.read, r,
-                                                              m_settings.relaxed_flank_err);
-                                           }),
-                             m_settings.strand_end_flank + m_settings.strand_start_flank);
-                 }});
-
-        split_finders.push_back(
-                {"ADAPTER_FLANK", [&](const ExtRead& read) {
-                     return filter_ranges(
-                             find_adapter_matches(m_settings.adapter, read.read->read_common.seq,
-                                                  m_settings.adapter_edist,
-                                                  m_settings.expect_adapter_prefix),
-                             [&](PosRange r) {
-                                 return check_flank_match(*read.read, {r.first, r.first},
-                                                          m_settings.flank_err);
-                             });
-                 }});
-
-        split_finders.push_back({"ADAPTER_MIDDLE", [&](const ExtRead& read) {
-                                     if (auto split = identify_middle_adapter_split(*read.read)) {
-                                         return PosRanges{*split};
-                                     } else {
-                                         return PosRanges();
-                                     }
-                                 }});
-
-        split_finders.push_back({"SPLIT_MIDDLE", [&](const ExtRead& read) {
-                                     if (auto split = identify_extra_middle_split(*read.read)) {
-                                         return PosRanges{*split};
-                                     } else {
-                                         return PosRanges();
-                                     }
-                                 }});
+        split_finders.emplace_back(std::make_pair("PORE_FLANK", [&](const ExtRead& read) {
+            auto filter = [&](PosRange r) {
+                return check_flank_match(*read.read, r, m_settings.flank_err);
+            };
+            return merge_ranges(filter_ranges(read.possible_pore_regions, filter),
+                                m_settings.strand_end_flank + m_settings.strand_start_flank);
+        }));
+
+        split_finders.emplace_back(std::make_pair("PORE_ALL", [&](const ExtRead& read) {
+            auto filter = [&](PosRange r) {
+                return check_nearby_adapter(*read.read, r, m_settings.relaxed_adapter_edist) &&
+                       check_flank_match(*read.read, r, m_settings.relaxed_flank_err);
+            };
+            return merge_ranges(filter_ranges(read.possible_pore_regions, filter),
+                                m_settings.strand_end_flank + m_settings.strand_start_flank);
+        }));
+
+        split_finders.emplace_back(std::make_pair("ADAPTER_FLANK", [&](const ExtRead& read) {
+            auto filter = [&](PosRange r) {
+                return check_flank_match(*read.read, {r.first, r.first}, m_settings.flank_err);
+            };
+            return filter_ranges(
+                    find_adapter_matches(m_settings.adapter, read.read->read_common.seq,
+                                         m_settings.adapter_edist,
+                                         m_settings.expect_adapter_prefix),
+                    filter);
+        }));
+
+        split_finders.emplace_back(std::make_pair("ADAPTER_MIDDLE", [&](const ExtRead& read) {
+            if (auto split = identify_middle_adapter_split(*read.read)) {
+                return PosRanges{*split};
+            } else {
+                return PosRanges();
+            }
+        }));
+
+        split_finders.emplace_back(std::make_pair("SPLIT_MIDDLE", [&](const ExtRead& read) {
+            if (auto split = identify_extra_middle_split(*read.read)) {
+                return PosRanges{*split};
+            } else {
+                return PosRanges();
+            }
+        }));
     }
 
     return split_finders;

From 2dbfe45025d81df94f8691cdaa1ab1a65f9d1457 Mon Sep 17 00:00:00 2001
From: Ben Lawrence <ben.lawrence@nanoporetech.com>
Date: Wed, 21 Feb 2024 10:56:32 +0000
Subject: [PATCH 04/21] [DOR-526] Add direct port of the notebook code

This is currently limited to a single region per read, which will be
extended in future commits.
---
 dorado/splitter/DuplexReadSplitter.cpp | 86 ++++++++++++++++++++++++++
 dorado/splitter/DuplexReadSplitter.h   |  3 +-
 2 files changed, 88 insertions(+), 1 deletion(-)

diff --git a/dorado/splitter/DuplexReadSplitter.cpp b/dorado/splitter/DuplexReadSplitter.cpp
index 93e27a3e..708a1288 100644
--- a/dorado/splitter/DuplexReadSplitter.cpp
+++ b/dorado/splitter/DuplexReadSplitter.cpp
@@ -2,6 +2,7 @@
 
 #include "read_pipeline/read_utils.h"
 #include "splitter/splitter_utils.h"
+#include "utils/PostCondition.h"
 #include "utils/alignment_utils.h"
 #include "utils/sequence_utils.h"
 #include "utils/uuid_utils.h"
@@ -16,6 +17,7 @@
 #include <iomanip>
 #include <optional>
 #include <string>
+#include <string_view>
 #include <utility>
 
 namespace {
@@ -199,6 +201,87 @@ PosRanges DuplexReadSplitter::possible_pore_regions(const DuplexReadSplitter::Ex
     return pore_regions;
 }
 
+PosRanges DuplexReadSplitter::find_muA_adapter_splits(const ExtRead& read) const {
+    spdlog::trace("Finding muA regions for read {}", read.read->read_common.read_id);
+
+    constexpr std::string_view muA_seq = "GTTTTCGCATTTATCGTGAAACGCTTTCGCGTTTTTCGTGCGCCGCTTCA";
+    constexpr std::int64_t max_muA_edist = 11;
+    constexpr std::int64_t ignore_start = 100;
+    constexpr std::int64_t min_read_len = 200;
+    // 14bp adapter prefix that should work better for click chemistry based on previous investigation
+    constexpr std::string_view adapter_seq = "TACTTCGTTCAGTT";
+    constexpr std::int64_t max_adapter_edist = 5;
+    constexpr std::int64_t max_muA_adapter_dist = 100;
+
+    // Skip if the read is too small.
+    const auto& read_seq = read.read->read_common.seq;
+    if (read_seq.size() < min_read_len) {
+        spdlog::trace("Read too small");
+        return {};
+    }
+
+    auto match_start_range = [&](std::string_view query, std::int64_t min_start,
+                                 std::int64_t max_start,
+                                 std::int64_t max_edist) -> std::optional<PosRange> {
+        const auto max_end_pos =
+                std::min<std::int64_t>(read_seq.size(), max_start + query.size() + max_edist);
+        if (min_start + static_cast<std::int64_t>(query.size()) > max_end_pos) {
+            // Too close to the end.
+            spdlog::trace("Too close to the end");
+            return std::nullopt;
+        }
+
+        // Trim the sequence.
+        std::string_view const seq(read_seq.data() + min_start, max_end_pos - min_start);
+
+        // Search the sequence.
+        static const EdlibEqualityPair additionalEqualities[4] = {
+                {'N', 'A'}, {'N', 'T'}, {'N', 'C'}, {'N', 'G'}};
+        auto edlib_cfg = edlibNewAlignConfig(max_edist, EDLIB_MODE_HW, EDLIB_TASK_LOC,
+                                             additionalEqualities, std::size(additionalEqualities));
+        auto edlib_result =
+                edlibAlign(query.data(), query.size(), seq.data(), seq.size(), edlib_cfg);
+        assert(edlib_result.status == EDLIB_STATUS_OK);
+        auto edlib_cleanup = utils::PostCondition([&] { edlibFreeAlignResult(edlib_result); });
+
+        // Check we got a match.
+        if (edlib_result.status != EDLIB_STATUS_OK || edlib_result.editDistance == -1) {
+            spdlog::trace("No match {} {}", edlib_result.status, edlib_result.editDistance);
+            return std::nullopt;
+        }
+
+        // Build the result.
+        PosRange range;
+        range.first = min_start + edlib_result.startLocations[0];
+        range.second = min_start + edlib_result.endLocations[0] + 1;
+        if (static_cast<std::int64_t>(range.first) > max_start) {
+            spdlog::trace("Past the start");
+            return std::nullopt;
+        }
+        return range;
+    };
+
+    // Search for muA.
+    auto muA_range = match_start_range(muA_seq, ignore_start, read_seq.size(), max_muA_edist);
+    if (!muA_range.has_value()) {
+        spdlog::trace("No muA");
+        return {};
+    }
+
+    // Search for the adapter.
+    const auto adapter_start = std::max(
+            ignore_start, static_cast<std::int64_t>(muA_range->first) - max_muA_adapter_dist);
+    auto adapter_match =
+            match_start_range(adapter_seq, adapter_start, muA_range->first, max_adapter_edist);
+    if (!adapter_match.has_value()) {
+        spdlog::trace("No adapter");
+        return {};
+    }
+
+    spdlog::trace("Detected muA+adapter region in read {}", read.read->read_common.read_id);
+    return {*adapter_match};
+}
+
 bool DuplexReadSplitter::check_nearby_adapter(const SimplexRead& read,
                                               PosRange r,
                                               int adapter_edist) const {
@@ -383,6 +466,9 @@ DuplexReadSplitter::build_split_finders() const {
         });
     }));
 
+    split_finders.emplace_back(std::make_pair(
+            "muA_adapter", [&](const ExtRead& read) { return find_muA_adapter_splits(read); }));
+
     if (!m_settings.simplex_mode) {
         split_finders.emplace_back(std::make_pair("PORE_FLANK", [&](const ExtRead& read) {
             auto filter = [&](PosRange r) {
diff --git a/dorado/splitter/DuplexReadSplitter.h b/dorado/splitter/DuplexReadSplitter.h
index 2b1fe200..28466cf5 100644
--- a/dorado/splitter/DuplexReadSplitter.h
+++ b/dorado/splitter/DuplexReadSplitter.h
@@ -23,7 +23,8 @@ class DuplexReadSplitter : public ReadSplitter {
     using SplitFinderF = std::function<splitter::PosRanges(const ExtRead&)>;
 
     ExtRead create_ext_read(SimplexReadPtr r) const;
-    std::vector<splitter::PosRange> possible_pore_regions(const ExtRead& read) const;
+    PosRanges possible_pore_regions(const ExtRead& read) const;
+    PosRanges find_muA_adapter_splits(const ExtRead& read) const;
     bool check_nearby_adapter(const SimplexRead& read,
                               splitter::PosRange r,
                               int adapter_edist) const;

From a0e9bccf669012f5da57fc703856a44aaf44abee Mon Sep 17 00:00:00 2001
From: Ben Lawrence <ben.lawrence@nanoporetech.com>
Date: Wed, 21 Feb 2024 11:22:16 +0000
Subject: [PATCH 05/21] [DOR-526] Add tests for PostCondition

---
 tests/CMakeLists.txt        |  1 +
 tests/PostConditionTest.cpp | 43 +++++++++++++++++++++++++++++++++++++
 2 files changed, 44 insertions(+)
 create mode 100644 tests/PostConditionTest.cpp

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index a9fee034..000e7b4b 100755
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -34,6 +34,7 @@ add_executable(dorado_tests
     PairingNodeTest.cpp
     PipelineTest.cpp
     PolyACalculatorTest.cpp
+    PostConditionTest.cpp
     ReadFilterNodeTest.cpp
     ReadForwarderNodeTest.cpp
     ReadTest.cpp
diff --git a/tests/PostConditionTest.cpp b/tests/PostConditionTest.cpp
new file mode 100644
index 00000000..ccb1243b
--- /dev/null
+++ b/tests/PostConditionTest.cpp
@@ -0,0 +1,43 @@
+#include "utils/PostCondition.h"
+
+#include <catch2/catch.hpp>
+
+#define CUT_TAG "[PostCondition]"
+#define DEFINE_TEST(name) TEST_CASE(CUT_TAG " " name, CUT_TAG)
+
+using dorado::utils::PostCondition;
+
+DEFINE_TEST("Condition isn't triggered right away") {
+    int counter = 0;
+    auto on_scope_end = PostCondition([&] { counter++; });
+    CHECK(counter == 0);
+}
+
+DEFINE_TEST("Condition is triggered on scope end") {
+    int counter = 0;
+    {
+        auto on_scope_end = PostCondition([&] { counter++; });
+        CHECK(counter == 0);
+    }
+    CHECK(counter == 1);
+}
+
+DEFINE_TEST("Multiple scopes") {
+    int counter = 0;
+    {
+        const int outer_increment = 1;
+        auto outer_scope = PostCondition([&] { counter -= outer_increment; });
+        CHECK(counter == 0);
+        counter += outer_increment;
+        CHECK(counter == outer_increment);
+        {
+            const int inner_increment = 2;
+            auto inner_scope = PostCondition([&] { counter -= inner_increment; });
+            CHECK(counter == outer_increment);
+            counter += inner_increment;
+            CHECK(counter == outer_increment + inner_increment);
+        }
+        CHECK(counter == outer_increment);
+    }
+    CHECK(counter == 0);
+}

From 952aab93739ea1b64c751f6f889caf05bf0b626d Mon Sep 17 00:00:00 2001
From: Ben Lawrence <ben.lawrence@nanoporetech.com>
Date: Wed, 21 Feb 2024 11:22:54 +0000
Subject: [PATCH 06/21] [DOR-526] Improve PostCondition to make it harder to
 misuse

Also drop the dependency on std::function.
---
 dorado/utils/PostCondition.h | 21 +++++++++++++--------
 tests/AlignerTest.cpp        |  3 ++-
 2 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/dorado/utils/PostCondition.h b/dorado/utils/PostCondition.h
index 0dc99396..38321b36 100644
--- a/dorado/utils/PostCondition.h
+++ b/dorado/utils/PostCondition.h
@@ -1,19 +1,24 @@
 #pragma once
-#include <functional>
+
+#include <utility>
 
 namespace dorado::utils {
 
+namespace detail {
+template <typename Func>
 class PostCondition {
 public:
-    PostCondition(std::function<void()> func) : m_func(func) {}
-    ~PostCondition() {
-        if (m_func) {
-            m_func();
-        }
-    }
+    PostCondition(Func&& func) : m_func(std::move(func)) {}
+    ~PostCondition() { m_func(); }
 
 private:
-    std::function<void()> m_func;
+    Func m_func;
 };
+}  // namespace detail
+
+template <typename Func>
+[[nodiscard]] auto PostCondition(Func function) {
+    return detail::PostCondition(std::move(function));
+}
 
 }  // namespace dorado::utils
diff --git a/tests/AlignerTest.cpp b/tests/AlignerTest.cpp
index c0316a36..c6ba4a41 100644
--- a/tests/AlignerTest.cpp
+++ b/tests/AlignerTest.cpp
@@ -115,7 +115,8 @@ class AlignerNodeTestFixture {
         dorado::utils::add_sq_hdr(header.get(), aligner_ref.get_sequence_records_for_header());
 
         auto sam_line_buffer = dorado::utils::allocate_kstring();
-        dorado::utils::PostCondition free_buffer([&sam_line_buffer] { ks_free(&sam_line_buffer); });
+        auto free_buffer =
+                dorado::utils::PostCondition([&sam_line_buffer] { ks_free(&sam_line_buffer); });
         CHECK(sam_format1(header.get(), bam_record.get(), &sam_line_buffer) >= 0);
 
         return std::string(ks_str(&sam_line_buffer), ks_len(&sam_line_buffer));

From 178fc32dc604de8adb63dfdaf29e6f2e507621c6 Mon Sep 17 00:00:00 2001
From: Ben Lawrence <ben.lawrence@nanoporetech.com>
Date: Wed, 21 Feb 2024 17:44:18 +0000
Subject: [PATCH 07/21] [DOR-526] Add Nikolai's editdistance implementation

Also add some tests for it.
---
 CMakeLists.txt            |   2 +
 dorado/splitter/myers.cpp | 153 ++++++++++++++++++++++++++++++++++++++
 dorado/splitter/myers.h   |  23 ++++++
 tests/CMakeLists.txt      |   1 +
 tests/myers_test.cpp      |  95 +++++++++++++++++++++++
 5 files changed, 274 insertions(+)
 create mode 100644 dorado/splitter/myers.cpp
 create mode 100644 dorado/splitter/myers.h
 create mode 100644 tests/myers_test.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 56b07dee..a3fbc74a 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -217,6 +217,8 @@ set(LIB_SOURCE_FILES
     dorado/splitter/DuplexReadSplitter.h
     dorado/splitter/RNAReadSplitter.cpp
     dorado/splitter/RNAReadSplitter.h
+    dorado/splitter/myers.cpp
+    dorado/splitter/myers.h
     dorado/splitter/splitter_utils.cpp
     dorado/splitter/splitter_utils.h
     dorado/demux/AdapterDetector.cpp
diff --git a/dorado/splitter/myers.cpp b/dorado/splitter/myers.cpp
new file mode 100644
index 00000000..0ab9c70f
--- /dev/null
+++ b/dorado/splitter/myers.cpp
@@ -0,0 +1,153 @@
+#include "myers.h"
+
+#include "utils/PostCondition.h"
+#include "utils/alignment_utils.h"
+
+#include <cassert>
+#include <cstdint>
+#include <iomanip>
+#include <optional>
+#include <ostream>
+
+namespace dorado::splitter {
+
+namespace {
+
+// Returns the array D[0..n], where D[i] is equal to local alignment of the pattern to suffix of text[0..i).
+std::vector<size_t> d_myers(const char* pattern, size_t m, const char* text, size_t n) {
+    constexpr size_t MAX_ALPHABET = 256;
+    assert(m < 64);
+    uint64_t PM[MAX_ALPHABET]{};
+    for (size_t i = 0; i < m; i++) {
+        PM[static_cast<uint8_t>(pattern[i])] |= uint64_t{1} << i;
+    }
+
+    std::vector<size_t> D(n + 1);
+    uint64_t VP = ~uint64_t{0};
+    uint64_t VN = 0;
+    size_t score = m;
+    D[0] = score;
+    for (size_t j = 0; j < n; j++) {
+        const uint64_t EQ = PM[static_cast<uint8_t>(text[j])];
+        const uint64_t D0 = (((EQ & VP) + VP) ^ VP) | EQ | VN;
+        uint64_t HP = VN | ~(D0 | VP);
+        uint64_t HN = D0 & VP;
+
+        if (HP & (uint64_t{1} << (m - 1))) {
+            score++;
+        }
+        if (HN & (uint64_t{1} << (m - 1))) {
+            score--;
+        }
+        D[j + 1] = score;
+
+        HP <<= 1;
+        HN <<= 1;
+        VP = HN | ~(D0 | HP);
+        VN = D0 & HP;
+    }
+    return D;
+}
+
+}  // namespace
+
+std::vector<EdistResult> myers_align(std::string_view query,
+                                     std::string_view seq,
+                                     std::size_t max_edist) {
+    std::vector<EdistResult> ranges;
+    const auto query_len = query.size();
+    if (seq.size() < query_len) {
+        // Too small, don't bother.
+        return ranges;
+    }
+
+    auto add_match = [&](std::size_t end, std::size_t edist) {
+        // |edist| is for the full query ending at |end|, so we know the earliest that the match can start.
+        const auto max_match_len = std::min(query_len + edist, end);
+        const auto min_match_start = end - max_match_len;
+
+        if (edist == 0) {
+            // Exact match, nothing more to do.
+            ranges.push_back({min_match_start, end, edist});
+
+        } else {
+            // If this isn't an exact match then hand over to edlib to get the start index.
+            const auto max_match_span = seq.substr(min_match_start, max_match_len);
+            auto edlib_cfg = edlibNewAlignConfig(static_cast<int>(edist), EDLIB_MODE_HW,
+                                                 EDLIB_TASK_LOC, nullptr, 0);
+            auto edlib_result =
+                    edlibAlign(query.data(), static_cast<int>(query.size()), max_match_span.data(),
+                               static_cast<int>(max_match_span.size()), edlib_cfg);
+            assert(edlib_result.status == EDLIB_STATUS_OK);
+            auto edlib_cleanup = utils::PostCondition([&] { edlibFreeAlignResult(edlib_result); });
+
+            if (edlib_result.status == EDLIB_STATUS_OK) {
+                // edlib only reports the best edist it finds, so if there's a better match in the same span then it'll
+                // report that instead. This can happen for spans that are close together, for example:
+                // edists:...,7,6,5,5,4,5,6,6,6,5,6,..., max_edist=5
+                //               end1=^    end2=^
+                // When processing 'end2' we have to extend our search span to include that of 'end1' (since it's
+                // |max_edist| indices away). This means that, if the edits aren't insertions, we end up finding the
+                // same sequence as |end1|, which has a better edist and hence is the only thing edlib reports.
+                // We should be safe to ignore the worse edist in those cases.
+                if (edlib_result.editDistance == static_cast<int>(edist)) {
+                    ranges.push_back(
+                            {min_match_start + edlib_result.startLocations[0], end, edist});
+                }
+            }
+        }
+    };
+
+    // Calculate edit distances for each index.
+    const auto local_edists = d_myers(query.data(), query.size(), seq.data(), seq.size());
+
+    // Look for drops below the threshold and join neighbouring ranges together.
+    //
+    // TODO: can we improve on this, since we have cases such as:
+    // local_edists = ...,7,6,5,6,6,5,5,6,5,4,5,5,6,7,..., max_edist = 5
+    // hits:                 [^]   [^  ] [  ^   ]
+    // where these hits should probably be grouped together.
+    //
+    std::optional<std::size_t> current_range_end;
+    std::size_t min_edist = max_edist + 1;
+    for (std::size_t current_end = query_len; current_end < local_edists.size(); current_end++) {
+        // See if this is a better match.
+        const auto current_edist = local_edists[current_end];
+        if (current_edist <= max_edist && current_edist < min_edist) {
+            min_edist = current_edist;
+            current_range_end = current_end;
+        } else if (current_range_end.has_value() && current_edist > max_edist) {
+            // We're back out of the threshold, so end the current range.
+            add_match(*current_range_end, min_edist);
+            current_range_end.reset();
+            min_edist = max_edist + 1;
+        }
+    }
+
+    // End the current range if we're in one.
+    if (current_range_end.has_value()) {
+        add_match(*current_range_end, min_edist);
+    }
+
+    return ranges;
+}
+
+void print_edists(std::ostream& os, std::string_view seq, const std::vector<size_t>& edists) {
+    assert(edists.size() == seq.size() + 1);
+
+    // Print the sequence with spaces to match the edists.
+    for (char c : seq) {
+        os << "  " << c;
+    }
+    os << '\n';
+
+    // Print the edists. Maximum edist is 64, so we only need to pad 2 spaces.
+    const auto old_fill = os.fill(' ');
+    for (size_t s : edists) {
+        os << std::setw(3) << s;
+    }
+    os.fill(old_fill);
+    os << '\n';
+}
+
+}  // namespace dorado::splitter
diff --git a/dorado/splitter/myers.h b/dorado/splitter/myers.h
new file mode 100644
index 00000000..01209692
--- /dev/null
+++ b/dorado/splitter/myers.h
@@ -0,0 +1,23 @@
+#ifndef MYERS_BIT_H
+#define MYERS_BIT_H
+
+#include <iosfwd>
+#include <string_view>
+#include <vector>
+
+namespace dorado::splitter {
+
+struct EdistResult {
+    std::size_t begin;  // inclusive
+    std::size_t end;    // exclusive
+    std::size_t edist;
+};
+std::vector<EdistResult> myers_align(std::string_view query,
+                                     std::string_view seq,
+                                     std::size_t max_edist);
+
+void print_edists(std::ostream& os, std::string_view seq, const std::vector<size_t>& edists);
+
+}  // namespace dorado::splitter
+
+#endif
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 000e7b4b..286c127d 100755
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -31,6 +31,7 @@ add_executable(dorado_tests
     ModelMetadataTest.cpp
     ModelUtilsTest.cpp
     MotifMatcherTest.cpp
+    myers_test.cpp
     PairingNodeTest.cpp
     PipelineTest.cpp
     PolyACalculatorTest.cpp
diff --git a/tests/myers_test.cpp b/tests/myers_test.cpp
new file mode 100644
index 00000000..0bf7c759
--- /dev/null
+++ b/tests/myers_test.cpp
@@ -0,0 +1,95 @@
+#include "splitter/myers.h"
+
+#include <catch2/catch.hpp>
+
+#define CUT_TAG "[myers]"
+#define DEFINE_TEST(name) TEST_CASE(CUT_TAG " " name, CUT_TAG)
+
+using dorado::splitter::EdistResult;
+using dorado::splitter::myers_align;
+
+DEFINE_TEST("Basic alignment, single hit") {
+    const std::string_view query = "AAA";
+    const std::string_view seq = "GGGCCCAAATTT";
+
+    // The same location should be hit for all edists.
+    const auto max_edist = GENERATE(0, 1, 2);
+    CAPTURE(max_edist);
+
+    const auto alignments = myers_align(query, seq, max_edist);
+    REQUIRE(alignments.size() == 1);
+    CHECK(alignments[0].begin == 6);
+    CHECK(alignments[0].end == 9);
+    CHECK(alignments[0].edist == 0);
+}
+
+DEFINE_TEST("Basic alignment, multiple hits") {
+    const std::string_view query = "CCC";
+    const std::string_view seq = "GGGCCCAAATTTCCCGGG";
+
+    // The same locations should be hit for all edists.
+    const auto max_edist = GENERATE(0, 1, 2);
+    CAPTURE(max_edist);
+
+    const auto alignments = myers_align(query, seq, max_edist);
+    REQUIRE(alignments.size() == 2);
+    CHECK(alignments[0].begin == 3);
+    CHECK(alignments[0].end == 6);
+    CHECK(alignments[0].edist == 0);
+    CHECK(alignments[1].begin == 12);
+    CHECK(alignments[1].end == 15);
+    CHECK(alignments[1].edist == 0);
+}
+
+DEFINE_TEST("Basic alignment, hit at end") {
+    const std::string_view query = "TTT";
+    const std::string_view seq = "GGGCCCAAATTT";
+
+    // The same location should be hit for all edists.
+    const auto max_edist = GENERATE(0, 1, 2);
+    CAPTURE(max_edist);
+
+    const auto alignments = myers_align(query, seq, max_edist);
+    REQUIRE(alignments.size() == 1);
+    CHECK(alignments[0].begin == 9);
+    CHECK(alignments[0].end == 12);
+    CHECK(alignments[0].edist == 0);
+}
+
+DEFINE_TEST("Complex alignment, multiple hits") {
+    const std::string_view query = "TACTTCGTTCAGTT";
+    const std::string_view seq =
+            "CTGTCGAGACCCTT"
+            "TACTTCTTCTT"  // match 0
+            "CACCAA"
+            "TATTGTTATGTT"  // match 1
+            "ATGTAGCC"
+            "TGCTTCGTTCGGTT"  // match 2
+            "ATGCGCCGCCAATATTAACCTCGGTAAAA"
+            "TATCTTCGACCCAGTT"  // match 3
+            "TTCGCGTCT";
+    const auto max_edist = 4;
+
+    const auto alignments = myers_align(query, seq, max_edist);
+    REQUIRE(alignments.size() == 4);
+    CHECK(alignments[0].begin == 14);
+    CHECK(alignments[0].end == 25);
+    CHECK(alignments[0].edist == 3);
+    CHECK(alignments[1].begin == 31);
+    CHECK(alignments[1].end == 43);
+    CHECK(alignments[1].edist == 4);
+    CHECK(alignments[2].begin == 51);
+    CHECK(alignments[2].end == 65);
+    CHECK(alignments[2].edist == 2);
+    CHECK(alignments[3].begin == 94);
+    CHECK(alignments[3].end == 110);
+    CHECK(alignments[3].edist == 4);
+}
+
+DEFINE_TEST("Complex alignment, doesn't crash when high edist near start") {
+    const std::string_view query = "TACTTCGTTCAGTT";
+    const std::string_view seq = "TTTTTTTTTTCTCCTGTTCTTGGTTCGGTTGT";
+    const auto max_edist = 5;
+    const auto alignments = myers_align(query, seq, max_edist);
+    CHECK(!alignments.empty());
+}

From 63644d9dcb67dcea56004ca4d594f98354b99c60 Mon Sep 17 00:00:00 2001
From: Ben Lawrence <ben.lawrence@nanoporetech.com>
Date: Wed, 21 Feb 2024 17:48:25 +0000
Subject: [PATCH 08/21] [DOR-526] Small cmake simplification

Spotted while adding myers.*.
---
 CMakeLists.txt | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a3fbc74a..177d6132 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -136,7 +136,7 @@ endif()
 
 configure_file(dorado/Version.h.in dorado/Version.h)
 
-set(LIB_SOURCE_FILES
+add_library(dorado_lib
     dorado/alignment/alignment_processing_items.cpp
     dorado/alignment/alignment_processing_items.h
     dorado/alignment/IndexFileAccess.cpp
@@ -245,9 +245,7 @@ set(LIB_SOURCE_FILES
     dorado/poly_tail/rna_poly_tail_calculator.h
     dorado/summary/summary.cpp
     dorado/summary/summary.h
- )
-
-add_library(dorado_lib ${LIB_SOURCE_FILES})
+)
 
 enable_warnings_as_errors(dorado_lib)
 

From fc5c088cace51a987b502d25e259575a3d99feb9 Mon Sep 17 00:00:00 2001
From: Ben Lawrence <ben.lawrence@nanoporetech.com>
Date: Wed, 21 Feb 2024 17:52:21 +0000
Subject: [PATCH 09/21] [DOR-526] Move the anonymous namespace into
 dorado::splitter

This matches the other files and allows us to use the types without
having to alias them.
---
 dorado/splitter/DuplexReadSplitter.cpp | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/dorado/splitter/DuplexReadSplitter.cpp b/dorado/splitter/DuplexReadSplitter.cpp
index 708a1288..5f5d5f85 100644
--- a/dorado/splitter/DuplexReadSplitter.cpp
+++ b/dorado/splitter/DuplexReadSplitter.cpp
@@ -20,13 +20,9 @@
 #include <string_view>
 #include <utility>
 
-namespace {
-
-using namespace dorado;
-using namespace dorado::splitter;
+namespace dorado::splitter {
 
-using PosRange = splitter::PosRange;
-using PosRanges = splitter::PosRanges;
+namespace {
 
 //[start, end)
 std::optional<PosRange> find_best_adapter_match(const std::string& adapter,
@@ -121,8 +117,6 @@ float qscore_mean(const std::string& qstring, splitter::PosRange r) {
 
 }  // namespace
 
-namespace dorado::splitter {
-
 struct DuplexReadSplitter::ExtRead {
     SimplexReadPtr read;
     at::Tensor data_as_float32;

From c8819e0182c0f74d593cf6c8d95c1c2b8abf9adb Mon Sep 17 00:00:00 2001
From: Ben Lawrence <ben.lawrence@nanoporetech.com>
Date: Wed, 21 Feb 2024 18:02:33 +0000
Subject: [PATCH 10/21] [DOR-526] Switch over to using multi-edist for
 searching

---
 dorado/splitter/DuplexReadSplitter.cpp | 71 ++++++++++++--------------
 1 file changed, 32 insertions(+), 39 deletions(-)

diff --git a/dorado/splitter/DuplexReadSplitter.cpp b/dorado/splitter/DuplexReadSplitter.cpp
index 5f5d5f85..4d681e46 100644
--- a/dorado/splitter/DuplexReadSplitter.cpp
+++ b/dorado/splitter/DuplexReadSplitter.cpp
@@ -1,5 +1,6 @@
 #include "DuplexReadSplitter.h"
 
+#include "myers.h"
 #include "read_pipeline/read_utils.h"
 #include "splitter/splitter_utils.h"
 #include "utils/PostCondition.h"
@@ -15,6 +16,7 @@
 #include <chrono>
 #include <cmath>
 #include <iomanip>
+#include <iterator>
 #include <optional>
 #include <string>
 #include <string_view>
@@ -215,65 +217,56 @@ PosRanges DuplexReadSplitter::find_muA_adapter_splits(const ExtRead& read) const
     }
 
     auto match_start_range = [&](std::string_view query, std::int64_t min_start,
-                                 std::int64_t max_start,
-                                 std::int64_t max_edist) -> std::optional<PosRange> {
+                                 std::int64_t max_start, std::int64_t max_edist) -> PosRanges {
         const auto max_end_pos =
                 std::min<std::int64_t>(read_seq.size(), max_start + query.size() + max_edist);
         if (min_start + static_cast<std::int64_t>(query.size()) > max_end_pos) {
             // Too close to the end.
-            spdlog::trace("Too close to the end");
-            return std::nullopt;
+            return {};
         }
 
         // Trim the sequence.
         std::string_view const seq(read_seq.data() + min_start, max_end_pos - min_start);
 
         // Search the sequence.
-        static const EdlibEqualityPair additionalEqualities[4] = {
-                {'N', 'A'}, {'N', 'T'}, {'N', 'C'}, {'N', 'G'}};
-        auto edlib_cfg = edlibNewAlignConfig(max_edist, EDLIB_MODE_HW, EDLIB_TASK_LOC,
-                                             additionalEqualities, std::size(additionalEqualities));
-        auto edlib_result =
-                edlibAlign(query.data(), query.size(), seq.data(), seq.size(), edlib_cfg);
-        assert(edlib_result.status == EDLIB_STATUS_OK);
-        auto edlib_cleanup = utils::PostCondition([&] { edlibFreeAlignResult(edlib_result); });
-
-        // Check we got a match.
-        if (edlib_result.status != EDLIB_STATUS_OK || edlib_result.editDistance == -1) {
-            spdlog::trace("No match {} {}", edlib_result.status, edlib_result.editDistance);
-            return std::nullopt;
-        }
-
-        // Build the result.
-        PosRange range;
-        range.first = min_start + edlib_result.startLocations[0];
-        range.second = min_start + edlib_result.endLocations[0] + 1;
-        if (static_cast<std::int64_t>(range.first) > max_start) {
-            spdlog::trace("Past the start");
-            return std::nullopt;
+        const auto alignments = myers_align(query, seq, max_edist);
+
+        // Build the results.
+        PosRanges ranges;
+        for (auto alignment : alignments) {
+            PosRange range;
+            range.first = min_start + alignment.begin;
+            range.second = min_start + alignment.end;
+            if (static_cast<std::int64_t>(range.first) > max_start) {
+                continue;
+            }
+            ranges.push_back(range);
         }
-        return range;
+        return ranges;
     };
 
     // Search for muA.
-    auto muA_range = match_start_range(muA_seq, ignore_start, read_seq.size(), max_muA_edist);
-    if (!muA_range.has_value()) {
-        spdlog::trace("No muA");
+    auto muA_ranges = match_start_range(muA_seq, ignore_start, read_seq.size(), max_muA_edist);
+    if (muA_ranges.empty()) {
         return {};
     }
 
     // Search for the adapter.
-    const auto adapter_start = std::max(
-            ignore_start, static_cast<std::int64_t>(muA_range->first) - max_muA_adapter_dist);
-    auto adapter_match =
-            match_start_range(adapter_seq, adapter_start, muA_range->first, max_adapter_edist);
-    if (!adapter_match.has_value()) {
-        spdlog::trace("No adapter");
-        return {};
+    PosRanges all_adapter_ranges;
+    for (auto muA_range : muA_ranges) {
+        const auto adapter_start = std::max(
+                ignore_start, static_cast<std::int64_t>(muA_range.first) - max_muA_adapter_dist);
+        auto adapter_ranges =
+                match_start_range(adapter_seq, adapter_start, muA_range.first, max_adapter_edist);
+        std::move(adapter_ranges.begin(), adapter_ranges.end(),
+                  std::back_inserter(all_adapter_ranges));
     }
 
-    spdlog::trace("Detected muA+adapter region in read {}", read.read->read_common.read_id);
-    return {*adapter_match};
+    if (!all_adapter_ranges.empty()) {
+        spdlog::trace("Detected {} muA+adapter region(s) in read {}", all_adapter_ranges.size(),
+                      read.read->read_common.read_id);
+    }
+    return all_adapter_ranges;
 }
 
 bool DuplexReadSplitter::check_nearby_adapter(const SimplexRead& read,

From 6afac6b4fc9d5767abaea681f14cf6e5e47a0ab7 Mon Sep 17 00:00:00 2001
From: Ben Lawrence <ben.lawrence@nanoporetech.com>
Date: Thu, 22 Feb 2024 13:37:36 +0000
Subject: [PATCH 11/21] [DOR-526] Check all of the edlib results

Also check that the end matches otherwise we can add duplicate ranges.
---
 dorado/splitter/myers.cpp | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/dorado/splitter/myers.cpp b/dorado/splitter/myers.cpp
index 0ab9c70f..231d915c 100644
--- a/dorado/splitter/myers.cpp
+++ b/dorado/splitter/myers.cpp
@@ -90,9 +90,14 @@ std::vector<EdistResult> myers_align(std::string_view query,
                 // |max_edist| indices away). This means that, if the edits aren't insertions, we end up finding the
                 // same sequence as |end1|, which has a better edist and hence is the only thing edlib reports.
                 // We should be safe to ignore the worse edist in those cases.
-                if (edlib_result.editDistance == static_cast<int>(edist)) {
-                    ranges.push_back(
-                            {min_match_start + edlib_result.startLocations[0], end, edist});
+                for (int i = 0; i < edlib_result.numLocations; i++) {
+                    // edlib indices are inclusive, ours are exclusive.
+                    const std::size_t edlib_end =
+                            min_match_start + edlib_result.endLocations[i] + 1;
+                    if (edlib_result.editDistance == static_cast<int>(edist) && edlib_end == end) {
+                        ranges.push_back(
+                                {min_match_start + edlib_result.startLocations[i], end, edist});
+                    }
                 }
             }
         }

From 96795fe60ad5cfbc5348580133a2fd62b7f95874 Mon Sep 17 00:00:00 2001
From: Ben Lawrence <ben.lawrence@nanoporetech.com>
Date: Thu, 22 Feb 2024 14:01:32 +0000
Subject: [PATCH 12/21] [DOR-526] Reduce adapter edist

I copied the higher analysis value, not the "best" value.
---
 dorado/splitter/DuplexReadSplitter.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dorado/splitter/DuplexReadSplitter.cpp b/dorado/splitter/DuplexReadSplitter.cpp
index 4d681e46..d4837980 100644
--- a/dorado/splitter/DuplexReadSplitter.cpp
+++ b/dorado/splitter/DuplexReadSplitter.cpp
@@ -206,7 +206,7 @@ PosRanges DuplexReadSplitter::find_muA_adapter_splits(const ExtRead& read) const
     constexpr std::int64_t min_read_len = 200;
     // 14bp adapter prefix that should work better for click chemistry based on previous investigation
     constexpr std::string_view adapter_seq = "TACTTCGTTCAGTT";
-    constexpr std::int64_t max_adapter_edist = 5;
+    constexpr std::int64_t max_adapter_edist = 3;
     constexpr std::int64_t max_muA_adapter_dist = 100;
 
     // Skip if the read is too small.

From c13e195026cdc90e636bb4f6ae0b313fd21a25c1 Mon Sep 17 00:00:00 2001
From: Ben Lawrence <ben.lawrence@nanoporetech.com>
Date: Thu, 22 Feb 2024 14:04:09 +0000
Subject: [PATCH 13/21] [DOR-526] Fix searching for the adapter

We only want the best result for the adapter, so add a way to do that.
---
 dorado/splitter/DuplexReadSplitter.cpp | 45 ++++++++++++++++++++------
 1 file changed, 35 insertions(+), 10 deletions(-)

diff --git a/dorado/splitter/DuplexReadSplitter.cpp b/dorado/splitter/DuplexReadSplitter.cpp
index d4837980..8c235e7e 100644
--- a/dorado/splitter/DuplexReadSplitter.cpp
+++ b/dorado/splitter/DuplexReadSplitter.cpp
@@ -217,7 +217,8 @@ PosRanges DuplexReadSplitter::find_muA_adapter_splits(const ExtRead& read) const
     }
 
     auto match_start_range = [&](std::string_view query, std::int64_t min_start,
-                                 std::int64_t max_start, std::int64_t max_edist) -> PosRanges {
+                                 std::int64_t max_start, std::int64_t max_edist,
+                                 bool best_only) -> PosRanges {
         const auto max_end_pos =
                 std::min<std::int64_t>(read_seq.size(), max_start + query.size() + max_edist);
         if (min_start + static_cast<std::int64_t>(query.size()) > max_end_pos) {
@@ -233,36 +234,60 @@ PosRanges DuplexReadSplitter::find_muA_adapter_splits(const ExtRead& read) const
 
         // Build the results.
         PosRanges ranges;
-        for (auto alignment : alignments) {
+        auto add_range = [&](EdistResult alignment) {
             PosRange range;
             range.first = min_start + alignment.begin;
             range.second = min_start + alignment.end;
-            if (static_cast<std::int64_t>(range.first) > max_start) {
-                continue;
-            }
             ranges.push_back(range);
+        };
+        auto valid_begin = [=](std::int64_t begin) { return min_start + begin <= max_start; };
+        if (best_only) {
+            // Look for the best score that is also valid.
+            auto compare_score = [=](auto& lhs, auto& rhs) {
+                auto const lhs_score = !valid_begin(lhs.begin) ? max_edist + 1 : lhs.edist;
+                auto const rhs_score = !valid_begin(rhs.begin) ? max_edist + 1 : rhs.edist;
+                return lhs_score < rhs_score;
+            };
+            auto it = std::min_element(alignments.begin(), alignments.end(), compare_score);
+            if (it != alignments.end()) {
+                add_range(*it);
+            }
+        } else {
+            for (auto alignment : alignments) {
+                if (valid_begin(alignment.begin)) {
+                    add_range(alignment);
+                }
+            }
+            // Merge the ranges.
+            std::sort(ranges.begin(), ranges.end());
+            ranges = merge_ranges(ranges, 0);
         }
         return ranges;
     };
 
-    // Search for muA.
-    auto muA_ranges = match_start_range(muA_seq, ignore_start, read_seq.size(), max_muA_edist);
+    // Search for muAs.
+    auto muA_ranges =
+            match_start_range(muA_seq, ignore_start, read_seq.size(), max_muA_edist, false);
     if (muA_ranges.empty()) {
         return {};
     }
 
-    // Search for the adapter.
+    // Search for any adapters that are close to the muAs.
     PosRanges all_adapter_ranges;
     for (auto muA_range : muA_ranges) {
         const auto adapter_start = std::max(
                 ignore_start, static_cast<std::int64_t>(muA_range.first) - max_muA_adapter_dist);
-        auto adapter_ranges =
-                match_start_range(adapter_seq, adapter_start, muA_range.first, max_adapter_edist);
+        auto adapter_ranges = match_start_range(adapter_seq, adapter_start, muA_range.first,
+                                                max_adapter_edist, true);
         std::move(adapter_ranges.begin(), adapter_ranges.end(),
                   std::back_inserter(all_adapter_ranges));
     }
 
     if (!all_adapter_ranges.empty()) {
+        // Merge the final ranges.
+        std::sort(all_adapter_ranges.begin(), all_adapter_ranges.end());
+        all_adapter_ranges = merge_ranges(all_adapter_ranges, 0);
+
         spdlog::trace("Detected {} muA+adapter region(s) in read {}", all_adapter_ranges.size(),
                       read.read->read_common.read_id);
     }

From 185df116d93ffce0a8862c566c85064a626b5a6e Mon Sep 17 00:00:00 2001
From: Ben Lawrence <ben.lawrence@nanoporetech.com>
Date: Thu, 22 Feb 2024 17:26:19 +0000
Subject: [PATCH 14/21] [DOR-526] Add missing spike detection

Also simplify the "best" search by using edlib directly.
---
 dorado/splitter/DuplexReadSplitter.cpp | 111 ++++++++++++++++---------
 dorado/splitter/DuplexReadSplitter.h   |   2 +-
 2 files changed, 72 insertions(+), 41 deletions(-)

diff --git a/dorado/splitter/DuplexReadSplitter.cpp b/dorado/splitter/DuplexReadSplitter.cpp
index 8c235e7e..82c3bc7e 100644
--- a/dorado/splitter/DuplexReadSplitter.cpp
+++ b/dorado/splitter/DuplexReadSplitter.cpp
@@ -197,7 +197,7 @@ PosRanges DuplexReadSplitter::possible_pore_regions(const DuplexReadSplitter::Ex
     return pore_regions;
 }
 
-PosRanges DuplexReadSplitter::find_muA_adapter_splits(const ExtRead& read) const {
+PosRanges DuplexReadSplitter::find_muA_adapter_spikes(const ExtRead& read) const {
     spdlog::trace("Finding muA regions for read {}", read.read->read_common.read_id);
 
     constexpr std::string_view muA_seq = "GTTTTCGCATTTATCGTGAAACGCTTTCGCGTTTTTCGTGCGCCGCTTCA";
@@ -208,6 +208,8 @@ PosRanges DuplexReadSplitter::find_muA_adapter_splits(const ExtRead& read) const
     constexpr std::string_view adapter_seq = "TACTTCGTTCAGTT";
     constexpr std::int64_t max_adapter_edist = 3;
     constexpr std::int64_t max_muA_adapter_dist = 100;
+    constexpr std::int64_t max_spike_adapter_dist = 50;
+    constexpr float max_spike_qscore = 10;
 
     // Skip if the read is too small.
     const auto& read_seq = read.read->read_common.seq;
@@ -230,68 +232,97 @@ PosRanges DuplexReadSplitter::find_muA_adapter_splits(const ExtRead& read) const
         std::string_view const seq(read_seq.data() + min_start, max_end_pos - min_start);
 
         // Search the sequence.
-        const auto alignments = myers_align(query, seq, max_edist);
-
-        // Build the results.
-        PosRanges ranges;
-        auto add_range = [&](EdistResult alignment) {
-            PosRange range;
-            range.first = min_start + alignment.begin;
-            range.second = min_start + alignment.end;
-            ranges.push_back(range);
-        };
-        auto valid_begin = [=](std::int64_t begin) { return min_start + begin <= max_start; };
         if (best_only) {
-            // Look for the best score that is also valid.
-            auto compare_score = [=](auto& lhs, auto& rhs) {
-                auto const lhs_score = !valid_begin(lhs.begin) ? max_edist + 1 : lhs.edist;
-                auto const rhs_score = !valid_begin(rhs.begin) ? max_edist + 1 : rhs.edist;
-                return lhs_score < rhs_score;
-            };
-            auto it = std::min_element(alignments.begin(), alignments.end(), compare_score);
-            if (it != alignments.end()) {
-                add_range(*it);
+            auto edlib_cfg = edlibNewAlignConfig(static_cast<int>(max_edist), EDLIB_MODE_HW,
+                                                 EDLIB_TASK_LOC, nullptr, 0);
+            auto edlib_result = edlibAlign(query.data(), static_cast<int>(query.size()), seq.data(),
+                                           static_cast<int>(seq.size()), edlib_cfg);
+            assert(edlib_result.status == EDLIB_STATUS_OK);
+            auto edlib_cleanup = utils::PostCondition([&] { edlibFreeAlignResult(edlib_result); });
+
+            // Add the match if it looks good.
+            PosRanges ranges;
+            if (edlib_result.status == EDLIB_STATUS_OK && edlib_result.editDistance != -1) {
+                PosRange range;
+                range.first = min_start + edlib_result.startLocations[0];
+                range.second = min_start + edlib_result.endLocations[0] + 1;
+                if (static_cast<std::int64_t>(range.first) <= max_start) {
+                    ranges.push_back(range);
+                }
             }
+            return ranges;
+
         } else {
+            const auto alignments = myers_align(query, seq, max_edist);
+
+            // Build the results.
+            PosRanges ranges;
             for (auto alignment : alignments) {
-                if (valid_begin(alignment.begin)) {
-                    add_range(alignment);
+                if (min_start + static_cast<std::int64_t>(alignment.begin) <= max_start) {
+                    PosRange range;
+                    range.first = min_start + alignment.begin;
+                    range.second = min_start + alignment.end;
+                    ranges.push_back(range);
                 }
             }
+
             // Merge the ranges.
             std::sort(ranges.begin(), ranges.end());
-            ranges = merge_ranges(ranges, 0);
+            return merge_ranges(ranges, 0);
         }
-        return ranges;
     };
 
     // Search for muAs.
-    auto muA_ranges =
+    const auto muA_ranges =
             match_start_range(muA_seq, ignore_start, read_seq.size(), max_muA_edist, false);
     if (muA_ranges.empty()) {
         return {};
     }
 
     // Search for any adapters that are close to the muAs.
-    PosRanges all_adapter_ranges;
+    PosRanges spike_ranges;
     for (auto muA_range : muA_ranges) {
         const auto adapter_start = std::max(
                 ignore_start, static_cast<std::int64_t>(muA_range.first) - max_muA_adapter_dist);
-        auto adapter_ranges = match_start_range(adapter_seq, adapter_start, muA_range.first,
-                                                max_adapter_edist, true);
-        std::move(adapter_ranges.begin(), adapter_ranges.end(),
-                  std::back_inserter(all_adapter_ranges));
-    }
+        const auto adapter_ranges = match_start_range(adapter_seq, adapter_start, muA_range.first,
+                                                      max_adapter_edist, true);
+        if (adapter_ranges.empty()) {
+            continue;
+        }
 
-    if (!all_adapter_ranges.empty()) {
-        // Merge the final ranges.
-        std::sort(all_adapter_ranges.begin(), all_adapter_ranges.end());
-        all_adapter_ranges = merge_ranges(all_adapter_ranges, 0);
+        // We only wanted the best match.
+        assert(adapter_ranges.size() == 1);
+        const auto adapter_match = adapter_ranges.front();
+        if (adapter_match.first < max_spike_adapter_dist) {
+            continue;
+        }
 
-        spdlog::trace("Detected {} muA+adapter region(s) in read {}", all_adapter_ranges.size(),
-                      read.read->read_common.read_id);
+        // Look for the spike between (the left of) the adapter and the muA, in signal space.
+        const auto spike_search_begin = adapter_match.first - max_spike_adapter_dist;
+        const auto spike_search_end = muA_range.first;
+        const auto spike_search_begin_s = spike_search_begin * read.read->read_common.model_stride;
+        const auto spike_search_end_s = spike_search_end * read.read->read_common.model_stride;
+        const auto search_span = read.data_as_float32.index(
+                {at::indexing::Slice(spike_search_begin_s, spike_search_end_s)});
+        const auto spike_peak_s = spike_search_begin_s + search_span.argmax().item().toLong();
+        // Convert back to base space.
+        const auto spike_begin = spike_peak_s / read.read->read_common.model_stride;
+        const auto spike_end = spike_begin + 1;
+
+        // Check qscore is under the threshold.
+        const auto spike_avg_qscore =
+                qscore_mean(read.read->read_common.qstring, PosRange(spike_begin, spike_end));
+        if (spike_avg_qscore > max_spike_qscore) {
+            continue;
+        }
+
+        // We have a match.
+        spike_ranges.emplace_back(spike_begin, spike_end);
     }
-    return all_adapter_ranges;
+
+    spdlog::trace("Detected {} muA+adapter spike region(s) in read {}", spike_ranges.size(),
+                  read.read->read_common.read_id);
+    return spike_ranges;
 }
 
 bool DuplexReadSplitter::check_nearby_adapter(const SimplexRead& read,
@@ -479,7 +510,7 @@ DuplexReadSplitter::build_split_finders() const {
     }));
 
     split_finders.emplace_back(std::make_pair(
-            "muA_adapter", [&](const ExtRead& read) { return find_muA_adapter_splits(read); }));
+            "muA_adapter", [&](const ExtRead& read) { return find_muA_adapter_spikes(read); }));
 
     if (!m_settings.simplex_mode) {
         split_finders.emplace_back(std::make_pair("PORE_FLANK", [&](const ExtRead& read) {
diff --git a/dorado/splitter/DuplexReadSplitter.h b/dorado/splitter/DuplexReadSplitter.h
index 28466cf5..2c420fd2 100644
--- a/dorado/splitter/DuplexReadSplitter.h
+++ b/dorado/splitter/DuplexReadSplitter.h
@@ -24,7 +24,7 @@ class DuplexReadSplitter : public ReadSplitter {
 
     ExtRead create_ext_read(SimplexReadPtr r) const;
     PosRanges possible_pore_regions(const ExtRead& read) const;
-    PosRanges find_muA_adapter_splits(const ExtRead& read) const;
+    PosRanges find_muA_adapter_spikes(const ExtRead& read) const;
     bool check_nearby_adapter(const SimplexRead& read,
                               splitter::PosRange r,
                               int adapter_edist) const;

From 42ad6f17e1ac97a44072a03475dfc03a7852f7aa Mon Sep 17 00:00:00 2001
From: Ben Lawrence <ben.lawrence@nanoporetech.com>
Date: Fri, 23 Feb 2024 12:11:16 +0000
Subject: [PATCH 15/21] [DOR-526] Correctly map indices to/from basespace

---
 dorado/splitter/DuplexReadSplitter.cpp | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/dorado/splitter/DuplexReadSplitter.cpp b/dorado/splitter/DuplexReadSplitter.cpp
index 82c3bc7e..7d6f39a1 100644
--- a/dorado/splitter/DuplexReadSplitter.cpp
+++ b/dorado/splitter/DuplexReadSplitter.cpp
@@ -297,16 +297,27 @@ PosRanges DuplexReadSplitter::find_muA_adapter_spikes(const ExtRead& read) const
             continue;
         }
 
+        // Helpers to map to/from basespace.
+        auto from_basespace = [&](std::size_t idx) {
+            const auto it = std::lower_bound(read.move_sums.begin(), read.move_sums.end(), idx);
+            return std::distance(read.move_sums.begin(), it) * read.read->read_common.model_stride;
+        };
+        auto to_basespace = [&](std::size_t idx) {
+            idx /= read.read->read_common.model_stride;
+            // The range we're querying is valid and any found indices should lie in that range.
+            assert(idx < read.move_sums.size());
+            return read.move_sums[idx];
+        };
+
         // Look for the spike between (the left of) the adapter and the muA, in signal space.
-        const auto spike_search_begin = adapter_match.first - max_spike_adapter_dist;
-        const auto spike_search_end = muA_range.first;
-        const auto spike_search_begin_s = spike_search_begin * read.read->read_common.model_stride;
-        const auto spike_search_end_s = spike_search_end * read.read->read_common.model_stride;
+        const auto spike_search_begin_s =
+                from_basespace(adapter_match.first - max_spike_adapter_dist);
+        const auto spike_search_end_s = from_basespace(muA_range.first);
         const auto search_span = read.data_as_float32.index(
                 {at::indexing::Slice(spike_search_begin_s, spike_search_end_s)});
         const auto spike_peak_s = spike_search_begin_s + search_span.argmax().item().toLong();
         // Convert back to base space.
-        const auto spike_begin = spike_peak_s / read.read->read_common.model_stride;
+        const auto spike_begin = to_basespace(spike_peak_s);
         const auto spike_end = spike_begin + 1;
 
         // Check qscore is under the threshold.

From f45f1a542a1b7ad7be0e06047b07803adb9eb79e Mon Sep 17 00:00:00 2001
From: Ben Lawrence <ben.lawrence@nanoporetech.com>
Date: Fri, 23 Feb 2024 12:11:49 +0000
Subject: [PATCH 16/21] [DOR-526] Increase spike qstring range

This is at most the start of the muA, so we're safe to read 5 bases.
---
 dorado/splitter/DuplexReadSplitter.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dorado/splitter/DuplexReadSplitter.cpp b/dorado/splitter/DuplexReadSplitter.cpp
index 7d6f39a1..e9b82c69 100644
--- a/dorado/splitter/DuplexReadSplitter.cpp
+++ b/dorado/splitter/DuplexReadSplitter.cpp
@@ -318,7 +318,7 @@ PosRanges DuplexReadSplitter::find_muA_adapter_spikes(const ExtRead& read) const
         const auto spike_peak_s = spike_search_begin_s + search_span.argmax().item().toLong();
         // Convert back to base space.
         const auto spike_begin = to_basespace(spike_peak_s);
-        const auto spike_end = spike_begin + 1;
+        const auto spike_end = spike_begin + 5;
 
         // Check qscore is under the threshold.
         const auto spike_avg_qscore =

From a14030107ccfda2d0ddc079707c7fa86dd05d00d Mon Sep 17 00:00:00 2001
From: Ben Lawrence <ben.lawrence@nanoporetech.com>
Date: Tue, 27 Feb 2024 14:00:41 +0000
Subject: [PATCH 17/21] [DOR-526] Remove duplicate results

---
 dorado/splitter/DuplexReadSplitter.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/dorado/splitter/DuplexReadSplitter.cpp b/dorado/splitter/DuplexReadSplitter.cpp
index e9b82c69..f85a48ab 100644
--- a/dorado/splitter/DuplexReadSplitter.cpp
+++ b/dorado/splitter/DuplexReadSplitter.cpp
@@ -331,6 +331,11 @@ PosRanges DuplexReadSplitter::find_muA_adapter_spikes(const ExtRead& read) const
         spike_ranges.emplace_back(spike_begin, spike_end);
     }
 
+    // Remove duplicates.
+    std::sort(spike_ranges.begin(), spike_ranges.end());
+    auto end_it = std::unique(spike_ranges.begin(), spike_ranges.end());
+    spike_ranges.erase(end_it, spike_ranges.end());
+
     spdlog::trace("Detected {} muA+adapter spike region(s) in read {}", spike_ranges.size(),
                   read.read->read_common.read_id);
     return spike_ranges;

From f8f8f0690b8132e1694a727cf45800027085e671 Mon Sep 17 00:00:00 2001
From: Ben Lawrence <ben.lawrence@nanoporetech.com>
Date: Wed, 28 Feb 2024 10:19:49 +0000
Subject: [PATCH 18/21] [DOR-526] Add info to early return traces

---
 dorado/splitter/DuplexReadSplitter.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/dorado/splitter/DuplexReadSplitter.cpp b/dorado/splitter/DuplexReadSplitter.cpp
index f85a48ab..1c132636 100644
--- a/dorado/splitter/DuplexReadSplitter.cpp
+++ b/dorado/splitter/DuplexReadSplitter.cpp
@@ -214,7 +214,8 @@ PosRanges DuplexReadSplitter::find_muA_adapter_spikes(const ExtRead& read) const
     // Skip if the read is too small.
     const auto& read_seq = read.read->read_common.seq;
     if (read_seq.size() < min_read_len) {
-        spdlog::trace("Read too small");
+        spdlog::trace("Read too small: id={} size={}", read.read->read_common.read_id,
+                      read_seq.size());
         return {};
     }
 
@@ -276,6 +277,7 @@ PosRanges DuplexReadSplitter::find_muA_adapter_spikes(const ExtRead& read) const
     const auto muA_ranges =
             match_start_range(muA_seq, ignore_start, read_seq.size(), max_muA_edist, false);
     if (muA_ranges.empty()) {
+        spdlog::trace("No muA found in read: id={}", read.read->read_common.read_id);
         return {};
     }
 

From 88e8f370ced4b6a75c172f2102045a89e62a3926 Mon Sep 17 00:00:00 2001
From: Ben Lawrence <ben.lawrence@nanoporetech.com>
Date: Wed, 28 Feb 2024 10:20:30 +0000
Subject: [PATCH 19/21] [DOR-526] Be consistent with use of include guards

---
 dorado/splitter/myers.h | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/dorado/splitter/myers.h b/dorado/splitter/myers.h
index 01209692..ac9bba85 100644
--- a/dorado/splitter/myers.h
+++ b/dorado/splitter/myers.h
@@ -1,5 +1,4 @@
-#ifndef MYERS_BIT_H
-#define MYERS_BIT_H
+#pragma once
 
 #include <iosfwd>
 #include <string_view>
@@ -19,5 +18,3 @@ std::vector<EdistResult> myers_align(std::string_view query,
 void print_edists(std::ostream& os, std::string_view seq, const std::vector<size_t>& edists);
 
 }  // namespace dorado::splitter
-
-#endif

From dc590314fa8aed52b247e129a2aad64056c37733 Mon Sep 17 00:00:00 2001
From: Ben Lawrence <ben.lawrence@nanoporetech.com>
Date: Wed, 6 Mar 2024 11:43:11 +0000
Subject: [PATCH 20/21] [DOR-526] Deduplicate the ranges as a safeguard

---
 dorado/splitter/myers.cpp | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/dorado/splitter/myers.cpp b/dorado/splitter/myers.cpp
index 231d915c..e48edf21 100644
--- a/dorado/splitter/myers.cpp
+++ b/dorado/splitter/myers.cpp
@@ -3,6 +3,7 @@
 #include "utils/PostCondition.h"
 #include "utils/alignment_utils.h"
 
+#include <algorithm>
 #include <cassert>
 #include <cstdint>
 #include <iomanip>
@@ -134,6 +135,15 @@ std::vector<EdistResult> myers_align(std::string_view query,
         add_match(*current_range_end, min_edist);
     }
 
+    // Deduplicate the ranges.
+    auto less = [](const EdistResult& lhs, const EdistResult& rhs) { return lhs.end < rhs.end; };
+    auto equal = [](const EdistResult& lhs, const EdistResult& rhs) {
+        return lhs.begin == rhs.begin && lhs.end == rhs.end;
+    };
+    std::sort(ranges.begin(), ranges.end(), less);
+    auto new_end = std::unique(ranges.begin(), ranges.end(), equal);
+    ranges.erase(new_end, ranges.end());
+
     return ranges;
 }
 

From a4cb934bfb37892e7e6423307aba1a7cd1fb4f7c Mon Sep 17 00:00:00 2001
From: Ben Lawrence <ben.lawrence@nanoporetech.com>
Date: Thu, 7 Mar 2024 17:13:03 +0000
Subject: [PATCH 21/21] [DOR-526] Bump adapter edist from 3 to 5

---
 dorado/splitter/DuplexReadSplitter.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dorado/splitter/DuplexReadSplitter.cpp b/dorado/splitter/DuplexReadSplitter.cpp
index 1c132636..24c29714 100644
--- a/dorado/splitter/DuplexReadSplitter.cpp
+++ b/dorado/splitter/DuplexReadSplitter.cpp
@@ -206,7 +206,7 @@ PosRanges DuplexReadSplitter::find_muA_adapter_spikes(const ExtRead& read) const
     constexpr std::int64_t min_read_len = 200;
     // 14bp adapter prefix that should work better for click chemistry based on previous investigation
     constexpr std::string_view adapter_seq = "TACTTCGTTCAGTT";
-    constexpr std::int64_t max_adapter_edist = 3;
+    constexpr std::int64_t max_adapter_edist = 5;
     constexpr std::int64_t max_muA_adapter_dist = 100;
     constexpr std::int64_t max_spike_adapter_dist = 50;
     constexpr float max_spike_qscore = 10;