From 1446320f7db5352722933f06ce5764879de8f68f Mon Sep 17 00:00:00 2001 From: Ben Lawrence Date: Thu, 15 Feb 2024 11:41:10 +0000 Subject: [PATCH 01/21] [DOR-526] Factor out 3rdparty libs into their own CMakeLists.txt --- CMakeLists.txt | 32 ++------------------------------ dorado/3rdparty/CMakeLists.txt | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 30 deletions(-) create mode 100644 dorado/3rdparty/CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index 2b389c83..56b07dee 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -116,36 +116,8 @@ if(CMAKE_SYSTEM_NAME STREQUAL "Linux") endif() endif() -# Bring in spdlog -add_subdirectory(${DORADO_3RD_PARTY_SOURCE}/spdlog) -# Avoid namespace clashes with static torch. -target_compile_definitions(spdlog PUBLIC - "FMT_BEGIN_NAMESPACE=namespace fmt { inline namespace ont {" - "FMT_END_NAMESPACE=}}" -) - -# ELZIP_DECOMPRESS_ONLY stops minizip from adding OpenSSL as a target, preventing use of three dylibs on osx. -set(ELZIP_DECOMPRESS_ONLY ON) -add_subdirectory(${DORADO_3RD_PARTY_SOURCE}/elzip) - -option(ONT_MM2_EXE "Build minimap2 exe" OFF) -add_subdirectory(${DORADO_3RD_PARTY_SOURCE}/ont-minimap2 EXCLUDE_FROM_ALL) -message(STATUS "build minimap2 exe: ${ONT_MM2_EXE}") -message(STATUS "build minimap2 lib: ${ONT_MM2_LIB}") -if (ONT_MM2_EXE) - install(TARGETS minimap2_exe) - add_custom_target(testing_deps ALL DEPENDS minimap2_exe) -endif() -# Disable warnings from minimap source -disable_warnings(minimap2) - -set(BUILD_TESTING OFF) -add_subdirectory(${DORADO_3RD_PARTY_SOURCE}/edlib EXCLUDE_FROM_ALL) -# Disable warnings from edlib source -disable_warnings(edlib) -set(BUILD_TESTING ON) - -add_subdirectory(${DORADO_3RD_PARTY_SOURCE}/date EXCLUDE_FROM_ALL) +# Bring in 3rdparty libs +add_subdirectory(dorado/3rdparty) enable_testing() diff --git a/dorado/3rdparty/CMakeLists.txt b/dorado/3rdparty/CMakeLists.txt new file mode 100644 index 00000000..a4e2416a --- /dev/null +++ b/dorado/3rdparty/CMakeLists.txt @@ -0,0 +1,33 @@ +# Bring in spdlog +add_subdirectory(spdlog) +# Avoid namespace clashes with static torch. +target_compile_definitions(spdlog PUBLIC + "FMT_BEGIN_NAMESPACE=namespace fmt { inline namespace ont {" + "FMT_END_NAMESPACE=}}" +) + +# ELZIP_DECOMPRESS_ONLY stops minizip from adding OpenSSL as a target, preventing use of three dylibs on osx. +set(ELZIP_DECOMPRESS_ONLY ON) +add_subdirectory(elzip) + +# minimap +option(ONT_MM2_EXE "Build minimap2 exe" OFF) +add_subdirectory(ont-minimap2 EXCLUDE_FROM_ALL) +message(STATUS "build minimap2 exe: ${ONT_MM2_EXE}") +message(STATUS "build minimap2 lib: ${ONT_MM2_LIB}") +if (ONT_MM2_EXE) + install(TARGETS minimap2_exe) + add_custom_target(testing_deps ALL DEPENDS minimap2_exe) +endif() +# Disable warnings from minimap source +disable_warnings(minimap2) + +# edlib +set(BUILD_TESTING OFF) +add_subdirectory(edlib EXCLUDE_FROM_ALL) +# Disable warnings from edlib source +disable_warnings(edlib) +set(BUILD_TESTING ON) + +# date +add_subdirectory(date EXCLUDE_FROM_ALL) From bf85adb787c39ac7ec7d01510ce38ab55c1bbc20 Mon Sep 17 00:00:00 2001 From: Ben Lawrence Date: Thu, 15 Feb 2024 11:46:22 +0000 Subject: [PATCH 02/21] [DOR-526] Add missing EXCLUDE_FROM_ALL from 3rdparty libs These libs don't build anything extra, but worth being consistent in case future versions do. --- dorado/3rdparty/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dorado/3rdparty/CMakeLists.txt b/dorado/3rdparty/CMakeLists.txt index a4e2416a..3ddd5103 100644 --- a/dorado/3rdparty/CMakeLists.txt +++ b/dorado/3rdparty/CMakeLists.txt @@ -1,5 +1,5 @@ # Bring in spdlog -add_subdirectory(spdlog) +add_subdirectory(spdlog EXCLUDE_FROM_ALL) # Avoid namespace clashes with static torch. target_compile_definitions(spdlog PUBLIC "FMT_BEGIN_NAMESPACE=namespace fmt { inline namespace ont {" @@ -8,7 +8,7 @@ target_compile_definitions(spdlog PUBLIC # ELZIP_DECOMPRESS_ONLY stops minizip from adding OpenSSL as a target, preventing use of three dylibs on osx. set(ELZIP_DECOMPRESS_ONLY ON) -add_subdirectory(elzip) +add_subdirectory(elzip EXCLUDE_FROM_ALL) # minimap option(ONT_MM2_EXE "Build minimap2 exe" OFF) From e3ef3edddf9c0ab7c36bc583e2fc26730eead345 Mon Sep 17 00:00:00 2001 From: Ben Lawrence Date: Thu, 15 Feb 2024 12:18:03 +0000 Subject: [PATCH 03/21] [DOR-526] Neaten the split finder builder code The indentation was killing me so replace some parts with functionally identical code that clang-format doesn't try to mess with as much. --- dorado/splitter/DuplexReadSplitter.cpp | 107 +++++++++++-------------- 1 file changed, 48 insertions(+), 59 deletions(-) diff --git a/dorado/splitter/DuplexReadSplitter.cpp b/dorado/splitter/DuplexReadSplitter.cpp index 3165c654..93e27a3e 100644 --- a/dorado/splitter/DuplexReadSplitter.cpp +++ b/dorado/splitter/DuplexReadSplitter.cpp @@ -377,67 +377,56 @@ std::vector DuplexReadSplitter::subreads(SimplexReadPtr read, std::vector> DuplexReadSplitter::build_split_finders() const { std::vector> split_finders; - split_finders.push_back({"PORE_ADAPTER", [&](const ExtRead& read) { - return filter_ranges(read.possible_pore_regions, [&](PosRange r) { - return check_nearby_adapter(*read.read, r, - m_settings.adapter_edist); - }); - }}); + split_finders.emplace_back(std::make_pair("PORE_ADAPTER", [&](const ExtRead& read) { + return filter_ranges(read.possible_pore_regions, [&](PosRange r) { + return check_nearby_adapter(*read.read, r, m_settings.adapter_edist); + }); + })); if (!m_settings.simplex_mode) { - split_finders.push_back( - {"PORE_FLANK", [&](const ExtRead& read) { - return merge_ranges( - filter_ranges(read.possible_pore_regions, - [&](PosRange r) { - return check_flank_match(*read.read, r, - m_settings.flank_err); - }), - m_settings.strand_end_flank + m_settings.strand_start_flank); - }}); - - split_finders.push_back( - {"PORE_ALL", [&](const ExtRead& read) { - return merge_ranges( - filter_ranges(read.possible_pore_regions, - [&](PosRange r) { - return check_nearby_adapter( - *read.read, r, - m_settings.relaxed_adapter_edist) && - check_flank_match( - *read.read, r, - m_settings.relaxed_flank_err); - }), - m_settings.strand_end_flank + m_settings.strand_start_flank); - }}); - - split_finders.push_back( - {"ADAPTER_FLANK", [&](const ExtRead& read) { - return filter_ranges( - find_adapter_matches(m_settings.adapter, read.read->read_common.seq, - m_settings.adapter_edist, - m_settings.expect_adapter_prefix), - [&](PosRange r) { - return check_flank_match(*read.read, {r.first, r.first}, - m_settings.flank_err); - }); - }}); - - split_finders.push_back({"ADAPTER_MIDDLE", [&](const ExtRead& read) { - if (auto split = identify_middle_adapter_split(*read.read)) { - return PosRanges{*split}; - } else { - return PosRanges(); - } - }}); - - split_finders.push_back({"SPLIT_MIDDLE", [&](const ExtRead& read) { - if (auto split = identify_extra_middle_split(*read.read)) { - return PosRanges{*split}; - } else { - return PosRanges(); - } - }}); + split_finders.emplace_back(std::make_pair("PORE_FLANK", [&](const ExtRead& read) { + auto filter = [&](PosRange r) { + return check_flank_match(*read.read, r, m_settings.flank_err); + }; + return merge_ranges(filter_ranges(read.possible_pore_regions, filter), + m_settings.strand_end_flank + m_settings.strand_start_flank); + })); + + split_finders.emplace_back(std::make_pair("PORE_ALL", [&](const ExtRead& read) { + auto filter = [&](PosRange r) { + return check_nearby_adapter(*read.read, r, m_settings.relaxed_adapter_edist) && + check_flank_match(*read.read, r, m_settings.relaxed_flank_err); + }; + return merge_ranges(filter_ranges(read.possible_pore_regions, filter), + m_settings.strand_end_flank + m_settings.strand_start_flank); + })); + + split_finders.emplace_back(std::make_pair("ADAPTER_FLANK", [&](const ExtRead& read) { + auto filter = [&](PosRange r) { + return check_flank_match(*read.read, {r.first, r.first}, m_settings.flank_err); + }; + return filter_ranges( + find_adapter_matches(m_settings.adapter, read.read->read_common.seq, + m_settings.adapter_edist, + m_settings.expect_adapter_prefix), + filter); + })); + + split_finders.emplace_back(std::make_pair("ADAPTER_MIDDLE", [&](const ExtRead& read) { + if (auto split = identify_middle_adapter_split(*read.read)) { + return PosRanges{*split}; + } else { + return PosRanges(); + } + })); + + split_finders.emplace_back(std::make_pair("SPLIT_MIDDLE", [&](const ExtRead& read) { + if (auto split = identify_extra_middle_split(*read.read)) { + return PosRanges{*split}; + } else { + return PosRanges(); + } + })); } return split_finders; From 2dbfe45025d81df94f8691cdaa1ab1a65f9d1457 Mon Sep 17 00:00:00 2001 From: Ben Lawrence Date: Wed, 21 Feb 2024 10:56:32 +0000 Subject: [PATCH 04/21] [DOR-526] Add direct port of the notebook code This is currently limited to a single region per read, which will be extended in future commits. --- dorado/splitter/DuplexReadSplitter.cpp | 86 ++++++++++++++++++++++++++ dorado/splitter/DuplexReadSplitter.h | 3 +- 2 files changed, 88 insertions(+), 1 deletion(-) diff --git a/dorado/splitter/DuplexReadSplitter.cpp b/dorado/splitter/DuplexReadSplitter.cpp index 93e27a3e..708a1288 100644 --- a/dorado/splitter/DuplexReadSplitter.cpp +++ b/dorado/splitter/DuplexReadSplitter.cpp @@ -2,6 +2,7 @@ #include "read_pipeline/read_utils.h" #include "splitter/splitter_utils.h" +#include "utils/PostCondition.h" #include "utils/alignment_utils.h" #include "utils/sequence_utils.h" #include "utils/uuid_utils.h" @@ -16,6 +17,7 @@ #include #include #include +#include #include namespace { @@ -199,6 +201,87 @@ PosRanges DuplexReadSplitter::possible_pore_regions(const DuplexReadSplitter::Ex return pore_regions; } +PosRanges DuplexReadSplitter::find_muA_adapter_splits(const ExtRead& read) const { + spdlog::trace("Finding muA regions for read {}", read.read->read_common.read_id); + + constexpr std::string_view muA_seq = "GTTTTCGCATTTATCGTGAAACGCTTTCGCGTTTTTCGTGCGCCGCTTCA"; + constexpr std::int64_t max_muA_edist = 11; + constexpr std::int64_t ignore_start = 100; + constexpr std::int64_t min_read_len = 200; + // 14bp adapter prefix that should work better for click chemistry based on previous investigation + constexpr std::string_view adapter_seq = "TACTTCGTTCAGTT"; + constexpr std::int64_t max_adapter_edist = 5; + constexpr std::int64_t max_muA_adapter_dist = 100; + + // Skip if the read is too small. + const auto& read_seq = read.read->read_common.seq; + if (read_seq.size() < min_read_len) { + spdlog::trace("Read too small"); + return {}; + } + + auto match_start_range = [&](std::string_view query, std::int64_t min_start, + std::int64_t max_start, + std::int64_t max_edist) -> std::optional { + const auto max_end_pos = + std::min(read_seq.size(), max_start + query.size() + max_edist); + if (min_start + static_cast(query.size()) > max_end_pos) { + // Too close to the end. + spdlog::trace("Too close to the end"); + return std::nullopt; + } + + // Trim the sequence. + std::string_view const seq(read_seq.data() + min_start, max_end_pos - min_start); + + // Search the sequence. + static const EdlibEqualityPair additionalEqualities[4] = { + {'N', 'A'}, {'N', 'T'}, {'N', 'C'}, {'N', 'G'}}; + auto edlib_cfg = edlibNewAlignConfig(max_edist, EDLIB_MODE_HW, EDLIB_TASK_LOC, + additionalEqualities, std::size(additionalEqualities)); + auto edlib_result = + edlibAlign(query.data(), query.size(), seq.data(), seq.size(), edlib_cfg); + assert(edlib_result.status == EDLIB_STATUS_OK); + auto edlib_cleanup = utils::PostCondition([&] { edlibFreeAlignResult(edlib_result); }); + + // Check we got a match. + if (edlib_result.status != EDLIB_STATUS_OK || edlib_result.editDistance == -1) { + spdlog::trace("No match {} {}", edlib_result.status, edlib_result.editDistance); + return std::nullopt; + } + + // Build the result. + PosRange range; + range.first = min_start + edlib_result.startLocations[0]; + range.second = min_start + edlib_result.endLocations[0] + 1; + if (static_cast(range.first) > max_start) { + spdlog::trace("Past the start"); + return std::nullopt; + } + return range; + }; + + // Search for muA. + auto muA_range = match_start_range(muA_seq, ignore_start, read_seq.size(), max_muA_edist); + if (!muA_range.has_value()) { + spdlog::trace("No muA"); + return {}; + } + + // Search for the adapter. + const auto adapter_start = std::max( + ignore_start, static_cast(muA_range->first) - max_muA_adapter_dist); + auto adapter_match = + match_start_range(adapter_seq, adapter_start, muA_range->first, max_adapter_edist); + if (!adapter_match.has_value()) { + spdlog::trace("No adapter"); + return {}; + } + + spdlog::trace("Detected muA+adapter region in read {}", read.read->read_common.read_id); + return {*adapter_match}; +} + bool DuplexReadSplitter::check_nearby_adapter(const SimplexRead& read, PosRange r, int adapter_edist) const { @@ -383,6 +466,9 @@ DuplexReadSplitter::build_split_finders() const { }); })); + split_finders.emplace_back(std::make_pair( + "muA_adapter", [&](const ExtRead& read) { return find_muA_adapter_splits(read); })); + if (!m_settings.simplex_mode) { split_finders.emplace_back(std::make_pair("PORE_FLANK", [&](const ExtRead& read) { auto filter = [&](PosRange r) { diff --git a/dorado/splitter/DuplexReadSplitter.h b/dorado/splitter/DuplexReadSplitter.h index 2b1fe200..28466cf5 100644 --- a/dorado/splitter/DuplexReadSplitter.h +++ b/dorado/splitter/DuplexReadSplitter.h @@ -23,7 +23,8 @@ class DuplexReadSplitter : public ReadSplitter { using SplitFinderF = std::function; ExtRead create_ext_read(SimplexReadPtr r) const; - std::vector possible_pore_regions(const ExtRead& read) const; + PosRanges possible_pore_regions(const ExtRead& read) const; + PosRanges find_muA_adapter_splits(const ExtRead& read) const; bool check_nearby_adapter(const SimplexRead& read, splitter::PosRange r, int adapter_edist) const; From a0e9bccf669012f5da57fc703856a44aaf44abee Mon Sep 17 00:00:00 2001 From: Ben Lawrence Date: Wed, 21 Feb 2024 11:22:16 +0000 Subject: [PATCH 05/21] [DOR-526] Add tests for PostCondition --- tests/CMakeLists.txt | 1 + tests/PostConditionTest.cpp | 43 +++++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) create mode 100644 tests/PostConditionTest.cpp diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index a9fee034..000e7b4b 100755 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -34,6 +34,7 @@ add_executable(dorado_tests PairingNodeTest.cpp PipelineTest.cpp PolyACalculatorTest.cpp + PostConditionTest.cpp ReadFilterNodeTest.cpp ReadForwarderNodeTest.cpp ReadTest.cpp diff --git a/tests/PostConditionTest.cpp b/tests/PostConditionTest.cpp new file mode 100644 index 00000000..ccb1243b --- /dev/null +++ b/tests/PostConditionTest.cpp @@ -0,0 +1,43 @@ +#include "utils/PostCondition.h" + +#include + +#define CUT_TAG "[PostCondition]" +#define DEFINE_TEST(name) TEST_CASE(CUT_TAG " " name, CUT_TAG) + +using dorado::utils::PostCondition; + +DEFINE_TEST("Condition isn't triggered right away") { + int counter = 0; + auto on_scope_end = PostCondition([&] { counter++; }); + CHECK(counter == 0); +} + +DEFINE_TEST("Condition is triggered on scope end") { + int counter = 0; + { + auto on_scope_end = PostCondition([&] { counter++; }); + CHECK(counter == 0); + } + CHECK(counter == 1); +} + +DEFINE_TEST("Multiple scopes") { + int counter = 0; + { + const int outer_increment = 1; + auto outer_scope = PostCondition([&] { counter -= outer_increment; }); + CHECK(counter == 0); + counter += outer_increment; + CHECK(counter == outer_increment); + { + const int inner_increment = 2; + auto inner_scope = PostCondition([&] { counter -= inner_increment; }); + CHECK(counter == outer_increment); + counter += inner_increment; + CHECK(counter == outer_increment + inner_increment); + } + CHECK(counter == outer_increment); + } + CHECK(counter == 0); +} From 952aab93739ea1b64c751f6f889caf05bf0b626d Mon Sep 17 00:00:00 2001 From: Ben Lawrence Date: Wed, 21 Feb 2024 11:22:54 +0000 Subject: [PATCH 06/21] [DOR-526] Improve PostCondition to make it harder to misuse Also drop the dependency on std::function. --- dorado/utils/PostCondition.h | 21 +++++++++++++-------- tests/AlignerTest.cpp | 3 ++- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/dorado/utils/PostCondition.h b/dorado/utils/PostCondition.h index 0dc99396..38321b36 100644 --- a/dorado/utils/PostCondition.h +++ b/dorado/utils/PostCondition.h @@ -1,19 +1,24 @@ #pragma once -#include + +#include namespace dorado::utils { +namespace detail { +template class PostCondition { public: - PostCondition(std::function func) : m_func(func) {} - ~PostCondition() { - if (m_func) { - m_func(); - } - } + PostCondition(Func&& func) : m_func(std::move(func)) {} + ~PostCondition() { m_func(); } private: - std::function m_func; + Func m_func; }; +} // namespace detail + +template +[[nodiscard]] auto PostCondition(Func function) { + return detail::PostCondition(std::move(function)); +} } // namespace dorado::utils diff --git a/tests/AlignerTest.cpp b/tests/AlignerTest.cpp index c0316a36..c6ba4a41 100644 --- a/tests/AlignerTest.cpp +++ b/tests/AlignerTest.cpp @@ -115,7 +115,8 @@ class AlignerNodeTestFixture { dorado::utils::add_sq_hdr(header.get(), aligner_ref.get_sequence_records_for_header()); auto sam_line_buffer = dorado::utils::allocate_kstring(); - dorado::utils::PostCondition free_buffer([&sam_line_buffer] { ks_free(&sam_line_buffer); }); + auto free_buffer = + dorado::utils::PostCondition([&sam_line_buffer] { ks_free(&sam_line_buffer); }); CHECK(sam_format1(header.get(), bam_record.get(), &sam_line_buffer) >= 0); return std::string(ks_str(&sam_line_buffer), ks_len(&sam_line_buffer)); From 178fc32dc604de8adb63dfdaf29e6f2e507621c6 Mon Sep 17 00:00:00 2001 From: Ben Lawrence Date: Wed, 21 Feb 2024 17:44:18 +0000 Subject: [PATCH 07/21] [DOR-526] Add Nikolai's editdistance implementation Also add some tests for it. --- CMakeLists.txt | 2 + dorado/splitter/myers.cpp | 153 ++++++++++++++++++++++++++++++++++++++ dorado/splitter/myers.h | 23 ++++++ tests/CMakeLists.txt | 1 + tests/myers_test.cpp | 95 +++++++++++++++++++++++ 5 files changed, 274 insertions(+) create mode 100644 dorado/splitter/myers.cpp create mode 100644 dorado/splitter/myers.h create mode 100644 tests/myers_test.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 56b07dee..a3fbc74a 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -217,6 +217,8 @@ set(LIB_SOURCE_FILES dorado/splitter/DuplexReadSplitter.h dorado/splitter/RNAReadSplitter.cpp dorado/splitter/RNAReadSplitter.h + dorado/splitter/myers.cpp + dorado/splitter/myers.h dorado/splitter/splitter_utils.cpp dorado/splitter/splitter_utils.h dorado/demux/AdapterDetector.cpp diff --git a/dorado/splitter/myers.cpp b/dorado/splitter/myers.cpp new file mode 100644 index 00000000..0ab9c70f --- /dev/null +++ b/dorado/splitter/myers.cpp @@ -0,0 +1,153 @@ +#include "myers.h" + +#include "utils/PostCondition.h" +#include "utils/alignment_utils.h" + +#include +#include +#include +#include +#include + +namespace dorado::splitter { + +namespace { + +// Returns the array D[0..n], where D[i] is equal to local alignment of the pattern to suffix of text[0..i). +std::vector d_myers(const char* pattern, size_t m, const char* text, size_t n) { + constexpr size_t MAX_ALPHABET = 256; + assert(m < 64); + uint64_t PM[MAX_ALPHABET]{}; + for (size_t i = 0; i < m; i++) { + PM[static_cast(pattern[i])] |= uint64_t{1} << i; + } + + std::vector D(n + 1); + uint64_t VP = ~uint64_t{0}; + uint64_t VN = 0; + size_t score = m; + D[0] = score; + for (size_t j = 0; j < n; j++) { + const uint64_t EQ = PM[static_cast(text[j])]; + const uint64_t D0 = (((EQ & VP) + VP) ^ VP) | EQ | VN; + uint64_t HP = VN | ~(D0 | VP); + uint64_t HN = D0 & VP; + + if (HP & (uint64_t{1} << (m - 1))) { + score++; + } + if (HN & (uint64_t{1} << (m - 1))) { + score--; + } + D[j + 1] = score; + + HP <<= 1; + HN <<= 1; + VP = HN | ~(D0 | HP); + VN = D0 & HP; + } + return D; +} + +} // namespace + +std::vector myers_align(std::string_view query, + std::string_view seq, + std::size_t max_edist) { + std::vector ranges; + const auto query_len = query.size(); + if (seq.size() < query_len) { + // Too small, don't bother. + return ranges; + } + + auto add_match = [&](std::size_t end, std::size_t edist) { + // |edist| is for the full query ending at |end|, so we know the earliest that the match can start. + const auto max_match_len = std::min(query_len + edist, end); + const auto min_match_start = end - max_match_len; + + if (edist == 0) { + // Exact match, nothing more to do. + ranges.push_back({min_match_start, end, edist}); + + } else { + // If this isn't an exact match then hand over to edlib to get the start index. + const auto max_match_span = seq.substr(min_match_start, max_match_len); + auto edlib_cfg = edlibNewAlignConfig(static_cast(edist), EDLIB_MODE_HW, + EDLIB_TASK_LOC, nullptr, 0); + auto edlib_result = + edlibAlign(query.data(), static_cast(query.size()), max_match_span.data(), + static_cast(max_match_span.size()), edlib_cfg); + assert(edlib_result.status == EDLIB_STATUS_OK); + auto edlib_cleanup = utils::PostCondition([&] { edlibFreeAlignResult(edlib_result); }); + + if (edlib_result.status == EDLIB_STATUS_OK) { + // edlib only reports the best edist it finds, so if there's a better match in the same span then it'll + // report that instead. This can happen for spans that are close together, for example: + // edists:...,7,6,5,5,4,5,6,6,6,5,6,..., max_edist=5 + // end1=^ end2=^ + // When processing 'end2' we have to extend our search span to include that of 'end1' (since it's + // |max_edist| indices away). This means that, if the edits aren't insertions, we end up finding the + // same sequence as |end1|, which has a better edist and hence is the only thing edlib reports. + // We should be safe to ignore the worse edist in those cases. + if (edlib_result.editDistance == static_cast(edist)) { + ranges.push_back( + {min_match_start + edlib_result.startLocations[0], end, edist}); + } + } + } + }; + + // Calculate edit distances for each index. + const auto local_edists = d_myers(query.data(), query.size(), seq.data(), seq.size()); + + // Look for drops below the threshold and join neighbouring ranges together. + // + // TODO: can we improve on this, since we have cases such as: + // local_edists = ...,7,6,5,6,6,5,5,6,5,4,5,5,6,7,..., max_edist = 5 + // hits: [^] [^ ] [ ^ ] + // where these hits should probably be grouped together. + // + std::optional current_range_end; + std::size_t min_edist = max_edist + 1; + for (std::size_t current_end = query_len; current_end < local_edists.size(); current_end++) { + // See if this is a better match. + const auto current_edist = local_edists[current_end]; + if (current_edist <= max_edist && current_edist < min_edist) { + min_edist = current_edist; + current_range_end = current_end; + } else if (current_range_end.has_value() && current_edist > max_edist) { + // We're back out of the threshold, so end the current range. + add_match(*current_range_end, min_edist); + current_range_end.reset(); + min_edist = max_edist + 1; + } + } + + // End the current range if we're in one. + if (current_range_end.has_value()) { + add_match(*current_range_end, min_edist); + } + + return ranges; +} + +void print_edists(std::ostream& os, std::string_view seq, const std::vector& edists) { + assert(edists.size() == seq.size() + 1); + + // Print the sequence with spaces to match the edists. + for (char c : seq) { + os << " " << c; + } + os << '\n'; + + // Print the edists. Maximum edist is 64, so we only need to pad 2 spaces. + const auto old_fill = os.fill(' '); + for (size_t s : edists) { + os << std::setw(3) << s; + } + os.fill(old_fill); + os << '\n'; +} + +} // namespace dorado::splitter diff --git a/dorado/splitter/myers.h b/dorado/splitter/myers.h new file mode 100644 index 00000000..01209692 --- /dev/null +++ b/dorado/splitter/myers.h @@ -0,0 +1,23 @@ +#ifndef MYERS_BIT_H +#define MYERS_BIT_H + +#include +#include +#include + +namespace dorado::splitter { + +struct EdistResult { + std::size_t begin; // inclusive + std::size_t end; // exclusive + std::size_t edist; +}; +std::vector myers_align(std::string_view query, + std::string_view seq, + std::size_t max_edist); + +void print_edists(std::ostream& os, std::string_view seq, const std::vector& edists); + +} // namespace dorado::splitter + +#endif diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 000e7b4b..286c127d 100755 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -31,6 +31,7 @@ add_executable(dorado_tests ModelMetadataTest.cpp ModelUtilsTest.cpp MotifMatcherTest.cpp + myers_test.cpp PairingNodeTest.cpp PipelineTest.cpp PolyACalculatorTest.cpp diff --git a/tests/myers_test.cpp b/tests/myers_test.cpp new file mode 100644 index 00000000..0bf7c759 --- /dev/null +++ b/tests/myers_test.cpp @@ -0,0 +1,95 @@ +#include "splitter/myers.h" + +#include + +#define CUT_TAG "[myers]" +#define DEFINE_TEST(name) TEST_CASE(CUT_TAG " " name, CUT_TAG) + +using dorado::splitter::EdistResult; +using dorado::splitter::myers_align; + +DEFINE_TEST("Basic alignment, single hit") { + const std::string_view query = "AAA"; + const std::string_view seq = "GGGCCCAAATTT"; + + // The same location should be hit for all edists. + const auto max_edist = GENERATE(0, 1, 2); + CAPTURE(max_edist); + + const auto alignments = myers_align(query, seq, max_edist); + REQUIRE(alignments.size() == 1); + CHECK(alignments[0].begin == 6); + CHECK(alignments[0].end == 9); + CHECK(alignments[0].edist == 0); +} + +DEFINE_TEST("Basic alignment, multiple hits") { + const std::string_view query = "CCC"; + const std::string_view seq = "GGGCCCAAATTTCCCGGG"; + + // The same locations should be hit for all edists. + const auto max_edist = GENERATE(0, 1, 2); + CAPTURE(max_edist); + + const auto alignments = myers_align(query, seq, max_edist); + REQUIRE(alignments.size() == 2); + CHECK(alignments[0].begin == 3); + CHECK(alignments[0].end == 6); + CHECK(alignments[0].edist == 0); + CHECK(alignments[1].begin == 12); + CHECK(alignments[1].end == 15); + CHECK(alignments[1].edist == 0); +} + +DEFINE_TEST("Basic alignment, hit at end") { + const std::string_view query = "TTT"; + const std::string_view seq = "GGGCCCAAATTT"; + + // The same location should be hit for all edists. + const auto max_edist = GENERATE(0, 1, 2); + CAPTURE(max_edist); + + const auto alignments = myers_align(query, seq, max_edist); + REQUIRE(alignments.size() == 1); + CHECK(alignments[0].begin == 9); + CHECK(alignments[0].end == 12); + CHECK(alignments[0].edist == 0); +} + +DEFINE_TEST("Complex alignment, multiple hits") { + const std::string_view query = "TACTTCGTTCAGTT"; + const std::string_view seq = + "CTGTCGAGACCCTT" + "TACTTCTTCTT" // match 0 + "CACCAA" + "TATTGTTATGTT" // match 1 + "ATGTAGCC" + "TGCTTCGTTCGGTT" // match 2 + "ATGCGCCGCCAATATTAACCTCGGTAAAA" + "TATCTTCGACCCAGTT" // match 3 + "TTCGCGTCT"; + const auto max_edist = 4; + + const auto alignments = myers_align(query, seq, max_edist); + REQUIRE(alignments.size() == 4); + CHECK(alignments[0].begin == 14); + CHECK(alignments[0].end == 25); + CHECK(alignments[0].edist == 3); + CHECK(alignments[1].begin == 31); + CHECK(alignments[1].end == 43); + CHECK(alignments[1].edist == 4); + CHECK(alignments[2].begin == 51); + CHECK(alignments[2].end == 65); + CHECK(alignments[2].edist == 2); + CHECK(alignments[3].begin == 94); + CHECK(alignments[3].end == 110); + CHECK(alignments[3].edist == 4); +} + +DEFINE_TEST("Complex alignment, doesn't crash when high edist near start") { + const std::string_view query = "TACTTCGTTCAGTT"; + const std::string_view seq = "TTTTTTTTTTCTCCTGTTCTTGGTTCGGTTGT"; + const auto max_edist = 5; + const auto alignments = myers_align(query, seq, max_edist); + CHECK(!alignments.empty()); +} From 63644d9dcb67dcea56004ca4d594f98354b99c60 Mon Sep 17 00:00:00 2001 From: Ben Lawrence Date: Wed, 21 Feb 2024 17:48:25 +0000 Subject: [PATCH 08/21] [DOR-526] Small cmake simplification Spotted while adding myers.*. --- CMakeLists.txt | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a3fbc74a..177d6132 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -136,7 +136,7 @@ endif() configure_file(dorado/Version.h.in dorado/Version.h) -set(LIB_SOURCE_FILES +add_library(dorado_lib dorado/alignment/alignment_processing_items.cpp dorado/alignment/alignment_processing_items.h dorado/alignment/IndexFileAccess.cpp @@ -245,9 +245,7 @@ set(LIB_SOURCE_FILES dorado/poly_tail/rna_poly_tail_calculator.h dorado/summary/summary.cpp dorado/summary/summary.h - ) - -add_library(dorado_lib ${LIB_SOURCE_FILES}) +) enable_warnings_as_errors(dorado_lib) From fc5c088cace51a987b502d25e259575a3d99feb9 Mon Sep 17 00:00:00 2001 From: Ben Lawrence Date: Wed, 21 Feb 2024 17:52:21 +0000 Subject: [PATCH 09/21] [DOR-526] Move the anonymous namespace into dorado::splitter This matches the other files and allows us to use the types without having to alias them. --- dorado/splitter/DuplexReadSplitter.cpp | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/dorado/splitter/DuplexReadSplitter.cpp b/dorado/splitter/DuplexReadSplitter.cpp index 708a1288..5f5d5f85 100644 --- a/dorado/splitter/DuplexReadSplitter.cpp +++ b/dorado/splitter/DuplexReadSplitter.cpp @@ -20,13 +20,9 @@ #include #include -namespace { - -using namespace dorado; -using namespace dorado::splitter; +namespace dorado::splitter { -using PosRange = splitter::PosRange; -using PosRanges = splitter::PosRanges; +namespace { //[start, end) std::optional find_best_adapter_match(const std::string& adapter, @@ -121,8 +117,6 @@ float qscore_mean(const std::string& qstring, splitter::PosRange r) { } // namespace -namespace dorado::splitter { - struct DuplexReadSplitter::ExtRead { SimplexReadPtr read; at::Tensor data_as_float32; From c8819e0182c0f74d593cf6c8d95c1c2b8abf9adb Mon Sep 17 00:00:00 2001 From: Ben Lawrence Date: Wed, 21 Feb 2024 18:02:33 +0000 Subject: [PATCH 10/21] [DOR-526] Switch over to using multi-edist for searching --- dorado/splitter/DuplexReadSplitter.cpp | 71 ++++++++++++-------------- 1 file changed, 32 insertions(+), 39 deletions(-) diff --git a/dorado/splitter/DuplexReadSplitter.cpp b/dorado/splitter/DuplexReadSplitter.cpp index 5f5d5f85..4d681e46 100644 --- a/dorado/splitter/DuplexReadSplitter.cpp +++ b/dorado/splitter/DuplexReadSplitter.cpp @@ -1,5 +1,6 @@ #include "DuplexReadSplitter.h" +#include "myers.h" #include "read_pipeline/read_utils.h" #include "splitter/splitter_utils.h" #include "utils/PostCondition.h" @@ -15,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -215,65 +217,56 @@ PosRanges DuplexReadSplitter::find_muA_adapter_splits(const ExtRead& read) const } auto match_start_range = [&](std::string_view query, std::int64_t min_start, - std::int64_t max_start, - std::int64_t max_edist) -> std::optional { + std::int64_t max_start, std::int64_t max_edist) -> PosRanges { const auto max_end_pos = std::min(read_seq.size(), max_start + query.size() + max_edist); if (min_start + static_cast(query.size()) > max_end_pos) { // Too close to the end. - spdlog::trace("Too close to the end"); - return std::nullopt; + return {}; } // Trim the sequence. std::string_view const seq(read_seq.data() + min_start, max_end_pos - min_start); // Search the sequence. - static const EdlibEqualityPair additionalEqualities[4] = { - {'N', 'A'}, {'N', 'T'}, {'N', 'C'}, {'N', 'G'}}; - auto edlib_cfg = edlibNewAlignConfig(max_edist, EDLIB_MODE_HW, EDLIB_TASK_LOC, - additionalEqualities, std::size(additionalEqualities)); - auto edlib_result = - edlibAlign(query.data(), query.size(), seq.data(), seq.size(), edlib_cfg); - assert(edlib_result.status == EDLIB_STATUS_OK); - auto edlib_cleanup = utils::PostCondition([&] { edlibFreeAlignResult(edlib_result); }); - - // Check we got a match. - if (edlib_result.status != EDLIB_STATUS_OK || edlib_result.editDistance == -1) { - spdlog::trace("No match {} {}", edlib_result.status, edlib_result.editDistance); - return std::nullopt; - } - - // Build the result. - PosRange range; - range.first = min_start + edlib_result.startLocations[0]; - range.second = min_start + edlib_result.endLocations[0] + 1; - if (static_cast(range.first) > max_start) { - spdlog::trace("Past the start"); - return std::nullopt; + const auto alignments = myers_align(query, seq, max_edist); + + // Build the results. + PosRanges ranges; + for (auto alignment : alignments) { + PosRange range; + range.first = min_start + alignment.begin; + range.second = min_start + alignment.end; + if (static_cast(range.first) > max_start) { + continue; + } + ranges.push_back(range); } - return range; + return ranges; }; // Search for muA. - auto muA_range = match_start_range(muA_seq, ignore_start, read_seq.size(), max_muA_edist); - if (!muA_range.has_value()) { - spdlog::trace("No muA"); + auto muA_ranges = match_start_range(muA_seq, ignore_start, read_seq.size(), max_muA_edist); + if (muA_ranges.empty()) { return {}; } // Search for the adapter. - const auto adapter_start = std::max( - ignore_start, static_cast(muA_range->first) - max_muA_adapter_dist); - auto adapter_match = - match_start_range(adapter_seq, adapter_start, muA_range->first, max_adapter_edist); - if (!adapter_match.has_value()) { - spdlog::trace("No adapter"); - return {}; + PosRanges all_adapter_ranges; + for (auto muA_range : muA_ranges) { + const auto adapter_start = std::max( + ignore_start, static_cast(muA_range.first) - max_muA_adapter_dist); + auto adapter_ranges = + match_start_range(adapter_seq, adapter_start, muA_range.first, max_adapter_edist); + std::move(adapter_ranges.begin(), adapter_ranges.end(), + std::back_inserter(all_adapter_ranges)); } - spdlog::trace("Detected muA+adapter region in read {}", read.read->read_common.read_id); - return {*adapter_match}; + if (!all_adapter_ranges.empty()) { + spdlog::trace("Detected {} muA+adapter region(s) in read {}", all_adapter_ranges.size(), + read.read->read_common.read_id); + } + return all_adapter_ranges; } bool DuplexReadSplitter::check_nearby_adapter(const SimplexRead& read, From 6afac6b4fc9d5767abaea681f14cf6e5e47a0ab7 Mon Sep 17 00:00:00 2001 From: Ben Lawrence Date: Thu, 22 Feb 2024 13:37:36 +0000 Subject: [PATCH 11/21] [DOR-526] Check all of the edlib results Also check that the end matches otherwise we can add duplicate ranges. --- dorado/splitter/myers.cpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/dorado/splitter/myers.cpp b/dorado/splitter/myers.cpp index 0ab9c70f..231d915c 100644 --- a/dorado/splitter/myers.cpp +++ b/dorado/splitter/myers.cpp @@ -90,9 +90,14 @@ std::vector myers_align(std::string_view query, // |max_edist| indices away). This means that, if the edits aren't insertions, we end up finding the // same sequence as |end1|, which has a better edist and hence is the only thing edlib reports. // We should be safe to ignore the worse edist in those cases. - if (edlib_result.editDistance == static_cast(edist)) { - ranges.push_back( - {min_match_start + edlib_result.startLocations[0], end, edist}); + for (int i = 0; i < edlib_result.numLocations; i++) { + // edlib indices are inclusive, ours are exclusive. + const std::size_t edlib_end = + min_match_start + edlib_result.endLocations[i] + 1; + if (edlib_result.editDistance == static_cast(edist) && edlib_end == end) { + ranges.push_back( + {min_match_start + edlib_result.startLocations[i], end, edist}); + } } } } From 96795fe60ad5cfbc5348580133a2fd62b7f95874 Mon Sep 17 00:00:00 2001 From: Ben Lawrence Date: Thu, 22 Feb 2024 14:01:32 +0000 Subject: [PATCH 12/21] [DOR-526] Reduce adapter edist I copied the higher analysis value, not the "best" value. --- dorado/splitter/DuplexReadSplitter.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dorado/splitter/DuplexReadSplitter.cpp b/dorado/splitter/DuplexReadSplitter.cpp index 4d681e46..d4837980 100644 --- a/dorado/splitter/DuplexReadSplitter.cpp +++ b/dorado/splitter/DuplexReadSplitter.cpp @@ -206,7 +206,7 @@ PosRanges DuplexReadSplitter::find_muA_adapter_splits(const ExtRead& read) const constexpr std::int64_t min_read_len = 200; // 14bp adapter prefix that should work better for click chemistry based on previous investigation constexpr std::string_view adapter_seq = "TACTTCGTTCAGTT"; - constexpr std::int64_t max_adapter_edist = 5; + constexpr std::int64_t max_adapter_edist = 3; constexpr std::int64_t max_muA_adapter_dist = 100; // Skip if the read is too small. From c13e195026cdc90e636bb4f6ae0b313fd21a25c1 Mon Sep 17 00:00:00 2001 From: Ben Lawrence Date: Thu, 22 Feb 2024 14:04:09 +0000 Subject: [PATCH 13/21] [DOR-526] Fix searching for the adapter We only want the best result for the adapter, so add a way to do that. --- dorado/splitter/DuplexReadSplitter.cpp | 45 ++++++++++++++++++++------ 1 file changed, 35 insertions(+), 10 deletions(-) diff --git a/dorado/splitter/DuplexReadSplitter.cpp b/dorado/splitter/DuplexReadSplitter.cpp index d4837980..8c235e7e 100644 --- a/dorado/splitter/DuplexReadSplitter.cpp +++ b/dorado/splitter/DuplexReadSplitter.cpp @@ -217,7 +217,8 @@ PosRanges DuplexReadSplitter::find_muA_adapter_splits(const ExtRead& read) const } auto match_start_range = [&](std::string_view query, std::int64_t min_start, - std::int64_t max_start, std::int64_t max_edist) -> PosRanges { + std::int64_t max_start, std::int64_t max_edist, + bool best_only) -> PosRanges { const auto max_end_pos = std::min(read_seq.size(), max_start + query.size() + max_edist); if (min_start + static_cast(query.size()) > max_end_pos) { @@ -233,36 +234,60 @@ PosRanges DuplexReadSplitter::find_muA_adapter_splits(const ExtRead& read) const // Build the results. PosRanges ranges; - for (auto alignment : alignments) { + auto add_range = [&](EdistResult alignment) { PosRange range; range.first = min_start + alignment.begin; range.second = min_start + alignment.end; - if (static_cast(range.first) > max_start) { - continue; - } ranges.push_back(range); + }; + auto valid_begin = [=](std::int64_t begin) { return min_start + begin <= max_start; }; + if (best_only) { + // Look for the best score that is also valid. + auto compare_score = [=](auto& lhs, auto& rhs) { + auto const lhs_score = !valid_begin(lhs.begin) ? max_edist + 1 : lhs.edist; + auto const rhs_score = !valid_begin(rhs.begin) ? max_edist + 1 : rhs.edist; + return lhs_score < rhs_score; + }; + auto it = std::min_element(alignments.begin(), alignments.end(), compare_score); + if (it != alignments.end()) { + add_range(*it); + } + } else { + for (auto alignment : alignments) { + if (valid_begin(alignment.begin)) { + add_range(alignment); + } + } + // Merge the ranges. + std::sort(ranges.begin(), ranges.end()); + ranges = merge_ranges(ranges, 0); } return ranges; }; - // Search for muA. - auto muA_ranges = match_start_range(muA_seq, ignore_start, read_seq.size(), max_muA_edist); + // Search for muAs. + auto muA_ranges = + match_start_range(muA_seq, ignore_start, read_seq.size(), max_muA_edist, false); if (muA_ranges.empty()) { return {}; } - // Search for the adapter. + // Search for any adapters that are close to the muAs. PosRanges all_adapter_ranges; for (auto muA_range : muA_ranges) { const auto adapter_start = std::max( ignore_start, static_cast(muA_range.first) - max_muA_adapter_dist); - auto adapter_ranges = - match_start_range(adapter_seq, adapter_start, muA_range.first, max_adapter_edist); + auto adapter_ranges = match_start_range(adapter_seq, adapter_start, muA_range.first, + max_adapter_edist, true); std::move(adapter_ranges.begin(), adapter_ranges.end(), std::back_inserter(all_adapter_ranges)); } if (!all_adapter_ranges.empty()) { + // Merge the final ranges. + std::sort(all_adapter_ranges.begin(), all_adapter_ranges.end()); + all_adapter_ranges = merge_ranges(all_adapter_ranges, 0); + spdlog::trace("Detected {} muA+adapter region(s) in read {}", all_adapter_ranges.size(), read.read->read_common.read_id); } From 185df116d93ffce0a8862c566c85064a626b5a6e Mon Sep 17 00:00:00 2001 From: Ben Lawrence Date: Thu, 22 Feb 2024 17:26:19 +0000 Subject: [PATCH 14/21] [DOR-526] Add missing spike detection Also simplify the "best" search by using edlib directly. --- dorado/splitter/DuplexReadSplitter.cpp | 111 ++++++++++++++++--------- dorado/splitter/DuplexReadSplitter.h | 2 +- 2 files changed, 72 insertions(+), 41 deletions(-) diff --git a/dorado/splitter/DuplexReadSplitter.cpp b/dorado/splitter/DuplexReadSplitter.cpp index 8c235e7e..82c3bc7e 100644 --- a/dorado/splitter/DuplexReadSplitter.cpp +++ b/dorado/splitter/DuplexReadSplitter.cpp @@ -197,7 +197,7 @@ PosRanges DuplexReadSplitter::possible_pore_regions(const DuplexReadSplitter::Ex return pore_regions; } -PosRanges DuplexReadSplitter::find_muA_adapter_splits(const ExtRead& read) const { +PosRanges DuplexReadSplitter::find_muA_adapter_spikes(const ExtRead& read) const { spdlog::trace("Finding muA regions for read {}", read.read->read_common.read_id); constexpr std::string_view muA_seq = "GTTTTCGCATTTATCGTGAAACGCTTTCGCGTTTTTCGTGCGCCGCTTCA"; @@ -208,6 +208,8 @@ PosRanges DuplexReadSplitter::find_muA_adapter_splits(const ExtRead& read) const constexpr std::string_view adapter_seq = "TACTTCGTTCAGTT"; constexpr std::int64_t max_adapter_edist = 3; constexpr std::int64_t max_muA_adapter_dist = 100; + constexpr std::int64_t max_spike_adapter_dist = 50; + constexpr float max_spike_qscore = 10; // Skip if the read is too small. const auto& read_seq = read.read->read_common.seq; @@ -230,68 +232,97 @@ PosRanges DuplexReadSplitter::find_muA_adapter_splits(const ExtRead& read) const std::string_view const seq(read_seq.data() + min_start, max_end_pos - min_start); // Search the sequence. - const auto alignments = myers_align(query, seq, max_edist); - - // Build the results. - PosRanges ranges; - auto add_range = [&](EdistResult alignment) { - PosRange range; - range.first = min_start + alignment.begin; - range.second = min_start + alignment.end; - ranges.push_back(range); - }; - auto valid_begin = [=](std::int64_t begin) { return min_start + begin <= max_start; }; if (best_only) { - // Look for the best score that is also valid. - auto compare_score = [=](auto& lhs, auto& rhs) { - auto const lhs_score = !valid_begin(lhs.begin) ? max_edist + 1 : lhs.edist; - auto const rhs_score = !valid_begin(rhs.begin) ? max_edist + 1 : rhs.edist; - return lhs_score < rhs_score; - }; - auto it = std::min_element(alignments.begin(), alignments.end(), compare_score); - if (it != alignments.end()) { - add_range(*it); + auto edlib_cfg = edlibNewAlignConfig(static_cast(max_edist), EDLIB_MODE_HW, + EDLIB_TASK_LOC, nullptr, 0); + auto edlib_result = edlibAlign(query.data(), static_cast(query.size()), seq.data(), + static_cast(seq.size()), edlib_cfg); + assert(edlib_result.status == EDLIB_STATUS_OK); + auto edlib_cleanup = utils::PostCondition([&] { edlibFreeAlignResult(edlib_result); }); + + // Add the match if it looks good. + PosRanges ranges; + if (edlib_result.status == EDLIB_STATUS_OK && edlib_result.editDistance != -1) { + PosRange range; + range.first = min_start + edlib_result.startLocations[0]; + range.second = min_start + edlib_result.endLocations[0] + 1; + if (static_cast(range.first) <= max_start) { + ranges.push_back(range); + } } + return ranges; + } else { + const auto alignments = myers_align(query, seq, max_edist); + + // Build the results. + PosRanges ranges; for (auto alignment : alignments) { - if (valid_begin(alignment.begin)) { - add_range(alignment); + if (min_start + static_cast(alignment.begin) <= max_start) { + PosRange range; + range.first = min_start + alignment.begin; + range.second = min_start + alignment.end; + ranges.push_back(range); } } + // Merge the ranges. std::sort(ranges.begin(), ranges.end()); - ranges = merge_ranges(ranges, 0); + return merge_ranges(ranges, 0); } - return ranges; }; // Search for muAs. - auto muA_ranges = + const auto muA_ranges = match_start_range(muA_seq, ignore_start, read_seq.size(), max_muA_edist, false); if (muA_ranges.empty()) { return {}; } // Search for any adapters that are close to the muAs. - PosRanges all_adapter_ranges; + PosRanges spike_ranges; for (auto muA_range : muA_ranges) { const auto adapter_start = std::max( ignore_start, static_cast(muA_range.first) - max_muA_adapter_dist); - auto adapter_ranges = match_start_range(adapter_seq, adapter_start, muA_range.first, - max_adapter_edist, true); - std::move(adapter_ranges.begin(), adapter_ranges.end(), - std::back_inserter(all_adapter_ranges)); - } + const auto adapter_ranges = match_start_range(adapter_seq, adapter_start, muA_range.first, + max_adapter_edist, true); + if (adapter_ranges.empty()) { + continue; + } - if (!all_adapter_ranges.empty()) { - // Merge the final ranges. - std::sort(all_adapter_ranges.begin(), all_adapter_ranges.end()); - all_adapter_ranges = merge_ranges(all_adapter_ranges, 0); + // We only wanted the best match. + assert(adapter_ranges.size() == 1); + const auto adapter_match = adapter_ranges.front(); + if (adapter_match.first < max_spike_adapter_dist) { + continue; + } - spdlog::trace("Detected {} muA+adapter region(s) in read {}", all_adapter_ranges.size(), - read.read->read_common.read_id); + // Look for the spike between (the left of) the adapter and the muA, in signal space. + const auto spike_search_begin = adapter_match.first - max_spike_adapter_dist; + const auto spike_search_end = muA_range.first; + const auto spike_search_begin_s = spike_search_begin * read.read->read_common.model_stride; + const auto spike_search_end_s = spike_search_end * read.read->read_common.model_stride; + const auto search_span = read.data_as_float32.index( + {at::indexing::Slice(spike_search_begin_s, spike_search_end_s)}); + const auto spike_peak_s = spike_search_begin_s + search_span.argmax().item().toLong(); + // Convert back to base space. + const auto spike_begin = spike_peak_s / read.read->read_common.model_stride; + const auto spike_end = spike_begin + 1; + + // Check qscore is under the threshold. + const auto spike_avg_qscore = + qscore_mean(read.read->read_common.qstring, PosRange(spike_begin, spike_end)); + if (spike_avg_qscore > max_spike_qscore) { + continue; + } + + // We have a match. + spike_ranges.emplace_back(spike_begin, spike_end); } - return all_adapter_ranges; + + spdlog::trace("Detected {} muA+adapter spike region(s) in read {}", spike_ranges.size(), + read.read->read_common.read_id); + return spike_ranges; } bool DuplexReadSplitter::check_nearby_adapter(const SimplexRead& read, @@ -479,7 +510,7 @@ DuplexReadSplitter::build_split_finders() const { })); split_finders.emplace_back(std::make_pair( - "muA_adapter", [&](const ExtRead& read) { return find_muA_adapter_splits(read); })); + "muA_adapter", [&](const ExtRead& read) { return find_muA_adapter_spikes(read); })); if (!m_settings.simplex_mode) { split_finders.emplace_back(std::make_pair("PORE_FLANK", [&](const ExtRead& read) { diff --git a/dorado/splitter/DuplexReadSplitter.h b/dorado/splitter/DuplexReadSplitter.h index 28466cf5..2c420fd2 100644 --- a/dorado/splitter/DuplexReadSplitter.h +++ b/dorado/splitter/DuplexReadSplitter.h @@ -24,7 +24,7 @@ class DuplexReadSplitter : public ReadSplitter { ExtRead create_ext_read(SimplexReadPtr r) const; PosRanges possible_pore_regions(const ExtRead& read) const; - PosRanges find_muA_adapter_splits(const ExtRead& read) const; + PosRanges find_muA_adapter_spikes(const ExtRead& read) const; bool check_nearby_adapter(const SimplexRead& read, splitter::PosRange r, int adapter_edist) const; From 42ad6f17e1ac97a44072a03475dfc03a7852f7aa Mon Sep 17 00:00:00 2001 From: Ben Lawrence Date: Fri, 23 Feb 2024 12:11:16 +0000 Subject: [PATCH 15/21] [DOR-526] Correctly map indices to/from basespace --- dorado/splitter/DuplexReadSplitter.cpp | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/dorado/splitter/DuplexReadSplitter.cpp b/dorado/splitter/DuplexReadSplitter.cpp index 82c3bc7e..7d6f39a1 100644 --- a/dorado/splitter/DuplexReadSplitter.cpp +++ b/dorado/splitter/DuplexReadSplitter.cpp @@ -297,16 +297,27 @@ PosRanges DuplexReadSplitter::find_muA_adapter_spikes(const ExtRead& read) const continue; } + // Helpers to map to/from basespace. + auto from_basespace = [&](std::size_t idx) { + const auto it = std::lower_bound(read.move_sums.begin(), read.move_sums.end(), idx); + return std::distance(read.move_sums.begin(), it) * read.read->read_common.model_stride; + }; + auto to_basespace = [&](std::size_t idx) { + idx /= read.read->read_common.model_stride; + // The range we're querying is valid and any found indices should lie in that range. + assert(idx < read.move_sums.size()); + return read.move_sums[idx]; + }; + // Look for the spike between (the left of) the adapter and the muA, in signal space. - const auto spike_search_begin = adapter_match.first - max_spike_adapter_dist; - const auto spike_search_end = muA_range.first; - const auto spike_search_begin_s = spike_search_begin * read.read->read_common.model_stride; - const auto spike_search_end_s = spike_search_end * read.read->read_common.model_stride; + const auto spike_search_begin_s = + from_basespace(adapter_match.first - max_spike_adapter_dist); + const auto spike_search_end_s = from_basespace(muA_range.first); const auto search_span = read.data_as_float32.index( {at::indexing::Slice(spike_search_begin_s, spike_search_end_s)}); const auto spike_peak_s = spike_search_begin_s + search_span.argmax().item().toLong(); // Convert back to base space. - const auto spike_begin = spike_peak_s / read.read->read_common.model_stride; + const auto spike_begin = to_basespace(spike_peak_s); const auto spike_end = spike_begin + 1; // Check qscore is under the threshold. From f45f1a542a1b7ad7be0e06047b07803adb9eb79e Mon Sep 17 00:00:00 2001 From: Ben Lawrence Date: Fri, 23 Feb 2024 12:11:49 +0000 Subject: [PATCH 16/21] [DOR-526] Increase spike qstring range This is at most the start of the muA, so we're safe to read 5 bases. --- dorado/splitter/DuplexReadSplitter.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dorado/splitter/DuplexReadSplitter.cpp b/dorado/splitter/DuplexReadSplitter.cpp index 7d6f39a1..e9b82c69 100644 --- a/dorado/splitter/DuplexReadSplitter.cpp +++ b/dorado/splitter/DuplexReadSplitter.cpp @@ -318,7 +318,7 @@ PosRanges DuplexReadSplitter::find_muA_adapter_spikes(const ExtRead& read) const const auto spike_peak_s = spike_search_begin_s + search_span.argmax().item().toLong(); // Convert back to base space. const auto spike_begin = to_basespace(spike_peak_s); - const auto spike_end = spike_begin + 1; + const auto spike_end = spike_begin + 5; // Check qscore is under the threshold. const auto spike_avg_qscore = From a14030107ccfda2d0ddc079707c7fa86dd05d00d Mon Sep 17 00:00:00 2001 From: Ben Lawrence Date: Tue, 27 Feb 2024 14:00:41 +0000 Subject: [PATCH 17/21] [DOR-526] Remove duplicate results --- dorado/splitter/DuplexReadSplitter.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/dorado/splitter/DuplexReadSplitter.cpp b/dorado/splitter/DuplexReadSplitter.cpp index e9b82c69..f85a48ab 100644 --- a/dorado/splitter/DuplexReadSplitter.cpp +++ b/dorado/splitter/DuplexReadSplitter.cpp @@ -331,6 +331,11 @@ PosRanges DuplexReadSplitter::find_muA_adapter_spikes(const ExtRead& read) const spike_ranges.emplace_back(spike_begin, spike_end); } + // Remove duplicates. + std::sort(spike_ranges.begin(), spike_ranges.end()); + auto end_it = std::unique(spike_ranges.begin(), spike_ranges.end()); + spike_ranges.erase(end_it, spike_ranges.end()); + spdlog::trace("Detected {} muA+adapter spike region(s) in read {}", spike_ranges.size(), read.read->read_common.read_id); return spike_ranges; From f8f8f0690b8132e1694a727cf45800027085e671 Mon Sep 17 00:00:00 2001 From: Ben Lawrence Date: Wed, 28 Feb 2024 10:19:49 +0000 Subject: [PATCH 18/21] [DOR-526] Add info to early return traces --- dorado/splitter/DuplexReadSplitter.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dorado/splitter/DuplexReadSplitter.cpp b/dorado/splitter/DuplexReadSplitter.cpp index f85a48ab..1c132636 100644 --- a/dorado/splitter/DuplexReadSplitter.cpp +++ b/dorado/splitter/DuplexReadSplitter.cpp @@ -214,7 +214,8 @@ PosRanges DuplexReadSplitter::find_muA_adapter_spikes(const ExtRead& read) const // Skip if the read is too small. const auto& read_seq = read.read->read_common.seq; if (read_seq.size() < min_read_len) { - spdlog::trace("Read too small"); + spdlog::trace("Read too small: id={} size={}", read.read->read_common.read_id, + read_seq.size()); return {}; } @@ -276,6 +277,7 @@ PosRanges DuplexReadSplitter::find_muA_adapter_spikes(const ExtRead& read) const const auto muA_ranges = match_start_range(muA_seq, ignore_start, read_seq.size(), max_muA_edist, false); if (muA_ranges.empty()) { + spdlog::trace("No muA found in read: id={}", read.read->read_common.read_id); return {}; } From 88e8f370ced4b6a75c172f2102045a89e62a3926 Mon Sep 17 00:00:00 2001 From: Ben Lawrence Date: Wed, 28 Feb 2024 10:20:30 +0000 Subject: [PATCH 19/21] [DOR-526] Be consistent with use of include guards --- dorado/splitter/myers.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/dorado/splitter/myers.h b/dorado/splitter/myers.h index 01209692..ac9bba85 100644 --- a/dorado/splitter/myers.h +++ b/dorado/splitter/myers.h @@ -1,5 +1,4 @@ -#ifndef MYERS_BIT_H -#define MYERS_BIT_H +#pragma once #include #include @@ -19,5 +18,3 @@ std::vector myers_align(std::string_view query, void print_edists(std::ostream& os, std::string_view seq, const std::vector& edists); } // namespace dorado::splitter - -#endif From dc590314fa8aed52b247e129a2aad64056c37733 Mon Sep 17 00:00:00 2001 From: Ben Lawrence Date: Wed, 6 Mar 2024 11:43:11 +0000 Subject: [PATCH 20/21] [DOR-526] Deduplicate the ranges as a safeguard --- dorado/splitter/myers.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/dorado/splitter/myers.cpp b/dorado/splitter/myers.cpp index 231d915c..e48edf21 100644 --- a/dorado/splitter/myers.cpp +++ b/dorado/splitter/myers.cpp @@ -3,6 +3,7 @@ #include "utils/PostCondition.h" #include "utils/alignment_utils.h" +#include #include #include #include @@ -134,6 +135,15 @@ std::vector myers_align(std::string_view query, add_match(*current_range_end, min_edist); } + // Deduplicate the ranges. + auto less = [](const EdistResult& lhs, const EdistResult& rhs) { return lhs.end < rhs.end; }; + auto equal = [](const EdistResult& lhs, const EdistResult& rhs) { + return lhs.begin == rhs.begin && lhs.end == rhs.end; + }; + std::sort(ranges.begin(), ranges.end(), less); + auto new_end = std::unique(ranges.begin(), ranges.end(), equal); + ranges.erase(new_end, ranges.end()); + return ranges; } From a4cb934bfb37892e7e6423307aba1a7cd1fb4f7c Mon Sep 17 00:00:00 2001 From: Ben Lawrence Date: Thu, 7 Mar 2024 17:13:03 +0000 Subject: [PATCH 21/21] [DOR-526] Bump adapter edist from 3 to 5 --- dorado/splitter/DuplexReadSplitter.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dorado/splitter/DuplexReadSplitter.cpp b/dorado/splitter/DuplexReadSplitter.cpp index 1c132636..24c29714 100644 --- a/dorado/splitter/DuplexReadSplitter.cpp +++ b/dorado/splitter/DuplexReadSplitter.cpp @@ -206,7 +206,7 @@ PosRanges DuplexReadSplitter::find_muA_adapter_spikes(const ExtRead& read) const constexpr std::int64_t min_read_len = 200; // 14bp adapter prefix that should work better for click chemistry based on previous investigation constexpr std::string_view adapter_seq = "TACTTCGTTCAGTT"; - constexpr std::int64_t max_adapter_edist = 3; + constexpr std::int64_t max_adapter_edist = 5; constexpr std::int64_t max_muA_adapter_dist = 100; constexpr std::int64_t max_spike_adapter_dist = 50; constexpr float max_spike_qscore = 10;