From eca8b54316f37b3f800f6f9fc712e9257b656859 Mon Sep 17 00:00:00 2001 From: Joyjit Daw Date: Mon, 1 Apr 2024 18:05:04 -0400 Subject: [PATCH 1/4] Improve plasmid polyA estimation 1. Remove support for split tails (for now) 2. Make threshold tighter 3. Improve logging 4. Support anchor detection based on front or rear flank --- documentation/PolyTailConfig.md | 5 +- dorado/poly_tail/dna_poly_tail_calculator.cpp | 17 ++-- .../plasmid_poly_tail_calculator.cpp | 92 ++++++++++++------- dorado/poly_tail/poly_tail_config.cpp | 12 ++- dorado/poly_tail/poly_tail_config.h | 3 +- 5 files changed, 84 insertions(+), 45 deletions(-) diff --git a/documentation/PolyTailConfig.md b/documentation/PolyTailConfig.md index ab23c17a..7712b290 100644 --- a/documentation/PolyTailConfig.md +++ b/documentation/PolyTailConfig.md @@ -47,7 +47,7 @@ plasmid_front_flank = "CGATCG" plasmid_rear_flank = "TGACTGC" [threshold] -flank_threshold = 10 +flank_threshold = 0.4 [tail] tail_interrupt_length = 10 @@ -61,5 +61,6 @@ tail_interrupt_length = 10 | rear_primer | Rear primer sequence for cDNA | | plasmid_front_flank | Front flanking sequence of poly(A) in plasmid | | plasmid_rear_flank | Rear flanking sequence of poly(A) in plasmid | -| flank_threshold | The edit distance threshold to use for detection of the flank/primer sequences | +| flank_threshold | Threshold to use for detection of the flank/primer sequences. Equates to `(1 - edit distance / flank_sequence)` | +| primer_window | Window of bases at the front and rear of the rear within which to look for primer sequences | | tail_interrupt_length | Combine tails that are within this distance of each other (default is 0, i.e. don't combine any) | diff --git a/dorado/poly_tail/dna_poly_tail_calculator.cpp b/dorado/poly_tail/dna_poly_tail_calculator.cpp index 992884be..5c025c1c 100644 --- a/dorado/poly_tail/dna_poly_tail_calculator.cpp +++ b/dorado/poly_tail/dna_poly_tail_calculator.cpp @@ -20,15 +20,15 @@ SignalAnchorInfo DNAPolyTailCalculator::determine_signal_anchor_and_strand( const std::string& front_primer_rc = m_config.rc_front_primer; const std::string& rear_primer = m_config.rear_primer; const std::string& rear_primer_rc = m_config.rc_rear_primer; - const int threshold = m_config.flank_threshold; + const float threshold = m_config.flank_threshold; + const int primer_window = m_config.primer_window; int trailing_Ts = static_cast(dorado::utils::count_trailing_chars(rear_primer, 'T')); const int kMinSeparation = 10; - const int kWindowSize = 150; std::string_view seq_view = std::string_view(read.read_common.seq); - std::string_view read_top = seq_view.substr(0, kWindowSize); - auto bottom_start = std::max(0, (int)seq_view.length() - kWindowSize); - std::string_view read_bottom = seq_view.substr(bottom_start, kWindowSize); + std::string_view read_top = seq_view.substr(0, primer_window); + auto bottom_start = std::max(0, (int)seq_view.length() - primer_window); + std::string_view read_bottom = seq_view.substr(bottom_start, primer_window); EdlibAlignConfig align_config = edlibDefaultAlignConfig(); align_config.task = EDLIB_TASK_LOC; @@ -53,9 +53,10 @@ SignalAnchorInfo DNAPolyTailCalculator::determine_signal_anchor_and_strand( int dist_v2 = top_v2.editDistance + bottom_v2.editDistance; spdlog::trace("v1 dist {}, v2 dist {}", dist_v1, dist_v2); - bool fwd = dist_v1 < dist_v2; - bool proceed = - std::min(dist_v1, dist_v2) < threshold && std::abs(dist_v1 - dist_v2) > kMinSeparation; + const bool fwd = dist_v1 < dist_v2; + const float flank_score = static_cast(std::min(dist_v1, dist_v2)) / + (front_primer.length() + rear_primer.length()); + const bool proceed = flank_score < threshold && std::abs(dist_v1 - dist_v2) > kMinSeparation; SignalAnchorInfo result = {false, -1, trailing_Ts, false}; diff --git a/dorado/poly_tail/plasmid_poly_tail_calculator.cpp b/dorado/poly_tail/plasmid_poly_tail_calculator.cpp index f9a5ca42..0afd562b 100644 --- a/dorado/poly_tail/plasmid_poly_tail_calculator.cpp +++ b/dorado/poly_tail/plasmid_poly_tail_calculator.cpp @@ -1,6 +1,7 @@ #include "plasmid_poly_tail_calculator.h" #include "read_pipeline/messages.h" +#include "utils/PostCondition.h" #include "utils/sequence_utils.h" #include @@ -14,7 +15,7 @@ SignalAnchorInfo PlasmidPolyTailCalculator::determine_signal_anchor_and_strand( const std::string& rear_flank = m_config.plasmid_rear_flank; const std::string& front_flank_rc = m_config.rc_plasmid_front_flank; const std::string& rear_flank_rc = m_config.rc_plasmid_rear_flank; - const int threshold = m_config.flank_threshold; + const float threshold = m_config.flank_threshold; std::string_view seq_view = std::string_view(read.read_common.seq); EdlibAlignConfig align_config = edlibDefaultAlignConfig(); @@ -22,23 +23,38 @@ SignalAnchorInfo PlasmidPolyTailCalculator::determine_signal_anchor_and_strand( align_config.mode = EDLIB_MODE_HW; // Check for forward strand. - EdlibAlignResult fwd_v1 = edlibAlign(front_flank.data(), int(front_flank.length()), - seq_view.data(), int(seq_view.length()), align_config); - EdlibAlignResult fwd_v2 = edlibAlign(rear_flank.data(), int(rear_flank.length()), - seq_view.data(), int(seq_view.length()), align_config); + EdlibAlignResult fwd_front = edlibAlign(front_flank.data(), int(front_flank.length()), + seq_view.data(), int(seq_view.length()), align_config); + EdlibAlignResult fwd_rear = edlibAlign(rear_flank.data(), int(rear_flank.length()), + seq_view.data(), int(seq_view.length()), align_config); // Check for reverse strand. - EdlibAlignResult rev_v1 = edlibAlign(rear_flank_rc.data(), int(rear_flank_rc.length()), - seq_view.data(), int(seq_view.length()), align_config); - EdlibAlignResult rev_v2 = edlibAlign(front_flank_rc.data(), int(front_flank_rc.length()), - seq_view.data(), int(seq_view.length()), align_config); + EdlibAlignResult rev_front = edlibAlign(rear_flank_rc.data(), int(rear_flank_rc.length()), + seq_view.data(), int(seq_view.length()), align_config); + EdlibAlignResult rev_rear = edlibAlign(front_flank_rc.data(), int(front_flank_rc.length()), + seq_view.data(), int(seq_view.length()), align_config); - auto scores = {fwd_v1.editDistance, fwd_v2.editDistance, rev_v1.editDistance, - rev_v2.editDistance}; + auto clear_edlib_results = + utils::PostCondition([&fwd_front, &fwd_rear, &rev_front, &rev_rear]() { + edlibFreeAlignResult(fwd_front); + edlibFreeAlignResult(fwd_rear); + edlibFreeAlignResult(rev_front); + edlibFreeAlignResult(rev_rear); + }); + + float fwd_front_score = fwd_front.editDistance / float(front_flank.length()); + float fwd_rear_score = fwd_rear.editDistance / float(rear_flank.length()); + float rev_front_score = rev_front.editDistance / float(rear_flank_rc.length()); + float rev_rear_score = rev_rear.editDistance / float(front_flank_rc.length()); + + spdlog::trace("Flank scores: fwd_front {} fwd_rear {}, rev_front {}, rev_rear {}", + fwd_front_score, fwd_rear_score, rev_front_score, rev_rear_score); + + auto scores = {fwd_front_score, fwd_rear_score, rev_front_score, rev_rear_score}; if (std::none_of(std::begin(scores), std::end(scores), [threshold](auto val) { return val < threshold; })) { - spdlog::trace("{} flank edit distance too high {}", read.read_common.read_id, + spdlog::trace("{} flank score too high {}", read.read_common.read_id, *std::min_element(std::begin(scores), std::end(scores))); return {false, -1, 0, false}; } @@ -46,50 +62,62 @@ SignalAnchorInfo PlasmidPolyTailCalculator::determine_signal_anchor_and_strand( bool fwd = std::distance(std::begin(scores), std::min_element(std::begin(scores), std::end(scores))) < 2; - EdlibAlignResult& front_result = fwd ? fwd_v1 : rev_v1; - EdlibAlignResult& rear_result = fwd ? fwd_v2 : rev_v2; + float front_result_score = fwd ? fwd_front_score : rev_front_score; + float rear_result_score = fwd ? fwd_rear_score : rev_rear_score; + EdlibAlignResult& front_result = fwd ? fwd_front : rev_front; + EdlibAlignResult& rear_result = fwd ? fwd_rear : rev_rear; // good flank detection with the front and rear in order is the only configuration // where we can be sure we haven't cleaved the tail - bool whole_tail = front_result.editDistance < threshold && - rear_result.editDistance < threshold && - front_result.endLocations[0] < rear_result.startLocations[0]; - - int base_anchor = front_result.endLocations[0]; - if (front_result.editDistance - rear_result.editDistance > threshold) { - // front sequence cleaved? - base_anchor = rear_result.startLocations[0]; + bool split_tail = front_result_score < threshold && rear_result_score < threshold && + rear_result.endLocations[0] < front_result.startLocations[0]; + + if (split_tail) { + spdlog::trace("{} split tail found - not supported yet", read.read_common.read_id); + return {false, -1, 0, false}; } + int base_anchor = -1; size_t trailing_tail_bases = 0; if (fwd) { - if (fwd_v1.editDistance < threshold) { + if (fwd_front_score < fwd_rear_score) { + base_anchor = front_result.endLocations[0]; + spdlog::trace("Using fwd front flank as anchor"); + } else { + base_anchor = rear_result.startLocations[0]; + spdlog::trace("Using fwd rear flank as anchor"); + } + + if (fwd_front_score < threshold) { trailing_tail_bases += dorado::utils::count_trailing_chars(front_flank, 'A'); } - if (fwd_v2.editDistance < threshold) { + if (fwd_rear_score < threshold) { trailing_tail_bases += dorado::utils::count_leading_chars(rear_flank, 'A'); } } else { - if (rev_v1.editDistance < threshold) { + if (rev_front_score < rev_rear_score) { + base_anchor = front_result.endLocations[0]; + spdlog::trace("Using rev front flank as anchor"); + } else { + base_anchor = rear_result.startLocations[0]; + spdlog::trace("Using rev rear flank as anchor"); + } + + if (rev_front_score < threshold) { trailing_tail_bases += dorado::utils::count_trailing_chars(rear_flank_rc, 'T'); } - if (rev_v2.editDistance < threshold) { + if (rev_rear_score < threshold) { trailing_tail_bases += dorado::utils::count_leading_chars(front_flank_rc, 'T'); } } - edlibFreeAlignResult(fwd_v1); - edlibFreeAlignResult(fwd_v2); - edlibFreeAlignResult(rev_v1); - edlibFreeAlignResult(rev_v2); - const auto stride = read.read_common.model_stride; const auto seq_to_sig_map = dorado::utils::moves_to_map(read.read_common.moves, stride, read.read_common.get_raw_data_samples(), read.read_common.seq.size() + 1); int signal_anchor = int(seq_to_sig_map[base_anchor]); - return {fwd, signal_anchor, static_cast(trailing_tail_bases), !whole_tail}; + return {fwd, signal_anchor, static_cast(trailing_tail_bases), split_tail}; } } // namespace dorado::poly_tail diff --git a/dorado/poly_tail/poly_tail_config.cpp b/dorado/poly_tail/poly_tail_config.cpp index 63074ce1..d4c23e34 100644 --- a/dorado/poly_tail/poly_tail_config.cpp +++ b/dorado/poly_tail/poly_tail_config.cpp @@ -40,14 +40,22 @@ PolyTailConfig prepare_config(std::istream& is) { config.plasmid_front_flank = toml::find(anchors, "plasmid_front_flank"); config.plasmid_rear_flank = toml::find(anchors, "plasmid_rear_flank"); config.is_plasmid = true; - config.flank_threshold = 10; // reduced default for plasmids + config.flank_threshold = 0.15; // reduced default for plasmids + } + + if (anchors.contains("primer_window")) { + config.primer_window = toml::find(anchors, "primer_window"); + if (config.primer_window <= 0) { + throw std::runtime_error("primer_window size needs to be > 0, given " + + std::to_string(config.primer_window)); + } } } if (config_toml.contains("threshold")) { const auto& threshold = toml::find(config_toml, "threshold"); if (threshold.contains("flank_threshold ")) { - config.flank_threshold = toml::find(threshold, "flank_threshold "); + config.flank_threshold = toml::find(threshold, "flank_threshold"); } } diff --git a/dorado/poly_tail/poly_tail_config.h b/dorado/poly_tail/poly_tail_config.h index 158d347b..01834d24 100644 --- a/dorado/poly_tail/poly_tail_config.h +++ b/dorado/poly_tail/poly_tail_config.h @@ -14,7 +14,8 @@ struct PolyTailConfig { std::string plasmid_rear_flank; std::string rc_plasmid_front_flank; std::string rc_plasmid_rear_flank; - int flank_threshold = 30; + int primer_window = 150; + float flank_threshold = 0.4; bool is_plasmid = false; int tail_interrupt_length = 0; int min_base_count = 10; From 8aa0c70ba7e57356b8e5a082369840f8791541a4 Mon Sep 17 00:00:00 2001 From: Joyjit Daw Date: Mon, 15 Apr 2024 08:58:47 -0400 Subject: [PATCH 2/4] fix code to match docs --- dorado/poly_tail/dna_poly_tail_calculator.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dorado/poly_tail/dna_poly_tail_calculator.cpp b/dorado/poly_tail/dna_poly_tail_calculator.cpp index 5c025c1c..7fbfbe76 100644 --- a/dorado/poly_tail/dna_poly_tail_calculator.cpp +++ b/dorado/poly_tail/dna_poly_tail_calculator.cpp @@ -54,9 +54,9 @@ SignalAnchorInfo DNAPolyTailCalculator::determine_signal_anchor_and_strand( spdlog::trace("v1 dist {}, v2 dist {}", dist_v1, dist_v2); const bool fwd = dist_v1 < dist_v2; - const float flank_score = static_cast(std::min(dist_v1, dist_v2)) / - (front_primer.length() + rear_primer.length()); - const bool proceed = flank_score < threshold && std::abs(dist_v1 - dist_v2) > kMinSeparation; + const float flank_score = 1.f - static_cast(std::min(dist_v1, dist_v2)) / + (front_primer.length() + rear_primer.length()); + const bool proceed = flank_score >= threshold && std::abs(dist_v1 - dist_v2) > kMinSeparation; SignalAnchorInfo result = {false, -1, trailing_Ts, false}; From 32f1f29742a3fba4daf2e57b34ca69b62a6521a2 Mon Sep 17 00:00:00 2001 From: Joyjit Daw Date: Mon, 15 Apr 2024 20:02:54 -0400 Subject: [PATCH 3/4] fix windows type narrowing error --- documentation/PolyTailConfig.md | 2 +- dorado/poly_tail/poly_tail_config.cpp | 2 +- dorado/poly_tail/poly_tail_config.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/documentation/PolyTailConfig.md b/documentation/PolyTailConfig.md index 7712b290..03f229aa 100644 --- a/documentation/PolyTailConfig.md +++ b/documentation/PolyTailConfig.md @@ -47,7 +47,7 @@ plasmid_front_flank = "CGATCG" plasmid_rear_flank = "TGACTGC" [threshold] -flank_threshold = 0.4 +flank_threshold = 0.6 [tail] tail_interrupt_length = 10 diff --git a/dorado/poly_tail/poly_tail_config.cpp b/dorado/poly_tail/poly_tail_config.cpp index d4c23e34..c3aaab06 100644 --- a/dorado/poly_tail/poly_tail_config.cpp +++ b/dorado/poly_tail/poly_tail_config.cpp @@ -40,7 +40,7 @@ PolyTailConfig prepare_config(std::istream& is) { config.plasmid_front_flank = toml::find(anchors, "plasmid_front_flank"); config.plasmid_rear_flank = toml::find(anchors, "plasmid_rear_flank"); config.is_plasmid = true; - config.flank_threshold = 0.15; // reduced default for plasmids + config.flank_threshold = 0.15f; // reduced default for plasmids } if (anchors.contains("primer_window")) { diff --git a/dorado/poly_tail/poly_tail_config.h b/dorado/poly_tail/poly_tail_config.h index 01834d24..f6e1db32 100644 --- a/dorado/poly_tail/poly_tail_config.h +++ b/dorado/poly_tail/poly_tail_config.h @@ -15,7 +15,7 @@ struct PolyTailConfig { std::string rc_plasmid_front_flank; std::string rc_plasmid_rear_flank; int primer_window = 150; - float flank_threshold = 0.4; + float flank_threshold = 0.6f; bool is_plasmid = false; int tail_interrupt_length = 0; int min_base_count = 10; From 72a87344cdf724687f3003fa051d78214688d4c0 Mon Sep 17 00:00:00 2001 From: Joyjit Daw Date: Tue, 16 Apr 2024 09:33:52 -0400 Subject: [PATCH 4/4] Update thresholds for plasmid --- .../plasmid_poly_tail_calculator.cpp | 22 +++++++++---------- dorado/poly_tail/poly_tail_config.cpp | 2 +- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/dorado/poly_tail/plasmid_poly_tail_calculator.cpp b/dorado/poly_tail/plasmid_poly_tail_calculator.cpp index 0afd562b..7dc76872 100644 --- a/dorado/poly_tail/plasmid_poly_tail_calculator.cpp +++ b/dorado/poly_tail/plasmid_poly_tail_calculator.cpp @@ -42,10 +42,10 @@ SignalAnchorInfo PlasmidPolyTailCalculator::determine_signal_anchor_and_strand( edlibFreeAlignResult(rev_rear); }); - float fwd_front_score = fwd_front.editDistance / float(front_flank.length()); - float fwd_rear_score = fwd_rear.editDistance / float(rear_flank.length()); - float rev_front_score = rev_front.editDistance / float(rear_flank_rc.length()); - float rev_rear_score = rev_rear.editDistance / float(front_flank_rc.length()); + float fwd_front_score = 1.f - fwd_front.editDistance / float(front_flank.length()); + float fwd_rear_score = 1.f - fwd_rear.editDistance / float(rear_flank.length()); + float rev_front_score = 1.f - rev_front.editDistance / float(rear_flank_rc.length()); + float rev_rear_score = 1.f - rev_rear.editDistance / float(front_flank_rc.length()); spdlog::trace("Flank scores: fwd_front {} fwd_rear {}, rev_front {}, rev_rear {}", fwd_front_score, fwd_rear_score, rev_front_score, rev_rear_score); @@ -53,14 +53,14 @@ SignalAnchorInfo PlasmidPolyTailCalculator::determine_signal_anchor_and_strand( auto scores = {fwd_front_score, fwd_rear_score, rev_front_score, rev_rear_score}; if (std::none_of(std::begin(scores), std::end(scores), - [threshold](auto val) { return val < threshold; })) { + [threshold](auto val) { return val >= threshold; })) { spdlog::trace("{} flank score too high {}", read.read_common.read_id, *std::min_element(std::begin(scores), std::end(scores))); return {false, -1, 0, false}; } bool fwd = std::distance(std::begin(scores), - std::min_element(std::begin(scores), std::end(scores))) < 2; + std::max_element(std::begin(scores), std::end(scores))) < 2; float front_result_score = fwd ? fwd_front_score : rev_front_score; float rear_result_score = fwd ? fwd_rear_score : rev_rear_score; @@ -69,7 +69,7 @@ SignalAnchorInfo PlasmidPolyTailCalculator::determine_signal_anchor_and_strand( // good flank detection with the front and rear in order is the only configuration // where we can be sure we haven't cleaved the tail - bool split_tail = front_result_score < threshold && rear_result_score < threshold && + bool split_tail = front_result_score >= threshold && rear_result_score >= threshold && rear_result.endLocations[0] < front_result.startLocations[0]; if (split_tail) { @@ -88,10 +88,10 @@ SignalAnchorInfo PlasmidPolyTailCalculator::determine_signal_anchor_and_strand( spdlog::trace("Using fwd rear flank as anchor"); } - if (fwd_front_score < threshold) { + if (fwd_front_score >= threshold) { trailing_tail_bases += dorado::utils::count_trailing_chars(front_flank, 'A'); } - if (fwd_rear_score < threshold) { + if (fwd_rear_score >= threshold) { trailing_tail_bases += dorado::utils::count_leading_chars(rear_flank, 'A'); } } else { @@ -103,10 +103,10 @@ SignalAnchorInfo PlasmidPolyTailCalculator::determine_signal_anchor_and_strand( spdlog::trace("Using rev rear flank as anchor"); } - if (rev_front_score < threshold) { + if (rev_front_score >= threshold) { trailing_tail_bases += dorado::utils::count_trailing_chars(rear_flank_rc, 'T'); } - if (rev_rear_score < threshold) { + if (rev_rear_score >= threshold) { trailing_tail_bases += dorado::utils::count_leading_chars(front_flank_rc, 'T'); } } diff --git a/dorado/poly_tail/poly_tail_config.cpp b/dorado/poly_tail/poly_tail_config.cpp index c3aaab06..61b3c4a8 100644 --- a/dorado/poly_tail/poly_tail_config.cpp +++ b/dorado/poly_tail/poly_tail_config.cpp @@ -40,7 +40,7 @@ PolyTailConfig prepare_config(std::istream& is) { config.plasmid_front_flank = toml::find(anchors, "plasmid_front_flank"); config.plasmid_rear_flank = toml::find(anchors, "plasmid_rear_flank"); config.is_plasmid = true; - config.flank_threshold = 0.15f; // reduced default for plasmids + config.flank_threshold = 0.85f; // stricter default for plasmids } if (anchors.contains("primer_window")) {