Skip to content

Commit

Permalink
Merge branch 'jdaw/improve-plasmid-polya-estimation' into 'master'
Browse files Browse the repository at this point in the history
Improve plasmid polyA estimation

Closes DOR-625

See merge request machine-learning/dorado!942
  • Loading branch information
tijyojwad committed Apr 18, 2024
2 parents 7e311c9 + 72a8734 commit 67dc5ba
Show file tree
Hide file tree
Showing 5 changed files with 86 additions and 47 deletions.
5 changes: 3 additions & 2 deletions documentation/PolyTailConfig.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ plasmid_front_flank = "CGATCG"
plasmid_rear_flank = "TGACTGC"
[threshold]
flank_threshold = 10
flank_threshold = 0.6
[tail]
tail_interrupt_length = 10
Expand All @@ -61,5 +61,6 @@ tail_interrupt_length = 10
| rear_primer | Rear primer sequence for cDNA |
| plasmid_front_flank | Front flanking sequence of poly(A) in plasmid |
| plasmid_rear_flank | Rear flanking sequence of poly(A) in plasmid |
| flank_threshold | The edit distance threshold to use for detection of the flank/primer sequences |
| flank_threshold | Threshold to use for detection of the flank/primer sequences. Equates to `(1 - edit distance / flank_sequence)` |
| primer_window | Window of bases at the front and rear of the rear within which to look for primer sequences |
| tail_interrupt_length | Combine tails that are within this distance of each other (default is 0, i.e. don't combine any) |
17 changes: 9 additions & 8 deletions dorado/poly_tail/dna_poly_tail_calculator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,15 @@ SignalAnchorInfo DNAPolyTailCalculator::determine_signal_anchor_and_strand(
const std::string& front_primer_rc = m_config.rc_front_primer;
const std::string& rear_primer = m_config.rear_primer;
const std::string& rear_primer_rc = m_config.rc_rear_primer;
const int threshold = m_config.flank_threshold;
const float threshold = m_config.flank_threshold;
const int primer_window = m_config.primer_window;
int trailing_Ts = static_cast<int>(dorado::utils::count_trailing_chars(rear_primer, 'T'));

const int kMinSeparation = 10;
const int kWindowSize = 150;
std::string_view seq_view = std::string_view(read.read_common.seq);
std::string_view read_top = seq_view.substr(0, kWindowSize);
auto bottom_start = std::max(0, (int)seq_view.length() - kWindowSize);
std::string_view read_bottom = seq_view.substr(bottom_start, kWindowSize);
std::string_view read_top = seq_view.substr(0, primer_window);
auto bottom_start = std::max(0, (int)seq_view.length() - primer_window);
std::string_view read_bottom = seq_view.substr(bottom_start, primer_window);

EdlibAlignConfig align_config = edlibDefaultAlignConfig();
align_config.task = EDLIB_TASK_LOC;
Expand All @@ -53,9 +53,10 @@ SignalAnchorInfo DNAPolyTailCalculator::determine_signal_anchor_and_strand(
int dist_v2 = top_v2.editDistance + bottom_v2.editDistance;
spdlog::trace("v1 dist {}, v2 dist {}", dist_v1, dist_v2);

bool fwd = dist_v1 < dist_v2;
bool proceed =
std::min(dist_v1, dist_v2) < threshold && std::abs(dist_v1 - dist_v2) > kMinSeparation;
const bool fwd = dist_v1 < dist_v2;
const float flank_score = 1.f - static_cast<float>(std::min(dist_v1, dist_v2)) /
(front_primer.length() + rear_primer.length());
const bool proceed = flank_score >= threshold && std::abs(dist_v1 - dist_v2) > kMinSeparation;

SignalAnchorInfo result = {false, -1, trailing_Ts, false};

Expand Down
96 changes: 62 additions & 34 deletions dorado/poly_tail/plasmid_poly_tail_calculator.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "plasmid_poly_tail_calculator.h"

#include "read_pipeline/messages.h"
#include "utils/PostCondition.h"
#include "utils/sequence_utils.h"

#include <edlib.h>
Expand All @@ -14,82 +15,109 @@ SignalAnchorInfo PlasmidPolyTailCalculator::determine_signal_anchor_and_strand(
const std::string& rear_flank = m_config.plasmid_rear_flank;
const std::string& front_flank_rc = m_config.rc_plasmid_front_flank;
const std::string& rear_flank_rc = m_config.rc_plasmid_rear_flank;
const int threshold = m_config.flank_threshold;
const float threshold = m_config.flank_threshold;

std::string_view seq_view = std::string_view(read.read_common.seq);
EdlibAlignConfig align_config = edlibDefaultAlignConfig();
align_config.task = EDLIB_TASK_LOC;
align_config.mode = EDLIB_MODE_HW;

// Check for forward strand.
EdlibAlignResult fwd_v1 = edlibAlign(front_flank.data(), int(front_flank.length()),
seq_view.data(), int(seq_view.length()), align_config);
EdlibAlignResult fwd_v2 = edlibAlign(rear_flank.data(), int(rear_flank.length()),
seq_view.data(), int(seq_view.length()), align_config);
EdlibAlignResult fwd_front = edlibAlign(front_flank.data(), int(front_flank.length()),
seq_view.data(), int(seq_view.length()), align_config);
EdlibAlignResult fwd_rear = edlibAlign(rear_flank.data(), int(rear_flank.length()),
seq_view.data(), int(seq_view.length()), align_config);

// Check for reverse strand.
EdlibAlignResult rev_v1 = edlibAlign(rear_flank_rc.data(), int(rear_flank_rc.length()),
seq_view.data(), int(seq_view.length()), align_config);
EdlibAlignResult rev_v2 = edlibAlign(front_flank_rc.data(), int(front_flank_rc.length()),
seq_view.data(), int(seq_view.length()), align_config);
EdlibAlignResult rev_front = edlibAlign(rear_flank_rc.data(), int(rear_flank_rc.length()),
seq_view.data(), int(seq_view.length()), align_config);
EdlibAlignResult rev_rear = edlibAlign(front_flank_rc.data(), int(front_flank_rc.length()),
seq_view.data(), int(seq_view.length()), align_config);

auto scores = {fwd_v1.editDistance, fwd_v2.editDistance, rev_v1.editDistance,
rev_v2.editDistance};
auto clear_edlib_results =
utils::PostCondition([&fwd_front, &fwd_rear, &rev_front, &rev_rear]() {
edlibFreeAlignResult(fwd_front);
edlibFreeAlignResult(fwd_rear);
edlibFreeAlignResult(rev_front);
edlibFreeAlignResult(rev_rear);
});

float fwd_front_score = 1.f - fwd_front.editDistance / float(front_flank.length());
float fwd_rear_score = 1.f - fwd_rear.editDistance / float(rear_flank.length());
float rev_front_score = 1.f - rev_front.editDistance / float(rear_flank_rc.length());
float rev_rear_score = 1.f - rev_rear.editDistance / float(front_flank_rc.length());

spdlog::trace("Flank scores: fwd_front {} fwd_rear {}, rev_front {}, rev_rear {}",
fwd_front_score, fwd_rear_score, rev_front_score, rev_rear_score);

auto scores = {fwd_front_score, fwd_rear_score, rev_front_score, rev_rear_score};

if (std::none_of(std::begin(scores), std::end(scores),
[threshold](auto val) { return val < threshold; })) {
spdlog::trace("{} flank edit distance too high {}", read.read_common.read_id,
[threshold](auto val) { return val >= threshold; })) {
spdlog::trace("{} flank score too high {}", read.read_common.read_id,
*std::min_element(std::begin(scores), std::end(scores)));
return {false, -1, 0, false};
}

bool fwd = std::distance(std::begin(scores),
std::min_element(std::begin(scores), std::end(scores))) < 2;
std::max_element(std::begin(scores), std::end(scores))) < 2;

EdlibAlignResult& front_result = fwd ? fwd_v1 : rev_v1;
EdlibAlignResult& rear_result = fwd ? fwd_v2 : rev_v2;
float front_result_score = fwd ? fwd_front_score : rev_front_score;
float rear_result_score = fwd ? fwd_rear_score : rev_rear_score;
EdlibAlignResult& front_result = fwd ? fwd_front : rev_front;
EdlibAlignResult& rear_result = fwd ? fwd_rear : rev_rear;

// good flank detection with the front and rear in order is the only configuration
// where we can be sure we haven't cleaved the tail
bool whole_tail = front_result.editDistance < threshold &&
rear_result.editDistance < threshold &&
front_result.endLocations[0] < rear_result.startLocations[0];

int base_anchor = front_result.endLocations[0];
if (front_result.editDistance - rear_result.editDistance > threshold) {
// front sequence cleaved?
base_anchor = rear_result.startLocations[0];
bool split_tail = front_result_score >= threshold && rear_result_score >= threshold &&
rear_result.endLocations[0] < front_result.startLocations[0];

if (split_tail) {
spdlog::trace("{} split tail found - not supported yet", read.read_common.read_id);
return {false, -1, 0, false};
}

int base_anchor = -1;
size_t trailing_tail_bases = 0;
if (fwd) {
if (fwd_v1.editDistance < threshold) {
if (fwd_front_score < fwd_rear_score) {
base_anchor = front_result.endLocations[0];
spdlog::trace("Using fwd front flank as anchor");
} else {
base_anchor = rear_result.startLocations[0];
spdlog::trace("Using fwd rear flank as anchor");
}

if (fwd_front_score >= threshold) {
trailing_tail_bases += dorado::utils::count_trailing_chars(front_flank, 'A');
}
if (fwd_v2.editDistance < threshold) {
if (fwd_rear_score >= threshold) {
trailing_tail_bases += dorado::utils::count_leading_chars(rear_flank, 'A');
}
} else {
if (rev_v1.editDistance < threshold) {
if (rev_front_score < rev_rear_score) {
base_anchor = front_result.endLocations[0];
spdlog::trace("Using rev front flank as anchor");
} else {
base_anchor = rear_result.startLocations[0];
spdlog::trace("Using rev rear flank as anchor");
}

if (rev_front_score >= threshold) {
trailing_tail_bases += dorado::utils::count_trailing_chars(rear_flank_rc, 'T');
}
if (rev_v2.editDistance < threshold) {
if (rev_rear_score >= threshold) {
trailing_tail_bases += dorado::utils::count_leading_chars(front_flank_rc, 'T');
}
}

edlibFreeAlignResult(fwd_v1);
edlibFreeAlignResult(fwd_v2);
edlibFreeAlignResult(rev_v1);
edlibFreeAlignResult(rev_v2);

const auto stride = read.read_common.model_stride;
const auto seq_to_sig_map = dorado::utils::moves_to_map(read.read_common.moves, stride,
read.read_common.get_raw_data_samples(),
read.read_common.seq.size() + 1);
int signal_anchor = int(seq_to_sig_map[base_anchor]);

return {fwd, signal_anchor, static_cast<int>(trailing_tail_bases), !whole_tail};
return {fwd, signal_anchor, static_cast<int>(trailing_tail_bases), split_tail};
}

} // namespace dorado::poly_tail
12 changes: 10 additions & 2 deletions dorado/poly_tail/poly_tail_config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,14 +40,22 @@ PolyTailConfig prepare_config(std::istream& is) {
config.plasmid_front_flank = toml::find<std::string>(anchors, "plasmid_front_flank");
config.plasmid_rear_flank = toml::find<std::string>(anchors, "plasmid_rear_flank");
config.is_plasmid = true;
config.flank_threshold = 10; // reduced default for plasmids
config.flank_threshold = 0.85f; // stricter default for plasmids
}

if (anchors.contains("primer_window")) {
config.primer_window = toml::find<int>(anchors, "primer_window");
if (config.primer_window <= 0) {
throw std::runtime_error("primer_window size needs to be > 0, given " +
std::to_string(config.primer_window));
}
}
}

if (config_toml.contains("threshold")) {
const auto& threshold = toml::find(config_toml, "threshold");
if (threshold.contains("flank_threshold ")) {
config.flank_threshold = toml::find<int>(threshold, "flank_threshold ");
config.flank_threshold = toml::find<float>(threshold, "flank_threshold");
}
}

Expand Down
3 changes: 2 additions & 1 deletion dorado/poly_tail/poly_tail_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ struct PolyTailConfig {
std::string plasmid_rear_flank;
std::string rc_plasmid_front_flank;
std::string rc_plasmid_rear_flank;
int flank_threshold = 30;
int primer_window = 150;
float flank_threshold = 0.6f;
bool is_plasmid = false;
int tail_interrupt_length = 0;
int min_base_count = 10;
Expand Down

0 comments on commit 67dc5ba

Please sign in to comment.