Skip to content

Commit

Permalink
Merge branch 'rh/rna_polya_qscore_trim_picked' into 'master'
Browse files Browse the repository at this point in the history
[INSTX-4025] RNA PolyA QScore Trim - Master

See merge request machine-learning/dorado!862
  • Loading branch information
malton-ont committed Feb 29, 2024
2 parents f0b829d + 394ffd3 commit ae47155
Show file tree
Hide file tree
Showing 6 changed files with 103 additions and 0 deletions.
1 change: 1 addition & 0 deletions dorado/read_pipeline/BasecallerNode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,7 @@ void BasecallerNode::working_reads_manager() {
read_common_data.model_name = m_model_name;
read_common_data.mean_qscore_start_pos = m_mean_qscore_start_pos;
read_common_data.pre_trim_seq_length = read_common_data.seq.length();
read_common_data.is_rna = m_rna;

if (m_rna) {
std::reverse(read_common_data.seq.begin(), read_common_data.seq.end());
Expand Down
11 changes: 11 additions & 0 deletions dorado/read_pipeline/ReadPipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include <sstream>
#include <stack>
#include <stdexcept>
#include <string_view>
#include <unordered_map>

using namespace std::chrono_literals;
Expand Down Expand Up @@ -287,6 +288,16 @@ void ReadCommon::generate_modbase_tags(bam1_t *aln, uint8_t threshold) const {
}

float ReadCommon::calculate_mean_qscore() const {
if (is_rna) {
const size_t polya_start = utils::find_rna_polya(seq);
spdlog::trace("calculate_mean_qscore rna - len:{} polya_start_idx: {}, polya_trim_len:{}",
seq.size(), polya_start, seq.size() - polya_start);
if (polya_start == 0) {
return utils::mean_qscore_from_qstring(qstring);
}
return utils::mean_qscore_from_qstring(std::string_view{qstring}.substr(0, polya_start));
}

// If Q-score start position is greater than the
// read length, then calculate mean Q-score from the
// start of the read.
Expand Down
1 change: 1 addition & 0 deletions dorado/read_pipeline/messages.h
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ class ReadCommon {

size_t get_raw_data_samples() const { return is_duplex ? raw_data.size(1) : raw_data.size(0); }

bool is_rna{false};
// Track length of estimated polyA tail in bases.
int rna_poly_tail_length{-1};
// Track position of end of RNA adapter in signal space. If the RNA adapter is
Expand Down
27 changes: 27 additions & 0 deletions dorado/utils/sequence_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,33 @@ __attribute__((target("avx2"))) std::string reverse_complement_impl(const std::s

namespace dorado::utils {

size_t find_rna_polya(const std::string& seq) {
// Number of bases to search at the end of the RNA sequence
const size_t kSearchSize = 200;
// Minimum contiguous length of a polyA
const size_t kMinPolyASize = 5;

const size_t size = seq.size();
const size_t end = (kSearchSize < size) ? size - kSearchSize : size_t(0);
size_t polya_size = 0;
size_t polya_end_idx = size;

// In RNA - sequence is reversed
for (size_t i = size; i > end; --i) {
if (seq[i - 1] == 'A') {
if (++polya_size >= kMinPolyASize) {
polya_end_idx = i - 1;
}
} else if (polya_end_idx != size) {
break;
} else {
polya_size = 0;
}
}

return polya_end_idx;
}

float mean_qscore_from_qstring(std::string_view qstring) {
if (qstring.empty()) {
return 0.0f;
Expand Down
4 changes: 4 additions & 0 deletions dorado/utils/sequence_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@

namespace dorado::utils {

// Returns the polyA start index in seq. The is the polyA end index in the forward direction.
// Used to trim the polyA from the qstring when calculating the mean.
size_t find_rna_polya(const std::string& seq);

// Calculate a mean qscore from a per-base Q string.
float mean_qscore_from_qstring(std::string_view qstring);

Expand Down
59 changes: 59 additions & 0 deletions tests/SequenceUtilsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -117,3 +117,62 @@ TEST_CASE(TEST_GROUP ": count trailing chars", TEST_GROUP) {
auto actual = dorado::utils::count_trailing_chars(seq, 'A');
CHECK(actual == expected);
}

TEST_CASE(TEST_GROUP "find rna polya", TEST_GROUP) {
// Polya |here|
// Index 0 5 10 15 20 25 30
const std::string seq = "TTTTTCCCCCTTTTTCCCCCTTTTTCCCCCAAAAATCAATCA";
const size_t expected_index = 30;
const size_t res = dorado::utils::find_rna_polya(seq);
CHECK(expected_index == res);
}

TEST_CASE(TEST_GROUP "find first rna polya", TEST_GROUP) {
// Expect to only trim the "first" polya match
// Polya |2nd | |1st |
// Index 0 5 10 15 20 25 30 35 40
const std::string seq = "TTTTTCCCCCTTTTTCCCCCTTTTTCCCCCAAAAATTTTTAAAAAC";
const size_t expected_index = 40;
const size_t res = dorado::utils::find_rna_polya(seq);
CHECK(expected_index == res);
}

TEST_CASE(TEST_GROUP "find no rna polya", TEST_GROUP) {
// With no polyA expect to trim no bases
const std::string seq = "TTTTTCCCCCTTTTTCCCCCTTTTTCCCCC";
const size_t expected_index = seq.length();
const size_t res = dorado::utils::find_rna_polya(seq);
CHECK(expected_index == res);
}

TEST_CASE(TEST_GROUP "find rna polya - at start", TEST_GROUP) {
// Polya |here|
const std::string seq = "AAAAACCCCCTTTTTCCCCCTTTTTCCCCC";
const size_t expected_index = 0;
const size_t res = dorado::utils::find_rna_polya(seq);
CHECK(expected_index == res);
}

TEST_CASE(TEST_GROUP "find rna polya - straddle search area", TEST_GROUP) {
// PolyA continues over the maximum of 200 bases
const std::string seq(210, 'A');
const size_t expected_index = 10;
const size_t res = dorado::utils::find_rna_polya(seq);
CHECK(expected_index == res);
}

TEST_CASE(TEST_GROUP "find rna polya - outside search", TEST_GROUP) {
// PolyA beyond the search range
const std::string seq = std::string(5, 'A') + std::string(200, 'T');
const size_t expected_index = seq.length();
const size_t res = dorado::utils::find_rna_polya(seq);
CHECK(expected_index == res);
}

TEST_CASE(TEST_GROUP "find rna polya - within search", TEST_GROUP) {
using S = std::string;
const S seq = S(100, 'A') + S(155, 'C') + S(10, 'A') + S(100, 'C');
const size_t expected_index = 255;
const size_t res = dorado::utils::find_rna_polya(seq);
CHECK(expected_index == res);
}

0 comments on commit ae47155

Please sign in to comment.