Skip to content

Commit

Permalink
Merge branch 'jdaw/add-midstrand-bc-detection' into 'master'
Browse files Browse the repository at this point in the history
DOR-678 Add midstrand support for single and double ended barcodes

See merge request machine-learning/dorado!993
  • Loading branch information
tijyojwad committed May 15, 2024
2 parents 68d40da + 8a52c97 commit 6373792
Show file tree
Hide file tree
Showing 11 changed files with 201 additions and 3 deletions.
2 changes: 2 additions & 0 deletions documentation/CustomBarcodes.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ flank_left_pad = 5
flank_right_pad = 10
front_barcode_window = 175
rear_barcode_window = 175
midstrand_flank_score = 0.8
```

#### Arrangement Options
Expand Down Expand Up @@ -97,6 +98,7 @@ If a candidate meets (1) or (2) AND (3), and the location of the start/end of th
| front_barcode_window | Number of bases at the front of the read within which to look for barcodes. |
| rear_barcode_window | Number of bases at the rear of the read within which to look for barcodes. |
| min_flank_score | Minimum score for the flank alignment. Score here is 1.f - (edit distance) / flank_length |
| midstrand_flank_score | Minimum score for a flank alignment that is not at read ends to be considered as a mid-strand barcode. Score here is 1.f - (edit distance) / flank_length |

For `flank_left_pad` and `flank_right_pad`, something in the range of 5-10 bases is typically good. Note that errors from this padding region are also part of the barcode alignment penalty. Therefore a bigger padding region may require a higher `max_barcode_cost` for classification.

Expand Down
150 changes: 149 additions & 1 deletion dorado/demux/BarcodeClassifier.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -588,6 +588,62 @@ std::vector<BarcodeScoreResult> BarcodeClassifier::calculate_barcode_score_diffe
return results;
}

float BarcodeClassifier::find_midstrand_barcode_different_double_ends(
std::string_view read_seq,
const BarcodeCandidateKit& candidate) const {
auto length_of_end_windows =
m_scoring_params.front_barcode_window + m_scoring_params.rear_barcode_window;
if ((int)read_seq.length() < length_of_end_windows) {
return 0.f;
}

std::string_view top_context_v1 = candidate.top_context;
std::string_view bottom_context_v1 = candidate.bottom_context_rev;

std::string_view top_context_v2 = candidate.bottom_context;
std::string_view bottom_context_v2 = candidate.top_context_rev;

auto length_without_end_windows = read_seq.length() - length_of_end_windows;

if (length_without_end_windows <
std::min({top_context_v1.length(), bottom_context_v1.length(), top_context_v2.length(),
bottom_context_v2.length()})) {
return 0.f;
}

auto read_mid =
read_seq.substr(m_scoring_params.front_barcode_window, length_without_end_windows);

// Try to find the location of the barcode + flanks in the top and bottom windows.
EdlibAlignConfig placement_config = init_edlib_config_for_flanks();

int barcode_len = int(candidate.barcodes1[0].length());

// Fetch barcode mask locations for variant 1
auto [top_result_v1, top_flank_score_v1, top_bc_loc_v1] = extract_flank_fit(
top_context_v1, read_mid, barcode_len, placement_config, "midstrand flank top v1");

auto [bottom_result_v1, bottom_flank_score_v1, bottom_bc_loc_v1] =
extract_flank_fit(bottom_context_v1, read_mid, barcode_len, placement_config,
"midstrand flank bottom v1");

// Fetch barcode mask locations for variant 2
auto [top_result_v2, top_flank_score_v2, top_bc_loc_v2] = extract_flank_fit(
top_context_v2, read_mid, barcode_len, placement_config, "midstrand flank top v2");

auto [bottom_result_v2, bottom_flank_score_v2, bottom_bc_loc_v2] =
extract_flank_fit(bottom_context_v2, read_mid, barcode_len, placement_config,
"midstrand flank bottom v2");

edlibFreeAlignResult(top_result_v1);
edlibFreeAlignResult(bottom_result_v1);
edlibFreeAlignResult(top_result_v2);
edlibFreeAlignResult(bottom_result_v2);
// Find the best variant of the two.
return std::max(
{top_flank_score_v1, bottom_flank_score_v1, top_flank_score_v2, bottom_flank_score_v2});
}

// Calculate barcode score for the following barcoding scenario:
// 5' >-=====--------------=====-> 3'
// BCXXX RC(BCXXX)
Expand Down Expand Up @@ -686,6 +742,44 @@ std::vector<BarcodeScoreResult> BarcodeClassifier::calculate_barcode_score_doubl
return results;
}

float BarcodeClassifier::find_midstrand_barcode_double_ends(
std::string_view read_seq,
const BarcodeCandidateKit& candidate) const {
auto length_of_end_windows =
m_scoring_params.front_barcode_window + m_scoring_params.rear_barcode_window;
if ((int)read_seq.length() < length_of_end_windows) {
return 0.f;
}

std::string_view top_context = candidate.top_context;
std::string_view bottom_context = candidate.top_context_rev;

auto length_without_end_windows = read_seq.length() - length_of_end_windows;

if (length_without_end_windows < std::min({top_context.length(), bottom_context.length()})) {
return 0.f;
}

auto read_mid =
read_seq.substr(m_scoring_params.front_barcode_window, length_without_end_windows);

// Try to find the location of the barcode + flanks in the top and bottom windows.
EdlibAlignConfig placement_config = init_edlib_config_for_flanks();

int barcode_len = int(candidate.barcodes1[0].length());

auto [top_result, top_flank_score, top_bc_loc] = extract_flank_fit(
top_context, read_mid, barcode_len, placement_config, "midstrand flank top");

auto [bottom_result, bottom_flank_score, bottom_bc_loc] = extract_flank_fit(
bottom_context, read_mid, barcode_len, placement_config, "midstrand flank bottom");

edlibFreeAlignResult(top_result);
edlibFreeAlignResult(bottom_result);
// Find the best variant of the two.
return std::max({top_flank_score, bottom_flank_score});
}

// Calculate barcode score for the following barcoding scenario:
// 5' >-=====---------------> 3'
// BCXXX
Expand Down Expand Up @@ -753,6 +847,38 @@ std::vector<BarcodeScoreResult> BarcodeClassifier::calculate_barcode_score(
return results;
}

float BarcodeClassifier::find_midstrand_barcode_single_end(
std::string_view read_seq,
const BarcodeCandidateKit& candidate) const {
auto length_of_end_windows = m_scoring_params.front_barcode_window;
if ((int)read_seq.length() < length_of_end_windows) {
return 0.f;
}

std::string_view top_context = candidate.top_context;

auto length_without_end_windows = read_seq.length() - length_of_end_windows;

if (length_without_end_windows < top_context.length()) {
return 0.f;
}

auto read_mid =
read_seq.substr(m_scoring_params.front_barcode_window, length_without_end_windows);

// Try to find the location of the barcode + flanks in the top and bottom windows.
EdlibAlignConfig placement_config = init_edlib_config_for_flanks();

int barcode_len = int(candidate.barcodes1[0].length());

auto [top_result, top_flank_score, top_bc_loc] = extract_flank_fit(
top_context, read_mid, barcode_len, placement_config, "midstrand flank top");

edlibFreeAlignResult(top_result);
// Find the best variant of the two.
return top_flank_score;
}

// Score every barcode against the input read and returns the best match,
// or an unclassified match, based on certain heuristics.
BarcodeScoreResult BarcodeClassifier::find_best_barcode(
Expand All @@ -775,9 +901,31 @@ BarcodeScoreResult BarcodeClassifier::find_best_barcode(
throw std::runtime_error("Unimplemented: multiple barcoding kits");
}

const auto& kit = get_kit_info(candidate->kit);

// Detect presence of mid-strand barcode. If one is confident found, then
// treat that read as unclassified since it's most likely an unsplit read.
float midstrand_score = -1.f;
if (kit.double_ends) {
if (kit.ends_different) {
midstrand_score = find_midstrand_barcode_different_double_ends(fwd, *candidate);
} else {
midstrand_score = find_midstrand_barcode_double_ends(fwd, *candidate);
}
} else {
midstrand_score = find_midstrand_barcode_single_end(fwd, *candidate);
}
const auto midstrand_thres = m_scoring_params.midstrand_flank_score;
if (midstrand_score >= midstrand_thres) {
spdlog::trace("Found midstrand barcode flanks with score {}, threshold {}", midstrand_score,
midstrand_thres);
auto midstrand_res = UNCLASSIFIED;
midstrand_res.found_midstrand = true;
return midstrand_res;
}

// Then find the best barcode hit within that kit.
std::vector<BarcodeScoreResult> results;
const auto& kit = get_kit_info(candidate->kit);
if (kit.double_ends) {
if (kit.ends_different) {
auto out = calculate_barcode_score_different_double_ends(fwd, *candidate,
Expand Down
6 changes: 6 additions & 0 deletions dorado/demux/BarcodeClassifier.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,12 @@ class BarcodeClassifier {
const std::vector<BarcodeCandidateKit> m_barcode_candidates;

std::vector<BarcodeCandidateKit> generate_candidates(const std::vector<std::string>& kit_names);
float find_midstrand_barcode_different_double_ends(std::string_view read_seq,
const BarcodeCandidateKit& candidate) const;
float find_midstrand_barcode_double_ends(std::string_view read_seq,
const BarcodeCandidateKit& candidate) const;
float find_midstrand_barcode_single_end(std::string_view read_seq,
const BarcodeCandidateKit& candidate) const;
std::vector<BarcodeScoreResult> calculate_barcode_score_different_double_ends(
std::string_view read_seq,
const BarcodeCandidateKit& candidate,
Expand Down
1 change: 1 addition & 0 deletions dorado/read_pipeline/BarcodeClassifierNode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ void BarcodeClassifierNode::barcode(SimplexRead& read) {
auto bc_res = barcoder->barcode(read.read_common.seq, barcoding_info->barcode_both_ends,
barcoding_info->allowed_barcodes);
read.read_common.barcode = generate_barcode_string(bc_res);
spdlog::trace("Barcode for {} is {}", read.read_common.read_id, read.read_common.barcode);
read.read_common.barcoding_result = std::make_shared<BarcodeScoreResult>(std::move(bc_res));
if (barcoding_info->trim) {
read.read_common.barcode_trim_interval = Trimmer::determine_trim_interval(
Expand Down
2 changes: 1 addition & 1 deletion dorado/utils/barcode_kits.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ const std::string BC_1st_REAR = "TTAACCTTTCTGTTGGTGCTGATATTGC";
const std::string BC_2nd_FRONT = "GGTGCTG";
const std::string BC_2nd_REAR = "TTAACCTACTTGCCTGTCGCTCTATCTTC";

const std::string NB_1st_FRONT = "AGGTTAA";
const std::string NB_1st_FRONT = "ATTGCTAAGGTTAA";
const std::string NB_1st_REAR = "CAGCACCT";
const std::string NB_2nd_FRONT = "ATTGCTAAGGTTAA";
const std::string NB_2nd_REAR = "CAGCACC";
Expand Down
1 change: 1 addition & 0 deletions dorado/utils/barcode_kits.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ struct BarcodeKitScoringParams {
int front_barcode_window = 175;
int rear_barcode_window = 175;
float min_flank_score = 0.5f;
float midstrand_flank_score = 0.8f;
};

struct KitInfo {
Expand Down
3 changes: 3 additions & 0 deletions dorado/utils/parse_custom_kit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,9 @@ dorado::barcode_kits::BarcodeKitScoringParams parse_scoring_params(
if (config.contains("min_flank_score")) {
params.min_flank_score = toml::find<float>(config, "min_flank_score");
}
if (config.contains("midstrand_flank_score")) {
params.midstrand_flank_score = toml::find<float>(config, "midstrand_flank_score");
}

return params;
}
Expand Down
1 change: 1 addition & 0 deletions dorado/utils/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ struct BarcodeScoreResult {
std::string variant = "n/a";
std::pair<int, int> top_barcode_pos = {-1, -1};
std::pair<int, int> bottom_barcode_pos = {-1, -1};
bool found_midstrand = false;
};

struct SingleEndResult {
Expand Down
34 changes: 33 additions & 1 deletion tests/BarcodeClassifierTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,38 @@ TEST_CASE("BarcodeClassifier: check barcodes on both ends - passing case", TEST_
}
}

TEST_CASE("BarcodeClassifier: check presence of midstrand barcode double ended kit", TEST_GROUP) {
fs::path data_dir = fs::path(get_data_dir("barcode_demux/double_end_variant"));

demux::BarcodeClassifier classifier({"EXP-PBC096"}, std::nullopt, std::nullopt);

// Check case where both ends do match.
auto bc_file = data_dir / "EXP-PBC096_midstrand.fasta";
HtsReader reader(bc_file.string(), std::nullopt);
while (reader.read()) {
std::string seq = utils::extract_sequence(reader.record.get());
auto res = classifier.barcode(seq, false, std::nullopt);
CHECK(res.barcode_name == "unclassified");
CHECK(res.found_midstrand);
}
}

TEST_CASE("BarcodeClassifier: check presence of midstrand barcode single ended kit", TEST_GROUP) {
fs::path data_dir = fs::path(get_data_dir("barcode_demux/single_end"));

demux::BarcodeClassifier classifier({"SQK-RBK114-96"}, std::nullopt, std::nullopt);

// Check case where both ends do match.
auto bc_file = data_dir / "SQK-RBK114-96_midstrand.fasta";
HtsReader reader(bc_file.string(), std::nullopt);
while (reader.read()) {
std::string seq = utils::extract_sequence(reader.record.get());
auto res = classifier.barcode(seq, false, std::nullopt);
CHECK(res.barcode_name == "unclassified");
CHECK(res.found_midstrand);
}
}

TEST_CASE(
"BarcodeClassifierNode: check read messages are correctly updated after classification and "
"trimming",
Expand Down Expand Up @@ -483,4 +515,4 @@ TEST_CASE(
"Either custom kit must include kit arrangement or a kit name needs to be passed in.");
}

} // namespace dorado::barcode_classifier_test
} // namespace dorado::barcode_classifier_test
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>a6703fe8-baa2-4dbe-a2de-4b5b22acd99
TGTACTTCGTTCCAGTTGCATTATGCTGGTGCTGTTCGGATTCTGTGGTAACTTCCTATTAATGCCTTTCTGTTGGTGCTGATGTGCGGCGTCTGCTTGGGTGTTTAACCTCCAAAAATCGCCGGAAGGGGCCCCTGCCCACTGAAACCGTGGTTTGCGATTCCTGGCCCTGTCGCTGAAGAATACAGCATCGCCTTTGGTCACTGGGCGTAACTGGAGGGCAAAGGTACGCCGGAAGGAGAGTATATACGCGCTGGATACCGGCTGCTGCTGGAATTCGGTACGTGCCTGCCTGCGCTGGGAAGATAAGCAGGCCTTTGTCAGCCGTCGAACCGGCATAAGGATTTGGAAATGAAACGGCGGCGTCTTAAACACCGACCTACGATATAGGAAGGCGGATAAGACGCGACCGGCGTCACATCCGGCGCTAGCCGTAAATTCTATACAAAATTACCGCCGCTCCAGATCTCAAAGCAATGAGCTGTGAGAGTTCTGCGCATCAGCATCGTGGAATTCGCTGAATACCGATTCCAGTCATCCGGCTCATCAATCGGAAATGGGTGTCGCCTTCCACTTCTGCGTCATTAATCAGATACAGTTTTTCTGCGCTTTTGGCAAGAACTGTTCATAAACGCGACCGCCGCCGCGATCACCCTTGGTGCGTACTACACGCCGCGATGGCTTCATCCACCAGCTACCGCGTTACGCGATCGTCAAATACCCGGTTGACTCTTCAGGGATAATATTTTTGCGTCTGGCAACGGACGACCGATTGATTCCCAGGTATGGCGGCCCATAATCACGGGTTTATTTAAGGTGTTGCGTTTAAACCAGGCGAGATCGGCAGGCAGGTTCCACGGCGTATGGCGTTTTCCATGCCGATAACGCGATCTACCGCTAACGCCGCAATCAGACTGATCATTGAGATTTCCCGATAAAAAAAATTGTCACAACCACTATGCGTAAAGCGTAAACCGTCGTCGACTGGTGCGAGGATGATGTTGAGGTTAAACACCCAAACGGAGCGCCGCAATATCAGCACCAGCAAGAAGGTTAATAGGGAAACACGATAGAATCCGAACAGCACCAGCAATACGTAATATTGTACTTCGTTCCAGTTGCATTATGCTGGTGCTGTTCGGATTCTGTGGTAACTTCCTATTAATGCCTTTCTGTTGGTGCTGATGTGCGGCGTCTGCTTGGGTGTTTAACCTCCAAAAATCGCCGGAAGGGGCCCCTGCCCACTGAAACCGTGGTTTGCGATTCCTGGCCCTGTCGCTGAAGAATACAGCATCGCCTTTGGTCACTGGGCGTAACTGGAGGGCAAAGGTACGCCGGAAGGAGAGTATATACGCGCTGGATACCGGCTGCTGCTGGAATTCGGTACGTGCCTGCCTGCGCTGGGAAGATAAGCAGGCCTTTGTCAGCCGTCGAACCGGCATAAGGATTTGGAAATGAAACGGCGGCGTCTTAAACACCGACCTACGATATAGGAAGGCGGATAAGACGCGACCGGCGTCACATCCGGCGCTAGCCGTAAATTCTATACAAAATTACCGCCGCTCCAGATCTCAAAGCAATGAGCTGTGAGAGTTCTGCGCATCAGCATCGTGGAATTCGCTGAATACCGATTCCAGTCATCCGGCTCATCAATCGGAAATGGGTGTCGCCTTCCACTTCTGCGTCATTAATCAGATACAGTTTTTCTGCGCTTTTGGCAAGAACTGTTCATAAACGCGACCGCCGCCGCGATCACCCTTGGTGCGTACTACACGCCGCGATGGCTTCATCCACCAGCTACCGCGTTACGCGATCGTCAAATACCCGGTTGACTCTTCAGGGATAATATTTTTGCGTCTGGCAACGGACGACCGATTGATTCCCAGGTATGGCGGCCCATAATCACGGGTTTATTTAAGGTGTTGCGTTTAAACCAGGCGAGATCGGCAGGCAGGTTCCACGGCGTATGGCGTTTTCCATGCCGATAACGCGATCTACCGCTAACGCCGCAATCAGACTGATCATTGAGATTTCCCGATAAAAAAAATTGTCACAACCACTATGCGTAAAGCGTAAACCGTCGTCGACTGGTGCGAGGATGATGTTGAGGTTAAACACCCAAACGGAGCGCCGCAATATCAGCACCAGCAAGAAGGTTAATAGGGAAACACGATAGAATCCGAACAGCACCAGCAATACGTAATAT
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>00681739-1170-486a-899c-f6546ca9222e
GCCGATATTAACCAAAGTTCGGTGTCTTTGTGGTTTTCGCATTTATCGTGAAACGCTTTCGCGTTTTTCGTGCGCCGCTTGGGGCGACCACAGGTTCAATTTCAATATCCGCCAGCTCCAGTTCACGTCCCGTTTCACGAGCGAGAATCAATAGTTTACGCGCCACATCCATACCAGAAAGATCATCTCGCGGGTCCGGTTCGGTATAACCCATTTCCCGCGCCAGCGTGGTCGCCTCGGAGAACTGCCGATATTAACCAAAGTTCGGTGTCTTTGTGGTTTTCGCATTTATCGTGAAACGCTTTCGCGTTTTTCGTGCGCCGCTTGGGGCGACCACAGGTTCAATTTCAATATCCGCCAGCTCCAGTTCACGTCCCGTTTCACGAGCGAGAATCAATAGTTTACGCGCCACATCCATACCAGAAAGATCATCTCGCGGGTCCGGTTCGGTATAACCCATTTCCCGCGCCAGCGTGGTCGCCTCGGAGAACT

0 comments on commit 6373792

Please sign in to comment.