From d8a762d26ac3281d7fd748d498af6d78a9bd9339 Mon Sep 17 00:00:00 2001 From: Joyjit Daw Date: Fri, 15 Dec 2023 11:20:57 -0500 Subject: [PATCH 01/23] increase rbk4 leading flank --- dorado/utils/barcode_kits.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dorado/utils/barcode_kits.cpp b/dorado/utils/barcode_kits.cpp index d7155667..84c61fa8 100644 --- a/dorado/utils/barcode_kits.cpp +++ b/dorado/utils/barcode_kits.cpp @@ -22,7 +22,7 @@ const std::string RBK_REAR = "GTTTTCGCATTTATCGTGAAACGCTTTCGCGTTTTTCGTGCGCCGCTTCA const std::string RBK4_FRONT = "GCTTGGGTGTTTAACC"; const std::string RBK4_REAR = "GTTTTCGCATTTATCGTGAAACGCTTTCGCGTTTTTCGTGCGCCGCTTCA"; -const std::string RBK4_kit14_FRONT = "C"; +const std::string RBK4_kit14_FRONT = "TAACC"; const std::string RBK4_kit14_REAR = "GTTTTCGCATTTATCGTGAAACGCTTTCGCGTTTTTCGTGCGCCGCTTCA"; const std::string RLB_FRONT = "CCGTGAC"; From dae2d19795a2aa26f5fff9c844a4617823c6d7cc Mon Sep 17 00:00:00 2001 From: Joyjit Daw Date: Fri, 15 Dec 2023 12:22:18 -0500 Subject: [PATCH 02/23] rbk4 full leading flank for kit 14 --- dorado/utils/barcode_kits.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dorado/utils/barcode_kits.cpp b/dorado/utils/barcode_kits.cpp index 84c61fa8..cd4b67cc 100644 --- a/dorado/utils/barcode_kits.cpp +++ b/dorado/utils/barcode_kits.cpp @@ -22,7 +22,7 @@ const std::string RBK_REAR = "GTTTTCGCATTTATCGTGAAACGCTTTCGCGTTTTTCGTGCGCCGCTTCA const std::string RBK4_FRONT = "GCTTGGGTGTTTAACC"; const std::string RBK4_REAR = "GTTTTCGCATTTATCGTGAAACGCTTTCGCGTTTTTCGTGCGCCGCTTCA"; -const std::string RBK4_kit14_FRONT = "TAACC"; +const std::string RBK4_kit14_FRONT = "GCTTGGGTGTTTAACC"; const std::string RBK4_kit14_REAR = "GTTTTCGCATTTATCGTGAAACGCTTTCGCGTTTTTCGTGCGCCGCTTCA"; const std::string RLB_FRONT = "CCGTGAC"; From 306fe9518779a88ae9398561b3736671a21119f8 Mon Sep 17 00:00:00 2001 From: Joyjit Daw <1127155+tijyojwad@users.noreply.github.com> Date: Mon, 18 Dec 2023 20:26:55 +0000 Subject: [PATCH 03/23] update heuristic to determine RBK alignment location - use front flank and rear flank separately to determine the position of the barcode mask --- dorado/demux/BarcodeClassifier.cpp | 56 +++++++++++++++++++++++++++--- 1 file changed, 52 insertions(+), 4 deletions(-) diff --git a/dorado/demux/BarcodeClassifier.cpp b/dorado/demux/BarcodeClassifier.cpp index de90ec1a..1fe86ee2 100644 --- a/dorado/demux/BarcodeClassifier.cpp +++ b/dorado/demux/BarcodeClassifier.cpp @@ -39,7 +39,7 @@ EdlibAlignConfig init_edlib_config_for_flanks() { // the detected region. EdlibAlignConfig init_edlib_config_for_mask() { EdlibAlignConfig mask_config = edlibDefaultAlignConfig(); - mask_config.mode = EDLIB_MODE_NW; + mask_config.mode = EDLIB_MODE_HW; mask_config.task = (spdlog::get_level() == spdlog::level::trace) ? EDLIB_TASK_PATH : EDLIB_TASK_LOC; return mask_config; @@ -86,6 +86,54 @@ std::tuple extract_flank_fit(std::string_view stra return {result, score, bc_loc}; } +// Helper function to locally align the flanks with barcode mask +// against a subsequence of the read (either front or rear window) +// and return the alignment, score & barcode position. +std::tuple extract_flank_fit_rbk( + std::string_view strand, + std::string_view read, + int barcode_len, + const EdlibAlignConfig& placement_config, + const char* debug_prefix) { + (void)strand; + const std::string front_flank = + "GCTTGGGTGTTTAACCNNNNNNNNNNNNNNNNNNNNNNNNGTTTTCGCATTTATCGTGAAACGCTTTCGCGTTTTTCGTGCGCCGC" + "TTCA"; + const std::string rear_flank = + "CNNNNNNNNNNNNNNNNNNNNNNNNGTTTTCGCATTTATCGTGAAACGCTTTCGCGTTTTTCGTGCGCCGCTTCA"; + + EdlibAlignResult result_front = edlibAlign(front_flank.data(), int(front_flank.length()), + read.data(), int(read.length()), placement_config); + float front_score = 1.f - static_cast(result_front.editDistance) / + (front_flank.length() - barcode_len); + int bc_loc_front = extract_mask_location(result_front, front_flank); + spdlog::trace("{} {} front_score {} pos {}", debug_prefix, result_front.editDistance, + front_score, bc_loc_front); + spdlog::trace("\n{}", utils::alignment_to_str(front_flank.data(), read.data(), result_front)); + + EdlibAlignResult result_rear = edlibAlign(rear_flank.data(), int(rear_flank.length()), + read.data(), int(read.length()), placement_config); + float rear_score = 1.f - static_cast(result_rear.editDistance) / + (rear_flank.length() - barcode_len); + int bc_loc_rear = extract_mask_location(result_rear, rear_flank); + spdlog::trace("{} {} rear_score {} pos {}", debug_prefix, result_rear.editDistance, rear_score, + bc_loc_rear); + spdlog::trace("\n{}", utils::alignment_to_str(rear_flank.data(), read.data(), result_rear)); + + (void)bc_loc_front; + auto mask_len = bc_loc_rear + barcode_len - (front_score > 0.6f ? bc_loc_front : 0); + if (mask_len == 0) { + spdlog::error("{} {} front_score {} pos {}", debug_prefix, result_front.editDistance, + front_score, bc_loc_front); + spdlog::error("\n{}", + utils::alignment_to_str(front_flank.data(), read.data(), result_front)); + spdlog::error("{} {} rear_score {} pos {}", debug_prefix, result_rear.editDistance, + rear_score, bc_loc_rear); + spdlog::error("\n{}", utils::alignment_to_str(rear_flank.data(), read.data(), result_rear)); + } + return {result_rear, (rear_score + front_score) / 2, bc_loc_front, mask_len}; +} + // Helper function to globally align a barcode to a region // within the read. float extract_mask_score(std::string_view barcode, @@ -528,9 +576,9 @@ std::vector BarcodeClassifier::calculate_barcode_score( std::string_view top_context = candidate.top_context; int barcode_len = int(candidate.barcodes1[0].length()); - auto [top_result, top_flank_score, top_bc_loc] = - extract_flank_fit(top_context, read_top, barcode_len, placement_config, "top score"); - std::string_view top_mask = read_top.substr(top_bc_loc, barcode_len); + auto [top_result, top_flank_score, top_bc_loc, mask_len] = extract_flank_fit_rbk( + top_context, read_top, barcode_len, placement_config, "top score"); + std::string_view top_mask = read_top.substr(top_bc_loc, mask_len); spdlog::trace("BC location {}", top_bc_loc); std::vector results; From 9374ec097bfbbe7be9b070d46943f99b7c45c77e Mon Sep 17 00:00:00 2001 From: Joyjit Daw <1127155+tijyojwad@users.noreply.github.com> Date: Fri, 5 Jan 2024 22:26:02 +0000 Subject: [PATCH 04/23] new heuristic for classification - use only rear flank to find mask position - use parts of the front and rear flank during barcode matching to account for possible deletions --- dorado/demux/BarcodeClassifier.cpp | 279 ++++++++++++++++------------- dorado/demux/BarcodeClassifier.h | 1 + 2 files changed, 154 insertions(+), 126 deletions(-) diff --git a/dorado/demux/BarcodeClassifier.cpp b/dorado/demux/BarcodeClassifier.cpp index 1fe86ee2..fc6fb5f2 100644 --- a/dorado/demux/BarcodeClassifier.cpp +++ b/dorado/demux/BarcodeClassifier.cpp @@ -39,7 +39,7 @@ EdlibAlignConfig init_edlib_config_for_flanks() { // the detected region. EdlibAlignConfig init_edlib_config_for_mask() { EdlibAlignConfig mask_config = edlibDefaultAlignConfig(); - mask_config.mode = EDLIB_MODE_HW; + mask_config.mode = EDLIB_MODE_NW; mask_config.task = (spdlog::get_level() == spdlog::level::trace) ? EDLIB_TASK_PATH : EDLIB_TASK_LOC; return mask_config; @@ -80,60 +80,14 @@ std::tuple extract_flank_fit(std::string_view stra EdlibAlignResult result = edlibAlign(strand.data(), int(strand.length()), read.data(), int(read.length()), placement_config); float score = 1.f - static_cast(result.editDistance) / (strand.length() - barcode_len); - spdlog::trace("{} {} score {}", debug_prefix, result.editDistance, score); + (void)extract_mask_location(result, strand); + int bc_loc = result.startLocations[0]; //std::max(0, result.startLocations[0] - barcode_len); + spdlog::trace("{} {} position {} bc_loc {} score {}", debug_prefix, result.editDistance, + result.startLocations[0], bc_loc, score); spdlog::trace("\n{}", utils::alignment_to_str(strand.data(), read.data(), result)); - int bc_loc = extract_mask_location(result, strand); return {result, score, bc_loc}; } -// Helper function to locally align the flanks with barcode mask -// against a subsequence of the read (either front or rear window) -// and return the alignment, score & barcode position. -std::tuple extract_flank_fit_rbk( - std::string_view strand, - std::string_view read, - int barcode_len, - const EdlibAlignConfig& placement_config, - const char* debug_prefix) { - (void)strand; - const std::string front_flank = - "GCTTGGGTGTTTAACCNNNNNNNNNNNNNNNNNNNNNNNNGTTTTCGCATTTATCGTGAAACGCTTTCGCGTTTTTCGTGCGCCGC" - "TTCA"; - const std::string rear_flank = - "CNNNNNNNNNNNNNNNNNNNNNNNNGTTTTCGCATTTATCGTGAAACGCTTTCGCGTTTTTCGTGCGCCGCTTCA"; - - EdlibAlignResult result_front = edlibAlign(front_flank.data(), int(front_flank.length()), - read.data(), int(read.length()), placement_config); - float front_score = 1.f - static_cast(result_front.editDistance) / - (front_flank.length() - barcode_len); - int bc_loc_front = extract_mask_location(result_front, front_flank); - spdlog::trace("{} {} front_score {} pos {}", debug_prefix, result_front.editDistance, - front_score, bc_loc_front); - spdlog::trace("\n{}", utils::alignment_to_str(front_flank.data(), read.data(), result_front)); - - EdlibAlignResult result_rear = edlibAlign(rear_flank.data(), int(rear_flank.length()), - read.data(), int(read.length()), placement_config); - float rear_score = 1.f - static_cast(result_rear.editDistance) / - (rear_flank.length() - barcode_len); - int bc_loc_rear = extract_mask_location(result_rear, rear_flank); - spdlog::trace("{} {} rear_score {} pos {}", debug_prefix, result_rear.editDistance, rear_score, - bc_loc_rear); - spdlog::trace("\n{}", utils::alignment_to_str(rear_flank.data(), read.data(), result_rear)); - - (void)bc_loc_front; - auto mask_len = bc_loc_rear + barcode_len - (front_score > 0.6f ? bc_loc_front : 0); - if (mask_len == 0) { - spdlog::error("{} {} front_score {} pos {}", debug_prefix, result_front.editDistance, - front_score, bc_loc_front); - spdlog::error("\n{}", - utils::alignment_to_str(front_flank.data(), read.data(), result_front)); - spdlog::error("{} {} rear_score {} pos {}", debug_prefix, result_rear.editDistance, - rear_score, bc_loc_rear); - spdlog::error("\n{}", utils::alignment_to_str(rear_flank.data(), read.data(), result_rear)); - } - return {result_rear, (rear_score + front_score) / 2, bc_loc_front, mask_len}; -} - // Helper function to globally align a barcode to a region // within the read. float extract_mask_score(std::string_view barcode, @@ -142,8 +96,10 @@ float extract_mask_score(std::string_view barcode, const char* debug_prefix) { auto result = edlibAlign(barcode.data(), int(barcode.length()), read.data(), int(read.length()), config); - float score = 1.f - static_cast(result.editDistance) / barcode.length(); - spdlog::trace("{} {} score {}", debug_prefix, result.editDistance, score); + //float score = 1.f - static_cast(result.editDistance) / barcode.length(); + float score = result.editDistance; + spdlog::trace("{} {} position {} score {}", debug_prefix, result.editDistance, + result.startLocations[0], score); spdlog::trace("\n{}", utils::alignment_to_str(barcode.data(), read.data(), result)); edlibFreeAlignResult(result); return score; @@ -179,7 +135,9 @@ std::unordered_map process_custom_ki namespace demux { -const int TRIM_LENGTH = 150; +const int TRIM_LENGTH = 175; +const int left_buffer = 5; +const int right_buffer = 10; const BarcodeScoreResult UNCLASSIFIED{}; struct BarcodeClassifier::BarcodeCandidateKit { @@ -188,13 +146,17 @@ struct BarcodeClassifier::BarcodeCandidateKit { std::vector barcodes2; std::vector barcodes2_rev; std::string top_context; + std::string top_context_left_buffer; + std::string top_context_right_buffer; std::string top_context_rev; + std::string top_context_rev_left_buffer; + std::string top_context_rev_right_buffer; std::string bottom_context; + std::string bottom_context_left_buffer; + std::string bottom_context_right_buffer; std::string bottom_context_rev; - int top_context_front_flank_len; - int top_context_rear_flank_len; - int bottom_context_front_flank_len; - int bottom_context_rear_flank_len; + std::string bottom_context_rev_left_buffer; + std::string bottom_context_rev_right_buffer; std::vector barcode_names; std::string kit; std::string barcode_kit; @@ -289,20 +251,36 @@ std::vector BarcodeClassifier::generate_ const auto& ref_bc = get_barcode_sequence(ref_bc_name); std::string bc_mask(ref_bc.length(), 'N'); - candidate.top_context = kit_info.top_front_flank + bc_mask + kit_info.top_rear_flank; - candidate.top_context_rev = utils::reverse_complement(kit_info.top_rear_flank) + bc_mask + - utils::reverse_complement(kit_info.top_front_flank); + candidate.top_context = kit_info.top_rear_flank; + candidate.top_context_left_buffer = kit_info.top_front_flank.substr( + std::max(0lu, kit_info.top_front_flank.length() - left_buffer)); + candidate.top_context_right_buffer = kit_info.top_rear_flank.substr(0, right_buffer); + + auto top_front_flank_rc = utils::reverse_complement(kit_info.top_front_flank); + auto top_rear_flank_rc = utils::reverse_complement(kit_info.top_rear_flank); + candidate.top_context_rev = top_front_flank_rc; + candidate.top_context_rev_left_buffer = + top_rear_flank_rc.substr(std::max(0lu, top_rear_flank_rc.length() - left_buffer)); + candidate.top_context_rev_right_buffer = top_front_flank_rc.substr(0, right_buffer); if (!kit_info.barcodes2.empty()) { const auto& ref_bc2_name = kit_info.barcodes2[0]; const auto& ref_bc2 = get_barcode_sequence(ref_bc2_name); std::string bc2_mask(ref_bc2.length(), 'N'); - candidate.bottom_context = - kit_info.bottom_front_flank + bc2_mask + kit_info.bottom_rear_flank; - candidate.bottom_context_rev = utils::reverse_complement(kit_info.bottom_rear_flank) + - bc2_mask + - utils::reverse_complement(kit_info.bottom_front_flank); + candidate.bottom_context = kit_info.bottom_rear_flank; + candidate.bottom_context_left_buffer = kit_info.bottom_front_flank.substr( + std::max(0lu, kit_info.bottom_front_flank.length() - left_buffer)); + candidate.bottom_context_right_buffer = + kit_info.bottom_rear_flank.substr(0, right_buffer); + + auto bottom_front_flank_rc = utils::reverse_complement(kit_info.bottom_front_flank); + auto bottom_rear_flank_rc = utils::reverse_complement(kit_info.bottom_rear_flank); + candidate.bottom_context_rev = bottom_front_flank_rc; + candidate.bottom_context_rev_left_buffer = bottom_rear_flank_rc.substr( + std::max(0lu, bottom_rear_flank_rc.length() - left_buffer)); + candidate.bottom_context_rev_right_buffer = + bottom_front_flank_rc.substr(0, right_buffer); } for (size_t idx = 0; idx < kit_info.barcodes.size(); idx++) { @@ -373,28 +351,52 @@ std::vector BarcodeClassifier::calculate_barcode_score_diffe EdlibAlignConfig mask_config = init_edlib_config_for_mask(); std::string_view top_context_v1 = candidate.top_context; - std::string_view bottom_strand_v1 = candidate.bottom_context_rev; + const auto& top_context_v1_left_buffer = candidate.top_context_left_buffer; + const auto& top_context_v1_right_buffer = candidate.top_context_right_buffer; + + std::string_view bottom_context_v1 = candidate.bottom_context_rev; + const auto& bottom_context_v1_left_buffer = candidate.bottom_context_rev_left_buffer; + const auto& bottom_context_v1_right_buffer = candidate.bottom_context_rev_right_buffer; + std::string_view top_context_v2 = candidate.bottom_context; - std::string_view bottom_strand_v2 = candidate.top_context_rev; + const auto& top_context_v2_left_buffer = candidate.bottom_context_left_buffer; + const auto& top_context_v2_right_buffer = candidate.bottom_context_right_buffer; + + std::string_view bottom_context_v2 = candidate.top_context_rev; + const auto& bottom_context_v2_left_buffer = candidate.top_context_rev_left_buffer; + const auto& bottom_context_v2_right_buffer = candidate.top_context_rev_right_buffer; + int barcode_len = int(candidate.barcodes1[0].length()); // Fetch barcode mask locations for variant 1 auto [top_result_v1, top_flank_score_v1, top_bc_loc_v1] = extract_flank_fit( top_context_v1, read_top, barcode_len, placement_config, "top score v1"); - std::string_view top_mask_v1 = read_top.substr(top_bc_loc_v1, barcode_len); + auto top_start_idx_v1 = std::max(0, top_bc_loc_v1 - left_buffer - barcode_len); + auto top_end_idx_v1 = top_bc_loc_v1 + right_buffer; + std::string_view top_mask_v1 = + read_top.substr(top_start_idx_v1, top_end_idx_v1 - top_start_idx_v1); auto [bottom_result_v1, bottom_flank_score_v1, bottom_bc_loc_v1] = extract_flank_fit( - bottom_strand_v1, read_bottom, barcode_len, placement_config, "bottom score v1"); - std::string_view bottom_mask_v1 = read_bottom.substr(bottom_bc_loc_v1, barcode_len); + bottom_context_v1, read_bottom, barcode_len, placement_config, "bottom score v1"); + auto bottom_start_idx_v1 = std::max(0, bottom_bc_loc_v1 - left_buffer - barcode_len); + auto bottom_end_idx_v1 = bottom_bc_loc_v1 + right_buffer; + std::string_view bottom_mask_v1 = + read_bottom.substr(bottom_start_idx_v1, bottom_end_idx_v1 - bottom_start_idx_v1); // Fetch barcode mask locations for variant 2 auto [top_result_v2, top_flank_score_v2, top_bc_loc_v2] = extract_flank_fit( top_context_v2, read_top, barcode_len, placement_config, "top score v2"); - std::string_view top_mask_v2 = read_top.substr(top_bc_loc_v2, barcode_len); + auto top_start_idx_v2 = std::max(0, top_bc_loc_v2 - left_buffer - barcode_len); + auto top_end_idx_v2 = top_bc_loc_v2 + right_buffer; + std::string_view top_mask_v2 = + read_top.substr(top_start_idx_v2, top_end_idx_v2 - top_start_idx_v2); auto [bottom_result_v2, bottom_flank_score_v2, bottom_bc_loc_v2] = extract_flank_fit( - bottom_strand_v2, read_bottom, barcode_len, placement_config, "bottom score v2"); - std::string_view bottom_mask_v2 = read_bottom.substr(bottom_bc_loc_v2, barcode_len); + bottom_context_v2, read_bottom, barcode_len, placement_config, "bottom score v2"); + auto bottom_start_idx_v2 = std::max(0, bottom_bc_loc_v2 - left_buffer - barcode_len); + auto bottom_end_idx_v2 = bottom_bc_loc_v2 + right_buffer; + std::string_view bottom_mask_v2 = + read_bottom.substr(bottom_start_idx_v2, bottom_end_idx_v2 - bottom_start_idx_v2); // Find the best variant of the two. int total_v1_score = top_result_v1.editDistance + bottom_result_v1.editDistance; @@ -413,10 +415,14 @@ std::vector BarcodeClassifier::calculate_barcode_score_diffe std::vector results; for (size_t i = 0; i < candidate.barcodes1.size(); i++) { - auto& barcode1 = candidate.barcodes1[i]; - auto& barcode1_rev = candidate.barcodes1_rev[i]; - auto& barcode2 = candidate.barcodes2[i]; - auto& barcode2_rev = candidate.barcodes2_rev[i]; + const auto barcode1 = + top_context_v1_left_buffer + candidate.barcodes1[i] + top_context_v1_right_buffer; + const auto barcode1_rev = bottom_context_v2_left_buffer + candidate.barcodes1_rev[i] + + bottom_context_v2_right_buffer; + const auto barcode2 = + top_context_v2_left_buffer + candidate.barcodes2[i] + top_context_v2_right_buffer; + const auto barcode2_rev = bottom_context_v1_left_buffer + candidate.barcodes2_rev[i] + + bottom_context_v1_right_buffer; auto& barcode_name = candidate.barcode_names[i]; if (!barcode_is_permitted(allowed_barcodes, barcode_name)) { @@ -435,8 +441,8 @@ std::vector BarcodeClassifier::calculate_barcode_score_diffe BarcodeScoreResult v1; v1.top_score = top_mask_result_score_v1; v1.bottom_score = bottom_mask_result_score_v1; - v1.score = std::max(v1.top_score, v1.bottom_score); - v1.use_top = v1.top_score > v1.bottom_score; + v1.score = std::min(v1.top_score, v1.bottom_score); + v1.use_top = v1.top_score < v1.bottom_score; v1.top_flank_score = top_flank_score_v1; v1.bottom_flank_score = bottom_flank_score_v1; v1.flank_score = v1.use_top ? top_flank_score_v1 : bottom_flank_score_v1; @@ -454,8 +460,8 @@ std::vector BarcodeClassifier::calculate_barcode_score_diffe BarcodeScoreResult v2; v2.top_score = top_mask_result_score_v2; v2.bottom_score = bottom_mask_result_score_v2; - v2.score = std::max(v2.top_score, v2.bottom_score); - v2.use_top = v2.top_score > v2.bottom_score; + v2.score = std::min(v2.top_score, v2.bottom_score); + v2.use_top = v2.top_score < v2.bottom_score; v2.top_flank_score = top_flank_score_v2; v2.bottom_flank_score = bottom_flank_score_v2; v2.flank_score = v2.use_top ? top_flank_score_v2 : bottom_flank_score_v2; @@ -464,7 +470,7 @@ std::vector BarcodeClassifier::calculate_barcode_score_diffe bottom_start + bottom_result_v2.endLocations[0]}; // The best score is the higher score between the 2 variants. - const bool var1_is_best = v1.score > v2.score; + const bool var1_is_best = v1.score < v2.score; BarcodeScoreResult res = var1_is_best ? v1 : v2; res.variant = var1_is_best ? "var1" : "var2"; res.barcode_name = barcode_name; @@ -506,21 +512,32 @@ std::vector BarcodeClassifier::calculate_barcode_score_doubl EdlibAlignConfig mask_config = init_edlib_config_for_mask(); std::string_view top_context = candidate.top_context; + const auto& top_left_buffer = candidate.top_context_left_buffer; + const auto& top_right_buffer = candidate.top_context_right_buffer; + std::string_view bottom_strand = candidate.top_context_rev; + const auto& bottom_left_buffer = candidate.top_context_rev_left_buffer; + const auto& bottom_right_buffer = candidate.top_context_rev_right_buffer; + int barcode_len = int(candidate.barcodes1[0].length()); auto [top_result, top_flank_score, top_bc_loc] = extract_flank_fit(top_context, read_top, barcode_len, placement_config, "top score"); - std::string_view top_mask = read_top.substr(top_bc_loc, barcode_len); + auto top_start_idx = std::max(0, top_bc_loc - left_buffer - barcode_len); + auto top_end_idx = top_bc_loc + right_buffer; + std::string_view top_mask = read_top.substr(top_start_idx, top_end_idx - top_start_idx); auto [bottom_result, bottom_flank_score, bottom_bc_loc] = extract_flank_fit( bottom_strand, read_bottom, barcode_len, placement_config, "bottom score"); - std::string_view bottom_mask = read_bottom.substr(bottom_bc_loc, barcode_len); + auto bottom_start_idx = std::max(0, bottom_bc_loc - left_buffer - barcode_len); + auto bottom_end_idx = bottom_bc_loc + right_buffer; + std::string_view bottom_mask = + read_bottom.substr(bottom_start_idx, bottom_end_idx - bottom_start_idx); std::vector results; for (size_t i = 0; i < candidate.barcodes1.size(); i++) { - auto& barcode = candidate.barcodes1[i]; - auto& barcode_rev = candidate.barcodes1_rev[i]; + auto barcode = top_left_buffer + candidate.barcodes1[i] + top_right_buffer; + auto barcode_rev = bottom_left_buffer + candidate.barcodes1_rev[i] + bottom_right_buffer; auto& barcode_name = candidate.barcode_names[i]; if (!barcode_is_permitted(allowed_barcodes, barcode_name)) { @@ -539,8 +556,8 @@ std::vector BarcodeClassifier::calculate_barcode_score_doubl res.barcode_kit = candidate.barcode_kit; res.top_score = top_mask_score; res.bottom_score = bottom_mask_score; - res.score = std::max(res.top_score, res.bottom_score); - res.use_top = res.top_score > res.bottom_score; + res.score = std::min(res.top_score, res.bottom_score); + res.use_top = res.top_score < res.bottom_score; res.top_flank_score = top_flank_score; res.bottom_flank_score = bottom_flank_score; res.flank_score = res.use_top ? res.top_flank_score : res.bottom_flank_score; @@ -576,22 +593,25 @@ std::vector BarcodeClassifier::calculate_barcode_score( std::string_view top_context = candidate.top_context; int barcode_len = int(candidate.barcodes1[0].length()); - auto [top_result, top_flank_score, top_bc_loc, mask_len] = extract_flank_fit_rbk( - top_context, read_top, barcode_len, placement_config, "top score"); - std::string_view top_mask = read_top.substr(top_bc_loc, mask_len); + auto [top_result, top_flank_score, top_bc_loc] = + extract_flank_fit(top_context, read_top, barcode_len, placement_config, "top score"); + auto start_idx = std::max(0, top_bc_loc - left_buffer - barcode_len); + auto end_idx = top_bc_loc + right_buffer; + std::string_view top_mask = read_top.substr(start_idx, end_idx - start_idx); + spdlog::trace("BC location {}", top_bc_loc); std::vector results; for (size_t i = 0; i < candidate.barcodes1.size(); i++) { - auto& barcode = candidate.barcodes1[i]; + const auto barcode = candidate.top_context_left_buffer + candidate.barcodes1[i] + + candidate.top_context_right_buffer; + ; auto& barcode_name = candidate.barcode_names[i]; if (!barcode_is_permitted(allowed_barcodes, barcode_name)) { continue; } - spdlog::trace("Checking barcode {}", barcode_name); - auto top_mask_score = extract_mask_score(barcode, top_mask, mask_config, "top window"); BarcodeScoreResult res; @@ -620,10 +640,10 @@ BarcodeScoreResult BarcodeClassifier::find_best_barcode( const std::vector& candidates, bool barcode_both_ends, const BarcodingInfo::FilterSet& allowed_barcodes) const { - if (read_seq.length() < TRIM_LENGTH) { - spdlog::trace("Read length shorter than minimum required ({}) : {}", TRIM_LENGTH, read_seq); - return UNCLASSIFIED; - } + //if (read_seq.length() < TRIM_LENGTH) { + // //spdlog::info("reject due to seq length"); + // return UNCLASSIFIED; + //} const std::string_view fwd = read_seq; // First find best barcode kit. @@ -653,36 +673,39 @@ BarcodeScoreResult BarcodeClassifier::find_best_barcode( } if (scores.empty()) { + spdlog::info("reject due to no score"); return UNCLASSIFIED; } - if (kit.double_ends) { - // For a double ended barcode, ensure that the best barcode according - // to the top window and the best barcode according to the bottom window - // are the same. If they suggest different barcodes confidently, then - // consider the read unclassified. - auto best_top_score = std::max_element( - scores.begin(), scores.end(), - [](const auto& l, const auto& r) { return l.top_score < r.top_score; }); - auto best_bottom_score = std::max_element( - scores.begin(), scores.end(), - [](const auto& l, const auto& r) { return l.bottom_score < r.bottom_score; }); - if ((best_top_score->score > m_scoring_params.min_soft_barcode_threshold) && - (best_bottom_score->score > m_scoring_params.min_soft_barcode_threshold) && - (best_top_score->barcode_name != best_bottom_score->barcode_name)) { - spdlog::trace("Two ends confidently predict different BCs: top bc {}, bottom bc {}", - best_top_score->barcode_name, best_bottom_score->barcode_name); - return UNCLASSIFIED; - } - } + //if (kit.double_ends) { + // // For a double ended barcode, ensure that the best barcode according + // // to the top window and the best barcode according to the bottom window + // // are the same. If they suggest different barcodes confidently, then + // // consider the read unclassified. + // auto best_top_score = std::max_element( + // scores.begin(), scores.end(), + // [](const auto& l, const auto& r) { return l.top_score < r.top_score; }); + // auto best_bottom_score = std::max_element( + // scores.begin(), scores.end(), + // [](const auto& l, const auto& r) { return l.bottom_score < r.bottom_score; }); + // spdlog::trace("Check double ends: top bc {}, bottom bc {}", best_top_score->barcode_name, + // best_bottom_score->barcode_name); + // if ((best_top_score->score > m_scoring_params.min_soft_barcode_threshold) && + // (best_bottom_score->score > m_scoring_params.min_soft_barcode_threshold) && + // (best_top_score->barcode_name != best_bottom_score->barcode_name)) { + // return UNCLASSIFIED; + // } + //} // Sort the scores windows by their barcode score. + //std::sort(scores.begin(), scores.end(), + // [](const auto& l, const auto& r) { return l.score > r.score; }); std::sort(scores.begin(), scores.end(), - [](const auto& l, const auto& r) { return l.score > r.score; }); + [](const auto& l, const auto& r) { return l.score < r.score; }); std::stringstream d; for (auto& s : scores) { - d << s.score << " " << s.barcode_name << ", "; + d << s.barcode_name << " " << s.score << ", "; } spdlog::trace("Scores: {}", d.str()); auto best_score = scores.begin(); @@ -702,12 +725,16 @@ BarcodeScoreResult BarcodeClassifier::find_best_barcode( } } else { auto second_best_score = std::next(best_score); - if (best_score->score - second_best_score->score >= 0.1f) { - const float kMargin = m_scoring_params.min_barcode_score_dist; - if (are_scores_acceptable(*best_score) || - (best_score->score - second_best_score->score >= kMargin)) { - out = *best_score; - } + //if (best_score->score - second_best_score->score > 0.05f) { + // const float kMargin = m_scoring_params.min_barcode_score_dist; + // if (are_scores_acceptable(*best_score) || + // (best_score->score - second_best_score->score >= kMargin)) { + // out = *best_score; + // } + //} + if (std::abs(best_score->score - second_best_score->score) >= 3 && + best_score->top_barcode_pos.first <= 75) { + out = *best_score; } } diff --git a/dorado/demux/BarcodeClassifier.h b/dorado/demux/BarcodeClassifier.h index 61f7e4df..0e54e380 100644 --- a/dorado/demux/BarcodeClassifier.h +++ b/dorado/demux/BarcodeClassifier.h @@ -8,6 +8,7 @@ #include #include #include +#include #include namespace dorado { From 1659264009497020d7c6e9898fcf140bd5095a1a Mon Sep 17 00:00:00 2001 From: Joyjit Daw <1127155+tijyojwad@users.noreply.github.com> Date: Mon, 8 Jan 2024 03:50:12 +0000 Subject: [PATCH 05/23] extend new bc classification heuristic to all kits --- dorado/demux/BarcodeClassifier.cpp | 136 ++++++++++++++++++----------- dorado/demux/BarcodeClassifier.h | 3 + 2 files changed, 86 insertions(+), 53 deletions(-) diff --git a/dorado/demux/BarcodeClassifier.cpp b/dorado/demux/BarcodeClassifier.cpp index fc6fb5f2..829d24af 100644 --- a/dorado/demux/BarcodeClassifier.cpp +++ b/dorado/demux/BarcodeClassifier.cpp @@ -50,12 +50,19 @@ EdlibAlignConfig init_edlib_config_for_mask() { int extract_mask_location(EdlibAlignResult aln, std::string_view query) { int query_cursor = 0; int target_cursor = 0; + bool in_mask = false; for (int i = 0; i < aln.alignmentLength; i++) { + if (query[query_cursor] != 'N' && in_mask) { + break; + } if (aln.alignment[i] == EDLIB_EDOP_MATCH) { query_cursor++; target_cursor++; if (query[query_cursor] == 'N') { - break; + in_mask = true; + //break; + //} else if (in_mask) { + // break; } } else if (aln.alignment[i] == EDLIB_EDOP_MISMATCH) { query_cursor++; @@ -66,6 +73,7 @@ int extract_mask_location(EdlibAlignResult aln, std::string_view query) { query_cursor++; } } + spdlog::trace("query cursor {} target cursor {}", query_cursor, target_cursor); return aln.startLocations[0] + target_cursor; } @@ -81,8 +89,9 @@ std::tuple extract_flank_fit(std::string_view stra int(read.length()), placement_config); float score = 1.f - static_cast(result.editDistance) / (strand.length() - barcode_len); (void)extract_mask_location(result, strand); - int bc_loc = result.startLocations[0]; //std::max(0, result.startLocations[0] - barcode_len); - spdlog::trace("{} {} position {} bc_loc {} score {}", debug_prefix, result.editDistance, + //int bc_loc = result.startLocations[0]; + int bc_loc = extract_mask_location(result, strand); + spdlog::trace("{} dist {} position {} bc_loc {} score {}", debug_prefix, result.editDistance, result.startLocations[0], bc_loc, score); spdlog::trace("\n{}", utils::alignment_to_str(strand.data(), read.data(), result)); return {result, score, bc_loc}; @@ -136,8 +145,8 @@ std::unordered_map process_custom_ki namespace demux { const int TRIM_LENGTH = 175; -const int left_buffer = 5; -const int right_buffer = 10; +const int LEFT_BUFFER = 5; +const int RIGHT_BUFFER = 10; const BarcodeScoreResult UNCLASSIFIED{}; struct BarcodeClassifier::BarcodeCandidateKit { @@ -244,6 +253,20 @@ std::vector BarcodeClassifier::generate_ "an equal number of them"); } + // Update left and right buffer lengths based on the actual flank regions of the + // chosen kit. + m_left_buffer = std::min({LEFT_BUFFER, int(kit_info.top_front_flank.length()), + int(kit_info.top_rear_flank.length())}); + m_right_buffer = std::min({RIGHT_BUFFER, int(kit_info.top_rear_flank.length()), + int(kit_info.top_front_flank.length())}); + if (!kit_info.barcodes2.empty()) { + m_left_buffer = std::min({int(m_left_buffer), int(kit_info.bottom_front_flank.length()), + int(kit_info.bottom_rear_flank.length())}); + m_right_buffer = + std::min({int(m_right_buffer), int(kit_info.bottom_rear_flank.length()), + int(kit_info.bottom_front_flank.length())}); + } + BarcodeCandidateKit candidate; candidate.kit = kit_name; candidate.barcode_kit = kit_info.name; @@ -251,36 +274,37 @@ std::vector BarcodeClassifier::generate_ const auto& ref_bc = get_barcode_sequence(ref_bc_name); std::string bc_mask(ref_bc.length(), 'N'); - candidate.top_context = kit_info.top_rear_flank; + candidate.top_context = kit_info.top_front_flank + bc_mask + kit_info.top_rear_flank; candidate.top_context_left_buffer = kit_info.top_front_flank.substr( - std::max(0lu, kit_info.top_front_flank.length() - left_buffer)); - candidate.top_context_right_buffer = kit_info.top_rear_flank.substr(0, right_buffer); + std::max(0lu, kit_info.top_front_flank.length() - m_left_buffer)); + candidate.top_context_right_buffer = kit_info.top_rear_flank.substr(0, m_right_buffer); auto top_front_flank_rc = utils::reverse_complement(kit_info.top_front_flank); auto top_rear_flank_rc = utils::reverse_complement(kit_info.top_rear_flank); - candidate.top_context_rev = top_front_flank_rc; + candidate.top_context_rev = top_rear_flank_rc + bc_mask + top_front_flank_rc; candidate.top_context_rev_left_buffer = - top_rear_flank_rc.substr(std::max(0lu, top_rear_flank_rc.length() - left_buffer)); - candidate.top_context_rev_right_buffer = top_front_flank_rc.substr(0, right_buffer); + top_rear_flank_rc.substr(std::max(0lu, top_rear_flank_rc.length() - m_left_buffer)); + candidate.top_context_rev_right_buffer = top_front_flank_rc.substr(0, m_right_buffer); if (!kit_info.barcodes2.empty()) { const auto& ref_bc2_name = kit_info.barcodes2[0]; const auto& ref_bc2 = get_barcode_sequence(ref_bc2_name); std::string bc2_mask(ref_bc2.length(), 'N'); - candidate.bottom_context = kit_info.bottom_rear_flank; + candidate.bottom_context = + kit_info.bottom_front_flank + bc2_mask + kit_info.bottom_rear_flank; candidate.bottom_context_left_buffer = kit_info.bottom_front_flank.substr( - std::max(0lu, kit_info.bottom_front_flank.length() - left_buffer)); + std::max(0lu, kit_info.bottom_front_flank.length() - m_left_buffer)); candidate.bottom_context_right_buffer = - kit_info.bottom_rear_flank.substr(0, right_buffer); + kit_info.bottom_rear_flank.substr(0, m_right_buffer); auto bottom_front_flank_rc = utils::reverse_complement(kit_info.bottom_front_flank); auto bottom_rear_flank_rc = utils::reverse_complement(kit_info.bottom_rear_flank); - candidate.bottom_context_rev = bottom_front_flank_rc; + candidate.bottom_context_rev = bottom_rear_flank_rc + bc_mask + bottom_front_flank_rc; candidate.bottom_context_rev_left_buffer = bottom_rear_flank_rc.substr( - std::max(0lu, bottom_rear_flank_rc.length() - left_buffer)); + std::max(0lu, bottom_rear_flank_rc.length() - m_left_buffer)); candidate.bottom_context_rev_right_buffer = - bottom_front_flank_rc.substr(0, right_buffer); + bottom_front_flank_rc.substr(0, m_right_buffer); } for (size_t idx = 0; idx < kit_info.barcodes.size(); idx++) { @@ -371,30 +395,30 @@ std::vector BarcodeClassifier::calculate_barcode_score_diffe // Fetch barcode mask locations for variant 1 auto [top_result_v1, top_flank_score_v1, top_bc_loc_v1] = extract_flank_fit( top_context_v1, read_top, barcode_len, placement_config, "top score v1"); - auto top_start_idx_v1 = std::max(0, top_bc_loc_v1 - left_buffer - barcode_len); - auto top_end_idx_v1 = top_bc_loc_v1 + right_buffer; + auto top_start_idx_v1 = std::max(0, top_bc_loc_v1 - m_left_buffer - barcode_len); + auto top_end_idx_v1 = top_bc_loc_v1 + m_right_buffer; std::string_view top_mask_v1 = read_top.substr(top_start_idx_v1, top_end_idx_v1 - top_start_idx_v1); auto [bottom_result_v1, bottom_flank_score_v1, bottom_bc_loc_v1] = extract_flank_fit( bottom_context_v1, read_bottom, barcode_len, placement_config, "bottom score v1"); - auto bottom_start_idx_v1 = std::max(0, bottom_bc_loc_v1 - left_buffer - barcode_len); - auto bottom_end_idx_v1 = bottom_bc_loc_v1 + right_buffer; + auto bottom_start_idx_v1 = std::max(0, bottom_bc_loc_v1 - m_left_buffer - barcode_len); + auto bottom_end_idx_v1 = bottom_bc_loc_v1 + m_right_buffer; std::string_view bottom_mask_v1 = read_bottom.substr(bottom_start_idx_v1, bottom_end_idx_v1 - bottom_start_idx_v1); // Fetch barcode mask locations for variant 2 auto [top_result_v2, top_flank_score_v2, top_bc_loc_v2] = extract_flank_fit( top_context_v2, read_top, barcode_len, placement_config, "top score v2"); - auto top_start_idx_v2 = std::max(0, top_bc_loc_v2 - left_buffer - barcode_len); - auto top_end_idx_v2 = top_bc_loc_v2 + right_buffer; + auto top_start_idx_v2 = std::max(0, top_bc_loc_v2 - m_left_buffer - barcode_len); + auto top_end_idx_v2 = top_bc_loc_v2 + m_right_buffer; std::string_view top_mask_v2 = read_top.substr(top_start_idx_v2, top_end_idx_v2 - top_start_idx_v2); auto [bottom_result_v2, bottom_flank_score_v2, bottom_bc_loc_v2] = extract_flank_fit( bottom_context_v2, read_bottom, barcode_len, placement_config, "bottom score v2"); - auto bottom_start_idx_v2 = std::max(0, bottom_bc_loc_v2 - left_buffer - barcode_len); - auto bottom_end_idx_v2 = bottom_bc_loc_v2 + right_buffer; + auto bottom_start_idx_v2 = std::max(0, bottom_bc_loc_v2 - m_left_buffer - barcode_len); + auto bottom_end_idx_v2 = bottom_bc_loc_v2 + m_right_buffer; std::string_view bottom_mask_v2 = read_bottom.substr(bottom_start_idx_v2, bottom_end_idx_v2 - bottom_start_idx_v2); @@ -523,14 +547,14 @@ std::vector BarcodeClassifier::calculate_barcode_score_doubl auto [top_result, top_flank_score, top_bc_loc] = extract_flank_fit(top_context, read_top, barcode_len, placement_config, "top score"); - auto top_start_idx = std::max(0, top_bc_loc - left_buffer - barcode_len); - auto top_end_idx = top_bc_loc + right_buffer; + auto top_start_idx = std::max(0, top_bc_loc - m_left_buffer - barcode_len); + auto top_end_idx = top_bc_loc + m_right_buffer; std::string_view top_mask = read_top.substr(top_start_idx, top_end_idx - top_start_idx); auto [bottom_result, bottom_flank_score, bottom_bc_loc] = extract_flank_fit( bottom_strand, read_bottom, barcode_len, placement_config, "bottom score"); - auto bottom_start_idx = std::max(0, bottom_bc_loc - left_buffer - barcode_len); - auto bottom_end_idx = bottom_bc_loc + right_buffer; + auto bottom_start_idx = std::max(0, bottom_bc_loc - m_left_buffer - barcode_len); + auto bottom_end_idx = bottom_bc_loc + m_right_buffer; std::string_view bottom_mask = read_bottom.substr(bottom_start_idx, bottom_end_idx - bottom_start_idx); @@ -595,8 +619,8 @@ std::vector BarcodeClassifier::calculate_barcode_score( auto [top_result, top_flank_score, top_bc_loc] = extract_flank_fit(top_context, read_top, barcode_len, placement_config, "top score"); - auto start_idx = std::max(0, top_bc_loc - left_buffer - barcode_len); - auto end_idx = top_bc_loc + right_buffer; + auto start_idx = std::max(0, top_bc_loc - m_left_buffer - barcode_len); + auto end_idx = top_bc_loc + m_right_buffer; std::string_view top_mask = read_top.substr(start_idx, end_idx - start_idx); spdlog::trace("BC location {}", top_bc_loc); @@ -677,25 +701,27 @@ BarcodeScoreResult BarcodeClassifier::find_best_barcode( return UNCLASSIFIED; } - //if (kit.double_ends) { - // // For a double ended barcode, ensure that the best barcode according - // // to the top window and the best barcode according to the bottom window - // // are the same. If they suggest different barcodes confidently, then - // // consider the read unclassified. - // auto best_top_score = std::max_element( - // scores.begin(), scores.end(), - // [](const auto& l, const auto& r) { return l.top_score < r.top_score; }); - // auto best_bottom_score = std::max_element( - // scores.begin(), scores.end(), - // [](const auto& l, const auto& r) { return l.bottom_score < r.bottom_score; }); - // spdlog::trace("Check double ends: top bc {}, bottom bc {}", best_top_score->barcode_name, - // best_bottom_score->barcode_name); - // if ((best_top_score->score > m_scoring_params.min_soft_barcode_threshold) && - // (best_bottom_score->score > m_scoring_params.min_soft_barcode_threshold) && - // (best_top_score->barcode_name != best_bottom_score->barcode_name)) { - // return UNCLASSIFIED; - // } - //} + if (kit.double_ends) { + // For a double ended barcode, ensure that the best barcode according + // to the top window and the best barcode according to the bottom window + // are the same. If they suggest different barcodes confidently, then + // consider the read unclassified. + auto best_top_score = std::min_element( + scores.begin(), scores.end(), + [](const auto& l, const auto& r) { return l.top_score < r.top_score; }); + auto best_bottom_score = std::min_element( + scores.begin(), scores.end(), + [](const auto& l, const auto& r) { return l.bottom_score < r.bottom_score; }); + spdlog::trace("Check double ends: top bc {}, bottom bc {}", best_top_score->barcode_name, + best_bottom_score->barcode_name); + //if ((best_top_score->score > m_scoring_params.min_soft_barcode_threshold) && + // (best_bottom_score->score > m_scoring_params.min_soft_barcode_threshold) && + // (best_top_score->barcode_name != best_bottom_score->barcode_name)) { + if ((best_top_score->score < 10) && (best_bottom_score->score < 10) && + (best_top_score->barcode_name != best_bottom_score->barcode_name)) { + return UNCLASSIFIED; + } + } // Sort the scores windows by their barcode score. //std::sort(scores.begin(), scores.end(), @@ -732,8 +758,13 @@ BarcodeScoreResult BarcodeClassifier::find_best_barcode( // out = *best_score; // } //} + //if (std::abs(best_score->score - second_best_score->score) >= 3 && + // (best_score->top_barcode_pos.first <= 125 || best_score->bottom_barcode_pos.second >= int(read_seq.length()) - 125)) { + // out = *best_score; + //} if (std::abs(best_score->score - second_best_score->score) >= 3 && - best_score->top_barcode_pos.first <= 75) { + (best_score->top_barcode_pos.first <= 75 || + best_score->bottom_barcode_pos.second >= int(read_seq.length() - 75))) { out = *best_score; } } @@ -742,8 +773,7 @@ BarcodeScoreResult BarcodeClassifier::find_best_barcode( // For more stringent classification, ensure that both ends of a read // have a high score for the same barcode. If not then consider it // unclassified. - if (std::min(out.top_score, out.bottom_score) < - m_scoring_params.min_hard_barcode_threshold) { + if (out.top_score >= 10 || out.bottom_score >= 10) { return UNCLASSIFIED; } } diff --git a/dorado/demux/BarcodeClassifier.h b/dorado/demux/BarcodeClassifier.h index 0e54e380..c0cd28f4 100644 --- a/dorado/demux/BarcodeClassifier.h +++ b/dorado/demux/BarcodeClassifier.h @@ -54,6 +54,9 @@ class BarcodeClassifier { const dorado::barcode_kits::KitInfo& get_kit_info(const std::string& kit_name) const; const std::string& get_barcode_sequence(const std::string& barcode_name) const; + + int m_left_buffer; + int m_right_buffer; }; } // namespace demux From 0b99d0f103c8ead01d722f5a5301c927eb48af74 Mon Sep 17 00:00:00 2001 From: Joyjit Daw <1127155+tijyojwad@users.noreply.github.com> Date: Sun, 11 Feb 2024 16:25:09 +0000 Subject: [PATCH 06/23] Update config file with new parameters which can be used with the new barcoding algorithm which depends primarily on distance of barcodes between best and second best barcode hit. --- dorado/demux/BarcodeClassifier.cpp | 138 +++++++++--------- dorado/demux/parse_custom_kit.cpp | 23 +-- dorado/demux/parse_custom_kit.h | 11 +- tests/CustomBarcodeParserTest.cpp | 26 ++-- tests/TestUtils.h | 9 ++ .../custom_barcodes/scoring_params.toml | 11 +- 6 files changed, 119 insertions(+), 99 deletions(-) diff --git a/dorado/demux/BarcodeClassifier.cpp b/dorado/demux/BarcodeClassifier.cpp index 829d24af..1bff46ed 100644 --- a/dorado/demux/BarcodeClassifier.cpp +++ b/dorado/demux/BarcodeClassifier.cpp @@ -60,9 +60,6 @@ int extract_mask_location(EdlibAlignResult aln, std::string_view query) { target_cursor++; if (query[query_cursor] == 'N') { in_mask = true; - //break; - //} else if (in_mask) { - // break; } } else if (aln.alignment[i] == EDLIB_EDOP_MISMATCH) { query_cursor++; @@ -88,8 +85,6 @@ std::tuple extract_flank_fit(std::string_view stra EdlibAlignResult result = edlibAlign(strand.data(), int(strand.length()), read.data(), int(read.length()), placement_config); float score = 1.f - static_cast(result.editDistance) / (strand.length() - barcode_len); - (void)extract_mask_location(result, strand); - //int bc_loc = result.startLocations[0]; int bc_loc = extract_mask_location(result, strand); spdlog::trace("{} dist {} position {} bc_loc {} score {}", debug_prefix, result.editDistance, result.startLocations[0], bc_loc, score); @@ -105,10 +100,8 @@ float extract_mask_score(std::string_view barcode, const char* debug_prefix) { auto result = edlibAlign(barcode.data(), int(barcode.length()), read.data(), int(read.length()), config); - //float score = 1.f - static_cast(result.editDistance) / barcode.length(); - float score = result.editDistance; - spdlog::trace("{} {} position {} score {}", debug_prefix, result.editDistance, - result.startLocations[0], score); + auto score = result.editDistance; + spdlog::trace("{} {} position {}", debug_prefix, score, result.startLocations[0]); spdlog::trace("\n{}", utils::alignment_to_str(barcode.data(), read.data(), result)); edlibFreeAlignResult(result); return score; @@ -140,6 +133,16 @@ std::unordered_map process_custom_ki return kit_map; } +// Helper to extract left buffer from a flank. +std::string extract_left_buffer(const std::string& flank, int buffer) { + return flank.substr(std::max(0lu, flank.length() - buffer)); +} + +std::string extract_right_buffer(const std::string& flank, int buffer) { + ; + return flank.substr(0, buffer); +} + } // namespace namespace demux { @@ -253,12 +256,19 @@ std::vector BarcodeClassifier::generate_ "an equal number of them"); } + bool use_leading_flank = true; + if (kit_name.find("SQK-RBK114") != std::string::npos) { + use_leading_flank = false; + } + // Update left and right buffer lengths based on the actual flank regions of the // chosen kit. - m_left_buffer = std::min({LEFT_BUFFER, int(kit_info.top_front_flank.length()), - int(kit_info.top_rear_flank.length())}); - m_right_buffer = std::min({RIGHT_BUFFER, int(kit_info.top_rear_flank.length()), - int(kit_info.top_front_flank.length())}); + m_left_buffer = + std::min({m_scoring_params.flank_left_pad, int(kit_info.top_front_flank.length()), + int(kit_info.top_rear_flank.length())}); + m_right_buffer = + std::min({m_scoring_params.flank_right_pad, int(kit_info.top_rear_flank.length()), + int(kit_info.top_front_flank.length())}); if (!kit_info.barcodes2.empty()) { m_left_buffer = std::min({int(m_left_buffer), int(kit_info.bottom_front_flank.length()), int(kit_info.bottom_rear_flank.length())}); @@ -274,37 +284,51 @@ std::vector BarcodeClassifier::generate_ const auto& ref_bc = get_barcode_sequence(ref_bc_name); std::string bc_mask(ref_bc.length(), 'N'); - candidate.top_context = kit_info.top_front_flank + bc_mask + kit_info.top_rear_flank; - candidate.top_context_left_buffer = kit_info.top_front_flank.substr( - std::max(0lu, kit_info.top_front_flank.length() - m_left_buffer)); - candidate.top_context_right_buffer = kit_info.top_rear_flank.substr(0, m_right_buffer); + + // Pre-populate the sequences representing the front and rear flanks of the barcode. This + // is generated for both ends of the barcode in double ended barcodes. + // In addition to the flanks, a short padding sequence is also extracted from the flanks on + // either end of the barcode. This padding sequence is used during alignment ofthe extracted mask + // region with candidate barcodes. e.g. + // | FLANK1 | BC | FLANK2 | + // | ACTGCA | CCC | GGTCAT | + // If the padding width is 2, then instead of matching "CCC" to the extracted mask region, + // "CACCCGG" is matched against a padded mask region. This helps anchor the front + // and rear of the barcode flanks, and improves barcode matching. + candidate.top_context = (use_leading_flank ? kit_info.top_front_flank : "") + bc_mask + + kit_info.top_rear_flank; + candidate.top_context_left_buffer = + extract_left_buffer(kit_info.top_front_flank, m_left_buffer); + candidate.top_context_right_buffer = + extract_right_buffer(kit_info.top_rear_flank, m_right_buffer); auto top_front_flank_rc = utils::reverse_complement(kit_info.top_front_flank); auto top_rear_flank_rc = utils::reverse_complement(kit_info.top_rear_flank); candidate.top_context_rev = top_rear_flank_rc + bc_mask + top_front_flank_rc; candidate.top_context_rev_left_buffer = - top_rear_flank_rc.substr(std::max(0lu, top_rear_flank_rc.length() - m_left_buffer)); - candidate.top_context_rev_right_buffer = top_front_flank_rc.substr(0, m_right_buffer); + extract_left_buffer(top_rear_flank_rc, m_left_buffer); + candidate.top_context_rev_right_buffer = + extract_right_buffer(top_front_flank_rc, m_right_buffer); if (!kit_info.barcodes2.empty()) { const auto& ref_bc2_name = kit_info.barcodes2[0]; const auto& ref_bc2 = get_barcode_sequence(ref_bc2_name); std::string bc2_mask(ref_bc2.length(), 'N'); - candidate.bottom_context = - kit_info.bottom_front_flank + bc2_mask + kit_info.bottom_rear_flank; - candidate.bottom_context_left_buffer = kit_info.bottom_front_flank.substr( - std::max(0lu, kit_info.bottom_front_flank.length() - m_left_buffer)); + candidate.bottom_context = (use_leading_flank ? kit_info.bottom_front_flank : "") + + bc2_mask + kit_info.bottom_rear_flank; + candidate.bottom_context_left_buffer = + extract_left_buffer(kit_info.bottom_front_flank, m_left_buffer); candidate.bottom_context_right_buffer = - kit_info.bottom_rear_flank.substr(0, m_right_buffer); + extract_right_buffer(kit_info.bottom_rear_flank, m_right_buffer); auto bottom_front_flank_rc = utils::reverse_complement(kit_info.bottom_front_flank); auto bottom_rear_flank_rc = utils::reverse_complement(kit_info.bottom_rear_flank); candidate.bottom_context_rev = bottom_rear_flank_rc + bc_mask + bottom_front_flank_rc; - candidate.bottom_context_rev_left_buffer = bottom_rear_flank_rc.substr( - std::max(0lu, bottom_rear_flank_rc.length() - m_left_buffer)); + candidate.bottom_context_rev_left_buffer = + extract_left_buffer(bottom_rear_flank_rc, m_left_buffer); candidate.bottom_context_rev_right_buffer = - bottom_front_flank_rc.substr(0, m_right_buffer); + extract_right_buffer(bottom_front_flank_rc, m_right_buffer); } for (size_t idx = 0; idx < kit_info.barcodes.size(); idx++) { @@ -539,7 +563,7 @@ std::vector BarcodeClassifier::calculate_barcode_score_doubl const auto& top_left_buffer = candidate.top_context_left_buffer; const auto& top_right_buffer = candidate.top_context_right_buffer; - std::string_view bottom_strand = candidate.top_context_rev; + std::string_view bottom_context = candidate.top_context_rev; const auto& bottom_left_buffer = candidate.top_context_rev_left_buffer; const auto& bottom_right_buffer = candidate.top_context_rev_right_buffer; @@ -552,7 +576,7 @@ std::vector BarcodeClassifier::calculate_barcode_score_doubl std::string_view top_mask = read_top.substr(top_start_idx, top_end_idx - top_start_idx); auto [bottom_result, bottom_flank_score, bottom_bc_loc] = extract_flank_fit( - bottom_strand, read_bottom, barcode_len, placement_config, "bottom score"); + bottom_context, read_bottom, barcode_len, placement_config, "bottom score"); auto bottom_start_idx = std::max(0, bottom_bc_loc - m_left_buffer - barcode_len); auto bottom_end_idx = bottom_bc_loc + m_right_buffer; std::string_view bottom_mask = @@ -629,12 +653,12 @@ std::vector BarcodeClassifier::calculate_barcode_score( for (size_t i = 0; i < candidate.barcodes1.size(); i++) { const auto barcode = candidate.top_context_left_buffer + candidate.barcodes1[i] + candidate.top_context_right_buffer; - ; auto& barcode_name = candidate.barcode_names[i]; if (!barcode_is_permitted(allowed_barcodes, barcode_name)) { continue; } + spdlog::trace("Checking barcode {}", barcode_name); auto top_mask_score = extract_mask_score(barcode, top_mask, mask_config, "top window"); @@ -664,10 +688,6 @@ BarcodeScoreResult BarcodeClassifier::find_best_barcode( const std::vector& candidates, bool barcode_both_ends, const BarcodingInfo::FilterSet& allowed_barcodes) const { - //if (read_seq.length() < TRIM_LENGTH) { - // //spdlog::info("reject due to seq length"); - // return UNCLASSIFIED; - //} const std::string_view fwd = read_seq; // First find best barcode kit. @@ -697,7 +717,7 @@ BarcodeScoreResult BarcodeClassifier::find_best_barcode( } if (scores.empty()) { - spdlog::info("reject due to no score"); + spdlog::warn("Barcode unclassified because no barcodes found in kit."); return UNCLASSIFIED; } @@ -712,20 +732,18 @@ BarcodeScoreResult BarcodeClassifier::find_best_barcode( auto best_bottom_score = std::min_element( scores.begin(), scores.end(), [](const auto& l, const auto& r) { return l.bottom_score < r.bottom_score; }); - spdlog::trace("Check double ends: top bc {}, bottom bc {}", best_top_score->barcode_name, - best_bottom_score->barcode_name); - //if ((best_top_score->score > m_scoring_params.min_soft_barcode_threshold) && - // (best_bottom_score->score > m_scoring_params.min_soft_barcode_threshold) && - // (best_top_score->barcode_name != best_bottom_score->barcode_name)) { - if ((best_top_score->score < 10) && (best_bottom_score->score < 10) && + auto best_score = std::max(best_top_score->score, best_bottom_score->score); + auto score_dist = std::abs(best_top_score->score - best_bottom_score->score); + if ((best_score <= m_scoring_params.max_barcode_score) && + (score_dist <= m_scoring_params.min_barcode_score_dist) && (best_top_score->barcode_name != best_bottom_score->barcode_name)) { + spdlog::trace("Two ends confidently predict different BCs: top bc {}, bottom bc {}", + best_top_score->barcode_name, best_bottom_score->barcode_name); return UNCLASSIFIED; } } // Sort the scores windows by their barcode score. - //std::sort(scores.begin(), scores.end(), - // [](const auto& l, const auto& r) { return l.score > r.score; }); std::sort(scores.begin(), scores.end(), [](const auto& l, const auto& r) { return l.score < r.score; }); @@ -736,12 +754,7 @@ BarcodeScoreResult BarcodeClassifier::find_best_barcode( spdlog::trace("Scores: {}", d.str()); auto best_score = scores.begin(); auto are_scores_acceptable = [this](const auto& score) { - return (score.flank_score >= m_scoring_params.min_soft_flank_threshold && - score.score >= m_scoring_params.min_hard_barcode_threshold) || - (score.score >= m_scoring_params.min_soft_barcode_threshold && - score.flank_score >= m_scoring_params.min_hard_flank_threshold) || - (score.top_score >= m_scoring_params.min_hard_barcode_threshold && - score.bottom_score >= m_scoring_params.min_hard_barcode_threshold); + return score.score <= m_scoring_params.max_barcode_score; }; BarcodeScoreResult out = UNCLASSIFIED; @@ -750,21 +763,14 @@ BarcodeScoreResult BarcodeClassifier::find_best_barcode( out = *best_score; } } else { - auto second_best_score = std::next(best_score); - //if (best_score->score - second_best_score->score > 0.05f) { - // const float kMargin = m_scoring_params.min_barcode_score_dist; - // if (are_scores_acceptable(*best_score) || - // (best_score->score - second_best_score->score >= kMargin)) { - // out = *best_score; - // } - //} - //if (std::abs(best_score->score - second_best_score->score) >= 3 && - // (best_score->top_barcode_pos.first <= 125 || best_score->bottom_barcode_pos.second >= int(read_seq.length()) - 125)) { - // out = *best_score; - //} - if (std::abs(best_score->score - second_best_score->score) >= 3 && - (best_score->top_barcode_pos.first <= 75 || - best_score->bottom_barcode_pos.second >= int(read_seq.length() - 75))) { + const auto& second_best_score = std::next(best_score); + const int score_dist = second_best_score->score - best_score->score; + if (((score_dist >= m_scoring_params.min_barcode_score_dist && + are_scores_acceptable(*best_score)) || + (score_dist >= m_scoring_params.min_separation_only_dist)) && + (best_score->top_barcode_pos.first <= m_scoring_params.barcode_end_proximity || + best_score->bottom_barcode_pos.second >= + int(read_seq.length() - m_scoring_params.barcode_end_proximity))) { out = *best_score; } } @@ -773,7 +779,9 @@ BarcodeScoreResult BarcodeClassifier::find_best_barcode( // For more stringent classification, ensure that both ends of a read // have a high score for the same barcode. If not then consider it // unclassified. - if (out.top_score >= 10 || out.bottom_score >= 10) { + if (std::max(out.top_score, out.bottom_score) > m_scoring_params.max_barcode_score) { + spdlog::trace("Min of top {} and bottom scores {} > max barcode score {}", + out.top_score, out.bottom_score, m_scoring_params.max_barcode_score); return UNCLASSIFIED; } } diff --git a/dorado/demux/parse_custom_kit.cpp b/dorado/demux/parse_custom_kit.cpp index e8061523..ee003cfd 100644 --- a/dorado/demux/parse_custom_kit.cpp +++ b/dorado/demux/parse_custom_kit.cpp @@ -119,20 +119,23 @@ BarcodeKitScoringParams parse_scoring_params(const std::string& arrangement_file } const auto& config = toml::find(config_toml, "scoring"); - if (config.contains("min_soft_barcode_threshold")) { - params.min_soft_barcode_threshold = toml::find(config, "min_soft_barcode_threshold"); + if (config.contains("max_barcode_score")) { + params.max_barcode_score = toml::find(config, "max_barcode_score"); } - if (config.contains("min_hard_barcode_threshold")) { - params.min_hard_barcode_threshold = toml::find(config, "min_hard_barcode_threshold"); + if (config.contains("barcode_end_proximity")) { + params.barcode_end_proximity = toml::find(config, "barcode_end_proximity"); } - if (config.contains("min_soft_flank_threshold")) { - params.min_soft_flank_threshold = toml::find(config, "min_soft_flank_threshold"); + if (config.contains("min_barcode_score_dist")) { + params.min_barcode_score_dist = toml::find(config, "min_barcode_score_dist"); } - if (config.contains("min_hard_flank_threshold")) { - params.min_hard_flank_threshold = toml::find(config, "min_hard_flank_threshold"); + if (config.contains("min_separation_only_dist")) { + params.min_separation_only_dist = toml::find(config, "min_separation_only_dist"); } - if (config.contains("min_barcode_score_dist")) { - params.min_barcode_score_dist = toml::find(config, "min_barcode_score_dist"); + if (config.contains("flank_left_pad")) { + params.flank_left_pad = toml::find(config, "flank_left_pad"); + } + if (config.contains("flank_right_pad")) { + params.flank_right_pad = toml::find(config, "flank_right_pad"); } return params; diff --git a/dorado/demux/parse_custom_kit.h b/dorado/demux/parse_custom_kit.h index 63fd2199..b122d3b3 100644 --- a/dorado/demux/parse_custom_kit.h +++ b/dorado/demux/parse_custom_kit.h @@ -9,11 +9,12 @@ namespace dorado::demux { struct BarcodeKitScoringParams { - float min_soft_barcode_threshold = 0.7f; - float min_hard_barcode_threshold = 0.6f; - float min_soft_flank_threshold = 0.7f; - float min_hard_flank_threshold = 0.6f; - float min_barcode_score_dist = 0.25f; + int max_barcode_score = 12; + int barcode_end_proximity = 75; + int min_barcode_score_dist = 3; + int min_separation_only_dist = 6; + int flank_left_pad = 5; + int flank_right_pad = 10; }; std::optional> parse_custom_arrangement( diff --git a/tests/CustomBarcodeParserTest.cpp b/tests/CustomBarcodeParserTest.cpp index 82e92d7c..644f7a42 100644 --- a/tests/CustomBarcodeParserTest.cpp +++ b/tests/CustomBarcodeParserTest.cpp @@ -120,11 +120,12 @@ TEST_CASE("Parse custom barcode scoring params", "[barcode_demux]") { auto scoring_params = dorado::demux::parse_scoring_params(test_params_file.string()); - CHECK(scoring_params.min_soft_barcode_threshold == Approx(0.1f)); - CHECK(scoring_params.min_hard_barcode_threshold == Approx(0.2f)); - CHECK(scoring_params.min_soft_flank_threshold == Approx(0.3f)); - CHECK(scoring_params.min_hard_flank_threshold == Approx(0.4f)); - CHECK(scoring_params.min_barcode_score_dist == Approx(0.5)); + CHECK(scoring_params.max_barcode_score == 10); + CHECK(scoring_params.barcode_end_proximity == 75); + CHECK(scoring_params.min_barcode_score_dist == 3); + CHECK(scoring_params.min_separation_only_dist == 5); + CHECK(scoring_params.flank_left_pad == 5); + CHECK(scoring_params.flank_right_pad == 10); } TEST_CASE("Parse default scoring params", "[barcode_demux]") { @@ -135,15 +136,12 @@ TEST_CASE("Parse default scoring params", "[barcode_demux]") { auto scoring_params = dorado::demux::parse_scoring_params(test_params_file.string()); - CHECK(scoring_params.min_soft_barcode_threshold == - Approx(default_params.min_soft_barcode_threshold)); - CHECK(scoring_params.min_hard_barcode_threshold == - Approx(default_params.min_hard_barcode_threshold)); - CHECK(scoring_params.min_soft_flank_threshold == - Approx(default_params.min_soft_flank_threshold)); - CHECK(scoring_params.min_hard_flank_threshold == - Approx(default_params.min_hard_flank_threshold)); - CHECK(scoring_params.min_barcode_score_dist == Approx(default_params.min_barcode_score_dist)); + CHECK(scoring_params.max_barcode_score == default_params.max_barcode_score); + CHECK(scoring_params.barcode_end_proximity == default_params.barcode_end_proximity); + CHECK(scoring_params.min_barcode_score_dist == default_params.min_barcode_score_dist); + CHECK(scoring_params.min_separation_only_dist == default_params.min_separation_only_dist); + CHECK(scoring_params.flank_left_pad == default_params.flank_left_pad); + CHECK(scoring_params.flank_right_pad == default_params.flank_right_pad); } TEST_CASE("Check for normalized id pattern", "[barcode_demux]") { diff --git a/tests/TestUtils.h b/tests/TestUtils.h index e9211a78..89271efa 100644 --- a/tests/TestUtils.h +++ b/tests/TestUtils.h @@ -1,5 +1,8 @@ #pragma once +#include + +#include #include #include #include @@ -37,6 +40,12 @@ struct TempDir { std::filesystem::path m_path; }; +class TraceLogger { +public: + TraceLogger() { spdlog::set_level(spdlog::level::trace); } + ~TraceLogger() { spdlog::set_level(spdlog::level::off); } +}; + } // namespace dorado::tests using namespace dorado::tests; diff --git a/tests/data/barcode_demux/custom_barcodes/scoring_params.toml b/tests/data/barcode_demux/custom_barcodes/scoring_params.toml index 79cba10a..079d50cf 100644 --- a/tests/data/barcode_demux/custom_barcodes/scoring_params.toml +++ b/tests/data/barcode_demux/custom_barcodes/scoring_params.toml @@ -1,7 +1,8 @@ [scoring] -min_soft_barcode_threshold = 0.1 -min_hard_barcode_threshold = 0.2 -min_soft_flank_threshold = 0.3 -min_hard_flank_threshold = 0.4 -min_barcode_score_dist = 0.5 +max_barcode_score = 10 +barcode_end_proximity = 75 +min_barcode_score_dist = 3 +min_separation_only_dist = 5 +flank_left_pad = 5 +flank_right_pad = 10 From 92b3966a1e0d827681ca82c5360fc9c1a201ba57 Mon Sep 17 00:00:00 2001 From: Joyjit Daw <1127155+tijyojwad@users.noreply.github.com> Date: Sun, 11 Feb 2024 16:36:36 +0000 Subject: [PATCH 07/23] removed unused variable and add comment --- dorado/demux/BarcodeClassifier.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dorado/demux/BarcodeClassifier.cpp b/dorado/demux/BarcodeClassifier.cpp index 1bff46ed..07b8fff3 100644 --- a/dorado/demux/BarcodeClassifier.cpp +++ b/dorado/demux/BarcodeClassifier.cpp @@ -148,8 +148,6 @@ std::string extract_right_buffer(const std::string& flank, int buffer) { namespace demux { const int TRIM_LENGTH = 175; -const int LEFT_BUFFER = 5; -const int RIGHT_BUFFER = 10; const BarcodeScoreResult UNCLASSIFIED{}; struct BarcodeClassifier::BarcodeCandidateKit { @@ -256,6 +254,8 @@ std::vector BarcodeClassifier::generate_ "an equal number of them"); } + // For click chemistry based, the flank region placement is better if the leading + // flank sequence of the top barcode is ignored. bool use_leading_flank = true; if (kit_name.find("SQK-RBK114") != std::string::npos) { use_leading_flank = false; From 672a1473e0318e30898700c74e93640737e1c580 Mon Sep 17 00:00:00 2001 From: Joyjit Daw <1127155+tijyojwad@users.noreply.github.com> Date: Thu, 15 Feb 2024 21:57:37 +0000 Subject: [PATCH 08/23] Update barcoding documentation to reflect the new adjustable paramters for barcoding --- documentation/CustomBarcodes.md | 38 ++++++++++--------- dorado/demux/BarcodeClassifier.cpp | 13 +++++-- dorado/demux/parse_custom_kit.cpp | 4 +- dorado/demux/parse_custom_kit.h | 2 +- tests/CustomBarcodeParserTest.cpp | 4 +- .../custom_barcodes/scoring_params.toml | 2 +- 6 files changed, 35 insertions(+), 28 deletions(-) diff --git a/documentation/CustomBarcodes.md b/documentation/CustomBarcodes.md index dcb9a950..c5e987cb 100644 --- a/documentation/CustomBarcodes.md +++ b/documentation/CustomBarcodes.md @@ -41,11 +41,12 @@ last_index = 96 ## Scoring options [scoring] -min_soft_barcode_threshold = 0.2 -min_hard_barcode_threshold = 0.2 -min_soft_flank_threshold = 0.3 -min_hard_flank_threshold = 0.3 -min_barcode_score_dist = 0.1 +max_barcode_cost = 11 +barcode_end_proximity = 75 +min_barcode_score_dist = 3 +min_separation_only_dist = 6 +flank_left_pad = 5 +flank_right_pad = 10 ``` #### Arrangement Options @@ -72,25 +73,26 @@ The pre-built barcode sequence in Dorado can be found in this [file](../dorado/u Dorado maintains a default set of parameters for scoring each barcode to determine the best classification. These parameters have been tuned based on barcoding kits from Oxford Nanopore. However, the default parameters may not be optimal for new arrangements and kits. The classification heuristic applied by Dorado is the following - -1. Dorado calculates a separate score for the barcode flanks and the barcode sequence itself. The score is roughly `1.0f - edit_distance / length(target_seq)`, where the `target_seq` is either the reference flank or reference barcode sequence. -2. For double ended barcodes, the __best__ window (either front or rear) is chosen based on the flank scores. -3. After choosing the best window for an arrangement, each barcode candidate within the arrangement is sorted by the barcode score. +1. Dorado uses the flanking sequences defined in `maskX_front/rear` to find a window in the read where the barcode is situated. +2. For double ended barcodes, the __best__ window (either from the front or rear of the read) is chosen based on the alignment of the flanking mask sequences. +3. After choosing the best window for an arrangement, each barcode candidate within the arrangement is aligned to the subsequence within the window. The alignment may optionally consider additional bases from the preceding/succeeding flank (as specifed in the `flank_left_pad` and `flank_right_pad` parameters). The edit distance of this alignment is assigned as a score to each barcode. Once barcodes are sorted by barcode score, the top candidate is checked against the following rules - -1. If the flank score is above `min_soft_flank_threshold` and the barcode score is above `min_hard_barcode_threshold`, then the barcode is kept as a candidate. -2. If the barcode score is above `min_soft_barcode_threshold` and the flank score is above `min_hard_flank_threshold`, then the barcode is kept as a candidate. -3. If the arrangement is double ended, if both the top and bottom barcode sequence scores are above `min_hard_barcode_threshold`, then the barcode is kept as a candidate. +1. Is the barcode score below `max_barcode_cost` and the distance between top 2 barcode scores greater than `min_barcode_score_dist`?. +2. Is the barcode score above `max_barcode_cost` but the distance between top 2 barcodes scores greater then `min_separation_only_dist`? -If a candidate still remains and there are at least 2 scores candidates in the sorted list, the difference between the best and second best candidate is computed. Only -if that score is greater than `min_barcode_score_dist`, the best candidate is considered a hit. Otherwise the read is considered as `unclassified`. +If a candidate meets one of the above conditions and the location of the start/end of the barcode construct is within `barcode_end_proximity` bases of the ends of the read, then it is considered a hit. | Scoring option | Description | | -- | -- | -| min_soft_barcode_threshold | If barcode score meets this threshold and flank score meets its hard threshold, consider a hit. Soft score is higher than hard score. | -| min_hard_barcode_threshold | Minimum score threshold a barcode must meet. | -| min_soft_flank_threshold | If flank score meets this threshold and barcode score meets its hard threshold, consider a hit. Soft score is higher than hard score. | -| min_hard_barcode_threshold | Minimum score threshold a flank must meet. | -| min_barcode_score_dist | Minimum distance between barcode scores of best and second best hits. | +| max_barcode_cost | The maximum edit distance allowed for a classified barcode. Considered in conjunction with the `min_barcode_score_dist` parameter. | +| min_barcode_score_dist | The minimum score difference between top-2 barcodes required for classification. Used in conjunction with `max_barcode_cost`. | +| min_separation_only_dist | The minimum score difference between the top-2 barcodes required for classification when the `max_barcode_cost` is not met. | +| barcode_end_proximity | Proximity of the end of the barcode construct to the ends of the read required for classification. | +| flank_left_pad | Number of bases to use from preceding flank during barcode alignment. | +| flank_right_pad | Number of bases to use from succeeding flank during barcode alignment. | + +For `flank_left_pad` and `flank_right_pad`, something in the range of 5-10 bases is typically good. Note that errors from this padding region are also part of the barcode alignment score. Therefore a bigger padding region may require a higher `max_barcode_cost` for classification. ### Custom Sequences File diff --git a/dorado/demux/BarcodeClassifier.cpp b/dorado/demux/BarcodeClassifier.cpp index 07b8fff3..2c359956 100644 --- a/dorado/demux/BarcodeClassifier.cpp +++ b/dorado/demux/BarcodeClassifier.cpp @@ -168,7 +168,12 @@ struct BarcodeClassifier::BarcodeCandidateKit { std::string bottom_context_rev_left_buffer; std::string bottom_context_rev_right_buffer; std::vector barcode_names; + // This is the specific barcode kit product name + // that is selected by the user, such as SQK-RBK114-96 + // or EXP-PBC096 std::string kit; + // This is the barcode ligation group name, such as RAB + // or 16S, which is shared by multiple product names. std::string barcode_kit; }; @@ -734,7 +739,7 @@ BarcodeScoreResult BarcodeClassifier::find_best_barcode( [](const auto& l, const auto& r) { return l.bottom_score < r.bottom_score; }); auto best_score = std::max(best_top_score->score, best_bottom_score->score); auto score_dist = std::abs(best_top_score->score - best_bottom_score->score); - if ((best_score <= m_scoring_params.max_barcode_score) && + if ((best_score <= m_scoring_params.max_barcode_cost) && (score_dist <= m_scoring_params.min_barcode_score_dist) && (best_top_score->barcode_name != best_bottom_score->barcode_name)) { spdlog::trace("Two ends confidently predict different BCs: top bc {}, bottom bc {}", @@ -754,7 +759,7 @@ BarcodeScoreResult BarcodeClassifier::find_best_barcode( spdlog::trace("Scores: {}", d.str()); auto best_score = scores.begin(); auto are_scores_acceptable = [this](const auto& score) { - return score.score <= m_scoring_params.max_barcode_score; + return score.score <= m_scoring_params.max_barcode_cost; }; BarcodeScoreResult out = UNCLASSIFIED; @@ -779,9 +784,9 @@ BarcodeScoreResult BarcodeClassifier::find_best_barcode( // For more stringent classification, ensure that both ends of a read // have a high score for the same barcode. If not then consider it // unclassified. - if (std::max(out.top_score, out.bottom_score) > m_scoring_params.max_barcode_score) { + if (std::max(out.top_score, out.bottom_score) > m_scoring_params.max_barcode_cost) { spdlog::trace("Min of top {} and bottom scores {} > max barcode score {}", - out.top_score, out.bottom_score, m_scoring_params.max_barcode_score); + out.top_score, out.bottom_score, m_scoring_params.max_barcode_cost); return UNCLASSIFIED; } } diff --git a/dorado/demux/parse_custom_kit.cpp b/dorado/demux/parse_custom_kit.cpp index ee003cfd..a1e12850 100644 --- a/dorado/demux/parse_custom_kit.cpp +++ b/dorado/demux/parse_custom_kit.cpp @@ -119,8 +119,8 @@ BarcodeKitScoringParams parse_scoring_params(const std::string& arrangement_file } const auto& config = toml::find(config_toml, "scoring"); - if (config.contains("max_barcode_score")) { - params.max_barcode_score = toml::find(config, "max_barcode_score"); + if (config.contains("max_barcode_cost")) { + params.max_barcode_cost = toml::find(config, "max_barcode_cost"); } if (config.contains("barcode_end_proximity")) { params.barcode_end_proximity = toml::find(config, "barcode_end_proximity"); diff --git a/dorado/demux/parse_custom_kit.h b/dorado/demux/parse_custom_kit.h index b122d3b3..2aaa9691 100644 --- a/dorado/demux/parse_custom_kit.h +++ b/dorado/demux/parse_custom_kit.h @@ -9,7 +9,7 @@ namespace dorado::demux { struct BarcodeKitScoringParams { - int max_barcode_score = 12; + int max_barcode_cost = 12; int barcode_end_proximity = 75; int min_barcode_score_dist = 3; int min_separation_only_dist = 6; diff --git a/tests/CustomBarcodeParserTest.cpp b/tests/CustomBarcodeParserTest.cpp index 644f7a42..65b4f7aa 100644 --- a/tests/CustomBarcodeParserTest.cpp +++ b/tests/CustomBarcodeParserTest.cpp @@ -120,7 +120,7 @@ TEST_CASE("Parse custom barcode scoring params", "[barcode_demux]") { auto scoring_params = dorado::demux::parse_scoring_params(test_params_file.string()); - CHECK(scoring_params.max_barcode_score == 10); + CHECK(scoring_params.max_barcode_cost == 10); CHECK(scoring_params.barcode_end_proximity == 75); CHECK(scoring_params.min_barcode_score_dist == 3); CHECK(scoring_params.min_separation_only_dist == 5); @@ -136,7 +136,7 @@ TEST_CASE("Parse default scoring params", "[barcode_demux]") { auto scoring_params = dorado::demux::parse_scoring_params(test_params_file.string()); - CHECK(scoring_params.max_barcode_score == default_params.max_barcode_score); + CHECK(scoring_params.max_barcode_cost == default_params.max_barcode_cost); CHECK(scoring_params.barcode_end_proximity == default_params.barcode_end_proximity); CHECK(scoring_params.min_barcode_score_dist == default_params.min_barcode_score_dist); CHECK(scoring_params.min_separation_only_dist == default_params.min_separation_only_dist); diff --git a/tests/data/barcode_demux/custom_barcodes/scoring_params.toml b/tests/data/barcode_demux/custom_barcodes/scoring_params.toml index 079d50cf..f1500e43 100644 --- a/tests/data/barcode_demux/custom_barcodes/scoring_params.toml +++ b/tests/data/barcode_demux/custom_barcodes/scoring_params.toml @@ -1,6 +1,6 @@ [scoring] -max_barcode_score = 10 +max_barcode_cost = 10 barcode_end_proximity = 75 min_barcode_score_dist = 3 min_separation_only_dist = 5 From d70d92054755674c7730e42b61b738c35c4b9658 Mon Sep 17 00:00:00 2001 From: Joyjit Daw <1127155+tijyojwad@users.noreply.github.com> Date: Mon, 11 Mar 2024 21:03:28 +0000 Subject: [PATCH 09/23] Update test data based on actual GT --- .../double_end_variant/unclassified.fastq | 4 --- .../single_end/SQK-RBK114-96_BC01.fastq | 22 ++++++++------ .../single_end/SQK-RBK114-96_BC92.fastq | 14 ++------- .../single_end/SQK-RBK114-96_RBK39.fastq | 16 ++++------ .../single_end/unclassified.fastq | 30 +++++++++++++------ 5 files changed, 43 insertions(+), 43 deletions(-) diff --git a/tests/data/barcode_demux/double_end_variant/unclassified.fastq b/tests/data/barcode_demux/double_end_variant/unclassified.fastq index 1b178f72..24108fe1 100644 --- a/tests/data/barcode_demux/double_end_variant/unclassified.fastq +++ b/tests/data/barcode_demux/double_end_variant/unclassified.fastq @@ -1,7 +1,3 @@ -@3bd5a348-1cb5-4aed-a95a-3c95266b36e -CGTACTTCGTTCAGTTACGTATTGCTGGTGCTGTCTTCTACCGATCCGACAGTTGCTTTACTTGCCTGTCGCTCTATATCTTCGGCGTCTGCTTGGGTGTTTAACCTCTCTCCGCTGAAGACAATGCAGGTTAATGTCATATGAACAGACCCCGGATCGATGTCTTGCCGTTCGTCAGTTAACCGAGCTTTCAATGATCAGCGCAGCGCGCGAT -+ -%%&(*-.21/00----.0,+,,-))*+44,+)&'''((*)*(()2410.)''))&%$%%$#&(*,)44563336:.--.78;72*+,681.,./+**+2::./00/-40/7:3../-/04461**))&$$%&())-**&$''&(((&'**('('(())'&'%%$%'%%%%261557'&''7==;942125=8(&##$%&%((')(()('&&'&& @97c41386-5bc9-4228-8d4a-8a743f743a9 GTACTTCGTTCAGTTACGTATTGCTGGATTGTTTCTGTTGGTGCTGATATTGCGGCGTCTGCTTGGGTATTTAACCTCTCCGCGCATCTCAACGCTCTCTTTCAGGCCGTCCCGGAAACAAAACGACGCGTTTCGTTATTGGCTTCCTGATAGTAAACGTCATAGCCGCCAGCTGGATCACGCAGGCTCCAGCATCCTGGGCGAGCTGGATAGCTTTGCGCCGTCTCCAGCCCTGCGCCCGCACCGCGTCATCTTCTTGGCGAAACGACGATGAGCAGAAAGGCACATGGACGGCACGCGCACGCCGGTTTCGCCAATCGCATTGACCAGCGCCAGACACACTCGCGGCTCGA + diff --git a/tests/data/barcode_demux/single_end/SQK-RBK114-96_BC01.fastq b/tests/data/barcode_demux/single_end/SQK-RBK114-96_BC01.fastq index 46c4ef84..8714e3ca 100644 --- a/tests/data/barcode_demux/single_end/SQK-RBK114-96_BC01.fastq +++ b/tests/data/barcode_demux/single_end/SQK-RBK114-96_BC01.fastq @@ -1,12 +1,16 @@ -@20e43bbc-19d8-4cc3-be24-80e399c25820 -GTTATGTGATTACTGGTACGGTTGTGAAACGATATACAGCCCAGAAAGTTCTGGTGTCTTTGTGGTTTTGCATTTATAGTGAAACGCTTTGCGCGTTTCGTACACCACTAACCAACATAACCCAAACTTTTTCCTCATAGCGGGCCTTCGCCACGCGCGCGGCGAAGAAATGGTGGATTATGACAGAATTTGGCCATAAAGCGGGCCATTCACCCTCGGCGTTAAACTCTGCGGGCAGCACAGGTTGATTCCAATATCCGCAGCTCCAGTTCACGTCCGTTTCACGAGCGAGAATCAATAGTTTACGCGCCACATCCATACCAGAAAGATGATCTCGCGGGTCCCGTCGGTATAACCCATTTCCACGCCAGCGTGGTCGCCTCGGAGAACA +@00681739-1170-486a-899c-f6546ca9222e +GCCGATATTAACCAAAGTTCGGTGTCTTTGTGGTTTTCGCATTTATCGTGAAACGCTTTCGCGTTTTTCGTGCGCCGCTTGGGGCGACCACAGGTTCAATTTCAATATCCGCCAGCTCCAGTTCACGTCCCGTTTCACGAGCGAGAATCAATAGTTTACGCGCCACATCCATACCAGAAAGATCATCTCGCGGGTCCGGTTCGGTATAACCCATTTCCCGCGCCAGCGTGGTCGCCTCGGAGAACT + -%*,,*+,&#"#$()&''%%"'((&&%##%%%$%$'%&#$%(,,'''&(-+$#$+5;8./06799=@?{F&$('')*4&&'.,,33333:;/-&%#%(/(%'()(++*)($$#$$$'$&&''&)/3;03426;/06'''(+,-3;<8<;-05:55,+,1*+)-.:;A4>--0,%&(&*$'#%%(''%%)%'(%$&;//04652(&&&$"%&#%+,;<+)*:001124/0'&&956CC,*.47()+32//)300')-,,,122*/,.05581441103,+**3A:33-*().++06*10143237759//0<;0))-,422066768D=C==&%'98433)32+&&,*)&$%)-+,77;B1/.,.9+))6>;7.99('(/(()1)*(**).)& -@340e49e6-4c5a-4c2a-a8e9-7559c13af09b -TGTTATGTAGTCTACTTGGTTCAGTTCTTGTCAGCCGGATATTAACAAAAGTTGTCGGTGTTTATTGTGGTTTTCGCATTTATCGTGAAAACGTTTTCGCGTTTTTCATACGCATTTGGATCTTGACAAACTCGACGATCTCTTTGCCGCGCGCGTGGCAAAGCCGTGATGGAAGGAAAAGTTTTGCGCTATGTTGGCAATCTGATGAAGATGGCGTCTGCCGCGTGAAGATTGCCGAAGTGGATGGTAATAATCCACTGTTCAAAGTGAAAAATGGCGGAACATGGCCTTGTATCGCCACTATTATCAGCCGCTGCCGTTGGTACTGCGCGGATATGGTGGGCAATGACGTTTACGGCTGCCGGTGTCTTTGCTGATCTGCTGCGTACCCTCTCATGGAAGTTAGGAGTCTGACATGGTTAAAGTTTATGCCCGGCTTCCAGTGCCAATATGAGCGTCGGGTTTGATGTGCTGGGGGCGGCGGTGACACCTGTTGATGGTGCATTGCTCGGAGATGTAGTCACGGTTGAGGCGGCAGAGACATCAGTATTACAACAACCTGGGACGCTTTGCCGATAGCTGCCAGTAGAAAGGGAAAATATCGTTACCTGTCGGTCCCTGGGAGCGTTTTGCCAGGAACTGGGTAAGCAAATTCCAGTGGCGATGACCCTGAAAGGATCCTGCCGATCGGTTCGGCTTAGGCTCCAGTGCCTGTTCGGTGGTCGCGGCGCTGATGGCGATGAATGAACACTGCGGCAGGGCAAGCTAATGACACTCGTTTGCTGGCTTTGATGGGCGAGCTGGAAAGCCATATCTCCGGCAGCATTA +""#&'''('#"###$$$$%$$'+>CDEJHHMHGNERSHEFFHSSQKIEHFHIJLEFHGFG<;:;6218*&&&')()'''&&(%%&%%###$(278DEDFFEJLSJHKSSIEEFGSSLFF;9446;;@@HHKLHESGSSLJJSJKMSQIMSSMLLH?PMSLSIFGESLSLNFLNOGISHOSSSQSLHJSGNHBDBDEHBCBDA=<<898:D@;<:;=FB>7CCDJNSFAA?DFE>=<=C@?>94,** +@0074eaf2-59fb-4b62-bbec-eff348eaba58 +GTGTCTACTGGTTCGGTTGTCGCCGATATTGACCAAGAAAGTTGTCGGTGTCTTTATGATACCGCATTTATCGTGAAACGCTTTCGCGTTTTTCGTGCGCCGCTTCAGCGGGGCCAGCGCCATCTTCATCAATATTGCCAACATAGCGCAAAACTTTTCCTTCATCACGGGCCTTCGCCACGCGCGCGGCAAAGAGATCGTCGAGTTGTGACAGATTCGCCATAAAAGCGGCAACATCACCCTCGGCGTTAAACTCTGCGGGCAGCACAGGTTCAATTTCAATATCCGCCAGCTCCCATTCACGTCCCGTTTCACGGCGA + -"#)*,/*)$""$%%&%+(++)$"##0(,.&%"##%*((%+))'$#$$$+(*%&&%#$#%')'&%%&-0&36>>?+)-(((9&&%'(545B<3,(((%$'//0'&/=+''&&&'%''3+)),)'()00)*431-/0<+**::086=AHM:97?//+0/26&%<6-//4312101DC@CG{J=99?;;=?;,,-.--,*)&$#%*/2/0.-&%'9>@CACDCBBCBB<>:62/0+**'8A667B=<'%&.((%''.&&+&&4A@535;:31044=;9::)'',&$(:<54,---.,12-+&&,71//5.?>7712260..,&%)''(/363,'682;336<=502522260%%7/,+1488>>BC@:::))?G@0*+,867574%&'7;IFGIQ>=?@>9:68+++35BCB>3?+**:BA==CFB@FEF;;;'%0+6521&202('%&*)-5,,01-&)-53.,,+*+/18661.461/-&%&$#%''$#3896/.'/*)*,)%%())$%24(&+.10MG@>2+1@C?@89972>9:01?:3,/.('+&():787FFG?=><864@=4*4:<87/&$%%;96366./0;5.4:9;/,015&)(.2<@DD41/+()+&$$,*())>;<=ABA;A(''/.,()636+&$&)*')*;5590.1221& -@bb72aaa7-b21f-4fb8-8eaf-7c18b46b9fa1 -CTATGTTGACCTTATTCGTCAGTTTTTTTTTTTTTTTTTATGCCAGCCGATATTTGACAGAAGAGTTGTGGGTGTTGTGGTTTTCGCATTTATCGTGAAACGCTTTCGCGTTTTTCGTCGCTTCACTTCCCAGTGAATGTAGGAGTCTGACATGGTTAGGGCCCTGCCCCCACTTCCGGTCCAATATGAGCATCAGGTTGATGCTTGCTGGGGCAGCGGTGACACCTGTTGATGGTGCATTGCTGGGAGGTGTCGTCACGGCTGAGCAGCAAGACATTCAGTCTCAACAACCTCGGACGCTTTGCCGAATAAGCTGCCATCAGAACACGGGAAAATATAGTTTATCAGTGCTGGGAGCGTTTTGCCAGGAACTGGGTAAGCAAATTCCAGTGGCGATGACCCTGGAAAAGAAT +%')''#"#$###$%$%%'&%##"#$&)&'-'&&&(&&&()().-/00028>;994*(#$%$%%$&))3334;=>?IHGJ2111GDGIDBEDKQGEEFGIEB;494+)))()'&'%%$$%&'&&&'*,,5?>>>>EFDMMOJGOSLFHEFFFHPDFDB;776@6@DSSJIHPSISSIJHSIILSJHSHHFFFA;?BA?B>;;:<<>@==H977FDEFIMBBBAHGHSSMJSIILJLSMLSKIMKPGSHDSGLJJMLLJMKIJLILJRFC;;ABG>''''OJIHISSSKISNSHQIN=-.0/,1200010(''22,****)' +@008852b0-3603-4282-8f09-89f6d3788f49 +GTGTTATCATCATTTCGTGGTTATGAAGCCGATATTGACCAAGAAAGTTGTCGGTGTCCTTGTGGTTTTCGCATTTATCGTGAAACGCTTTCGCGTTTTTCGTCGCTTCACGCAGTGTATTCCATTTCATCGCCATCAGCGCCGCGACCACCGAACAGGCACTGGAGCCTAAGCCCGAACCGATCGGCATATTCTTTTCCAGGGTCATCGCCACTGGATACCAGTTCCTGGCAAAAACGCTCCCAGCACTGATAAACGATATTTTCCCGTGGTTCTGACGGCAGCTTATCGGCAAAGCGTCCGAGGTTGTTGAG + -#$$*+''$#"##('')(((#$%%()*)*)'(((''%%%$%$$###$%($#&''&'&&&*)),('&.+%$$$'()'%'+'.6//42/06;AC@A?A@GOB>>))1:;('(3%%&232,.)'')(+&%'(%&,()83/242<=39:=''+99,+.4--/0;:3/--0/(,,(./+33/,*+*)/(''(,<<@><7*($$$4;=C<;=BE@??@AIA:-'0677==9/3,+*7./B?9:?J;;>4;:8:><>EA98:(%%%8;1234255&-) +$$%&,((%####"#&%$$##$'&&$#"##$$$$$$%&&&&('&&(*+++*110/042-(','HISIJSSMHMHHOIDDCDGFGSKO><<?EHHNSJSKIAAABGSSSIKSSJSKGSLPSSKKGFEDCDSSSIHIMHJSISQGQISSSIIHIHGS;=6>>?JHIG9667''''*&%&')'/09=@?AADIHKSNOCDEDFLCJOOSOMJLLSJSNGMIIISS>>BMLSKIGDFEFHJILM=;;;BBCFGPIHA,+% +@001afee2-1ffc-476d-929f-20a9f90839ae +TATTTACTGGTTCGTTTTTTTTTTTGGCTATGCGCGATTAACAAAAATTTTGTGTCGGTGTCTTTGTGGTTTTCGCATTTATCGTGAAACGCTTTCGCGTTTTTCGTGCGCCGCTTCATCCGAGGCGACCACGCTGGCGCGGGAAATGGGTTATACCGAACCGGACCCGCGAGATGATCTTTCTGGTATGGATGTGGCGCGTAAACTATTGATTCTCGCTCGTGAAACGGGACGTGAACTGGAGCTGGCGGATATTGAAATTG ++ +##$&((''%%*+/--,*&)&$#""%$#""#$$#"""$&''''%%&'($$$&&')*))));=;:;:7EECKLSSMGGKHJSISBBBBNMJMA@AAIKGKFFF@AC924--46978:6./7@@??ASSHIHIEGIJSISJIIGG..NPKPIHLFSIEGF?BCBE>==7@<8DA@0//GDGEIEGEIGEI>>=>DEJOPGGGFFFEGQLIACEAAAEKLPHFIKGGGFIM?BA?ABBDCFNJGEGJLGIFGDBCBFSOPDDEFGC2 diff --git a/tests/data/barcode_demux/single_end/SQK-RBK114-96_BC92.fastq b/tests/data/barcode_demux/single_end/SQK-RBK114-96_BC92.fastq index c6f239fa..501a8b32 100644 --- a/tests/data/barcode_demux/single_end/SQK-RBK114-96_BC92.fastq +++ b/tests/data/barcode_demux/single_end/SQK-RBK114-96_BC92.fastq @@ -1,12 +1,4 @@ -@df8d8292-d227-464d-ba11-e585ffee29f5 -GTTATGCCCTCTACTTGGTTCAGTTTTTCAGCCGGATATAACCTTCTAAGTAGAAAGGTGACAGAACCATCTCGCATTTATCGTGAAACGCTTTTCGCGTTTTCGTGCGCCGCTTCAGCGGGCCTGCGGCTATTTCTTCTGTTGCTGGTAGCTGACGCCAGTAGGGTTCAGCCGATAAGCATGATCCGCCGGTTCAGGCTTTGCGCTAGCAGCTCCTCCGCCCCGGCAAACCCACCGGCAATCCATGTCGCGCCGACGCAGGCCGCCACGTCACATTTTCATTTGCGACGCAACCGGCCACATCGTTATGAATAAAGTGTACGCGTTCGCTGACGCCGAGTTCTTCCGCGCGACGCTTTGCCTGCGCGGTGAAGAGCGAACTGGTGTCGATGCCAGTCCCGTAATGCCATGATCCCTGGCCCAGTGCGAGGCGTCTCGCCCGAGCCGCTGCCGAGGTGAGAGAATGCGGTACCGGCTTCATGCGCAGCACGCGACCCAGCGTGGCGTACTTCTCTTCGGTGAAGCAT +@cc929148-1d91-46a3-93e8-09042c046893 +GTGGCGCGCTTCGTTCGACCTTGTTCTCAAAATTCGACTAACCTTGTGAGTGGAAAGATACAGGACCGTTTTCGCATTTATCGTGAAACGCTTTCGCGTTTTTCGTGCGCCGCTTCATCGCGCCGACGCAGGCCGCCACGTCACATTTTTCATTTGCGACGTAACCGGCCGCCATCGTTATGAATAAAATGTACGCGTTCGCTGACGCCGGAGTTCTTCCGCGCGACGCTTTGCCTGCGCGGTGAAGAGCGAACTCATGTCGATGCCAGTCCCGTAATGCCATGATCCCTGGCCCAGGTGCAGAGCATGTAGCCGAGCCGCTGCCGAGGTCAAGAATGCGGGTGCCCGGCTTCATGCGCAGCACGCGACCCAGCGTGGCGTACTTC + -),.'))(*+)&(&+)*)..0,()(,((&###%(&(&'%&#$#%&+&%%&(/.03@67/,'&%$(%+(,%%%&%23155@;9..0=@{GC::965<+,;202:C1-.-,+20/1.50,/,78BBFE,,,;EJCHFAAAA=ADABDBA<<@GFCE3=>97864;8992,'+6323..-4**+-349,,-1.03-+,.898711197++,*,-7*/=57;9;@??=@77<711.*02;<203<944/,,06760/25:FABE@B657?>=A??>+++;<31129,'$$'24-.1@==:>000;43,,,338D1$%)+$&,,5+++:6**={@@>8775,1498>:6?887<5569;;CCBH@=;;9988EACLN?>832<36&$%'&'&'3/:7541395<=?CA?=?443;A=<79)6<;@>9=?;:667;6.%%%995,-.53-/100/)*6...>669??D<<;345/9<=;<;85*//318?B7:1/.,+-3.454'$ -@43a5da96-c94b-4478-9f48-7feaf29393d5 -TATGTTGTGTAACCTACTTCGTTCATTTGTGTTCAGCCGATATTAACCTTGTGAGTGAGAAAGGTGCAGAACACCCTTCGCATTTATCGTGAAACGCTTTGCGCGTTTTTCGTGCGCCGCTTCACTCGGCTATGACGTGGTGGAAATGGTACTGGCAGACCAGGAAGGCTGGGACCGGTATGAAGCCGCAAAATGGCTGACCATGCGCCGCTGGCTGGAAGCTGGGATTCCTGGATACAACTTCGCCGCCGAAGTCAGGCCGAACTGAACATCGCGCAAAACGTTACGTGACCTACACGCGGGAATGCTTTGGCTGGGGCGTGTCACTCATCGCCCGTAACAACAGGTTCCGGTCTCACTGGAGATGTAAGAAACTGCATTTTTTTGATTTTAATTATCACCTCCAAATCAGTTTATCAATTTGTTATTAATCAAGATGTCACCATAAGGAGCATCATATGTCACAACATCGAACAGTTCCGTCAGGCTTGCGAATGCCGATGTCGGCACCAGATGCACACAGTGCCAGTGATCAATATTGTGCGTAATGCAGCGACAGCAACGGCAATCTCAAACCCTGGGAAGAAGCGGTATGCGGTTTAGCCAGCTTCT -+ -&&*+33/(&)$%$$%&)*-)--:0---61)&)''$$%)&&&$'%###%'+%&$&(/((*)768.-))).)+()$$&&)'(((((3'&'),.-7B<,+,01-,,6:9ADFD-*+499A@<?>=@LCG8@>;;::<:?:;?@2%'%+/007KF?;8.,?=./,,,977>{I=657+,GAB<<=B989BC?84-..//;,,-<<63-.-/&$).'(&,-%$$('$#$'-1/:>=96:66;6:1--225*%,+2(&36&''/0.0&'9?86&&'3139::9C9:'''2023349:55><9:<@76D86+-('(%%&02283055+%9A>>>=<=?8:=BDFD>?6+B@@@?;;=<9)))09830=>E>@:<>?>,)(,54469.;.--C@@@@HF63499C668B@@+)/<<:77B>66658=;..*?/2-92/2,)(2-69($'))('*-.& +##$"""#$%&('%%&&%###$%%%%$%#"####$$$#$%%&0140+)%%&&&((+-/./..00;=A?=>@CJECE=@HC00//79CFGG>>>=HDAB<<7-,-,8<=>HG@:<;0&&*,GGGFGHIHJKGGGGGKKIGGKJJJHGKPSGSHJOHJGHS7667B>F777;BA;CBEJMISKSIFIGIGCDDEHDDFEGHLB?<;9/.('$#$&)**+023333DAGDFSD<;89<>714D72445:589''/653200++))36A688B??667????{GACICBAA/('13.+(-0.+,-&,*,,%%%%'*:876%%'&%$,.&&=4(+,.++.5(''+((*%**+%'*+6411,+,A324-,'''2*'$#$%%&$%'1,()$#%'%$%+&&&:467666:,,.98D:557=31386BA==;:;?=>CD77;<{E77E99:@1/-.-.((/)(&$$$%+'''&%()''&&'%%'))4<=8922C?C@-,-67''(=5.,-8.))//(0*$&.''(.,+('+(,-40*,))()''.%>8369A644A@=7,,667C@999210;=?<20/07:689DC><7+*),39;4*-26:<<@AHG@=7:;>JAB221>A<:(+).**,)*+()'98:@<733::;55248>5/)(.0>4345//2--.;1//1++:='&(3'&'=;76:1.1..6;86('(&$%$*'229:80&'$##&06'$%)+-2-(./2.+,:6;0--.*(-&%+47..0:42+*&'&+('&&%$%.02/)&$)1))*-33%%('*.3149:8:CGB>>))*==-+%&'%%%,/(((& -@efa59c78-ef29-4623-bb94-705fc4972353 -ATGATGCCTTCGTTCGGTTTTGAAGCCGGATATTAACCTGTCCCACACACTTCGTAAGTCCTTGTTTTCGCATTTATCGTGAAACGCTTTGCGCGTTTTTGGTCACCGCTCATGCGCAGTTAATAAAATTGCAGATTGGCAAGACGCAGTGCCGCTGAGAACTGCTCCGGCAGCATGTTAATTGAGCATCACCACATCAGCTTGTTTGAGGATCTGCATCTCGTCACCTCTGCGCGTGATGATCCAGCAGTATCGTTTGTTCCCCGCCGCCGCTTTGTATTCGCCAGATTAATCGCCGGCTTAGCCATAACGAATCATCCTGCGGCAAAACGCCGTGGGCTCAACTTCTGGCATCCATCGCTCTTGAGGACGTTTCGGCGCGATCGGTAGACACCATCGTCGCTAACCAACCTGGCAGTCAGCACCTGCTGAACATTGTAGT +$&&)()&#"##$&&'**.-,,$$###$$%%#$$''%&$%&(+''##$%&&'%&')**+./:B?GDENPEGGDBD882233776<;<<:522;<=@DCGJQNLIINMGIJHFIHHCDCCMFMSJJNJIIMKFIKPSSMSLSSSRKKINNKJKJBSSQLSGIJHIKHGFKGHIEEIQSGSGLFIPHPJIKCC@A8:./*.-))()(%'%&''29?BHLIEA:>>?GDFFHD>LJLISOJIESGEBCCESGRC<=FGSFE5***+???BGISB@/..,,,*(*$ +@d4e4f464-669d-46d5-960c-9d7cb676a86e +TGCCTGCATGTACTTGGTTCAGTTTATGCCGAAACTGGATCTGACCTCTGCCACACACTCGTAAGTCCTTGTTTTCGCATTTATCGTGAAACGCTTTCGCGTTTTTCGTGCGCCGCTTTTGATAAGCGCGGTGAGAGTTAACGATCGTCTGGAAATTCATGATGTTATTGGGCCAGACGAATATACCGAACATGTCAATAATAATGCATACACCAGCTATATGGCCCGCTACAACGTTCAACAGGCGCTGGATATTGCCCGCCAGTTCGGCTGTAGCGACGATGCGTTTATCCATCGCACCGAAATGTTCCTCAAAGAGCTATGGATGCCAGAAATTCAGCCCGACGGCGTTTTGCCGCAGGATGATTCGTTTATGGCTAAGCCGGCGATTAATCTGGCGAAATACAAAGCGGCGGCGGGGAAGCAAACCGTACTGCTGGATTATTCACGCGCCGAAGTGAACGAGATGCAGATCCTCAAACAAGCTGATGTGGTGATGCTCAATTACATGCTGCCGGAGCAGTTCTAGGCGGCATAGTGTATTGCCAATCTGCAATTTTATGAACCGCA + -#&%$""#$--%$$*+&&'-)&%$#$%(&*&*,-(&''()%$%'*,1051431*+-,,563879-)****/1,,-3--&&&:8@87)''22)(',&&'07:'$%&'&$&(.#&-1::9%&%*''(//01.,(()+)(*,--*'*'%&'$$"$')+*%&'&)535/581116=7222834,'.2002,'%%%-12//0<<=<>AE601:0.13.,+..800,*++**222@6834(*&)((((''**'(+**-*)'&&+D1%%*12?666?>=605:95-))+3497130237;<8898668F>9=:<66))-*',(+../<<0/'''.-995353+)'&,,6&%))&)/7',-33/<74*(&&%,467=;9730;>'&-1**'&'(%&&%$&'$%$&*++,/(&'+--0$$#34254'$$"%'15;7;,/0+*2.''('($$$ -@105be6ef-1406-4101-99f7-0ce64845bcb3 -ATGTTATGTTAACCTACTTCGTTCAGTTTTTTTTTTTTTCAGCCGGTATGAAGGCCCAGCGAGCGACTCGTAAGTCCTTGTTTTCGCATTTATCGTGAAACGCTTTCGCGTTTTTTCGTGCGCCGCTCACTGGCATGAGTAAGCTTCTTGAGGAACATTTCGGCGCGATGGATGGCGACGCATCGTCGCTACAGCCGAACTGGCGGGCAATATCAGCGCCTGTTGAACGTTGTAGCGGGCCATATAGCTGGTGTATGCATTATTATTGACATGTTCGGTATATTCGTCTGGCCCAATAACATCATGAATTTCCAGACGATCGTTAACTCTTCACCGCGCGGCTAATGCAGAACTTTGCCGTCTCCAGAAGTAGCGCCATGCCTTCATGCGCAATGAAACTTTCATCCCCCAGTCGTCTGCCAGTATTGAATAACCGCCAGGCGATATCGGCCACCAGATGATGTTCCGCCTGCGCCGAGGCCGCTTTGCCGCAGCCCAGGTGCGAATGTTAATGGCGGCGGAATTCCGGCGTCTCTTCTTCGCCGCTGCGCGCGCTTTCCACGGAAATAGCGCGCCCTGCCAGCCGTTCAGTCGCTTTCTCCTGCGCCT -+ -$')00)**1(&*)()+-/6,**,&%%#&()*+,-.///)#"##%%%#$&$%).'#&*((*(().(&&''*1)5323C;3298&&6'&')9AH634699=7000%'(/453/,'%$$$+(..),/78:?>80/%%%5226930*)*)'(-'$#"#$'*%&&()87782==F=@;:>@C?AD98<=*)+..(,/7766=@;99>M;;4443//0.1595..,,%%),)*:<=9:983345532))*A@;<=LA9459310850017;))2C>>=@DGAAD?@I=<<356...0(6/-.-**7..18-.')7799=;=.--4(*+(('&)/&)***-./.1=00.38C>C;<<@>>...0//0.378?>?99;21:>+*-<:;=557=>)''((()((55''/<=::{B@889659:74'#&+,**,5@@@9DB-**3ACAEFEBB667+)*('(&''8;33,+*43.7967;>99:G{<<344;:AB5:82&@DED<;;=@??<8786777EFEHGI@>?HAB@SS<777:B?6123+'&&&%$$$$%$$%&&&%&'(((*./065555>SSLSLOS1//./238-+++015KIJSSIEGEHNMDPKEBBJIFDC>6669JLJGIIHLGHILKHILSOGKEBDGSKLHSHSFJMNSHSFSSIISIJSJSKSGSLJSISKJIDFB@@ECNHKOMIHIPFDD?===<<>:?@@BD>M...****;?AHSSJJJGIHFSNFSMSPJ6667<<9+')*97)((6<@BBDDDGMEE?GIGOFGJIKSJSGSKLSPHEFKHKLNSKHOFISKIJOSD@@@CSGBCAAC>>=7CDFS6643,*'',+,>=88(''(4+++,7@///0?>>7457@JB@<;=?@64. diff --git a/tests/data/barcode_demux/single_end/unclassified.fastq b/tests/data/barcode_demux/single_end/unclassified.fastq index 4eb71fe2..9ab6f120 100644 --- a/tests/data/barcode_demux/single_end/unclassified.fastq +++ b/tests/data/barcode_demux/single_end/unclassified.fastq @@ -1,12 +1,24 @@ -@310d9c0b-75e5-42e1-a4b3-e3b7143237aa -TTATGTGCATATTTACTGGTTCAGTTGTGGCCAGCCGGATATTAACCAAAAGGACACGCTTCGGTGTGCCTTTGTGGTTTTCGCATTTATCATGATGGCTGATATTCGTGCGCCGCTTCAATGAAGATGGCGTCTGCCGCGTGAAGATTGCCGAAGTGGATGGTAATGATCCGCTGTTCAAAGTGAAAAATGGCGAAAACGCCCTGGCCTTCTAGCCACTATTATCAGCCGCTGCCGTTGGTACTGCGCGGATATGGTGCCGGCAATGACGTTACAGCTGCCGGTGTCTTTGCTGATCTGCTACGTACCCTCTCATGGAAGTTGGAGTCTGACATGGTTAAAGTTTATGCCCCGGCTTCCAGTGCCAATACGAGCGTCGGGTTGATGTGCTCGGGGCGGCGGTGACACCTGTTGATGGTGCATTGCTCGGAGATGTAGTCACGGTTGAGGCGGCAGAGACATTCAGGTCTCAACGACCTGGACGCTTTGCAT +@7e10099a-9bea-40ad-9aed-8ea58ba523c3 +TTGTTCTTCGTTCAGTTGAAACCGGTGTTTAACCTCTCTCTAAATTGCGGACCGGTTTTCGCGTTCTCGTGAAACGCTTTCGCGTTTTTCCGTGCGCCGCTTCAGCCACATGGCGAAAGATTCTGTGAGTTTCATGCCAGTGTGTTCTTCCACTTCCAGCAGCTCCCAGATGGTGGCTTTTTCCGTATGTTCCGGGCAGGCCGGATAGCCCGGTGCCGGACGGATGCCCTGGTAGTTTTCGCGGATCAGCTCTTCGTTGCTGAGGTTCTCGTTCGGCGCATAGCCCGGTCGACTTTACGCACACGCTCATGGAGATACTCCGCAAAGGCTTCGGCTAAACGGTCGGCAAGCGCTTTCACCATGATTTTGTTGTAACCATCGTGCTGCGCTTCAAAGGCATCAGCCAGTGCGTCCTCTTTAAGCCGCCAGTCACGGCAAATGCACGATGTAATCTGCTTTACCAGAAAGCTTCGGCGCAACGAAGTCAGCGAGACAGTAGTTAACGAAGCCTGTTTTTTCGGTATTCTGACGCAGATAATTCCT + -$''&&(%$#$#-)))(&%2243.-/3.)####$#%'(*('#&&(,,&$#-/2,*+()%((+&%$$$%&&&('()/-05AFB867566A<;(''')''&$%""$'))----1195234;-+.06'/1.+*.697/01?=2-001CF;?6./<:5,9-:(''62//'&*)*'94../534=;=D<@@BD:8:;==99:<@A@===7560/:A>>@@CA9.-=A:9;44;:<>?=0028))'+-9><334@2/.))*/+&'')+),--6:6-4)@DC?GIN43)(0*'&--499@99=225770.99<@??GA;85:84;'''/,/444*&%%&)-/('(@?D76577355(''HH\667988222.//A@BBAGD<:<;:9?=@>>424FF@;FA;@??9A8<;66/+//43?A<::<(''==.-/++24AA3/-- -@5df0023c-988a-404d-aa90-f29b7a0f766f -ATGCGGTATATCATAACCTACTCGTTCAGTTCTTGTGCCAGCCGATATCAACAAAAATTGTTGATCTATTGTGGTTTTAGTTATAGTGAAACGCTTTCGCGCATTTTGGTGCGCCGCTCAACAGGCGGCGGCGACAACCCGACGCTCATATTGGCACTGGAAGCCGGGGCATAAACTTTCAGCCAGTCTCGAGACTCCTAACTTCCAGTGAGGGTACGTGGCAGATCAGCAAAGACACAGGCAGCTGTAGCGTCATTGCCCGCACCATATCCGCCGCAGATCCCAACGGCGGCGCATGATCGTGGCTATAGAAGGCCAGGCTTCGCCATTTTTCACTTTGAACAGCGGATCATTACCATCCACTTCGGCAATCTTCACGCGGCAAGCGCCATCTTCATGATATTTGCCAACATCGCGCAAAACTTTGCCCTTCATCACGGGCCTTCGCCACGCGCGCGGCAAAGAGATCGTCGAGTTGTGACAGATTCGCCATAAAGCGGCAACATCACCCTCGGCGTTAAACTCTGCGGCAGCGCGGGTCAATTTCAATATCCGCAGCTCCAGTTCGCGTCCGTTTCACGGCAGATCAATAGTTTACGCGCCACGATCCATACCAGAAAGATCATCTCGCGGGTCCGGTTCGGTATAACCCATTTCCCGCGCCAGCGTGGTCGCCTCGAGAACTT +$'&$$###%'())''#####&((')--((((3;:<20'',,))**)&%%$$%&$$%)((),-..0.156<;;785,,:>ECE=35/))(')))&(05?>??BSC?>778:62,,,1)))'',)&&&0010''('6:=>>BC9:;LHHIADECBBBCGGNJFDBFSDOGBBHHHSHLQHGLFHFEFGSLLKEHHJICIDFIISSQDHJOJFSIKSLSKKGGNGHDFECHBC<;;995555;;<=??<<=>A95//192111=;6@?BBFGDFFFGGJJIKOJMPJGOIJSKGPSIGHGDFDGSJINRGDJA???FGIGSEKLLQ@@@BSABCCHGGNGIMHHD@ESHFHHGFHSKKFC<1111==,,('())*30,-****/('',,0//0/78579:FE>???CDGSF*)))46;00=>ODCCC@9ILEKLMLSSLJDDCBCIHIKFA:76556776559BDEDDEEMISFGEGFEHSM542;<)(((+&&&'%&))+''')&%$$%$$ +@f76bb339-f2eb-4335-9773-583e21d21354 +ATGTTGACCTGGTTCGGTTATCAGCCAGCCGATAACCTGTACCGATCGAAGCAGGTTTTCGCATTTATCGTGAAACGCTTTCGCGTTTTTCGTGCGCCGCTAAACCCAGATTCAGTAAGCCAGCGCTCAACTTCGGTAACGCTCATACCTTTACGGCGGGCATAATCTTCAACCTGATCGCGCTGAATTTGTGCTACAGCGTAGTACTTGCTGTCCGGGTGGCTGAAGTACCAACCCGAAACCGATGCACCGGGCCACATGGCGAAAGATTTCTGTGAGTTTCATGCCAGTGTGTTTTTCCACTTCCAGCAGCTCCCAGATGGTGGCTTTTTTCCGTATGTTCCGGGCAGGCCGGATAGCCCGGTGCCGGACGGATGCCCTGATAGTCGCGGATCAGCTCTTCGTTGCTGAGGTTCTCGTTCGGCGCATAGCCCGGTCGACTTTACGCACACGCTCATGGAGATACTCCGCAAAGGCCTCGGCTAAACGGTCGGCAAGCGCTTTCACCATGATTTTGTTGTAATCATCGTGCTGCGCTTCAAAGGCATCAGCCAGTGCGTCCTCTTCCAGCCCGCCAGTCACGGCAAATGCGATAATGTCTTTACCAGAAAGCTTCGGCGCAACGAAGTCAGCGAGACAGTAGTTAGCGAAGCCTGTTTTCGGTCTGTTGACGCAGATGGTGG + -$&&('((&&#%""$&10/0150&++7,&%&)$&'&$$####$(&$&&'###$$$*,*'%$##%%%$$&*1:9**;=@@&%&+&$###69@@>1//65--.)6((+?A--1>9665311&$$$%%%%'''***)(()'&%$()8555034361.;=*((97866,,-0@:>65587,+-.%#%'(+)%$&%''+99=?>=249:DJ>>33&%%-=:?H>85>;'&&,.)%*22<;229C>BB8;=?@A9==E=A9=>63;;5436///54235:8;:CC5447.*)+493,,.,'%)*.-.///3237;100?;;;568281?77/69-++1-%&.*%#$*()),/6655<62%)+:&&656:0+++21;**4D>42*&'*)+)4774223327:323B@''($$*0,''/5984C8;41,-.()*))+*430121095/3741.239.+:;=D=7821218;(((8?)&%('*&%& -@04440439-0d02-4406-9f82-7b60d5ce3093 -ATGTTATGTTTTTATTTACTGGTTCAGTTGTTCTATGCCGCGATATTTGACAGGAAGTTATGGTCTTTGTGGTTTCGCATTTATCGTGAAACGCTTTCGCGTTTTTCGTGCGCGCCGCTTCATGCGGGCAATGACGTCGCGGCTTCCGGTGTCTTTGCTGATCTGCTACGTACCCTCATGGAAGTTCAGGAGTCTGACATGGTTAAAGTTTGCTGCCCGGCTTCCAGTGCCAATATGAGCGTCGGGTTTGATGCTCGGGGCGGCGGTGACACCTGTTGATGGTGCATTGCTAGGAGATGTAGTCACGGTTGAGGCGGCAGAGACATTCAGTCTCAGCAACCTCGGACGCTTTGCCGATATAAGCTGCCGTCAGAACCACAGGAAAATATCGTTTATCAGTGCTGGGAGCGTTTTTGCCAGGAACTGGGTAAGCAAATTCCAGTGGCGATGACCCTGGAAAAGAATATGCCGATGGGTTCGGGCTTAGGCTCCAGTGCCTGTTCGGTGGTCGCGGCGCTGATGGCGATGAATGAACACTCCGGGCGGCA +##$%%####$%')*0-../22+%##$$$#$&&&$#$$##$$%'&')'&'(8==>>=:B7)()(((''(()2::@DF7755B;;;?>;:9B0..+---,)(+&%&)*++/1271%%%&&'*B@@>?CCD????HFDEFHHJAAB999:SJIGJBC@@BOJHF@?3222;:6:;<@BPKSB?@?DHSRKJFMJGNJOSSKSSNFOMSHACBBKKIJ865692@ABAEFIJH>>?9DFDSCSGSNHCCECHIFHKNSSJ:99;DC=436FFDDA83/18>ACEGIGKJOGSJFCSPHJIL@QOPSFC2.((45-59999;DD@BB<5.,%&&&+38EEH>==>KHJSJISISQKGSIHGKGFD>><0--.3756/.-/2+****))/242DCHDFEDGFFHJHJLGSJIHMJ<QF@>>=@BDGLEIEEEGFHAB@ABBBBDHFE>>>>?SLDGGKGQSH02@=/.-*35..-(,...--66443*) +@5b5c1c03-4813-435d-892b-c389f3dc86d1 +TGTTATATTTCCTGTACTTGGTTCAGTAGCCGCCGTATAAGCAACTGCCATTGGCTCAACTCGGGGTTTCGCATTTATCGTGAAACGCTTTCGCGTTTTCGTGCGCCGCTTCACCGAGTTGCTGAGTAACGGTGAGTTGTAATTTCCGCGTTTGTTATAAGCGAGGTTGTAATAGTCGGTGAATTTCGGCTTAACCTGAATAACTCCGTCCTGTGTTTCGATGTTGTAGCCATTCATTCGACTGTATGTTGTATCAGCGAAATTAAAATATCCGCTGGTCGAATAACGGTAACCCACTAACTGAATATTCGTGCCTGATTCATTGAGCGATTTGTTATAGAGAAAACGCACCGATTGTGCCAGTCATGCTGACTGTATCATCGGGAAGTGTGGAATTAGCCTGCGTCATATCCACAGACAGAGCGCCCAGTGCCCCCATGTTTTTCCCGATACCGAAATTAAAAGCACGATAACGATCCGCCAGTTGCGTTCCACCATATATTGTCCAGCCAGCCGGAAGGCCGTGGAGTAATGTACTCTGGAAAAAGCGGGTTTTTCCTGCTGCGCATTTCCACTACGGACACTCCTGCCGTAATGGAATAACGAGTATGCCA + -#((+0-''&'%()$&&'%%%(**++-,,-*)($$&**$$"%%%)')(*&&'))'&('$$###%''()58:,,(62*(*%*/B97..2:9BD?,**;8.,+1/10<@*'(0115)24689B8;?,++27:1/3&&'&$#$%#((*../.2:21++-==@DCKDD?>>@8;8:876C@*'));AF@43&$&58CC;<8::8?779BD9889;;-&#$#$3)''-<4:100..23121.1671+**(&-.446>@632&%%AHJ8997780002227844//8../?<9<=B88)))B==>4B***3--+-8@9=BB?7770,0,6379.&75522.33;((+99,,,35433?<;;460+5*64279233::=::;F:>43))0530**$$$*,/201)('.9;;@@GF@968:BG@96666528,,,1BD867=421/1(''1/8E<99?3111;;:9945./*00)))6;<=@0.8891++EBD{@;>@F:;577B;<;DDA756=214.--8:@?A@DDCCBA112200034>;111%##+23+)&% +$%+.*-''&&%%&''%$$$#%'),+*)%$####$%$$$%%&%&'++**,+3-''&(*(((+((++4./35,+++:<6>>>FEFISR8887CBCDGF3PG>>==HEEH>=<<=EGKIFJFIQNL''''=SGJEGSSDGHFJFFHNHEE'FED=E@NHEKEFBACDDDEIPIFGSECBBDBE33339/)((*597HFC@9::;HMGSHGNFLKQISGCFJGD@>ACB@AENEFHSHJLSHIKGIEJJJLFGKHMJLJLSNIGGNSNFBSLPRSHLKHH889;;9976-((,3.,,,8D<:::=5009878<;;EFJSFSRMSSISJEEMFH9;99==;>:1/('')%%%%'())**79?GH@?C@DDG76669:42-% +@96e5ef58-b235-4e81-8946-638be2dbcd40 +TTTGCGTATTCGTTCGGTTGTGCAGCCGGTGTTTAACCAGTTTCCATCACTTCAGACTTGGGGTTTTCAGGCCGGCGTTTCTCGTGAAAACGCTTTTCGCGTTTTTCGTGCGCCGCTTCAACGTTGTATCAGCGAAATTAAAATATCCGCTGGTCGAATAACGGTAACCCACTAACTGAATATTCGTGCCTGATTCATTGAGCGATTTGTTATAGAGAAAACGCACCGAATTATCCCCCGTCATGCTGACTGTCATCGGGAAGTGTGAAATTAGCCTGCGTCATATCCACAGACAGGGCGCCCAGTGCCCCCATGTTTTTCCCGATACCGAAATTAAAAGCACGATAACGATCCGCCAGTTGCGTTCCACCATATATTGTCCAGCCAGCCGGAAGGCCGTCGGAGTAATGTACTCTGGAAAAAGCGGGTTTTTTCCTGCTGCGCATTTCCACTACGGTATTCTCCTGCCGTAATGGAATAACGAGTATC ++ +$$&&%$$#$%&()))%$#%%%$%&$#$$',****,20('''')**,@E77678321-,,(02543/-+&%%$'%$%&'('&''(.+-0)2B;:7641167FGBBAF:99CJGJMOKMMLKSSSJIGFHFSGB>=>?88767;FSGJHEDFEEIHJSISKIJDDFELISHIFGINGNCCAD;:::NISNIME=>EJESLH><;;3)((&+++++*+008899DECDFDDAD----.33?N>==;7,+,&33498;@AFQ??>>>;:5678>475547FDCA<;;<3.*++(%%)()2+,9>CDBB33)'''8877822,+***))(+0423,,4<989BJFLGKGQNIJIKGGSKILEGKFO<>+**''&&(,--)+221&'&'2@JKGKGFNCEPFEMQDCDHIA@?97(('&%&033?65344>@CE=>>?@A?+)())).,-334=677354//)'%%%&&%$#$%))),//((--/566841.3,'%$%),-5679DDFGGSHEC=;3000443000076====9977888<;8755755524588EGSRPSLHHOSMMHHIHIIILIMPPSMJGJSMSSJIKKKSSKPSOLJISLGSIPEFDFGLIQMSEEFHGHDDDDDEBBADISLLSNFIAAB((((6)('%$$%%%$$$&,,*)%$$%''('# +@e7c5823b-1310-4a9b-bef6-3206c3b9251d +ATGTCATCTTACTGGTTCGTTATCGCCGCCGGATGTTAACCTTGTGAGTGGAAAAATCCAGCATCGCATCGTGAAACGCTTTCGCGTTTTTCGTGCGCCGCTTGGCTGGGCAGCGTCAGGAAATCGCTTGTCGAGCTGACGCCGCAGGCCTGGGCTATTTCTTCTGTTGCTGGTAGCTGACGCCAGTAGGGTTCGCCGATAAGCATGATCCCGCCCGGTTTCAGGCTTTGCGCTAGCAGCTCCTCCGCCCCGGCAAACCCACCGGCAATCCATGTCGCGCCGACGCAGGCCGCCACGTCACATTTTTCATTTGCGACGTAACCGGCCGCATCGTTATGAATAAAATGTACGCGTTCGCTGACGCCAGTTCTTCCGCGCGACGCTTTGCCTGCGCGGTGAAGAGCGAACTCATGTCGATGCCAGTCCCCCGT ++ +%&)*(()((*&(-*+./,))(%&&$""""#$%()(((%(++..+.+**+*((''&&$#$#%$$$$#%&&'.=>EEEEDB>=I<=>90/00/0-0../05.,,,13DIEDBDFGHKGNSKJMHGKNMKIFGOKSSKSQMISINOHSMMSHONJKIFSJ@???EQSSMFFFEEHGPSSKJSKHLHHLNLJHDEEKSEDK?E7555.+/)))).--.CGFB?A@BEKHDBDH Date: Mon, 11 Mar 2024 22:27:04 +0000 Subject: [PATCH 10/23] Remove unnecessary operation on flank overhang params because the flank overhang can be asymmetric and use different lengths of the flank from top and bottom depending on how long they are. --- dorado/demux/BarcodeClassifier.cpp | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/dorado/demux/BarcodeClassifier.cpp b/dorado/demux/BarcodeClassifier.cpp index 2c359956..1f61bd5b 100644 --- a/dorado/demux/BarcodeClassifier.cpp +++ b/dorado/demux/BarcodeClassifier.cpp @@ -101,7 +101,7 @@ float extract_mask_score(std::string_view barcode, auto result = edlibAlign(barcode.data(), int(barcode.length()), read.data(), int(read.length()), config); auto score = result.editDistance; - spdlog::trace("{} {} position {}", debug_prefix, score, result.startLocations[0]); + spdlog::trace("{} {}", debug_prefix, score); spdlog::trace("\n{}", utils::alignment_to_str(barcode.data(), read.data(), result)); edlibFreeAlignResult(result); return score; @@ -135,11 +135,10 @@ std::unordered_map process_custom_ki // Helper to extract left buffer from a flank. std::string extract_left_buffer(const std::string& flank, int buffer) { - return flank.substr(std::max(0lu, flank.length() - buffer)); + return flank.substr(std::max(0, static_cast(flank.length()) - buffer)); } std::string extract_right_buffer(const std::string& flank, int buffer) { - ; return flank.substr(0, buffer); } @@ -266,21 +265,8 @@ std::vector BarcodeClassifier::generate_ use_leading_flank = false; } - // Update left and right buffer lengths based on the actual flank regions of the - // chosen kit. - m_left_buffer = - std::min({m_scoring_params.flank_left_pad, int(kit_info.top_front_flank.length()), - int(kit_info.top_rear_flank.length())}); - m_right_buffer = - std::min({m_scoring_params.flank_right_pad, int(kit_info.top_rear_flank.length()), - int(kit_info.top_front_flank.length())}); - if (!kit_info.barcodes2.empty()) { - m_left_buffer = std::min({int(m_left_buffer), int(kit_info.bottom_front_flank.length()), - int(kit_info.bottom_rear_flank.length())}); - m_right_buffer = - std::min({int(m_right_buffer), int(kit_info.bottom_rear_flank.length()), - int(kit_info.bottom_front_flank.length())}); - } + m_left_buffer = m_scoring_params.flank_left_pad; + m_right_buffer = m_scoring_params.flank_right_pad; BarcodeCandidateKit candidate; candidate.kit = kit_name; From 2a22283142f8215ea66888ba319c223b758603fa Mon Sep 17 00:00:00 2001 From: Joyjit Daw <1127155+tijyojwad@users.noreply.github.com> Date: Tue, 12 Mar 2024 15:50:01 +0000 Subject: [PATCH 11/23] update integration test now that more barcodes are detected --- tests/test_simple_basecaller_execution.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_simple_basecaller_execution.sh b/tests/test_simple_basecaller_execution.sh index 1c2b775c..a696719e 100755 --- a/tests/test_simple_basecaller_execution.sh +++ b/tests/test_simple_basecaller_execution.sh @@ -18,7 +18,7 @@ model_name_5k=${4:-dna_r10.4.1_e8.2_400bps_hac@v4.2.0} model_name_5k_v43=${5:-dna_r10.4.1_e8.2_400bps_hac@v4.3.0} model_name_rna004=${6:-rna004_130bps_hac@v3.0.1} data_dir=$test_dir/data -output_dir_name=$(echo $RANDOM | head -c 10) +output_dir_name=test_output_$(echo $RANDOM | head -c 10) output_dir=${test_dir}/${output_dir_name} mkdir -p $output_dir @@ -336,7 +336,7 @@ test_barcoding_read_groups() ( # There should be 4 reads with BC01, 3 with BC04, and 2 unclassified groups. test_barcoding_read_groups barcode01 4 barcode04 3 unclassified 2 # There should be 4 reads with BC01 aliased to patient_id_1, and 5 unclassified groups. -test_barcoding_read_groups patient_id_1 4 unclassified 5 $data_dir/barcode_demux/sample_sheet.csv +test_barcoding_read_groups patient_id_1 5 unclassified 4 $data_dir/barcode_demux/sample_sheet.csv # Test demux only on a pre-classified BAM file $dorado_bin demux --no-classify --output-dir "$output_dir/demux_only_test/" $output_dir/read_group_test.bam From b03eb2fc90305d1f3e78e2a39529c67b5b77a099 Mon Sep 17 00:00:00 2001 From: Joyjit Daw <1127155+tijyojwad@users.noreply.github.com> Date: Tue, 12 Mar 2024 18:14:42 +0000 Subject: [PATCH 12/23] Update double ended barcode test because the new thresholds detect the barcodes better. --- tests/BarcodeClassifierTest.cpp | 2 +- .../EXP-PBC096_barcode_both_ends_fail.fastq | 6 +++--- .../EXP-PBC096_barcode_both_ends_pass.fastq | 8 -------- 3 files changed, 4 insertions(+), 12 deletions(-) diff --git a/tests/BarcodeClassifierTest.cpp b/tests/BarcodeClassifierTest.cpp index 9fc196cd..31bf0af7 100644 --- a/tests/BarcodeClassifierTest.cpp +++ b/tests/BarcodeClassifierTest.cpp @@ -147,7 +147,7 @@ TEST_CASE("BarcodeClassifier: check barcodes on both ends - failing case", TEST_ auto single_end_res = classifier.barcode(seq, false, std::nullopt); auto double_end_res = classifier.barcode(seq, true, std::nullopt); CHECK(double_end_res.barcode_name == "unclassified"); - CHECK(single_end_res.barcode_name == "BC15"); + CHECK(single_end_res.barcode_name == "BC01"); } } diff --git a/tests/data/barcode_demux/double_end_variant/EXP-PBC096_barcode_both_ends_fail.fastq b/tests/data/barcode_demux/double_end_variant/EXP-PBC096_barcode_both_ends_fail.fastq index 4d3a7822..07dc6c58 100644 --- a/tests/data/barcode_demux/double_end_variant/EXP-PBC096_barcode_both_ends_fail.fastq +++ b/tests/data/barcode_demux/double_end_variant/EXP-PBC096_barcode_both_ends_fail.fastq @@ -1,4 +1,4 @@ -@65735689-cc39-4dd6-8e61-db79f6caa2d -TACTTCGTTCAGTTACGTATTGCTGGTGCTGGGGTCTACCGCTAACACCACTCAACCTTTCTGTTGGTGCTGATATTGCGGCGTCTGCTTGGGTCTTTAACCTGACGGGTACGTGGTTGGCCTCACGGTTGTTATGGTGGCCATTGCTGCGCGGCGTGATTTTGCCGCTGCGCTCGCCGCGTGTGGCGAAGCTGTATGCCTCTGTCTGGGAAGGTGGCTCGCCGCTGATGGTTTACAGCCGCCAGCAACAGCAGGCGCTGGCACAACGTTTACCGGAGATGCCCGTAGCGCTGGGGATGAGCTACGGCTCGCCATCACTGAAGCGCCGTAAAAGATGGCGACTCTGGCGAAACCTTCAGATCCATATTGTGGTGCTGCCGCTATCCGCAATTCTCCTGTTCGGTCGGTGCGGTATGGGATGAACTGGCACGCATTCTGGCGCGCAAGCGTATTCCGGGATATCACATACGTGATTACGCCGATAACCACGATTACATTAATGCACTGGCGAACAGCGTACGCGCTTCTTTTGCCAAACATGGCGAACCGGATCTGCTGCTCTCTTATCATGGCATTCCCCAGCGTTATGCAGATGAAGGCGATGATTACCCGCAACGTTGCCGCACAACGTCATGAACTGGCTTCCGCATTGGGGATGGCACCGGAAAAAGTGATGATGATAGTCGCGCTTTGGTCCGGGAACCCTGGCTGATGCCTTATACCGTGAAACGCTGAAAAATGCTCGGAGAAAGGCGTAGGTCATATTCAGGTGATGTGCCCGGGCTGCTGCGGATTGTCTGGAGGCACTGGAAGCACCGAGCAAAACCGTGAGGTTCACGGTACCGGCGGGAAAAATATGAATATATTCCGGCGCTTAATGCCACGCCGGAACATATCGATGGCTAATCTTGTTGCCGCGTAAGGTTTAACAGCAGACGCCAGATAGAGCGACAGGCAAGTAAGGTTAACAGTGGCAGCGAGGTAGACCTCAGCACCAGCAATGTGT +@f838d15b-7c31-4c88-a599-056b71ba601 +AGTATGCTTCGTTCGGTTACGTATTGCTGGGTGTTTAACCTTGTCGTAATGAATGCTGCGGTAAACGCCTTCCAGCTCGCCCATCAAAGCCAGCAAACGAGTGTCATTAAGCGGCTTGCCGCAGTGTTCATTCATCGCCATCAGCGCCGCGACCACCGAACAGGCACTGGAGCCTAAGCCCGAACCGATCGGCATATTCTTTTCAGGGTCATCGCCACTGGAATTTGCTTACCCAGTTCCTGGCAAAAACGCTCCCAGCACTGATAAACGATATTTTCCCGTGGTTCTGACGGCAGCTTATCGGCAAAGCGTCCGAGGTTGTTGAGACTGAATGTCTCTGCGCACTCAACCGTGACTACATCTCCGAGCAATGCACCATCAGCAAGGGTATTCACCGCCGCCCCGAGCACATCAAACCCGACGCTCATATTGGCACTGGAAGCCGGGGCATAAACTTTAACCATGTCAGACTCCTAACTTCCATGAGAGGGTACGTAGCAGATCCAGCAAAGACACCGGCAGCTGTAACGTCATTGCCCGCACCATATCCGCGCAGTACCAACGGCAGCGGCTGATAATAGTGGCTATAGAAGGCCAGGGCGTTTTCGCCATTTTTTCACTTTGAACAGCGGATCATTACCATCCACTTCGGCAATCTTCACGCGGCAGACGCCATCTTCATCAATATTGCCAACATAGCGCAAAACTTTTCCTTCATCACGGGCGCTTCGCCACGCGCGCGGCAAAAGAGATCGTCGAGTTGTGACAGATTCGCCATAAAAGCGGCAACATCACCCTCGGCGTTAAACTCTGCGGGCAGCACAGGGTTCAATTTCAATATCCGCCAGCTCCAGTTCACGTCCCGTTTCACGAGCGAGAATCAATAGTTTACGCGCCACATCCATACCAGAAAGATCATCTCGCGGGTCGGTTCGGTATAACCCATTTCCCGCGCCAGCGTGGTCCTTACTCCGGAGAAACTCATGCCTTCCAGGTTAAACACCCAAGCAGACGCCGAAGATAGAGCGACAGGCAGTTTTTGAGGCGAGCGGTCAAGGTTAAACACACCAAGAAGCAGACGCCGAAGATAGGGCGACAGGCAAGTAGGTTAACACAAAGACACCGACAACTTTCTTCAGCACCAGCAATACGTAAGC + -%'-03577=>)(**3688:>>=>=>>=((()20001,*)'%%))),,-2)('&&&)0/9FCDC>?=<<>>ADD@98878;:;>>><<432-///+++*03343:72,&%%%%%'100/*%%#%&()04886=>>BAA?@>:::;@<<::<>?>>>766112240222:<9:@?>;:::;=;;::8.--1125200001678:;:99986*382988:;><<==>>?@>B3348A9=8.-.,079?@@44''''///435)&--.///23?>;:436833334:5:>8658==8789;=.-,,,++*)1:@>@A>@@33)(++0,19;::99:4553&%%&&%%+((3400....,..,,345)')++==><;7;;<:::3;6''8;19::==<<;<2>DB>((*53462(&&'*+.8=@??@BA?>=<885678DD@?BA775553..*&%%&(+'-,-88<:.,+,**)+/19<20-4:::=A??A:999?A?>>?ECAB?=;<<=C??@@BC@?98;>??+-/%$$%&()*+9;65-,,+)((*))()*8:*)11(+;>A@DABAC?@??@DCAN;?GIDA>=;<405994442659::?>DD?>5++*+*&*50''''0:<;@@?A100;=?;7778623336732-))****)(%$#$(+,-7;6223;<.-))),7:57;>799:>>=51..0AB@@@A66=54;=@??768:;127;;?<..-.7+*''9A+***;?>88877.1++++)*((&%&&,+&&&'-62(('&&&*5/+('%%$$)''()50,,'(&&+*(*-,,'&-3458::4445>EIHH@?>-,,.97=A;758<<>?=7434333578634568*))**'%%*+3399AACF555/0,+-8;>3075910))$$'-.01-.)%%%''*+''25=6655/-*.-'&(/30(((((*))'''-0,..688767:43178??;::9:*)(( +'+----69:?<;<=''.08>>=>?@@ACAA<<<=>@F:9899?;98(%$$$$$$$%*/51129;<77;:;=@DGAB@AA@A>=;0''(06;?@@CA@9<>>@A>-36*3.---=?@544447><=?===A5544867768::AA::::=BB>=>>@9888;:;<67669::A=<<;?>=>??BKCB@?B/7('/8:<>AC>>=>@>>>>D:=<;959)''''*78=646:;::@AC??>>;:<;7:99==>B@AAAAACDGFG45:03=?@CFB>>?>7776888ICA?.../LDF@@?===<=<<==>>>@>?>???BB=:.*)'))&&))))+)**-7672/..15<==32500...****/6:F2*))*)+&)(()*-/236::>?>@:8:;><=>@88B@A@555545:54>@BA<;<<=;=<3/4>==9<-()/359===<;:820,+&%,49,)+::=>@9::@@><:98888//.,::<=?D=<<;=?=<=>>===>@AA<<<<@A????AAB6/../23=;;::=:;;=B>>>>7667>CD.1..<;;9A>CDDC>ABFFICADA@A@AA@))))*99>??<>??A976673:64:;;<<8888??>???88:?DDFDJDBHB@?@?AABBB?@66--.-3=;;;;:<96222792226:=@AA@??>=>>77?A=A@;;;<@@<=<<=>?=:?6666>765++++02:<>=?/--.;A@@???AEEDB++)((*.356;<:EDEECGA>?>?B=BD*(+12=;<;;@C:<<<<@7<@@@@>>B?>=??@?><<43226;8((>?B888?77<8;;;:=<8?9674366@A>53235))))((-++349995367@?@@@677*)*51H?G@@??=<<=7876999?>>?779:8;;>44542.*++'&.9?B><;<>63231112:2*5>=ICES98,,***+,*.69:@>?A?>>=386:-&'&(28;<>=<--,29=@?BADAA@ACJ@??>=>>:74337;<@@BDA>>=>=@EA>><<<96,)& diff --git a/tests/data/barcode_demux/double_end_variant/EXP-PBC096_barcode_both_ends_pass.fastq b/tests/data/barcode_demux/double_end_variant/EXP-PBC096_barcode_both_ends_pass.fastq index cd304cc1..855d5363 100644 --- a/tests/data/barcode_demux/double_end_variant/EXP-PBC096_barcode_both_ends_pass.fastq +++ b/tests/data/barcode_demux/double_end_variant/EXP-PBC096_barcode_both_ends_pass.fastq @@ -2,11 +2,3 @@ GTACTTCGTTCAGTTACGTATTGCTGGTGCTGAAAGTTGTCGGTGTCTTTGTGTTAACCTACTTGCCTGTCGCTCTATCTTCGGCGTCTGCTTGGGTGTTTAACCTCGAGGCGCGATTTCTCAGGCGACCACGCTGGCGCGGGAAATGGGTTATACCGAACCGGACCCGCGAGATATTCTTTCTGGTATGGATGGCGCGTAAACTATTGATTCTCGCTCGTGGAAACGGGACGTGAACTGGAGCTGGCGGATATTGAAATTGAACCTGTGCTGCCCGCAGAGTTTAACGCCGGGGGTGATGTTGCCGCTTTTATGGCGGAATCTGTCACAACTCGACGATCTCTTTGCCGCGCGCGTGGCGAAGGCCCGTGATGAAGGAAAAGTTTTGCGCTATGTTGGCAATATTGATGAAGATGGCGTCTGCCGCGTGAAGATTGCCGAAGTGGATGGTAATGATCCGCTGTTCAAAGTGAAAAATGGCGAAAACGCCCTGGCCTTCTATAGCCACTATTATCAGCCGCTGCCGTTGGTACTGCGCGGATATGGTGCGGGCAATGACGTTACAGCTGCCGGTGTCTTTGCTGATCTGCTACGTACCCTCTCATGGAAGTTAGGAGTCTGACATGGTTAAAGTTTATCTTGGCTTCCAGTGCCAATATGAGCGTCGGGTTTGATGTGCTCCGGGGCGGCGGTGACACCTGTTGATGGTGCATTGCTCGGAGATGTAGTCACGGTTGAGGCGGCAGAGACATTCAGTCTCAACAACCTCGGACGCTTTGCCGATAAGCTGCCGTCAGAACCACGGGAAAATATCGTTTATCAGTGCTGGGGCGTTTTGCCAGGAACTGGGTAAGCAAATTCCAGTGGCGATGACCCTGGAAAAGAATATGCCGATCGGTCCGGGCTTAGGCTCAGTGCCTGTTCGGTGGTCGCGGCGCTGATGGCGATGAATGAACACTGCGGCAAGCCGCTTAATGACACTCGTTTGCTGGCTTTGATGGGCGAGCTGGAAGGCCGTATCTCCGGCAGCATTCATTACAGGTTAAACACCAAGCAGACGCCGCAATATCAGCACCAACAGAAGGTTAACACAAAGACACCGACAACTTTCTTCAGCACCAGCAATACGTAA + &&''()5--65..11B?;;=ADAA99?:851000-/>===7,)(()+11/-166:A>=?@C=;;;::::EBD=5333;<;6::95446=?DBF66(((/+()*,*'%%&':77&%'**78:;;<<7;<=;<32+1++367;=>8?>>>??AA>?8??ABAB:99;>+'''(//0-:.--./212*'432*&&&)'((>=;;;<>>A33:?<<='&'++'''':9;=?;>1///0211335@@ACFECSEA@D@<5BA@AA??@?@@>?BA0/4469?>=?@>3--.30118/..''&%',.;8A===/.-''''),-,..-.--015>=>>=<::<=<7414225==>>CA?9B??3334>?C>D>DD@B?@@@EFGDAA?DA>MIGB@>*))''&&&((+/46679::;=CADB===;??B@@<:;<>>=>7;B@=;468;::>=;==300023:>A<::==@>==:21*',,,+/::?44111333470-,-015789;;<=><31.,.01''%%%%$&,+,'')6>?@BBBBDDA>;;<<<;:<=<630//67?8('*-030,-'(14.-++/3>?>=?C><;<=)(();8?77<;:800-./166//-+(''('/3444335469:<32298:==BD>>==?66>@>?>?@??>=;8.-**))+.1:@AA?@=>;;96672./2448(((,226;;?>=88749,,=>@CD;5***+>;<=>?=>?>@@>;<;:97;8/--)**-....8997333,*045100///0/((59:=?@BBBD@??79;?>9779ABCCD?<;<A@=)''*,2=A<;3445?@@A?AC??9788;=>:43117;@<<<:<=;89/..119>B@=----223>???BBH9888?>C987444**::FD@A)''%&&&&'(''+.-356:::D>>A=;;;;=10-.-/+47882111:>:==?@A>>>>???AD>=@;9=>BDA2222:88632, -@4a7f5769-d7f8-4729-a6e4-3985b6c1f9a -GTACTTCGTTCAGTTACGTATTGCTGGTGCTGAAGAAAGTTGTCGGTGTCTTTGTTAACCTACTTGCCTGTCGCTCTATCTTCGGCGTCTGCTTGGGTGTTTAACCTCGAAGGCATGAGTTTCTCCGAGGCGACCACGCTGGCGCGAAATGGGTTATGCCGGAACCGGACCCGCGAGATGATCTTTCTGGTATGGATGGCGCTGTCAACTATTGATTCTCATCGTGAAACAGGACGTGAACTGGAGCTGGCGGATATTGAAATTGAACCTGTGCTGCCCGCAGAGTTTAACGCCGAGGGTGATGTTGCCGCTTTTATGTAGTCTGTGCAACTCGACGATCTCTTTGCCGCGCGCGTGGCGAAGGCCCGTGATGAAGGAAAAAGTTTTGCGCTATGTTGGCAATATTGATGAAGATGGCGTCCTGCCGCGTTGAAGATTGCCGAAGTGGATGGTAATGATCCGCTGTTCAAAGTGAAAAATGGCGAAGCTTGCCCTGGCCTTCTACAGCCACTATTATCAGCCGCTGCCGTTGGTACTGCGCGGATGGTGCGGGCAATGACGTTACGGCTACCGGTGTCTTTGCTGATCACTACGTACCCTCTCATGGAAGTTAGGAGTCTGGCATGGTTAAAGTTTATGCCCCGGCTTCCAGTGCCAATATGGGCGTCGGGTTTGATGTGCTCCGGGGCGGTGACACCTGTTGATGGTGCATTGCTCGGGAGTATAGTCACGGTTGAGGCGGCAGAGACATTCAGTCTCAACAACCTCGGACGCTTTGCCGATAAAAGCTGCCGTCAGAACCACGGGAAAATATCGTTTATCCAGTGCTGAGGGCGTTTTTGCCAGGAACTGGGTAAGCAAATTCCGAAGTGGCGATGACCCTGGAAGAATATGCCGATCGGTTCGGGCTTAGGCTCCAGTGCCTGTTCGGTGGTCGCGGCGCTGATGTGATGAATGAACACTGCGACAAGCCGCTTAATGACCTCGTTTGCTGGCTTTGATGGAGCGAGCTGGAAGGCCGTATCTCGGCAGCATTCATTACGACGGGTTAAACACCCAAGCAGACGCCGCAATATCAGCACCAACAGAAAGGTTAACACAAAGACACCGACAACTTTCTTCCAGCACCAGCAATACGTAACT -+ -%&(+-.365;>7679::10122798976551101-.-..-..440**&&&&)(+++*-+,('&%%%&*)...(()1;>@898776799;;10,&&).>A=<==?((((''*.013579-.1*)()*+4/0987))(&,.-+('&$%%*,0034<==64''''5*)(''''/2=65+*++,>?733//3325?AA??:9:;8@A@;=95+**+0//-)+./134566>5555?2-+++,..8>=;:;6347;B>/.*&%&&&'*)%%%&.002212-,--7,+,,+-)65?G3>D?<=9932333711::<9::/21589666;=9:<==:64+&&%()3349=)))244710--122.,-+-01==?>?>?=<87769;=>B?;AADDB>2(((9;==77.023)()+/&%&'.06;><;:;9:'&&'...,.--.59>@@C<<31-'''(46444368:9=BBDCAA><=:88497:::?A98789:=>?@DA>A<88543/)('((()/1278:==''''8?*))*???@4D=<<>>>=>=9;<;8534,-/2((()).---2/*+554455;9;<7<>:7''&&(?437:;==<>55985110../821112/.288887,7=7511-+**+-/08840/07<@A=875)()**(('%%'.**3:;D@>=;881110.0C???@0746;;=:4))))/;@>A@879:<568325**+/..56;4555,+,-/('''<@@;:99:546;<<=>>=>>>??@ACB@@AA><;4,(&%##%%())))-456;:;:40+)&&)/1999311223)**015;5513337;;<;=<=((''''((66475444'.--))('(-.:<<>=<:3677/)(%%&+:9;<9:;:?==?>6666;<;<1-,***''%()3:+.:99ABC=<<;544/(&),.322256557@E-,-><===?B=;:6644,*&% -@c115c3b2-6160-49a2-a452-bd3ddda56b3 -TACTTCGTTTCAGTTACGTATTGCTGGTGCTGAAGAAGTTGTCGGTGTCCTTTTGTGTTAACCTTTCTGGCCAGTGCTGATATTGCGGCGTCTGCTTGGGTGTTTAACCTTGTCGTAATGAACGCTGAATGCTGCCGGAGATACGGCCTTCCAGCTCGCCCATCAAAGCCCAGCAAACAGGTATCATTGGCGGCTGCCGCGATGTTCATTCATCGCCATCAGCGCCGCGACCACCGAACAGGCACTGGAGCCTAAGCCGAACAAAATCGGCATATTCACGCCTGGGTCATCGCCACTGGAATTTGCCCACCAGTTCCTGGCAAAAACGCTCCACAGCACTGATAAACGATATTTTTCCGTGGTTCTGACGGCGGCAGCTTATCGGCAAAGCGTCCGAGGTTGTTGAGACTGAATGTCTCTGCCGCCCCGCCAGCCGTGACTTCATCTCCAGGCAATGCACCATCAACGGGTGTCACCGCCGCCCCGAGCACATCAAACCAAACGCTCATATTGGCACTGGAAACCGGGGCGGCGCCAGCTTTAACCATGTATCAGACTCTAACTTCCATGAAAATTGCGTAGCAGATCCAGCAAAGACACCAGCGGCTGTAACGTCGTTGCCCGCACCATATCCGCACCGGTATTTTCGCAGCAGCGGCTTCCGTGTAGTAGCTATAGGAGAGCCAGGGCGTTTTCGCCATTTTTCGCCGAACAGCGGATCATTACCATCCACTTCGGCAATCTTCACGCGGCAGACGCCATCTTCATCAATATCATGCCAACATAGCGCAAAACTTTTCCTTCATCACGGGCCTTCGCCACGCGCGCAGCAAGAGATGGTCGAGGATTGTGACAGATTCGCCACGCAAAAGCGGCAACATCACCCTCGGCGTTAAACTCTGCGGGCGCTGGGTTCAATTCCGTATCCGCCAGCTCCAGTTCACGTCCGTTTCACGAGCGAGAATCAATAGTTTACGCGCCATCCATACCAGAAAGATCATCTCGCGGGTCCGGTTCGGTATAACCCATTTCCCGCGCCAGCGTGGTCGCCTCGGAGAAACTCATACCTTCCAGGTTAAACACCAAGCAGACGCCGCAATATCAGCACCAACAGAAAGGTTAACACAAAGACACCGACAACAGCTTTCTTCCAGCACCAGCAATACGTAA -+ -$$%&'*&&*','&%&+.03<<=8A44654443/'')+*259;5/*)*+,'((7410--.+42:;BBA.-)()('''(++,,111-,*))**+,&&&&%%$))*67>999:?A?7C@?>=<986'&%&')66:9:=767A;;:421,+())0+(*.13<=;755522./2003122111,('&'()%%%$%'&&&&%$$$')))()9:6556.-01:A=<<****+/03446657;///00;775756:999-)--.0/'(*5010,//''2/,*%%%%&&&()+,(),/,,+.711/14499;56-+)(()&,25632387:ADS=<430.*)&%'225656ABFABBDF@?=AS3::7;:545())23.-/,,/(&()3555<=B99=;9865*(*---/1111@9?2@@??B?8444:::;8;:*())))'&&'7872/+&&'0//(()*//7765,+++('&&)&%%%%')14421778.--+...41123,,47;2/-++*'&&'(((('%$#()(((('((*)'$$%$%%+)))*46<799:663200+--18?>36+,3())++*(''&$%(-+*++08@J:76899:8<622338,,45<A9.--,,,,,-56==700/06?<;?:856446/-..&%$$$%&+9>AMDA=A888896./'%%%(&,3286.++'())*+++''''((..2200-*)''&%(41*))+,/033:??@:;;644+**,66556<;92***''%%&&)2>=;:;'+()*+@@A@CFC>==,*-@??111093.7=9=973+*))+--,..+,-*()*33536<<>;*;:8546201,+0-6>=>=A887:8878:+)))*(/6666;9:;:=78;=0+,-==B:87.,('),++)''''-88995.--,''''.2,()*)*/-)('&&%'')('(((:>....8=<7//6:<@?ACC?<4./112:C@@?<>89?5<<@1//.-,./069;8=;@,)*,++-.,,&%)+**,0- From a69997461caef5068315077c80669477c0b35911 Mon Sep 17 00:00:00 2001 From: Joyjit Daw <1127155+tijyojwad@users.noreply.github.com> Date: Tue, 12 Mar 2024 19:38:02 +0000 Subject: [PATCH 13/23] fix data type conversion error on windows --- dorado/demux/BarcodeClassifier.cpp | 28 ++++++++++++++-------------- dorado/utils/types.h | 6 +++--- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/dorado/demux/BarcodeClassifier.cpp b/dorado/demux/BarcodeClassifier.cpp index 1f61bd5b..85847a59 100644 --- a/dorado/demux/BarcodeClassifier.cpp +++ b/dorado/demux/BarcodeClassifier.cpp @@ -94,10 +94,10 @@ std::tuple extract_flank_fit(std::string_view stra // Helper function to globally align a barcode to a region // within the read. -float extract_mask_score(std::string_view barcode, - std::string_view read, - const EdlibAlignConfig& config, - const char* debug_prefix) { +int extract_barcode_score(std::string_view barcode, + std::string_view read, + const EdlibAlignConfig& config, + const char* debug_prefix) { auto result = edlibAlign(barcode.data(), int(barcode.length()), read.data(), int(read.length()), config); auto score = result.editDistance; @@ -472,10 +472,10 @@ std::vector BarcodeClassifier::calculate_barcode_score_diffe // Calculate barcode scores for v1. auto top_mask_result_score_v1 = - extract_mask_score(barcode1, top_mask_v1, mask_config, "top window v1"); + extract_barcode_score(barcode1, top_mask_v1, mask_config, "top window v1"); - auto bottom_mask_result_score_v1 = - extract_mask_score(barcode2_rev, bottom_mask_v1, mask_config, "bottom window v1"); + auto bottom_mask_result_score_v1 = extract_barcode_score(barcode2_rev, bottom_mask_v1, + mask_config, "bottom window v1"); BarcodeScoreResult v1; v1.top_score = top_mask_result_score_v1; @@ -491,10 +491,10 @@ std::vector BarcodeClassifier::calculate_barcode_score_diffe // Calculate barcode scores for v2. auto top_mask_result_score_v2 = - extract_mask_score(barcode2, top_mask_v2, mask_config, "top window v2"); + extract_barcode_score(barcode2, top_mask_v2, mask_config, "top window v2"); - auto bottom_mask_result_score_v2 = - extract_mask_score(barcode1_rev, bottom_mask_v2, mask_config, "bottom window v2"); + auto bottom_mask_result_score_v2 = extract_barcode_score(barcode1_rev, bottom_mask_v2, + mask_config, "bottom window v2"); BarcodeScoreResult v2; v2.top_score = top_mask_result_score_v2; @@ -584,10 +584,10 @@ std::vector BarcodeClassifier::calculate_barcode_score_doubl } spdlog::trace("Checking barcode {}", barcode_name); - auto top_mask_score = extract_mask_score(barcode, top_mask, mask_config, "top window"); + auto top_mask_score = extract_barcode_score(barcode, top_mask, mask_config, "top window"); auto bottom_mask_score = - extract_mask_score(barcode_rev, bottom_mask, mask_config, "bottom window"); + extract_barcode_score(barcode_rev, bottom_mask, mask_config, "bottom window"); BarcodeScoreResult res; res.barcode_name = barcode_name; @@ -651,7 +651,7 @@ std::vector BarcodeClassifier::calculate_barcode_score( } spdlog::trace("Checking barcode {}", barcode_name); - auto top_mask_score = extract_mask_score(barcode, top_mask, mask_config, "top window"); + auto top_mask_score = extract_barcode_score(barcode, top_mask, mask_config, "top window"); BarcodeScoreResult res; res.barcode_name = barcode_name; @@ -661,7 +661,7 @@ std::vector BarcodeClassifier::calculate_barcode_score( res.bottom_flank_score = -1.f; res.flank_score = std::max(res.top_flank_score, res.bottom_flank_score); res.top_score = top_mask_score; - res.bottom_score = -1.f; + res.bottom_score = -1; res.score = res.top_score; res.use_top = true; res.top_barcode_pos = {top_result.startLocations[0], top_result.endLocations[0]}; diff --git a/dorado/utils/types.h b/dorado/utils/types.h index 3b308f9a..184fe05c 100644 --- a/dorado/utils/types.h +++ b/dorado/utils/types.h @@ -39,9 +39,9 @@ std::shared_ptr create_barcoding_info( const std::optional &custom_seqs); struct BarcodeScoreResult { - float score = -1.f; - float top_score = -1.f; - float bottom_score = -1.f; + int score = -1; + int top_score = -1; + int bottom_score = -1; float flank_score = -1.f; float top_flank_score = -1.f; float bottom_flank_score = -1.f; From bd9ccef307a1487082ecd79b25a5622b714fcfcd Mon Sep 17 00:00:00 2001 From: Joyjit Daw <1127155+tijyojwad@users.noreply.github.com> Date: Tue, 12 Mar 2024 22:15:53 +0000 Subject: [PATCH 14/23] Make barcode front/rear window configurable --- documentation/CustomBarcodes.md | 4 ++++ dorado/demux/BarcodeClassifier.cpp | 17 +++++++++-------- dorado/demux/BarcodeClassifier.h | 2 ++ dorado/demux/parse_custom_kit.h | 2 ++ tests/CustomBarcodeParserTest.cpp | 2 ++ .../custom_barcodes/scoring_params.toml | 2 ++ 6 files changed, 21 insertions(+), 8 deletions(-) diff --git a/documentation/CustomBarcodes.md b/documentation/CustomBarcodes.md index c5e987cb..e50e7842 100644 --- a/documentation/CustomBarcodes.md +++ b/documentation/CustomBarcodes.md @@ -47,6 +47,8 @@ min_barcode_score_dist = 3 min_separation_only_dist = 6 flank_left_pad = 5 flank_right_pad = 10 +front_barcode_window = 175 +rear_barcode_window = 175 ``` #### Arrangement Options @@ -91,6 +93,8 @@ If a candidate meets one of the above conditions and the location of the start/e | barcode_end_proximity | Proximity of the end of the barcode construct to the ends of the read required for classification. | | flank_left_pad | Number of bases to use from preceding flank during barcode alignment. | | flank_right_pad | Number of bases to use from succeeding flank during barcode alignment. | +| front_barcode_window | Number of bases at the front of the read within which to look for barcodes. | +| rear_barcode_window | Number of bases at the rear of the read within which to look for barcodes. | For `flank_left_pad` and `flank_right_pad`, something in the range of 5-10 bases is typically good. Note that errors from this padding region are also part of the barcode alignment score. Therefore a bigger padding region may require a higher `max_barcode_cost` for classification. diff --git a/dorado/demux/BarcodeClassifier.cpp b/dorado/demux/BarcodeClassifier.cpp index 85847a59..ee70f38a 100644 --- a/dorado/demux/BarcodeClassifier.cpp +++ b/dorado/demux/BarcodeClassifier.cpp @@ -146,7 +146,6 @@ std::string extract_right_buffer(const std::string& flank, int buffer) { namespace demux { -const int TRIM_LENGTH = 175; const BarcodeScoreResult UNCLASSIFIED{}; struct BarcodeClassifier::BarcodeCandidateKit { @@ -267,6 +266,8 @@ std::vector BarcodeClassifier::generate_ m_left_buffer = m_scoring_params.flank_left_pad; m_right_buffer = m_scoring_params.flank_right_pad; + m_front_barcode_window = m_scoring_params.front_barcode_window; + m_rear_barcode_window = m_scoring_params.rear_barcode_window; BarcodeCandidateKit candidate; candidate.kit = kit_name; @@ -380,9 +381,9 @@ std::vector BarcodeClassifier::calculate_barcode_score_diffe std::string_view read_seq, const BarcodeCandidateKit& candidate, const BarcodingInfo::FilterSet& allowed_barcodes) const { - std::string_view read_top = read_seq.substr(0, TRIM_LENGTH); - int bottom_start = std::max(0, (int)read_seq.length() - TRIM_LENGTH); - std::string_view read_bottom = read_seq.substr(bottom_start, TRIM_LENGTH); + std::string_view read_top = read_seq.substr(0, m_front_barcode_window); + int bottom_start = std::max(0, (int)read_seq.length() - m_rear_barcode_window); + std::string_view read_bottom = read_seq.substr(bottom_start, m_rear_barcode_window); // Try to find the location of the barcode + flanks in the top and bottom windows. EdlibAlignConfig placement_config = init_edlib_config_for_flanks(); @@ -541,9 +542,9 @@ std::vector BarcodeClassifier::calculate_barcode_score_doubl std::string_view read_seq, const BarcodeCandidateKit& candidate, const BarcodingInfo::FilterSet& allowed_barcodes) const { - std::string_view read_top = read_seq.substr(0, TRIM_LENGTH); - int bottom_start = std::max(0, (int)read_seq.length() - TRIM_LENGTH); - std::string_view read_bottom = read_seq.substr(bottom_start, TRIM_LENGTH); + std::string_view read_top = read_seq.substr(0, m_front_barcode_window); + int bottom_start = std::max(0, (int)read_seq.length() - m_rear_barcode_window); + std::string_view read_bottom = read_seq.substr(bottom_start, m_rear_barcode_window); // Try to find the location of the barcode + flanks in the top and bottom windows. EdlibAlignConfig placement_config = init_edlib_config_for_flanks(); @@ -622,7 +623,7 @@ std::vector BarcodeClassifier::calculate_barcode_score( std::string_view read_seq, const BarcodeCandidateKit& candidate, const BarcodingInfo::FilterSet& allowed_barcodes) const { - std::string_view read_top = read_seq.substr(0, TRIM_LENGTH); + std::string_view read_top = read_seq.substr(0, m_front_barcode_window); // Try to find the location of the barcode + flanks in the top and bottom windows. EdlibAlignConfig placement_config = init_edlib_config_for_flanks(); diff --git a/dorado/demux/BarcodeClassifier.h b/dorado/demux/BarcodeClassifier.h index c0cd28f4..d39fd5d3 100644 --- a/dorado/demux/BarcodeClassifier.h +++ b/dorado/demux/BarcodeClassifier.h @@ -57,6 +57,8 @@ class BarcodeClassifier { int m_left_buffer; int m_right_buffer; + int m_front_barcode_window; + int m_rear_barcode_window; }; } // namespace demux diff --git a/dorado/demux/parse_custom_kit.h b/dorado/demux/parse_custom_kit.h index 2aaa9691..6870bcc1 100644 --- a/dorado/demux/parse_custom_kit.h +++ b/dorado/demux/parse_custom_kit.h @@ -15,6 +15,8 @@ struct BarcodeKitScoringParams { int min_separation_only_dist = 6; int flank_left_pad = 5; int flank_right_pad = 10; + int front_barcode_window = 175; + int rear_barcode_window = 175; }; std::optional> parse_custom_arrangement( diff --git a/tests/CustomBarcodeParserTest.cpp b/tests/CustomBarcodeParserTest.cpp index 65b4f7aa..fa2b3834 100644 --- a/tests/CustomBarcodeParserTest.cpp +++ b/tests/CustomBarcodeParserTest.cpp @@ -126,6 +126,8 @@ TEST_CASE("Parse custom barcode scoring params", "[barcode_demux]") { CHECK(scoring_params.min_separation_only_dist == 5); CHECK(scoring_params.flank_left_pad == 5); CHECK(scoring_params.flank_right_pad == 10); + CHECK(scoring_params.front_barcode_window == 150); + CHECK(scoring_params.rear_barcode_window == 150); } TEST_CASE("Parse default scoring params", "[barcode_demux]") { diff --git a/tests/data/barcode_demux/custom_barcodes/scoring_params.toml b/tests/data/barcode_demux/custom_barcodes/scoring_params.toml index f1500e43..77996ae2 100644 --- a/tests/data/barcode_demux/custom_barcodes/scoring_params.toml +++ b/tests/data/barcode_demux/custom_barcodes/scoring_params.toml @@ -6,3 +6,5 @@ min_barcode_score_dist = 3 min_separation_only_dist = 5 flank_left_pad = 5 flank_right_pad = 10 +front_barcode_window = 150 +rear_barcode_window = 150 From 673ada1500f4b3a3aaf03a13d4edafd4d4dd5151 Mon Sep 17 00:00:00 2001 From: Joyjit Daw <1127155+tijyojwad@users.noreply.github.com> Date: Wed, 13 Mar 2024 23:10:05 +0000 Subject: [PATCH 15/23] adding parsing of front/rear window size --- dorado/demux/parse_custom_kit.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/dorado/demux/parse_custom_kit.cpp b/dorado/demux/parse_custom_kit.cpp index a1e12850..b1827d57 100644 --- a/dorado/demux/parse_custom_kit.cpp +++ b/dorado/demux/parse_custom_kit.cpp @@ -137,6 +137,12 @@ BarcodeKitScoringParams parse_scoring_params(const std::string& arrangement_file if (config.contains("flank_right_pad")) { params.flank_right_pad = toml::find(config, "flank_right_pad"); } + if (config.contains("front_barcode_window")) { + params.front_barcode_window = toml::find(config, "front_barcode_window"); + } + if (config.contains("rear_barcode_window")) { + params.rear_barcode_window = toml::find(config, "rear_barcode_window"); + } return params; } From bbe9c75ad6afc7763be9ea923a0134ca9ad16396 Mon Sep 17 00:00:00 2001 From: Joyjit Daw <1127155+tijyojwad@users.noreply.github.com> Date: Thu, 14 Mar 2024 15:00:07 +0000 Subject: [PATCH 16/23] fetch the correct number of flanking bases from the read for barcode evaluation --- dorado/demux/BarcodeClassifier.cpp | 45 +++++++++++++++++++----------- 1 file changed, 29 insertions(+), 16 deletions(-) diff --git a/dorado/demux/BarcodeClassifier.cpp b/dorado/demux/BarcodeClassifier.cpp index ee70f38a..428fa333 100644 --- a/dorado/demux/BarcodeClassifier.cpp +++ b/dorado/demux/BarcodeClassifier.cpp @@ -382,7 +382,7 @@ std::vector BarcodeClassifier::calculate_barcode_score_diffe const BarcodeCandidateKit& candidate, const BarcodingInfo::FilterSet& allowed_barcodes) const { std::string_view read_top = read_seq.substr(0, m_front_barcode_window); - int bottom_start = std::max(0, (int)read_seq.length() - m_rear_barcode_window); + int bottom_start = std::max(0, static_cast(read_seq.length()) - m_rear_barcode_window); std::string_view read_bottom = read_seq.substr(bottom_start, m_rear_barcode_window); // Try to find the location of the barcode + flanks in the top and bottom windows. @@ -411,30 +411,38 @@ std::vector BarcodeClassifier::calculate_barcode_score_diffe // Fetch barcode mask locations for variant 1 auto [top_result_v1, top_flank_score_v1, top_bc_loc_v1] = extract_flank_fit( top_context_v1, read_top, barcode_len, placement_config, "top score v1"); - auto top_start_idx_v1 = std::max(0, top_bc_loc_v1 - m_left_buffer - barcode_len); - auto top_end_idx_v1 = top_bc_loc_v1 + m_right_buffer; + auto top_start_idx_v1 = std::max( + 0, top_bc_loc_v1 - static_cast(top_context_v1_left_buffer.length()) - barcode_len); + auto top_end_idx_v1 = top_bc_loc_v1 + static_cast(top_context_v1_right_buffer.length()); std::string_view top_mask_v1 = read_top.substr(top_start_idx_v1, top_end_idx_v1 - top_start_idx_v1); auto [bottom_result_v1, bottom_flank_score_v1, bottom_bc_loc_v1] = extract_flank_fit( bottom_context_v1, read_bottom, barcode_len, placement_config, "bottom score v1"); - auto bottom_start_idx_v1 = std::max(0, bottom_bc_loc_v1 - m_left_buffer - barcode_len); - auto bottom_end_idx_v1 = bottom_bc_loc_v1 + m_right_buffer; + auto bottom_start_idx_v1 = std::max( + 0, bottom_bc_loc_v1 - static_cast(bottom_context_v1_left_buffer.length()) - + barcode_len); + auto bottom_end_idx_v1 = + bottom_bc_loc_v1 + static_cast(bottom_context_v1_right_buffer.length()); std::string_view bottom_mask_v1 = read_bottom.substr(bottom_start_idx_v1, bottom_end_idx_v1 - bottom_start_idx_v1); // Fetch barcode mask locations for variant 2 auto [top_result_v2, top_flank_score_v2, top_bc_loc_v2] = extract_flank_fit( top_context_v2, read_top, barcode_len, placement_config, "top score v2"); - auto top_start_idx_v2 = std::max(0, top_bc_loc_v2 - m_left_buffer - barcode_len); - auto top_end_idx_v2 = top_bc_loc_v2 + m_right_buffer; + auto top_start_idx_v2 = std::max( + 0, top_bc_loc_v2 - static_cast(top_context_v2_left_buffer.length()) - barcode_len); + auto top_end_idx_v2 = top_bc_loc_v2 + static_cast(top_context_v2_right_buffer.length()); std::string_view top_mask_v2 = read_top.substr(top_start_idx_v2, top_end_idx_v2 - top_start_idx_v2); auto [bottom_result_v2, bottom_flank_score_v2, bottom_bc_loc_v2] = extract_flank_fit( bottom_context_v2, read_bottom, barcode_len, placement_config, "bottom score v2"); - auto bottom_start_idx_v2 = std::max(0, bottom_bc_loc_v2 - m_left_buffer - barcode_len); - auto bottom_end_idx_v2 = bottom_bc_loc_v2 + m_right_buffer; + auto bottom_start_idx_v2 = std::max( + 0, bottom_bc_loc_v2 - static_cast(bottom_context_v2_left_buffer.length()) - + barcode_len); + auto bottom_end_idx_v2 = + bottom_bc_loc_v2 + static_cast(bottom_context_v2_right_buffer.length()); std::string_view bottom_mask_v2 = read_bottom.substr(bottom_start_idx_v2, bottom_end_idx_v2 - bottom_start_idx_v2); @@ -543,7 +551,7 @@ std::vector BarcodeClassifier::calculate_barcode_score_doubl const BarcodeCandidateKit& candidate, const BarcodingInfo::FilterSet& allowed_barcodes) const { std::string_view read_top = read_seq.substr(0, m_front_barcode_window); - int bottom_start = std::max(0, (int)read_seq.length() - m_rear_barcode_window); + int bottom_start = std::max(0, static_cast(read_seq.length()) - m_rear_barcode_window); std::string_view read_bottom = read_seq.substr(bottom_start, m_rear_barcode_window); // Try to find the location of the barcode + flanks in the top and bottom windows. @@ -563,14 +571,16 @@ std::vector BarcodeClassifier::calculate_barcode_score_doubl auto [top_result, top_flank_score, top_bc_loc] = extract_flank_fit(top_context, read_top, barcode_len, placement_config, "top score"); - auto top_start_idx = std::max(0, top_bc_loc - m_left_buffer - barcode_len); - auto top_end_idx = top_bc_loc + m_right_buffer; + auto top_start_idx = + std::max(0, top_bc_loc - static_cast(top_left_buffer.length()) - barcode_len); + auto top_end_idx = top_bc_loc + static_cast(top_right_buffer.length()); std::string_view top_mask = read_top.substr(top_start_idx, top_end_idx - top_start_idx); auto [bottom_result, bottom_flank_score, bottom_bc_loc] = extract_flank_fit( bottom_context, read_bottom, barcode_len, placement_config, "bottom score"); - auto bottom_start_idx = std::max(0, bottom_bc_loc - m_left_buffer - barcode_len); - auto bottom_end_idx = bottom_bc_loc + m_right_buffer; + auto bottom_start_idx = std::max( + 0, bottom_bc_loc - static_cast(bottom_left_buffer.length()) - barcode_len); + auto bottom_end_idx = bottom_bc_loc + static_cast(bottom_right_buffer.length()); std::string_view bottom_mask = read_bottom.substr(bottom_start_idx, bottom_end_idx - bottom_start_idx); @@ -632,11 +642,14 @@ std::vector BarcodeClassifier::calculate_barcode_score( std::string_view top_context = candidate.top_context; int barcode_len = int(candidate.barcodes1[0].length()); + const auto& top_left_buffer = candidate.top_context_left_buffer; + const auto& top_right_buffer = candidate.top_context_right_buffer; auto [top_result, top_flank_score, top_bc_loc] = extract_flank_fit(top_context, read_top, barcode_len, placement_config, "top score"); - auto start_idx = std::max(0, top_bc_loc - m_left_buffer - barcode_len); - auto end_idx = top_bc_loc + m_right_buffer; + auto start_idx = + std::max(0, top_bc_loc - static_cast(top_left_buffer.length()) - barcode_len); + auto end_idx = top_bc_loc + static_cast(top_right_buffer.length()); std::string_view top_mask = read_top.substr(start_idx, end_idx - start_idx); spdlog::trace("BC location {}", top_bc_loc); From c32707fda4f3540ccaf0c0ea57d30746665879d4 Mon Sep 17 00:00:00 2001 From: Joyjit Daw <1127155+tijyojwad@users.noreply.github.com> Date: Sun, 17 Mar 2024 16:19:15 +0000 Subject: [PATCH 17/23] Use per kit scoring params Move the barcode kit definition into utilities and enable per kit scoring params. --- dorado/demux/BarcodeClassifier.cpp | 27 +++++++- dorado/demux/BarcodeClassifier.h | 2 +- dorado/demux/parse_custom_kit.cpp | 6 +- dorado/demux/parse_custom_kit.h | 15 +--- .../read_pipeline/BarcodeClassifierNode.cpp | 1 + dorado/utils/barcode_kits.cpp | 69 ++++++++++++++----- dorado/utils/barcode_kits.h | 12 ++++ tests/CustomBarcodeParserTest.cpp | 11 +-- 8 files changed, 103 insertions(+), 40 deletions(-) diff --git a/dorado/demux/BarcodeClassifier.cpp b/dorado/demux/BarcodeClassifier.cpp index 428fa333..d90ea992 100644 --- a/dorado/demux/BarcodeClassifier.cpp +++ b/dorado/demux/BarcodeClassifier.cpp @@ -133,6 +133,29 @@ std::unordered_map process_custom_ki return kit_map; } +dorado::barcode_kits::BarcodeKitScoringParams set_scoring_params( + const std::vector& kit_names, + const std::optional& custom_kit) { + dorado::barcode_kits::BarcodeKitScoringParams params{}; + + if (!kit_names.empty()) { + // If it is one of the pre-defined kits, override the default scoring + // params with whatever is set for that specific kit. + const auto& kit_name = kit_names[0]; + const auto& kit_info_map = barcode_kits::get_kit_infos(); + auto prebuilt_kit_iter = kit_info_map.find(kit_name); + if (prebuilt_kit_iter != kit_info_map.end()) { + params = prebuilt_kit_iter->second.scoring_params; + } + } + if (custom_kit) { + // If a custom kit is passed, parse it for any scoring + // params that need to override the default params. + return dorado::demux::parse_scoring_params(*custom_kit, params); + } else { + return params; + } +} // Helper to extract left buffer from a flank. std::string extract_left_buffer(const std::string& flank, int buffer) { return flank.substr(std::max(0, static_cast(flank.length()) - buffer)); @@ -181,8 +204,7 @@ BarcodeClassifier::BarcodeClassifier(const std::vector& kit_names, : m_custom_kit(process_custom_kit(custom_kit)), m_custom_seqs(custom_barcodes ? parse_custom_sequences(*custom_barcodes) : std::unordered_map{}), - m_scoring_params(custom_kit ? parse_scoring_params(*custom_kit) - : BarcodeKitScoringParams{}), + m_scoring_params(set_scoring_params(kit_names, custom_kit)), m_barcode_candidates(generate_candidates(kit_names)) {} BarcodeClassifier::~BarcodeClassifier() = default; @@ -449,6 +471,7 @@ std::vector BarcodeClassifier::calculate_barcode_score_diffe // Find the best variant of the two. int total_v1_score = top_result_v1.editDistance + bottom_result_v1.editDistance; int total_v2_score = top_result_v2.editDistance + bottom_result_v2.editDistance; + spdlog::trace("total v1 edit dist {}, total v2 edit dis {}", total_v1_score, total_v2_score); std::string_view top_mask, bottom_mask; if (total_v1_score < total_v2_score) { diff --git a/dorado/demux/BarcodeClassifier.h b/dorado/demux/BarcodeClassifier.h index d39fd5d3..88273686 100644 --- a/dorado/demux/BarcodeClassifier.h +++ b/dorado/demux/BarcodeClassifier.h @@ -31,7 +31,7 @@ class BarcodeClassifier { private: const std::unordered_map m_custom_kit; const std::unordered_map m_custom_seqs; - const BarcodeKitScoringParams m_scoring_params; + const dorado::barcode_kits::BarcodeKitScoringParams m_scoring_params; const std::vector m_barcode_candidates; std::vector generate_candidates(const std::vector& kit_names); diff --git a/dorado/demux/parse_custom_kit.cpp b/dorado/demux/parse_custom_kit.cpp index b1827d57..efde6319 100644 --- a/dorado/demux/parse_custom_kit.cpp +++ b/dorado/demux/parse_custom_kit.cpp @@ -110,10 +110,12 @@ std::optional> parse_custom_arrang return std::make_pair(kit_name, new_kit); } -BarcodeKitScoringParams parse_scoring_params(const std::string& arrangement_file) { +dorado::barcode_kits::BarcodeKitScoringParams parse_scoring_params( + const std::string& arrangement_file, + const dorado::barcode_kits::BarcodeKitScoringParams& base_params) { const toml::value config_toml = toml::parse(arrangement_file); - BarcodeKitScoringParams params{}; + auto params = base_params; if (!config_toml.contains("scoring")) { return params; } diff --git a/dorado/demux/parse_custom_kit.h b/dorado/demux/parse_custom_kit.h index 6870bcc1..d929bdaa 100644 --- a/dorado/demux/parse_custom_kit.h +++ b/dorado/demux/parse_custom_kit.h @@ -8,24 +8,15 @@ namespace dorado::demux { -struct BarcodeKitScoringParams { - int max_barcode_cost = 12; - int barcode_end_proximity = 75; - int min_barcode_score_dist = 3; - int min_separation_only_dist = 6; - int flank_left_pad = 5; - int flank_right_pad = 10; - int front_barcode_window = 175; - int rear_barcode_window = 175; -}; - std::optional> parse_custom_arrangement( const std::string& arrangement_file); std::unordered_map parse_custom_sequences( const std::string& sequences_file); -BarcodeKitScoringParams parse_scoring_params(const std::string& arrangement_file); +dorado::barcode_kits::BarcodeKitScoringParams parse_scoring_params( + const std::string& arrangement_file, + const dorado::barcode_kits::BarcodeKitScoringParams& default_params); bool check_normalized_id_pattern(const std::string& pattern); diff --git a/dorado/read_pipeline/BarcodeClassifierNode.cpp b/dorado/read_pipeline/BarcodeClassifierNode.cpp index cd0bc527..840e1ae2 100644 --- a/dorado/read_pipeline/BarcodeClassifierNode.cpp +++ b/dorado/read_pipeline/BarcodeClassifierNode.cpp @@ -107,6 +107,7 @@ void BarcodeClassifierNode::barcode(BamPtr& read) { auto bc_res = barcoder->barcode(seq, m_default_barcoding_info->barcode_both_ends, m_default_barcoding_info->allowed_barcodes); auto bc = generate_barcode_string(bc_res); + spdlog::trace("Barcode for {} is {}", bam_get_qname(irecord), bc); bam_aux_append(irecord, "BC", 'Z', int(bc.length() + 1), (uint8_t*)bc.c_str()); m_num_records++; { diff --git a/dorado/utils/barcode_kits.cpp b/dorado/utils/barcode_kits.cpp index cd4b67cc..0fc65145 100644 --- a/dorado/utils/barcode_kits.cpp +++ b/dorado/utils/barcode_kits.cpp @@ -106,50 +106,68 @@ const std::vector RBK_1_96 = { "BC78", "BC79", "BC80", "BC81", "BC82", "BC83", "BC84", "BC85", "BC86", "BC87", "BC88", "BC89", "BC90", "BC91", "BC92", "BC93", "BC94", "BC95", "BC96"}; +// Kit specific scoring parameters. + +const BarcodeKitScoringParams DEFAULT_PARAMS{}; + +const BarcodeKitScoringParams RBK114_PARAMS{ + .max_barcode_cost = 12, + .barcode_end_proximity = 75, + .min_barcode_score_dist = 3, + .min_separation_only_dist = 6, + .flank_left_pad = 5, + .flank_right_pad = 10, + .front_barcode_window = 175, + .rear_barcode_window = 175, +}; + // Some arrangement names are just aliases of each other. This is because they were released // as part of different kits, but they map to the same underlying arrangement. const KitInfo kit_16S = { "16S", true, true, RAB_1st_FRONT, RAB_1st_REAR, - RAB_2nd_FRONT, RAB_2nd_REAR, BC_1_24, BC_1_24, + RAB_2nd_FRONT, RAB_2nd_REAR, BC_1_24, BC_1_24, DEFAULT_PARAMS, }; const KitInfo kit_lwb = { "LWB", true, true, LWB_1st_FRONT, LWB_1st_REAR, - LWB_2nd_FRONT, LWB_2nd_REAR, BC_1_12, BC_1_12, + LWB_2nd_FRONT, LWB_2nd_REAR, BC_1_12, BC_1_12, DEFAULT_PARAMS, }; const KitInfo kit_lwb24 = { "LWB24", true, true, LWB_1st_FRONT, LWB_1st_REAR, - LWB_2nd_FRONT, LWB_2nd_REAR, BC_1_24, BC_1_24, + LWB_2nd_FRONT, LWB_2nd_REAR, BC_1_24, BC_1_24, DEFAULT_PARAMS, }; const KitInfo kit_nb12 = { - "NB12", true, true, NB_1st_FRONT, NB_1st_REAR, NB_2nd_FRONT, NB_2nd_REAR, NB_1_12, NB_1_12, + "NB12", true, true, NB_1st_FRONT, NB_1st_REAR, + NB_2nd_FRONT, NB_2nd_REAR, NB_1_12, NB_1_12, DEFAULT_PARAMS, }; const KitInfo kit_nb24 = { - "NB24", true, true, NB_1st_FRONT, NB_1st_REAR, NB_2nd_FRONT, NB_2nd_REAR, NB_1_24, NB_1_24, + "NB24", true, true, NB_1st_FRONT, NB_1st_REAR, + NB_2nd_FRONT, NB_2nd_REAR, NB_1_24, NB_1_24, DEFAULT_PARAMS, }; const KitInfo kit_nb96 = { - "NB96", true, true, NB_1st_FRONT, NB_1st_REAR, NB_2nd_FRONT, NB_2nd_REAR, NB_1_96, NB_1_96, + "NB96", true, true, NB_1st_FRONT, NB_1st_REAR, + NB_2nd_FRONT, NB_2nd_REAR, NB_1_96, NB_1_96, DEFAULT_PARAMS, }; const KitInfo kit_rab = { "RAB", true, true, RAB_1st_FRONT, RAB_1st_REAR, - RAB_2nd_FRONT, RAB_2nd_REAR, BC_1_12, BC_1_12, + RAB_2nd_FRONT, RAB_2nd_REAR, BC_1_12, BC_1_12, DEFAULT_PARAMS, }; const KitInfo kit_rbk96 = { - "RBK96", false, false, RBK4_FRONT, RBK4_REAR, "", "", RBK_1_96, {}, + "RBK96", false, false, RBK4_FRONT, RBK4_REAR, "", "", RBK_1_96, {}, DEFAULT_PARAMS, }; const KitInfo kit_rbk4 = { - "RBK4", false, false, RBK4_FRONT, RBK4_REAR, "", "", BC_1_12, {}, + "RBK4", false, false, RBK4_FRONT, RBK4_REAR, "", "", BC_1_12, {}, DEFAULT_PARAMS, }; const KitInfo kit_rlb = { - "RLB", true, false, RLB_FRONT, RLB_REAR, "", "", BC_1_12A, {}, + "RLB", true, false, RLB_FRONT, RLB_REAR, "", "", BC_1_12A, {}, DEFAULT_PARAMS, }; // Final map to go from kit name to actual barcode arrangement information. @@ -180,6 +198,7 @@ const std::unordered_map kit_info_map = { NB_2nd_REAR, NB_13_24, NB_13_24, + DEFAULT_PARAMS, }}, // NB24 {"SQK-NBD111-24", kit_nb24}, @@ -202,6 +221,7 @@ const std::unordered_map kit_info_map = { BC_2nd_REAR, BC_1_12, BC_1_12, + DEFAULT_PARAMS, }}, // PCR96 {"EXP-PBC096", @@ -215,6 +235,7 @@ const std::unordered_map kit_info_map = { BC_2nd_REAR, BC_1_96, BC_1_96, + DEFAULT_PARAMS, }}, // RAB {"SQK-RAB204", kit_rab}, @@ -231,6 +252,7 @@ const std::unordered_map kit_info_map = { "", BC_1_12, {}, + DEFAULT_PARAMS, }}, // RBK096 {"SQK-RBK110-96", kit_rbk96}, @@ -247,6 +269,7 @@ const std::unordered_map kit_info_map = { "", RBK_1_96, {}, + RBK114_PARAMS, }}, // RBK24 {"SQK-RBK111-24", @@ -260,6 +283,7 @@ const std::unordered_map kit_info_map = { "", BC_1_24, {}, + DEFAULT_PARAMS, }}, // RBK24_kit14 {"SQK-RBK114-24", @@ -273,6 +297,7 @@ const std::unordered_map kit_info_map = { "", BC_1_24, {}, + RBK114_PARAMS, }}, // RBK4 {"SQK-RBK004", kit_rbk4}, @@ -293,6 +318,7 @@ const std::unordered_map kit_info_map = { "", BC2_1_24, {}, + DEFAULT_PARAMS, }}, // VMK {"VSK-VMK001", @@ -306,18 +332,22 @@ const std::unordered_map kit_info_map = { "", {"BC01", "BC02", "BC03", "BC04"}, {}, + DEFAULT_PARAMS, }}, // VMK4 {"VSK-VMK004", - {"VMK4", - false, - false, - RBK4_FRONT, - RBK4_REAR, - "", - "", - {"BC01", "BC02", "BC03", "BC04", "BC05", "BC06", "BC07", "BC08", "BC09", "BC10"}, - {}}}, + { + "VMK4", + false, + false, + RBK4_FRONT, + RBK4_REAR, + "", + "", + {"BC01", "BC02", "BC03", "BC04", "BC05", "BC06", "BC07", "BC08", "BC09", "BC10"}, + {}, + DEFAULT_PARAMS, + }}, {"TWIST-ALL", { "PGx", @@ -359,6 +389,7 @@ const std::unordered_map kit_info_map = { "AG10R_79", "AH10R_80", "AA11R_81", "AB11R_82", "AC11R_83", "AD11R_84", "AE11R_85", "AF11R_86", "AG11R_87", "AH11R_88", "AA12R_89", "AB12R_90", "AC12R_91", "AD12R_92", "AE12R_93", "AF12R_94", "AG12R_95", "AH12R_96"}, + DEFAULT_PARAMS, }}, }; diff --git a/dorado/utils/barcode_kits.h b/dorado/utils/barcode_kits.h index a48a3907..3283a282 100644 --- a/dorado/utils/barcode_kits.h +++ b/dorado/utils/barcode_kits.h @@ -7,6 +7,17 @@ namespace dorado::barcode_kits { +struct BarcodeKitScoringParams { + int max_barcode_cost = 9; + int barcode_end_proximity = 75; + int min_barcode_score_dist = 3; + int min_separation_only_dist = 6; + int flank_left_pad = 5; + int flank_right_pad = 10; + int front_barcode_window = 175; + int rear_barcode_window = 175; +}; + struct KitInfo { std::string name; bool double_ends; @@ -17,6 +28,7 @@ struct KitInfo { std::string bottom_rear_flank; std::vector barcodes; std::vector barcodes2; + BarcodeKitScoringParams scoring_params; }; const std::unordered_map& get_kit_infos(); diff --git a/tests/CustomBarcodeParserTest.cpp b/tests/CustomBarcodeParserTest.cpp index fa2b3834..8b5a9d73 100644 --- a/tests/CustomBarcodeParserTest.cpp +++ b/tests/CustomBarcodeParserTest.cpp @@ -1,5 +1,6 @@ #include "TestUtils.h" #include "demux/parse_custom_kit.h" +#include "utils/barcode_kits.h" #include @@ -118,7 +119,9 @@ TEST_CASE("Parse custom barcode scoring params", "[barcode_demux]") { fs::path data_dir = fs::path(get_data_dir("barcode_demux/custom_barcodes")); const auto test_params_file = data_dir / "scoring_params.toml"; - auto scoring_params = dorado::demux::parse_scoring_params(test_params_file.string()); + dorado::barcode_kits::BarcodeKitScoringParams default_params; + auto scoring_params = + dorado::demux::parse_scoring_params(test_params_file.string(), default_params); CHECK(scoring_params.max_barcode_cost == 10); CHECK(scoring_params.barcode_end_proximity == 75); @@ -134,9 +137,9 @@ TEST_CASE("Parse default scoring params", "[barcode_demux]") { fs::path data_dir = fs::path(get_data_dir("barcode_demux/custom_barcodes")); const auto test_params_file = data_dir / "test_kit_single_ended.toml"; - dorado::demux::BarcodeKitScoringParams default_params; - - auto scoring_params = dorado::demux::parse_scoring_params(test_params_file.string()); + dorado::barcode_kits::BarcodeKitScoringParams default_params; + auto scoring_params = + dorado::demux::parse_scoring_params(test_params_file.string(), default_params); CHECK(scoring_params.max_barcode_cost == default_params.max_barcode_cost); CHECK(scoring_params.barcode_end_proximity == default_params.barcode_end_proximity); From 88cf8a6590efae86151a1e4ccee045c5dc105092 Mon Sep 17 00:00:00 2001 From: Joyjit Daw <1127155+tijyojwad@users.noreply.github.com> Date: Mon, 18 Mar 2024 16:54:40 +0000 Subject: [PATCH 18/23] Update RPB004 test data since thresholds have been updated --- tests/data/barcode_demux/double_end/SQK-RPB004_BC01.fastq | 4 ---- tests/data/barcode_demux/double_end/SQK-RPB004_BC11.fastq | 4 ---- tests/data/barcode_demux/double_end/unclassified.fastq | 8 ++++++++ 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/data/barcode_demux/double_end/SQK-RPB004_BC01.fastq b/tests/data/barcode_demux/double_end/SQK-RPB004_BC01.fastq index efd8d024..f88fed6b 100644 --- a/tests/data/barcode_demux/double_end/SQK-RPB004_BC01.fastq +++ b/tests/data/barcode_demux/double_end/SQK-RPB004_BC01.fastq @@ -1,7 +1,3 @@ -@eb642113-7d12-475d-ad0c-d6dd1998e04c -GCAGCCTGATCCTCCTGCTGTCGGCACCATGCCTGGGTCAGGGCATGAACCCCCTGGATGCGCCACAAGACGGTTAAATGTCTGAACACCGCGACCTGGTGATGGTCTCGACGCGGTAGCTGGTGTTGCGCATGATGCTGATCCGCGCCGTCGTACCGGTCCCCGACGACATCCGCCTGAAAGAGCAGTCGCCGTGGTGCGCCGTGGGTTAGCGCCATGGTCACATCAACCTGCTGCTGCACCAGACGATCGCCGCTCGCCAGACGAGGCACTGAACCAAACAGCACAAGATTGGCATGAAAACAACCGCAACCTGGCTGTTGCCATACCATATGCCACCCGCATCCATTATTACAAATAATATCGTAATCACAATAGCTGCTCTACTTTGTAATTAAACTTCCTGCATTTCCCCAAAATAACCACACGCCATCTGGGCTGGGCGAAATTTACTGGCAGGTGTTGTCATTGAAACATGGCGTTTTGGTACACCCGTATGCACACGACGCGTCAGGCTGGCCATCAACTTTGAATTTGACGATATAAGGCAATATCATTTGATTGATCAGGAATTTTATTTATCACACCTAAAGCAGCACCAAAAACGCAAAAGAACACAACAATTTTTCTGGTGA -+ -$$$&%$$##%%&)&%&(()**'&&'''''(''&&&(((%$$$$%%''''(''''()'&%%&%%%%%&&&%%%%%&$$$$$%%$$$$%%%%$$$$%%''''')/)((((&&%$$%$%%$###$%'(&&$$$&%%%%$$$'(&&%$$#$$$%%%$$$$%%%%$$%%%%%$$%&'()(((*)&%%%&%%$%%$$%%%&&&&%%%&&&'&&&%%$####$&'*('&%%$%%&&((*,,+)''&'&%%$$%%%%%%$%&&&&%$$%&&&&%%''&&'&&&''&&&&&%%%$$$$$%$&&'(''((()-,('&%$$#$$###$$%'('%$%%%&%&%%%%$%&%&''%&&%&'&&&&%&$$##$&&&&%%$$$%%&'&&%%%('''$$$%%%%&)**-''&'%&&$''&''&(')))'))(&()))('''')''&''(*)(&%$#$#$$#$$$#$$%&&'&%&%%%%%%$$$$$$$%%&&%'($##$$$$$#$%$$%%&&&&&%%'%%'&%%&$$%&%&%&%&'(())+'%%%&*($$$$&+'''%%%%%%%&&%%&'&&%$$$%$%$$$$%%%))(*+-.,(''&$%$#$#$#$#$'(&%$$#$$$%'-00('%%&((*('&(&&&'(''(((%$%%&%$ @dd587662-57a3-4ea2-8196-e669cb70ef7f TTTTTTATGTGCATGTACTCGTCATTTGTTGCTATCGGGGGCTTAGATCTTGGCAAAGTCACACCCATCCAAGAAAGTATCGTGTCTTTGTGCATTTTTCATTGGCTTGGCAGGTATCGGGTCTTGATGATGTTGATTGGCCGCAAAGCACACAAACACCATGGTATCATAATCATTAGAGCCAGAAGAGGCGTGAACACAGCTGAAGATTCTTATTTAATGGCAGTCCCTGGATCAGTTGCCACAACCAAATGATGAAGTGATGGACCTTGACTTGCTTCAAGCGGTACATCATCAAGGTGGTACGCAAGTAAGAACCGGTCTTTACCTGAAACAACTGGATGTTGACGCCAAAATCAAAGTCCTTTTGATGACGCAACTTATGATTTATTTATGACTGGATGATTTAAGCGCATATCGGTTCAGGATTTAGATCAGCCTGCTGCTCCTGACCATGGCCTTTGGCTTGCTTTACGAAAAATCTAAGGGCTTAAATATTAAACACCAAGTCAAGGTCGTTATTATACATGCTAAGTGAAGAGGTGCCATCGAAGTCATAGGCGAATCAAGAAGGAAGTATTGAGCCATTAGCTGTTAGTTGATCCATTAACGTCAAGACTTTGCCAATCAATCTACATATGACTTCTTTATGCCTCAAAGCGATAAACTATTTCGAAGCATATCTTGGCAACAAGCAGTTTTAGGTAGAAGCAATTTTCTTTGTTAGTTTATGAAGAAGAAAATTATGATCAATTAGATATTATCAATGATTCTTACTCAGGACACTGCAACTTAACTTATTTCCATGTTGAATCTTGCCATTAGTCATTGTAGAGGTCATTAATGAGCCAACGGACGTTGTTTTCATTACAACTGAACACCACATAGTAAGGACTAACGGAGCAGTCACCGACCAGCCAAGCAGCAGAAATCATTACAACATTATGGCATTCATTTATGGCGAAACCATCCGTTGACGATATATTCGCTCAATGGCTTACCAAATGATCACCTCTCGCAGTCAATCATGCCTTCTTTGCTAACAGTACGGATGAGTCACCATGGCTAAAGTTCAACGTTGCACTCATTTGATCCAATCAGCAATTTTGCTAATGGCAATTATTTCATTGCCATGGAACCAGTAGCCGTTTTGCTGTGACAGCGATTATATAATCATATTGCACTAGTTTAATAATGACTGCTCATCGCAATCACTTACCAAGATGAAATATATTTGAAAAAATGTTCTATATTGCGAATGGGCAAAACAGATACTAACCTGGATAATTTTGTTGAAGCCGATGTTTGCGACAAGGCCTTCGCCCTAATTTTACAGCTGAAGGTATGACCTTCAAGCGATTGGAAAAATCAATGAGTAGGGCAAGCAAGTAATTTGACTTATCCTGCCACGGATGGCAAAACATCCAAACCAAGCGATGCGCCACGATCCCATATCGTTGATGATGACAATCACTTTGAAGGTTTAGATATCTATCTAGATGGCGCATCACTAAATTCGTACGATTATTTTATCATCTTTTAGATAACTAGACAGCCATTATATACTCTACATAATTACTGAAGAATACCAACAACATCCATTACATTTACAGAAGCTTTATCTCATCGCATACTGATACTTATAGACATCAATTTCAGTTTTGAAATCAATGAATCTTTCAAGATCTTTAAAGTCATCAGCACCATCCACTCATTTATTTTAAATGTTTTCACATCTAATAAACTGAATTCCTTCAGTCAGTATTTTTCACGTTGGGCACGATTTGCAACAATGGAATTTTCTTCTTTTCTTACGGAAGCCTGGAACATATTCAATCTAGATTCTAATAGTTTTTAGAAGATGTTCGATACTGACACATCCGCTGTTCTTGTCGTTGATTGATTCAAATATCGATAAGTGGTGACAATTTTCAATTTTTCAGTTGTTGGGCATATTCGTCTACCCTTCTTTGAATCACGTTTCGCATCGATCAACAACATTATCTCCTCTTTCTTCTTACTTGGACACCAAGAAGTCGCCGCAGGATTCCGTCATTGCAAAGTTCTCCACAAAGCCGATCACACATCCATAATTTCTTCGATCAAACTACTGATAAATATTCCTCATCAAGCACCCGAACGAAAGCACATGGGCAGCAACTTCCCCATCACCGTTCTTCAGATTTAGACCAATCATGTGCAGGTAAAGCAAGTACCTTGACTCAGCAAATTTCACAACCAAAGCAACTGAAGCTCTAATACTAAGCCTCGATTAATTGCCCCATTTCTCAAAAGAAAAAGACGAATTTTGAGCAATTGATTCAAATATTGAAAAGCGTCAAATGACCAAATTTCTTTTCTAAACCAATTTCAAATCATCTCAGCCACTCAATCTTATCGGTTGCCCTTCGCATACAAACTGTTCATCTTGGCCAACAACAAGTACGACTCATACCTACCTCAGCCATCAACATACTACTACTCTTGCATTACTAGCGACTCACACACATTTCGTCGATAACGATTCAGTTCCGCAGCTGCATCACAAATGCTATCAGCAGCTTCAGGGGTTCAGAGTAATTCACGATCAGATAGACCTCTGGCGGGTTGCAAACTTCGCCCAGCCCCTTGGCCGCCACTGGCGGCAAAAATTCTTTATCAGATATTCGTAACACCATTCCCAAATCATGCAGGATTGCCTCTTGAGCAGCTTTCAATTGCAGCAACAACAAAATCTTCCAGTAAGTAACGCAGAGGTGCATGTTCTTGTCTTACCTAACCAGTTCTCGATGTTGAAGATATCTTGAACACCTTCTTTCAGCATCCTTCATTTGGTGCCATTTCTTCTTCTGCTGATGAAGCCTAATTCAAATTTCGGCCCCGTTTACCGACTCATCTGGATGCTCATAGAGGCATCGAAGCTGACCTCACGCAGCAGCAATTGTCTCAATTCTTGGGCATCTTCCTCAATCAATTCCAGCTCATTGACCATCTCTTCTTATCAGCGATTCGTCATGATACCAAGCATCAAGAAATATCGGGACGACATGGTAAGGATGGTCTTCCTCAGATTATTTCACACTCAATTGTCGTCCTGGGAAGGTGGCAGCGACCACCAGCTTGTTGATGTCCTCGTTCATCTCATCATTCACTGCTTGTTCCTTCAATCGTTTCCTTCCCGAGCTTTTCACTCGAACGGATTTCCTTTCCGTATCTTTCCAACAAGTCTTCAACAATCATCCCTCCAGTTGGCAATGACTTCCAACAAACTGCTGAACCTCAAAATGGACATGATTCACCTATTTGTCCCTTGGACATTGTTCAACGCAGCTTCTTCTTCTTTGGTTGCCACGGAAAAGATTCAGCGGCTCGTCGACCCATGTCGTCGATCTTCTCGTACTTCTTCAGCCCTTCAGCAATGGAATCGCTTTCTCATCCGAATTAACCATGACCTTGATAGCAATTGATTGAAGATGACGAAGTCGAGGCCAGTTCACGCCTGCTGCTCCACGGCCTCCGCTGGCTGTTGTTCAGCGTTCATCGATTTCTCGCAGCAGCAGGATCGCTCTTGATACATTCGATCTATGAAAGCGATGCCATCGAGCCAGATTTCCTGACGGGTCGCATCTGGATCGCGGAAGTTCGGGTCTGTGCAATTGAGATGACGGCTCGATCTGGATTCTGGACAGGCGATAACCAGGCGTAACGGGAAGAGGTAGCGGTGACGGGCTGGCGACTCAAACCGAACTGGTCGCGCCGTTGGTGCTTGGGTGATCATCTCGATCACGTCTTGCTGATGTTCAGCAGGTTCAGCTCCTCGATGACCTTCTCGAACAGCGCCAGTGCGCCGGTCTCCAATCGAGAGCAAGCTCTCGGCGGCGACCCGGCGTTGATCGGATCATCTTGATCCCTCGAAGATCGGTGCTGCTCGCTGATCATCAGCTCCTGGCTGGCCGGCGTCGCCGTGGCGATGTCGTAGTCGCTTCACCGTTGGCCAACTGAACCTGATCACTCCCTGCTCCTCTTCGAAATCCTGTAGCAACCCGAACGAAATCAGCTCTGCTTTCTTCAGGTTGGGCGCTCCAATTGGCATGTCTTCTTTGAAGCGGGCAGCGATGTTGGTGACGTGCTGGCTGGTCTGGATCGCGCTGACAAGGACTCTGGTTCGCGACCCAGGGCAAGATCAGCGATACGCCGCTGGCATGACCTTCTGGTCGAAGAAGTCGAGCGAGCGACAGCCGTACCTTCTCACGTTTACTCCATCGTCGATAGATACTGACGTGCCGAGCGGCGCAGCGAAAAACGCATGAGCACGACAACTGGGC + diff --git a/tests/data/barcode_demux/double_end/SQK-RPB004_BC11.fastq b/tests/data/barcode_demux/double_end/SQK-RPB004_BC11.fastq index 1dfd74c7..abd09848 100644 --- a/tests/data/barcode_demux/double_end/SQK-RPB004_BC11.fastq +++ b/tests/data/barcode_demux/double_end/SQK-RPB004_BC11.fastq @@ -2,10 +2,6 @@ ATATGCCCTATACTCGTTGGTATCGGCGTTCATTGTAAGGAACCTCTAGTGACGTTTCATCTATCGGAGAAATGAACGTTTCGTTGCGCCGCTTCAATCAGTTTTTCATGGCGTGACAACGTGTCCCAAATTTTAATCATCGACCATCAAAATCAGTGCCCATACCATATTTGCTTGCAAAGATTTTAATCATAAAATTTCATTATCTAAAATCACATCAACAATGATTCGTTCATCTCCAATTAAAATTTTCGGTTGAAAAGCTGTTGAGCCACAATCATCACCGCTGCTTTTGACCACCTAGAAAGCTCGTGCAATTCATCAGTTAAATGTACTCTGGATTAGCAATTCCTACGTCTTTAATAATAAAATTGCTGTTTAGCTTTTTCACTTTTGACCACCTTTTGTAATTAAGAAATTTTCACGACAATTTTTTCAACACAATGCAATCAATAAACCATCAAAGGCTCCTAAAATAATGGCAATATAATGACCACGTAAATCATCGCCCCAGTCTTTCGTCTATTTGTAAATATTTTTTCGATCAATAATAATACGACCAGATGTTTCATTTTTCTGGTCATAATCCCATCATCGCCAAGCAGCTTATCTTTTTACCTGAACCACTTTCACCTTCAATTGCAGCGTTATCACCAGCTTTTCATGTAAATTAAGACCTCTAATAAAGGCTTCATCTGCGTGAAAGGTTTCTTATATTTAGCAATTTCTTTAGCATCTACATTTAAGTTATTTTCTTGGGTCAAATGCGTAACCACCTCCCGATATTAAATTAATAATGACAACGCAAAAGCAAGGCTTAAAAAACCCAGTAGACCAACCACAATTACATCTAAGTTATTTTTACCTTGCAACAACAACTCACCTAAAGAGCTGCATCTAGCAGCAAACCATGGCCCGAAATCAAGTACGACTATTAAAGCAATCATATTATCATAAAAATAAAAGGTAACTACAATAGACTCAAGTTTATGGCATTTGGCAAAATGTGCTTAAAGGAGTGATTGCTGTATCAGTTAAACCTAAGCCCTAAGCCGCACGTACGTAGTAAGACACACTTAAAAATCGGCACGGACAATTCCAACCAGTGCAGCTAAAGAAACAGCATAATGAATGACAATATCATTTCGGTAAACATGCTGACCAGAATCATGACCATAAAGAGGGCGTAGGTAAACCGTCCAACTTCTACCAACCGTCCCTGACAATCAACCCCAGCCACCATCATAGCCTTGTCCAGCACCAGCAAAAATGCCAAGTCATGCTGCACAACCTGTAAAAAGCCAACCGAAAAGTAATGAACTCTAAGCCCCATAGAGAATAACCCGAACGCATACGCCCTTGGTCATCTGTTCCTAACCAGTACTGCTTCCAAGGTGGCGATGGCACAGGTACAGCCAAGTCTAATTTCAGAGTGTATTGGTCAAGAAAATCTTTATAAGGGGCCCAATAGCCATCCCTTATCCGTAACATTACTTGTACAGCGGGATCTTTATAATCTGCTTTCTTCAAACACGCCACCAACAAGTCGTTTCGGATATGTTTTAAAGACAGGCAAAATAATAAGCGTCCTCATATTTCACCAACAATGGTTTGTCATTAGCGATAGACTTCGCGGCCCAAGGACAGTCGTCGAATGAGTGACAAAACAAGACTTTGCCACCCTAATTTATGCTGTCCTAAACTTCCATAGTAATGGCACAATAAAAGACATTATTGCCTCCTCTGTGCGTCAAAGTGAATACGCAGATGATCCATTTGATACAGCATCACAAATAGTGGTAAAGACAGACTGAACAGCATAAAGAAAATGATGCACCAAAAATAACAATAGTCGCTGAACAATTGCTTCAACAACAACTACCACACCATCTCCATCAAAGATGATTTCAATAAAAAGATTCCCACAATACCTGCTAAAGCCTCAGGCAATCCAGCAAGACCACTAAAATGGCATTTGAATACATGTTTATATAAGACTTATTGGCTATTTGAACCTTTTGCATTAAGCTGCCAAGACATAAGGTTGTTCACTCTCTAGAAGATGAATGTTGGTGCAGATGGTTGGGCTGGCGAAACTGCACCAATTAATCGTAATAGCAGCAACGTCATGTGCCATCAATCATCGTGATTTTTACAATTACAACGATGATAAAGATATTGACACTAGGTTTTGTAATAGAAGACTAGCTGAAAATATGAACCGCCGCAGAAAGAACAATGAGCGGTATTACCAAAATAAAGGCCACAAAGTCATTCCCTCCGATAGATGAAACATCATT + &&$%&%&('&&%%'%%%'$$$$%$##$####$%%###$$$$$###$$$$%%%*)*+*'),,--.'&&''&'(-&&&%&,,,+***))+11117984441***)(''''*&&&&()))''('')(%&&%&(*-/+)))')*'%%$$&&',**)*+---+,,,*+100112221)+++&'((((*8207,+)%''&')+++'&%$$$$$%%'**+---'''&%$$%&&').,----0766&&%%&+*))))..-&$$$%&&'&))**('%$%%&'''(((1//--,*'%''&%%%)*.//0141(&%%&)-0'&&&'*(('%%%%$$$$#$'&')'''()+./20,,,,,,116721))((**'/6222-./)),))*)'''$&'',+)))''''(,)))*..//.*)(+((''&&&'(**.251/.,,,.0456=>@1000,,*))**---'&&%%),).)('''*)*('&'('('(*)()&%&&(3/.''&&)+,+-((&&&(%%$$##$&&&'(''()%&((*+++*&))((())***.1'&&&&*,,'((''(()000///...1+)'&&&&64+*)((&''&&,(()))/0044665-''(+++'%'%%%%%42121/01123766645;:;:99::9551++**('')+...---,,-++---'&&&)))(((**''%%$%%/3332(%%&&&&%%%%%&,*..,'%((&'),++&%&&&&%%&$%&&)*//06689===753332100000,&&'&%%%%%&($$$$$$$&()).-'''''+%%&(,)*++,-)))('(%%&&(***)&&&&&'&&'()+*+,++('&&%%%&'&&%$$$%%&&)(()((),)),,+,))**+''''&%))%%%%%%%,,,1212---/0//(((()346679::,,)((*,***),,'&&'&&%%%&&&&'*0+*)((&&'''))*++***-./0'%%%&).**++**(((&&'''&'&$$$%&(,/0000.+,0+***+,,)'&&%&(((&'&%%&(&''''**)+**)(((%%%$$$%&'&&(('(++***+*+((&%&%$$##$%$%&''())+*%$$&&'&&&&&'((%%%$$$$&'&'''&'&&()--,+,-,++*++'''&&%%$$%''%%%%&$%$$%%&%&'&&&'&'''''((*35332222)(('''()(''$$%&&&',,((''%%%&&&&%%$%&)(&'%%&&&''&&)*('''(1+322.-.--(&&%&&('))(%%$$$%')'&&&')//2011--..((((()...0)(((+*)&&&())(''&%$$'**-+1+++(((((**)*)+,-,..&&&&&+*'&%%%$%&(%&&&%'%%%)(*(((+,1))*))++000011--*)(()/-+(&&'+/0//)))().-....//0)((((,,,-/%%$$&&&(''(()(('%%&$$%$%)++),+*)%%&))))))**''(&&''&''(***+,++)(%%%%$$##$&%'&&%&&$&&%%&'&&%&&(((*)))(&$$$$%*)$$%%%&))*&%%&&()(&&&&%&'''.0..-(%$$&&'**+--(()),)))(((((,+++&&&%$%&(.22)'''(*(')))+,)&&&&&'%$%%$%%%&%$%%%&&''(-,))*))('(('''&'('&$%%%%'&&((''%$%%&()&&(()++**+*+,++(&%&%%%&%('%%''%%&'')&%&&&)))('(()+*))*)),&'*)(*(%&&&&'%%%%'(&())((''''&&&'&%&$%&&&'&(+***(&&''*)))*).--,,-'''&'('%%%&%%&(((((,(((((+**)(()((----,.-,+%$$$$%%&(&%&-((((),,+**)(''(&&(***+,+,+)(''&&'&%$$$%%&'((./..000.+,,&&&&()*+)&&(&(*''&))))-.--223771///0,--+,..,*)'''('()&&&&('%&&&((%%&&'())*//.)*((&&%%%&'$$%%&02142*&&(()//.,''&&'('''('('&&*+%%&&%%%),'''%%%'''&&&&'(()))*,(&&%$$'%%%$$%&%%&%%%%%'))(&&&')'''((&%%%%()+,..03(''''(+,3336678755688.)))&&%%$%&''''*(()''&(((&%%$%'%%'*)*())'&&''(+.023989985'%%%%'''')$$$%&(,/3232334489:65545610/,&')('()('&%%&%%%&$$&*'%%%%%%%%$$%(&&&$$$$%(/3587;;<><:::96555//-,+& -@6b684a94-d3db-489a-85f1-dbcaec34b6b1 -TTGCATGTATTGATTCAGAGATTGCTTTCTCATGGCTTCAGATTTAGCAAGCACCTCTCTCATGACATTTGGTCTTGGGAGGGAATGGATCATTTTGGTGCGCCGCTTCAACAGCGGATCCTGAACTGAGGACATTGGACTCGTTTCCCCGGTGGCGTTTATGGGCTCGTTACCAGGATGAAGAACTTATTCGTTAGTTGTTATTTCTGGGTGCATCAGCAACACCAGTACAGTTGAACAGCTGTGAATATCGTTGCATCGTCGCAATCTTTGCTTTTGAATCCATGGACCACAGTCTTTACATGACCTCGTGGATCTGTAGGAAGCATTATTCCCAACCTTCACCAGCAACTTATGTTGTTCGCACTTTAATGACTACCTTGATCGCAGCACGAATTACTTGGCCCTGACTTATCTGACCGCCGCAATGTCATAGATGATCTTAAGAGTCTCGTAAATATTCAAGAACAATTCAACACAAAGTATTGGCGGATCAAGTGGCGCATGCTTGAAGCAGAGCGCCCTGCGATTGGCTCTGTGATGAAATCATATATGATTATTCTTCACTTTTTCAGCGGTTGCGTTTTAGGATTTTTGGCACATTGACGCGTCTATCAACTAGCTTGTCGACGGCGTAAAGTTGCATTAGTTTCAGGTACTATTTAAGGCTTGCAAAAGTTTTATGAATCTTTACACACCATGTCACCCATAGCCACATTTAACTTCTTCTGTCTTATATAATTTATAAAGATAGTCGCTTTAATAGTTCCATATATTGCTGCAGCGACAACCTTGAGCTTTTTACCTTCGTTGGTAAGCTTACATCGTCGCAAGTACCTTCTTATTCGCACTCTTTGCCGTCGTAATCGTCTTAATACTTATCTATTGAAGATATTTATATCATATTATTTCTCGCAATACTTTAGCCGCTTGAATACTTTCTGAAGATAAAGAACATTTGGCGCACAGCCGCTATTACCACCGAAAACTAGGCATCTTTTTAGCAATACGACGGTCCATCATCATTACATGGCCGTACCAAACCAATTATATTTGTGCCATTTTTAGTCATTATGACGCCTATGGAAGGGTTCAAACCTATGTTAAGACAAAATGCAGAGTTAAGCCGAAAGAAGCCAAGTATTATTTTTGGTATTTTTTAAAGACACTTACAAAAATTGGACGTTATTTTTTCGTAAAATTTATAGATTTAACTCAGTCAAACGTACTGTTTTTCTCAATGACTTACTTCAACAGCGTGATTGTTTCAACATTTATATTAAAAGAACAATCAACCATGCCACCTACTGTTTCAGAAGCAGGTAAAACAGTTTCTGCGGATGCAATTTGGAGATTTTAACTTTGGTGGGTCATGGTACAGTTACGGTTTTTTCTTTTGATCTTTGCTTTACCAAACACCAGAACATCGCTTTGATAAAGAAGTATATTTAAGAGCAGCTGAAGCTTACAATTTTAGCATCAATTTTCCCTTCAATCAGCATTTAGAAGTGACGCATTCTTCTGGTCCAATTATTGCTTATGGCTTAAGCTTTAATTGATTAACATGTACAGCGTGCCTTGATTGACATTATTACGATGAAGATGACAGCAGTTTCTACCTAGCCTATTTCCGTAATAACATTGCATGCATCGTTTACCTCATTGATCATCTCTTTGTTGAACATGACGGAAAAATTAGGCGGTGCTCGTTTAGACAATGTTATTCTATACGCTTGTCCATTACTTGAAGCCGAACTTCTTTCCTTAAATGGAAGAAGTAGCAAACTTAAAGAAACAGATGACGATATGCGGATGCGTTCGATTGGATACAAATCTTTATTCAACAAGATGTTGACGAAGGCGATTTTTGATTAGCTCTACACCAAATACTGAAGACAGTGATTCGGCTTGTTGTAGCGACACCTGTTGAACAAAGACCTTGTGAACATTATTTACATGACTTTGACTTTAATCACTGGTCAGATGGTTCGGCAATATTTCAGCAGAACAGTGAAGAATACGAGTCATTATTAGGCCAACGTTATCAGATTTCTTATGGGGTTTAACTTATCCTGATTCTTTGACGGAGGCATTATTCCAGAAGCTTTATCAAGAGTTTGACTGAAGCGACTACATTGGAACAATGACCAAGGCCGATCAGTTGATCAACGATTCCGTTCAAATGGCACGGCGCAGCAGGTAAGTACTTTGGTGGAAGTAACATTACAATGTAGTAGACGGCGCAGCGAAAATTA -+ -$&&$%%%%%%%$$&'''&%$%$&$$$$$#$#####"#$$%&&%%$$$$%%%%%&')%&%&%%&&&%$$$%#$$$$$$$%&&&('*)&'''&''')*&%%%&())''())*,-+''&''&&)-*(()*%%%%$$%&&%$$%%%&&'(''('%%%$%&%%&(%'%%%%()'''%&&%%'''''&%%%##$$$$$%&&&&'()))'&))(*%%%$%&%%%&&&%'('(*(&&%'&(&''*+'%%%%$%%**+*,*))*(),,)((&&'&&'++,*&&'''*'''%$%%'/,+++,/.-+,++*'(++*,,/./--,+-/./1*)))*+,1.--)))(('&')+15//1.,))''')+01*****.'&&&&')'%&%&-01.///0//0)))&&'&&&&'++(%%%$%%%(.///1211,-((()'''&&''(('&&'$%%%$$#$'(&'&&%&'''&&%%%&%&&%%)(&&$$&'(%%&'&&%%%%(''&%%'&&%$$$$$$%&'()******'''(,.0-..))('&'((**++../&&&&%''*()(&'''%&&&%&((%&&&&(&%&&&&'&%%%%$%%%%%$$$'*&%%%'(-,,**)))''))((''''&'&('(''''%$%('''''(()))++*,,-.(('''&&&&''*+++))(%%&'&&&%'((,,***+-----,,((()(***''''&**+.1334472./---../+**)%%&''''((*++,110100-----++%%%%%%%%%%%%%&*%%&&''''*'('((,+**,//100)((((,++'$$$$%&&'(%%%%'$%(('''&&('((''''&&'%%&&&%$$$%(((()&&'&%&((()))+*/0,,-,----+*+*&'%%'%%&&%&&&&'&$$(+--,+-.-/.1('&&&)%%&&&)&&&().,**)(%&''%&&&'((''''''%'&&)('&%%%%%$#$%&'&&&&%&'()'(()*''(&&(('&%%'(('&%%%%&&##$&)*)***+,)(%%%%')((()++***+0..//****''&&%''()+,+))))&%%%%(&&'&%&&'&&')%%%'&'(('&&&)'()--0+*&&&%%''&&)()+.*&%%&%$$&'((()'&&%%%$#$&%%%%%(*))('&&&%&&((*,.2)))))*01.2*&&%&%%%%%$%%%%'()**''('''&&+++*,('&&&))***))'''()+'''&$$$%$&&&&*0,*)(()),0-.,(((&&&&%$$%%$$%%%&&%&&'()''''$$$$$&('&'())''(),,,(),,*())&$&$$$$%'(&&&%%&%%''&&'(),+*%%%%%%'++*((&%$$#$$%$#$$'$%$$&((''&''&'&$$$$$%$%&'&%%$%$$$$$%%'(+-+,-''('(&''&'))&')))%%&$$%$$()('&&&%'%%%''-*,-(((()/(%$$$$'(&'&%%'''''&&'**+,)($$%)*-&&&&&&)))),-((''(%%&'(''%##$$$$%&')*((&&$##$$%$%&%&%%%&&)-/0/012021211/)(&'01.)*))((('&%%%%'))*)))'''(*11)((&%%%''*&%&$$$&''')+)(&&&%$&%%%&%$&%%')%%%%%%'-,&(&&&&'(()''(%&''%%%%%%&(('))(&&'*('($$$$$$$%(%&'''*+((&%&&%%%%'''(&&&%%&%$%&$$$&&''&%%%)*+-+*,**'((//013)''''-0/013-++++-)('%')'()+()))''(&',11/0,,*)))()))*0)&&&&'(&(+**&&&%&&''&&%%%%%&$$$$$%%%%%%%%&%%%%&'((%'($$%%%%&&((('&&)))*)*+(((((.-*)))&'()'(&&'(+--65...,-0/*)'''&&&''%&&&*'&&&%$%%&&&&''()*+,,,,+,,+.--,-*)'''-*+*&%%%&('((((++)%%%&&'&'(&&&&$%(&&&%%&&)**())(+.,,,*'(((%&&(''(+))*1/.,+*))(())*,)))((+*,,,+)**&&('')(+,)*/++&%%%%''&&&''%%&&&()&''''(&()'()))**/5((('')('((()++&&''(*)*+,,&%$$$%%%%&$$%%$%'&%%%&%%%%%%'''&&&&)))'%%%$$$&(&&''%''(())((&%&('(*'%%$$$%()**++))'&%%%&&((')**)))&'))('''&&&&&(*%%%%%%%&&+,-+++)))**))% @f4154004-052a-4399-9a68-c674accc524d TATGTGTTGACTGGTGTTGAATCAACCCGATCCGGCCCCTGGAGGCGCCCGTGACGTTTCATCTATCGGAGGGAATGGACGTTTTTCGTGCGCGCTTTTTCATGGCGTTGTTGCCATTGATAATGGAAATATTAGTCAAAGAACTTAAGCATAAAAAAGTTCTTAATTGCCCGTAGTGGTCCTGTTGTCGAGTCTTTAGAAAGTGCGCTTGGTTTCATGCAGGTATTGGTTCGGCCATGTTCCATGAAGGTATGAGTTTGCTTGAGCGTGACCAAGCGGCTACTTGCAGAAGAACTTACGCATTATCAATCTTTCTTGTTCAGAAATTGGTTCAGAAGGCCGTAACTTCCGTTTGCGAGTGATTTAATTTTATTTGATTTGCCTGCAAACCCTGACGTTTTTAGAACAACGTATTAATTTGGCGCTTAACGGTGTTCGTAAGAAAACCGTATCCAATATCCATGTACCTTACTTAATGGGCAAGCGTTTCAAGAGCGTATGTTCAGTTATCTGGTATAATGAAGCATTAAATATTTTGTCGCAATATTTCTCCAACAAGTAGAACCTTTCAAGAAAGCTTATTGTTTGATTCAAAAGATTGCTTTCTTCCGACCAAGGTCAAACATTTGAAGATTTACTCGGAAAAATGCAGTGTTTTTACTTCAAGCACAGACTTAGAAGCAAGTTACAAGCTGGTCAAGATCGTTTGCTAGAATTAGTTCGTATCATCTATATCGTTCGCAAATTATTGCTAAAGCATTTTTAGACAAGCTATGATGACAACACGACGTTACCTATGTTTGTGAAGCGTTTGATGTCTTTAACTAATATTGACTTTGATCAGCAAGTAACGGTACAGTCATTATTGATCTAACCGACCAGATGCAAGTGCAAGGTTTGGCGTTAGATGAAGAAGGTACTCATGAGACTTACTATCGTGACCAAGCGCAAATGCGTGAAGATGCTCCATCTCATTTTAGAACATCCATTTATTTGAAAGTGTCATGGAAATGATCCGTACCAATCATTTGGTAGTACCAATGTTGCTGTGCTTAAGTCCAATGCATTAAAGCAAGGTTGGGTTTACTTATGGATTTCTGGTTTAAAGTTGATGGTGGCACCAAAATCCTTGAACTTGCCATCAAGCTTACCAAAACAACTTATTCGCGTATCTTTAGTGGAAGTAGCAGGACTTTATATGAAAAGATTCATCCAACCATTTCAAAACCGTATTTGCATCACTTAGATGGCAATAGCTGCCGCCAAGTGGTTAAAGCCCCACCATTAAAGACGGCGCACGAAAAACGTCCATTCCCTCCGATAGATGAAACGTAGCC + diff --git a/tests/data/barcode_demux/double_end/unclassified.fastq b/tests/data/barcode_demux/double_end/unclassified.fastq index d12b7a34..01f156a8 100644 --- a/tests/data/barcode_demux/double_end/unclassified.fastq +++ b/tests/data/barcode_demux/double_end/unclassified.fastq @@ -10,3 +10,11 @@ $$$#####%%()&$$$$$%####$$$$$#####$$$$$$$$$#$$$$%$$$$$$##$##$$%'''$#$$$$#&'()*, TCATCACCATCGGCCCGGCATTCCGGCCAACGACCCAGCGGCCGCGCTGGCAAGCGATTCCAGGCTGATCTTGTTGGCGACGCTGCTCTGATATCGCTTTGGCCGGCGCCTTGCTGATCGCGCTGCAGCACCACCGTGGTGCGCATCTTCCGCGCCGTTTTCCGCCTACATCAACGTCTCGCCAGGCTCGTTCATCACGCAGGCCCAGGCGTTGAAGTCTACCGCCGGAATC + $%%&&%%%%%%$$&&'')*())'''(*,***((()($%&$$$$$$$$$$$%&&&&&&&'&'%%&''()(&'&$%'&&&%$$$'('%%$%%%&%%%$$#$%$%&&&&&&'(%&&'(+**&%%$$%%%$$$%%&$$$$%$$###$%%%%(''(&'**)*(()-(3****')((())))+))&&%%&(,*++&&&&$$$$&&%%&&&&'&)-+'''%%&&&%%#$$$%'&'&'&& +@6b684a94-d3db-489a-85f1-dbcaec34b6b1 +TTGCATGTATTGATTCAGAGATTGCTTTCTCATGGCTTCAGATTTAGCAAGCACCTCTCTCATGACATTTGGTCTTGGGAGGGAATGGATCATTTTGGTGCGCCGCTTCAACAGCGGATCCTGAACTGAGGACATTGGACTCGTTTCCCCGGTGGCGTTTATGGGCTCGTTACCAGGATGAAGAACTTATTCGTTAGTTGTTATTTCTGGGTGCATCAGCAACACCAGTACAGTTGAACAGCTGTGAATATCGTTGCATCGTCGCAATCTTTGCTTTTGAATCCATGGACCACAGTCTTTACATGACCTCGTGGATCTGTAGGAAGCATTATTCCCAACCTTCACCAGCAACTTATGTTGTTCGCACTTTAATGACTACCTTGATCGCAGCACGAATTACTTGGCCCTGACTTATCTGACCGCCGCAATGTCATAGATGATCTTAAGAGTCTCGTAAATATTCAAGAACAATTCAACACAAAGTATTGGCGGATCAAGTGGCGCATGCTTGAAGCAGAGCGCCCTGCGATTGGCTCTGTGATGAAATCATATATGATTATTCTTCACTTTTTCAGCGGTTGCGTTTTAGGATTTTTGGCACATTGACGCGTCTATCAACTAGCTTGTCGACGGCGTAAAGTTGCATTAGTTTCAGGTACTATTTAAGGCTTGCAAAAGTTTTATGAATCTTTACACACCATGTCACCCATAGCCACATTTAACTTCTTCTGTCTTATATAATTTATAAAGATAGTCGCTTTAATAGTTCCATATATTGCTGCAGCGACAACCTTGAGCTTTTTACCTTCGTTGGTAAGCTTACATCGTCGCAAGTACCTTCTTATTCGCACTCTTTGCCGTCGTAATCGTCTTAATACTTATCTATTGAAGATATTTATATCATATTATTTCTCGCAATACTTTAGCCGCTTGAATACTTTCTGAAGATAAAGAACATTTGGCGCACAGCCGCTATTACCACCGAAAACTAGGCATCTTTTTAGCAATACGACGGTCCATCATCATTACATGGCCGTACCAAACCAATTATATTTGTGCCATTTTTAGTCATTATGACGCCTATGGAAGGGTTCAAACCTATGTTAAGACAAAATGCAGAGTTAAGCCGAAAGAAGCCAAGTATTATTTTTGGTATTTTTTAAAGACACTTACAAAAATTGGACGTTATTTTTTCGTAAAATTTATAGATTTAACTCAGTCAAACGTACTGTTTTTCTCAATGACTTACTTCAACAGCGTGATTGTTTCAACATTTATATTAAAAGAACAATCAACCATGCCACCTACTGTTTCAGAAGCAGGTAAAACAGTTTCTGCGGATGCAATTTGGAGATTTTAACTTTGGTGGGTCATGGTACAGTTACGGTTTTTTCTTTTGATCTTTGCTTTACCAAACACCAGAACATCGCTTTGATAAAGAAGTATATTTAAGAGCAGCTGAAGCTTACAATTTTAGCATCAATTTTCCCTTCAATCAGCATTTAGAAGTGACGCATTCTTCTGGTCCAATTATTGCTTATGGCTTAAGCTTTAATTGATTAACATGTACAGCGTGCCTTGATTGACATTATTACGATGAAGATGACAGCAGTTTCTACCTAGCCTATTTCCGTAATAACATTGCATGCATCGTTTACCTCATTGATCATCTCTTTGTTGAACATGACGGAAAAATTAGGCGGTGCTCGTTTAGACAATGTTATTCTATACGCTTGTCCATTACTTGAAGCCGAACTTCTTTCCTTAAATGGAAGAAGTAGCAAACTTAAAGAAACAGATGACGATATGCGGATGCGTTCGATTGGATACAAATCTTTATTCAACAAGATGTTGACGAAGGCGATTTTTGATTAGCTCTACACCAAATACTGAAGACAGTGATTCGGCTTGTTGTAGCGACACCTGTTGAACAAAGACCTTGTGAACATTATTTACATGACTTTGACTTTAATCACTGGTCAGATGGTTCGGCAATATTTCAGCAGAACAGTGAAGAATACGAGTCATTATTAGGCCAACGTTATCAGATTTCTTATGGGGTTTAACTTATCCTGATTCTTTGACGGAGGCATTATTCCAGAAGCTTTATCAAGAGTTTGACTGAAGCGACTACATTGGAACAATGACCAAGGCCGATCAGTTGATCAACGATTCCGTTCAAATGGCACGGCGCAGCAGGTAAGTACTTTGGTGGAAGTAACATTACAATGTAGTAGACGGCGCAGCGAAAATTA ++ +$&&$%%%%%%%$$&'''&%$%$&$$$$$#$#####"#$$%&&%%$$$$%%%%%&')%&%&%%&&&%$$$%#$$$$$$$%&&&('*)&'''&''')*&%%%&())''())*,-+''&''&&)-*(()*%%%%$$%&&%$$%%%&&'(''('%%%$%&%%&(%'%%%%()'''%&&%%'''''&%%%##$$$$$%&&&&'()))'&))(*%%%$%&%%%&&&%'('(*(&&%'&(&''*+'%%%%$%%**+*,*))*(),,)((&&'&&'++,*&&'''*'''%$%%'/,+++,/.-+,++*'(++*,,/./--,+-/./1*)))*+,1.--)))(('&')+15//1.,))''')+01*****.'&&&&')'%&%&-01.///0//0)))&&'&&&&'++(%%%$%%%(.///1211,-((()'''&&''(('&&'$%%%$$#$'(&'&&%&'''&&%%%&%&&%%)(&&$$&'(%%&'&&%%%%(''&%%'&&%$$$$$$%&'()******'''(,.0-..))('&'((**++../&&&&%''*()(&'''%&&&%&((%&&&&(&%&&&&'&%%%%$%%%%%$$$'*&%%%'(-,,**)))''))((''''&'&('(''''%$%('''''(()))++*,,-.(('''&&&&''*+++))(%%&'&&&%'((,,***+-----,,((()(***''''&**+.1334472./---../+**)%%&''''((*++,110100-----++%%%%%%%%%%%%%&*%%&&''''*'('((,+**,//100)((((,++'$$$$%&&'(%%%%'$%(('''&&('((''''&&'%%&&&%$$$%(((()&&'&%&((()))+*/0,,-,----+*+*&'%%'%%&&%&&&&'&$$(+--,+-.-/.1('&&&)%%&&&)&&&().,**)(%&''%&&&'((''''''%'&&)('&%%%%%$#$%&'&&&&%&'()'(()*''(&&(('&%%'(('&%%%%&&##$&)*)***+,)(%%%%')((()++***+0..//****''&&%''()+,+))))&%%%%(&&'&%&&'&&')%%%'&'(('&&&)'()--0+*&&&%%''&&)()+.*&%%&%$$&'((()'&&%%%$#$&%%%%%(*))('&&&%&&((*,.2)))))*01.2*&&%&%%%%%$%%%%'()**''('''&&+++*,('&&&))***))'''()+'''&$$$%$&&&&*0,*)(()),0-.,(((&&&&%$$%%$$%%%&&%&&'()''''$$$$$&('&'())''(),,,(),,*())&$&$$$$%'(&&&%%&%%''&&'(),+*%%%%%%'++*((&%$$#$$%$#$$'$%$$&((''&''&'&$$$$$%$%&'&%%$%$$$$$%%'(+-+,-''('(&''&'))&')))%%&$$%$$()('&&&%'%%%''-*,-(((()/(%$$$$'(&'&%%'''''&&'**+,)($$%)*-&&&&&&)))),-((''(%%&'(''%##$$$$%&')*((&&$##$$%$%&%&%%%&&)-/0/012021211/)(&'01.)*))((('&%%%%'))*)))'''(*11)((&%%%''*&%&$$$&''')+)(&&&%$&%%%&%$&%%')%%%%%%'-,&(&&&&'(()''(%&''%%%%%%&(('))(&&'*('($$$$$$$%(%&'''*+((&%&&%%%%'''(&&&%%&%$%&$$$&&''&%%%)*+-+*,**'((//013)''''-0/013-++++-)('%')'()+()))''(&',11/0,,*)))()))*0)&&&&'(&(+**&&&%&&''&&%%%%%&$$$$$%%%%%%%%&%%%%&'((%'($$%%%%&&((('&&)))*)*+(((((.-*)))&'()'(&&'(+--65...,-0/*)'''&&&''%&&&*'&&&%$%%&&&&''()*+,,,,+,,+.--,-*)'''-*+*&%%%&('((((++)%%%&&'&'(&&&&$%(&&&%%&&)**())(+.,,,*'(((%&&(''(+))*1/.,+*))(())*,)))((+*,,,+)**&&('')(+,)*/++&%%%%''&&&''%%&&&()&''''(&()'()))**/5((('')('((()++&&''(*)*+,,&%$$$%%%%&$$%%$%'&%%%&%%%%%%'''&&&&)))'%%%$$$&(&&''%''(())((&%&('(*'%%$$$%()**++))'&%%%&&((')**)))&'))('''&&&&&(*%%%%%%%&&+,-+++)))**))% +@eb642113-7d12-475d-ad0c-d6dd1998e04c +GCAGCCTGATCCTCCTGCTGTCGGCACCATGCCTGGGTCAGGGCATGAACCCCCTGGATGCGCCACAAGACGGTTAAATGTCTGAACACCGCGACCTGGTGATGGTCTCGACGCGGTAGCTGGTGTTGCGCATGATGCTGATCCGCGCCGTCGTACCGGTCCCCGACGACATCCGCCTGAAAGAGCAGTCGCCGTGGTGCGCCGTGGGTTAGCGCCATGGTCACATCAACCTGCTGCTGCACCAGACGATCGCCGCTCGCCAGACGAGGCACTGAACCAAACAGCACAAGATTGGCATGAAAACAACCGCAACCTGGCTGTTGCCATACCATATGCCACCCGCATCCATTATTACAAATAATATCGTAATCACAATAGCTGCTCTACTTTGTAATTAAACTTCCTGCATTTCCCCAAAATAACCACACGCCATCTGGGCTGGGCGAAATTTACTGGCAGGTGTTGTCATTGAAACATGGCGTTTTGGTACACCCGTATGCACACGACGCGTCAGGCTGGCCATCAACTTTGAATTTGACGATATAAGGCAATATCATTTGATTGATCAGGAATTTTATTTATCACACCTAAAGCAGCACCAAAAACGCAAAAGAACACAACAATTTTTCTGGTGA ++ +$$$&%$$##%%&)&%&(()**'&&'''''(''&&&(((%$$$$%%''''(''''()'&%%&%%%%%&&&%%%%%&$$$$$%%$$$$%%%%$$$$%%''''')/)((((&&%$$%$%%$###$%'(&&$$$&%%%%$$$'(&&%$$#$$$%%%$$$$%%%%$$%%%%%$$%&'()(((*)&%%%&%%$%%$$%%%&&&&%%%&&&'&&&%%$####$&'*('&%%$%%&&((*,,+)''&'&%%$$%%%%%%$%&&&&%$$%&&&&%%''&&'&&&''&&&&&%%%$$$$$%$&&'(''((()-,('&%$$#$$###$$%'('%$%%%&%&%%%%$%&%&''%&&%&'&&&&%&$$##$&&&&%%$$$%%&'&&%%%('''$$$%%%%&)**-''&'%&&$''&''&(')))'))(&()))('''')''&''(*)(&%$#$#$$#$$$#$$%&&'&%&%%%%%%$$$$$$$%%&&%'($##$$$$$#$%$$%%&&&&&%%'%%'&%%&$$%&%&%&%&'(())+'%%%&*($$$$&+'''%%%%%%%&&%%&'&&%$$$%$%$$$$%%%))(*+-.,(''&$%$#$#$#$#$'(&%$$#$$$%'-00('%%&((*('&(&&&'(''(((%$%%&%$ From 8c91ec5c93078f4d3c6f4b871983ae75a581021a Mon Sep 17 00:00:00 2001 From: Joyjit Daw <1127155+tijyojwad@users.noreply.github.com> Date: Mon, 18 Mar 2024 18:18:07 +0000 Subject: [PATCH 19/23] Replace score with penalty as the new scoring methodology depends on the edit distance, and the lower the distance the better the "score" is. --- documentation/CustomBarcodes.md | 20 +- dorado/demux/BarcodeClassifier.cpp | 207 +++++++++--------- dorado/demux/BarcodeClassifier.h | 5 - dorado/demux/Trimmer.cpp | 2 +- dorado/demux/parse_custom_kit.cpp | 16 +- dorado/utils/barcode_kits.cpp | 4 +- dorado/utils/barcode_kits.h | 4 +- dorado/utils/types.h | 6 +- tests/CustomBarcodeParserTest.cpp | 17 +- .../flank_free_arrangement.toml | 8 + .../custom_barcodes/scoring_params.toml | 4 +- tests/test_simple_basecaller_execution.sh | 4 +- 12 files changed, 160 insertions(+), 137 deletions(-) create mode 100644 tests/data/barcode_demux/custom_barcodes/flank_free_arrangement.toml diff --git a/documentation/CustomBarcodes.md b/documentation/CustomBarcodes.md index e50e7842..d7a56e72 100644 --- a/documentation/CustomBarcodes.md +++ b/documentation/CustomBarcodes.md @@ -41,9 +41,9 @@ last_index = 96 ## Scoring options [scoring] -max_barcode_cost = 11 +max_barcode_penalty = 11 barcode_end_proximity = 75 -min_barcode_score_dist = 3 +min_barcode_penalty_dist = 3 min_separation_only_dist = 6 flank_left_pad = 5 flank_right_pad = 10 @@ -77,26 +77,26 @@ Dorado maintains a default set of parameters for scoring each barcode to determi The classification heuristic applied by Dorado is the following - 1. Dorado uses the flanking sequences defined in `maskX_front/rear` to find a window in the read where the barcode is situated. 2. For double ended barcodes, the __best__ window (either from the front or rear of the read) is chosen based on the alignment of the flanking mask sequences. -3. After choosing the best window for an arrangement, each barcode candidate within the arrangement is aligned to the subsequence within the window. The alignment may optionally consider additional bases from the preceding/succeeding flank (as specifed in the `flank_left_pad` and `flank_right_pad` parameters). The edit distance of this alignment is assigned as a score to each barcode. +3. After choosing the best window for an arrangement, each barcode candidate within the arrangement is aligned to the subsequence within the window. The alignment may optionally consider additional bases from the preceding/succeeding flank (as specifed in the `flank_left_pad` and `flank_right_pad` parameters). The edit distance of this alignment is assigned as a penalty to each barcode. -Once barcodes are sorted by barcode score, the top candidate is checked against the following rules - -1. Is the barcode score below `max_barcode_cost` and the distance between top 2 barcode scores greater than `min_barcode_score_dist`?. -2. Is the barcode score above `max_barcode_cost` but the distance between top 2 barcodes scores greater then `min_separation_only_dist`? +Once barcodes are sorted by barcode penalty, the top candidate is checked against the following rules - +1. Is the barcode penalty below `max_barcode_penalty` and the distance between top 2 barcode penalties greater than `min_barcode_penalty_dist`?. +2. Is the barcode penalty above `max_barcode_penalty` but the distance between top 2 barcodes penalties greater then `min_separation_only_dist`? If a candidate meets one of the above conditions and the location of the start/end of the barcode construct is within `barcode_end_proximity` bases of the ends of the read, then it is considered a hit. | Scoring option | Description | | -- | -- | -| max_barcode_cost | The maximum edit distance allowed for a classified barcode. Considered in conjunction with the `min_barcode_score_dist` parameter. | -| min_barcode_score_dist | The minimum score difference between top-2 barcodes required for classification. Used in conjunction with `max_barcode_cost`. | -| min_separation_only_dist | The minimum score difference between the top-2 barcodes required for classification when the `max_barcode_cost` is not met. | +| max_barcode_penalty | The maximum edit distance allowed for a classified barcode. Considered in conjunction with the `min_barcode_penalty_dist` parameter. | +| min_barcode_penalty_dist | The minimum penalty difference between top-2 barcodes required for classification. Used in conjunction with `max_barcode_cost`. | +| min_separation_only_dist | The minimum penalty difference between the top-2 barcodes required for classification when the `max_barcode_cost` is not met. | | barcode_end_proximity | Proximity of the end of the barcode construct to the ends of the read required for classification. | | flank_left_pad | Number of bases to use from preceding flank during barcode alignment. | | flank_right_pad | Number of bases to use from succeeding flank during barcode alignment. | | front_barcode_window | Number of bases at the front of the read within which to look for barcodes. | | rear_barcode_window | Number of bases at the rear of the read within which to look for barcodes. | -For `flank_left_pad` and `flank_right_pad`, something in the range of 5-10 bases is typically good. Note that errors from this padding region are also part of the barcode alignment score. Therefore a bigger padding region may require a higher `max_barcode_cost` for classification. +For `flank_left_pad` and `flank_right_pad`, something in the range of 5-10 bases is typically good. Note that errors from this padding region are also part of the barcode alignment penalty. Therefore a bigger padding region may require a higher `max_barcode_cost` for classification. ### Custom Sequences File diff --git a/dorado/demux/BarcodeClassifier.cpp b/dorado/demux/BarcodeClassifier.cpp index d90ea992..0b3768b4 100644 --- a/dorado/demux/BarcodeClassifier.cpp +++ b/dorado/demux/BarcodeClassifier.cpp @@ -94,17 +94,17 @@ std::tuple extract_flank_fit(std::string_view stra // Helper function to globally align a barcode to a region // within the read. -int extract_barcode_score(std::string_view barcode, - std::string_view read, - const EdlibAlignConfig& config, - const char* debug_prefix) { +int extract_barcode_penalty(std::string_view barcode, + std::string_view read, + const EdlibAlignConfig& config, + const char* debug_prefix) { auto result = edlibAlign(barcode.data(), int(barcode.length()), read.data(), int(read.length()), config); - auto score = result.editDistance; - spdlog::trace("{} {}", debug_prefix, score); + auto penalty = result.editDistance; + spdlog::trace("{} {}", debug_prefix, penalty); spdlog::trace("\n{}", utils::alignment_to_str(barcode.data(), read.data(), result)); edlibFreeAlignResult(result); - return score; + return penalty; } bool barcode_is_permitted(const BarcodingInfo::FilterSet& allowed_barcodes, @@ -286,11 +286,6 @@ std::vector BarcodeClassifier::generate_ use_leading_flank = false; } - m_left_buffer = m_scoring_params.flank_left_pad; - m_right_buffer = m_scoring_params.flank_right_pad; - m_front_barcode_window = m_scoring_params.front_barcode_window; - m_rear_barcode_window = m_scoring_params.rear_barcode_window; - BarcodeCandidateKit candidate; candidate.kit = kit_name; candidate.barcode_kit = kit_info.name; @@ -312,17 +307,17 @@ std::vector BarcodeClassifier::generate_ candidate.top_context = (use_leading_flank ? kit_info.top_front_flank : "") + bc_mask + kit_info.top_rear_flank; candidate.top_context_left_buffer = - extract_left_buffer(kit_info.top_front_flank, m_left_buffer); + extract_left_buffer(kit_info.top_front_flank, m_scoring_params.flank_left_pad); candidate.top_context_right_buffer = - extract_right_buffer(kit_info.top_rear_flank, m_right_buffer); + extract_right_buffer(kit_info.top_rear_flank, m_scoring_params.flank_right_pad); auto top_front_flank_rc = utils::reverse_complement(kit_info.top_front_flank); auto top_rear_flank_rc = utils::reverse_complement(kit_info.top_rear_flank); candidate.top_context_rev = top_rear_flank_rc + bc_mask + top_front_flank_rc; candidate.top_context_rev_left_buffer = - extract_left_buffer(top_rear_flank_rc, m_left_buffer); + extract_left_buffer(top_rear_flank_rc, m_scoring_params.flank_left_pad); candidate.top_context_rev_right_buffer = - extract_right_buffer(top_front_flank_rc, m_right_buffer); + extract_right_buffer(top_front_flank_rc, m_scoring_params.flank_right_pad); if (!kit_info.barcodes2.empty()) { const auto& ref_bc2_name = kit_info.barcodes2[0]; @@ -331,18 +326,18 @@ std::vector BarcodeClassifier::generate_ std::string bc2_mask(ref_bc2.length(), 'N'); candidate.bottom_context = (use_leading_flank ? kit_info.bottom_front_flank : "") + bc2_mask + kit_info.bottom_rear_flank; - candidate.bottom_context_left_buffer = - extract_left_buffer(kit_info.bottom_front_flank, m_left_buffer); - candidate.bottom_context_right_buffer = - extract_right_buffer(kit_info.bottom_rear_flank, m_right_buffer); + candidate.bottom_context_left_buffer = extract_left_buffer( + kit_info.bottom_front_flank, m_scoring_params.flank_left_pad); + candidate.bottom_context_right_buffer = extract_right_buffer( + kit_info.bottom_rear_flank, m_scoring_params.flank_right_pad); auto bottom_front_flank_rc = utils::reverse_complement(kit_info.bottom_front_flank); auto bottom_rear_flank_rc = utils::reverse_complement(kit_info.bottom_rear_flank); candidate.bottom_context_rev = bottom_rear_flank_rc + bc_mask + bottom_front_flank_rc; candidate.bottom_context_rev_left_buffer = - extract_left_buffer(bottom_rear_flank_rc, m_left_buffer); + extract_left_buffer(bottom_rear_flank_rc, m_scoring_params.flank_left_pad); candidate.bottom_context_rev_right_buffer = - extract_right_buffer(bottom_front_flank_rc, m_right_buffer); + extract_right_buffer(bottom_front_flank_rc, m_scoring_params.flank_right_pad); } for (size_t idx = 0; idx < kit_info.barcodes.size(); idx++) { @@ -403,9 +398,11 @@ std::vector BarcodeClassifier::calculate_barcode_score_diffe std::string_view read_seq, const BarcodeCandidateKit& candidate, const BarcodingInfo::FilterSet& allowed_barcodes) const { - std::string_view read_top = read_seq.substr(0, m_front_barcode_window); - int bottom_start = std::max(0, static_cast(read_seq.length()) - m_rear_barcode_window); - std::string_view read_bottom = read_seq.substr(bottom_start, m_rear_barcode_window); + std::string_view read_top = read_seq.substr(0, m_scoring_params.front_barcode_window); + int bottom_start = + std::max(0, static_cast(read_seq.length()) - m_scoring_params.rear_barcode_window); + std::string_view read_bottom = + read_seq.substr(bottom_start, m_scoring_params.rear_barcode_window); // Try to find the location of the barcode + flanks in the top and bottom windows. EdlibAlignConfig placement_config = init_edlib_config_for_flanks(); @@ -469,12 +466,13 @@ std::vector BarcodeClassifier::calculate_barcode_score_diffe read_bottom.substr(bottom_start_idx_v2, bottom_end_idx_v2 - bottom_start_idx_v2); // Find the best variant of the two. - int total_v1_score = top_result_v1.editDistance + bottom_result_v1.editDistance; - int total_v2_score = top_result_v2.editDistance + bottom_result_v2.editDistance; - spdlog::trace("total v1 edit dist {}, total v2 edit dis {}", total_v1_score, total_v2_score); + int total_v1_penalty = top_result_v1.editDistance + bottom_result_v1.editDistance; + int total_v2_penalty = top_result_v2.editDistance + bottom_result_v2.editDistance; + spdlog::trace("total v1 edit dist {}, total v2 edit dis {}", total_v1_penalty, + total_v2_penalty); std::string_view top_mask, bottom_mask; - if (total_v1_score < total_v2_score) { + if (total_v1_penalty < total_v2_penalty) { top_mask = top_mask_v1; bottom_mask = bottom_mask_v1; spdlog::trace("best variant v1"); @@ -502,18 +500,18 @@ std::vector BarcodeClassifier::calculate_barcode_score_diffe spdlog::trace("Checking barcode {}", barcode_name); - // Calculate barcode scores for v1. - auto top_mask_result_score_v1 = - extract_barcode_score(barcode1, top_mask_v1, mask_config, "top window v1"); + // Calculate barcode penalties for v1. + auto top_mask_result_penalty_v1 = + extract_barcode_penalty(barcode1, top_mask_v1, mask_config, "top window v1"); - auto bottom_mask_result_score_v1 = extract_barcode_score(barcode2_rev, bottom_mask_v1, - mask_config, "bottom window v1"); + auto bottom_mask_result_penalty_v1 = extract_barcode_penalty( + barcode2_rev, bottom_mask_v1, mask_config, "bottom window v1"); BarcodeScoreResult v1; - v1.top_score = top_mask_result_score_v1; - v1.bottom_score = bottom_mask_result_score_v1; - v1.score = std::min(v1.top_score, v1.bottom_score); - v1.use_top = v1.top_score < v1.bottom_score; + v1.top_penalty = top_mask_result_penalty_v1; + v1.bottom_penalty = bottom_mask_result_penalty_v1; + v1.penalty = std::min(v1.top_penalty, v1.bottom_penalty); + v1.use_top = v1.top_penalty < v1.bottom_penalty; v1.top_flank_score = top_flank_score_v1; v1.bottom_flank_score = bottom_flank_score_v1; v1.flank_score = v1.use_top ? top_flank_score_v1 : bottom_flank_score_v1; @@ -521,18 +519,18 @@ std::vector BarcodeClassifier::calculate_barcode_score_diffe v1.bottom_barcode_pos = {bottom_start + bottom_result_v1.startLocations[0], bottom_start + bottom_result_v1.endLocations[0]}; - // Calculate barcode scores for v2. - auto top_mask_result_score_v2 = - extract_barcode_score(barcode2, top_mask_v2, mask_config, "top window v2"); + // Calculate barcode penalties for v2. + auto top_mask_result_penalty_v2 = + extract_barcode_penalty(barcode2, top_mask_v2, mask_config, "top window v2"); - auto bottom_mask_result_score_v2 = extract_barcode_score(barcode1_rev, bottom_mask_v2, - mask_config, "bottom window v2"); + auto bottom_mask_result_penalty_v2 = extract_barcode_penalty( + barcode1_rev, bottom_mask_v2, mask_config, "bottom window v2"); BarcodeScoreResult v2; - v2.top_score = top_mask_result_score_v2; - v2.bottom_score = bottom_mask_result_score_v2; - v2.score = std::min(v2.top_score, v2.bottom_score); - v2.use_top = v2.top_score < v2.bottom_score; + v2.top_penalty = top_mask_result_penalty_v2; + v2.bottom_penalty = bottom_mask_result_penalty_v2; + v2.penalty = std::min(v2.top_penalty, v2.bottom_penalty); + v2.use_top = v2.top_penalty < v2.bottom_penalty; v2.top_flank_score = top_flank_score_v2; v2.bottom_flank_score = bottom_flank_score_v2; v2.flank_score = v2.use_top ? top_flank_score_v2 : bottom_flank_score_v2; @@ -540,8 +538,8 @@ std::vector BarcodeClassifier::calculate_barcode_score_diffe v2.bottom_barcode_pos = {bottom_start + bottom_result_v2.startLocations[0], bottom_start + bottom_result_v2.endLocations[0]}; - // The best score is the higher score between the 2 variants. - const bool var1_is_best = v1.score < v2.score; + // The best penalty is the higher penalty between the 2 variants. + const bool var1_is_best = v1.penalty < v2.penalty; BarcodeScoreResult res = var1_is_best ? v1 : v2; res.variant = var1_is_best ? "var1" : "var2"; res.barcode_name = barcode_name; @@ -573,9 +571,11 @@ std::vector BarcodeClassifier::calculate_barcode_score_doubl std::string_view read_seq, const BarcodeCandidateKit& candidate, const BarcodingInfo::FilterSet& allowed_barcodes) const { - std::string_view read_top = read_seq.substr(0, m_front_barcode_window); - int bottom_start = std::max(0, static_cast(read_seq.length()) - m_rear_barcode_window); - std::string_view read_bottom = read_seq.substr(bottom_start, m_rear_barcode_window); + std::string_view read_top = read_seq.substr(0, m_scoring_params.front_barcode_window); + int bottom_start = + std::max(0, static_cast(read_seq.length()) - m_scoring_params.rear_barcode_window); + std::string_view read_bottom = + read_seq.substr(bottom_start, m_scoring_params.rear_barcode_window); // Try to find the location of the barcode + flanks in the top and bottom windows. EdlibAlignConfig placement_config = init_edlib_config_for_flanks(); @@ -618,19 +618,20 @@ std::vector BarcodeClassifier::calculate_barcode_score_doubl } spdlog::trace("Checking barcode {}", barcode_name); - auto top_mask_score = extract_barcode_score(barcode, top_mask, mask_config, "top window"); + auto top_mask_penalty = + extract_barcode_penalty(barcode, top_mask, mask_config, "top window"); - auto bottom_mask_score = - extract_barcode_score(barcode_rev, bottom_mask, mask_config, "bottom window"); + auto bottom_mask_penalty = + extract_barcode_penalty(barcode_rev, bottom_mask, mask_config, "bottom window"); BarcodeScoreResult res; res.barcode_name = barcode_name; res.kit = candidate.kit; res.barcode_kit = candidate.barcode_kit; - res.top_score = top_mask_score; - res.bottom_score = bottom_mask_score; - res.score = std::min(res.top_score, res.bottom_score); - res.use_top = res.top_score < res.bottom_score; + res.top_penalty = top_mask_penalty; + res.bottom_penalty = bottom_mask_penalty; + res.penalty = std::min(res.top_penalty, res.bottom_penalty); + res.use_top = res.top_penalty < res.bottom_penalty; res.top_flank_score = top_flank_score; res.bottom_flank_score = bottom_flank_score; res.flank_score = res.use_top ? res.top_flank_score : res.bottom_flank_score; @@ -656,7 +657,7 @@ std::vector BarcodeClassifier::calculate_barcode_score( std::string_view read_seq, const BarcodeCandidateKit& candidate, const BarcodingInfo::FilterSet& allowed_barcodes) const { - std::string_view read_top = read_seq.substr(0, m_front_barcode_window); + std::string_view read_top = read_seq.substr(0, m_scoring_params.front_barcode_window); // Try to find the location of the barcode + flanks in the top and bottom windows. EdlibAlignConfig placement_config = init_edlib_config_for_flanks(); @@ -688,7 +689,8 @@ std::vector BarcodeClassifier::calculate_barcode_score( } spdlog::trace("Checking barcode {}", barcode_name); - auto top_mask_score = extract_barcode_score(barcode, top_mask, mask_config, "top window"); + auto top_mask_penalty = + extract_barcode_penalty(barcode, top_mask, mask_config, "top window"); BarcodeScoreResult res; res.barcode_name = barcode_name; @@ -697,9 +699,9 @@ std::vector BarcodeClassifier::calculate_barcode_score( res.top_flank_score = top_flank_score; res.bottom_flank_score = -1.f; res.flank_score = std::max(res.top_flank_score, res.bottom_flank_score); - res.top_score = top_mask_score; - res.bottom_score = -1; - res.score = res.top_score; + res.top_penalty = top_mask_penalty; + res.bottom_penalty = -1; + res.penalty = res.top_penalty; res.use_top = true; res.top_barcode_pos = {top_result.startLocations[0], top_result.endLocations[0]}; @@ -728,23 +730,23 @@ BarcodeScoreResult BarcodeClassifier::find_best_barcode( } // Then find the best barcode hit within that kit. - std::vector scores; + std::vector results; const auto& kit = get_kit_info(candidate->kit); if (kit.double_ends) { if (kit.ends_different) { auto out = calculate_barcode_score_different_double_ends(fwd, *candidate, allowed_barcodes); - scores.insert(scores.end(), out.begin(), out.end()); + results.insert(results.end(), out.begin(), out.end()); } else { auto out = calculate_barcode_score_double_ends(fwd, *candidate, allowed_barcodes); - scores.insert(scores.end(), out.begin(), out.end()); + results.insert(results.end(), out.begin(), out.end()); } } else { auto out = calculate_barcode_score(fwd, *candidate, allowed_barcodes); - scores.insert(scores.end(), out.begin(), out.end()); + results.insert(results.end(), out.begin(), out.end()); } - if (scores.empty()) { + if (results.empty()) { spdlog::warn("Barcode unclassified because no barcodes found in kit."); return UNCLASSIFIED; } @@ -754,52 +756,52 @@ BarcodeScoreResult BarcodeClassifier::find_best_barcode( // to the top window and the best barcode according to the bottom window // are the same. If they suggest different barcodes confidently, then // consider the read unclassified. - auto best_top_score = std::min_element( - scores.begin(), scores.end(), - [](const auto& l, const auto& r) { return l.top_score < r.top_score; }); - auto best_bottom_score = std::min_element( - scores.begin(), scores.end(), - [](const auto& l, const auto& r) { return l.bottom_score < r.bottom_score; }); - auto best_score = std::max(best_top_score->score, best_bottom_score->score); - auto score_dist = std::abs(best_top_score->score - best_bottom_score->score); - if ((best_score <= m_scoring_params.max_barcode_cost) && - (score_dist <= m_scoring_params.min_barcode_score_dist) && - (best_top_score->barcode_name != best_bottom_score->barcode_name)) { + auto best_top_result = std::min_element( + results.begin(), results.end(), + [](const auto& l, const auto& r) { return l.top_penalty < r.top_penalty; }); + auto best_bottom_result = std::min_element( + results.begin(), results.end(), + [](const auto& l, const auto& r) { return l.bottom_penalty < r.bottom_penalty; }); + auto max_penalty = std::max(best_top_result->penalty, best_bottom_result->penalty); + auto penalty_dist = std::abs(best_top_result->penalty - best_bottom_result->penalty); + if ((max_penalty <= m_scoring_params.max_barcode_penalty) && + (penalty_dist <= m_scoring_params.min_barcode_penalty_dist) && + (best_top_result->barcode_name != best_bottom_result->barcode_name)) { spdlog::trace("Two ends confidently predict different BCs: top bc {}, bottom bc {}", - best_top_score->barcode_name, best_bottom_score->barcode_name); + best_top_result->barcode_name, best_bottom_result->barcode_name); return UNCLASSIFIED; } } // Sort the scores windows by their barcode score. - std::sort(scores.begin(), scores.end(), - [](const auto& l, const auto& r) { return l.score < r.score; }); + std::sort(results.begin(), results.end(), + [](const auto& l, const auto& r) { return l.penalty < r.penalty; }); std::stringstream d; - for (auto& s : scores) { - d << s.barcode_name << " " << s.score << ", "; + for (auto& s : results) { + d << s.barcode_name << " " << s.penalty << ", "; } spdlog::trace("Scores: {}", d.str()); - auto best_score = scores.begin(); - auto are_scores_acceptable = [this](const auto& score) { - return score.score <= m_scoring_params.max_barcode_cost; + auto best_result = results.begin(); + auto are_penalties_acceptable = [this](const auto& penalty) { + return penalty.penalty <= m_scoring_params.max_barcode_penalty; }; BarcodeScoreResult out = UNCLASSIFIED; - if (scores.size() == 1) { - if (are_scores_acceptable(*best_score)) { - out = *best_score; + if (results.size() == 1) { + if (are_penalties_acceptable(*best_result)) { + out = *best_result; } } else { - const auto& second_best_score = std::next(best_score); - const int score_dist = second_best_score->score - best_score->score; - if (((score_dist >= m_scoring_params.min_barcode_score_dist && - are_scores_acceptable(*best_score)) || - (score_dist >= m_scoring_params.min_separation_only_dist)) && - (best_score->top_barcode_pos.first <= m_scoring_params.barcode_end_proximity || - best_score->bottom_barcode_pos.second >= + const auto& second_best_result = std::next(best_result); + const int penalty_dist = second_best_result->penalty - best_result->penalty; + if (((penalty_dist >= m_scoring_params.min_barcode_penalty_dist && + are_penalties_acceptable(*best_result)) || + (penalty_dist >= m_scoring_params.min_separation_only_dist)) && + (best_result->top_barcode_pos.first <= m_scoring_params.barcode_end_proximity || + best_result->bottom_barcode_pos.second >= int(read_seq.length() - m_scoring_params.barcode_end_proximity))) { - out = *best_score; + out = *best_result; } } @@ -807,9 +809,10 @@ BarcodeScoreResult BarcodeClassifier::find_best_barcode( // For more stringent classification, ensure that both ends of a read // have a high score for the same barcode. If not then consider it // unclassified. - if (std::max(out.top_score, out.bottom_score) > m_scoring_params.max_barcode_cost) { - spdlog::trace("Min of top {} and bottom scores {} > max barcode score {}", - out.top_score, out.bottom_score, m_scoring_params.max_barcode_cost); + if (std::max(out.top_penalty, out.bottom_penalty) > m_scoring_params.max_barcode_penalty) { + spdlog::trace("Max of top {} and bottom penalties {} > max barcode penalty {}", + out.top_penalty, out.bottom_penalty, + m_scoring_params.max_barcode_penalty); return UNCLASSIFIED; } } diff --git a/dorado/demux/BarcodeClassifier.h b/dorado/demux/BarcodeClassifier.h index 88273686..f7b8de7e 100644 --- a/dorado/demux/BarcodeClassifier.h +++ b/dorado/demux/BarcodeClassifier.h @@ -54,11 +54,6 @@ class BarcodeClassifier { const dorado::barcode_kits::KitInfo& get_kit_info(const std::string& kit_name) const; const std::string& get_barcode_sequence(const std::string& barcode_name) const; - - int m_left_buffer; - int m_right_buffer; - int m_front_barcode_window; - int m_rear_barcode_window; }; } // namespace demux diff --git a/dorado/demux/Trimmer.cpp b/dorado/demux/Trimmer.cpp index 3c327608..59321fa0 100644 --- a/dorado/demux/Trimmer.cpp +++ b/dorado/demux/Trimmer.cpp @@ -58,7 +58,7 @@ std::pair Trimmer::determine_trim_interval(const BarcodeScoreResult& r // the end of top barcode end value because that's the position // in the sequence where the barcode ends. So the actual sequence // because from one after that. - if (res.top_score >= 0 && res.bottom_score >= 0) { + if (res.top_penalty >= 0 && res.bottom_penalty >= 0) { float top_flank_score = res.top_flank_score; if (top_flank_score > kFlankScoreThres) { trim_interval.first = res.top_barcode_pos.second + 1; diff --git a/dorado/demux/parse_custom_kit.cpp b/dorado/demux/parse_custom_kit.cpp index efde6319..2fddc451 100644 --- a/dorado/demux/parse_custom_kit.cpp +++ b/dorado/demux/parse_custom_kit.cpp @@ -82,6 +82,10 @@ std::optional> parse_custom_arrang std::string barcode1_pattern = toml::find(config, "barcode1_pattern"); new_kit.top_front_flank = toml::find(config, "mask1_front"); new_kit.top_rear_flank = toml::find(config, "mask1_rear"); + if (new_kit.top_front_flank.empty() && new_kit.top_rear_flank.empty()) { + throw std::runtime_error( + "At least one of mask1_front or mask1_rear needs to be specified."); + } fill_bc_sequences(barcode1_pattern, new_kit.barcodes); // If any of the 2nd barcode settings are set, ensure ALL second barcode @@ -97,6 +101,10 @@ std::optional> parse_custom_arrang // Fetch barcode 2 context (flanks + sequences). new_kit.bottom_front_flank = toml::find(config, "mask2_front"); new_kit.bottom_rear_flank = toml::find(config, "mask2_rear"); + if (new_kit.bottom_front_flank.empty() && new_kit.bottom_rear_flank.empty()) { + throw std::runtime_error( + "At least one of mask2_front or mask2_rear needs to be specified."); + } std::string barcode2_pattern = toml::find(config, "barcode2_pattern"); fill_bc_sequences(barcode2_pattern, new_kit.barcodes2); @@ -121,14 +129,14 @@ dorado::barcode_kits::BarcodeKitScoringParams parse_scoring_params( } const auto& config = toml::find(config_toml, "scoring"); - if (config.contains("max_barcode_cost")) { - params.max_barcode_cost = toml::find(config, "max_barcode_cost"); + if (config.contains("max_barcode_penalty")) { + params.max_barcode_penalty = toml::find(config, "max_barcode_penalty"); } if (config.contains("barcode_end_proximity")) { params.barcode_end_proximity = toml::find(config, "barcode_end_proximity"); } - if (config.contains("min_barcode_score_dist")) { - params.min_barcode_score_dist = toml::find(config, "min_barcode_score_dist"); + if (config.contains("min_barcode_penalty_dist")) { + params.min_barcode_penalty_dist = toml::find(config, "min_barcode_penalty_dist"); } if (config.contains("min_separation_only_dist")) { params.min_separation_only_dist = toml::find(config, "min_separation_only_dist"); diff --git a/dorado/utils/barcode_kits.cpp b/dorado/utils/barcode_kits.cpp index 0fc65145..f8b0a28b 100644 --- a/dorado/utils/barcode_kits.cpp +++ b/dorado/utils/barcode_kits.cpp @@ -111,9 +111,9 @@ const std::vector RBK_1_96 = { const BarcodeKitScoringParams DEFAULT_PARAMS{}; const BarcodeKitScoringParams RBK114_PARAMS{ - .max_barcode_cost = 12, + .max_barcode_penalty = 12, .barcode_end_proximity = 75, - .min_barcode_score_dist = 3, + .min_barcode_penalty_dist = 3, .min_separation_only_dist = 6, .flank_left_pad = 5, .flank_right_pad = 10, diff --git a/dorado/utils/barcode_kits.h b/dorado/utils/barcode_kits.h index 3283a282..a84000f3 100644 --- a/dorado/utils/barcode_kits.h +++ b/dorado/utils/barcode_kits.h @@ -8,9 +8,9 @@ namespace dorado::barcode_kits { struct BarcodeKitScoringParams { - int max_barcode_cost = 9; + int max_barcode_penalty = 9; int barcode_end_proximity = 75; - int min_barcode_score_dist = 3; + int min_barcode_penalty_dist = 3; int min_separation_only_dist = 6; int flank_left_pad = 5; int flank_right_pad = 10; diff --git a/dorado/utils/types.h b/dorado/utils/types.h index 184fe05c..6492c045 100644 --- a/dorado/utils/types.h +++ b/dorado/utils/types.h @@ -39,9 +39,9 @@ std::shared_ptr create_barcoding_info( const std::optional &custom_seqs); struct BarcodeScoreResult { - int score = -1; - int top_score = -1; - int bottom_score = -1; + int penalty = -1; + int top_penalty = -1; + int bottom_penalty = -1; float flank_score = -1.f; float top_flank_score = -1.f; float bottom_flank_score = -1.f; diff --git a/tests/CustomBarcodeParserTest.cpp b/tests/CustomBarcodeParserTest.cpp index 8b5a9d73..dc166fdb 100644 --- a/tests/CustomBarcodeParserTest.cpp +++ b/tests/CustomBarcodeParserTest.cpp @@ -102,6 +102,15 @@ TEST_CASE("Parse kit with incomplete double ended settings", "[barcode_demux]") "mask2_front mask2_rear and barcode2_pattern must all be set")); } +TEST_CASE("Parse kit with no flanks", "[barcode_demux]") { + fs::path data_dir = fs::path(get_data_dir("barcode_demux/custom_barcodes")); + const auto test_file = data_dir / "flank_free_arrangement.toml"; + + CHECK_THROWS_WITH(dorado::demux::parse_custom_arrangement(test_file.string()), + Catch::Matchers::Contains( + "At least one of mask1_front or mask1_rear needs to be specified")); +} + TEST_CASE("Parse custom barcode sequences", "[barcode_demux]") { fs::path data_dir = fs::path(get_data_dir("barcode_demux/custom_barcodes")); const auto test_sequences = data_dir / "test_sequences.fasta"; @@ -123,9 +132,9 @@ TEST_CASE("Parse custom barcode scoring params", "[barcode_demux]") { auto scoring_params = dorado::demux::parse_scoring_params(test_params_file.string(), default_params); - CHECK(scoring_params.max_barcode_cost == 10); + CHECK(scoring_params.max_barcode_penalty == 10); CHECK(scoring_params.barcode_end_proximity == 75); - CHECK(scoring_params.min_barcode_score_dist == 3); + CHECK(scoring_params.min_barcode_penalty_dist == 3); CHECK(scoring_params.min_separation_only_dist == 5); CHECK(scoring_params.flank_left_pad == 5); CHECK(scoring_params.flank_right_pad == 10); @@ -141,9 +150,9 @@ TEST_CASE("Parse default scoring params", "[barcode_demux]") { auto scoring_params = dorado::demux::parse_scoring_params(test_params_file.string(), default_params); - CHECK(scoring_params.max_barcode_cost == default_params.max_barcode_cost); + CHECK(scoring_params.max_barcode_penalty == default_params.max_barcode_penalty); CHECK(scoring_params.barcode_end_proximity == default_params.barcode_end_proximity); - CHECK(scoring_params.min_barcode_score_dist == default_params.min_barcode_score_dist); + CHECK(scoring_params.min_barcode_penalty_dist == default_params.min_barcode_penalty_dist); CHECK(scoring_params.min_separation_only_dist == default_params.min_separation_only_dist); CHECK(scoring_params.flank_left_pad == default_params.flank_left_pad); CHECK(scoring_params.flank_right_pad == default_params.flank_right_pad); diff --git a/tests/data/barcode_demux/custom_barcodes/flank_free_arrangement.toml b/tests/data/barcode_demux/custom_barcodes/flank_free_arrangement.toml new file mode 100644 index 00000000..afb55c1b --- /dev/null +++ b/tests/data/barcode_demux/custom_barcodes/flank_free_arrangement.toml @@ -0,0 +1,8 @@ +[arrangement] +name = "test-kit" +kit = "BC" +mask1_front = "" +mask1_rear = "" +barcode1_pattern = "BC%02i" +first_index = 1 +last_index = 2 diff --git a/tests/data/barcode_demux/custom_barcodes/scoring_params.toml b/tests/data/barcode_demux/custom_barcodes/scoring_params.toml index 77996ae2..b0a07403 100644 --- a/tests/data/barcode_demux/custom_barcodes/scoring_params.toml +++ b/tests/data/barcode_demux/custom_barcodes/scoring_params.toml @@ -1,8 +1,8 @@ [scoring] -max_barcode_cost = 10 +max_barcode_penalty = 10 barcode_end_proximity = 75 -min_barcode_score_dist = 3 +min_barcode_penalty_dist = 3 min_separation_only_dist = 5 flank_left_pad = 5 flank_right_pad = 10 diff --git a/tests/test_simple_basecaller_execution.sh b/tests/test_simple_basecaller_execution.sh index a696719e..038f79e3 100755 --- a/tests/test_simple_basecaller_execution.sh +++ b/tests/test_simple_basecaller_execution.sh @@ -253,7 +253,7 @@ echo dorado custom demux test stage $dorado_bin demux $data_dir/barcode_demux/double_end/SQK-RPB004_BC01.fastq --output-dir $output_dir/custom_demux --barcode-arrangement $data_dir/barcode_demux/custom_barcodes/RPB004.toml --barcode-sequences $data_dir/barcode_demux/custom_barcodes/RPB004_sequences.fasta samtools quickcheck -u $output_dir/custom_demux/SQK-RPB004_barcode01.bam num_demuxed_reads=$(samtools view -c $output_dir/custom_demux/SQK-RPB004_barcode01.bam) -if [[ $num_demuxed_reads -ne "3" ]]; then +if [[ $num_demuxed_reads -ne "2" ]]; then echo "3 demuxed reads expected. Found ${num_demuxed_reads}" exit 1 fi @@ -335,7 +335,7 @@ test_barcoding_read_groups() ( # There should be 4 reads with BC01, 3 with BC04, and 2 unclassified groups. test_barcoding_read_groups barcode01 4 barcode04 3 unclassified 2 -# There should be 4 reads with BC01 aliased to patient_id_1, and 5 unclassified groups. +# There should be 5 reads with BC01 aliased to patient_id_1, and 4 unclassified groups. test_barcoding_read_groups patient_id_1 5 unclassified 4 $data_dir/barcode_demux/sample_sheet.csv # Test demux only on a pre-classified BAM file From 02075e544a5a0795594b4c0587d16651e3931bcd Mon Sep 17 00:00:00 2001 From: Joyjit Daw <1127155+tijyojwad@users.noreply.github.com> Date: Wed, 20 Mar 2024 02:05:55 +0000 Subject: [PATCH 20/23] Replace designated initializers with explicit init for structs since that is not officially available till C++20 --- dorado/utils/barcode_kits.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/dorado/utils/barcode_kits.cpp b/dorado/utils/barcode_kits.cpp index f8b0a28b..90b83625 100644 --- a/dorado/utils/barcode_kits.cpp +++ b/dorado/utils/barcode_kits.cpp @@ -111,14 +111,14 @@ const std::vector RBK_1_96 = { const BarcodeKitScoringParams DEFAULT_PARAMS{}; const BarcodeKitScoringParams RBK114_PARAMS{ - .max_barcode_penalty = 12, - .barcode_end_proximity = 75, - .min_barcode_penalty_dist = 3, - .min_separation_only_dist = 6, - .flank_left_pad = 5, - .flank_right_pad = 10, - .front_barcode_window = 175, - .rear_barcode_window = 175, + /*max_barcode_penalty*/ 12, + /*barcode_end_proximity*/ 75, + /*min_barcode_penalty_dist*/ 3, + /*min_separation_only_dist*/ 6, + /*flank_left_pad*/ 5, + /*flank_right_pad*/ 10, + /*front_barcode_window*/ 175, + /*rear_barcode_window*/ 175, }; // Some arrangement names are just aliases of each other. This is because they were released From 3f70482783047270f52384eae52147bea2bfe649 Mon Sep 17 00:00:00 2001 From: Joyjit Daw <1127155+tijyojwad@users.noreply.github.com> Date: Wed, 20 Mar 2024 02:28:27 +0000 Subject: [PATCH 21/23] Add a barcode score based on the penalty which can be used to report the "confidence" of a barcode hit in a [0.0, 1.f] range. --- dorado/demux/BarcodeClassifier.cpp | 10 ++++++++++ dorado/utils/types.h | 1 + 2 files changed, 11 insertions(+) diff --git a/dorado/demux/BarcodeClassifier.cpp b/dorado/demux/BarcodeClassifier.cpp index 0b3768b4..360e4328 100644 --- a/dorado/demux/BarcodeClassifier.cpp +++ b/dorado/demux/BarcodeClassifier.cpp @@ -512,6 +512,9 @@ std::vector BarcodeClassifier::calculate_barcode_score_diffe v1.bottom_penalty = bottom_mask_result_penalty_v1; v1.penalty = std::min(v1.top_penalty, v1.bottom_penalty); v1.use_top = v1.top_penalty < v1.bottom_penalty; + v1.barcode_score = + v1.use_top ? (1.f - static_cast(v1.top_penalty) / barcode1.length()) + : (1.f - static_cast(v1.bottom_penalty) / barcode2_rev.length()); v1.top_flank_score = top_flank_score_v1; v1.bottom_flank_score = bottom_flank_score_v1; v1.flank_score = v1.use_top ? top_flank_score_v1 : bottom_flank_score_v1; @@ -530,6 +533,9 @@ std::vector BarcodeClassifier::calculate_barcode_score_diffe v2.top_penalty = top_mask_result_penalty_v2; v2.bottom_penalty = bottom_mask_result_penalty_v2; v2.penalty = std::min(v2.top_penalty, v2.bottom_penalty); + v2.barcode_score = + v2.use_top ? (1.f - static_cast(v2.top_penalty) / barcode2.length()) + : (1.f - static_cast(v2.bottom_penalty) / barcode1_rev.length()); v2.use_top = v2.top_penalty < v2.bottom_penalty; v2.top_flank_score = top_flank_score_v2; v2.bottom_flank_score = bottom_flank_score_v2; @@ -632,6 +638,9 @@ std::vector BarcodeClassifier::calculate_barcode_score_doubl res.bottom_penalty = bottom_mask_penalty; res.penalty = std::min(res.top_penalty, res.bottom_penalty); res.use_top = res.top_penalty < res.bottom_penalty; + res.barcode_score = + res.use_top ? (1.f - static_cast(res.top_penalty) / barcode.length()) + : (1.f - static_cast(res.bottom_penalty) / barcode_rev.length()); res.top_flank_score = top_flank_score; res.bottom_flank_score = bottom_flank_score; res.flank_score = res.use_top ? res.top_flank_score : res.bottom_flank_score; @@ -703,6 +712,7 @@ std::vector BarcodeClassifier::calculate_barcode_score( res.bottom_penalty = -1; res.penalty = res.top_penalty; res.use_top = true; + res.barcode_score = 1.f - static_cast(res.top_penalty) / barcode.length(); res.top_barcode_pos = {top_result.startLocations[0], top_result.endLocations[0]}; results.push_back(res); diff --git a/dorado/utils/types.h b/dorado/utils/types.h index 6492c045..8f495345 100644 --- a/dorado/utils/types.h +++ b/dorado/utils/types.h @@ -42,6 +42,7 @@ struct BarcodeScoreResult { int penalty = -1; int top_penalty = -1; int bottom_penalty = -1; + float barcode_score = -1.f; float flank_score = -1.f; float top_flank_score = -1.f; float bottom_flank_score = -1.f; From dad95935dabf5e6ffb2bc063f01d3a46b17dc87c Mon Sep 17 00:00:00 2001 From: Joyjit Daw <1127155+tijyojwad@users.noreply.github.com> Date: Wed, 20 Mar 2024 23:33:41 +0000 Subject: [PATCH 22/23] Add min_flank_score parameter to filter out hits where the barcode penalty is low but the flank score is low too --- documentation/CustomBarcodes.md | 4 +- dorado/demux/BarcodeClassifier.cpp | 79 ++++++++++++------- dorado/demux/parse_custom_kit.cpp | 3 + dorado/utils/barcode_kits.cpp | 1 + dorado/utils/barcode_kits.h | 1 + tests/CustomBarcodeParserTest.cpp | 1 + .../custom_barcodes/scoring_params.toml | 1 + 7 files changed, 59 insertions(+), 31 deletions(-) diff --git a/documentation/CustomBarcodes.md b/documentation/CustomBarcodes.md index d7a56e72..add33f5a 100644 --- a/documentation/CustomBarcodes.md +++ b/documentation/CustomBarcodes.md @@ -82,8 +82,9 @@ The classification heuristic applied by Dorado is the following - Once barcodes are sorted by barcode penalty, the top candidate is checked against the following rules - 1. Is the barcode penalty below `max_barcode_penalty` and the distance between top 2 barcode penalties greater than `min_barcode_penalty_dist`?. 2. Is the barcode penalty above `max_barcode_penalty` but the distance between top 2 barcodes penalties greater then `min_separation_only_dist`? +3. Is the flank score below the `min_flank_score`? -If a candidate meets one of the above conditions and the location of the start/end of the barcode construct is within `barcode_end_proximity` bases of the ends of the read, then it is considered a hit. +If a candidate meets (1) or (2) AND (3), and the location of the start/end of the barcode construct is within `barcode_end_proximity` bases of the ends of the read, then it is considered a hit. | Scoring option | Description | | -- | -- | @@ -95,6 +96,7 @@ If a candidate meets one of the above conditions and the location of the start/e | flank_right_pad | Number of bases to use from succeeding flank during barcode alignment. | | front_barcode_window | Number of bases at the front of the read within which to look for barcodes. | | rear_barcode_window | Number of bases at the rear of the read within which to look for barcodes. | +| min_flank_score | Minimum score for the flank alignment. Score here is 1.f - (edit distance) / flank_length | For `flank_left_pad` and `flank_right_pad`, something in the range of 5-10 bases is typically good. Note that errors from this padding region are also part of the barcode alignment penalty. Therefore a bigger padding region may require a higher `max_barcode_cost` for classification. diff --git a/dorado/demux/BarcodeClassifier.cpp b/dorado/demux/BarcodeClassifier.cpp index 360e4328..d60e22e4 100644 --- a/dorado/demux/BarcodeClassifier.cpp +++ b/dorado/demux/BarcodeClassifier.cpp @@ -165,6 +165,25 @@ std::string extract_right_buffer(const std::string& flank, int buffer) { return flank.substr(0, buffer); } +// Helper to pick the top or bottom window in a barcode. The one +// with lower penalty and higher flank score is preferred. If both +// are not satisfied by one of the windows, then just decide based +// on the barcode penalty. +std::tuple pick_top_or_bottom(int top_penalty, + float top_flank_score, + int bottom_penalty, + float bottom_flank_score) { + if (top_penalty <= bottom_penalty && top_flank_score >= bottom_flank_score) { + return {true, top_penalty, top_flank_score}; + } else if (bottom_penalty <= top_penalty && bottom_flank_score >= top_flank_score) { + return {false, bottom_penalty, bottom_flank_score}; + } else if (top_penalty <= bottom_penalty) { + return {true, top_penalty, top_flank_score}; + } else { + return {false, bottom_penalty, bottom_flank_score}; + } +} + } // namespace namespace demux { @@ -471,17 +490,6 @@ std::vector BarcodeClassifier::calculate_barcode_score_diffe spdlog::trace("total v1 edit dist {}, total v2 edit dis {}", total_v1_penalty, total_v2_penalty); - std::string_view top_mask, bottom_mask; - if (total_v1_penalty < total_v2_penalty) { - top_mask = top_mask_v1; - bottom_mask = bottom_mask_v1; - spdlog::trace("best variant v1"); - } else { - top_mask = top_mask_v2; - bottom_mask = bottom_mask_v2; - spdlog::trace("best variant v2"); - } - std::vector results; for (size_t i = 0; i < candidate.barcodes1.size(); i++) { const auto barcode1 = @@ -510,14 +518,13 @@ std::vector BarcodeClassifier::calculate_barcode_score_diffe BarcodeScoreResult v1; v1.top_penalty = top_mask_result_penalty_v1; v1.bottom_penalty = bottom_mask_result_penalty_v1; - v1.penalty = std::min(v1.top_penalty, v1.bottom_penalty); - v1.use_top = v1.top_penalty < v1.bottom_penalty; + v1.top_flank_score = top_flank_score_v1; + v1.bottom_flank_score = bottom_flank_score_v1; + std::tie(v1.use_top, v1.penalty, v1.flank_score) = pick_top_or_bottom( + v1.top_penalty, v1.top_flank_score, v1.bottom_penalty, v1.bottom_flank_score); v1.barcode_score = v1.use_top ? (1.f - static_cast(v1.top_penalty) / barcode1.length()) : (1.f - static_cast(v1.bottom_penalty) / barcode2_rev.length()); - v1.top_flank_score = top_flank_score_v1; - v1.bottom_flank_score = bottom_flank_score_v1; - v1.flank_score = v1.use_top ? top_flank_score_v1 : bottom_flank_score_v1; v1.top_barcode_pos = {top_result_v1.startLocations[0], top_result_v1.endLocations[0]}; v1.bottom_barcode_pos = {bottom_start + bottom_result_v1.startLocations[0], bottom_start + bottom_result_v1.endLocations[0]}; @@ -532,20 +539,30 @@ std::vector BarcodeClassifier::calculate_barcode_score_diffe BarcodeScoreResult v2; v2.top_penalty = top_mask_result_penalty_v2; v2.bottom_penalty = bottom_mask_result_penalty_v2; - v2.penalty = std::min(v2.top_penalty, v2.bottom_penalty); + v2.top_flank_score = top_flank_score_v2; + v2.bottom_flank_score = bottom_flank_score_v2; + std::tie(v2.use_top, v2.penalty, v2.flank_score) = pick_top_or_bottom( + v2.top_penalty, v2.top_flank_score, v2.bottom_penalty, v2.bottom_flank_score); v2.barcode_score = v2.use_top ? (1.f - static_cast(v2.top_penalty) / barcode2.length()) : (1.f - static_cast(v2.bottom_penalty) / barcode1_rev.length()); - v2.use_top = v2.top_penalty < v2.bottom_penalty; - v2.top_flank_score = top_flank_score_v2; - v2.bottom_flank_score = bottom_flank_score_v2; - v2.flank_score = v2.use_top ? top_flank_score_v2 : bottom_flank_score_v2; v2.top_barcode_pos = {top_result_v2.startLocations[0], top_result_v2.endLocations[0]}; v2.bottom_barcode_pos = {bottom_start + bottom_result_v2.startLocations[0], bottom_start + bottom_result_v2.endLocations[0]}; - // The best penalty is the higher penalty between the 2 variants. - const bool var1_is_best = v1.penalty < v2.penalty; + // The best variant is the one with lower penalty for both barcode + // and flanks. If that's not clear, then just use the barcode score + // penalty to decide. + bool var1_is_best = true; + if (v1.penalty <= v2.penalty && total_v1_penalty <= total_v2_penalty) { + var1_is_best = true; + } else if (v2.penalty <= v1.penalty && total_v2_penalty <= total_v1_penalty) { + var1_is_best = false; + } else if (v1.penalty <= v2.penalty) { + var1_is_best = true; + } else { + var1_is_best = false; + } BarcodeScoreResult res = var1_is_best ? v1 : v2; res.variant = var1_is_best ? "var1" : "var2"; res.barcode_name = barcode_name; @@ -636,14 +653,13 @@ std::vector BarcodeClassifier::calculate_barcode_score_doubl res.barcode_kit = candidate.barcode_kit; res.top_penalty = top_mask_penalty; res.bottom_penalty = bottom_mask_penalty; - res.penalty = std::min(res.top_penalty, res.bottom_penalty); - res.use_top = res.top_penalty < res.bottom_penalty; + res.top_flank_score = top_flank_score; + res.bottom_flank_score = bottom_flank_score; + std::tie(res.use_top, res.penalty, res.flank_score) = pick_top_or_bottom( + res.top_penalty, res.top_flank_score, res.bottom_penalty, res.bottom_flank_score); res.barcode_score = res.use_top ? (1.f - static_cast(res.top_penalty) / barcode.length()) : (1.f - static_cast(res.bottom_penalty) / barcode_rev.length()); - res.top_flank_score = top_flank_score; - res.bottom_flank_score = bottom_flank_score; - res.flank_score = res.use_top ? res.top_flank_score : res.bottom_flank_score; res.top_barcode_pos = {top_result.startLocations[0], top_result.endLocations[0]}; res.bottom_barcode_pos = {bottom_start + bottom_result.startLocations[0], bottom_start + bottom_result.endLocations[0]}; @@ -793,8 +809,11 @@ BarcodeScoreResult BarcodeClassifier::find_best_barcode( } spdlog::trace("Scores: {}", d.str()); auto best_result = results.begin(); - auto are_penalties_acceptable = [this](const auto& penalty) { - return penalty.penalty <= m_scoring_params.max_barcode_penalty; + auto are_penalties_acceptable = [this](const auto& proposal) { + // If barcode penalty is 0, it's a perfect match. Consider it a pass. + return (proposal.penalty == 0) || + ((proposal.penalty <= m_scoring_params.max_barcode_penalty) && + (proposal.flank_score >= m_scoring_params.min_flank_score)); }; BarcodeScoreResult out = UNCLASSIFIED; diff --git a/dorado/demux/parse_custom_kit.cpp b/dorado/demux/parse_custom_kit.cpp index 2fddc451..9ccd89ae 100644 --- a/dorado/demux/parse_custom_kit.cpp +++ b/dorado/demux/parse_custom_kit.cpp @@ -153,6 +153,9 @@ dorado::barcode_kits::BarcodeKitScoringParams parse_scoring_params( if (config.contains("rear_barcode_window")) { params.rear_barcode_window = toml::find(config, "rear_barcode_window"); } + if (config.contains("min_flank_score")) { + params.min_flank_score = toml::find(config, "min_flank_score"); + } return params; } diff --git a/dorado/utils/barcode_kits.cpp b/dorado/utils/barcode_kits.cpp index 90b83625..0ce927f5 100644 --- a/dorado/utils/barcode_kits.cpp +++ b/dorado/utils/barcode_kits.cpp @@ -119,6 +119,7 @@ const BarcodeKitScoringParams RBK114_PARAMS{ /*flank_right_pad*/ 10, /*front_barcode_window*/ 175, /*rear_barcode_window*/ 175, + /*min_flank_score*/ 0.0f, }; // Some arrangement names are just aliases of each other. This is because they were released diff --git a/dorado/utils/barcode_kits.h b/dorado/utils/barcode_kits.h index a84000f3..192e4758 100644 --- a/dorado/utils/barcode_kits.h +++ b/dorado/utils/barcode_kits.h @@ -16,6 +16,7 @@ struct BarcodeKitScoringParams { int flank_right_pad = 10; int front_barcode_window = 175; int rear_barcode_window = 175; + float min_flank_score = 0.6f; }; struct KitInfo { diff --git a/tests/CustomBarcodeParserTest.cpp b/tests/CustomBarcodeParserTest.cpp index dc166fdb..d5445aa2 100644 --- a/tests/CustomBarcodeParserTest.cpp +++ b/tests/CustomBarcodeParserTest.cpp @@ -140,6 +140,7 @@ TEST_CASE("Parse custom barcode scoring params", "[barcode_demux]") { CHECK(scoring_params.flank_right_pad == 10); CHECK(scoring_params.front_barcode_window == 150); CHECK(scoring_params.rear_barcode_window == 150); + CHECK(scoring_params.min_flank_score == Approx(0.5f)); } TEST_CASE("Parse default scoring params", "[barcode_demux]") { diff --git a/tests/data/barcode_demux/custom_barcodes/scoring_params.toml b/tests/data/barcode_demux/custom_barcodes/scoring_params.toml index b0a07403..2f4e3413 100644 --- a/tests/data/barcode_demux/custom_barcodes/scoring_params.toml +++ b/tests/data/barcode_demux/custom_barcodes/scoring_params.toml @@ -8,3 +8,4 @@ flank_left_pad = 5 flank_right_pad = 10 front_barcode_window = 150 rear_barcode_window = 150 +min_flank_score = 0.5 From 351f333e34d2ae04da1bf4bb262701dacc122a89 Mon Sep 17 00:00:00 2001 From: Joyjit Daw <1127155+tijyojwad@users.noreply.github.com> Date: Thu, 21 Mar 2024 02:23:04 +0000 Subject: [PATCH 23/23] Adjust min_flank_score threshold to 0.5 and fix failing integration test. --- dorado/utils/barcode_kits.h | 2 +- tests/test_simple_basecaller_execution.sh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dorado/utils/barcode_kits.h b/dorado/utils/barcode_kits.h index 192e4758..6cca6e24 100644 --- a/dorado/utils/barcode_kits.h +++ b/dorado/utils/barcode_kits.h @@ -16,7 +16,7 @@ struct BarcodeKitScoringParams { int flank_right_pad = 10; int front_barcode_window = 175; int rear_barcode_window = 175; - float min_flank_score = 0.6f; + float min_flank_score = 0.5f; }; struct KitInfo { diff --git a/tests/test_simple_basecaller_execution.sh b/tests/test_simple_basecaller_execution.sh index 038f79e3..1c6c134e 100755 --- a/tests/test_simple_basecaller_execution.sh +++ b/tests/test_simple_basecaller_execution.sh @@ -335,8 +335,8 @@ test_barcoding_read_groups() ( # There should be 4 reads with BC01, 3 with BC04, and 2 unclassified groups. test_barcoding_read_groups barcode01 4 barcode04 3 unclassified 2 -# There should be 5 reads with BC01 aliased to patient_id_1, and 4 unclassified groups. -test_barcoding_read_groups patient_id_1 5 unclassified 4 $data_dir/barcode_demux/sample_sheet.csv +# There should be 4 reads with BC01 aliased to patient_id_1, and 5 unclassified groups. +test_barcoding_read_groups patient_id_1 4 unclassified 5 $data_dir/barcode_demux/sample_sheet.csv # Test demux only on a pre-classified BAM file $dorado_bin demux --no-classify --output-dir "$output_dir/demux_only_test/" $output_dir/read_group_test.bam