From 7e25c9e056264598a46152767bc632e6259eb6ba Mon Sep 17 00:00:00 2001 From: Hugh Pendry Date: Mon, 25 Mar 2024 09:40:15 +0000 Subject: [PATCH] Merge branch 'DOR-553_RG_HDR_FOR_CUSTOM_BARCODES' into 'master' DOR-553 ensure read group header lines include custom barcodes Closes DOR-553 See merge request machine-learning/dorado!897 (cherry picked from commit 862872a26091cab548fd3b6e38f8017961e07fa5) f37cd6a7 Refactoring to move parsing custom barcodes kits functionality into utils so... 0a9e40af Merge remote-tracking branch 'origin/master' into DOR-553_RG_HDR_FOR_CUSTOM_BARCODES 806ad9b1 Add barcode arrangements RG header generation when using a custom kit 3f2bc410 Refactored adding read group headers separating custom from default kits to avoid a clunky api cc328db5 Fix missing include from symbol_test and added missing else clause if no barcoding kit specified cbdd51a5 Resolved merge conflicts 037257f2 Updated follwoing review --- CMakeLists.txt | 4 +- dorado/cli/basecaller.cpp | 51 ++++++- dorado/cli/duplex.cpp | 3 +- dorado/demux/AdapterDetector.cpp | 3 +- dorado/demux/BarcodeClassifier.cpp | 7 +- dorado/demux/BarcodeClassifier.h | 4 +- dorado/demux/parse_custom_sequences.cpp | 31 ++++ dorado/demux/parse_custom_sequences.h | 11 ++ dorado/utils/CMakeLists.txt | 4 +- dorado/utils/bam_utils.cpp | 150 +++++++++---------- dorado/utils/bam_utils.h | 12 +- dorado/utils/barcode_kits.cpp | 9 ++ dorado/utils/barcode_kits.h | 3 +- dorado/{demux => utils}/parse_custom_kit.cpp | 28 +--- dorado/{demux => utils}/parse_custom_kit.h | 9 +- tests/BamUtilsTest.cpp | 48 ++---- tests/CustomBarcodeParserTest.cpp | 41 ++--- tests/symbol_test.cpp | 10 +- 18 files changed, 243 insertions(+), 185 deletions(-) create mode 100644 dorado/demux/parse_custom_sequences.cpp create mode 100644 dorado/demux/parse_custom_sequences.h rename dorado/{demux => utils}/parse_custom_kit.cpp (88%) rename dorado/{demux => utils}/parse_custom_kit.h (64%) diff --git a/CMakeLists.txt b/CMakeLists.txt index be2d8563..8a88adef 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -231,10 +231,10 @@ add_library(dorado_lib dorado/demux/BarcodeClassifier.h dorado/demux/BarcodeClassifierSelector.cpp dorado/demux/BarcodeClassifierSelector.h + dorado/demux/parse_custom_sequences.cpp + dorado/demux/parse_custom_sequences.h dorado/demux/Trimmer.cpp dorado/demux/Trimmer.h - dorado/demux/parse_custom_kit.cpp - dorado/demux/parse_custom_kit.h dorado/poly_tail/dna_poly_tail_calculator.cpp dorado/poly_tail/dna_poly_tail_calculator.h dorado/poly_tail/plasmid_poly_tail_calculator.cpp diff --git a/dorado/cli/basecaller.cpp b/dorado/cli/basecaller.cpp index 32e5763e..037824fd 100644 --- a/dorado/cli/basecaller.cpp +++ b/dorado/cli/basecaller.cpp @@ -24,6 +24,7 @@ #include "utils/fs_utils.h" #include "utils/log_utils.h" #include "utils/parameters.h" +#include "utils/parse_custom_kit.h" #include "utils/stats.h" #include "utils/string_utils.h" #include "utils/sys_stats.h" @@ -47,6 +48,32 @@ namespace dorado { +namespace { + +const barcode_kits::KitInfo& get_barcode_kit_info(const std::string& kit_name) { + const auto kit_info = barcode_kits::get_kit_info(kit_name); + if (!kit_info) { + spdlog::error( + "{} is not a valid barcode kit name. Please run the help " + "command to find out available barcode kits.", + kit_name); + std::exit(EXIT_FAILURE); + } + return *kit_info; +} + +std::pair get_custom_barcode_kit_info( + const std::string& custom_kit_file) { + auto custom_kit_info = barcode_kits::parse_custom_arrangement(custom_kit_file); + if (!custom_kit_info) { + spdlog::error("Unable to load custom barcode arrangement file: {}", custom_kit_file); + std::exit(EXIT_FAILURE); + } + return *custom_kit_info; +} + +} // namespace + using dorado::utils::default_parameters; using OutputMode = dorado::utils::HtsFile::OutputMode; using namespace std::chrono_literals; @@ -77,7 +104,7 @@ void setup(std::vector args, const std::string& dump_stats_file, const std::string& dump_stats_filter, const std::string& resume_from_file, - const std::vector& barcode_kits, + const std::string& barcode_kit, bool barcode_both_ends, bool barcode_no_trim, bool adapter_no_trim, @@ -135,7 +162,7 @@ void setup(std::vector args, recursive_file_loading); const bool adapter_trimming_enabled = (!adapter_no_trim || !primer_no_trim); - const bool barcode_enabled = !barcode_kits.empty() || custom_kit; + const bool barcode_enabled = !barcode_kit.empty() || custom_kit; const auto thread_allocations = utils::default_thread_allocations( int(num_devices), !remora_runners.empty() ? int(num_remora_threads) : 0, enable_aligner, barcode_enabled, adapter_trimming_enabled); @@ -149,7 +176,17 @@ void setup(std::vector args, SamHdrPtr hdr(sam_hdr_init()); cli::add_pg_hdr(hdr.get(), args); - utils::add_rg_hdr(hdr.get(), read_groups, barcode_kits, sample_sheet.get()); + if (custom_kit) { + auto [kit_name, kit_info] = get_custom_barcode_kit_info(*custom_kit); + utils::add_rg_headers_with_barcode_kit(hdr.get(), read_groups, kit_name, kit_info, + sample_sheet.get()); + } else if (!barcode_kit.empty()) { + const auto kit_info = get_barcode_kit_info(barcode_kit); + utils::add_rg_headers_with_barcode_kit(hdr.get(), read_groups, barcode_kit, kit_info, + sample_sheet.get()); + } else { + utils::add_rg_headers(hdr.get(), read_groups); + } utils::HtsFile hts_file("-", output_mode, thread_allocations.writer_threads); @@ -178,8 +215,9 @@ void setup(std::vector args, !primer_no_trim, std::move(custom_primer_file)); } if (barcode_enabled) { + std::vector kit_as_vector{barcode_kit}; current_sink_node = pipeline_desc.add_node( - {current_sink_node}, thread_allocations.barcoder_threads, barcode_kits, + {current_sink_node}, thread_allocations.barcoder_threads, kit_as_vector, barcode_both_ends, barcode_no_trim, std::move(allowed_barcodes), std::move(custom_kit), std::move(custom_barcode_file)); } @@ -405,7 +443,8 @@ int basecaller(int argc, char* argv[]) { parser.visible.add_argument("--kit-name") .help("Enable barcoding with the provided kit name. Choose from: " + - dorado::barcode_kits::barcode_kits_list_str() + "."); + dorado::barcode_kits::barcode_kits_list_str() + ".") + .default_value(std::string{}); parser.visible.add_argument("--barcode-both-ends") .help("Require both ends of a read to be barcoded for a double ended barcode.") .default_value(false) @@ -624,7 +663,7 @@ int basecaller(int argc, char* argv[]) { parser.hidden.get("--dump_stats_file"), parser.hidden.get("--dump_stats_filter"), parser.visible.get("--resume-from"), - parser.visible.get>("--kit-name"), + parser.visible.get("--kit-name"), parser.visible.get("--barcode-both-ends"), no_trim_barcodes, no_trim_adapters, no_trim_primers, parser.visible.get("--sample-sheet"), std::move(custom_kit), std::move(custom_barcode_seqs), std::move(custom_primer_file), diff --git a/dorado/cli/duplex.cpp b/dorado/cli/duplex.cpp index 5b875330..d8e2473d 100644 --- a/dorado/cli/duplex.cpp +++ b/dorado/cli/duplex.cpp @@ -463,8 +463,7 @@ int duplex(int argc, char* argv[]) { recursive_file_loading); read_groups.merge(DataLoader::load_read_groups(reads, duplex_rg_name, "", recursive_file_loading)); - std::vector barcode_kits; - utils::add_rg_hdr(hdr.get(), read_groups, barcode_kits, nullptr); + utils::add_rg_headers(hdr.get(), read_groups); int batch_size(parser.visible.get("-b")); int chunk_size(parser.visible.get("-c")); diff --git a/dorado/demux/AdapterDetector.cpp b/dorado/demux/AdapterDetector.cpp index edb46648..6988e8b4 100644 --- a/dorado/demux/AdapterDetector.cpp +++ b/dorado/demux/AdapterDetector.cpp @@ -1,7 +1,8 @@ #include "AdapterDetector.h" -#include "parse_custom_kit.h" +#include "parse_custom_sequences.h" #include "utils/alignment_utils.h" +#include "utils/parse_custom_kit.h" #include "utils/sequence_utils.h" #include "utils/types.h" diff --git a/dorado/demux/BarcodeClassifier.cpp b/dorado/demux/BarcodeClassifier.cpp index d60e22e4..8b6b9e6b 100644 --- a/dorado/demux/BarcodeClassifier.cpp +++ b/dorado/demux/BarcodeClassifier.cpp @@ -1,8 +1,9 @@ #include "BarcodeClassifier.h" -#include "parse_custom_kit.h" +#include "parse_custom_sequences.h" #include "utils/alignment_utils.h" #include "utils/barcode_kits.h" +#include "utils/parse_custom_kit.h" #include "utils/sequence_utils.h" #include "utils/types.h" @@ -124,7 +125,7 @@ std::unordered_map process_custom_ki const std::optional& custom_kit) { std::unordered_map kit_map; if (custom_kit) { - auto custom_arrangement = demux::parse_custom_arrangement(*custom_kit); + auto custom_arrangement = dorado::barcode_kits::parse_custom_arrangement(*custom_kit); if (custom_arrangement) { const auto& [kit_name, kit_info] = *custom_arrangement; kit_map[kit_name] = kit_info; @@ -151,7 +152,7 @@ dorado::barcode_kits::BarcodeKitScoringParams set_scoring_params( if (custom_kit) { // If a custom kit is passed, parse it for any scoring // params that need to override the default params. - return dorado::demux::parse_scoring_params(*custom_kit, params); + return barcode_kits::parse_scoring_params(*custom_kit, params); } else { return params; } diff --git a/dorado/demux/BarcodeClassifier.h b/dorado/demux/BarcodeClassifier.h index f7b8de7e..0d101834 100644 --- a/dorado/demux/BarcodeClassifier.h +++ b/dorado/demux/BarcodeClassifier.h @@ -1,6 +1,6 @@ #pragma once -#include "parse_custom_kit.h" #include "utils/barcode_kits.h" +#include "utils/parse_custom_kit.h" #include "utils/stats.h" #include "utils/types.h" @@ -31,7 +31,7 @@ class BarcodeClassifier { private: const std::unordered_map m_custom_kit; const std::unordered_map m_custom_seqs; - const dorado::barcode_kits::BarcodeKitScoringParams m_scoring_params; + const barcode_kits::BarcodeKitScoringParams m_scoring_params; const std::vector m_barcode_candidates; std::vector generate_candidates(const std::vector& kit_names); diff --git a/dorado/demux/parse_custom_sequences.cpp b/dorado/demux/parse_custom_sequences.cpp new file mode 100644 index 00000000..684d388f --- /dev/null +++ b/dorado/demux/parse_custom_sequences.cpp @@ -0,0 +1,31 @@ +#include "parse_custom_sequences.h" + +#include "utils/bam_utils.h" +#include "utils/types.h" + +#include + +namespace dorado::demux { + +std::unordered_map parse_custom_sequences( + const std::string& sequences_file) { + dorado::HtsFilePtr file(hts_open(sequences_file.c_str(), "r")); + BamPtr record; + record.reset(bam_init1()); + + std::unordered_map sequences; + + int sam_ret_val = 0; + while ((sam_ret_val = sam_read1(file.get(), nullptr, record.get())) != -1) { + if (sam_ret_val < -1) { + throw std::runtime_error("Failed to parse custom sequence file " + sequences_file); + } + std::string qname = bam_get_qname(record.get()); + std::string seq = utils::extract_sequence(record.get()); + sequences[qname] = seq; + } + + return sequences; +} + +} // namespace dorado::demux diff --git a/dorado/demux/parse_custom_sequences.h b/dorado/demux/parse_custom_sequences.h new file mode 100644 index 00000000..1b12bfcb --- /dev/null +++ b/dorado/demux/parse_custom_sequences.h @@ -0,0 +1,11 @@ +#pragma once + +#include +#include + +namespace dorado::demux { + +std::unordered_map parse_custom_sequences( + const std::string& sequences_file); + +} // namespace dorado::demux diff --git a/dorado/utils/CMakeLists.txt b/dorado/utils/CMakeLists.txt index 387ace46..4b2093da 100644 --- a/dorado/utils/CMakeLists.txt +++ b/dorado/utils/CMakeLists.txt @@ -30,6 +30,8 @@ add_library(dorado_utils module_utils.h parameters.cpp parameters.h + parse_custom_kit.cpp + parse_custom_kit.h PostCondition.h SampleSheet.cpp SampleSheet.h @@ -59,7 +61,7 @@ add_library(dorado_utils types.h uuid_utils.cpp uuid_utils.h -) + ) if (DORADO_GPU_BUILD) if(APPLE) diff --git a/dorado/utils/bam_utils.cpp b/dorado/utils/bam_utils.cpp index a1055085..7ba43fc8 100644 --- a/dorado/utils/bam_utils.cpp +++ b/dorado/utils/bam_utils.cpp @@ -21,6 +21,8 @@ const char seq_nt16_str[] = "=ACMGRSVTWYHKDBN"; #endif // _WIN32 +namespace dorado::utils { + namespace { // Convert the 4bit encoded sequence in a bam1_t structure @@ -33,93 +35,89 @@ std::string convert_nt16_to_str(uint8_t* bseq, size_t slen) { return seq; } -} // namespace - -namespace dorado::utils { +void emit_read_group(sam_hdr_t* hdr, + const std::string& read_group_line, + const std::string& id, + const std::string& additional_tags) { + auto line = "@RG\tID:" + id + '\t' + read_group_line + additional_tags + '\n'; + sam_hdr_add_lines(hdr, line.c_str(), 0); +} -kstring_t allocate_kstring() { - kstring_t str = {0, 0, NULL}; - ks_resize(&str, 1'000'000); - return str; +std::string read_group_to_string(const dorado::ReadGroup& read_group) { + auto value_or_unknown = [](std::string_view s) { return s.empty() ? "unknown" : s; }; + std::ostringstream rg; + { + rg << "PU:" << value_or_unknown(read_group.flowcell_id) << "\t"; + rg << "PM:" << value_or_unknown(read_group.device_id) << "\t"; + rg << "DT:" << value_or_unknown(read_group.exp_start_time) << "\t"; + rg << "PL:" + << "ONT" + << "\t"; + rg << "DS:" + << "basecall_model=" << value_or_unknown(read_group.basecalling_model) + << (read_group.modbase_models.empty() ? "" + : (" modbase_models=" + read_group.modbase_models)) + << " runid=" << value_or_unknown(read_group.run_id) << "\t"; + rg << "LB:" << value_or_unknown(read_group.sample_id) << "\t"; + rg << "SM:" << value_or_unknown(read_group.sample_id); + } + return rg.str(); } -void add_rg_hdr(sam_hdr_t* hdr, - const std::unordered_map& read_groups, - const std::vector& barcode_kits, - const utils::SampleSheet* const sample_sheet) { - const auto& barcode_kit_infos = barcode_kits::get_kit_infos(); +void add_barcode_kit_rg_hdrs(sam_hdr_t* hdr, + const std::unordered_map& read_groups, + const std::string& kit_name, + const barcode_kits::KitInfo& kit_info, + const utils::SampleSheet* const sample_sheet) { const auto& barcode_sequences = barcode_kits::get_barcodes(); + for (const auto& barcode_name : kit_info.barcodes) { + const auto additional_tags = "\tBC:" + barcode_sequences.at(barcode_name); + const auto normalized_barcode_name = barcode_kits::normalize_barcode_name(barcode_name); + for (const auto& read_group : read_groups) { + std::string alias; + auto id = read_group.first + '_'; + if (sample_sheet) { + if (!sample_sheet->barcode_is_permitted(normalized_barcode_name)) { + continue; + } - // Convert a ReadGroup to a string - auto to_string = [](const ReadGroup& read_group) { - // Lambda function to return "unknown" if string is empty - auto value_or_unknown = [](std::string_view s) { return s.empty() ? "unknown" : s; }; - std::ostringstream rg; - { - rg << "PU:" << value_or_unknown(read_group.flowcell_id) << "\t"; - rg << "PM:" << value_or_unknown(read_group.device_id) << "\t"; - rg << "DT:" << value_or_unknown(read_group.exp_start_time) << "\t"; - rg << "PL:" - << "ONT" - << "\t"; - rg << "DS:" - << "basecall_model=" << value_or_unknown(read_group.basecalling_model) - << (read_group.modbase_models.empty() - ? "" - : (" modbase_models=" + read_group.modbase_models)) - << " runid=" << value_or_unknown(read_group.run_id) << "\t"; - rg << "LB:" << value_or_unknown(read_group.sample_id) << "\t"; - rg << "SM:" << value_or_unknown(read_group.sample_id); + alias = sample_sheet->get_alias( + read_group.second.flowcell_id, read_group.second.position_id, + read_group.second.experiment_id, normalized_barcode_name); + } + if (!alias.empty()) { + id += alias; + } else { + id += barcode_kits::generate_standard_barcode_name(kit_name, barcode_name); + } + const std::string read_group_tags = read_group_to_string(read_group.second); + emit_read_group(hdr, read_group_tags, id, additional_tags); } - return std::move(rg).str(); - }; + } +} - auto emit_read_group = [hdr](const std::string& read_group_line, const std::string& id, - const std::string& additional_tags) { - auto line = "@RG\tID:" + id + '\t' + read_group_line + additional_tags + '\n'; - sam_hdr_add_lines(hdr, line.c_str(), 0); - }; +} // namespace - // Emit read group headers without a barcode arrangement. +kstring_t allocate_kstring() { + kstring_t str = {0, 0, NULL}; + ks_resize(&str, 1'000'000); + return str; +} + +void add_rg_headers(sam_hdr_t* hdr, const std::unordered_map& read_groups) { for (const auto& read_group : read_groups) { - const std::string read_group_tags = to_string(read_group.second); - emit_read_group(read_group_tags, read_group.first, {}); + const std::string read_group_tags = read_group_to_string(read_group.second); + emit_read_group(hdr, read_group_tags, read_group.first, {}); } +} - // Emit read group headers for each barcode arrangement. - for (const auto& kit_name : barcode_kits) { - auto kit_iter = barcode_kit_infos.find(kit_name); - if (kit_iter == barcode_kit_infos.end()) { - throw std::runtime_error(kit_name + - " is not a valid barcode kit name. Please run the help " - "command to find out available barcode kits."); - } - const auto& kit_info = kit_iter->second; - for (const auto& barcode_name : kit_info.barcodes) { - const auto additional_tags = "\tBC:" + barcode_sequences.at(barcode_name); - const auto normalized_barcode_name = barcode_kits::normalize_barcode_name(barcode_name); - for (const auto& read_group : read_groups) { - std::string alias; - auto id = read_group.first + '_'; - if (sample_sheet) { - if (!sample_sheet->barcode_is_permitted(normalized_barcode_name)) { - continue; - } - - alias = sample_sheet->get_alias( - read_group.second.flowcell_id, read_group.second.position_id, - read_group.second.experiment_id, normalized_barcode_name); - } - if (!alias.empty()) { - id += alias; - } else { - id += barcode_kits::generate_standard_barcode_name(kit_name, barcode_name); - } - const std::string read_group_tags = to_string(read_group.second); - emit_read_group(read_group_tags, id, additional_tags); - } - } - } +void add_rg_headers_with_barcode_kit(sam_hdr_t* hdr, + const std::unordered_map& read_groups, + const std::string& kit_name, + const barcode_kits::KitInfo& kit_info, + const utils::SampleSheet* const sample_sheet) { + add_rg_headers(hdr, read_groups); + add_barcode_kit_rg_hdrs(hdr, read_groups, kit_name, kit_info, sample_sheet); } void add_sq_hdr(sam_hdr_t* hdr, const sq_t& seqs) { diff --git a/dorado/utils/bam_utils.h b/dorado/utils/bam_utils.h index 7a49cbe0..a334ad36 100644 --- a/dorado/utils/bam_utils.h +++ b/dorado/utils/bam_utils.h @@ -1,4 +1,5 @@ #pragma once +#include "barcode_kits.h" #include "types.h" #include @@ -24,10 +25,13 @@ struct AlignmentOps { size_t substitutions; }; -void add_rg_hdr(sam_hdr_t* hdr, - const std::unordered_map& read_groups, - const std::vector& barcode_kits, - const utils::SampleSheet* const sample_sheet); +void add_rg_headers(sam_hdr_t* hdr, const std::unordered_map& read_groups); + +void add_rg_headers_with_barcode_kit(sam_hdr_t* hdr, + const std::unordered_map& read_groups, + const std::string& kit_name, + const barcode_kits::KitInfo& kit_info, + const utils::SampleSheet* const sample_sheet); void add_sq_hdr(sam_hdr_t* hdr, const sq_t& seqs); diff --git a/dorado/utils/barcode_kits.cpp b/dorado/utils/barcode_kits.cpp index 0ce927f5..1e26b300 100644 --- a/dorado/utils/barcode_kits.cpp +++ b/dorado/utils/barcode_kits.cpp @@ -820,6 +820,15 @@ const std::unordered_map barcodes = { const std::unordered_map& get_kit_infos() { return kit_info_map; } +const KitInfo* get_kit_info(const std::string& kit_name) { + const auto& barcode_kit_infos = get_kit_infos(); + auto kit_iter = barcode_kit_infos.find(kit_name); + if (kit_iter == barcode_kit_infos.end()) { + return nullptr; + } + return &kit_iter->second; +} + const std::unordered_map& get_barcodes() { return barcodes; } const std::unordered_set& get_barcode_identifiers() { diff --git a/dorado/utils/barcode_kits.h b/dorado/utils/barcode_kits.h index 6cca6e24..9a03e1b0 100644 --- a/dorado/utils/barcode_kits.h +++ b/dorado/utils/barcode_kits.h @@ -33,12 +33,13 @@ struct KitInfo { }; const std::unordered_map& get_kit_infos(); +const KitInfo* get_kit_info(const std::string& kit_name); const std::unordered_map& get_barcodes(); const std::unordered_set& get_barcode_identifiers(); + std::string barcode_kits_list_str(); std::string normalize_barcode_name(const std::string& barcode_name); std::string generate_standard_barcode_name(const std::string& kit_name, const std::string& barcode_name); - } // namespace dorado::barcode_kits diff --git a/dorado/demux/parse_custom_kit.cpp b/dorado/utils/parse_custom_kit.cpp similarity index 88% rename from dorado/demux/parse_custom_kit.cpp rename to dorado/utils/parse_custom_kit.cpp index 9ccd89ae..89bc7bdb 100644 --- a/dorado/demux/parse_custom_kit.cpp +++ b/dorado/utils/parse_custom_kit.cpp @@ -1,6 +1,5 @@ #include "parse_custom_kit.h" -#include "utils/bam_utils.h" #include "utils/barcode_kits.h" #include @@ -11,7 +10,7 @@ #include #include -namespace dorado::demux { +namespace dorado::barcode_kits { bool check_normalized_id_pattern(const std::string& pattern) { auto modulo_pos = pattern.find_first_of("%"); @@ -160,27 +159,4 @@ dorado::barcode_kits::BarcodeKitScoringParams parse_scoring_params( return params; } -std::unordered_map parse_custom_sequences( - const std::string& sequences_file) { - auto file = hts_open(sequences_file.c_str(), "r"); - BamPtr record; - record.reset(bam_init1()); - - std::unordered_map sequences; - - int sam_ret_val = 0; - while ((sam_ret_val = sam_read1(file, nullptr, record.get())) != -1) { - if (sam_ret_val < -1) { - throw std::runtime_error("Failed to parse custom sequence file " + sequences_file); - } - std::string qname = bam_get_qname(record.get()); - std::string seq = utils::extract_sequence(record.get()); - sequences[qname] = seq; - } - - hts_close(file); - - return sequences; -} - -} // namespace dorado::demux +} // namespace dorado::barcode_kits diff --git a/dorado/demux/parse_custom_kit.h b/dorado/utils/parse_custom_kit.h similarity index 64% rename from dorado/demux/parse_custom_kit.h rename to dorado/utils/parse_custom_kit.h index d929bdaa..854b34c3 100644 --- a/dorado/demux/parse_custom_kit.h +++ b/dorado/utils/parse_custom_kit.h @@ -6,18 +6,15 @@ #include #include -namespace dorado::demux { +namespace dorado::barcode_kits { std::optional> parse_custom_arrangement( const std::string& arrangement_file); -std::unordered_map parse_custom_sequences( - const std::string& sequences_file); - -dorado::barcode_kits::BarcodeKitScoringParams parse_scoring_params( +BarcodeKitScoringParams parse_scoring_params( const std::string& arrangement_file, const dorado::barcode_kits::BarcodeKitScoringParams& default_params); bool check_normalized_id_pattern(const std::string& pattern); -} // namespace dorado::demux +} // namespace dorado::barcode_kits diff --git a/tests/BamUtilsTest.cpp b/tests/BamUtilsTest.cpp index 2fb1069c..82b5de10 100644 --- a/tests/BamUtilsTest.cpp +++ b/tests/BamUtilsTest.cpp @@ -41,7 +41,7 @@ TEST_CASE("BamUtilsTest: fetch keys from PG header", TEST_GROUP) { "5mCG --emit-sam"); } -TEST_CASE("BamUtilsTest: add_rg_hdr read group headers", TEST_GROUP) { +TEST_CASE("BamUtilsTest: Add read group headers scenarios", TEST_GROUP) { auto has_read_group_header = [](sam_hdr_t *ptr, const char *id) { return sam_hdr_line_index(ptr, "RG", id) >= 0; }; @@ -58,7 +58,7 @@ TEST_CASE("BamUtilsTest: add_rg_hdr read group headers", TEST_GROUP) { SECTION("No read groups generate no headers") { dorado::SamHdrPtr sam_header(sam_hdr_init()); CHECK(sam_hdr_count_lines(sam_header.get(), "RG") == 0); - dorado::utils::add_rg_hdr(sam_header.get(), {}, {}, nullptr); + dorado::utils::add_rg_headers(sam_header.get(), {}); CHECK(sam_hdr_count_lines(sam_header.get(), "RG") == 0); } @@ -73,7 +73,7 @@ TEST_CASE("BamUtilsTest: add_rg_hdr read group headers", TEST_GROUP) { SECTION("Read groups") { dorado::SamHdrPtr sam_header(sam_hdr_init()); - dorado::utils::add_rg_hdr(sam_header.get(), read_groups, {}, nullptr); + dorado::utils::add_rg_headers(sam_header.get(), read_groups); // Check the IDs of the groups are all there. CHECK(sam_hdr_count_lines(sam_header.get(), "RG") == int(read_groups.size())); @@ -84,23 +84,15 @@ TEST_CASE("BamUtilsTest: add_rg_hdr read group headers", TEST_GROUP) { } } - // Pick some of the barcode kits (randomly chosen indices). - const auto &kit_infos = dorado::barcode_kits::get_kit_infos(); - const std::vector barcode_kits{ - std::next(kit_infos.begin(), 1)->first, - std::next(kit_infos.begin(), 7)->first, - }; - - SECTION("Read groups with barcodes") { + SECTION("Read groups with barcode kit") { + const std::string KIT_NAME{"SQK-RAB204"}; + auto kit_info = dorado::barcode_kits::get_kit_info(KIT_NAME); dorado::SamHdrPtr sam_header(sam_hdr_init()); - dorado::utils::add_rg_hdr(sam_header.get(), read_groups, barcode_kits, nullptr); + dorado::utils::add_rg_headers_with_barcode_kit(sam_header.get(), read_groups, KIT_NAME, + *kit_info, nullptr); // Check the IDs of the groups are all there. - size_t total_barcodes = 0; - for (const auto &kit_name : barcode_kits) { - total_barcodes += kit_infos.at(kit_name).barcodes.size(); - } - const size_t total_groups = read_groups.size() * (total_barcodes + 1); + const size_t total_groups = read_groups.size() * (kit_info->barcodes.size() + 1); CHECK(sam_hdr_count_lines(sam_header.get(), "RG") == int(total_groups)); // Check that the IDs match the expected format. @@ -110,24 +102,16 @@ TEST_CASE("BamUtilsTest: add_rg_hdr read group headers", TEST_GROUP) { CHECK(get_barcode_tag(sam_header.get(), id.c_str()) == std::nullopt); // The headers with barcodes should contain those barcodes. - for (const auto &kit_name : barcode_kits) { - const auto &kit_info = kit_infos.at(kit_name); - for (const auto &barcode_name : kit_info.barcodes) { - const auto full_id = id + "_" + - dorado::barcode_kits::generate_standard_barcode_name( - kit_name, barcode_name); - const auto &barcode_seq = barcode_seqs.at(barcode_name); - CHECK(has_read_group_header(sam_header.get(), full_id.c_str())); - CHECK(get_barcode_tag(sam_header.get(), full_id.c_str()) == barcode_seq); - } + for (const auto &barcode_name : kit_info->barcodes) { + const auto full_id = id + "_" + + dorado::barcode_kits::generate_standard_barcode_name( + KIT_NAME, barcode_name); + const auto &barcode_seq = barcode_seqs.at(barcode_name); + CHECK(has_read_group_header(sam_header.get(), full_id.c_str())); + CHECK(get_barcode_tag(sam_header.get(), full_id.c_str()) == barcode_seq); } } } - - SECTION("Read groups with unknown barcode kit") { - dorado::SamHdrPtr sam_header(sam_hdr_init()); - CHECK_THROWS(dorado::utils::add_rg_hdr(sam_header.get(), read_groups, {"blah"}, nullptr)); - } } TEST_CASE("BamUtilsTest: Test bam extraction helpers", TEST_GROUP) { diff --git a/tests/CustomBarcodeParserTest.cpp b/tests/CustomBarcodeParserTest.cpp index d5445aa2..9e5dd2b4 100644 --- a/tests/CustomBarcodeParserTest.cpp +++ b/tests/CustomBarcodeParserTest.cpp @@ -1,6 +1,7 @@ #include "TestUtils.h" -#include "demux/parse_custom_kit.h" +#include "demux/parse_custom_sequences.h" #include "utils/barcode_kits.h" +#include "utils/parse_custom_kit.h" #include @@ -10,7 +11,7 @@ TEST_CASE("Parse custom single ended barcode arrangement", "[barcode_demux]") { fs::path data_dir = fs::path(get_data_dir("barcode_demux/custom_barcodes")); const auto test_file = data_dir / "test_kit_single_ended.toml"; - auto [kit_name, kit_info] = *dorado::demux::parse_custom_arrangement(test_file.string()); + auto [kit_name, kit_info] = *dorado::barcode_kits::parse_custom_arrangement(test_file.string()); CHECK(kit_name == "test_kit_single_ended"); CHECK(kit_info.barcodes.size() == 4); @@ -30,7 +31,7 @@ TEST_CASE("Parse double ended barcode arrangement", "[barcode_demux]") { fs::path data_dir = fs::path(get_data_dir("barcode_demux/custom_barcodes")); const auto test_file = data_dir / "test_kit_double_ended.toml"; - auto [kit_name, kit_info] = *dorado::demux::parse_custom_arrangement(test_file.string()); + auto [kit_name, kit_info] = *dorado::barcode_kits::parse_custom_arrangement(test_file.string()); CHECK(kit_name == "test_kit_double_ended"); CHECK(kit_info.barcodes.size() == 24); @@ -50,7 +51,7 @@ TEST_CASE("Parse double ended barcode arrangement with different flanks", "[barc fs::path data_dir = fs::path(get_data_dir("barcode_demux/custom_barcodes")); const auto test_file = data_dir / "test_kit_ends_different_flanks.toml"; - auto [kit_name, kit_info] = *dorado::demux::parse_custom_arrangement(test_file.string()); + auto [kit_name, kit_info] = *dorado::barcode_kits::parse_custom_arrangement(test_file.string()); CHECK(kit_name == "test_kit_ends_different_flanks"); CHECK(kit_info.barcodes.size() == 96); @@ -70,7 +71,7 @@ TEST_CASE("Parse double ended barcode arrangement with different barcodes", "[ba fs::path data_dir = fs::path(get_data_dir("barcode_demux/custom_barcodes")); const auto test_file = data_dir / "test_kit_ends_different_barcodes.toml"; - auto [kit_name, kit_info] = *dorado::demux::parse_custom_arrangement(test_file.string()); + auto [kit_name, kit_info] = *dorado::barcode_kits::parse_custom_arrangement(test_file.string()); CHECK(kit_name == "test_kit_ends_different_barcodes"); CHECK(kit_info.barcodes.size() == 12); @@ -90,14 +91,14 @@ TEST_CASE("Parse kit with bad indices", "[barcode_demux]") { fs::path data_dir = fs::path(get_data_dir("barcode_demux/custom_barcodes")); const auto test_file = data_dir / "bad_double_ended_kit.toml"; - CHECK_THROWS(dorado::demux::parse_custom_arrangement(test_file.string())); + CHECK_THROWS(dorado::barcode_kits::parse_custom_arrangement(test_file.string())); } TEST_CASE("Parse kit with incomplete double ended settings", "[barcode_demux]") { fs::path data_dir = fs::path(get_data_dir("barcode_demux/custom_barcodes")); const auto test_file = data_dir / "bad_double_ended_kit_not_all_params_set.toml"; - CHECK_THROWS_WITH(dorado::demux::parse_custom_arrangement(test_file.string()), + CHECK_THROWS_WITH(dorado::barcode_kits::parse_custom_arrangement(test_file.string()), Catch::Matchers::Contains( "mask2_front mask2_rear and barcode2_pattern must all be set")); } @@ -106,7 +107,7 @@ TEST_CASE("Parse kit with no flanks", "[barcode_demux]") { fs::path data_dir = fs::path(get_data_dir("barcode_demux/custom_barcodes")); const auto test_file = data_dir / "flank_free_arrangement.toml"; - CHECK_THROWS_WITH(dorado::demux::parse_custom_arrangement(test_file.string()), + CHECK_THROWS_WITH(dorado::barcode_kits::parse_custom_arrangement(test_file.string()), Catch::Matchers::Contains( "At least one of mask1_front or mask1_rear needs to be specified")); } @@ -130,7 +131,7 @@ TEST_CASE("Parse custom barcode scoring params", "[barcode_demux]") { dorado::barcode_kits::BarcodeKitScoringParams default_params; auto scoring_params = - dorado::demux::parse_scoring_params(test_params_file.string(), default_params); + dorado::barcode_kits::parse_scoring_params(test_params_file.string(), default_params); CHECK(scoring_params.max_barcode_penalty == 10); CHECK(scoring_params.barcode_end_proximity == 75); @@ -149,7 +150,7 @@ TEST_CASE("Parse default scoring params", "[barcode_demux]") { dorado::barcode_kits::BarcodeKitScoringParams default_params; auto scoring_params = - dorado::demux::parse_scoring_params(test_params_file.string(), default_params); + dorado::barcode_kits::parse_scoring_params(test_params_file.string(), default_params); CHECK(scoring_params.max_barcode_penalty == default_params.max_barcode_penalty); CHECK(scoring_params.barcode_end_proximity == default_params.barcode_end_proximity); @@ -160,14 +161,14 @@ TEST_CASE("Parse default scoring params", "[barcode_demux]") { } TEST_CASE("Check for normalized id pattern", "[barcode_demux]") { - CHECK(dorado::demux::check_normalized_id_pattern("BC%02i")); - CHECK(dorado::demux::check_normalized_id_pattern("abcd%25i")); - CHECK(dorado::demux::check_normalized_id_pattern("%2i")); - - CHECK_FALSE(dorado::demux::check_normalized_id_pattern("ab")); - CHECK_FALSE(dorado::demux::check_normalized_id_pattern("ab%")); - CHECK_FALSE(dorado::demux::check_normalized_id_pattern("ab%d")); - CHECK_FALSE(dorado::demux::check_normalized_id_pattern("ab%02")); - CHECK_FALSE(dorado::demux::check_normalized_id_pattern("ab%02f")); - CHECK_FALSE(dorado::demux::check_normalized_id_pattern("ab%02iab")); + CHECK(dorado::barcode_kits::check_normalized_id_pattern("BC%02i")); + CHECK(dorado::barcode_kits::check_normalized_id_pattern("abcd%25i")); + CHECK(dorado::barcode_kits::check_normalized_id_pattern("%2i")); + + CHECK_FALSE(dorado::barcode_kits::check_normalized_id_pattern("ab")); + CHECK_FALSE(dorado::barcode_kits::check_normalized_id_pattern("ab%")); + CHECK_FALSE(dorado::barcode_kits::check_normalized_id_pattern("ab%d")); + CHECK_FALSE(dorado::barcode_kits::check_normalized_id_pattern("ab%02")); + CHECK_FALSE(dorado::barcode_kits::check_normalized_id_pattern("ab%02f")); + CHECK_FALSE(dorado::barcode_kits::check_normalized_id_pattern("ab%02iab")); } diff --git a/tests/symbol_test.cpp b/tests/symbol_test.cpp index 0e1726a0..620b9cd1 100644 --- a/tests/symbol_test.cpp +++ b/tests/symbol_test.cpp @@ -7,7 +7,7 @@ #include "api/runner_creation.h" #include "basecall/CRFModelConfig.h" #include "basecall/ModelRunner.h" -#include "demux/parse_custom_kit.h" +#include "demux/parse_custom_sequences.h" #include "modbase/ModBaseModelConfig.h" #include "modbase/ModBaseRunner.h" #include "models/models.h" @@ -22,6 +22,7 @@ #include "utils/barcode_kits.h" #include "utils/gpu_monitor.h" #include "utils/parameters.h" +#include "utils/parse_custom_kit.h" #include "utils/sequence_utils.h" #include "utils/string_utils.h" #include "utils/time_utils.h" @@ -81,8 +82,8 @@ void reference_all_public_functions() { force_reference(&dorado::basecall::load_crf_model_config); // basecall/ModelRunner.h force_reference(&dorado::basecall::ModelRunner::accept_chunk); - // demux/parse_custom_kit.h - force_reference(&dorado::demux::parse_custom_arrangement); + // demux/parse_custom_sequences.h + force_reference(&dorado::demux::parse_custom_sequences); // modbase/ModBaseModelConfig.h force_reference(&dorado::modbase::load_modbase_model_config); // modbase/ModBaseRunner.h @@ -107,6 +108,9 @@ void reference_all_public_functions() { force_reference(&dorado::utils::shallow_copy_read); // utils/barcode_kits.h force_reference(&dorado::barcode_kits::get_kit_infos); + // utils/parse_custom_kit.h + force_reference(&dorado::barcode_kits::parse_custom_arrangement); + #if DORADO_CUDA_BUILD // utils/cuda_utils.h force_reference(&dorado::utils::acquire_gpu_lock);