diff --git a/dorado/cli/basecaller.cpp b/dorado/cli/basecaller.cpp index f7d85aa2..0b79fddd 100644 --- a/dorado/cli/basecaller.cpp +++ b/dorado/cli/basecaller.cpp @@ -48,6 +48,31 @@ namespace dorado { +namespace { + +const barcode_kits::KitInfo& get_barcode_kit_info(const std::string& kit_name) { + const auto kit_info = barcode_kits::get_kit_info(kit_name); + if (!kit_info) { + spdlog::error( + "{} is not a valid barcode kit name. Please run the help " + "command to find out available barcode kits."); + std::exit(EXIT_FAILURE); + } + return *kit_info; +} + +std::pair get_custom_barcode_kit_info( + const std::string& custom_kit_file) { + auto custom_kit_info = barcode_kits::parse_custom_arrangement(custom_kit_file); + if (!custom_kit_info) { + spdlog::error("Unable to load custom barcode arrangement file: {}", custom_kit_file); + std::exit(EXIT_FAILURE); + } + return *custom_kit_info; +} + +} // namespace + using dorado::utils::default_parameters; using OutputMode = dorado::utils::HtsFile::OutputMode; using namespace std::chrono_literals; @@ -151,16 +176,13 @@ void setup(std::vector args, SamHdrPtr hdr(sam_hdr_init()); cli::add_pg_hdr(hdr.get(), args); if (custom_kit) { - auto custom_kit_info = barcode_kits::parse_custom_arrangement(*custom_kit); - if (!custom_kit_info) { - spdlog::error("Unable to load custom barcode arrangement file: {}", *custom_kit); - std::exit(EXIT_FAILURE); - } - auto [kit_name, kit_info] = *custom_kit_info; - utils::add_rg_hdr_for_custom_barcode_kit(hdr.get(), read_groups, kit_name, kit_info, - sample_sheet.get()); + auto [kit_name, kit_info] = get_custom_barcode_kit_info(*custom_kit); + utils::add_rg_headers_with_barcode_kit(hdr.get(), read_groups, kit_name, kit_info, + sample_sheet.get()); } else { - utils::add_rg_hdr(hdr.get(), read_groups, barcode_kits, sample_sheet.get()); + const auto kit_info = get_barcode_kit_info(barcode_kits[0]); + utils::add_rg_headers_with_barcode_kit(hdr.get(), read_groups, barcode_kits[0], kit_info, + sample_sheet.get()); } utils::HtsFile hts_file("-", output_mode, thread_allocations.writer_threads); diff --git a/dorado/cli/duplex.cpp b/dorado/cli/duplex.cpp index 13e90bb7..00988de1 100644 --- a/dorado/cli/duplex.cpp +++ b/dorado/cli/duplex.cpp @@ -462,8 +462,7 @@ int duplex(int argc, char* argv[]) { recursive_file_loading); read_groups.merge(DataLoader::load_read_groups(reads, duplex_rg_name, "", recursive_file_loading)); - std::vector barcode_kits; - utils::add_rg_hdr(hdr.get(), read_groups, barcode_kits, nullptr); + utils::add_rg_headers(hdr.get(), read_groups); int batch_size(parser.visible.get("-b")); int chunk_size(parser.visible.get("-c")); diff --git a/dorado/utils/bam_utils.cpp b/dorado/utils/bam_utils.cpp index d0abac2a..9694522a 100644 --- a/dorado/utils/bam_utils.cpp +++ b/dorado/utils/bam_utils.cpp @@ -63,11 +63,11 @@ std::string read_group_to_string(const dorado::ReadGroup& read_group) { return rg.str(); } -void emit_read_groups_for_barcode_kit(sam_hdr_t* hdr, - const std::unordered_map& read_groups, - const std::string& kit_name, - const barcode_kits::KitInfo& kit_info, - const utils::SampleSheet* const sample_sheet) { +void add_barcode_kit_rg_hdrs(sam_hdr_t* hdr, + const std::unordered_map& read_groups, + const std::string& kit_name, + const barcode_kits::KitInfo& kit_info, + const utils::SampleSheet* const sample_sheet) { const auto& barcode_sequences = barcode_kits::get_barcodes(); for (const auto& barcode_name : kit_info.barcodes) { const auto additional_tags = "\tBC:" + barcode_sequences.at(barcode_name); @@ -103,43 +103,20 @@ kstring_t allocate_kstring() { return str; } -void add_rg_hdr(sam_hdr_t* hdr, - const std::unordered_map& read_groups, - const std::vector& barcode_kits, - const utils::SampleSheet* const sample_sheet) { - // Emit read group headers without a barcode arrangement. +void add_rg_headers(sam_hdr_t* hdr, const std::unordered_map& read_groups) { for (const auto& read_group : read_groups) { const std::string read_group_tags = read_group_to_string(read_group.second); emit_read_group(hdr, read_group_tags, read_group.first, {}); } - - // Emit read group headers for each barcode arrangement. - const auto& barcode_kit_infos = barcode_kits::get_kit_infos(); - for (const auto& kit_name : barcode_kits) { - auto kit_iter = barcode_kit_infos.find(kit_name); - if (kit_iter == barcode_kit_infos.end()) { - throw std::runtime_error(kit_name + - " is not a valid barcode kit name. Please run the help " - "command to find out available barcode kits."); - } - emit_read_groups_for_barcode_kit(hdr, read_groups, kit_name, kit_iter->second, - sample_sheet); - } } -void add_rg_hdr_for_custom_barcode_kit( - sam_hdr_t* hdr, - const std::unordered_map& read_groups, - const std::string& kit_name, - const barcode_kits::KitInfo& kit_info, - const utils::SampleSheet* const sample_sheet) { - // Emit read group headers without a barcode arrangement. - for (const auto& read_group : read_groups) { - const std::string read_group_tags = read_group_to_string(read_group.second); - emit_read_group(hdr, read_group_tags, read_group.first, {}); - } - - emit_read_groups_for_barcode_kit(hdr, read_groups, kit_name, kit_info, sample_sheet); +void add_rg_headers_with_barcode_kit(sam_hdr_t* hdr, + const std::unordered_map& read_groups, + const std::string& kit_name, + const barcode_kits::KitInfo& kit_info, + const utils::SampleSheet* const sample_sheet) { + add_rg_headers(hdr, read_groups); + add_barcode_kit_rg_hdrs(hdr, read_groups, kit_name, kit_info, sample_sheet); } void add_sq_hdr(sam_hdr_t* hdr, const sq_t& seqs) { diff --git a/dorado/utils/bam_utils.h b/dorado/utils/bam_utils.h index 9cf99de4..6fd550af 100644 --- a/dorado/utils/bam_utils.h +++ b/dorado/utils/bam_utils.h @@ -25,17 +25,13 @@ struct AlignmentOps { size_t substitutions; }; -void add_rg_hdr(sam_hdr_t* hdr, - const std::unordered_map& read_groups, - const std::vector& barcode_kits, - const utils::SampleSheet* const sample_sheet); - -void add_rg_hdr_for_custom_barcode_kit( - sam_hdr_t* hdr, - const std::unordered_map& read_groups, - const std::string& kit_name, - const barcode_kits::KitInfo& kit_info, - const utils::SampleSheet* const sample_sheet); +void add_rg_headers(sam_hdr_t* hdr, const std::unordered_map& read_groups); + +void add_rg_headers_with_barcode_kit(sam_hdr_t* hdr, + const std::unordered_map& read_groups, + const std::string& kit_name, + const barcode_kits::KitInfo& kit_info, + const utils::SampleSheet* const sample_sheet); void add_sq_hdr(sam_hdr_t* hdr, const sq_t& seqs); diff --git a/dorado/utils/barcode_kits.cpp b/dorado/utils/barcode_kits.cpp index d7155667..9d40e76e 100644 --- a/dorado/utils/barcode_kits.cpp +++ b/dorado/utils/barcode_kits.cpp @@ -788,6 +788,15 @@ const std::unordered_map barcodes = { const std::unordered_map& get_kit_infos() { return kit_info_map; } +const KitInfo* get_kit_info(const std::string& kit_name) { + const auto& barcode_kit_infos = get_kit_infos(); + auto kit_iter = get_kit_infos().find(kit_name); + if (kit_iter == barcode_kit_infos.end()) { + return nullptr; + } + return &kit_iter->second; +} + const std::unordered_map& get_barcodes() { return barcodes; } const std::unordered_set& get_barcode_identifiers() { diff --git a/dorado/utils/barcode_kits.h b/dorado/utils/barcode_kits.h index 4f7dee9d..1b71f8dd 100644 --- a/dorado/utils/barcode_kits.h +++ b/dorado/utils/barcode_kits.h @@ -20,8 +20,10 @@ struct KitInfo { }; const std::unordered_map& get_kit_infos(); +const KitInfo* get_kit_info(const std::string& kit_name); const std::unordered_map& get_barcodes(); const std::unordered_set& get_barcode_identifiers(); + std::string barcode_kits_list_str(); std::string normalize_barcode_name(const std::string& barcode_name); diff --git a/tests/BamUtilsTest.cpp b/tests/BamUtilsTest.cpp index fb9c5c93..012f8421 100644 --- a/tests/BamUtilsTest.cpp +++ b/tests/BamUtilsTest.cpp @@ -41,7 +41,7 @@ TEST_CASE("BamUtilsTest: fetch keys from PG header", TEST_GROUP) { "5mCG --emit-sam"); } -TEST_CASE("BamUtilsTest: add_rg_hdr read group headers", TEST_GROUP) { +TEST_CASE("BamUtilsTest: Add read group headers scenarios", TEST_GROUP) { auto has_read_group_header = [](sam_hdr_t *ptr, const char *id) { return sam_hdr_line_index(ptr, "RG", id) >= 0; }; @@ -58,7 +58,7 @@ TEST_CASE("BamUtilsTest: add_rg_hdr read group headers", TEST_GROUP) { SECTION("No read groups generate no headers") { dorado::SamHdrPtr sam_header(sam_hdr_init()); CHECK(sam_hdr_count_lines(sam_header.get(), "RG") == 0); - dorado::utils::add_rg_hdr(sam_header.get(), {}, {}, nullptr); + dorado::utils::add_rg_headers(sam_header.get(), {}); CHECK(sam_hdr_count_lines(sam_header.get(), "RG") == 0); } @@ -73,7 +73,7 @@ TEST_CASE("BamUtilsTest: add_rg_hdr read group headers", TEST_GROUP) { SECTION("Read groups") { dorado::SamHdrPtr sam_header(sam_hdr_init()); - dorado::utils::add_rg_hdr(sam_header.get(), read_groups, {}, nullptr); + dorado::utils::add_rg_headers(sam_header.get(), read_groups); // Check the IDs of the groups are all there. CHECK(sam_hdr_count_lines(sam_header.get(), "RG") == int(read_groups.size())); @@ -84,23 +84,15 @@ TEST_CASE("BamUtilsTest: add_rg_hdr read group headers", TEST_GROUP) { } } - // Pick some of the barcode kits (randomly chosen indices). - const auto &kit_infos = dorado::barcode_kits::get_kit_infos(); - const std::vector barcode_kits{ - std::next(kit_infos.begin(), 1)->first, - std::next(kit_infos.begin(), 7)->first, - }; - - SECTION("Read groups with barcodes") { + SECTION("Read groups with barcode kit") { + const std::string KIT_NAME{"SQK-RAB204"}; + auto kit_info = dorado::barcode_kits::get_kit_info(KIT_NAME); dorado::SamHdrPtr sam_header(sam_hdr_init()); - dorado::utils::add_rg_hdr(sam_header.get(), read_groups, barcode_kits, nullptr); + dorado::utils::add_rg_headers_with_barcode_kit(sam_header.get(), read_groups, KIT_NAME, + *kit_info, nullptr); // Check the IDs of the groups are all there. - size_t total_barcodes = 0; - for (const auto &kit_name : barcode_kits) { - total_barcodes += kit_infos.at(kit_name).barcodes.size(); - } - const size_t total_groups = read_groups.size() * (total_barcodes + 1); + const size_t total_groups = read_groups.size() * (kit_info->barcodes.size() + 1); CHECK(sam_hdr_count_lines(sam_header.get(), "RG") == int(total_groups)); // Check that the IDs match the expected format. @@ -110,24 +102,16 @@ TEST_CASE("BamUtilsTest: add_rg_hdr read group headers", TEST_GROUP) { CHECK(get_barcode_tag(sam_header.get(), id.c_str()) == std::nullopt); // The headers with barcodes should contain those barcodes. - for (const auto &kit_name : barcode_kits) { - const auto &kit_info = kit_infos.at(kit_name); - for (const auto &barcode_name : kit_info.barcodes) { - const auto full_id = id + "_" + - dorado::barcode_kits::generate_standard_barcode_name( - kit_name, barcode_name); - const auto &barcode_seq = barcode_seqs.at(barcode_name); - CHECK(has_read_group_header(sam_header.get(), full_id.c_str())); - CHECK(get_barcode_tag(sam_header.get(), full_id.c_str()) == barcode_seq); - } + for (const auto &barcode_name : kit_info->barcodes) { + const auto full_id = id + "_" + + dorado::barcode_kits::generate_standard_barcode_name( + KIT_NAME, barcode_name); + const auto &barcode_seq = barcode_seqs.at(barcode_name); + CHECK(has_read_group_header(sam_header.get(), full_id.c_str())); + CHECK(get_barcode_tag(sam_header.get(), full_id.c_str()) == barcode_seq); } } } - - SECTION("Read groups with unknown barcode kit") { - dorado::SamHdrPtr sam_header(sam_hdr_init()); - CHECK_THROWS(dorado::utils::add_rg_hdr(sam_header.get(), read_groups, {"blah"}, nullptr)); - } } TEST_CASE("BamUtilsTest: Test bam extraction helpers", TEST_GROUP) {