From f37cd6a708876fc7925aefa2c1d435310fc594b2 Mon Sep 17 00:00:00 2001 From: Hugh Pendry Date: Tue, 19 Mar 2024 16:02:25 +0000 Subject: [PATCH 1/5] Refactoring to move parsing custom barcodes kits functionality into utils so that bam_utils can have a dependency on it for creating the read groups --- CMakeLists.txt | 4 +- dorado/demux/AdapterDetector.cpp | 3 +- dorado/demux/BarcodeClassifier.cpp | 9 +- dorado/demux/BarcodeClassifier.h | 4 +- dorado/demux/parse_custom_sequences.cpp | 39 +++++ dorado/demux/parse_custom_sequences.h | 14 ++ dorado/utils/CMakeLists.txt | 4 +- dorado/utils/bam_utils.cpp | 148 +++++++++++-------- dorado/utils/barcode_kits.h | 1 - dorado/{demux => utils}/parse_custom_kit.cpp | 30 +--- dorado/{demux => utils}/parse_custom_kit.h | 7 +- tests/CustomBarcodeParserTest.cpp | 41 ++--- tests/symbol_test.cpp | 8 +- 13 files changed, 189 insertions(+), 123 deletions(-) create mode 100644 dorado/demux/parse_custom_sequences.cpp create mode 100644 dorado/demux/parse_custom_sequences.h rename dorado/{demux => utils}/parse_custom_kit.cpp (86%) rename dorado/{demux => utils}/parse_custom_kit.h (79%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 06c79df3..273fb5ca 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -230,10 +230,10 @@ add_library(dorado_lib dorado/demux/BarcodeClassifier.h dorado/demux/BarcodeClassifierSelector.cpp dorado/demux/BarcodeClassifierSelector.h + dorado/demux/parse_custom_sequences.cpp + dorado/demux/parse_custom_sequences.h dorado/demux/Trimmer.cpp dorado/demux/Trimmer.h - dorado/demux/parse_custom_kit.cpp - dorado/demux/parse_custom_kit.h dorado/poly_tail/dna_poly_tail_calculator.cpp dorado/poly_tail/dna_poly_tail_calculator.h dorado/poly_tail/plasmid_poly_tail_calculator.cpp diff --git a/dorado/demux/AdapterDetector.cpp b/dorado/demux/AdapterDetector.cpp index edb46648..6988e8b4 100644 --- a/dorado/demux/AdapterDetector.cpp +++ b/dorado/demux/AdapterDetector.cpp @@ -1,7 +1,8 @@ #include "AdapterDetector.h" -#include "parse_custom_kit.h" +#include "parse_custom_sequences.h" #include "utils/alignment_utils.h" +#include "utils/parse_custom_kit.h" #include "utils/sequence_utils.h" #include "utils/types.h" diff --git a/dorado/demux/BarcodeClassifier.cpp b/dorado/demux/BarcodeClassifier.cpp index de90ec1a..9882b85e 100644 --- a/dorado/demux/BarcodeClassifier.cpp +++ b/dorado/demux/BarcodeClassifier.cpp @@ -1,8 +1,9 @@ #include "BarcodeClassifier.h" -#include "parse_custom_kit.h" +#include "parse_custom_sequences.h" #include "utils/alignment_utils.h" #include "utils/barcode_kits.h" +#include "utils/parse_custom_kit.h" #include "utils/sequence_utils.h" #include "utils/types.h" @@ -118,7 +119,7 @@ std::unordered_map process_custom_ki const std::optional& custom_kit) { std::unordered_map kit_map; if (custom_kit) { - auto custom_arrangement = demux::parse_custom_arrangement(*custom_kit); + auto custom_arrangement = dorado::barcode_kits::parse_custom_arrangement(*custom_kit); if (custom_arrangement) { const auto& [kit_name, kit_info] = *custom_arrangement; kit_map[kit_name] = kit_info; @@ -158,8 +159,8 @@ BarcodeClassifier::BarcodeClassifier(const std::vector& kit_names, : m_custom_kit(process_custom_kit(custom_kit)), m_custom_seqs(custom_barcodes ? parse_custom_sequences(*custom_barcodes) : std::unordered_map{}), - m_scoring_params(custom_kit ? parse_scoring_params(*custom_kit) - : BarcodeKitScoringParams{}), + m_scoring_params(custom_kit ? barcode_kits::parse_scoring_params(*custom_kit) + : barcode_kits::BarcodeKitScoringParams{}), m_barcode_candidates(generate_candidates(kit_names)) {} BarcodeClassifier::~BarcodeClassifier() = default; diff --git a/dorado/demux/BarcodeClassifier.h b/dorado/demux/BarcodeClassifier.h index 61f7e4df..8494b4f3 100644 --- a/dorado/demux/BarcodeClassifier.h +++ b/dorado/demux/BarcodeClassifier.h @@ -1,6 +1,6 @@ #pragma once -#include "parse_custom_kit.h" #include "utils/barcode_kits.h" +#include "utils/parse_custom_kit.h" #include "utils/stats.h" #include "utils/types.h" @@ -30,7 +30,7 @@ class BarcodeClassifier { private: const std::unordered_map m_custom_kit; const std::unordered_map m_custom_seqs; - const BarcodeKitScoringParams m_scoring_params; + const barcode_kits::BarcodeKitScoringParams m_scoring_params; const std::vector m_barcode_candidates; std::vector generate_candidates(const std::vector& kit_names); diff --git a/dorado/demux/parse_custom_sequences.cpp b/dorado/demux/parse_custom_sequences.cpp new file mode 100644 index 00000000..60ad0d44 --- /dev/null +++ b/dorado/demux/parse_custom_sequences.cpp @@ -0,0 +1,39 @@ +#include "parse_custom_sequences.h" + +#include "utils/bam_utils.h" +#include "utils/barcode_kits.h" + +#include +#include +#include + +#include +#include +#include + +namespace dorado::demux { + +std::unordered_map parse_custom_sequences( + const std::string& sequences_file) { + auto file = hts_open(sequences_file.c_str(), "r"); + BamPtr record; + record.reset(bam_init1()); + + std::unordered_map sequences; + + int sam_ret_val = 0; + while ((sam_ret_val = sam_read1(file, nullptr, record.get())) != -1) { + if (sam_ret_val < -1) { + throw std::runtime_error("Failed to parse custom sequence file " + sequences_file); + } + std::string qname = bam_get_qname(record.get()); + std::string seq = utils::extract_sequence(record.get()); + sequences[qname] = seq; + } + + hts_close(file); + + return sequences; +} + +} // namespace dorado::demux diff --git a/dorado/demux/parse_custom_sequences.h b/dorado/demux/parse_custom_sequences.h new file mode 100644 index 00000000..10e70c20 --- /dev/null +++ b/dorado/demux/parse_custom_sequences.h @@ -0,0 +1,14 @@ +#pragma once + +#include "utils/barcode_kits.h" + +#include +#include +#include + +namespace dorado::demux { + +std::unordered_map parse_custom_sequences( + const std::string& sequences_file); + +} // namespace dorado::demux diff --git a/dorado/utils/CMakeLists.txt b/dorado/utils/CMakeLists.txt index 387ace46..4b2093da 100644 --- a/dorado/utils/CMakeLists.txt +++ b/dorado/utils/CMakeLists.txt @@ -30,6 +30,8 @@ add_library(dorado_utils module_utils.h parameters.cpp parameters.h + parse_custom_kit.cpp + parse_custom_kit.h PostCondition.h SampleSheet.cpp SampleSheet.h @@ -59,7 +61,7 @@ add_library(dorado_utils types.h uuid_utils.cpp uuid_utils.h -) + ) if (DORADO_GPU_BUILD) if(APPLE) diff --git a/dorado/utils/bam_utils.cpp b/dorado/utils/bam_utils.cpp index cbc1bed1..9bb6b972 100644 --- a/dorado/utils/bam_utils.cpp +++ b/dorado/utils/bam_utils.cpp @@ -20,6 +20,8 @@ const char seq_nt16_str[] = "=ACMGRSVTWYHKDBN"; #endif // _WIN32 +namespace dorado::utils { + namespace { // Convert the 4bit encoded sequence in a bam1_t structure @@ -32,9 +34,66 @@ std::string convert_nt16_to_str(uint8_t* bseq, size_t slen) { return seq; } -} // namespace +void emit_read_group(sam_hdr_t* hdr, + const std::string& read_group_line, + const std::string& id, + const std::string& additional_tags) { + auto line = "@RG\tID:" + id + '\t' + read_group_line + additional_tags + '\n'; + sam_hdr_add_lines(hdr, line.c_str(), 0); +} -namespace dorado::utils { +void emit_read_groups_for_barcode_kit(const std::unordered_map& read_groups, + const barcode_kits::KitInfo& kit_info, + const utils::SampleSheet* const sample_sheet) { + const auto& barcode_sequences = barcode_kits::get_barcodes(); + for (const auto& barcode_name : kit_info.barcodes) { + const auto additional_tags = "\tBC:" + barcode_sequences.at(barcode_name); + const auto normalized_barcode_name = barcode_kits::normalize_barcode_name(barcode_name); + for (const auto& read_group : read_groups) { + std::string alias; + auto id = read_group.first + '_'; + if (sample_sheet) { + if (!sample_sheet->barcode_is_permitted(normalized_barcode_name)) { + continue; + } + + alias = sample_sheet->get_alias( + read_group.second.flowcell_id, read_group.second.position_id, + read_group.second.experiment_id, normalized_barcode_name); + } + if (!alias.empty()) { + id += alias; + } else { + id += barcode_kits::generate_standard_barcode_name(kit_name, barcode_name); + } + const std::string read_group_tags = read_group_to_string(read_group.second); + emit_read_group(hdr, read_group_tags, id, additional_tags); + } + } +} + +std::string read_group_to_string(const dorado::ReadGroup& read_group) { + auto value_or_unknown = [](std::string_view s) { return s.empty() ? "unknown" : s; }; + std::ostringstream rg; + { + rg << "PU:" << value_or_unknown(read_group.flowcell_id) << "\t"; + rg << "PM:" << value_or_unknown(read_group.device_id) << "\t"; + rg << "DT:" << value_or_unknown(read_group.exp_start_time) << "\t"; + rg << "PL:" + << "ONT" + << "\t"; + rg << "DS:" + << "basecall_model=" << value_or_unknown(read_group.basecalling_model) + << (read_group.modbase_models.empty() ? "" + : (" modbase_models=" + read_group.modbase_models)) + << " runid=" << value_or_unknown(read_group.run_id) << "\t"; + rg << "LB:" << value_or_unknown(read_group.sample_id) << "\t"; + rg << "SM:" << value_or_unknown(read_group.sample_id); + } + return rg.str(); +} + +} // namespace kstring_t allocate_kstring() { kstring_t str = {0, 0, NULL}; @@ -49,40 +108,10 @@ void add_rg_hdr(sam_hdr_t* hdr, const auto& barcode_kit_infos = barcode_kits::get_kit_infos(); const auto& barcode_sequences = barcode_kits::get_barcodes(); - // Convert a ReadGroup to a string - auto to_string = [](const ReadGroup& read_group) { - // Lambda function to return "unknown" if string is empty - auto value_or_unknown = [](std::string_view s) { return s.empty() ? "unknown" : s; }; - std::ostringstream rg; - { - rg << "PU:" << value_or_unknown(read_group.flowcell_id) << "\t"; - rg << "PM:" << value_or_unknown(read_group.device_id) << "\t"; - rg << "DT:" << value_or_unknown(read_group.exp_start_time) << "\t"; - rg << "PL:" - << "ONT" - << "\t"; - rg << "DS:" - << "basecall_model=" << value_or_unknown(read_group.basecalling_model) - << (read_group.modbase_models.empty() - ? "" - : (" modbase_models=" + read_group.modbase_models)) - << " runid=" << value_or_unknown(read_group.run_id) << "\t"; - rg << "LB:" << value_or_unknown(read_group.sample_id) << "\t"; - rg << "SM:" << value_or_unknown(read_group.sample_id); - } - return std::move(rg).str(); - }; - - auto emit_read_group = [hdr](const std::string& read_group_line, const std::string& id, - const std::string& additional_tags) { - auto line = "@RG\tID:" + id + '\t' + read_group_line + additional_tags + '\n'; - sam_hdr_add_lines(hdr, line.c_str(), 0); - }; - // Emit read group headers without a barcode arrangement. for (const auto& read_group : read_groups) { - const std::string read_group_tags = to_string(read_group.second); - emit_read_group(read_group_tags, read_group.first, {}); + const std::string read_group_tags = read_group_to_string(read_group.second); + emit_read_group(hdr, read_group_tags, read_group.first, {}); } // Emit read group headers for each barcode arrangement. @@ -93,31 +122,32 @@ void add_rg_hdr(sam_hdr_t* hdr, " is not a valid barcode kit name. Please run the help " "command to find out available barcode kits."); } - const auto& kit_info = kit_iter->second; - for (const auto& barcode_name : kit_info.barcodes) { - const auto additional_tags = "\tBC:" + barcode_sequences.at(barcode_name); - const auto normalized_barcode_name = barcode_kits::normalize_barcode_name(barcode_name); - for (const auto& read_group : read_groups) { - std::string alias; - auto id = read_group.first + '_'; - if (sample_sheet) { - if (!sample_sheet->barcode_is_permitted(normalized_barcode_name)) { - continue; - } - - alias = sample_sheet->get_alias( - read_group.second.flowcell_id, read_group.second.position_id, - read_group.second.experiment_id, normalized_barcode_name); - } - if (!alias.empty()) { - id += alias; - } else { - id += barcode_kits::generate_standard_barcode_name(kit_name, barcode_name); - } - const std::string read_group_tags = to_string(read_group.second); - emit_read_group(read_group_tags, id, additional_tags); - } - } + emit_read_groups_for_barcode_kit(read_groups, kit_iter->second, sample_sheet); + //const auto& kit_info = kit_iter->second; + //for (const auto& barcode_name : kit_info.barcodes) { + // const auto additional_tags = "\tBC:" + barcode_sequences.at(barcode_name); + // const auto normalized_barcode_name = barcode_kits::normalize_barcode_name(barcode_name); + // for (const auto& read_group : read_groups) { + // std::string alias; + // auto id = read_group.first + '_'; + // if (sample_sheet) { + // if (!sample_sheet->barcode_is_permitted(normalized_barcode_name)) { + // continue; + // } + + // alias = sample_sheet->get_alias( + // read_group.second.flowcell_id, read_group.second.position_id, + // read_group.second.experiment_id, normalized_barcode_name); + // } + // if (!alias.empty()) { + // id += alias; + // } else { + // id += barcode_kits::generate_standard_barcode_name(kit_name, barcode_name); + // } + // const std::string read_group_tags = read_group_to_string(read_group.second); + // emit_read_group(hdr, read_group_tags, id, additional_tags); + // } + //} } } diff --git a/dorado/utils/barcode_kits.h b/dorado/utils/barcode_kits.h index a48a3907..4f7dee9d 100644 --- a/dorado/utils/barcode_kits.h +++ b/dorado/utils/barcode_kits.h @@ -27,5 +27,4 @@ std::string barcode_kits_list_str(); std::string normalize_barcode_name(const std::string& barcode_name); std::string generate_standard_barcode_name(const std::string& kit_name, const std::string& barcode_name); - } // namespace dorado::barcode_kits diff --git a/dorado/demux/parse_custom_kit.cpp b/dorado/utils/parse_custom_kit.cpp similarity index 86% rename from dorado/demux/parse_custom_kit.cpp rename to dorado/utils/parse_custom_kit.cpp index e8061523..d143d4a6 100644 --- a/dorado/demux/parse_custom_kit.cpp +++ b/dorado/utils/parse_custom_kit.cpp @@ -1,6 +1,7 @@ #include "parse_custom_kit.h" -#include "utils/bam_utils.h" +//#include "utils/bam_utils.h" +//#include "utils/types.h" #include "utils/barcode_kits.h" #include @@ -11,7 +12,7 @@ #include #include -namespace dorado::demux { +namespace dorado::barcode_kits { bool check_normalized_id_pattern(const std::string& pattern) { auto modulo_pos = pattern.find_first_of("%"); @@ -138,27 +139,4 @@ BarcodeKitScoringParams parse_scoring_params(const std::string& arrangement_file return params; } -std::unordered_map parse_custom_sequences( - const std::string& sequences_file) { - auto file = hts_open(sequences_file.c_str(), "r"); - BamPtr record; - record.reset(bam_init1()); - - std::unordered_map sequences; - - int sam_ret_val = 0; - while ((sam_ret_val = sam_read1(file, nullptr, record.get())) != -1) { - if (sam_ret_val < -1) { - throw std::runtime_error("Failed to parse custom sequence file " + sequences_file); - } - std::string qname = bam_get_qname(record.get()); - std::string seq = utils::extract_sequence(record.get()); - sequences[qname] = seq; - } - - hts_close(file); - - return sequences; -} - -} // namespace dorado::demux +} // namespace dorado::barcode_kits diff --git a/dorado/demux/parse_custom_kit.h b/dorado/utils/parse_custom_kit.h similarity index 79% rename from dorado/demux/parse_custom_kit.h rename to dorado/utils/parse_custom_kit.h index 63fd2199..af79cdb1 100644 --- a/dorado/demux/parse_custom_kit.h +++ b/dorado/utils/parse_custom_kit.h @@ -6,7 +6,7 @@ #include #include -namespace dorado::demux { +namespace dorado::barcode_kits { struct BarcodeKitScoringParams { float min_soft_barcode_threshold = 0.7f; @@ -19,11 +19,8 @@ struct BarcodeKitScoringParams { std::optional> parse_custom_arrangement( const std::string& arrangement_file); -std::unordered_map parse_custom_sequences( - const std::string& sequences_file); - BarcodeKitScoringParams parse_scoring_params(const std::string& arrangement_file); bool check_normalized_id_pattern(const std::string& pattern); -} // namespace dorado::demux +} // namespace dorado::barcode_kits diff --git a/tests/CustomBarcodeParserTest.cpp b/tests/CustomBarcodeParserTest.cpp index 82e92d7c..43af01d0 100644 --- a/tests/CustomBarcodeParserTest.cpp +++ b/tests/CustomBarcodeParserTest.cpp @@ -1,5 +1,6 @@ #include "TestUtils.h" -#include "demux/parse_custom_kit.h" +#include "demux/parse_custom_sequences.h" +#include "utils/parse_custom_kit.h" #include @@ -9,7 +10,7 @@ TEST_CASE("Parse custom single ended barcode arrangement", "[barcode_demux]") { fs::path data_dir = fs::path(get_data_dir("barcode_demux/custom_barcodes")); const auto test_file = data_dir / "test_kit_single_ended.toml"; - auto [kit_name, kit_info] = *dorado::demux::parse_custom_arrangement(test_file.string()); + auto [kit_name, kit_info] = *dorado::barcode_kits::parse_custom_arrangement(test_file.string()); CHECK(kit_name == "test_kit_single_ended"); CHECK(kit_info.barcodes.size() == 4); @@ -29,7 +30,7 @@ TEST_CASE("Parse double ended barcode arrangement", "[barcode_demux]") { fs::path data_dir = fs::path(get_data_dir("barcode_demux/custom_barcodes")); const auto test_file = data_dir / "test_kit_double_ended.toml"; - auto [kit_name, kit_info] = *dorado::demux::parse_custom_arrangement(test_file.string()); + auto [kit_name, kit_info] = *dorado::barcode_kits::parse_custom_arrangement(test_file.string()); CHECK(kit_name == "test_kit_double_ended"); CHECK(kit_info.barcodes.size() == 24); @@ -49,7 +50,7 @@ TEST_CASE("Parse double ended barcode arrangement with different flanks", "[barc fs::path data_dir = fs::path(get_data_dir("barcode_demux/custom_barcodes")); const auto test_file = data_dir / "test_kit_ends_different_flanks.toml"; - auto [kit_name, kit_info] = *dorado::demux::parse_custom_arrangement(test_file.string()); + auto [kit_name, kit_info] = *dorado::barcode_kits::parse_custom_arrangement(test_file.string()); CHECK(kit_name == "test_kit_ends_different_flanks"); CHECK(kit_info.barcodes.size() == 96); @@ -69,7 +70,7 @@ TEST_CASE("Parse double ended barcode arrangement with different barcodes", "[ba fs::path data_dir = fs::path(get_data_dir("barcode_demux/custom_barcodes")); const auto test_file = data_dir / "test_kit_ends_different_barcodes.toml"; - auto [kit_name, kit_info] = *dorado::demux::parse_custom_arrangement(test_file.string()); + auto [kit_name, kit_info] = *dorado::barcode_kits::parse_custom_arrangement(test_file.string()); CHECK(kit_name == "test_kit_ends_different_barcodes"); CHECK(kit_info.barcodes.size() == 12); @@ -89,14 +90,14 @@ TEST_CASE("Parse kit with bad indices", "[barcode_demux]") { fs::path data_dir = fs::path(get_data_dir("barcode_demux/custom_barcodes")); const auto test_file = data_dir / "bad_double_ended_kit.toml"; - CHECK_THROWS(dorado::demux::parse_custom_arrangement(test_file.string())); + CHECK_THROWS(dorado::barcode_kits::parse_custom_arrangement(test_file.string())); } TEST_CASE("Parse kit with incomplete double ended settings", "[barcode_demux]") { fs::path data_dir = fs::path(get_data_dir("barcode_demux/custom_barcodes")); const auto test_file = data_dir / "bad_double_ended_kit_not_all_params_set.toml"; - CHECK_THROWS_WITH(dorado::demux::parse_custom_arrangement(test_file.string()), + CHECK_THROWS_WITH(dorado::barcode_kits::parse_custom_arrangement(test_file.string()), Catch::Matchers::Contains( "mask2_front mask2_rear and barcode2_pattern must all be set")); } @@ -118,7 +119,7 @@ TEST_CASE("Parse custom barcode scoring params", "[barcode_demux]") { fs::path data_dir = fs::path(get_data_dir("barcode_demux/custom_barcodes")); const auto test_params_file = data_dir / "scoring_params.toml"; - auto scoring_params = dorado::demux::parse_scoring_params(test_params_file.string()); + auto scoring_params = dorado::barcode_kits::parse_scoring_params(test_params_file.string()); CHECK(scoring_params.min_soft_barcode_threshold == Approx(0.1f)); CHECK(scoring_params.min_hard_barcode_threshold == Approx(0.2f)); @@ -131,9 +132,9 @@ TEST_CASE("Parse default scoring params", "[barcode_demux]") { fs::path data_dir = fs::path(get_data_dir("barcode_demux/custom_barcodes")); const auto test_params_file = data_dir / "test_kit_single_ended.toml"; - dorado::demux::BarcodeKitScoringParams default_params; + dorado::barcode_kits::BarcodeKitScoringParams default_params; - auto scoring_params = dorado::demux::parse_scoring_params(test_params_file.string()); + auto scoring_params = dorado::barcode_kits::parse_scoring_params(test_params_file.string()); CHECK(scoring_params.min_soft_barcode_threshold == Approx(default_params.min_soft_barcode_threshold)); @@ -147,14 +148,14 @@ TEST_CASE("Parse default scoring params", "[barcode_demux]") { } TEST_CASE("Check for normalized id pattern", "[barcode_demux]") { - CHECK(dorado::demux::check_normalized_id_pattern("BC%02i")); - CHECK(dorado::demux::check_normalized_id_pattern("abcd%25i")); - CHECK(dorado::demux::check_normalized_id_pattern("%2i")); - - CHECK_FALSE(dorado::demux::check_normalized_id_pattern("ab")); - CHECK_FALSE(dorado::demux::check_normalized_id_pattern("ab%")); - CHECK_FALSE(dorado::demux::check_normalized_id_pattern("ab%d")); - CHECK_FALSE(dorado::demux::check_normalized_id_pattern("ab%02")); - CHECK_FALSE(dorado::demux::check_normalized_id_pattern("ab%02f")); - CHECK_FALSE(dorado::demux::check_normalized_id_pattern("ab%02iab")); + CHECK(dorado::barcode_kits::check_normalized_id_pattern("BC%02i")); + CHECK(dorado::barcode_kits::check_normalized_id_pattern("abcd%25i")); + CHECK(dorado::barcode_kits::check_normalized_id_pattern("%2i")); + + CHECK_FALSE(dorado::barcode_kits::check_normalized_id_pattern("ab")); + CHECK_FALSE(dorado::barcode_kits::check_normalized_id_pattern("ab%")); + CHECK_FALSE(dorado::barcode_kits::check_normalized_id_pattern("ab%d")); + CHECK_FALSE(dorado::barcode_kits::check_normalized_id_pattern("ab%02")); + CHECK_FALSE(dorado::barcode_kits::check_normalized_id_pattern("ab%02f")); + CHECK_FALSE(dorado::barcode_kits::check_normalized_id_pattern("ab%02iab")); } diff --git a/tests/symbol_test.cpp b/tests/symbol_test.cpp index 0e1726a0..5c7ae870 100644 --- a/tests/symbol_test.cpp +++ b/tests/symbol_test.cpp @@ -8,6 +8,7 @@ #include "basecall/CRFModelConfig.h" #include "basecall/ModelRunner.h" #include "demux/parse_custom_kit.h" +#include "demux/parse_custom_sequences.h" #include "modbase/ModBaseModelConfig.h" #include "modbase/ModBaseRunner.h" #include "models/models.h" @@ -81,8 +82,8 @@ void reference_all_public_functions() { force_reference(&dorado::basecall::load_crf_model_config); // basecall/ModelRunner.h force_reference(&dorado::basecall::ModelRunner::accept_chunk); - // demux/parse_custom_kit.h - force_reference(&dorado::demux::parse_custom_arrangement); + // demux/parse_custom_sequences.h + force_reference(&dorado::demux::parse_custom_sequences); // modbase/ModBaseModelConfig.h force_reference(&dorado::modbase::load_modbase_model_config); // modbase/ModBaseRunner.h @@ -107,6 +108,9 @@ void reference_all_public_functions() { force_reference(&dorado::utils::shallow_copy_read); // utils/barcode_kits.h force_reference(&dorado::barcode_kits::get_kit_infos); + // utils/parse_custom_kit.h + force_reference(&dorado::barcode_kits::parse_custom_arrangement); + #if DORADO_CUDA_BUILD // utils/cuda_utils.h force_reference(&dorado::utils::acquire_gpu_lock); From 806ad9b1e61675ba7639f9b5b0f739cfec6e0409 Mon Sep 17 00:00:00 2001 From: Hugh Pendry Date: Wed, 20 Mar 2024 12:00:40 +0000 Subject: [PATCH 2/5] Add barcode arrangements RG header generation when using a custom kit --- dorado/cli/basecaller.cpp | 16 ++++++- dorado/utils/bam_utils.cpp | 93 +++++++++++++++++--------------------- dorado/utils/bam_utils.h | 8 ++++ 3 files changed, 65 insertions(+), 52 deletions(-) diff --git a/dorado/cli/basecaller.cpp b/dorado/cli/basecaller.cpp index 0c50762c..f7d85aa2 100644 --- a/dorado/cli/basecaller.cpp +++ b/dorado/cli/basecaller.cpp @@ -24,6 +24,7 @@ #include "utils/fs_utils.h" #include "utils/log_utils.h" #include "utils/parameters.h" +#include "utils/parse_custom_kit.h" #include "utils/stats.h" #include "utils/string_utils.h" #include "utils/sys_stats.h" @@ -149,7 +150,18 @@ void setup(std::vector args, SamHdrPtr hdr(sam_hdr_init()); cli::add_pg_hdr(hdr.get(), args); - utils::add_rg_hdr(hdr.get(), read_groups, barcode_kits, sample_sheet.get()); + if (custom_kit) { + auto custom_kit_info = barcode_kits::parse_custom_arrangement(*custom_kit); + if (!custom_kit_info) { + spdlog::error("Unable to load custom barcode arrangement file: {}", *custom_kit); + std::exit(EXIT_FAILURE); + } + auto [kit_name, kit_info] = *custom_kit_info; + utils::add_rg_hdr_for_custom_barcode_kit(hdr.get(), read_groups, kit_name, kit_info, + sample_sheet.get()); + } else { + utils::add_rg_hdr(hdr.get(), read_groups, barcode_kits, sample_sheet.get()); + } utils::HtsFile hts_file("-", output_mode, thread_allocations.writer_threads); @@ -606,6 +618,7 @@ int basecaller(int argc, char* argv[]) { spdlog::info("> Creating basecall pipeline"); try { + /* clang format off */ setup(args, model_path, data, mods_model_paths, parser.visible.get("-x"), parser.visible.get("--reference"), parser.visible.get("-c"), parser.visible.get("-o"), parser.visible.get("-b"), @@ -625,6 +638,7 @@ int basecaller(int argc, char* argv[]) { std::move(custom_kit), std::move(custom_barcode_seqs), std::move(custom_primer_file), resume_parser, parser.visible.get("--estimate-poly-a"), polya_config.empty() ? nullptr : &polya_config, model_selection); + /* clang format on */ } catch (const std::exception& e) { spdlog::error("{}", e.what()); utils::clean_temporary_models(temp_download_paths); diff --git a/dorado/utils/bam_utils.cpp b/dorado/utils/bam_utils.cpp index 9bb6b972..d0abac2a 100644 --- a/dorado/utils/bam_utils.cpp +++ b/dorado/utils/bam_utils.cpp @@ -42,7 +42,30 @@ void emit_read_group(sam_hdr_t* hdr, sam_hdr_add_lines(hdr, line.c_str(), 0); } -void emit_read_groups_for_barcode_kit(const std::unordered_map& read_groups, +std::string read_group_to_string(const dorado::ReadGroup& read_group) { + auto value_or_unknown = [](std::string_view s) { return s.empty() ? "unknown" : s; }; + std::ostringstream rg; + { + rg << "PU:" << value_or_unknown(read_group.flowcell_id) << "\t"; + rg << "PM:" << value_or_unknown(read_group.device_id) << "\t"; + rg << "DT:" << value_or_unknown(read_group.exp_start_time) << "\t"; + rg << "PL:" + << "ONT" + << "\t"; + rg << "DS:" + << "basecall_model=" << value_or_unknown(read_group.basecalling_model) + << (read_group.modbase_models.empty() ? "" + : (" modbase_models=" + read_group.modbase_models)) + << " runid=" << value_or_unknown(read_group.run_id) << "\t"; + rg << "LB:" << value_or_unknown(read_group.sample_id) << "\t"; + rg << "SM:" << value_or_unknown(read_group.sample_id); + } + return rg.str(); +} + +void emit_read_groups_for_barcode_kit(sam_hdr_t* hdr, + const std::unordered_map& read_groups, + const std::string& kit_name, const barcode_kits::KitInfo& kit_info, const utils::SampleSheet* const sample_sheet) { const auto& barcode_sequences = barcode_kits::get_barcodes(); @@ -72,27 +95,6 @@ void emit_read_groups_for_barcode_kit(const std::unordered_map& read_groups, const std::vector& barcode_kits, const utils::SampleSheet* const sample_sheet) { - const auto& barcode_kit_infos = barcode_kits::get_kit_infos(); - const auto& barcode_sequences = barcode_kits::get_barcodes(); - // Emit read group headers without a barcode arrangement. for (const auto& read_group : read_groups) { const std::string read_group_tags = read_group_to_string(read_group.second); @@ -115,6 +114,7 @@ void add_rg_hdr(sam_hdr_t* hdr, } // Emit read group headers for each barcode arrangement. + const auto& barcode_kit_infos = barcode_kits::get_kit_infos(); for (const auto& kit_name : barcode_kits) { auto kit_iter = barcode_kit_infos.find(kit_name); if (kit_iter == barcode_kit_infos.end()) { @@ -122,35 +122,26 @@ void add_rg_hdr(sam_hdr_t* hdr, " is not a valid barcode kit name. Please run the help " "command to find out available barcode kits."); } - emit_read_groups_for_barcode_kit(read_groups, kit_iter->second, sample_sheet); - //const auto& kit_info = kit_iter->second; - //for (const auto& barcode_name : kit_info.barcodes) { - // const auto additional_tags = "\tBC:" + barcode_sequences.at(barcode_name); - // const auto normalized_barcode_name = barcode_kits::normalize_barcode_name(barcode_name); - // for (const auto& read_group : read_groups) { - // std::string alias; - // auto id = read_group.first + '_'; - // if (sample_sheet) { - // if (!sample_sheet->barcode_is_permitted(normalized_barcode_name)) { - // continue; - // } - - // alias = sample_sheet->get_alias( - // read_group.second.flowcell_id, read_group.second.position_id, - // read_group.second.experiment_id, normalized_barcode_name); - // } - // if (!alias.empty()) { - // id += alias; - // } else { - // id += barcode_kits::generate_standard_barcode_name(kit_name, barcode_name); - // } - // const std::string read_group_tags = read_group_to_string(read_group.second); - // emit_read_group(hdr, read_group_tags, id, additional_tags); - // } - //} + emit_read_groups_for_barcode_kit(hdr, read_groups, kit_name, kit_iter->second, + sample_sheet); } } +void add_rg_hdr_for_custom_barcode_kit( + sam_hdr_t* hdr, + const std::unordered_map& read_groups, + const std::string& kit_name, + const barcode_kits::KitInfo& kit_info, + const utils::SampleSheet* const sample_sheet) { + // Emit read group headers without a barcode arrangement. + for (const auto& read_group : read_groups) { + const std::string read_group_tags = read_group_to_string(read_group.second); + emit_read_group(hdr, read_group_tags, read_group.first, {}); + } + + emit_read_groups_for_barcode_kit(hdr, read_groups, kit_name, kit_info, sample_sheet); +} + void add_sq_hdr(sam_hdr_t* hdr, const sq_t& seqs) { for (auto pair : seqs) { char* name; diff --git a/dorado/utils/bam_utils.h b/dorado/utils/bam_utils.h index 4f532b0d..9cf99de4 100644 --- a/dorado/utils/bam_utils.h +++ b/dorado/utils/bam_utils.h @@ -1,4 +1,5 @@ #pragma once +#include "barcode_kits.h" #include "types.h" #include @@ -29,6 +30,13 @@ void add_rg_hdr(sam_hdr_t* hdr, const std::vector& barcode_kits, const utils::SampleSheet* const sample_sheet); +void add_rg_hdr_for_custom_barcode_kit( + sam_hdr_t* hdr, + const std::unordered_map& read_groups, + const std::string& kit_name, + const barcode_kits::KitInfo& kit_info, + const utils::SampleSheet* const sample_sheet); + void add_sq_hdr(sam_hdr_t* hdr, const sq_t& seqs); void strip_sq_hdr(sam_hdr_t* hdr); From 3f2bc4105beba95b581966ff78e71aad45be2498 Mon Sep 17 00:00:00 2001 From: Hugh Pendry Date: Wed, 20 Mar 2024 17:41:58 +0000 Subject: [PATCH 3/5] Refactored adding read group headers separating custom from default kits to avoid a clunky api --- dorado/cli/basecaller.cpp | 40 +++++++++++++++++++++------- dorado/cli/duplex.cpp | 3 +-- dorado/utils/bam_utils.cpp | 49 ++++++++++------------------------- dorado/utils/bam_utils.h | 18 +++++-------- dorado/utils/barcode_kits.cpp | 9 +++++++ dorado/utils/barcode_kits.h | 2 ++ tests/BamUtilsTest.cpp | 48 ++++++++++++---------------------- 7 files changed, 79 insertions(+), 90 deletions(-) diff --git a/dorado/cli/basecaller.cpp b/dorado/cli/basecaller.cpp index f7d85aa2..0b79fddd 100644 --- a/dorado/cli/basecaller.cpp +++ b/dorado/cli/basecaller.cpp @@ -48,6 +48,31 @@ namespace dorado { +namespace { + +const barcode_kits::KitInfo& get_barcode_kit_info(const std::string& kit_name) { + const auto kit_info = barcode_kits::get_kit_info(kit_name); + if (!kit_info) { + spdlog::error( + "{} is not a valid barcode kit name. Please run the help " + "command to find out available barcode kits."); + std::exit(EXIT_FAILURE); + } + return *kit_info; +} + +std::pair get_custom_barcode_kit_info( + const std::string& custom_kit_file) { + auto custom_kit_info = barcode_kits::parse_custom_arrangement(custom_kit_file); + if (!custom_kit_info) { + spdlog::error("Unable to load custom barcode arrangement file: {}", custom_kit_file); + std::exit(EXIT_FAILURE); + } + return *custom_kit_info; +} + +} // namespace + using dorado::utils::default_parameters; using OutputMode = dorado::utils::HtsFile::OutputMode; using namespace std::chrono_literals; @@ -151,16 +176,13 @@ void setup(std::vector args, SamHdrPtr hdr(sam_hdr_init()); cli::add_pg_hdr(hdr.get(), args); if (custom_kit) { - auto custom_kit_info = barcode_kits::parse_custom_arrangement(*custom_kit); - if (!custom_kit_info) { - spdlog::error("Unable to load custom barcode arrangement file: {}", *custom_kit); - std::exit(EXIT_FAILURE); - } - auto [kit_name, kit_info] = *custom_kit_info; - utils::add_rg_hdr_for_custom_barcode_kit(hdr.get(), read_groups, kit_name, kit_info, - sample_sheet.get()); + auto [kit_name, kit_info] = get_custom_barcode_kit_info(*custom_kit); + utils::add_rg_headers_with_barcode_kit(hdr.get(), read_groups, kit_name, kit_info, + sample_sheet.get()); } else { - utils::add_rg_hdr(hdr.get(), read_groups, barcode_kits, sample_sheet.get()); + const auto kit_info = get_barcode_kit_info(barcode_kits[0]); + utils::add_rg_headers_with_barcode_kit(hdr.get(), read_groups, barcode_kits[0], kit_info, + sample_sheet.get()); } utils::HtsFile hts_file("-", output_mode, thread_allocations.writer_threads); diff --git a/dorado/cli/duplex.cpp b/dorado/cli/duplex.cpp index 13e90bb7..00988de1 100644 --- a/dorado/cli/duplex.cpp +++ b/dorado/cli/duplex.cpp @@ -462,8 +462,7 @@ int duplex(int argc, char* argv[]) { recursive_file_loading); read_groups.merge(DataLoader::load_read_groups(reads, duplex_rg_name, "", recursive_file_loading)); - std::vector barcode_kits; - utils::add_rg_hdr(hdr.get(), read_groups, barcode_kits, nullptr); + utils::add_rg_headers(hdr.get(), read_groups); int batch_size(parser.visible.get("-b")); int chunk_size(parser.visible.get("-c")); diff --git a/dorado/utils/bam_utils.cpp b/dorado/utils/bam_utils.cpp index d0abac2a..9694522a 100644 --- a/dorado/utils/bam_utils.cpp +++ b/dorado/utils/bam_utils.cpp @@ -63,11 +63,11 @@ std::string read_group_to_string(const dorado::ReadGroup& read_group) { return rg.str(); } -void emit_read_groups_for_barcode_kit(sam_hdr_t* hdr, - const std::unordered_map& read_groups, - const std::string& kit_name, - const barcode_kits::KitInfo& kit_info, - const utils::SampleSheet* const sample_sheet) { +void add_barcode_kit_rg_hdrs(sam_hdr_t* hdr, + const std::unordered_map& read_groups, + const std::string& kit_name, + const barcode_kits::KitInfo& kit_info, + const utils::SampleSheet* const sample_sheet) { const auto& barcode_sequences = barcode_kits::get_barcodes(); for (const auto& barcode_name : kit_info.barcodes) { const auto additional_tags = "\tBC:" + barcode_sequences.at(barcode_name); @@ -103,43 +103,20 @@ kstring_t allocate_kstring() { return str; } -void add_rg_hdr(sam_hdr_t* hdr, - const std::unordered_map& read_groups, - const std::vector& barcode_kits, - const utils::SampleSheet* const sample_sheet) { - // Emit read group headers without a barcode arrangement. +void add_rg_headers(sam_hdr_t* hdr, const std::unordered_map& read_groups) { for (const auto& read_group : read_groups) { const std::string read_group_tags = read_group_to_string(read_group.second); emit_read_group(hdr, read_group_tags, read_group.first, {}); } - - // Emit read group headers for each barcode arrangement. - const auto& barcode_kit_infos = barcode_kits::get_kit_infos(); - for (const auto& kit_name : barcode_kits) { - auto kit_iter = barcode_kit_infos.find(kit_name); - if (kit_iter == barcode_kit_infos.end()) { - throw std::runtime_error(kit_name + - " is not a valid barcode kit name. Please run the help " - "command to find out available barcode kits."); - } - emit_read_groups_for_barcode_kit(hdr, read_groups, kit_name, kit_iter->second, - sample_sheet); - } } -void add_rg_hdr_for_custom_barcode_kit( - sam_hdr_t* hdr, - const std::unordered_map& read_groups, - const std::string& kit_name, - const barcode_kits::KitInfo& kit_info, - const utils::SampleSheet* const sample_sheet) { - // Emit read group headers without a barcode arrangement. - for (const auto& read_group : read_groups) { - const std::string read_group_tags = read_group_to_string(read_group.second); - emit_read_group(hdr, read_group_tags, read_group.first, {}); - } - - emit_read_groups_for_barcode_kit(hdr, read_groups, kit_name, kit_info, sample_sheet); +void add_rg_headers_with_barcode_kit(sam_hdr_t* hdr, + const std::unordered_map& read_groups, + const std::string& kit_name, + const barcode_kits::KitInfo& kit_info, + const utils::SampleSheet* const sample_sheet) { + add_rg_headers(hdr, read_groups); + add_barcode_kit_rg_hdrs(hdr, read_groups, kit_name, kit_info, sample_sheet); } void add_sq_hdr(sam_hdr_t* hdr, const sq_t& seqs) { diff --git a/dorado/utils/bam_utils.h b/dorado/utils/bam_utils.h index 9cf99de4..6fd550af 100644 --- a/dorado/utils/bam_utils.h +++ b/dorado/utils/bam_utils.h @@ -25,17 +25,13 @@ struct AlignmentOps { size_t substitutions; }; -void add_rg_hdr(sam_hdr_t* hdr, - const std::unordered_map& read_groups, - const std::vector& barcode_kits, - const utils::SampleSheet* const sample_sheet); - -void add_rg_hdr_for_custom_barcode_kit( - sam_hdr_t* hdr, - const std::unordered_map& read_groups, - const std::string& kit_name, - const barcode_kits::KitInfo& kit_info, - const utils::SampleSheet* const sample_sheet); +void add_rg_headers(sam_hdr_t* hdr, const std::unordered_map& read_groups); + +void add_rg_headers_with_barcode_kit(sam_hdr_t* hdr, + const std::unordered_map& read_groups, + const std::string& kit_name, + const barcode_kits::KitInfo& kit_info, + const utils::SampleSheet* const sample_sheet); void add_sq_hdr(sam_hdr_t* hdr, const sq_t& seqs); diff --git a/dorado/utils/barcode_kits.cpp b/dorado/utils/barcode_kits.cpp index d7155667..9d40e76e 100644 --- a/dorado/utils/barcode_kits.cpp +++ b/dorado/utils/barcode_kits.cpp @@ -788,6 +788,15 @@ const std::unordered_map barcodes = { const std::unordered_map& get_kit_infos() { return kit_info_map; } +const KitInfo* get_kit_info(const std::string& kit_name) { + const auto& barcode_kit_infos = get_kit_infos(); + auto kit_iter = get_kit_infos().find(kit_name); + if (kit_iter == barcode_kit_infos.end()) { + return nullptr; + } + return &kit_iter->second; +} + const std::unordered_map& get_barcodes() { return barcodes; } const std::unordered_set& get_barcode_identifiers() { diff --git a/dorado/utils/barcode_kits.h b/dorado/utils/barcode_kits.h index 4f7dee9d..1b71f8dd 100644 --- a/dorado/utils/barcode_kits.h +++ b/dorado/utils/barcode_kits.h @@ -20,8 +20,10 @@ struct KitInfo { }; const std::unordered_map& get_kit_infos(); +const KitInfo* get_kit_info(const std::string& kit_name); const std::unordered_map& get_barcodes(); const std::unordered_set& get_barcode_identifiers(); + std::string barcode_kits_list_str(); std::string normalize_barcode_name(const std::string& barcode_name); diff --git a/tests/BamUtilsTest.cpp b/tests/BamUtilsTest.cpp index fb9c5c93..012f8421 100644 --- a/tests/BamUtilsTest.cpp +++ b/tests/BamUtilsTest.cpp @@ -41,7 +41,7 @@ TEST_CASE("BamUtilsTest: fetch keys from PG header", TEST_GROUP) { "5mCG --emit-sam"); } -TEST_CASE("BamUtilsTest: add_rg_hdr read group headers", TEST_GROUP) { +TEST_CASE("BamUtilsTest: Add read group headers scenarios", TEST_GROUP) { auto has_read_group_header = [](sam_hdr_t *ptr, const char *id) { return sam_hdr_line_index(ptr, "RG", id) >= 0; }; @@ -58,7 +58,7 @@ TEST_CASE("BamUtilsTest: add_rg_hdr read group headers", TEST_GROUP) { SECTION("No read groups generate no headers") { dorado::SamHdrPtr sam_header(sam_hdr_init()); CHECK(sam_hdr_count_lines(sam_header.get(), "RG") == 0); - dorado::utils::add_rg_hdr(sam_header.get(), {}, {}, nullptr); + dorado::utils::add_rg_headers(sam_header.get(), {}); CHECK(sam_hdr_count_lines(sam_header.get(), "RG") == 0); } @@ -73,7 +73,7 @@ TEST_CASE("BamUtilsTest: add_rg_hdr read group headers", TEST_GROUP) { SECTION("Read groups") { dorado::SamHdrPtr sam_header(sam_hdr_init()); - dorado::utils::add_rg_hdr(sam_header.get(), read_groups, {}, nullptr); + dorado::utils::add_rg_headers(sam_header.get(), read_groups); // Check the IDs of the groups are all there. CHECK(sam_hdr_count_lines(sam_header.get(), "RG") == int(read_groups.size())); @@ -84,23 +84,15 @@ TEST_CASE("BamUtilsTest: add_rg_hdr read group headers", TEST_GROUP) { } } - // Pick some of the barcode kits (randomly chosen indices). - const auto &kit_infos = dorado::barcode_kits::get_kit_infos(); - const std::vector barcode_kits{ - std::next(kit_infos.begin(), 1)->first, - std::next(kit_infos.begin(), 7)->first, - }; - - SECTION("Read groups with barcodes") { + SECTION("Read groups with barcode kit") { + const std::string KIT_NAME{"SQK-RAB204"}; + auto kit_info = dorado::barcode_kits::get_kit_info(KIT_NAME); dorado::SamHdrPtr sam_header(sam_hdr_init()); - dorado::utils::add_rg_hdr(sam_header.get(), read_groups, barcode_kits, nullptr); + dorado::utils::add_rg_headers_with_barcode_kit(sam_header.get(), read_groups, KIT_NAME, + *kit_info, nullptr); // Check the IDs of the groups are all there. - size_t total_barcodes = 0; - for (const auto &kit_name : barcode_kits) { - total_barcodes += kit_infos.at(kit_name).barcodes.size(); - } - const size_t total_groups = read_groups.size() * (total_barcodes + 1); + const size_t total_groups = read_groups.size() * (kit_info->barcodes.size() + 1); CHECK(sam_hdr_count_lines(sam_header.get(), "RG") == int(total_groups)); // Check that the IDs match the expected format. @@ -110,24 +102,16 @@ TEST_CASE("BamUtilsTest: add_rg_hdr read group headers", TEST_GROUP) { CHECK(get_barcode_tag(sam_header.get(), id.c_str()) == std::nullopt); // The headers with barcodes should contain those barcodes. - for (const auto &kit_name : barcode_kits) { - const auto &kit_info = kit_infos.at(kit_name); - for (const auto &barcode_name : kit_info.barcodes) { - const auto full_id = id + "_" + - dorado::barcode_kits::generate_standard_barcode_name( - kit_name, barcode_name); - const auto &barcode_seq = barcode_seqs.at(barcode_name); - CHECK(has_read_group_header(sam_header.get(), full_id.c_str())); - CHECK(get_barcode_tag(sam_header.get(), full_id.c_str()) == barcode_seq); - } + for (const auto &barcode_name : kit_info->barcodes) { + const auto full_id = id + "_" + + dorado::barcode_kits::generate_standard_barcode_name( + KIT_NAME, barcode_name); + const auto &barcode_seq = barcode_seqs.at(barcode_name); + CHECK(has_read_group_header(sam_header.get(), full_id.c_str())); + CHECK(get_barcode_tag(sam_header.get(), full_id.c_str()) == barcode_seq); } } } - - SECTION("Read groups with unknown barcode kit") { - dorado::SamHdrPtr sam_header(sam_hdr_init()); - CHECK_THROWS(dorado::utils::add_rg_hdr(sam_header.get(), read_groups, {"blah"}, nullptr)); - } } TEST_CASE("BamUtilsTest: Test bam extraction helpers", TEST_GROUP) { From cc328db58639d66e220420a61866115dc33cb5e5 Mon Sep 17 00:00:00 2001 From: Hugh Pendry Date: Thu, 21 Mar 2024 11:39:05 +0000 Subject: [PATCH 4/5] Fix missing include from symbol_test and added missing else clause if no barcoding kit specified --- dorado/cli/basecaller.cpp | 25 ++++++++++++++----------- tests/symbol_test.cpp | 2 +- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/dorado/cli/basecaller.cpp b/dorado/cli/basecaller.cpp index 0b79fddd..3ad80bcf 100644 --- a/dorado/cli/basecaller.cpp +++ b/dorado/cli/basecaller.cpp @@ -55,7 +55,8 @@ const barcode_kits::KitInfo& get_barcode_kit_info(const std::string& kit_name) { if (!kit_info) { spdlog::error( "{} is not a valid barcode kit name. Please run the help " - "command to find out available barcode kits."); + "command to find out available barcode kits.", + kit_name); std::exit(EXIT_FAILURE); } return *kit_info; @@ -103,7 +104,7 @@ void setup(std::vector args, const std::string& dump_stats_file, const std::string& dump_stats_filter, const std::string& resume_from_file, - const std::vector& barcode_kits, + const std::string& barcode_kit, bool barcode_both_ends, bool barcode_no_trim, bool adapter_no_trim, @@ -161,7 +162,7 @@ void setup(std::vector args, recursive_file_loading); const bool adapter_trimming_enabled = (!adapter_no_trim || !primer_no_trim); - const bool barcode_enabled = !barcode_kits.empty() || custom_kit; + const bool barcode_enabled = !barcode_kit.empty() || custom_kit; const auto thread_allocations = utils::default_thread_allocations( int(num_devices), !remora_runners.empty() ? int(num_remora_threads) : 0, enable_aligner, barcode_enabled, adapter_trimming_enabled); @@ -179,10 +180,12 @@ void setup(std::vector args, auto [kit_name, kit_info] = get_custom_barcode_kit_info(*custom_kit); utils::add_rg_headers_with_barcode_kit(hdr.get(), read_groups, kit_name, kit_info, sample_sheet.get()); - } else { - const auto kit_info = get_barcode_kit_info(barcode_kits[0]); - utils::add_rg_headers_with_barcode_kit(hdr.get(), read_groups, barcode_kits[0], kit_info, + } else if (!barcode_kit.empty()) { + const auto kit_info = get_barcode_kit_info(barcode_kit); + utils::add_rg_headers_with_barcode_kit(hdr.get(), read_groups, barcode_kit, kit_info, sample_sheet.get()); + } else { + utils::add_rg_headers(hdr.get(), read_groups); } utils::HtsFile hts_file("-", output_mode, thread_allocations.writer_threads); @@ -212,8 +215,9 @@ void setup(std::vector args, !primer_no_trim, std::move(custom_primer_file)); } if (barcode_enabled) { + std::vector kit_as_vector{barcode_kit}; current_sink_node = pipeline_desc.add_node( - {current_sink_node}, thread_allocations.barcoder_threads, barcode_kits, + {current_sink_node}, thread_allocations.barcoder_threads, kit_as_vector, barcode_both_ends, barcode_no_trim, std::move(allowed_barcodes), std::move(custom_kit), std::move(custom_barcode_file)); } @@ -434,7 +438,8 @@ int basecaller(int argc, char* argv[]) { parser.visible.add_argument("--kit-name") .help("Enable barcoding with the provided kit name. Choose from: " + - dorado::barcode_kits::barcode_kits_list_str() + "."); + dorado::barcode_kits::barcode_kits_list_str() + ".") + .default_value(std::string{}); parser.visible.add_argument("--barcode-both-ends") .help("Require both ends of a read to be barcoded for a double ended barcode.") .default_value(false) @@ -640,7 +645,6 @@ int basecaller(int argc, char* argv[]) { spdlog::info("> Creating basecall pipeline"); try { - /* clang format off */ setup(args, model_path, data, mods_model_paths, parser.visible.get("-x"), parser.visible.get("--reference"), parser.visible.get("-c"), parser.visible.get("-o"), parser.visible.get("-b"), @@ -654,13 +658,12 @@ int basecaller(int argc, char* argv[]) { parser.hidden.get("--dump_stats_file"), parser.hidden.get("--dump_stats_filter"), parser.visible.get("--resume-from"), - parser.visible.get>("--kit-name"), + parser.visible.get("--kit-name"), parser.visible.get("--barcode-both-ends"), no_trim_barcodes, no_trim_adapters, no_trim_primers, parser.visible.get("--sample-sheet"), std::move(custom_kit), std::move(custom_barcode_seqs), std::move(custom_primer_file), resume_parser, parser.visible.get("--estimate-poly-a"), polya_config.empty() ? nullptr : &polya_config, model_selection); - /* clang format on */ } catch (const std::exception& e) { spdlog::error("{}", e.what()); utils::clean_temporary_models(temp_download_paths); diff --git a/tests/symbol_test.cpp b/tests/symbol_test.cpp index 5c7ae870..620b9cd1 100644 --- a/tests/symbol_test.cpp +++ b/tests/symbol_test.cpp @@ -7,7 +7,6 @@ #include "api/runner_creation.h" #include "basecall/CRFModelConfig.h" #include "basecall/ModelRunner.h" -#include "demux/parse_custom_kit.h" #include "demux/parse_custom_sequences.h" #include "modbase/ModBaseModelConfig.h" #include "modbase/ModBaseRunner.h" @@ -23,6 +22,7 @@ #include "utils/barcode_kits.h" #include "utils/gpu_monitor.h" #include "utils/parameters.h" +#include "utils/parse_custom_kit.h" #include "utils/sequence_utils.h" #include "utils/string_utils.h" #include "utils/time_utils.h" From 037257f222cdd43c26d8cb6c583df743111a0a7f Mon Sep 17 00:00:00 2001 From: Hugh Pendry Date: Fri, 22 Mar 2024 11:52:58 +0000 Subject: [PATCH 5/5] Updated follwoing review --- dorado/demux/parse_custom_sequences.cpp | 14 +++----------- dorado/demux/parse_custom_sequences.h | 3 --- dorado/utils/barcode_kits.cpp | 2 +- dorado/utils/parse_custom_kit.cpp | 2 -- 4 files changed, 4 insertions(+), 17 deletions(-) diff --git a/dorado/demux/parse_custom_sequences.cpp b/dorado/demux/parse_custom_sequences.cpp index 60ad0d44..684d388f 100644 --- a/dorado/demux/parse_custom_sequences.cpp +++ b/dorado/demux/parse_custom_sequences.cpp @@ -1,28 +1,22 @@ #include "parse_custom_sequences.h" #include "utils/bam_utils.h" -#include "utils/barcode_kits.h" +#include "utils/types.h" #include -#include -#include - -#include -#include -#include namespace dorado::demux { std::unordered_map parse_custom_sequences( const std::string& sequences_file) { - auto file = hts_open(sequences_file.c_str(), "r"); + dorado::HtsFilePtr file(hts_open(sequences_file.c_str(), "r")); BamPtr record; record.reset(bam_init1()); std::unordered_map sequences; int sam_ret_val = 0; - while ((sam_ret_val = sam_read1(file, nullptr, record.get())) != -1) { + while ((sam_ret_val = sam_read1(file.get(), nullptr, record.get())) != -1) { if (sam_ret_val < -1) { throw std::runtime_error("Failed to parse custom sequence file " + sequences_file); } @@ -31,8 +25,6 @@ std::unordered_map parse_custom_sequences( sequences[qname] = seq; } - hts_close(file); - return sequences; } diff --git a/dorado/demux/parse_custom_sequences.h b/dorado/demux/parse_custom_sequences.h index 10e70c20..1b12bfcb 100644 --- a/dorado/demux/parse_custom_sequences.h +++ b/dorado/demux/parse_custom_sequences.h @@ -1,8 +1,5 @@ #pragma once -#include "utils/barcode_kits.h" - -#include #include #include diff --git a/dorado/utils/barcode_kits.cpp b/dorado/utils/barcode_kits.cpp index 6863ed46..1e26b300 100644 --- a/dorado/utils/barcode_kits.cpp +++ b/dorado/utils/barcode_kits.cpp @@ -822,7 +822,7 @@ const std::unordered_map& get_kit_infos() { return kit_inf const KitInfo* get_kit_info(const std::string& kit_name) { const auto& barcode_kit_infos = get_kit_infos(); - auto kit_iter = get_kit_infos().find(kit_name); + auto kit_iter = barcode_kit_infos.find(kit_name); if (kit_iter == barcode_kit_infos.end()) { return nullptr; } diff --git a/dorado/utils/parse_custom_kit.cpp b/dorado/utils/parse_custom_kit.cpp index 30b6e72e..89bc7bdb 100644 --- a/dorado/utils/parse_custom_kit.cpp +++ b/dorado/utils/parse_custom_kit.cpp @@ -1,7 +1,5 @@ #include "parse_custom_kit.h" -//#include "utils/bam_utils.h" -//#include "utils/types.h" #include "utils/barcode_kits.h" #include