From 349df6841eb4c4fe8d42194336ac3830824f8046 Mon Sep 17 00:00:00 2001 From: Steve Malton Date: Mon, 23 Oct 2023 11:53:45 +0100 Subject: [PATCH 01/32] Change barcoding info to hold the sample sheet - shared_ptr so we can cache them in the basecall server - sample sheet itself determines if a barcode is permitted --- dorado/cli/basecaller.cpp | 8 ++-- dorado/cli/demux.cpp | 9 ++-- dorado/demux/BarcodeClassifier.cpp | 42 +++++++--------- dorado/demux/BarcodeClassifier.h | 15 ++++-- .../read_pipeline/BarcodeClassifierNode.cpp | 12 +++-- dorado/read_pipeline/BarcodeClassifierNode.h | 7 ++- dorado/utils/SampleSheet.cpp | 46 +++++++++++------- dorado/utils/SampleSheet.h | 48 +++++++++++-------- dorado/utils/types.cpp | 5 +- dorado/utils/types.h | 10 ++-- tests/BarcodeClassifierTest.cpp | 21 ++++---- tests/NodeSmokeTest.cpp | 5 +- 12 files changed, 126 insertions(+), 102 deletions(-) diff --git a/dorado/cli/basecaller.cpp b/dorado/cli/basecaller.cpp index 6834e2b33..7bad067cf 100644 --- a/dorado/cli/basecaller.cpp +++ b/dorado/cli/basecaller.cpp @@ -146,11 +146,13 @@ void setup(std::vector args, is_rna_model(model_config)); } if (!barcode_kits.empty()) { - utils::SampleSheet sample_sheet(barcode_sample_sheet); - BarcodingInfo::FilterSet allowed_barcodes = sample_sheet.get_barcode_values(); + std::shared_ptr sample_sheet; + if (!barcode_sample_sheet.empty()) { + sample_sheet = std::make_shared(barcode_sample_sheet); + } current_sink_node = pipeline_desc.add_node( {current_sink_node}, thread_allocations.barcoder_threads, barcode_kits, - barcode_both_ends, barcode_no_trim, allowed_barcodes); + barcode_both_ends, barcode_no_trim, std::move(sample_sheet)); } current_sink_node = pipeline_desc.add_node( {current_sink_node}, min_qscore, default_parameters.min_sequence_length, diff --git a/dorado/cli/demux.cpp b/dorado/cli/demux.cpp index 68a6c9978..bf1c16506 100644 --- a/dorado/cli/demux.cpp +++ b/dorado/cli/demux.cpp @@ -143,11 +143,14 @@ int demuxer(int argc, char* argv[]) { if (auto names = parser.present>("--kit-name")) { kit_names = std::move(*names); } - utils::SampleSheet sample_sheet(parser.get("--sample-sheet")); - BarcodingInfo::FilterSet allowed_barcodes = sample_sheet.get_barcode_values(); + auto barcode_sample_sheet = parser.get("--sample-sheet"); + std::shared_ptr sample_sheet; + if (!barcode_sample_sheet.empty()) { + sample_sheet = std::make_shared(barcode_sample_sheet); + } auto demux = pipeline_desc.add_node( {demux_writer}, demux_threads, kit_names, parser.get("--barcode-both-ends"), - parser.get("--no-trim"), allowed_barcodes); + parser.get("--no-trim"), sample_sheet); } // Create the Pipeline from our description. diff --git a/dorado/demux/BarcodeClassifier.cpp b/dorado/demux/BarcodeClassifier.cpp index ba4326396..1c4a2bf1a 100644 --- a/dorado/demux/BarcodeClassifier.cpp +++ b/dorado/demux/BarcodeClassifier.cpp @@ -1,5 +1,6 @@ #include "BarcodeClassifier.h" +#include "utils/SampleSheet.h" #include "utils/alignment_utils.h" #include "utils/barcode_kits.h" #include "utils/sequence_utils.h" @@ -10,7 +11,6 @@ #include #include -#include #include #include #include @@ -99,16 +99,6 @@ float extract_mask_score(std::string_view adapter, return score; } -bool barcode_is_permitted(const BarcodingInfo::FilterSet& allowed_barcodes, - const std::string& adapter_name) { - if (!allowed_barcodes.has_value()) { - return true; - } - - auto normalized_barcode_name = barcode_kits::normalize_barcode_name(adapter_name); - return allowed_barcodes->count(normalized_barcode_name) != 0; -} - } // namespace namespace demux { @@ -135,11 +125,12 @@ BarcodeClassifier::BarcodeClassifier(const std::vector& kit_names) BarcodeClassifier::~BarcodeClassifier() = default; -ScoreResults BarcodeClassifier::barcode(const std::string& seq, - bool barcode_both_ends, - const BarcodingInfo::FilterSet& allowed_barcodes) const { +ScoreResults BarcodeClassifier::barcode( + const std::string& seq, + bool barcode_both_ends, + const std::shared_ptr& sample_sheet) const { auto best_adapter = - find_best_adapter(seq, m_adapter_sequences, barcode_both_ends, allowed_barcodes); + find_best_adapter(seq, m_adapter_sequences, barcode_both_ends, sample_sheet); return best_adapter; } @@ -217,7 +208,7 @@ std::vector BarcodeClassifier::generate_adap std::vector BarcodeClassifier::calculate_adapter_score_different_double_ends( std::string_view read_seq, const AdapterSequence& as, - const BarcodingInfo::FilterSet& allowed_barcodes) const { + const std::shared_ptr& sample_sheet) const { std::string_view read_top = read_seq.substr(0, TRIM_LENGTH); int bottom_start = std::max(0, (int)read_seq.length() - TRIM_LENGTH); std::string_view read_bottom = read_seq.substr(bottom_start, TRIM_LENGTH); @@ -277,7 +268,7 @@ std::vector BarcodeClassifier::calculate_adapter_score_different_d auto& adapter_rev = as.adapter_rev[i]; auto& adapter_name = as.adapter_name[i]; - if (!barcode_is_permitted(allowed_barcodes, adapter_name)) { + if (sample_sheet && !sample_sheet->barcode_is_permitted(adapter_name)) { continue; } @@ -352,7 +343,7 @@ std::vector BarcodeClassifier::calculate_adapter_score_different_d std::vector BarcodeClassifier::calculate_adapter_score_double_ends( std::string_view read_seq, const AdapterSequence& as, - const BarcodingInfo::FilterSet& allowed_barcodes) const { + const std::shared_ptr& sample_sheet) const { bool debug_mode = (spdlog::get_level() == spdlog::level::debug); std::string_view read_top = read_seq.substr(0, TRIM_LENGTH); int bottom_start = std::max(0, (int)read_seq.length() - TRIM_LENGTH); @@ -383,10 +374,9 @@ std::vector BarcodeClassifier::calculate_adapter_score_double_ends auto& adapter_rev = as.adapter_rev[i]; auto& adapter_name = as.adapter_name[i]; - if (!barcode_is_permitted(allowed_barcodes, adapter_name)) { + if (sample_sheet && !sample_sheet->barcode_is_permitted(adapter_name)) { continue; } - spdlog::debug("Checking barcode {}", adapter_name); auto top_mask_score = extract_mask_score(adapter, top_mask, mask_config, "top window"); @@ -426,7 +416,7 @@ std::vector BarcodeClassifier::calculate_adapter_score_double_ends std::vector BarcodeClassifier::calculate_adapter_score( std::string_view read_seq, const AdapterSequence& as, - const BarcodingInfo::FilterSet& allowed_barcodes) const { + const std::shared_ptr& sample_sheet) const { bool debug_mode = (spdlog::get_level() == spdlog::level::debug); std::string_view read_top = read_seq.substr(0, TRIM_LENGTH); @@ -449,7 +439,7 @@ std::vector BarcodeClassifier::calculate_adapter_score( auto& adapter = as.adapter[i]; auto& adapter_name = as.adapter_name[i]; - if (!barcode_is_permitted(allowed_barcodes, adapter_name)) { + if (sample_sheet && !sample_sheet->barcode_is_permitted(adapter_name)) { continue; } @@ -569,7 +559,7 @@ ScoreResults BarcodeClassifier::find_best_adapter( const std::string& read_seq, const std::vector& adapters, bool barcode_both_ends, - const BarcodingInfo::FilterSet& allowed_barcodes) const { + const std::shared_ptr& sample_sheet) const { if (read_seq.length() < TRIM_LENGTH) { return UNCLASSIFIED; } @@ -590,14 +580,14 @@ ScoreResults BarcodeClassifier::find_best_adapter( auto& kit = kit_info_map.at(as->kit); if (kit.double_ends) { if (kit.ends_different) { - auto out = calculate_adapter_score_different_double_ends(fwd, *as, allowed_barcodes); + auto out = calculate_adapter_score_different_double_ends(fwd, *as, sample_sheet); scores.insert(scores.end(), out.begin(), out.end()); } else { - auto out = calculate_adapter_score_double_ends(fwd, *as, allowed_barcodes); + auto out = calculate_adapter_score_double_ends(fwd, *as, sample_sheet); scores.insert(scores.end(), out.begin(), out.end()); } } else { - auto out = calculate_adapter_score(fwd, *as, allowed_barcodes); + auto out = calculate_adapter_score(fwd, *as, sample_sheet); scores.insert(scores.end(), out.begin(), out.end()); } diff --git a/dorado/demux/BarcodeClassifier.h b/dorado/demux/BarcodeClassifier.h index 99dad4f18..3245fc258 100644 --- a/dorado/demux/BarcodeClassifier.h +++ b/dorado/demux/BarcodeClassifier.h @@ -4,12 +4,17 @@ #include "utils/types.h" #include +#include #include #include #include namespace dorado { +namespace utils { +class SampleSheet; +} + namespace demux { struct ScoreResults { @@ -38,7 +43,7 @@ class BarcodeClassifier { ScoreResults barcode(const std::string& seq, bool barcode_both_ends, - const BarcodingInfo::FilterSet& allowed_barcodes) const; + const std::shared_ptr& sample_sheet) const; private: const std::vector m_adapter_sequences; @@ -48,19 +53,19 @@ class BarcodeClassifier { std::vector calculate_adapter_score_different_double_ends( std::string_view read_seq, const AdapterSequence& as, - const BarcodingInfo::FilterSet& allowed_barcodes) const; + const std::shared_ptr& sample_sheet) const; std::vector calculate_adapter_score_double_ends( std::string_view read_seq, const AdapterSequence& as, - const BarcodingInfo::FilterSet& allowed_barcodes) const; + const std::shared_ptr& sample_sheet) const; std::vector calculate_adapter_score( std::string_view read_seq, const AdapterSequence& as, - const BarcodingInfo::FilterSet& allowed_barcodes) const; + const std::shared_ptr& sample_sheet) const; ScoreResults find_best_adapter(const std::string& read_seq, const std::vector& adapter, bool barcode_both_ends, - const BarcodingInfo::FilterSet& allowed_barcodes) const; + const std::shared_ptr& sample_sheet) const; }; } // namespace demux diff --git a/dorado/read_pipeline/BarcodeClassifierNode.cpp b/dorado/read_pipeline/BarcodeClassifierNode.cpp index 3d4383b7a..6984abbc4 100644 --- a/dorado/read_pipeline/BarcodeClassifierNode.cpp +++ b/dorado/read_pipeline/BarcodeClassifierNode.cpp @@ -40,11 +40,13 @@ BarcodeClassifierNode::BarcodeClassifierNode(int threads, const std::vector& kit_names, bool barcode_both_ends, bool no_trim, - const BarcodingInfo::FilterSet& allowed_barcodes) + std::shared_ptr sample_sheet) : MessageSink(10000), m_threads(threads), - m_default_barcoding_info( - create_barcoding_info(kit_names, barcode_both_ends, !no_trim, allowed_barcodes)) { + m_default_barcoding_info(create_barcoding_info(kit_names, + barcode_both_ends, + !no_trim, + std::move(sample_sheet))) { start_threads(); } @@ -259,7 +261,7 @@ void BarcodeClassifierNode::barcode(BamPtr& read) { std::string seq = utils::extract_sequence(irecord, seqlen); auto bc_res = barcoder->barcode(seq, m_default_barcoding_info->barcode_both_ends, - m_default_barcoding_info->allowed_barcodes); + m_default_barcoding_info->sample_sheet); auto bc = generate_barcode_string(bc_res); bam_aux_append(irecord, "BC", 'Z', bc.length() + 1, (uint8_t*)bc.c_str()); m_num_records++; @@ -278,7 +280,7 @@ void BarcodeClassifierNode::barcode(SimplexRead& read) { // get the sequence to map from the record auto bc_res = barcoder->barcode(read.read_common.seq, barcoding_info->barcode_both_ends, - barcoding_info->allowed_barcodes); + barcoding_info->sample_sheet); read.read_common.barcode = generate_barcode_string(bc_res); m_num_records++; if (barcoding_info->trim) { diff --git a/dorado/read_pipeline/BarcodeClassifierNode.h b/dorado/read_pipeline/BarcodeClassifierNode.h index a91cbdcb7..4e4c60e87 100644 --- a/dorado/read_pipeline/BarcodeClassifierNode.h +++ b/dorado/read_pipeline/BarcodeClassifierNode.h @@ -5,6 +5,7 @@ #include "utils/types.h" #include +#include #include #include #include @@ -15,13 +16,17 @@ namespace demux { struct ScoreResults; } +namespace utils { +class SampleSheet; +} + class BarcodeClassifierNode : public MessageSink { public: BarcodeClassifierNode(int threads, const std::vector& kit_name, bool barcode_both_ends, bool no_trim, - const BarcodingInfo::FilterSet& allowed_barcodes); + std::shared_ptr sample_sheet); BarcodeClassifierNode(int threads); ~BarcodeClassifierNode(); std::string get_name() const override { return "BarcodeClassifierNode"; } diff --git a/dorado/utils/SampleSheet.cpp b/dorado/utils/SampleSheet.cpp index dec6e024e..1fa0afb40 100644 --- a/dorado/utils/SampleSheet.cpp +++ b/dorado/utils/SampleSheet.cpp @@ -147,6 +147,28 @@ void SampleSheet::load(std::istream& file_stream, const std::string& filename) { std::string("Unable to infer barcode aliases from sample sheet file: " + filename + " does not contain a unique mapping of barcode ids.")); } + + m_allowed_barcodes = [this]() -> FilterSet { + std::unordered_set barcodes; + switch (m_type) { + case Type::barcode: { + // Grab the barcode idx once so that we're not doing it repeatedly + const auto barcode_idx = m_col_indices.at("barcode"); + + // Grab the barcodes + for (const auto& row : m_rows) { + barcodes.emplace(row[barcode_idx]); + } + break; + } + case Type::none: + [[fallthrough]]; + default: + return std::nullopt; + } + + return barcodes; + }(); } // check if we can generate a unique alias without the flowcell/position information @@ -202,27 +224,15 @@ std::string SampleSheet::get_alias(const std::string& flow_cell_id, return ""; } -BarcodingInfo::FilterSet SampleSheet::get_barcode_values() const { - std::unordered_set barcodes; +SampleSheet::FilterSet SampleSheet::get_barcode_values() const { return m_allowed_barcodes; } - switch (m_type) { - case Type::barcode: { - // Grab the barcode idx once so that we're not doing it repeatedly - const auto barcode_idx = m_col_indices.at("barcode"); - - // Grab the barcodes - for (const auto& row : m_rows) { - barcodes.emplace(row[barcode_idx]); - } - break; - } - case Type::none: - [[fallthrough]]; - default: - return std::nullopt; +bool SampleSheet::barcode_is_permitted(const std::string& adapter_name) const { + if (!m_allowed_barcodes.has_value()) { + return true; } - return barcodes; + auto normalized_barcode_name = barcode_kits::normalize_barcode_name(adapter_name); + return m_allowed_barcodes->count(normalized_barcode_name) != 0; } void SampleSheet::validate_headers(const std::vector& col_names, diff --git a/dorado/utils/SampleSheet.h b/dorado/utils/SampleSheet.h index 2c335e6dd..64b025d4b 100644 --- a/dorado/utils/SampleSheet.h +++ b/dorado/utils/SampleSheet.h @@ -6,12 +6,15 @@ #include #include #include +#include #include namespace dorado::utils { class SampleSheet { public: + using FilterSet = std::optional>; + enum class Type { none, barcode }; enum IndexBits { FLOW_CELL_ID = 0, @@ -22,38 +25,40 @@ class SampleSheet { // If skip_index_matching is true the lookup by flowcell/experiment id will be skipped when fetching an alias. // In this case, the constructor will throw if the sample sheet contains entries for more that one flow_cell_id, // position_id or experiment_id, or if any barcode is re-used. - explicit SampleSheet(const std::string & filename = std::string(), + explicit SampleSheet(const std::string& filename = std::string(), bool skip_index_matching = false); // load a sample sheet from a file. Throws a std::runtime_error for any failure condition. - void load(const std::string & filename); + void load(const std::string& filename); // (Testability) load a sample sheet from the given stream. // Throws a std::runtime_error for any failure condition using the filename in the // error message. - void load(std::istream & input_stream, const std::string & filename); + void load(std::istream& input_stream, const std::string& filename); // Return the sample sheet filename. - const std::string & get_filename() const { return m_filename; } + const std::string& get_filename() const { return m_filename; } // Return the sample sheet type based on the column headers it contains. Type get_type() const { return m_type; } - bool contains_column(const std::string & column) const { return m_col_indices.count(column); } + bool contains_column(const std::string& column) const { return m_col_indices.count(column); } // For a given flow_cell_id, position_id, experiment_id and barcode, get the named alias. // Returns an empty string if one does not exist in the loaded sample sheet, or if the sample // sheet is not of type "barcode". - std::string get_alias(const std::string & flow_cell_id, - const std::string & position_id, - const std::string & experiment_id, - const std::string & barcode) const; + std::string get_alias(const std::string& flow_cell_id, + const std::string& position_id, + const std::string& experiment_id, + const std::string& barcode) const; /** * Get all of the barcodes that are present in the sample sheet. * @return All of the barcodes that are present, or std::nullopt if the sample sheet is not loaded. */ - BarcodingInfo::FilterSet get_barcode_values() const; + FilterSet get_barcode_values() const; + + bool barcode_is_permitted(const std::string& adapter_name) const; private: using Row = std::vector; @@ -63,16 +68,17 @@ class SampleSheet { std::unordered_map m_col_indices; std::vector m_rows; bool m_skip_index_matching; - - void validate_headers(const std::vector & col_names, const std::string & filename); - bool check_index(const std::string & flow_cell_id, const std::string & position_id) const; - bool match_index(const Row & row, - const std::string & flow_cell_id, - const std::string & position_id, - const std::string & experiment_id) const; - std::string get(const Row & row, const std::string & key) const; - void validate_text(const Row & row, const std::string & key) const; - void validate_alias(const Row & row, const std::string & key) const; + FilterSet m_allowed_barcodes; + + void validate_headers(const std::vector& col_names, const std::string& filename); + bool check_index(const std::string& flow_cell_id, const std::string& position_id) const; + bool match_index(const Row& row, + const std::string& flow_cell_id, + const std::string& position_id, + const std::string& experiment_id) const; + std::string get(const Row& row, const std::string& key) const; + void validate_text(const Row& row, const std::string& key) const; + void validate_alias(const Row& row, const std::string& key) const; bool is_barcode_mapping_unique() const; }; @@ -97,7 +103,7 @@ enum class EolFileFormat { // identifiers decorated with "_eol" suffix to avoid * the NL character, unless opened for binary reading. But for our purposes we are not interested in * the underlying truth of the file format just the style in which it appears in the stream. */ -EolFileFormat get_eol_file_format(std::istream & input); +EolFileFormat get_eol_file_format(std::istream& input); } // namespace details diff --git a/dorado/utils/types.cpp b/dorado/utils/types.cpp index ac98466ff..ada23fb0f 100644 --- a/dorado/utils/types.cpp +++ b/dorado/utils/types.cpp @@ -9,12 +9,13 @@ std::shared_ptr create_barcoding_info( const std::vector& kit_names, bool barcode_both_ends, bool trim_barcode, - const BarcodingInfo::FilterSet& allowed_barcodes) { + std::shared_ptr sample_sheet) { if (kit_names.empty()) { return {}; } - auto result = BarcodingInfo{kit_names[0], barcode_both_ends, trim_barcode, allowed_barcodes}; + auto result = + BarcodingInfo{kit_names[0], barcode_both_ends, trim_barcode, std::move(sample_sheet)}; return std::make_shared(std::move(result)); } diff --git a/dorado/utils/types.h b/dorado/utils/types.h index 86b4c3f9c..1c7633993 100644 --- a/dorado/utils/types.h +++ b/dorado/utils/types.h @@ -4,7 +4,6 @@ #include #include #include -#include #include struct bam1_t; @@ -13,19 +12,22 @@ struct sam_hdr_t; namespace dorado { +namespace utils { +class SampleSheet; +} + struct BarcodingInfo { - using FilterSet = std::optional>; std::string kit_name{}; bool barcode_both_ends{false}; bool trim{false}; - FilterSet allowed_barcodes; + std::shared_ptr sample_sheet; }; std::shared_ptr create_barcoding_info( const std::vector &kit_names, bool barcode_both_ends, bool trim_barcode, - const BarcodingInfo::FilterSet &allowed_barcodes); + std::shared_ptr sample_sheet); struct ReadGroup { std::string run_id; diff --git a/tests/BarcodeClassifierTest.cpp b/tests/BarcodeClassifierTest.cpp index 146161914..23c07b6c2 100644 --- a/tests/BarcodeClassifierTest.cpp +++ b/tests/BarcodeClassifierTest.cpp @@ -61,7 +61,7 @@ TEST_CASE("BarcodeClassifier: test single ended barcode", TEST_GROUP) { auto seqlen = reader.record->core.l_qseq; auto bseq = bam_get_seq(reader.record); std::string seq = utils::convert_nt16_to_str(bseq, seqlen); - auto res = classifier.barcode(seq, false, std::nullopt); + auto res = classifier.barcode(seq, false, nullptr); if (res.adapter_name == "unclassified") { CHECK(bc == res.adapter_name); } else { @@ -89,7 +89,7 @@ TEST_CASE("BarcodeClassifier: test double ended barcode", TEST_GROUP) { auto seqlen = reader.record->core.l_qseq; auto bseq = bam_get_seq(reader.record); std::string seq = utils::convert_nt16_to_str(bseq, seqlen); - auto res = classifier.barcode(seq, false, std::nullopt); + auto res = classifier.barcode(seq, false, nullptr); if (res.adapter_name == "unclassified") { CHECK(bc == res.adapter_name); } else { @@ -119,7 +119,7 @@ TEST_CASE("BarcodeClassifier: test double ended barcode with different variants" auto seqlen = reader.record->core.l_qseq; auto bseq = bam_get_seq(reader.record); std::string seq = utils::convert_nt16_to_str(bseq, seqlen); - auto res = classifier.barcode(seq, false, std::nullopt); + auto res = classifier.barcode(seq, false, nullptr); if (res.adapter_name == "unclassified") { CHECK(bc == res.adapter_name); } else { @@ -148,8 +148,8 @@ TEST_CASE("BarcodeClassifier: check barcodes on both ends - failing case", TEST_ auto seqlen = reader.record->core.l_qseq; auto bseq = bam_get_seq(reader.record); std::string seq = utils::convert_nt16_to_str(bseq, seqlen); - auto single_end_res = classifier.barcode(seq, false, std::nullopt); - auto double_end_res = classifier.barcode(seq, true, std::nullopt); + auto single_end_res = classifier.barcode(seq, false, nullptr); + auto double_end_res = classifier.barcode(seq, true, nullptr); CHECK(double_end_res.adapter_name == "unclassified"); CHECK(single_end_res.adapter_name == "BC15"); } @@ -167,8 +167,8 @@ TEST_CASE("BarcodeClassifier: check barcodes on both ends - passing case", TEST_ auto seqlen = reader.record->core.l_qseq; auto bseq = bam_get_seq(reader.record); std::string seq = utils::convert_nt16_to_str(bseq, seqlen); - auto single_end_res = classifier.barcode(seq, false, std::nullopt); - auto double_end_res = classifier.barcode(seq, true, std::nullopt); + auto single_end_res = classifier.barcode(seq, false, nullptr); + auto double_end_res = classifier.barcode(seq, true, nullptr); CHECK(double_end_res.adapter_name == single_end_res.adapter_name); CHECK(single_end_res.adapter_name == "BC01"); } @@ -189,13 +189,12 @@ TEST_CASE( CAPTURE(barcode_both_ends); CAPTURE(use_per_read_barcoding); constexpr bool no_trim = false; - auto barcoding_info = - dorado::create_barcoding_info(kits, barcode_both_ends, !no_trim, std::nullopt); + auto barcoding_info = dorado::create_barcoding_info(kits, barcode_both_ends, !no_trim, nullptr); if (use_per_read_barcoding) { pipeline_desc.add_node({sink}, 8); } else { pipeline_desc.add_node({sink}, 8, kits, barcode_both_ends, no_trim, - std::nullopt); + nullptr); } auto pipeline = dorado::Pipeline::create(std::move(pipeline_desc)); @@ -344,7 +343,7 @@ TEST_CASE("BarcodeClassifierNode: test reads where trim length == read length", bool barcode_both_ends = false; bool no_trim = false; auto classifier = pipeline_desc.add_node( - {sink}, 8, kits, barcode_both_ends, no_trim, std::nullopt); + {sink}, 8, kits, barcode_both_ends, no_trim, nullptr); auto pipeline = dorado::Pipeline::create(std::move(pipeline_desc)); fs::path data_dir = fs::path(get_data_dir("barcode_demux")); diff --git a/tests/NodeSmokeTest.cpp b/tests/NodeSmokeTest.cpp index f82f93c83..e3faba92f 100644 --- a/tests/NodeSmokeTest.cpp +++ b/tests/NodeSmokeTest.cpp @@ -342,8 +342,7 @@ DEFINE_TEST(NodeSmokeTestRead, "BarcodeClassifierNode") { set_pipeline_restart(pipeline_restart); std::vector kits = {"SQK-RPB004", "EXP-NBD196"}; - run_smoke_test(2, kits, barcode_both_ends, no_trim, - std::nullopt); + run_smoke_test(2, kits, barcode_both_ends, no_trim, nullptr); } TEST_CASE("BarcodeClassifierNode: test simple pipeline with fastq and sam files") { @@ -354,7 +353,7 @@ TEST_CASE("BarcodeClassifierNode: test simple pipeline with fastq and sam files" bool barcode_both_ends = GENERATE(true, false); bool no_trim = GENERATE(true, false); auto classifier = pipeline_desc.add_node( - {sink}, 8, kits, barcode_both_ends, no_trim, std::nullopt); + {sink}, 8, kits, barcode_both_ends, no_trim, nullptr); auto pipeline = dorado::Pipeline::create(std::move(pipeline_desc)); From fd63aafbc9597d892bd3048aea128f44d99531ea Mon Sep 17 00:00:00 2001 From: Steve Malton Date: Mon, 23 Oct 2023 15:31:21 +0100 Subject: [PATCH 02/32] Set flowcell and position id in reads --- dorado/data_loader/DataLoader.cpp | 13 +++++++++++++ dorado/read_pipeline/ReadPipeline.h | 7 ++++--- dorado/read_pipeline/StereoDuplexEncoderNode.cpp | 2 ++ dorado/read_pipeline/read_utils.cpp | 2 ++ 4 files changed, 21 insertions(+), 3 deletions(-) diff --git a/dorado/data_loader/DataLoader.cpp b/dorado/data_loader/DataLoader.cpp index fc8eb04c4..e57599e50 100644 --- a/dorado/data_loader/DataLoader.cpp +++ b/dorado/data_loader/DataLoader.cpp @@ -151,6 +151,7 @@ SimplexReadPtr process_pod5_read(size_t row, new_read->start_sample = read_data.start_sample; new_read->end_sample = read_data.start_sample + read_data.num_samples; new_read->read_common.flowcell_id = run_info_data->flow_cell_id; + new_read->read_common.position_id = run_info_data->sequencer_position; new_read->read_common.is_duplex = false; if (pod5_free_run_info(run_info_data) != POD5_OK) { @@ -748,10 +749,19 @@ void DataLoader::load_fast5_reads_from_file(const std::string& path) { std::string fast5_filename = std::filesystem::path(path).filename().string(); HighFive::Group tracking_id_group = read.getGroup("tracking_id"); + HighFive::Attribute exp_start_time_attr = tracking_id_group.getAttribute("exp_start_time"); std::string exp_start_time; string_reader(exp_start_time_attr, exp_start_time); + HighFive::Attribute flow_cell_id_attr = tracking_id_group.getAttribute("flow_cell_id"); + std::string flow_cell_id; + string_reader(flow_cell_id_attr, flow_cell_id); + + HighFive::Attribute device_id_attr = tracking_id_group.getAttribute("device_id"); + std::string device_id; + string_reader(device_id_attr, device_id); + auto start_time_str = utils::adjust_time(exp_start_time, static_cast(start_time / sampling_rate)); @@ -769,6 +779,9 @@ void DataLoader::load_fast5_reads_from_file(const std::string& path) { new_read->read_common.attributes.channel_number = channel_number; new_read->read_common.attributes.start_time = start_time_str; new_read->read_common.attributes.fast5_filename = fast5_filename; + new_read->read_common.flowcell_id = flow_cell_id; + new_read->read_common.position_id = device_id; + new_read->read_common.is_duplex = false; if (!m_allowed_read_ids || (m_allowed_read_ids->find(new_read->read_common.read_id) != diff --git a/dorado/read_pipeline/ReadPipeline.h b/dorado/read_pipeline/ReadPipeline.h index af45af999..5659529aa 100644 --- a/dorado/read_pipeline/ReadPipeline.h +++ b/dorado/read_pipeline/ReadPipeline.h @@ -40,9 +40,10 @@ class ReadCommon { std::string qstring; // Read Qstring (Phred) std::vector moves; // Move table std::vector base_mod_probs; // Modified base probabilities - std::string run_id; // Run ID - used in read group - std::string flowcell_id; // Flowcell ID - used in read group - std::string model_name; // Read group + std::string run_id; // Run ID - used in read group and for sample sheet aliasing + std::string flowcell_id; // Flowcell ID - used in read group and for sample sheet aliasing + std::string position_id; // Position ID - used for sample sheet aliasing + std::string model_name; // Read group dorado::details::Attributes attributes; diff --git a/dorado/read_pipeline/StereoDuplexEncoderNode.cpp b/dorado/read_pipeline/StereoDuplexEncoderNode.cpp index 7835be7a4..9f604ccc0 100644 --- a/dorado/read_pipeline/StereoDuplexEncoderNode.cpp +++ b/dorado/read_pipeline/StereoDuplexEncoderNode.cpp @@ -270,6 +270,8 @@ DuplexReadPtr StereoDuplexEncoderNode::stereo_encode(const ReadPair& read_pair) read->read_common.raw_data = tmp; // use the encoded signal read->read_common.is_duplex = true; read->read_common.run_id = template_read.read_common.run_id; + read->read_common.flowcell_id = template_read.read_common.flowcell_id; + read->read_common.position_id = template_read.read_common.position_id; edlibFreeAlignResult(result); diff --git a/dorado/read_pipeline/read_utils.cpp b/dorado/read_pipeline/read_utils.cpp index 27918427f..4342a7f08 100644 --- a/dorado/read_pipeline/read_utils.cpp +++ b/dorado/read_pipeline/read_utils.cpp @@ -21,6 +21,8 @@ SimplexReadPtr shallow_copy_read(const SimplexRead& read) { copy->read_common.qstring = read.read_common.qstring; copy->read_common.moves = read.read_common.moves; copy->read_common.run_id = read.read_common.run_id; + copy->read_common.flowcell_id = read.read_common.flowcell_id; + copy->read_common.position_id = read.read_common.position_id; copy->read_common.model_name = read.read_common.model_name; copy->read_common.base_mod_probs = read.read_common.base_mod_probs; From 682ec2d1ae2538426082d27f2c646c00b06da695 Mon Sep 17 00:00:00 2001 From: Steve Malton Date: Mon, 23 Oct 2023 15:48:33 +0100 Subject: [PATCH 03/32] Update index checks - it should valid to have information that we don't require --- dorado/utils/SampleSheet.cpp | 37 ++++++++++++++++++++++++++++++------ 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/dorado/utils/SampleSheet.cpp b/dorado/utils/SampleSheet.cpp index 1fa0afb40..e00432fe1 100644 --- a/dorado/utils/SampleSheet.cpp +++ b/dorado/utils/SampleSheet.cpp @@ -314,18 +314,43 @@ void SampleSheet::validate_alias(const Row& row, const std::string& key) const { bool SampleSheet::check_index(const std::string& flow_cell_id, const std::string& position_id) const { - return m_skip_index_matching || ((m_index[FLOW_CELL_ID] == !flow_cell_id.empty()) && - (m_index[POSITION_ID] == !position_id.empty())); + if (m_skip_index_matching) { + return true; + } + + bool ok = false; + if (m_index[FLOW_CELL_ID]) { + // if we're expecting a flow cell id, we must provide one + ok |= !flow_cell_id.empty(); + } + if (m_index[POSITION_ID]) { + // if we're expecting a position id, we must provide one + ok |= !position_id.empty(); + } + return ok; } bool SampleSheet::match_index(const Row& row, const std::string& flow_cell_id, const std::string& position_id, const std::string& experiment_id) const { - return m_skip_index_matching || - ((!m_index[FLOW_CELL_ID] || get(row, "flow_cell_id") == flow_cell_id) && - (!m_index[POSITION_ID] || get(row, "position_id") == position_id) && - (get(row, "experiment_id") == experiment_id)); + if (m_skip_index_matching) { + return true; + } + + if (get(row, "experiment_id") != experiment_id) { + return false; + } + + if (m_index[FLOW_CELL_ID] && (get(row, "flow_cell_id") != flow_cell_id)) { + return false; + } + + if (m_index[POSITION_ID] && (get(row, "position_id") != position_id)) { + return false; + } + + return true; } std::string SampleSheet::get(const Row& row, const std::string& key) const { From 6b8266712e636f2ddd560df4b82da6b9eb460c79 Mon Sep 17 00:00:00 2001 From: Steve Malton Date: Mon, 23 Oct 2023 16:06:30 +0100 Subject: [PATCH 04/32] Add position id to the read group --- dorado/data_loader/DataLoader.cpp | 4 +++- dorado/utils/types.h | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/dorado/data_loader/DataLoader.cpp b/dorado/data_loader/DataLoader.cpp index e57599e50..d6c25cf50 100644 --- a/dorado/data_loader/DataLoader.cpp +++ b/dorado/data_loader/DataLoader.cpp @@ -418,6 +418,7 @@ std::unordered_map DataLoader::load_read_groups( std::string device_id = run_info_data->system_name; std::string run_id = run_info_data->acquisition_id; std::string sample_id = run_info_data->sample_id; + std::string position_id = run_info_data->sequencer_position; if (pod5_free_run_info(run_info_data) != POD5_OK) { spdlog::error("Failed to free run info"); @@ -430,7 +431,8 @@ std::unordered_map DataLoader::load_read_groups( flowcell_id, device_id, utils::get_string_timestamp_from_unix_time(exp_start_time_ms), - sample_id}; + sample_id, + position_id}; } if (pod5_close_and_free_reader(file) != POD5_OK) { spdlog::error("Failed to close and free POD5 reader"); diff --git a/dorado/utils/types.h b/dorado/utils/types.h index 1c7633993..fc52670cc 100644 --- a/dorado/utils/types.h +++ b/dorado/utils/types.h @@ -36,6 +36,7 @@ struct ReadGroup { std::string device_id; std::string exp_start_time; std::string sample_id; + std::string position_id; }; struct BamDestructor { From 2ca7dabbae128b1b5df8651356e778d972c2a937 Mon Sep 17 00:00:00 2001 From: Steve Malton Date: Mon, 23 Oct 2023 17:14:36 +0100 Subject: [PATCH 05/32] Use aliases in read group headers --- dorado/cli/basecaller.cpp | 7 ++++++- dorado/cli/duplex.cpp | 2 +- dorado/utils/SampleSheet.cpp | 3 ++- dorado/utils/bam_utils.cpp | 22 +++++++++++++++++++--- dorado/utils/bam_utils.h | 5 ++++- tests/BamUtilsTest.cpp | 8 ++++---- 6 files changed, 36 insertions(+), 11 deletions(-) diff --git a/dorado/cli/basecaller.cpp b/dorado/cli/basecaller.cpp index 7bad067cf..eec837e77 100644 --- a/dorado/cli/basecaller.cpp +++ b/dorado/cli/basecaller.cpp @@ -123,9 +123,14 @@ void setup(std::vector args, num_devices, !remora_runners.empty() ? num_remora_threads : 0, enable_aligner, !barcode_kits.empty()); + std::shared_ptr sample_sheet; + if (!barcode_sample_sheet.empty()) { + sample_sheet = std::make_shared(barcode_sample_sheet); + } + SamHdrPtr hdr(sam_hdr_init()); cli::add_pg_hdr(hdr.get(), args); - utils::add_rg_hdr(hdr.get(), read_groups, barcode_kits); + utils::add_rg_hdr(hdr.get(), read_groups, barcode_kits, sample_sheet); PipelineDescriptor pipeline_desc; auto hts_writer = pipeline_desc.add_node( diff --git a/dorado/cli/duplex.cpp b/dorado/cli/duplex.cpp index 35940d1c9..21e4113e6 100644 --- a/dorado/cli/duplex.cpp +++ b/dorado/cli/duplex.cpp @@ -287,7 +287,7 @@ int duplex(int argc, char* argv[]) { read_groups.merge( DataLoader::load_read_groups(reads, duplex_rg_name, recursive_file_loading)); std::vector barcode_kits; - utils::add_rg_hdr(hdr.get(), read_groups, barcode_kits); + utils::add_rg_hdr(hdr.get(), read_groups, barcode_kits, nullptr); int batch_size(parser.visible.get("-b")); int chunk_size(parser.visible.get("-c")); diff --git a/dorado/utils/SampleSheet.cpp b/dorado/utils/SampleSheet.cpp index e00432fe1..1b5ab40cf 100644 --- a/dorado/utils/SampleSheet.cpp +++ b/dorado/utils/SampleSheet.cpp @@ -213,9 +213,10 @@ std::string SampleSheet::get_alias(const std::string& flow_cell_id, return ""; } + auto normalized_barcode_name = barcode_kits::normalize_barcode_name(barcode); for (const auto& row : m_rows) { if (match_index(row, flow_cell_id, position_id, experiment_id) && - get(row, "barcode") == barcode) { + get(row, "barcode") == normalized_barcode_name) { return get(row, "alias"); } } diff --git a/dorado/utils/bam_utils.cpp b/dorado/utils/bam_utils.cpp index 83151adbe..78ce9eaad 100644 --- a/dorado/utils/bam_utils.cpp +++ b/dorado/utils/bam_utils.cpp @@ -1,5 +1,6 @@ #include "bam_utils.h" +#include "SampleSheet.h" #include "barcode_kits.h" #include "sequence_utils.h" @@ -17,7 +18,8 @@ namespace dorado::utils { void add_rg_hdr(sam_hdr_t* hdr, const std::unordered_map& read_groups, - const std::vector& barcode_kits) { + const std::vector& barcode_kits, + const std::shared_ptr& sample_sheet) { const auto& barcode_kit_infos = barcode_kits::get_kit_infos(); const auto& barcode_sequences = barcode_kits::get_barcodes(); @@ -66,8 +68,22 @@ void add_rg_hdr(sam_hdr_t* hdr, for (const auto& barcode_name : kit_info.barcodes) { const auto additional_tags = "\tBC:" + barcode_sequences.at(barcode_name); for (const auto& read_group : read_groups) { - auto id = read_group.first + '_' + - barcode_kits::generate_standard_barcode_name(kit_name, barcode_name); + std::string alias; + auto barcode = barcode_name; + auto id = read_group.first + '_'; + if (sample_sheet) { + if (!sample_sheet->barcode_is_permitted(barcode_name)) { + continue; + } + alias = sample_sheet->get_alias(read_group.second.flowcell_id, + read_group.second.position_id, + read_group.second.run_id, barcode); + } + if (!alias.empty()) { + id += kit_name + "_" + alias; + } else { + id += barcode_kits::generate_standard_barcode_name(kit_name, barcode); + } const std::string read_group_tags = to_string(read_group.second); emit_read_group(read_group_tags, id, additional_tags); } diff --git a/dorado/utils/bam_utils.h b/dorado/utils/bam_utils.h index 1e4c50c99..a9cc4207b 100644 --- a/dorado/utils/bam_utils.h +++ b/dorado/utils/bam_utils.h @@ -10,6 +10,8 @@ struct sam_hdr_t; namespace dorado::utils { +class SampleSheet; + using sq_t = std::vector>; struct AlignmentOps { @@ -23,7 +25,8 @@ struct AlignmentOps { void add_rg_hdr(sam_hdr_t* hdr, const std::unordered_map& read_groups, - const std::vector& barcode_kits); + const std::vector& barcode_kits, + const std::shared_ptr& sample_sheet); void add_sq_hdr(sam_hdr_t* hdr, const sq_t& seqs); diff --git a/tests/BamUtilsTest.cpp b/tests/BamUtilsTest.cpp index 9e41e7165..2117ce8f5 100644 --- a/tests/BamUtilsTest.cpp +++ b/tests/BamUtilsTest.cpp @@ -64,7 +64,7 @@ TEST_CASE("BamUtilsTest: add_rg_hdr read group headers", TEST_GROUP) { SECTION("No read groups generate no headers") { dorado::SamHdrPtr sam_header(sam_hdr_init()); CHECK(sam_hdr_count_lines(sam_header.get(), "RG") == 0); - dorado::utils::add_rg_hdr(sam_header.get(), {}, {}); + dorado::utils::add_rg_hdr(sam_header.get(), {}, {}, nullptr); CHECK(sam_hdr_count_lines(sam_header.get(), "RG") == 0); } @@ -77,7 +77,7 @@ TEST_CASE("BamUtilsTest: add_rg_hdr read group headers", TEST_GROUP) { SECTION("Read groups") { dorado::SamHdrPtr sam_header(sam_hdr_init()); - dorado::utils::add_rg_hdr(sam_header.get(), read_groups, {}); + dorado::utils::add_rg_hdr(sam_header.get(), read_groups, {}, nullptr); // Check the IDs of the groups are all there. CHECK(sam_hdr_count_lines(sam_header.get(), "RG") == read_groups.size()); @@ -97,7 +97,7 @@ TEST_CASE("BamUtilsTest: add_rg_hdr read group headers", TEST_GROUP) { SECTION("Read groups with barcodes") { dorado::SamHdrPtr sam_header(sam_hdr_init()); - dorado::utils::add_rg_hdr(sam_header.get(), read_groups, barcode_kits); + dorado::utils::add_rg_hdr(sam_header.get(), read_groups, barcode_kits, nullptr); // Check the IDs of the groups are all there. size_t total_barcodes = 0; @@ -130,7 +130,7 @@ TEST_CASE("BamUtilsTest: add_rg_hdr read group headers", TEST_GROUP) { SECTION("Read groups with unknown barcode kit") { dorado::SamHdrPtr sam_header(sam_hdr_init()); - CHECK_THROWS(dorado::utils::add_rg_hdr(sam_header.get(), read_groups, {"blah"})); + CHECK_THROWS(dorado::utils::add_rg_hdr(sam_header.get(), read_groups, {"blah"}, nullptr)); } } From dc275a3976ab7d8258516999bb4ab1ec1127055d Mon Sep 17 00:00:00 2001 From: Steve Malton Date: Mon, 23 Oct 2023 17:14:45 +0100 Subject: [PATCH 06/32] Update tests to match pod5 from tests/data/barcode_demux/read_group_test --- tests/SampleSheetTests.cpp | 32 +++++++++++++-------- tests/data/sample_sheets/single_barcode.csv | 16 +++++------ 2 files changed, 28 insertions(+), 20 deletions(-) diff --git a/tests/SampleSheetTests.cpp b/tests/SampleSheetTests.cpp index 91cff2207..f1eca2490 100644 --- a/tests/SampleSheetTests.cpp +++ b/tests/SampleSheetTests.cpp @@ -29,7 +29,8 @@ TEST_CASE(CUT_TAG " load valid no-barcode sample sheet", CUT_TAG) { // Test that all the alias functions return empty strings std::string alias; - REQUIRE_NOTHROW(alias = sample_sheet.get_alias("FA026858", "pos_id", "sequencing_20200522", + REQUIRE_NOTHROW(alias = sample_sheet.get_alias("PAO25751", "pos_id", + "9bf5b3eb10d3b031970acc022aecad4ecc918865", "barcode10")); CHECK(alias == ""); } @@ -42,28 +43,33 @@ TEST_CASE(CUT_TAG " load valid single barcode sample sheet", CUT_TAG) { // Test first entry loads correctly std::string alias; - REQUIRE_NOTHROW( - alias = sample_sheet.get_alias("FA026858", "", "sequencing_20200522", "barcode01")); + REQUIRE_NOTHROW(alias = sample_sheet.get_alias("PAO25751", "", + "9bf5b3eb10d3b031970acc022aecad4ecc918865", + "barcode01")); CHECK(alias == "patient_id_5"); // Test last entry loads correctly - REQUIRE_NOTHROW( - alias = sample_sheet.get_alias("FA026858", "", "sequencing_20200522", "barcode08")); + REQUIRE_NOTHROW(alias = sample_sheet.get_alias("PAO25751", "", + "9bf5b3eb10d3b031970acc022aecad4ecc918865", + "barcode08")); CHECK(alias == "patient_id_4"); // TODO: is this what we want? // Test that asking for position_id when it's not there stops you getting an alias - REQUIRE_NOTHROW(alias = sample_sheet.get_alias("FA026858", "pos_id", "sequencing_20200522", + REQUIRE_NOTHROW(alias = sample_sheet.get_alias("PAO25751", "pos_id", + "9bf5b3eb10d3b031970acc022aecad4ecc918865", "barcode01")); CHECK(alias == ""); // Test that asking for neither position_id or flowcell_id stops you getting an alias - REQUIRE_NOTHROW(alias = sample_sheet.get_alias("", "", "sequencing_20200522", "barcode01")); + REQUIRE_NOTHROW(alias = sample_sheet.get_alias( + "", "", "9bf5b3eb10d3b031970acc022aecad4ecc918865", "barcode01")); CHECK(alias == ""); // Test non-existent entry - REQUIRE_NOTHROW( - alias = sample_sheet.get_alias("FA026858", "", "sequencing_20200522", "barcode10")); + REQUIRE_NOTHROW(alias = sample_sheet.get_alias("PAO25751", "", + "9bf5b3eb10d3b031970acc022aecad4ecc918865", + "barcode10")); CHECK(alias == ""); } @@ -102,7 +108,8 @@ TEST_CASE(CUT_TAG " load sample sheet cross platform (parameterised)", CUT_TAG) CAPTURE(eol_chars); const std::string HEADER_LINE{"flow_cell_id,kit,sample_id,experiment_id,barcode,alias,type"}; const std::string RECORD_LINE{ - "FA026858,SQK-RBK004,barcoding_run,sequencing_20200522,barcode01,patient_id_5,test_" + "PAO25751,SQK-RBK004,barcoding_run,9bf5b3eb10d3b031970acc022aecad4ecc918865,barcode01," + "patient_id_5,test_" "sample"}; dorado::utils::SampleSheet sample_sheet{}; std::stringstream input_file{HEADER_LINE + eol_chars + RECORD_LINE + eol_chars}; @@ -112,8 +119,9 @@ TEST_CASE(CUT_TAG " load sample sheet cross platform (parameterised)", CUT_TAG) REQUIRE(sample_sheet.get_type() == dorado::utils::SampleSheet::Type::barcode); std::string alias; - REQUIRE_NOTHROW( - alias = sample_sheet.get_alias("FA026858", "", "sequencing_20200522", "barcode01")); + REQUIRE_NOTHROW(alias = sample_sheet.get_alias("PAO25751", "", + "9bf5b3eb10d3b031970acc022aecad4ecc918865", + "barcode01")); CHECK(alias == "patient_id_5"); } diff --git a/tests/data/sample_sheets/single_barcode.csv b/tests/data/sample_sheets/single_barcode.csv index 37496d91f..607c028c9 100644 --- a/tests/data/sample_sheets/single_barcode.csv +++ b/tests/data/sample_sheets/single_barcode.csv @@ -1,9 +1,9 @@ flow_cell_id,kit,sample_id,experiment_id,barcode,alias,type -FA026858,SQK-RBK004,barcoding_run,sequencing_20200522,barcode01,patient_id_5,test_sample -FA026858,SQK-RBK004,barcoding_run,sequencing_20200522,barcode02,patient_id_6,test_sample -FA026858,SQK-RBK004,barcoding_run,sequencing_20200522,barcode03,patient_id_7,test_sample -FA026858,SQK-RBK004,barcoding_run,sequencing_20200522,barcode04,patient_id_8,test_sample -FA026858,SQK-RBK004,barcoding_run,sequencing_20200522,barcode05,patient_id_1,test_sample -FA026858,SQK-RBK004,barcoding_run,sequencing_20200522,barcode06,patient_id_2,test_sample -FA026858,SQK-RBK004,barcoding_run,sequencing_20200522,barcode07,patient_id_3,test_sample -FA026858,SQK-RBK004,barcoding_run,sequencing_20200522,barcode08,patient_id_4,test_sample +PAO25751,SQK-RBK114-96,barcoding_run,9bf5b3eb10d3b031970acc022aecad4ecc918865,barcode01,patient_id_5,test_sample +PAO25751,SQK-RBK114-96,barcoding_run,9bf5b3eb10d3b031970acc022aecad4ecc918865,barcode02,patient_id_6,test_sample +PAO25751,SQK-RBK114-96,barcoding_run,9bf5b3eb10d3b031970acc022aecad4ecc918865,barcode03,patient_id_7,test_sample +PAO25751,SQK-RBK114-96,barcoding_run,9bf5b3eb10d3b031970acc022aecad4ecc918865,barcode04,patient_id_8,test_sample +PAO25751,SQK-RBK114-96,barcoding_run,9bf5b3eb10d3b031970acc022aecad4ecc918865,barcode05,patient_id_1,test_sample +PAO25751,SQK-RBK114-96,barcoding_run,9bf5b3eb10d3b031970acc022aecad4ecc918865,barcode06,patient_id_2,test_sample +PAO25751,SQK-RBK114-96,barcoding_run,9bf5b3eb10d3b031970acc022aecad4ecc918865,barcode07,patient_id_3,test_sample +PAO25751,SQK-RBK114-96,barcoding_run,9bf5b3eb10d3b031970acc022aecad4ecc918865,barcode08,patient_id_4,test_sample From 0a64e9faf72a785a02a259e27dc3bf8bde44fadf Mon Sep 17 00:00:00 2001 From: Steve Malton Date: Mon, 23 Oct 2023 17:15:17 +0100 Subject: [PATCH 07/32] Apply alias when barcoding from Simplex Read (basecaller) --- dorado/read_pipeline/BarcodeClassifierNode.cpp | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/dorado/read_pipeline/BarcodeClassifierNode.cpp b/dorado/read_pipeline/BarcodeClassifierNode.cpp index 6984abbc4..2ba3e8ea0 100644 --- a/dorado/read_pipeline/BarcodeClassifierNode.cpp +++ b/dorado/read_pipeline/BarcodeClassifierNode.cpp @@ -1,6 +1,7 @@ #include "BarcodeClassifierNode.h" #include "demux/BarcodeClassifier.h" +#include "utils/SampleSheet.h" #include "utils/bam_utils.h" #include "utils/barcode_kits.h" #include "utils/trim.h" @@ -281,7 +282,17 @@ void BarcodeClassifierNode::barcode(SimplexRead& read) { // get the sequence to map from the record auto bc_res = barcoder->barcode(read.read_common.seq, barcoding_info->barcode_both_ends, barcoding_info->sample_sheet); - read.read_common.barcode = generate_barcode_string(bc_res); + std::string alias{}; + if (barcoding_info->sample_sheet) { + alias = barcoding_info->sample_sheet->get_alias( + read.read_common.flowcell_id, read.read_common.position_id, read.read_common.run_id, + bc_res.adapter_name); + } + if (!alias.empty()) { + read.read_common.barcode = bc_res.kit + "_" + alias; + } else { + read.read_common.barcode = generate_barcode_string(bc_res); + } m_num_records++; if (barcoding_info->trim) { trim_barcode(read, bc_res); From 90bc91205058caafc66026cedb78028310712e2a Mon Sep 17 00:00:00 2001 From: Steve Malton Date: Tue, 24 Oct 2023 10:55:11 +0100 Subject: [PATCH 08/32] Add unclassified to the forbidden aliases --- dorado/utils/SampleSheet.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/dorado/utils/SampleSheet.cpp b/dorado/utils/SampleSheet.cpp index 1b5ab40cf..afe1a6f92 100644 --- a/dorado/utils/SampleSheet.cpp +++ b/dorado/utils/SampleSheet.cpp @@ -52,6 +52,11 @@ bool is_alias_forbidden(const std::string& input) { return true; } + // Unclassified + if (input == "unclassified") { + return true; + } + return false; } From e8e6ae30834b78bb5564affe48346f88b2f8444d Mon Sep 17 00:00:00 2001 From: Steve Malton Date: Tue, 24 Oct 2023 11:04:10 +0100 Subject: [PATCH 09/32] Switch from run_id to experiment_id --- dorado/data_loader/DataLoader.cpp | 31 ++++++++++--------- .../read_pipeline/BarcodeClassifierNode.cpp | 4 +-- dorado/read_pipeline/ReadPipeline.h | 9 +++--- .../read_pipeline/StereoDuplexEncoderNode.cpp | 1 + dorado/read_pipeline/read_utils.cpp | 1 + 5 files changed, 26 insertions(+), 20 deletions(-) diff --git a/dorado/data_loader/DataLoader.cpp b/dorado/data_loader/DataLoader.cpp index d6c25cf50..75d31759d 100644 --- a/dorado/data_loader/DataLoader.cpp +++ b/dorado/data_loader/DataLoader.cpp @@ -89,7 +89,16 @@ void string_reader(HighFive::Attribute& attribute, std::string& target_str) { if (eol_pos < target_str.size()) { target_str.resize(eol_pos); } -}; +} + +std::string get_string_attribute(const HighFive::Group& group, const std::string& attr_name) { + std::string attribute_string; + if (group.hasAttribute(attr_name)) { + HighFive::Attribute attribute = group.getAttribute(attr_name); + string_reader(attribute, attribute_string); + } + return attribute_string; +} } // namespace namespace dorado { @@ -152,6 +161,7 @@ SimplexReadPtr process_pod5_read(size_t row, new_read->end_sample = read_data.start_sample + read_data.num_samples; new_read->read_common.flowcell_id = run_info_data->flow_cell_id; new_read->read_common.position_id = run_info_data->sequencer_position; + new_read->read_common.experiment_id = run_info_data->experiment_name; new_read->read_common.is_duplex = false; if (pod5_free_run_info(run_info_data) != POD5_OK) { @@ -751,18 +761,11 @@ void DataLoader::load_fast5_reads_from_file(const std::string& path) { std::string fast5_filename = std::filesystem::path(path).filename().string(); HighFive::Group tracking_id_group = read.getGroup("tracking_id"); - - HighFive::Attribute exp_start_time_attr = tracking_id_group.getAttribute("exp_start_time"); - std::string exp_start_time; - string_reader(exp_start_time_attr, exp_start_time); - - HighFive::Attribute flow_cell_id_attr = tracking_id_group.getAttribute("flow_cell_id"); - std::string flow_cell_id; - string_reader(flow_cell_id_attr, flow_cell_id); - - HighFive::Attribute device_id_attr = tracking_id_group.getAttribute("device_id"); - std::string device_id; - string_reader(device_id_attr, device_id); + std::string exp_start_time = get_string_attribute(tracking_id_group, "exp_start_time"); + std::string flow_cell_id = get_string_attribute(tracking_id_group, "flow_cell_id"); + std::string device_id = get_string_attribute(tracking_id_group, "device_id"); + std::string group_protocol_id = + get_string_attribute(tracking_id_group, "group_protocol_id"); auto start_time_str = utils::adjust_time(exp_start_time, static_cast(start_time / sampling_rate)); @@ -783,7 +786,7 @@ void DataLoader::load_fast5_reads_from_file(const std::string& path) { new_read->read_common.attributes.fast5_filename = fast5_filename; new_read->read_common.flowcell_id = flow_cell_id; new_read->read_common.position_id = device_id; - + new_read->read_common.experiment_id = group_protocol_id; new_read->read_common.is_duplex = false; if (!m_allowed_read_ids || (m_allowed_read_ids->find(new_read->read_common.read_id) != diff --git a/dorado/read_pipeline/BarcodeClassifierNode.cpp b/dorado/read_pipeline/BarcodeClassifierNode.cpp index 2ba3e8ea0..944001b44 100644 --- a/dorado/read_pipeline/BarcodeClassifierNode.cpp +++ b/dorado/read_pipeline/BarcodeClassifierNode.cpp @@ -285,8 +285,8 @@ void BarcodeClassifierNode::barcode(SimplexRead& read) { std::string alias{}; if (barcoding_info->sample_sheet) { alias = barcoding_info->sample_sheet->get_alias( - read.read_common.flowcell_id, read.read_common.position_id, read.read_common.run_id, - bc_res.adapter_name); + read.read_common.flowcell_id, read.read_common.position_id, + read.read_common.experiment_id, bc_res.adapter_name); } if (!alias.empty()) { read.read_common.barcode = bc_res.kit + "_" + alias; diff --git a/dorado/read_pipeline/ReadPipeline.h b/dorado/read_pipeline/ReadPipeline.h index 5659529aa..deea3e5f5 100644 --- a/dorado/read_pipeline/ReadPipeline.h +++ b/dorado/read_pipeline/ReadPipeline.h @@ -40,10 +40,11 @@ class ReadCommon { std::string qstring; // Read Qstring (Phred) std::vector moves; // Move table std::vector base_mod_probs; // Modified base probabilities - std::string run_id; // Run ID - used in read group and for sample sheet aliasing - std::string flowcell_id; // Flowcell ID - used in read group and for sample sheet aliasing - std::string position_id; // Position ID - used for sample sheet aliasing - std::string model_name; // Read group + std::string run_id; // Run ID - used in read group + std::string flowcell_id; // Flowcell ID - used in read group and for sample sheet aliasing + std::string position_id; // Position ID - used for sample sheet aliasing + std::string experiment_id; // Experiment ID - used for sample sheet aliasing + std::string model_name; // Read group dorado::details::Attributes attributes; diff --git a/dorado/read_pipeline/StereoDuplexEncoderNode.cpp b/dorado/read_pipeline/StereoDuplexEncoderNode.cpp index 9f604ccc0..25032efa0 100644 --- a/dorado/read_pipeline/StereoDuplexEncoderNode.cpp +++ b/dorado/read_pipeline/StereoDuplexEncoderNode.cpp @@ -272,6 +272,7 @@ DuplexReadPtr StereoDuplexEncoderNode::stereo_encode(const ReadPair& read_pair) read->read_common.run_id = template_read.read_common.run_id; read->read_common.flowcell_id = template_read.read_common.flowcell_id; read->read_common.position_id = template_read.read_common.position_id; + read->read_common.experiment_id = template_read.read_common.experiment_id; edlibFreeAlignResult(result); diff --git a/dorado/read_pipeline/read_utils.cpp b/dorado/read_pipeline/read_utils.cpp index 4342a7f08..37ff88406 100644 --- a/dorado/read_pipeline/read_utils.cpp +++ b/dorado/read_pipeline/read_utils.cpp @@ -23,6 +23,7 @@ SimplexReadPtr shallow_copy_read(const SimplexRead& read) { copy->read_common.run_id = read.read_common.run_id; copy->read_common.flowcell_id = read.read_common.flowcell_id; copy->read_common.position_id = read.read_common.position_id; + copy->read_common.experiment_id = read.read_common.experiment_id; copy->read_common.model_name = read.read_common.model_name; copy->read_common.base_mod_probs = read.read_common.base_mod_probs; From fdfc526d6ed3865dc7d70c3b0d7ef073a16435b9 Mon Sep 17 00:00:00 2001 From: Steve Malton Date: Tue, 24 Oct 2023 11:05:16 +0100 Subject: [PATCH 10/32] Update test data again --- tests/SampleSheetTests.cpp | 34 ++++++--------------- tests/data/sample_sheets/single_barcode.csv | 16 +++++----- 2 files changed, 18 insertions(+), 32 deletions(-) diff --git a/tests/SampleSheetTests.cpp b/tests/SampleSheetTests.cpp index f1eca2490..959115b75 100644 --- a/tests/SampleSheetTests.cpp +++ b/tests/SampleSheetTests.cpp @@ -29,9 +29,7 @@ TEST_CASE(CUT_TAG " load valid no-barcode sample sheet", CUT_TAG) { // Test that all the alias functions return empty strings std::string alias; - REQUIRE_NOTHROW(alias = sample_sheet.get_alias("PAO25751", "pos_id", - "9bf5b3eb10d3b031970acc022aecad4ecc918865", - "barcode10")); + REQUIRE_NOTHROW(alias = sample_sheet.get_alias("PAO25751", "pos_id", "", "barcode10")); CHECK(alias == ""); } @@ -43,33 +41,23 @@ TEST_CASE(CUT_TAG " load valid single barcode sample sheet", CUT_TAG) { // Test first entry loads correctly std::string alias; - REQUIRE_NOTHROW(alias = sample_sheet.get_alias("PAO25751", "", - "9bf5b3eb10d3b031970acc022aecad4ecc918865", - "barcode01")); + REQUIRE_NOTHROW(alias = sample_sheet.get_alias("PAO25751", "", "", "barcode01")); CHECK(alias == "patient_id_5"); // Test last entry loads correctly - REQUIRE_NOTHROW(alias = sample_sheet.get_alias("PAO25751", "", - "9bf5b3eb10d3b031970acc022aecad4ecc918865", - "barcode08")); + REQUIRE_NOTHROW(alias = sample_sheet.get_alias("PAO25751", "", "", "barcode08")); CHECK(alias == "patient_id_4"); - // TODO: is this what we want? - // Test that asking for position_id when it's not there stops you getting an alias - REQUIRE_NOTHROW(alias = sample_sheet.get_alias("PAO25751", "pos_id", - "9bf5b3eb10d3b031970acc022aecad4ecc918865", - "barcode01")); - CHECK(alias == ""); + // Test that providing position_id when it's not in the sample sheet doesn't stop you getting an alias + REQUIRE_NOTHROW(alias = sample_sheet.get_alias("PAO25751", "pos_id", "", "barcode01")); + CHECK(alias == "patient_id_5"); // Test that asking for neither position_id or flowcell_id stops you getting an alias - REQUIRE_NOTHROW(alias = sample_sheet.get_alias( - "", "", "9bf5b3eb10d3b031970acc022aecad4ecc918865", "barcode01")); + REQUIRE_NOTHROW(alias = sample_sheet.get_alias("", "", "", "barcode01")); CHECK(alias == ""); // Test non-existent entry - REQUIRE_NOTHROW(alias = sample_sheet.get_alias("PAO25751", "", - "9bf5b3eb10d3b031970acc022aecad4ecc918865", - "barcode10")); + REQUIRE_NOTHROW(alias = sample_sheet.get_alias("PAO25751", "", "", "barcode10")); CHECK(alias == ""); } @@ -108,7 +96,7 @@ TEST_CASE(CUT_TAG " load sample sheet cross platform (parameterised)", CUT_TAG) CAPTURE(eol_chars); const std::string HEADER_LINE{"flow_cell_id,kit,sample_id,experiment_id,barcode,alias,type"}; const std::string RECORD_LINE{ - "PAO25751,SQK-RBK004,barcoding_run,9bf5b3eb10d3b031970acc022aecad4ecc918865,barcode01," + "PAO25751,SQK-RBK004,barcoding_run,,barcode01," "patient_id_5,test_" "sample"}; dorado::utils::SampleSheet sample_sheet{}; @@ -119,9 +107,7 @@ TEST_CASE(CUT_TAG " load sample sheet cross platform (parameterised)", CUT_TAG) REQUIRE(sample_sheet.get_type() == dorado::utils::SampleSheet::Type::barcode); std::string alias; - REQUIRE_NOTHROW(alias = sample_sheet.get_alias("PAO25751", "", - "9bf5b3eb10d3b031970acc022aecad4ecc918865", - "barcode01")); + REQUIRE_NOTHROW(alias = sample_sheet.get_alias("PAO25751", "", "", "barcode01")); CHECK(alias == "patient_id_5"); } diff --git a/tests/data/sample_sheets/single_barcode.csv b/tests/data/sample_sheets/single_barcode.csv index 607c028c9..823eb1660 100644 --- a/tests/data/sample_sheets/single_barcode.csv +++ b/tests/data/sample_sheets/single_barcode.csv @@ -1,9 +1,9 @@ flow_cell_id,kit,sample_id,experiment_id,barcode,alias,type -PAO25751,SQK-RBK114-96,barcoding_run,9bf5b3eb10d3b031970acc022aecad4ecc918865,barcode01,patient_id_5,test_sample -PAO25751,SQK-RBK114-96,barcoding_run,9bf5b3eb10d3b031970acc022aecad4ecc918865,barcode02,patient_id_6,test_sample -PAO25751,SQK-RBK114-96,barcoding_run,9bf5b3eb10d3b031970acc022aecad4ecc918865,barcode03,patient_id_7,test_sample -PAO25751,SQK-RBK114-96,barcoding_run,9bf5b3eb10d3b031970acc022aecad4ecc918865,barcode04,patient_id_8,test_sample -PAO25751,SQK-RBK114-96,barcoding_run,9bf5b3eb10d3b031970acc022aecad4ecc918865,barcode05,patient_id_1,test_sample -PAO25751,SQK-RBK114-96,barcoding_run,9bf5b3eb10d3b031970acc022aecad4ecc918865,barcode06,patient_id_2,test_sample -PAO25751,SQK-RBK114-96,barcoding_run,9bf5b3eb10d3b031970acc022aecad4ecc918865,barcode07,patient_id_3,test_sample -PAO25751,SQK-RBK114-96,barcoding_run,9bf5b3eb10d3b031970acc022aecad4ecc918865,barcode08,patient_id_4,test_sample +PAO25751,SQK-RBK114-96,barcoding_run,,barcode01,patient_id_5,test_sample +PAO25751,SQK-RBK114-96,barcoding_run,,barcode02,patient_id_6,test_sample +PAO25751,SQK-RBK114-96,barcoding_run,,barcode03,patient_id_7,test_sample +PAO25751,SQK-RBK114-96,barcoding_run,,barcode04,patient_id_8,test_sample +PAO25751,SQK-RBK114-96,barcoding_run,,barcode05,patient_id_1,test_sample +PAO25751,SQK-RBK114-96,barcoding_run,,barcode06,patient_id_2,test_sample +PAO25751,SQK-RBK114-96,barcoding_run,,barcode07,patient_id_3,test_sample +PAO25751,SQK-RBK114-96,barcoding_run,,barcode08,patient_id_4,test_sample From 0a8dd5c62d0b6d79639e7a25dae61aba35c54957 Mon Sep 17 00:00:00 2001 From: Steve Malton Date: Tue, 24 Oct 2023 11:07:20 +0100 Subject: [PATCH 11/32] Remove kit name from alias --- dorado/read_pipeline/BarcodeClassifierNode.cpp | 2 +- dorado/utils/bam_utils.cpp | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/dorado/read_pipeline/BarcodeClassifierNode.cpp b/dorado/read_pipeline/BarcodeClassifierNode.cpp index 944001b44..7de439784 100644 --- a/dorado/read_pipeline/BarcodeClassifierNode.cpp +++ b/dorado/read_pipeline/BarcodeClassifierNode.cpp @@ -289,7 +289,7 @@ void BarcodeClassifierNode::barcode(SimplexRead& read) { read.read_common.experiment_id, bc_res.adapter_name); } if (!alias.empty()) { - read.read_common.barcode = bc_res.kit + "_" + alias; + read.read_common.barcode = alias; } else { read.read_common.barcode = generate_barcode_string(bc_res); } diff --git a/dorado/utils/bam_utils.cpp b/dorado/utils/bam_utils.cpp index 78ce9eaad..21566628d 100644 --- a/dorado/utils/bam_utils.cpp +++ b/dorado/utils/bam_utils.cpp @@ -69,7 +69,6 @@ void add_rg_hdr(sam_hdr_t* hdr, const auto additional_tags = "\tBC:" + barcode_sequences.at(barcode_name); for (const auto& read_group : read_groups) { std::string alias; - auto barcode = barcode_name; auto id = read_group.first + '_'; if (sample_sheet) { if (!sample_sheet->barcode_is_permitted(barcode_name)) { @@ -77,12 +76,12 @@ void add_rg_hdr(sam_hdr_t* hdr, } alias = sample_sheet->get_alias(read_group.second.flowcell_id, read_group.second.position_id, - read_group.second.run_id, barcode); + read_group.second.run_id, barcode_name); } if (!alias.empty()) { - id += kit_name + "_" + alias; + id += alias; } else { - id += barcode_kits::generate_standard_barcode_name(kit_name, barcode); + id += barcode_kits::generate_standard_barcode_name(kit_name, barcode_name); } const std::string read_group_tags = to_string(read_group.second); emit_read_group(read_group_tags, id, additional_tags); From 5abe6163a62455c191a2da523aed501f92898030 Mon Sep 17 00:00:00 2001 From: Steve Malton Date: Tue, 24 Oct 2023 13:15:43 +0100 Subject: [PATCH 12/32] Destroy leaked sam header --- dorado/cli/demux.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/dorado/cli/demux.cpp b/dorado/cli/demux.cpp index bf1c16506..e30657642 100644 --- a/dorado/cli/demux.cpp +++ b/dorado/cli/demux.cpp @@ -166,6 +166,7 @@ int demuxer(int argc, char* argv[]) { auto& demux_writer_ref = dynamic_cast(pipeline->get_node_ref(demux_writer)); demux_writer_ref.set_header(header); + sam_hdr_destroy(header); // Set up stats counting std::vector stats_callables; From a3f706d49b47a819b8c7882c08c3c87b85f40797 Mon Sep 17 00:00:00 2001 From: Steve Malton Date: Tue, 24 Oct 2023 13:16:16 +0100 Subject: [PATCH 13/32] Fix use of run_id in run group aliasing --- dorado/utils/bam_utils.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dorado/utils/bam_utils.cpp b/dorado/utils/bam_utils.cpp index 21566628d..0e0f292d4 100644 --- a/dorado/utils/bam_utils.cpp +++ b/dorado/utils/bam_utils.cpp @@ -76,7 +76,7 @@ void add_rg_hdr(sam_hdr_t* hdr, } alias = sample_sheet->get_alias(read_group.second.flowcell_id, read_group.second.position_id, - read_group.second.run_id, barcode_name); + read_group.second.experiment_id, barcode_name); } if (!alias.empty()) { id += alias; From 3adf9e715dd027d3aac7803c5ab4b5461fc2d71e Mon Sep 17 00:00:00 2001 From: Steve Malton Date: Tue, 24 Oct 2023 13:16:41 +0100 Subject: [PATCH 14/32] Don't normalize unclassified barcodes --- dorado/utils/SampleSheet.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dorado/utils/SampleSheet.cpp b/dorado/utils/SampleSheet.cpp index afe1a6f92..bc97baa33 100644 --- a/dorado/utils/SampleSheet.cpp +++ b/dorado/utils/SampleSheet.cpp @@ -218,7 +218,8 @@ std::string SampleSheet::get_alias(const std::string& flow_cell_id, return ""; } - auto normalized_barcode_name = barcode_kits::normalize_barcode_name(barcode); + auto normalized_barcode_name = + (barcode == "unclassified" ? barcode : barcode_kits::normalize_barcode_name(barcode)); for (const auto& row : m_rows) { if (match_index(row, flow_cell_id, position_id, experiment_id) && get(row, "barcode") == normalized_barcode_name) { From f21e0c2cbd643cc725f32d2b01e96cfdfdfb0fbc Mon Sep 17 00:00:00 2001 From: Steve Malton Date: Tue, 24 Oct 2023 14:04:34 +0100 Subject: [PATCH 15/32] Set experiment id in read group for use in run group aliasing --- dorado/data_loader/DataLoader.cpp | 4 +++- dorado/utils/types.h | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/dorado/data_loader/DataLoader.cpp b/dorado/data_loader/DataLoader.cpp index c7d79f59e..2c90d79c5 100644 --- a/dorado/data_loader/DataLoader.cpp +++ b/dorado/data_loader/DataLoader.cpp @@ -478,6 +478,7 @@ std::unordered_map DataLoader::load_read_groups( std::string run_id = run_info_data->acquisition_id; std::string sample_id = run_info_data->sample_id; std::string position_id = run_info_data->sequencer_position; + std::string experiment_id = run_info_data->experiment_name; if (pod5_free_run_info(run_info_data) != POD5_OK) { spdlog::error("Failed to free run info"); @@ -491,7 +492,8 @@ std::unordered_map DataLoader::load_read_groups( device_id, utils::get_string_timestamp_from_unix_time(exp_start_time_ms), sample_id, - position_id}; + position_id, + experiment_id}; } if (pod5_close_and_free_reader(file) != POD5_OK) { spdlog::error("Failed to close and free POD5 reader"); diff --git a/dorado/utils/types.h b/dorado/utils/types.h index fc52670cc..27f7d221f 100644 --- a/dorado/utils/types.h +++ b/dorado/utils/types.h @@ -37,6 +37,7 @@ struct ReadGroup { std::string exp_start_time; std::string sample_id; std::string position_id; + std::string experiment_id; }; struct BamDestructor { From cc8c81de4c1855cfbf7291c1743ca18c6ffe326e Mon Sep 17 00:00:00 2001 From: Steve Malton Date: Tue, 24 Oct 2023 14:04:49 +0100 Subject: [PATCH 16/32] Update tests to perform aliasing --- tests/data/barcode_demux/sample_sheet.csv | 2 +- tests/test_simple_basecaller_execution.sh | 21 ++++++++++++++------- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/tests/data/barcode_demux/sample_sheet.csv b/tests/data/barcode_demux/sample_sheet.csv index d73594cbf..5dfb5ae83 100644 --- a/tests/data/barcode_demux/sample_sheet.csv +++ b/tests/data/barcode_demux/sample_sheet.csv @@ -1,2 +1,2 @@ flow_cell_id,kit,sample_id,experiment_id,barcode,alias,type -PAO25751,SQK-RBK114-96,no_sample,not_set,barcode01,patient_id_1,test_sample \ No newline at end of file +PAO25751,SQK-RBK114-96,no_sample,,barcode01,patient_id_1,test_sample \ No newline at end of file diff --git a/tests/test_simple_basecaller_execution.sh b/tests/test_simple_basecaller_execution.sh index 572dd9890..81b3f9371 100755 --- a/tests/test_simple_basecaller_execution.sh +++ b/tests/test_simple_basecaller_execution.sh @@ -180,23 +180,28 @@ fi echo dorado basecaller barcoding read groups test_barcoding_read_groups() ( - expected_read_groups_barcode01=$1 - expected_read_groups_barcode04=$2 - expected_read_groups_unclassified=$3 - sample_sheet=$4 + while (( "$#" >= 2 )); do + barcode=$1 + export expected_read_groups_${barcode}=$2 + shift 2 + done + sample_sheet=$1 output_name=read_group_test${sample_sheet:+_sample_sheet} $dorado_bin basecaller -b ${batch} --kit-name SQK-RBK114-96 ${sample_sheet:+--sample-sheet ${sample_sheet}} ${model_5k} $data_dir/barcode_demux/read_group_test > $output_dir/${output_name}.bam samtools quickcheck -u $output_dir/${output_name}.bam split_dir=$output_dir/${output_name} mkdir $split_dir samtools split -u $split_dir/unknown.bam -f "$split_dir/rg_%!.bam" $output_dir/${output_name}.bam - # There should be 4 reads with BC01, 3 with BC04, and 2 unclassified groups. for bam in $split_dir/rg_*.bam; do if [[ $bam =~ "_SQK-RBK114-96_" ]]; then # Arrangement is |_|, so trim the kit from the prefix and the .bam from the suffix. barcode=${bam#*_SQK-RBK114-96_} barcode=${barcode%.bam*} + elif [[ $bam =~ "_${model_name_5k}_" ]]; then + # Arrangement is ||, so trim the model from the prefix and the .bam from the suffix. + barcode=${bam#*_${model_name_5k}_} + barcode=${barcode%.bam*} else barcode="unclassified" fi @@ -217,8 +222,10 @@ test_barcoding_read_groups() ( fi ) -test_barcoding_read_groups 4 3 2 -test_barcoding_read_groups 4 0 5 $data_dir/barcode_demux/sample_sheet.csv +# There should be 4 reads with BC01, 3 with BC04, and 2 unclassified groups. +test_barcoding_read_groups barcode01 4 barcode04 3 unclassified 2 +# There should be 4 reads with BC01 aliased to patient_id_1, and 5 unclassified groups. +test_barcoding_read_groups patient_id_1 4 unclassified 5 $data_dir/barcode_demux/sample_sheet.csv # Test demux only on a pre-classified BAM file $dorado_bin demux --no-classify --output-dir "$output_dir/demux_only_test/" $output_dir/read_group_test.bam From 67346dead8479547179884753cf2417ca15fa854 Mon Sep 17 00:00:00 2001 From: Steve Malton Date: Tue, 24 Oct 2023 16:53:46 +0100 Subject: [PATCH 17/32] No default arguments --- dorado/cli/basecaller.cpp | 2 +- dorado/cli/demux.cpp | 2 +- dorado/utils/SampleSheet.h | 3 +-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/dorado/cli/basecaller.cpp b/dorado/cli/basecaller.cpp index 6f9915742..b7996f4cb 100644 --- a/dorado/cli/basecaller.cpp +++ b/dorado/cli/basecaller.cpp @@ -125,7 +125,7 @@ void setup(std::vector args, std::shared_ptr sample_sheet; if (!barcode_sample_sheet.empty()) { - sample_sheet = std::make_shared(barcode_sample_sheet); + sample_sheet = std::make_shared(barcode_sample_sheet, false); } SamHdrPtr hdr(sam_hdr_init()); diff --git a/dorado/cli/demux.cpp b/dorado/cli/demux.cpp index e30657642..375472532 100644 --- a/dorado/cli/demux.cpp +++ b/dorado/cli/demux.cpp @@ -146,7 +146,7 @@ int demuxer(int argc, char* argv[]) { auto barcode_sample_sheet = parser.get("--sample-sheet"); std::shared_ptr sample_sheet; if (!barcode_sample_sheet.empty()) { - sample_sheet = std::make_shared(barcode_sample_sheet); + sample_sheet = std::make_shared(barcode_sample_sheet, true); } auto demux = pipeline_desc.add_node( {demux_writer}, demux_threads, kit_names, parser.get("--barcode-both-ends"), diff --git a/dorado/utils/SampleSheet.h b/dorado/utils/SampleSheet.h index 64b025d4b..4084a3d7b 100644 --- a/dorado/utils/SampleSheet.h +++ b/dorado/utils/SampleSheet.h @@ -25,8 +25,7 @@ class SampleSheet { // If skip_index_matching is true the lookup by flowcell/experiment id will be skipped when fetching an alias. // In this case, the constructor will throw if the sample sheet contains entries for more that one flow_cell_id, // position_id or experiment_id, or if any barcode is re-used. - explicit SampleSheet(const std::string& filename = std::string(), - bool skip_index_matching = false); + explicit SampleSheet(const std::string& filename, bool skip_index_matching); // load a sample sheet from a file. Throws a std::runtime_error for any failure condition. void load(const std::string& filename); From 52d9bffcad5ab93edfa7823899ae2311aee17697 Mon Sep 17 00:00:00 2001 From: Steve Malton Date: Tue, 24 Oct 2023 16:54:08 +0100 Subject: [PATCH 18/32] Move sample sheet in demux --- dorado/cli/demux.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dorado/cli/demux.cpp b/dorado/cli/demux.cpp index 375472532..3cd63af13 100644 --- a/dorado/cli/demux.cpp +++ b/dorado/cli/demux.cpp @@ -150,7 +150,7 @@ int demuxer(int argc, char* argv[]) { } auto demux = pipeline_desc.add_node( {demux_writer}, demux_threads, kit_names, parser.get("--barcode-both-ends"), - parser.get("--no-trim"), sample_sheet); + parser.get("--no-trim"), std::move(sample_sheet)); } // Create the Pipeline from our description. From 4e9217102aacfd22729d376a748ac448166c819f Mon Sep 17 00:00:00 2001 From: Steve Malton Date: Tue, 24 Oct 2023 16:54:26 +0100 Subject: [PATCH 19/32] Remove shadowing variable --- dorado/cli/basecaller.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/dorado/cli/basecaller.cpp b/dorado/cli/basecaller.cpp index b7996f4cb..384ff105c 100644 --- a/dorado/cli/basecaller.cpp +++ b/dorado/cli/basecaller.cpp @@ -151,10 +151,6 @@ void setup(std::vector args, PolyACalculator::get_model_type(model_name)); } if (!barcode_kits.empty()) { - std::shared_ptr sample_sheet; - if (!barcode_sample_sheet.empty()) { - sample_sheet = std::make_shared(barcode_sample_sheet); - } current_sink_node = pipeline_desc.add_node( {current_sink_node}, thread_allocations.barcoder_threads, barcode_kits, barcode_both_ends, barcode_no_trim, std::move(sample_sheet)); From 82187acbe62214add4f0b82bb1c72ef075821e67 Mon Sep 17 00:00:00 2001 From: Steve Malton Date: Tue, 24 Oct 2023 16:55:05 +0100 Subject: [PATCH 20/32] Alias in demux path --- dorado/read_pipeline/BarcodeClassifierNode.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/dorado/read_pipeline/BarcodeClassifierNode.cpp b/dorado/read_pipeline/BarcodeClassifierNode.cpp index 7de439784..0321f72a1 100644 --- a/dorado/read_pipeline/BarcodeClassifierNode.cpp +++ b/dorado/read_pipeline/BarcodeClassifierNode.cpp @@ -263,7 +263,12 @@ void BarcodeClassifierNode::barcode(BamPtr& read) { auto bc_res = barcoder->barcode(seq, m_default_barcoding_info->barcode_both_ends, m_default_barcoding_info->sample_sheet); - auto bc = generate_barcode_string(bc_res); + std::string alias{}; + if (m_default_barcoding_info->sample_sheet) { + // experiment id and position id are not stored in the bam file, so we can't recover them to use here + alias = m_default_barcoding_info->sample_sheet->get_alias("", "", "", bc_res.adapter_name); + } + auto bc = alias.empty() ? generate_barcode_string(bc_res) : alias; bam_aux_append(irecord, "BC", 'Z', bc.length() + 1, (uint8_t*)bc.c_str()); m_num_records++; From aa4df3c95cfbfc205893eae9289950709a96b078 Mon Sep 17 00:00:00 2001 From: Steve Malton Date: Tue, 24 Oct 2023 17:07:44 +0100 Subject: [PATCH 21/32] Add default constructor for testing --- dorado/utils/SampleSheet.cpp | 2 ++ dorado/utils/SampleSheet.h | 2 ++ 2 files changed, 4 insertions(+) diff --git a/dorado/utils/SampleSheet.cpp b/dorado/utils/SampleSheet.cpp index bc97baa33..edb22db4b 100644 --- a/dorado/utils/SampleSheet.cpp +++ b/dorado/utils/SampleSheet.cpp @@ -82,6 +82,8 @@ bool get_line(std::istream& input, namespace dorado::utils { +SampleSheet::SampleSheet() : m_skip_index_matching(false) {} + SampleSheet::SampleSheet(const std::string& filename, bool skip_index_matching) : m_filename(filename), m_skip_index_matching(skip_index_matching) { if (!filename.empty()) { diff --git a/dorado/utils/SampleSheet.h b/dorado/utils/SampleSheet.h index 4084a3d7b..867f57424 100644 --- a/dorado/utils/SampleSheet.h +++ b/dorado/utils/SampleSheet.h @@ -21,6 +21,8 @@ class SampleSheet { POSITION_ID, }; + SampleSheet(); + // Calls load on the passed filename, if it is not empty. // If skip_index_matching is true the lookup by flowcell/experiment id will be skipped when fetching an alias. // In this case, the constructor will throw if the sample sheet contains entries for more that one flow_cell_id, From 521bf0a2d44b8acf12e3890b03bcc0d7d6f2910f Mon Sep 17 00:00:00 2001 From: Steve Malton Date: Wed, 25 Oct 2023 15:45:31 +0100 Subject: [PATCH 22/32] Fix check_index logic --- dorado/utils/SampleSheet.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dorado/utils/SampleSheet.cpp b/dorado/utils/SampleSheet.cpp index edb22db4b..ec8c4c4cb 100644 --- a/dorado/utils/SampleSheet.cpp +++ b/dorado/utils/SampleSheet.cpp @@ -327,14 +327,14 @@ bool SampleSheet::check_index(const std::string& flow_cell_id, return true; } - bool ok = false; + bool ok = m_index.any(); // one of the indicies must be set if (m_index[FLOW_CELL_ID]) { // if we're expecting a flow cell id, we must provide one - ok |= !flow_cell_id.empty(); + ok &= !flow_cell_id.empty(); } if (m_index[POSITION_ID]) { // if we're expecting a position id, we must provide one - ok |= !position_id.empty(); + ok &= !position_id.empty(); } return ok; } From 5dcdf20c6edd0606b73b8b5af5f5473dab1526ed Mon Sep 17 00:00:00 2001 From: Steve Malton Date: Wed, 25 Oct 2023 15:50:17 +0100 Subject: [PATCH 23/32] Switch from shared to raw pointer for function parameters --- dorado/demux/BarcodeClassifier.cpp | 15 +++++++-------- dorado/demux/BarcodeClassifier.h | 11 +++++------ dorado/read_pipeline/BarcodeClassifierNode.cpp | 4 ++-- 3 files changed, 14 insertions(+), 16 deletions(-) diff --git a/dorado/demux/BarcodeClassifier.cpp b/dorado/demux/BarcodeClassifier.cpp index 1c4a2bf1a..c337b5bba 100644 --- a/dorado/demux/BarcodeClassifier.cpp +++ b/dorado/demux/BarcodeClassifier.cpp @@ -125,10 +125,9 @@ BarcodeClassifier::BarcodeClassifier(const std::vector& kit_names) BarcodeClassifier::~BarcodeClassifier() = default; -ScoreResults BarcodeClassifier::barcode( - const std::string& seq, - bool barcode_both_ends, - const std::shared_ptr& sample_sheet) const { +ScoreResults BarcodeClassifier::barcode(const std::string& seq, + bool barcode_both_ends, + const utils::SampleSheet* const sample_sheet) const { auto best_adapter = find_best_adapter(seq, m_adapter_sequences, barcode_both_ends, sample_sheet); return best_adapter; @@ -208,7 +207,7 @@ std::vector BarcodeClassifier::generate_adap std::vector BarcodeClassifier::calculate_adapter_score_different_double_ends( std::string_view read_seq, const AdapterSequence& as, - const std::shared_ptr& sample_sheet) const { + const utils::SampleSheet* const sample_sheet) const { std::string_view read_top = read_seq.substr(0, TRIM_LENGTH); int bottom_start = std::max(0, (int)read_seq.length() - TRIM_LENGTH); std::string_view read_bottom = read_seq.substr(bottom_start, TRIM_LENGTH); @@ -343,7 +342,7 @@ std::vector BarcodeClassifier::calculate_adapter_score_different_d std::vector BarcodeClassifier::calculate_adapter_score_double_ends( std::string_view read_seq, const AdapterSequence& as, - const std::shared_ptr& sample_sheet) const { + const utils::SampleSheet* const sample_sheet) const { bool debug_mode = (spdlog::get_level() == spdlog::level::debug); std::string_view read_top = read_seq.substr(0, TRIM_LENGTH); int bottom_start = std::max(0, (int)read_seq.length() - TRIM_LENGTH); @@ -416,7 +415,7 @@ std::vector BarcodeClassifier::calculate_adapter_score_double_ends std::vector BarcodeClassifier::calculate_adapter_score( std::string_view read_seq, const AdapterSequence& as, - const std::shared_ptr& sample_sheet) const { + const utils::SampleSheet* const sample_sheet) const { bool debug_mode = (spdlog::get_level() == spdlog::level::debug); std::string_view read_top = read_seq.substr(0, TRIM_LENGTH); @@ -559,7 +558,7 @@ ScoreResults BarcodeClassifier::find_best_adapter( const std::string& read_seq, const std::vector& adapters, bool barcode_both_ends, - const std::shared_ptr& sample_sheet) const { + const utils::SampleSheet* const sample_sheet) const { if (read_seq.length() < TRIM_LENGTH) { return UNCLASSIFIED; } diff --git a/dorado/demux/BarcodeClassifier.h b/dorado/demux/BarcodeClassifier.h index 3245fc258..74ff67550 100644 --- a/dorado/demux/BarcodeClassifier.h +++ b/dorado/demux/BarcodeClassifier.h @@ -4,7 +4,6 @@ #include "utils/types.h" #include -#include #include #include #include @@ -43,7 +42,7 @@ class BarcodeClassifier { ScoreResults barcode(const std::string& seq, bool barcode_both_ends, - const std::shared_ptr& sample_sheet) const; + const utils::SampleSheet* const sample_sheet) const; private: const std::vector m_adapter_sequences; @@ -53,19 +52,19 @@ class BarcodeClassifier { std::vector calculate_adapter_score_different_double_ends( std::string_view read_seq, const AdapterSequence& as, - const std::shared_ptr& sample_sheet) const; + const utils::SampleSheet* const sample_sheet) const; std::vector calculate_adapter_score_double_ends( std::string_view read_seq, const AdapterSequence& as, - const std::shared_ptr& sample_sheet) const; + const utils::SampleSheet* const sample_sheet) const; std::vector calculate_adapter_score( std::string_view read_seq, const AdapterSequence& as, - const std::shared_ptr& sample_sheet) const; + const utils::SampleSheet* const sample_sheet) const; ScoreResults find_best_adapter(const std::string& read_seq, const std::vector& adapter, bool barcode_both_ends, - const std::shared_ptr& sample_sheet) const; + const utils::SampleSheet* const sample_sheet) const; }; } // namespace demux diff --git a/dorado/read_pipeline/BarcodeClassifierNode.cpp b/dorado/read_pipeline/BarcodeClassifierNode.cpp index 0321f72a1..02dd3151f 100644 --- a/dorado/read_pipeline/BarcodeClassifierNode.cpp +++ b/dorado/read_pipeline/BarcodeClassifierNode.cpp @@ -262,7 +262,7 @@ void BarcodeClassifierNode::barcode(BamPtr& read) { std::string seq = utils::extract_sequence(irecord, seqlen); auto bc_res = barcoder->barcode(seq, m_default_barcoding_info->barcode_both_ends, - m_default_barcoding_info->sample_sheet); + m_default_barcoding_info->sample_sheet.get()); std::string alias{}; if (m_default_barcoding_info->sample_sheet) { // experiment id and position id are not stored in the bam file, so we can't recover them to use here @@ -286,7 +286,7 @@ void BarcodeClassifierNode::barcode(SimplexRead& read) { // get the sequence to map from the record auto bc_res = barcoder->barcode(read.read_common.seq, barcoding_info->barcode_both_ends, - barcoding_info->sample_sheet); + barcoding_info->sample_sheet.get()); std::string alias{}; if (barcoding_info->sample_sheet) { alias = barcoding_info->sample_sheet->get_alias( From f993d47dd89073de4d1bd86ec23381049397c243 Mon Sep 17 00:00:00 2001 From: Steve Malton Date: Wed, 25 Oct 2023 15:53:33 +0100 Subject: [PATCH 24/32] Remove unnecessary lambda+switch kerfuffle --- dorado/utils/SampleSheet.cpp | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/dorado/utils/SampleSheet.cpp b/dorado/utils/SampleSheet.cpp index ec8c4c4cb..f8fb0827e 100644 --- a/dorado/utils/SampleSheet.cpp +++ b/dorado/utils/SampleSheet.cpp @@ -155,27 +155,16 @@ void SampleSheet::load(std::istream& file_stream, const std::string& filename) { " does not contain a unique mapping of barcode ids.")); } - m_allowed_barcodes = [this]() -> FilterSet { + if (m_type == Type::barcode) { std::unordered_set barcodes; - switch (m_type) { - case Type::barcode: { - // Grab the barcode idx once so that we're not doing it repeatedly - const auto barcode_idx = m_col_indices.at("barcode"); - - // Grab the barcodes - for (const auto& row : m_rows) { - barcodes.emplace(row[barcode_idx]); - } - break; - } - case Type::none: - [[fallthrough]]; - default: - return std::nullopt; + // Grab the barcode idx once so that we're not doing it repeatedly + const auto barcode_idx = m_col_indices.at("barcode"); + // Grab the barcodes + for (const auto& row : m_rows) { + barcodes.emplace(row[barcode_idx]); } - - return barcodes; - }(); + m_allowed_barcodes = std::move(barcodes); + } } // check if we can generate a unique alias without the flowcell/position information From c28111e42d39d4b4f856b8fecdd0e802da6e1c09 Mon Sep 17 00:00:00 2001 From: Steve Malton Date: Wed, 25 Oct 2023 16:09:15 +0100 Subject: [PATCH 25/32] Switch from shared_ptr to unique_ptr --- dorado/cli/basecaller.cpp | 6 +++--- dorado/cli/demux.cpp | 4 ++-- dorado/read_pipeline/BarcodeClassifierNode.cpp | 2 +- dorado/read_pipeline/BarcodeClassifierNode.h | 2 +- dorado/utils/bam_utils.cpp | 2 +- dorado/utils/bam_utils.h | 2 +- dorado/utils/types.cpp | 4 +++- dorado/utils/types.h | 4 ++-- tests/BarcodeClassifierTest.cpp | 1 + tests/NodeSmokeTest.cpp | 1 + 10 files changed, 16 insertions(+), 12 deletions(-) diff --git a/dorado/cli/basecaller.cpp b/dorado/cli/basecaller.cpp index 384ff105c..5b5b48a1d 100644 --- a/dorado/cli/basecaller.cpp +++ b/dorado/cli/basecaller.cpp @@ -123,14 +123,14 @@ void setup(std::vector args, num_devices, !remora_runners.empty() ? num_remora_threads : 0, enable_aligner, !barcode_kits.empty()); - std::shared_ptr sample_sheet; + std::unique_ptr sample_sheet; if (!barcode_sample_sheet.empty()) { - sample_sheet = std::make_shared(barcode_sample_sheet, false); + sample_sheet = std::make_unique(barcode_sample_sheet, false); } SamHdrPtr hdr(sam_hdr_init()); cli::add_pg_hdr(hdr.get(), args); - utils::add_rg_hdr(hdr.get(), read_groups, barcode_kits, sample_sheet); + utils::add_rg_hdr(hdr.get(), read_groups, barcode_kits, sample_sheet.get()); PipelineDescriptor pipeline_desc; auto hts_writer = pipeline_desc.add_node( diff --git a/dorado/cli/demux.cpp b/dorado/cli/demux.cpp index 3cd63af13..24a38a034 100644 --- a/dorado/cli/demux.cpp +++ b/dorado/cli/demux.cpp @@ -144,9 +144,9 @@ int demuxer(int argc, char* argv[]) { kit_names = std::move(*names); } auto barcode_sample_sheet = parser.get("--sample-sheet"); - std::shared_ptr sample_sheet; + std::unique_ptr sample_sheet; if (!barcode_sample_sheet.empty()) { - sample_sheet = std::make_shared(barcode_sample_sheet, true); + sample_sheet = std::make_unique(barcode_sample_sheet, true); } auto demux = pipeline_desc.add_node( {demux_writer}, demux_threads, kit_names, parser.get("--barcode-both-ends"), diff --git a/dorado/read_pipeline/BarcodeClassifierNode.cpp b/dorado/read_pipeline/BarcodeClassifierNode.cpp index 02dd3151f..567631763 100644 --- a/dorado/read_pipeline/BarcodeClassifierNode.cpp +++ b/dorado/read_pipeline/BarcodeClassifierNode.cpp @@ -41,7 +41,7 @@ BarcodeClassifierNode::BarcodeClassifierNode(int threads, const std::vector& kit_names, bool barcode_both_ends, bool no_trim, - std::shared_ptr sample_sheet) + std::unique_ptr sample_sheet) : MessageSink(10000), m_threads(threads), m_default_barcoding_info(create_barcoding_info(kit_names, diff --git a/dorado/read_pipeline/BarcodeClassifierNode.h b/dorado/read_pipeline/BarcodeClassifierNode.h index 4e4c60e87..a5813546d 100644 --- a/dorado/read_pipeline/BarcodeClassifierNode.h +++ b/dorado/read_pipeline/BarcodeClassifierNode.h @@ -26,7 +26,7 @@ class BarcodeClassifierNode : public MessageSink { const std::vector& kit_name, bool barcode_both_ends, bool no_trim, - std::shared_ptr sample_sheet); + std::unique_ptr sample_sheet); BarcodeClassifierNode(int threads); ~BarcodeClassifierNode(); std::string get_name() const override { return "BarcodeClassifierNode"; } diff --git a/dorado/utils/bam_utils.cpp b/dorado/utils/bam_utils.cpp index 0e0f292d4..7168b7b6f 100644 --- a/dorado/utils/bam_utils.cpp +++ b/dorado/utils/bam_utils.cpp @@ -19,7 +19,7 @@ namespace dorado::utils { void add_rg_hdr(sam_hdr_t* hdr, const std::unordered_map& read_groups, const std::vector& barcode_kits, - const std::shared_ptr& sample_sheet) { + const utils::SampleSheet* const sample_sheet) { const auto& barcode_kit_infos = barcode_kits::get_kit_infos(); const auto& barcode_sequences = barcode_kits::get_barcodes(); diff --git a/dorado/utils/bam_utils.h b/dorado/utils/bam_utils.h index a9cc4207b..ac2d7953e 100644 --- a/dorado/utils/bam_utils.h +++ b/dorado/utils/bam_utils.h @@ -26,7 +26,7 @@ struct AlignmentOps { void add_rg_hdr(sam_hdr_t* hdr, const std::unordered_map& read_groups, const std::vector& barcode_kits, - const std::shared_ptr& sample_sheet); + const utils::SampleSheet* const sample_sheet); void add_sq_hdr(sam_hdr_t* hdr, const sq_t& seqs); diff --git a/dorado/utils/types.cpp b/dorado/utils/types.cpp index ada23fb0f..ab0311edb 100644 --- a/dorado/utils/types.cpp +++ b/dorado/utils/types.cpp @@ -1,5 +1,7 @@ #include "types.h" +#include "SampleSheet.h" + #include #include @@ -9,7 +11,7 @@ std::shared_ptr create_barcoding_info( const std::vector& kit_names, bool barcode_both_ends, bool trim_barcode, - std::shared_ptr sample_sheet) { + std::unique_ptr sample_sheet) { if (kit_names.empty()) { return {}; } diff --git a/dorado/utils/types.h b/dorado/utils/types.h index 27f7d221f..de2f4566e 100644 --- a/dorado/utils/types.h +++ b/dorado/utils/types.h @@ -20,14 +20,14 @@ struct BarcodingInfo { std::string kit_name{}; bool barcode_both_ends{false}; bool trim{false}; - std::shared_ptr sample_sheet; + std::unique_ptr sample_sheet; }; std::shared_ptr create_barcoding_info( const std::vector &kit_names, bool barcode_both_ends, bool trim_barcode, - std::shared_ptr sample_sheet); + std::unique_ptr sample_sheet); struct ReadGroup { std::string run_id; diff --git a/tests/BarcodeClassifierTest.cpp b/tests/BarcodeClassifierTest.cpp index 23c07b6c2..113b1a905 100644 --- a/tests/BarcodeClassifierTest.cpp +++ b/tests/BarcodeClassifierTest.cpp @@ -4,6 +4,7 @@ #include "TestUtils.h" #include "read_pipeline/BarcodeClassifierNode.h" #include "read_pipeline/HtsReader.h" +#include "utils/SampleSheet.h" #include "utils/bam_utils.h" #include "utils/barcode_kits.h" #include "utils/sequence_utils.h" diff --git a/tests/NodeSmokeTest.cpp b/tests/NodeSmokeTest.cpp index f279d6b89..281aad799 100644 --- a/tests/NodeSmokeTest.cpp +++ b/tests/NodeSmokeTest.cpp @@ -14,6 +14,7 @@ #include "read_pipeline/ReadFilterNode.h" #include "read_pipeline/ReadToBamTypeNode.h" #include "read_pipeline/ScalerNode.h" +#include "utils/SampleSheet.h" #include "utils/parameters.h" #if DORADO_GPU_BUILD From ebfc37479314652288267eda88a67e087cf3c18d Mon Sep 17 00:00:00 2001 From: Steve Malton Date: Thu, 26 Oct 2023 09:40:26 +0100 Subject: [PATCH 26/32] Use smart ptr type for sam header --- dorado/cli/demux.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/dorado/cli/demux.cpp b/dorado/cli/demux.cpp index 24a38a034..ed9ae3158 100644 --- a/dorado/cli/demux.cpp +++ b/dorado/cli/demux.cpp @@ -131,8 +131,8 @@ int demuxer(int argc, char* argv[]) { } HtsReader reader(reads[0], read_list); - auto header = sam_hdr_dup(reader.header); - add_pg_hdr(header); + auto header = SamHdrPtr(sam_hdr_dup(reader.header)); + add_pg_hdr(header.get()); PipelineDescriptor pipeline_desc; auto demux_writer = pipeline_desc.add_node( @@ -165,8 +165,7 @@ int demuxer(int argc, char* argv[]) { // rather than the pipeline framework. auto& demux_writer_ref = dynamic_cast(pipeline->get_node_ref(demux_writer)); - demux_writer_ref.set_header(header); - sam_hdr_destroy(header); + demux_writer_ref.set_header(header.get()); // Set up stats counting std::vector stats_callables; From 227b11905ecba15723c2f1401c6c10449508b1f4 Mon Sep 17 00:00:00 2001 From: Steve Malton Date: Thu, 26 Oct 2023 10:33:36 +0100 Subject: [PATCH 27/32] Move basecaller barcode aliasing into the Bam translator node This is the latest place in the pipeline where we have access to the flowcell_id It also isn't used in the server - we want aliasing in that case to occur in the client instead By this point, the barcode will be in full "kit"_"barcode" form, so change the aliasing to use that. --- dorado/cli/basecaller.cpp | 9 +++++++-- dorado/cli/duplex.cpp | 5 +++-- .../read_pipeline/BarcodeClassifierNode.cpp | 12 +----------- dorado/read_pipeline/ReadToBamTypeNode.cpp | 19 ++++++++++++++++++- dorado/read_pipeline/ReadToBamTypeNode.h | 12 +++++++++--- dorado/utils/SampleSheet.cpp | 5 ++--- dorado/utils/bam_utils.cpp | 8 +++++--- tests/NodeSmokeTest.cpp | 4 ++-- 8 files changed, 47 insertions(+), 27 deletions(-) diff --git a/dorado/cli/basecaller.cpp b/dorado/cli/basecaller.cpp index 5b5b48a1d..a62ef39c1 100644 --- a/dorado/cli/basecaller.cpp +++ b/dorado/cli/basecaller.cpp @@ -144,16 +144,21 @@ void setup(std::vector args, } current_sink_node = pipeline_desc.add_node( {current_sink_node}, emit_moves, thread_allocations.read_converter_threads, - methylation_threshold_pct); + methylation_threshold_pct, std::move(sample_sheet), 1000); if (estimate_poly_a) { current_sink_node = pipeline_desc.add_node( {current_sink_node}, std::thread::hardware_concurrency(), PolyACalculator::get_model_type(model_name)); } if (!barcode_kits.empty()) { + std::unique_ptr sample_sheet_2; + if (!barcode_sample_sheet.empty()) { + sample_sheet_2 = + std::make_unique(barcode_sample_sheet, false); + } current_sink_node = pipeline_desc.add_node( {current_sink_node}, thread_allocations.barcoder_threads, barcode_kits, - barcode_both_ends, barcode_no_trim, std::move(sample_sheet)); + barcode_both_ends, barcode_no_trim, std::move(sample_sheet_2)); } current_sink_node = pipeline_desc.add_node( {current_sink_node}, min_qscore, default_parameters.min_sequence_length, diff --git a/dorado/cli/duplex.cpp b/dorado/cli/duplex.cpp index 21e4113e6..98e5b8761 100644 --- a/dorado/cli/duplex.cpp +++ b/dorado/cli/duplex.cpp @@ -12,6 +12,7 @@ #include "read_pipeline/ProgressTracker.h" #include "read_pipeline/ReadFilterNode.h" #include "read_pipeline/ReadToBamTypeNode.h" +#include "utils/SampleSheet.h" #include "utils/bam_utils.h" #include "utils/basecaller_utils.h" #include "utils/duplex_utils.h" @@ -193,8 +194,8 @@ int duplex(int argc, char* argv[]) { pipeline_desc.add_node_sink(aligner, hts_writer); converted_reads_sink = aligner; } - auto read_converter = - pipeline_desc.add_node({converted_reads_sink}, emit_moves, 2); + auto read_converter = pipeline_desc.add_node( + {converted_reads_sink}, emit_moves, 2, 0, nullptr, 1000); auto duplex_read_tagger = pipeline_desc.add_node({read_converter}); // The minimum sequence length is set to 5 to avoid issues with duplex node printing very short sequences for mismatched pairs. std::unordered_set read_ids_to_filter; diff --git a/dorado/read_pipeline/BarcodeClassifierNode.cpp b/dorado/read_pipeline/BarcodeClassifierNode.cpp index 567631763..969169c44 100644 --- a/dorado/read_pipeline/BarcodeClassifierNode.cpp +++ b/dorado/read_pipeline/BarcodeClassifierNode.cpp @@ -287,17 +287,7 @@ void BarcodeClassifierNode::barcode(SimplexRead& read) { // get the sequence to map from the record auto bc_res = barcoder->barcode(read.read_common.seq, barcoding_info->barcode_both_ends, barcoding_info->sample_sheet.get()); - std::string alias{}; - if (barcoding_info->sample_sheet) { - alias = barcoding_info->sample_sheet->get_alias( - read.read_common.flowcell_id, read.read_common.position_id, - read.read_common.experiment_id, bc_res.adapter_name); - } - if (!alias.empty()) { - read.read_common.barcode = alias; - } else { - read.read_common.barcode = generate_barcode_string(bc_res); - } + read.read_common.barcode = generate_barcode_string(bc_res); m_num_records++; if (barcoding_info->trim) { trim_barcode(read, bc_res); diff --git a/dorado/read_pipeline/ReadToBamTypeNode.cpp b/dorado/read_pipeline/ReadToBamTypeNode.cpp index c3465fb4b..6d725edb6 100644 --- a/dorado/read_pipeline/ReadToBamTypeNode.cpp +++ b/dorado/read_pipeline/ReadToBamTypeNode.cpp @@ -1,5 +1,7 @@ #include "ReadToBamTypeNode.h" +#include "utils/SampleSheet.h" + #include #include @@ -24,6 +26,17 @@ void ReadToBamType::worker_thread() { if (!read_common_data.is_duplex) { is_duplex_parent = std::get(message)->is_duplex_parent; } + + // alias barcode if present + if (m_sample_sheet && !read_common_data.barcode.empty()) { + auto alias = m_sample_sheet->get_alias( + read_common_data.flowcell_id, read_common_data.position_id, + read_common_data.experiment_id, read_common_data.barcode); + if (!alias.empty()) { + read_common_data.barcode = alias; + } + } + auto alns = read_common_data.extract_sam_lines(m_emit_moves, m_modbase_threshold, is_duplex_parent); for (auto& aln : alns) { @@ -35,15 +48,19 @@ void ReadToBamType::worker_thread() { ReadToBamType::ReadToBamType(bool emit_moves, size_t num_worker_threads, float modbase_threshold_frac, + std::unique_ptr sample_sheet, size_t max_reads) : MessageSink(max_reads), m_num_worker_threads(num_worker_threads), m_emit_moves(emit_moves), m_modbase_threshold( - static_cast(std::min(modbase_threshold_frac * 256.0f, 255.0f))) { + static_cast(std::min(modbase_threshold_frac * 256.0f, 255.0f))), + m_sample_sheet(std::move(sample_sheet)) { start_threads(); } +ReadToBamType::~ReadToBamType() { terminate_impl(); } + void ReadToBamType::start_threads() { for (size_t i = 0; i < m_num_worker_threads; i++) { m_workers.push_back( diff --git a/dorado/read_pipeline/ReadToBamTypeNode.h b/dorado/read_pipeline/ReadToBamTypeNode.h index f3d2be672..d4c3001e5 100644 --- a/dorado/read_pipeline/ReadToBamTypeNode.h +++ b/dorado/read_pipeline/ReadToBamTypeNode.h @@ -11,13 +11,18 @@ namespace dorado { +namespace utils { +class SampleSheet; +} + class ReadToBamType : public MessageSink { public: ReadToBamType(bool emit_moves, size_t num_worker_threads, - float modbase_threshold_frac = 0, - size_t max_reads = 1000); - ~ReadToBamType() { terminate_impl(); } + float modbase_threshold_frac, + std::unique_ptr sample_sheet, + size_t max_reads); + ~ReadToBamType(); std::string get_name() const override { return "ReadToBamType"; } void terminate(const FlushOptions& flush_options) override { terminate_impl(); }; void restart() override; @@ -33,6 +38,7 @@ class ReadToBamType : public MessageSink { bool m_emit_moves; uint8_t m_modbase_threshold; + std::unique_ptr m_sample_sheet; }; } // namespace dorado diff --git a/dorado/utils/SampleSheet.cpp b/dorado/utils/SampleSheet.cpp index f8fb0827e..72afd2346 100644 --- a/dorado/utils/SampleSheet.cpp +++ b/dorado/utils/SampleSheet.cpp @@ -209,11 +209,10 @@ std::string SampleSheet::get_alias(const std::string& flow_cell_id, return ""; } - auto normalized_barcode_name = - (barcode == "unclassified" ? barcode : barcode_kits::normalize_barcode_name(barcode)); for (const auto& row : m_rows) { + auto standard_barcode_name = get(row, "kit") + "_" + get(row, "barcode"); if (match_index(row, flow_cell_id, position_id, experiment_id) && - get(row, "barcode") == normalized_barcode_name) { + barcode == standard_barcode_name) { return get(row, "alias"); } } diff --git a/dorado/utils/bam_utils.cpp b/dorado/utils/bam_utils.cpp index 7168b7b6f..0a2b101a9 100644 --- a/dorado/utils/bam_utils.cpp +++ b/dorado/utils/bam_utils.cpp @@ -74,9 +74,11 @@ void add_rg_hdr(sam_hdr_t* hdr, if (!sample_sheet->barcode_is_permitted(barcode_name)) { continue; } - alias = sample_sheet->get_alias(read_group.second.flowcell_id, - read_group.second.position_id, - read_group.second.experiment_id, barcode_name); + + alias = sample_sheet->get_alias( + read_group.second.flowcell_id, read_group.second.position_id, + read_group.second.experiment_id, + barcode_kits::generate_standard_barcode_name(kit_name, barcode_name)); } if (!alias.empty()) { id += alias; diff --git a/tests/NodeSmokeTest.cpp b/tests/NodeSmokeTest.cpp index 281aad799..4d843f974 100644 --- a/tests/NodeSmokeTest.cpp +++ b/tests/NodeSmokeTest.cpp @@ -328,8 +328,8 @@ DEFINE_TEST(NodeSmokeTestBam, "ReadToBamType") { set_pipeline_restart(pipeline_restart); - run_smoke_test(emit_moves, 2, - dorado::utils::default_parameters.methylation_threshold); + run_smoke_test( + emit_moves, 2, dorado::utils::default_parameters.methylation_threshold, nullptr, 1000); } DEFINE_TEST(NodeSmokeTestRead, "BarcodeClassifierNode") { From b24e40e1d9d57c4ba7c041d236acd91c9c7efa30 Mon Sep 17 00:00:00 2001 From: Steve Malton Date: Thu, 26 Oct 2023 10:56:12 +0100 Subject: [PATCH 28/32] Apply aliasing in barcode demuxer node --- dorado/cli/demux.cpp | 16 +++++++++++----- dorado/read_pipeline/BarcodeClassifierNode.cpp | 7 +------ dorado/read_pipeline/BarcodeDemuxerNode.cpp | 16 ++++++++++++++-- dorado/read_pipeline/BarcodeDemuxerNode.h | 7 ++++++- tests/BarcodeDemuxerNodeTest.cpp | 5 +++-- 5 files changed, 35 insertions(+), 16 deletions(-) diff --git a/dorado/cli/demux.cpp b/dorado/cli/demux.cpp index ed9ae3158..cec5e9cef 100644 --- a/dorado/cli/demux.cpp +++ b/dorado/cli/demux.cpp @@ -134,23 +134,29 @@ int demuxer(int argc, char* argv[]) { auto header = SamHdrPtr(sam_hdr_dup(reader.header)); add_pg_hdr(header.get()); + auto barcode_sample_sheet = parser.get("--sample-sheet"); + std::unique_ptr sample_sheet; + if (!barcode_sample_sheet.empty()) { + sample_sheet = std::make_unique(barcode_sample_sheet, true); + } + PipelineDescriptor pipeline_desc; auto demux_writer = pipeline_desc.add_node( - {}, output_dir, demux_writer_threads, 0, parser.get("--emit-fastq")); + {}, output_dir, demux_writer_threads, 0, parser.get("--emit-fastq"), + std::move(sample_sheet)); if (parser.is_used("--kit-name")) { std::vector kit_names; if (auto names = parser.present>("--kit-name")) { kit_names = std::move(*names); } - auto barcode_sample_sheet = parser.get("--sample-sheet"); - std::unique_ptr sample_sheet; + std::unique_ptr sample_sheet_2; if (!barcode_sample_sheet.empty()) { - sample_sheet = std::make_unique(barcode_sample_sheet, true); + sample_sheet_2 = std::make_unique(barcode_sample_sheet, true); } auto demux = pipeline_desc.add_node( {demux_writer}, demux_threads, kit_names, parser.get("--barcode-both-ends"), - parser.get("--no-trim"), std::move(sample_sheet)); + parser.get("--no-trim"), std::move(sample_sheet_2)); } // Create the Pipeline from our description. diff --git a/dorado/read_pipeline/BarcodeClassifierNode.cpp b/dorado/read_pipeline/BarcodeClassifierNode.cpp index 969169c44..8ce57c27a 100644 --- a/dorado/read_pipeline/BarcodeClassifierNode.cpp +++ b/dorado/read_pipeline/BarcodeClassifierNode.cpp @@ -263,12 +263,7 @@ void BarcodeClassifierNode::barcode(BamPtr& read) { auto bc_res = barcoder->barcode(seq, m_default_barcoding_info->barcode_both_ends, m_default_barcoding_info->sample_sheet.get()); - std::string alias{}; - if (m_default_barcoding_info->sample_sheet) { - // experiment id and position id are not stored in the bam file, so we can't recover them to use here - alias = m_default_barcoding_info->sample_sheet->get_alias("", "", "", bc_res.adapter_name); - } - auto bc = alias.empty() ? generate_barcode_string(bc_res) : alias; + auto bc = generate_barcode_string(bc_res); bam_aux_append(irecord, "BC", 'Z', bc.length() + 1, (uint8_t*)bc.c_str()); m_num_records++; diff --git a/dorado/read_pipeline/BarcodeDemuxerNode.cpp b/dorado/read_pipeline/BarcodeDemuxerNode.cpp index 45dce7c1b..0200d79b4 100644 --- a/dorado/read_pipeline/BarcodeDemuxerNode.cpp +++ b/dorado/read_pipeline/BarcodeDemuxerNode.cpp @@ -1,6 +1,7 @@ #include "BarcodeDemuxerNode.h" #include "read_pipeline/ReadPipeline.h" +#include "utils/SampleSheet.h" #include #include @@ -14,12 +15,14 @@ namespace dorado { BarcodeDemuxerNode::BarcodeDemuxerNode(const std::string& output_dir, size_t htslib_threads, size_t num_reads, - bool write_fastq) + bool write_fastq, + std::unique_ptr sample_sheet) : MessageSink(10000), m_output_dir(output_dir), m_htslib_threads(htslib_threads), m_num_reads_expected(num_reads), - m_write_fastq(write_fastq) { + m_write_fastq(write_fastq), + m_sample_sheet(std::move(sample_sheet)) { std::filesystem::create_directories(m_output_dir); start_threads(); } @@ -67,6 +70,15 @@ int BarcodeDemuxerNode::write(bam1_t* const record) { if (bam_tag) { bc = std::string(bam_aux2Z(bam_tag)); } + + if (m_sample_sheet) { + // experiment id and position id are not stored in the bam record, so we can't recover them to use here + auto alias = m_sample_sheet->get_alias("", "", "", bc); + if (!alias.empty()) { + bc = alias; + bam_aux_update_str(record, "BC", bc.size() + 1, bc.c_str()); + } + } // Check of existence of file for that barcode. auto res = m_files.find(bc); htsFile* file = nullptr; diff --git a/dorado/read_pipeline/BarcodeDemuxerNode.h b/dorado/read_pipeline/BarcodeDemuxerNode.h index b5c5f4af1..17008b847 100644 --- a/dorado/read_pipeline/BarcodeDemuxerNode.h +++ b/dorado/read_pipeline/BarcodeDemuxerNode.h @@ -14,12 +14,16 @@ namespace dorado { +namespace utils { +class SampleSheet; +} class BarcodeDemuxerNode : public MessageSink { public: BarcodeDemuxerNode(const std::string& output_dir, size_t htslib_threads, size_t num_reads, - bool write_fastq); + bool write_fastq, + std::unique_ptr sample_sheet); ~BarcodeDemuxerNode(); std::string get_name() const override { return "BarcodeDemuxerNode"; } stats::NamedStats sample_stats() const override; @@ -42,6 +46,7 @@ class BarcodeDemuxerNode : public MessageSink { int write(bam1_t* record); size_t m_num_reads_expected; bool m_write_fastq{false}; + std::unique_ptr m_sample_sheet; }; } // namespace dorado diff --git a/tests/BarcodeDemuxerNodeTest.cpp b/tests/BarcodeDemuxerNodeTest.cpp index f50936715..93a20bd38 100644 --- a/tests/BarcodeDemuxerNodeTest.cpp +++ b/tests/BarcodeDemuxerNodeTest.cpp @@ -3,6 +3,7 @@ #include "MessageSinkUtils.h" #include "TestUtils.h" #include "read_pipeline/HtsReader.h" +#include "utils/SampleSheet.h" #include "utils/bam_utils.h" #include "utils/sequence_utils.h" #include "utils/types.h" @@ -46,8 +47,8 @@ TEST_CASE("BarcodeDemuxerNode: check correct output files are created", TEST_GRO // the pipeline object is closed. This needs to be looked at. // TODO: Address open file issue on windows. dorado::PipelineDescriptor pipeline_desc; - auto demuxer = - pipeline_desc.add_node({}, tmp_dir.string(), 8, 10, false); + auto demuxer = pipeline_desc.add_node({}, tmp_dir.string(), 8, 10, + false, nullptr); auto pipeline = dorado::Pipeline::create(std::move(pipeline_desc)); From 9821b21fdac404a0ad8575c33fc056b47167d427 Mon Sep 17 00:00:00 2001 From: Steve Malton Date: Thu, 26 Oct 2023 11:25:21 +0100 Subject: [PATCH 29/32] Switch back to filterset in the barcoding info. Strip the kitname when aliasing. --- dorado/cli/basecaller.cpp | 9 ++--- dorado/cli/demux.cpp | 8 ++--- dorado/demux/BarcodeClassifier.cpp | 35 ++++++++++++------- dorado/demux/BarcodeClassifier.h | 10 +++--- .../read_pipeline/BarcodeClassifierNode.cpp | 8 ++--- dorado/read_pipeline/BarcodeClassifierNode.h | 6 +--- dorado/utils/SampleSheet.cpp | 16 +++++---- dorado/utils/SampleSheet.h | 11 +++--- dorado/utils/bam_utils.cpp | 6 ++-- dorado/utils/types.cpp | 8 ++--- dorado/utils/types.h | 10 +++--- tests/BarcodeClassifierTest.cpp | 22 ++++++------ tests/NodeSmokeTest.cpp | 5 +-- 13 files changed, 78 insertions(+), 76 deletions(-) diff --git a/dorado/cli/basecaller.cpp b/dorado/cli/basecaller.cpp index a62ef39c1..de4c60f85 100644 --- a/dorado/cli/basecaller.cpp +++ b/dorado/cli/basecaller.cpp @@ -124,8 +124,10 @@ void setup(std::vector args, !barcode_kits.empty()); std::unique_ptr sample_sheet; + BarcodingInfo::FilterSet allowed_barcodes; if (!barcode_sample_sheet.empty()) { sample_sheet = std::make_unique(barcode_sample_sheet, false); + allowed_barcodes = sample_sheet->get_barcode_values(); } SamHdrPtr hdr(sam_hdr_init()); @@ -151,14 +153,9 @@ void setup(std::vector args, PolyACalculator::get_model_type(model_name)); } if (!barcode_kits.empty()) { - std::unique_ptr sample_sheet_2; - if (!barcode_sample_sheet.empty()) { - sample_sheet_2 = - std::make_unique(barcode_sample_sheet, false); - } current_sink_node = pipeline_desc.add_node( {current_sink_node}, thread_allocations.barcoder_threads, barcode_kits, - barcode_both_ends, barcode_no_trim, std::move(sample_sheet_2)); + barcode_both_ends, barcode_no_trim, std::move(allowed_barcodes)); } current_sink_node = pipeline_desc.add_node( {current_sink_node}, min_qscore, default_parameters.min_sequence_length, diff --git a/dorado/cli/demux.cpp b/dorado/cli/demux.cpp index cec5e9cef..2fd2bd640 100644 --- a/dorado/cli/demux.cpp +++ b/dorado/cli/demux.cpp @@ -136,8 +136,10 @@ int demuxer(int argc, char* argv[]) { auto barcode_sample_sheet = parser.get("--sample-sheet"); std::unique_ptr sample_sheet; + BarcodingInfo::FilterSet allowed_barcodes; if (!barcode_sample_sheet.empty()) { sample_sheet = std::make_unique(barcode_sample_sheet, true); + allowed_barcodes = sample_sheet->get_barcode_values(); } PipelineDescriptor pipeline_desc; @@ -150,13 +152,9 @@ int demuxer(int argc, char* argv[]) { if (auto names = parser.present>("--kit-name")) { kit_names = std::move(*names); } - std::unique_ptr sample_sheet_2; - if (!barcode_sample_sheet.empty()) { - sample_sheet_2 = std::make_unique(barcode_sample_sheet, true); - } auto demux = pipeline_desc.add_node( {demux_writer}, demux_threads, kit_names, parser.get("--barcode-both-ends"), - parser.get("--no-trim"), std::move(sample_sheet_2)); + parser.get("--no-trim"), std::move(allowed_barcodes)); } // Create the Pipeline from our description. diff --git a/dorado/demux/BarcodeClassifier.cpp b/dorado/demux/BarcodeClassifier.cpp index c337b5bba..94fec1840 100644 --- a/dorado/demux/BarcodeClassifier.cpp +++ b/dorado/demux/BarcodeClassifier.cpp @@ -1,6 +1,5 @@ #include "BarcodeClassifier.h" -#include "utils/SampleSheet.h" #include "utils/alignment_utils.h" #include "utils/barcode_kits.h" #include "utils/sequence_utils.h" @@ -99,6 +98,16 @@ float extract_mask_score(std::string_view adapter, return score; } +bool barcode_is_permitted(const BarcodingInfo::FilterSet& allowed_barcodes, + const std::string& adapter_name) { + if (!allowed_barcodes.has_value()) { + return true; + } + + auto normalized_barcode_name = barcode_kits::normalize_barcode_name(adapter_name); + return allowed_barcodes->count(normalized_barcode_name) != 0; +} + } // namespace namespace demux { @@ -127,9 +136,9 @@ BarcodeClassifier::~BarcodeClassifier() = default; ScoreResults BarcodeClassifier::barcode(const std::string& seq, bool barcode_both_ends, - const utils::SampleSheet* const sample_sheet) const { + const BarcodingInfo::FilterSet& allowed_barcodes) const { auto best_adapter = - find_best_adapter(seq, m_adapter_sequences, barcode_both_ends, sample_sheet); + find_best_adapter(seq, m_adapter_sequences, barcode_both_ends, allowed_barcodes); return best_adapter; } @@ -207,7 +216,7 @@ std::vector BarcodeClassifier::generate_adap std::vector BarcodeClassifier::calculate_adapter_score_different_double_ends( std::string_view read_seq, const AdapterSequence& as, - const utils::SampleSheet* const sample_sheet) const { + const BarcodingInfo::FilterSet& allowed_barcodes) const { std::string_view read_top = read_seq.substr(0, TRIM_LENGTH); int bottom_start = std::max(0, (int)read_seq.length() - TRIM_LENGTH); std::string_view read_bottom = read_seq.substr(bottom_start, TRIM_LENGTH); @@ -267,7 +276,7 @@ std::vector BarcodeClassifier::calculate_adapter_score_different_d auto& adapter_rev = as.adapter_rev[i]; auto& adapter_name = as.adapter_name[i]; - if (sample_sheet && !sample_sheet->barcode_is_permitted(adapter_name)) { + if (!barcode_is_permitted(allowed_barcodes, adapter_name)) { continue; } @@ -342,7 +351,7 @@ std::vector BarcodeClassifier::calculate_adapter_score_different_d std::vector BarcodeClassifier::calculate_adapter_score_double_ends( std::string_view read_seq, const AdapterSequence& as, - const utils::SampleSheet* const sample_sheet) const { + const BarcodingInfo::FilterSet& allowed_barcodes) const { bool debug_mode = (spdlog::get_level() == spdlog::level::debug); std::string_view read_top = read_seq.substr(0, TRIM_LENGTH); int bottom_start = std::max(0, (int)read_seq.length() - TRIM_LENGTH); @@ -373,7 +382,7 @@ std::vector BarcodeClassifier::calculate_adapter_score_double_ends auto& adapter_rev = as.adapter_rev[i]; auto& adapter_name = as.adapter_name[i]; - if (sample_sheet && !sample_sheet->barcode_is_permitted(adapter_name)) { + if (!barcode_is_permitted(allowed_barcodes, adapter_name)) { continue; } spdlog::debug("Checking barcode {}", adapter_name); @@ -415,7 +424,7 @@ std::vector BarcodeClassifier::calculate_adapter_score_double_ends std::vector BarcodeClassifier::calculate_adapter_score( std::string_view read_seq, const AdapterSequence& as, - const utils::SampleSheet* const sample_sheet) const { + const BarcodingInfo::FilterSet& allowed_barcodes) const { bool debug_mode = (spdlog::get_level() == spdlog::level::debug); std::string_view read_top = read_seq.substr(0, TRIM_LENGTH); @@ -438,7 +447,7 @@ std::vector BarcodeClassifier::calculate_adapter_score( auto& adapter = as.adapter[i]; auto& adapter_name = as.adapter_name[i]; - if (sample_sheet && !sample_sheet->barcode_is_permitted(adapter_name)) { + if (!barcode_is_permitted(allowed_barcodes, adapter_name)) { continue; } @@ -558,7 +567,7 @@ ScoreResults BarcodeClassifier::find_best_adapter( const std::string& read_seq, const std::vector& adapters, bool barcode_both_ends, - const utils::SampleSheet* const sample_sheet) const { + const BarcodingInfo::FilterSet& allowed_barcodes) const { if (read_seq.length() < TRIM_LENGTH) { return UNCLASSIFIED; } @@ -579,14 +588,14 @@ ScoreResults BarcodeClassifier::find_best_adapter( auto& kit = kit_info_map.at(as->kit); if (kit.double_ends) { if (kit.ends_different) { - auto out = calculate_adapter_score_different_double_ends(fwd, *as, sample_sheet); + auto out = calculate_adapter_score_different_double_ends(fwd, *as, allowed_barcodes); scores.insert(scores.end(), out.begin(), out.end()); } else { - auto out = calculate_adapter_score_double_ends(fwd, *as, sample_sheet); + auto out = calculate_adapter_score_double_ends(fwd, *as, allowed_barcodes); scores.insert(scores.end(), out.begin(), out.end()); } } else { - auto out = calculate_adapter_score(fwd, *as, sample_sheet); + auto out = calculate_adapter_score(fwd, *as, allowed_barcodes); scores.insert(scores.end(), out.begin(), out.end()); } diff --git a/dorado/demux/BarcodeClassifier.h b/dorado/demux/BarcodeClassifier.h index 74ff67550..beaf0d073 100644 --- a/dorado/demux/BarcodeClassifier.h +++ b/dorado/demux/BarcodeClassifier.h @@ -42,7 +42,7 @@ class BarcodeClassifier { ScoreResults barcode(const std::string& seq, bool barcode_both_ends, - const utils::SampleSheet* const sample_sheet) const; + const BarcodingInfo::FilterSet& allowed_barcodes) const; private: const std::vector m_adapter_sequences; @@ -52,19 +52,19 @@ class BarcodeClassifier { std::vector calculate_adapter_score_different_double_ends( std::string_view read_seq, const AdapterSequence& as, - const utils::SampleSheet* const sample_sheet) const; + const BarcodingInfo::FilterSet& allowed_barcodes) const; std::vector calculate_adapter_score_double_ends( std::string_view read_seq, const AdapterSequence& as, - const utils::SampleSheet* const sample_sheet) const; + const BarcodingInfo::FilterSet& allowed_barcodes) const; std::vector calculate_adapter_score( std::string_view read_seq, const AdapterSequence& as, - const utils::SampleSheet* const sample_sheet) const; + const BarcodingInfo::FilterSet& allowed_barcodes) const; ScoreResults find_best_adapter(const std::string& read_seq, const std::vector& adapter, bool barcode_both_ends, - const utils::SampleSheet* const sample_sheet) const; + const BarcodingInfo::FilterSet& allowed_barcodes) const; }; } // namespace demux diff --git a/dorado/read_pipeline/BarcodeClassifierNode.cpp b/dorado/read_pipeline/BarcodeClassifierNode.cpp index 8ce57c27a..cf3eb9fbc 100644 --- a/dorado/read_pipeline/BarcodeClassifierNode.cpp +++ b/dorado/read_pipeline/BarcodeClassifierNode.cpp @@ -41,13 +41,13 @@ BarcodeClassifierNode::BarcodeClassifierNode(int threads, const std::vector& kit_names, bool barcode_both_ends, bool no_trim, - std::unique_ptr sample_sheet) + BarcodingInfo::FilterSet allowed_barcodes) : MessageSink(10000), m_threads(threads), m_default_barcoding_info(create_barcoding_info(kit_names, barcode_both_ends, !no_trim, - std::move(sample_sheet))) { + std::move(allowed_barcodes))) { start_threads(); } @@ -262,7 +262,7 @@ void BarcodeClassifierNode::barcode(BamPtr& read) { std::string seq = utils::extract_sequence(irecord, seqlen); auto bc_res = barcoder->barcode(seq, m_default_barcoding_info->barcode_both_ends, - m_default_barcoding_info->sample_sheet.get()); + m_default_barcoding_info->allowed_barcodes); auto bc = generate_barcode_string(bc_res); bam_aux_append(irecord, "BC", 'Z', bc.length() + 1, (uint8_t*)bc.c_str()); m_num_records++; @@ -281,7 +281,7 @@ void BarcodeClassifierNode::barcode(SimplexRead& read) { // get the sequence to map from the record auto bc_res = barcoder->barcode(read.read_common.seq, barcoding_info->barcode_both_ends, - barcoding_info->sample_sheet.get()); + barcoding_info->allowed_barcodes); read.read_common.barcode = generate_barcode_string(bc_res); m_num_records++; if (barcoding_info->trim) { diff --git a/dorado/read_pipeline/BarcodeClassifierNode.h b/dorado/read_pipeline/BarcodeClassifierNode.h index a5813546d..a905e9f64 100644 --- a/dorado/read_pipeline/BarcodeClassifierNode.h +++ b/dorado/read_pipeline/BarcodeClassifierNode.h @@ -16,17 +16,13 @@ namespace demux { struct ScoreResults; } -namespace utils { -class SampleSheet; -} - class BarcodeClassifierNode : public MessageSink { public: BarcodeClassifierNode(int threads, const std::vector& kit_name, bool barcode_both_ends, bool no_trim, - std::unique_ptr sample_sheet); + BarcodingInfo::FilterSet allowed_barcodes); BarcodeClassifierNode(int threads); ~BarcodeClassifierNode(); std::string get_name() const override { return "BarcodeClassifierNode"; } diff --git a/dorado/utils/SampleSheet.cpp b/dorado/utils/SampleSheet.cpp index 72afd2346..2e5e934c1 100644 --- a/dorado/utils/SampleSheet.cpp +++ b/dorado/utils/SampleSheet.cpp @@ -209,10 +209,15 @@ std::string SampleSheet::get_alias(const std::string& flow_cell_id, return ""; } + std::string_view barcode_only(barcode); + if (auto pos = barcode_only.find('_'); pos != std::string::npos) { + // trim off the kit name + barcode_only = barcode_only.substr(pos + 1); + } + for (const auto& row : m_rows) { - auto standard_barcode_name = get(row, "kit") + "_" + get(row, "barcode"); if (match_index(row, flow_cell_id, position_id, experiment_id) && - barcode == standard_barcode_name) { + get(row, "barcode") == barcode_only) { return get(row, "alias"); } } @@ -221,15 +226,14 @@ std::string SampleSheet::get_alias(const std::string& flow_cell_id, return ""; } -SampleSheet::FilterSet SampleSheet::get_barcode_values() const { return m_allowed_barcodes; } +BarcodingInfo::FilterSet SampleSheet::get_barcode_values() const { return m_allowed_barcodes; } -bool SampleSheet::barcode_is_permitted(const std::string& adapter_name) const { +bool SampleSheet::barcode_is_permitted(const std::string& barcode_name) const { if (!m_allowed_barcodes.has_value()) { return true; } - auto normalized_barcode_name = barcode_kits::normalize_barcode_name(adapter_name); - return m_allowed_barcodes->count(normalized_barcode_name) != 0; + return m_allowed_barcodes->count(barcode_name) != 0; } void SampleSheet::validate_headers(const std::vector& col_names, diff --git a/dorado/utils/SampleSheet.h b/dorado/utils/SampleSheet.h index 867f57424..3623a7620 100644 --- a/dorado/utils/SampleSheet.h +++ b/dorado/utils/SampleSheet.h @@ -13,8 +13,6 @@ namespace dorado::utils { class SampleSheet { public: - using FilterSet = std::optional>; - enum class Type { none, barcode }; enum IndexBits { FLOW_CELL_ID = 0, @@ -57,9 +55,12 @@ class SampleSheet { * Get all of the barcodes that are present in the sample sheet. * @return All of the barcodes that are present, or std::nullopt if the sample sheet is not loaded. */ - FilterSet get_barcode_values() const; + BarcodingInfo::FilterSet get_barcode_values() const; - bool barcode_is_permitted(const std::string& adapter_name) const; + /** + * Check whether the a list of allowed barcodes is set and, if so, whether the provided barcode is in it. + */ + bool barcode_is_permitted(const std::string& barcode_name) const; private: using Row = std::vector; @@ -69,7 +70,7 @@ class SampleSheet { std::unordered_map m_col_indices; std::vector m_rows; bool m_skip_index_matching; - FilterSet m_allowed_barcodes; + BarcodingInfo::FilterSet m_allowed_barcodes; void validate_headers(const std::vector& col_names, const std::string& filename); bool check_index(const std::string& flow_cell_id, const std::string& position_id) const; diff --git a/dorado/utils/bam_utils.cpp b/dorado/utils/bam_utils.cpp index 0a2b101a9..b4bd3c3d2 100644 --- a/dorado/utils/bam_utils.cpp +++ b/dorado/utils/bam_utils.cpp @@ -67,18 +67,18 @@ void add_rg_hdr(sam_hdr_t* hdr, const auto& kit_info = kit_iter->second; for (const auto& barcode_name : kit_info.barcodes) { const auto additional_tags = "\tBC:" + barcode_sequences.at(barcode_name); + const auto normalized_barcode_name = barcode_kits::normalize_barcode_name(barcode_name); for (const auto& read_group : read_groups) { std::string alias; auto id = read_group.first + '_'; if (sample_sheet) { - if (!sample_sheet->barcode_is_permitted(barcode_name)) { + if (!sample_sheet->barcode_is_permitted(normalized_barcode_name)) { continue; } alias = sample_sheet->get_alias( read_group.second.flowcell_id, read_group.second.position_id, - read_group.second.experiment_id, - barcode_kits::generate_standard_barcode_name(kit_name, barcode_name)); + read_group.second.experiment_id, normalized_barcode_name); } if (!alias.empty()) { id += alias; diff --git a/dorado/utils/types.cpp b/dorado/utils/types.cpp index ab0311edb..2f3c32cbd 100644 --- a/dorado/utils/types.cpp +++ b/dorado/utils/types.cpp @@ -1,7 +1,5 @@ #include "types.h" -#include "SampleSheet.h" - #include #include @@ -11,13 +9,13 @@ std::shared_ptr create_barcoding_info( const std::vector& kit_names, bool barcode_both_ends, bool trim_barcode, - std::unique_ptr sample_sheet) { + BarcodingInfo::FilterSet allowed_barcodes) { if (kit_names.empty()) { return {}; } - auto result = - BarcodingInfo{kit_names[0], barcode_both_ends, trim_barcode, std::move(sample_sheet)}; + auto result = BarcodingInfo{kit_names[0], barcode_both_ends, trim_barcode, + std::move(allowed_barcodes)}; return std::make_shared(std::move(result)); } diff --git a/dorado/utils/types.h b/dorado/utils/types.h index de2f4566e..716dd1e37 100644 --- a/dorado/utils/types.h +++ b/dorado/utils/types.h @@ -4,6 +4,7 @@ #include #include #include +#include #include struct bam1_t; @@ -12,22 +13,19 @@ struct sam_hdr_t; namespace dorado { -namespace utils { -class SampleSheet; -} - struct BarcodingInfo { + using FilterSet = std::optional>; std::string kit_name{}; bool barcode_both_ends{false}; bool trim{false}; - std::unique_ptr sample_sheet; + FilterSet allowed_barcodes; }; std::shared_ptr create_barcoding_info( const std::vector &kit_names, bool barcode_both_ends, bool trim_barcode, - std::unique_ptr sample_sheet); + BarcodingInfo::FilterSet allowed_barcodes); struct ReadGroup { std::string run_id; diff --git a/tests/BarcodeClassifierTest.cpp b/tests/BarcodeClassifierTest.cpp index 113b1a905..146161914 100644 --- a/tests/BarcodeClassifierTest.cpp +++ b/tests/BarcodeClassifierTest.cpp @@ -4,7 +4,6 @@ #include "TestUtils.h" #include "read_pipeline/BarcodeClassifierNode.h" #include "read_pipeline/HtsReader.h" -#include "utils/SampleSheet.h" #include "utils/bam_utils.h" #include "utils/barcode_kits.h" #include "utils/sequence_utils.h" @@ -62,7 +61,7 @@ TEST_CASE("BarcodeClassifier: test single ended barcode", TEST_GROUP) { auto seqlen = reader.record->core.l_qseq; auto bseq = bam_get_seq(reader.record); std::string seq = utils::convert_nt16_to_str(bseq, seqlen); - auto res = classifier.barcode(seq, false, nullptr); + auto res = classifier.barcode(seq, false, std::nullopt); if (res.adapter_name == "unclassified") { CHECK(bc == res.adapter_name); } else { @@ -90,7 +89,7 @@ TEST_CASE("BarcodeClassifier: test double ended barcode", TEST_GROUP) { auto seqlen = reader.record->core.l_qseq; auto bseq = bam_get_seq(reader.record); std::string seq = utils::convert_nt16_to_str(bseq, seqlen); - auto res = classifier.barcode(seq, false, nullptr); + auto res = classifier.barcode(seq, false, std::nullopt); if (res.adapter_name == "unclassified") { CHECK(bc == res.adapter_name); } else { @@ -120,7 +119,7 @@ TEST_CASE("BarcodeClassifier: test double ended barcode with different variants" auto seqlen = reader.record->core.l_qseq; auto bseq = bam_get_seq(reader.record); std::string seq = utils::convert_nt16_to_str(bseq, seqlen); - auto res = classifier.barcode(seq, false, nullptr); + auto res = classifier.barcode(seq, false, std::nullopt); if (res.adapter_name == "unclassified") { CHECK(bc == res.adapter_name); } else { @@ -149,8 +148,8 @@ TEST_CASE("BarcodeClassifier: check barcodes on both ends - failing case", TEST_ auto seqlen = reader.record->core.l_qseq; auto bseq = bam_get_seq(reader.record); std::string seq = utils::convert_nt16_to_str(bseq, seqlen); - auto single_end_res = classifier.barcode(seq, false, nullptr); - auto double_end_res = classifier.barcode(seq, true, nullptr); + auto single_end_res = classifier.barcode(seq, false, std::nullopt); + auto double_end_res = classifier.barcode(seq, true, std::nullopt); CHECK(double_end_res.adapter_name == "unclassified"); CHECK(single_end_res.adapter_name == "BC15"); } @@ -168,8 +167,8 @@ TEST_CASE("BarcodeClassifier: check barcodes on both ends - passing case", TEST_ auto seqlen = reader.record->core.l_qseq; auto bseq = bam_get_seq(reader.record); std::string seq = utils::convert_nt16_to_str(bseq, seqlen); - auto single_end_res = classifier.barcode(seq, false, nullptr); - auto double_end_res = classifier.barcode(seq, true, nullptr); + auto single_end_res = classifier.barcode(seq, false, std::nullopt); + auto double_end_res = classifier.barcode(seq, true, std::nullopt); CHECK(double_end_res.adapter_name == single_end_res.adapter_name); CHECK(single_end_res.adapter_name == "BC01"); } @@ -190,12 +189,13 @@ TEST_CASE( CAPTURE(barcode_both_ends); CAPTURE(use_per_read_barcoding); constexpr bool no_trim = false; - auto barcoding_info = dorado::create_barcoding_info(kits, barcode_both_ends, !no_trim, nullptr); + auto barcoding_info = + dorado::create_barcoding_info(kits, barcode_both_ends, !no_trim, std::nullopt); if (use_per_read_barcoding) { pipeline_desc.add_node({sink}, 8); } else { pipeline_desc.add_node({sink}, 8, kits, barcode_both_ends, no_trim, - nullptr); + std::nullopt); } auto pipeline = dorado::Pipeline::create(std::move(pipeline_desc)); @@ -344,7 +344,7 @@ TEST_CASE("BarcodeClassifierNode: test reads where trim length == read length", bool barcode_both_ends = false; bool no_trim = false; auto classifier = pipeline_desc.add_node( - {sink}, 8, kits, barcode_both_ends, no_trim, nullptr); + {sink}, 8, kits, barcode_both_ends, no_trim, std::nullopt); auto pipeline = dorado::Pipeline::create(std::move(pipeline_desc)); fs::path data_dir = fs::path(get_data_dir("barcode_demux")); diff --git a/tests/NodeSmokeTest.cpp b/tests/NodeSmokeTest.cpp index 4d843f974..9bb453dca 100644 --- a/tests/NodeSmokeTest.cpp +++ b/tests/NodeSmokeTest.cpp @@ -343,7 +343,8 @@ DEFINE_TEST(NodeSmokeTestRead, "BarcodeClassifierNode") { set_pipeline_restart(pipeline_restart); std::vector kits = {"SQK-RPB004", "EXP-NBD196"}; - run_smoke_test(2, kits, barcode_both_ends, no_trim, nullptr); + run_smoke_test(2, kits, barcode_both_ends, no_trim, + std::nullopt); } TEST_CASE("BarcodeClassifierNode: test simple pipeline with fastq and sam files") { @@ -354,7 +355,7 @@ TEST_CASE("BarcodeClassifierNode: test simple pipeline with fastq and sam files" bool barcode_both_ends = GENERATE(true, false); bool no_trim = GENERATE(true, false); auto classifier = pipeline_desc.add_node( - {sink}, 8, kits, barcode_both_ends, no_trim, nullptr); + {sink}, 8, kits, barcode_both_ends, no_trim, std::nullopt); auto pipeline = dorado::Pipeline::create(std::move(pipeline_desc)); From 846a1787c5e1890a11293dc4821eb7033fe5e36b Mon Sep 17 00:00:00 2001 From: Steve Malton Date: Thu, 26 Oct 2023 12:31:13 +0100 Subject: [PATCH 30/32] Remove unnecessary forward declaration --- dorado/demux/BarcodeClassifier.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/dorado/demux/BarcodeClassifier.h b/dorado/demux/BarcodeClassifier.h index beaf0d073..99dad4f18 100644 --- a/dorado/demux/BarcodeClassifier.h +++ b/dorado/demux/BarcodeClassifier.h @@ -10,10 +10,6 @@ namespace dorado { -namespace utils { -class SampleSheet; -} - namespace demux { struct ScoreResults { From 55b70dd8ca6e675c648f6ba112e2f2fa04ad974e Mon Sep 17 00:00:00 2001 From: Steve Malton Date: Thu, 26 Oct 2023 12:35:07 +0100 Subject: [PATCH 31/32] Remove unused header, add missing one --- dorado/demux/BarcodeClassifier.h | 1 - dorado/demux/BarcodeClassifierSelector.cpp | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/dorado/demux/BarcodeClassifier.h b/dorado/demux/BarcodeClassifier.h index 99dad4f18..9df9cf815 100644 --- a/dorado/demux/BarcodeClassifier.h +++ b/dorado/demux/BarcodeClassifier.h @@ -1,5 +1,4 @@ #pragma once -#include "read_pipeline/ReadPipeline.h" #include "utils/stats.h" #include "utils/types.h" diff --git a/dorado/demux/BarcodeClassifierSelector.cpp b/dorado/demux/BarcodeClassifierSelector.cpp index 06405c473..5a189af7c 100644 --- a/dorado/demux/BarcodeClassifierSelector.cpp +++ b/dorado/demux/BarcodeClassifierSelector.cpp @@ -2,6 +2,8 @@ #include "BarcodeClassifier.h" +#include + namespace dorado::demux { std::shared_ptr BarcodeClassifierSelector::get_barcoder( From 4698eeb27895a84c5384254551f0002323c29937 Mon Sep 17 00:00:00 2001 From: Steve Malton Date: Thu, 26 Oct 2023 13:06:02 +0100 Subject: [PATCH 32/32] Add missing header --- dorado/demux/BarcodeClassifier.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/dorado/demux/BarcodeClassifier.cpp b/dorado/demux/BarcodeClassifier.cpp index 94fec1840..b211ec09c 100644 --- a/dorado/demux/BarcodeClassifier.cpp +++ b/dorado/demux/BarcodeClassifier.cpp @@ -10,6 +10,7 @@ #include #include +#include #include #include #include