From 437aca1b6192acf9447af16b910580b25fc1fd3b Mon Sep 17 00:00:00 2001 From: Kevin Dolan Date: Wed, 17 Apr 2024 12:12:16 +0100 Subject: [PATCH 1/3] Correctly strip alignment data from unclassified reads when demuxing with trimming. --- dorado/demux/Trimmer.cpp | 13 +++------- .../read_pipeline/BarcodeClassifierNode.cpp | 4 +-- dorado/utils/bam_utils.cpp | 25 +++++++++++++++++++ dorado/utils/bam_utils.h | 15 +++++++++++ 4 files changed, 45 insertions(+), 12 deletions(-) diff --git a/dorado/demux/Trimmer.cpp b/dorado/demux/Trimmer.cpp index 7e66d39e..dca2eab0 100644 --- a/dorado/demux/Trimmer.cpp +++ b/dorado/demux/Trimmer.cpp @@ -161,15 +161,8 @@ BamPtr Trimmer::trim_sequence(BamPtr input, std::pair trim_interval) { : trim_interval); // Create a new bam record to hold the trimmed read. - bam1_t* out_record = bam_init1(); - bam_set1(out_record, input_record->core.l_qname - input_record->core.l_extranul - 1, - bam_get_qname(input_record), 4 /*flag*/, -1 /*tid*/, -1 /*pos*/, 0 /*mapq*/, - 0 /*n_cigar*/, nullptr /*cigar*/, -1 /*mtid*/, -1 /*mpos*/, 0 /*isize*/, - trimmed_seq.size(), trimmed_seq.data(), - trimmed_qual.empty() ? nullptr : (char*)trimmed_qual.data(), - bam_get_l_aux(input_record)); - memcpy(bam_get_aux(out_record), bam_get_aux(input_record), bam_get_l_aux(input_record)); - out_record->l_data += bam_get_l_aux(input_record); + BamPtr output(utils::new_unmapped_record(input, trimmed_seq, trimmed_qual)); + bam1_t* out_record = output.get(); // Insert the new tags and delete the old ones. if (!trimmed_moves.empty()) { @@ -197,7 +190,7 @@ BamPtr Trimmer::trim_sequence(BamPtr input, std::pair trim_interval) { bam_aux_update_int(out_record, "ns", ns); } - return BamPtr(out_record); + return output; } void Trimmer::trim_sequence(SimplexRead& read, std::pair trim_interval) { diff --git a/dorado/read_pipeline/BarcodeClassifierNode.cpp b/dorado/read_pipeline/BarcodeClassifierNode.cpp index 941ae462..a2a6c815 100644 --- a/dorado/read_pipeline/BarcodeClassifierNode.cpp +++ b/dorado/read_pipeline/BarcodeClassifierNode.cpp @@ -127,9 +127,9 @@ void BarcodeClassifierNode::barcode(BamPtr& read) { if (bc != "unclassified" && trim_interval.second - trim_interval.first <= seqlen) { read = Trimmer::trim_sequence(std::move(read), trim_interval); + } else { + read = utils::new_unmapped_record(read, {}, {}); } - - utils::remove_alignment_tags_from_record(read.get()); } } diff --git a/dorado/utils/bam_utils.cpp b/dorado/utils/bam_utils.cpp index cf9bd461..5277dd32 100644 --- a/dorado/utils/bam_utils.cpp +++ b/dorado/utils/bam_utils.cpp @@ -536,6 +536,31 @@ std::string cigar2str(uint32_t n_cigar, const uint32_t* cigar) { return cigar_str; } +BamPtr new_unmapped_record(const BamPtr& record, + const std::string& seq, + const std::vector& qual) { + const bam1_t* input_record = record.get(); + auto seq_ptr = seq.empty() ? nullptr : seq.data(); + size_t seq_len = seq.size(); + auto qual_ptr = qual.empty() ? nullptr : (const char*)qual.data(); + if (!seq_ptr) { + seq_ptr = (const char*)bam_get_seq(input_record); + seq_len = input_record->core.l_qseq; + } + if (!qual_ptr && !seq.empty()) { + qual_ptr = (const char*)bam_get_qual(input_record); + } + bam1_t* out_record = bam_init1(); + bam_set1(out_record, input_record->core.l_qname - input_record->core.l_extranul - 1, + bam_get_qname(input_record), 4 /*flag*/, -1 /*tid*/, -1 /*pos*/, 0 /*mapq*/, + 0 /*n_cigar*/, nullptr /*cigar*/, -1 /*mtid*/, -1 /*mpos*/, 0 /*isize*/, seq_len, + seq_ptr, qual_ptr, bam_get_l_aux(input_record)); + memcpy(bam_get_aux(out_record), bam_get_aux(input_record), bam_get_l_aux(input_record)); + out_record->l_data += bam_get_l_aux(input_record); + remove_alignment_tags_from_record(out_record); + return BamPtr(out_record); +} + void remove_alignment_tags_from_record(bam1_t* record) { // Iterate through all tags and check against known set // of tags to remove. diff --git a/dorado/utils/bam_utils.h b/dorado/utils/bam_utils.h index c66416a2..87f9593c 100644 --- a/dorado/utils/bam_utils.h +++ b/dorado/utils/bam_utils.h @@ -194,6 +194,21 @@ std::string cigar2str(uint32_t n_cigar, const uint32_t* cigar); */ kstring_t allocate_kstring(); +/* + * Make a copy of the bam record with any alignment data stripped out. + * + * @param record BAM record. + * @param seq New sequence. + * @param qual New qquality scores. + * + * If seq and qual are empty strings, the sequence and qualities from record will be + * used. If qual is empty and seq is not, then this means no quality data should be + * included. Otherwise, if seq and qual are not empty, they must be the same length. + */ +BamPtr new_unmapped_record(const BamPtr& record, + const std::string& seq, + const std::vector& qual); + /* * Remove any alignment related tags from a BAM record. * From c6844efef917ce55a30d258767126bf80c442004 Mon Sep 17 00:00:00 2001 From: Kevin Dolan Date: Wed, 17 Apr 2024 13:48:53 +0100 Subject: [PATCH 2/3] Fixed incorrect extraction of sequence. Updated adapter-trimming code for changes to trimming code. --- dorado/read_pipeline/AdapterDetectorNode.cpp | 3 +-- dorado/utils/bam_utils.cpp | 26 +++++++++----------- dorado/utils/bam_utils.h | 15 +++++------ 3 files changed, 20 insertions(+), 24 deletions(-) diff --git a/dorado/read_pipeline/AdapterDetectorNode.cpp b/dorado/read_pipeline/AdapterDetectorNode.cpp index a63af728..50697489 100644 --- a/dorado/read_pipeline/AdapterDetectorNode.cpp +++ b/dorado/read_pipeline/AdapterDetectorNode.cpp @@ -80,11 +80,10 @@ void AdapterDetectorNode::process_read(BamPtr& read) { if (trim_interval.first >= trim_interval.second) { spdlog::warn("Unexpected adapter/primer trim interval {}-{} for {}", trim_interval.first, trim_interval.second, seq); + read = utils::new_unmapped_record(read, {}, {}); return; } read = Trimmer::trim_sequence(std::move(read), trim_interval); - - utils::remove_alignment_tags_from_record(read.get()); } } diff --git a/dorado/utils/bam_utils.cpp b/dorado/utils/bam_utils.cpp index 5277dd32..c621cce1 100644 --- a/dorado/utils/bam_utils.cpp +++ b/dorado/utils/bam_utils.cpp @@ -536,25 +536,21 @@ std::string cigar2str(uint32_t n_cigar, const uint32_t* cigar) { return cigar_str; } -BamPtr new_unmapped_record(const BamPtr& record, - const std::string& seq, - const std::vector& qual) { - const bam1_t* input_record = record.get(); - auto seq_ptr = seq.empty() ? nullptr : seq.data(); - size_t seq_len = seq.size(); - auto qual_ptr = qual.empty() ? nullptr : (const char*)qual.data(); - if (!seq_ptr) { - seq_ptr = (const char*)bam_get_seq(input_record); - seq_len = input_record->core.l_qseq; - } - if (!qual_ptr && !seq.empty()) { - qual_ptr = (const char*)bam_get_qual(input_record); +BamPtr new_unmapped_record(const BamPtr& record, std::string seq, std::vector qual) { + bam1_t* input_record = record.get(); + if (seq.empty()) { + seq = extract_sequence(input_record); + auto qptr = bam_get_qual(input_record); + if (qptr) { + qual.resize(seq.size()); + std::copy(qptr, qptr + seq.size(), qual.begin()); + } } bam1_t* out_record = bam_init1(); bam_set1(out_record, input_record->core.l_qname - input_record->core.l_extranul - 1, bam_get_qname(input_record), 4 /*flag*/, -1 /*tid*/, -1 /*pos*/, 0 /*mapq*/, - 0 /*n_cigar*/, nullptr /*cigar*/, -1 /*mtid*/, -1 /*mpos*/, 0 /*isize*/, seq_len, - seq_ptr, qual_ptr, bam_get_l_aux(input_record)); + 0 /*n_cigar*/, nullptr /*cigar*/, -1 /*mtid*/, -1 /*mpos*/, 0 /*isize*/, seq.size(), + seq.data(), qual.empty() ? nullptr : (char*)qual.data(), bam_get_l_aux(input_record)); memcpy(bam_get_aux(out_record), bam_get_aux(input_record), bam_get_l_aux(input_record)); out_record->l_data += bam_get_l_aux(input_record); remove_alignment_tags_from_record(out_record); diff --git a/dorado/utils/bam_utils.h b/dorado/utils/bam_utils.h index 87f9593c..2cdb57ed 100644 --- a/dorado/utils/bam_utils.h +++ b/dorado/utils/bam_utils.h @@ -199,15 +199,16 @@ kstring_t allocate_kstring(); * * @param record BAM record. * @param seq New sequence. - * @param qual New qquality scores. + * @param qual New quality scores. * - * If seq and qual are empty strings, the sequence and qualities from record will be - * used. If qual is empty and seq is not, then this means no quality data should be - * included. Otherwise, if seq and qual are not empty, they must be the same length. + * If seq is an empty string, then qual is ignored and the sequence and qualities from the + * input record will be used. If seq is not empty then it will replace the original sequence + * in the new object, and qual will replace the original qualities in the new object. This + * means that if seq is not empty but qual is, then the new object will have nullptr as its + * quality field. If seq and qual are both non-empty, then of course they must be the same + * length. */ -BamPtr new_unmapped_record(const BamPtr& record, - const std::string& seq, - const std::vector& qual); +BamPtr new_unmapped_record(const BamPtr& record, std::string seq, std::vector qual); /* * Remove any alignment related tags from a BAM record. From a03c3d58011004c98d1076a42b158279626dbbc6 Mon Sep 17 00:00:00 2001 From: Kevin Dolan Date: Thu, 18 Apr 2024 14:46:28 +0100 Subject: [PATCH 3/3] Added test for removal of alignment data. Fixed a bug. --- dorado/utils/bam_utils.cpp | 16 ++---- tests/AlignerTest.cpp | 5 +- tests/BarcodeClassifierTest.cpp | 54 ++++++++++++++----- .../barcode_demux/simple_mapped_reads.sam | 6 +++ 4 files changed, 54 insertions(+), 27 deletions(-) create mode 100755 tests/data/barcode_demux/simple_mapped_reads.sam diff --git a/dorado/utils/bam_utils.cpp b/dorado/utils/bam_utils.cpp index c621cce1..acab7bc5 100644 --- a/dorado/utils/bam_utils.cpp +++ b/dorado/utils/bam_utils.cpp @@ -540,11 +540,7 @@ BamPtr new_unmapped_record(const BamPtr& record, std::string seq, std::vectorcore.l_qname - input_record->core.l_extranul - 1, @@ -560,17 +556,15 @@ BamPtr new_unmapped_record(const BamPtr& record, std::string seq, std::vector> tags_to_remove = { - {"SA", 'Z'}, {"NM", 'i'}, {"ms", 'i'}, {"AS", 'i'}, {"nn", 'i'}, {"ts", 'A'}, - {"de", 'f'}, {"dv", 'f'}, {"tp", 'A'}, {"cm", 'i'}, {"s1", 'i'}, {"s2", 'i'}, - {"MD", 'Z'}, {"zd", 'i'}, {"rl", 'i'}, {"bh", 'i'}}; + static const std::set tags_to_remove = {"SA", "NM", "ms", "AS", "nn", "ts", + "de", "dv", "tp", "cm", "s1", "s2", + "MD", "zd", "rl", "bh"}; uint8_t* aux_ptr = bam_aux_first(record); while (aux_ptr != NULL) { auto tag_ptr = bam_aux_tag(aux_ptr); std::string tag = std::string(tag_ptr, tag_ptr + 2); - char type = bam_aux_type(aux_ptr); - if (tags_to_remove.find({tag, type}) != tags_to_remove.end()) { + if (tags_to_remove.find(tag) != tags_to_remove.end()) { aux_ptr = bam_aux_remove(record, aux_ptr); } else { aux_ptr = bam_aux_next(record, aux_ptr); diff --git a/tests/AlignerTest.cpp b/tests/AlignerTest.cpp index 4f6652d7..4d9433a3 100644 --- a/tests/AlignerTest.cpp +++ b/tests/AlignerTest.cpp @@ -195,10 +195,9 @@ TEST_CASE_METHOD(AlignerNodeTestFixture, "AlignerTest: Check alignment with bed CHECK_THAT(aux, Contains(tag)); } auto bh_tag_ptr = bam_aux_get(rec, "bh"); - auto bh_tag_type = *(char*)bh_tag_ptr; + auto bh_tag_type = bam_aux_type(bh_tag_ptr); CHECK(bh_tag_type == 'i'); - int32_t bh_tag_value = 0; - memcpy(&bh_tag_value, bh_tag_ptr + 1, 4); + auto bh_tag_value = bam_aux2i(bh_tag_ptr); CHECK(bh_tag_value == 3); } diff --git a/tests/BarcodeClassifierTest.cpp b/tests/BarcodeClassifierTest.cpp index 31bf0af7..bb8e4bbe 100644 --- a/tests/BarcodeClassifierTest.cpp +++ b/tests/BarcodeClassifierTest.cpp @@ -18,7 +18,6 @@ #include #define TEST_GROUP "[barcode_demux]" - namespace fs = std::filesystem; using namespace dorado; @@ -338,13 +337,14 @@ TEST_CASE( } } -TEST_CASE("BarcodeClassifierNode: test reads where trim length == read length", TEST_GROUP) { +TEST_CASE("BarcodeClassifierNode: test for proper trimming and alignment data stripping", + TEST_GROUP) { using Catch::Matchers::Equals; dorado::PipelineDescriptor pipeline_desc; std::vector messages; auto sink = pipeline_desc.add_node({}, 100, messages); - std::vector kits = {"SQK-RBK114-96"}; + std::vector kits = {"SQK-16S024"}; bool barcode_both_ends = false; bool no_trim = false; pipeline_desc.add_node({sink}, 8, kits, barcode_both_ends, no_trim, @@ -352,25 +352,53 @@ TEST_CASE("BarcodeClassifierNode: test reads where trim length == read length", auto pipeline = dorado::Pipeline::create(std::move(pipeline_desc), nullptr); fs::path data_dir = fs::path(get_data_dir("barcode_demux")); - auto bc_file = data_dir / "no_trim_expected.fastq"; + auto bc_file = data_dir / "simple_mapped_reads.sam"; - // Only one read in the file, so fetch that. + // First read should be unclassified. HtsReader reader(bc_file.string(), std::nullopt); reader.read(); + BamPtr read1(bam_dup1(reader.record.get())); + std::string id_in1 = bam_get_qname(read1.get()); + auto orig_seq1 = dorado::utils::extract_sequence(read1.get()); + pipeline->push_message(std::move(read1)); - // Fetch the original read before barcode trimming. - auto orig_seq = dorado::utils::extract_sequence(reader.record.get()); + // Second read should be classified. + reader.read(); + BamPtr read2(bam_dup1(reader.record.get())); + std::string id_in2 = bam_get_qname(read2.get()); + auto orig_seq2 = dorado::utils::extract_sequence(read2.get()); + pipeline->push_message(std::move(read2)); - pipeline->push_message(std::move(reader.record)); pipeline->terminate(DefaultFlushOptions()); - CHECK(messages.size() == 1); + CHECK(messages.size() == 2); - auto read = std::get(std::move(messages[0])); - auto seq = dorado::utils::extract_sequence(read.get()); + read1 = std::get(std::move(messages[0])); + read2 = std::get(std::move(messages[1])); + + // Reads may not come back in the same order. + std::string id_out1 = bam_get_qname(read1.get()); + std::string id_out2 = bam_get_qname(read2.get()); + if (id_out1 != id_in1) { + read1.swap(read2); + } - // We don't expect any trimming to happen, so the original and final sequence must match. - CHECK(seq == orig_seq); + // First read should be unclassified and untrimmed. + auto seq1 = dorado::utils::extract_sequence(read1.get()); + CHECK(seq1 == orig_seq1); + CHECK_THAT(bam_aux2Z(bam_aux_get(read1.get(), "BC")), Equals("unclassified")); + + // Second read should be classified and trimmed. + auto seq2 = dorado::utils::extract_sequence(read2.get()); + CHECK(seq2 == + seq1); // Sequence 2 is just sequence 1 plus the barcode, which should be trimmed. + CHECK_THAT(bam_aux2Z(bam_aux_get(read2.get(), "BC")), Equals("SQK-16S024_barcode01")); + + // Check to make sure alignment data has been stripped from both reads. + CHECK(read1->core.tid == -1); + CHECK(bam_aux_get(read1.get(), "bh") == nullptr); + CHECK(read2->core.tid == -1); + CHECK(bam_aux_get(read2.get(), "bh") == nullptr); } struct CustomDoubleEndedKitInput { diff --git a/tests/data/barcode_demux/simple_mapped_reads.sam b/tests/data/barcode_demux/simple_mapped_reads.sam new file mode 100755 index 00000000..fd7fc472 --- /dev/null +++ b/tests/data/barcode_demux/simple_mapped_reads.sam @@ -0,0 +1,6 @@ +@HD VN:1.6 SO:coordinate +@PG ID:aligner PN:dorado VN:0.6.0+437aca1b+dirty +@RG ID:0a73e955b30dc4b0182e1abb710bca268b16d689_dna_r10.4.1_e8.2_400bps_sup@v4.2.0 PU:PAO83395 PM:PC24B318 DT:2023-04-29T16:06:40.107+00:00 PL:ONT DS:basecall_model=dna_r10.4.1_e8.2_400bps_sup@v4.2.0 runid=0a73e955b30dc4b0182e1abb710bca268b16d689 LB:PrePCR SM:PrePCR +@SQ SN:ref LN:1000 +799a14ce-a152-4d95-92fd-097d4c157fa5 2 ref 100 9 100M49S * 0 100 CTGGATCAGTATATGCCGGCGGTATCCCAACAAGTATTGGGACCCGTCGGTTTTAGCGTATAGTGAAGTGCTTTGAAGTGGTGTTTAACCAACAGTTGTCTTCGGAGCCCATAGAGTTCGCGTATAGTGAAACACTTTCGCGTTTTCGT &+00(,&&&&+*&&$##$%%',-))((+)*+&&%'%%%$()***2)(&&&&+00(()*,//+,-+++(%$$$'(%$&&(&&%(((((243,,,+('+('(()&'&&-%%&&'%&%(-.(''('''(((&&(-0+)(*1***3323@,&% bh:i:1 +799a14ce-a152-4d95-92fd-097d4c157fa6 2 ref 100 9 51S100M49S * 0 100 CCGTGACAAGAAAGTTGTCGGTGTCTTTGTGAGAGTTTGATCATGGCTCAGCTGGATCAGTATATGCCGGCGGTATCCCAACAAGTATTGGGACCCGTCGGTTTTAGCGTATAGTGAAGTGCTTTGAAGTGGTGTTTAACCAACAGTTGTCTTCGGAGCCCATAGAGTTCGCGTATAGTGAAACACTTTCGCGTTTTCGT 23@,&%,&&&&+*&&$##$%%',-))((+)*+&&%'%%%$()***2)(&&&&+00(,&&&&+*&&$##$%%',-))((+)*+&&%'%%%$()***2)(&&&&+00(()*,//+,-+++(%$$$'(%$&&(&&%(((((243,,,+('+('(()&'&&-%%&&'%&%(-.(''('''(((&&(-0+)(*1***3323@,&% bh:i:1