Skip to content

Commit

Permalink
Merge branch 'alignment_stripping_patch_v0.6' into 'release-v0.6'
Browse files Browse the repository at this point in the history
DOR-666 Alignment data is not stripped from unclassified reads

See merge request machine-learning/dorado!954
  • Loading branch information
kdolan1973 committed Apr 18, 2024
2 parents 2982771 + a03c3d5 commit 1cc207a
Show file tree
Hide file tree
Showing 8 changed files with 92 additions and 36 deletions.
13 changes: 3 additions & 10 deletions dorado/demux/Trimmer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -161,15 +161,8 @@ BamPtr Trimmer::trim_sequence(BamPtr input, std::pair<int, int> trim_interval) {
: trim_interval);

// Create a new bam record to hold the trimmed read.
bam1_t* out_record = bam_init1();
bam_set1(out_record, input_record->core.l_qname - input_record->core.l_extranul - 1,
bam_get_qname(input_record), 4 /*flag*/, -1 /*tid*/, -1 /*pos*/, 0 /*mapq*/,
0 /*n_cigar*/, nullptr /*cigar*/, -1 /*mtid*/, -1 /*mpos*/, 0 /*isize*/,
trimmed_seq.size(), trimmed_seq.data(),
trimmed_qual.empty() ? nullptr : (char*)trimmed_qual.data(),
bam_get_l_aux(input_record));
memcpy(bam_get_aux(out_record), bam_get_aux(input_record), bam_get_l_aux(input_record));
out_record->l_data += bam_get_l_aux(input_record);
BamPtr output(utils::new_unmapped_record(input, trimmed_seq, trimmed_qual));
bam1_t* out_record = output.get();

// Insert the new tags and delete the old ones.
if (!trimmed_moves.empty()) {
Expand Down Expand Up @@ -197,7 +190,7 @@ BamPtr Trimmer::trim_sequence(BamPtr input, std::pair<int, int> trim_interval) {
bam_aux_update_int(out_record, "ns", ns);
}

return BamPtr(out_record);
return output;
}

void Trimmer::trim_sequence(SimplexRead& read, std::pair<int, int> trim_interval) {
Expand Down
3 changes: 1 addition & 2 deletions dorado/read_pipeline/AdapterDetectorNode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,11 +80,10 @@ void AdapterDetectorNode::process_read(BamPtr& read) {
if (trim_interval.first >= trim_interval.second) {
spdlog::warn("Unexpected adapter/primer trim interval {}-{} for {}",
trim_interval.first, trim_interval.second, seq);
read = utils::new_unmapped_record(read, {}, {});
return;
}
read = Trimmer::trim_sequence(std::move(read), trim_interval);

utils::remove_alignment_tags_from_record(read.get());
}
}

Expand Down
4 changes: 2 additions & 2 deletions dorado/read_pipeline/BarcodeClassifierNode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -127,9 +127,9 @@ void BarcodeClassifierNode::barcode(BamPtr& read) {

if (bc != "unclassified" && trim_interval.second - trim_interval.first <= seqlen) {
read = Trimmer::trim_sequence(std::move(read), trim_interval);
} else {
read = utils::new_unmapped_record(read, {}, {});
}

utils::remove_alignment_tags_from_record(read.get());
}
}

Expand Down
27 changes: 21 additions & 6 deletions dorado/utils/bam_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -536,20 +536,35 @@ std::string cigar2str(uint32_t n_cigar, const uint32_t* cigar) {
return cigar_str;
}

BamPtr new_unmapped_record(const BamPtr& record, std::string seq, std::vector<uint8_t> qual) {
bam1_t* input_record = record.get();
if (seq.empty()) {
seq = extract_sequence(input_record);
qual = extract_quality(input_record);
}
bam1_t* out_record = bam_init1();
bam_set1(out_record, input_record->core.l_qname - input_record->core.l_extranul - 1,
bam_get_qname(input_record), 4 /*flag*/, -1 /*tid*/, -1 /*pos*/, 0 /*mapq*/,
0 /*n_cigar*/, nullptr /*cigar*/, -1 /*mtid*/, -1 /*mpos*/, 0 /*isize*/, seq.size(),
seq.data(), qual.empty() ? nullptr : (char*)qual.data(), bam_get_l_aux(input_record));
memcpy(bam_get_aux(out_record), bam_get_aux(input_record), bam_get_l_aux(input_record));
out_record->l_data += bam_get_l_aux(input_record);
remove_alignment_tags_from_record(out_record);
return BamPtr(out_record);
}

void remove_alignment_tags_from_record(bam1_t* record) {
// Iterate through all tags and check against known set
// of tags to remove.
static const std::set<std::pair<std::string, char>> tags_to_remove = {
{"SA", 'Z'}, {"NM", 'i'}, {"ms", 'i'}, {"AS", 'i'}, {"nn", 'i'}, {"ts", 'A'},
{"de", 'f'}, {"dv", 'f'}, {"tp", 'A'}, {"cm", 'i'}, {"s1", 'i'}, {"s2", 'i'},
{"MD", 'Z'}, {"zd", 'i'}, {"rl", 'i'}, {"bh", 'i'}};
static const std::set<std::string> tags_to_remove = {"SA", "NM", "ms", "AS", "nn", "ts",
"de", "dv", "tp", "cm", "s1", "s2",
"MD", "zd", "rl", "bh"};

uint8_t* aux_ptr = bam_aux_first(record);
while (aux_ptr != NULL) {
auto tag_ptr = bam_aux_tag(aux_ptr);
std::string tag = std::string(tag_ptr, tag_ptr + 2);
char type = bam_aux_type(aux_ptr);
if (tags_to_remove.find({tag, type}) != tags_to_remove.end()) {
if (tags_to_remove.find(tag) != tags_to_remove.end()) {
aux_ptr = bam_aux_remove(record, aux_ptr);
} else {
aux_ptr = bam_aux_next(record, aux_ptr);
Expand Down
16 changes: 16 additions & 0 deletions dorado/utils/bam_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,22 @@ std::string cigar2str(uint32_t n_cigar, const uint32_t* cigar);
*/
kstring_t allocate_kstring();

/*
* Make a copy of the bam record with any alignment data stripped out.
*
* @param record BAM record.
* @param seq New sequence.
* @param qual New quality scores.
*
* If seq is an empty string, then qual is ignored and the sequence and qualities from the
* input record will be used. If seq is not empty then it will replace the original sequence
* in the new object, and qual will replace the original qualities in the new object. This
* means that if seq is not empty but qual is, then the new object will have nullptr as its
* quality field. If seq and qual are both non-empty, then of course they must be the same
* length.
*/
BamPtr new_unmapped_record(const BamPtr& record, std::string seq, std::vector<uint8_t> qual);

/*
* Remove any alignment related tags from a BAM record.
*
Expand Down
5 changes: 2 additions & 3 deletions tests/AlignerTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -195,10 +195,9 @@ TEST_CASE_METHOD(AlignerNodeTestFixture, "AlignerTest: Check alignment with bed
CHECK_THAT(aux, Contains(tag));
}
auto bh_tag_ptr = bam_aux_get(rec, "bh");
auto bh_tag_type = *(char*)bh_tag_ptr;
auto bh_tag_type = bam_aux_type(bh_tag_ptr);
CHECK(bh_tag_type == 'i');
int32_t bh_tag_value = 0;
memcpy(&bh_tag_value, bh_tag_ptr + 1, 4);
auto bh_tag_value = bam_aux2i(bh_tag_ptr);
CHECK(bh_tag_value == 3);
}

Expand Down
54 changes: 41 additions & 13 deletions tests/BarcodeClassifierTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
#include <vector>

#define TEST_GROUP "[barcode_demux]"

namespace fs = std::filesystem;

using namespace dorado;
Expand Down Expand Up @@ -338,39 +337,68 @@ TEST_CASE(
}
}

TEST_CASE("BarcodeClassifierNode: test reads where trim length == read length", TEST_GROUP) {
TEST_CASE("BarcodeClassifierNode: test for proper trimming and alignment data stripping",
TEST_GROUP) {
using Catch::Matchers::Equals;

dorado::PipelineDescriptor pipeline_desc;
std::vector<dorado::Message> messages;
auto sink = pipeline_desc.add_node<MessageSinkToVector>({}, 100, messages);
std::vector<std::string> kits = {"SQK-RBK114-96"};
std::vector<std::string> kits = {"SQK-16S024"};
bool barcode_both_ends = false;
bool no_trim = false;
pipeline_desc.add_node<BarcodeClassifierNode>({sink}, 8, kits, barcode_both_ends, no_trim,
std::nullopt, std::nullopt, std::nullopt);

auto pipeline = dorado::Pipeline::create(std::move(pipeline_desc), nullptr);
fs::path data_dir = fs::path(get_data_dir("barcode_demux"));
auto bc_file = data_dir / "no_trim_expected.fastq";
auto bc_file = data_dir / "simple_mapped_reads.sam";

// Only one read in the file, so fetch that.
// First read should be unclassified.
HtsReader reader(bc_file.string(), std::nullopt);
reader.read();
BamPtr read1(bam_dup1(reader.record.get()));
std::string id_in1 = bam_get_qname(read1.get());
auto orig_seq1 = dorado::utils::extract_sequence(read1.get());
pipeline->push_message(std::move(read1));

// Fetch the original read before barcode trimming.
auto orig_seq = dorado::utils::extract_sequence(reader.record.get());
// Second read should be classified.
reader.read();
BamPtr read2(bam_dup1(reader.record.get()));
std::string id_in2 = bam_get_qname(read2.get());
auto orig_seq2 = dorado::utils::extract_sequence(read2.get());
pipeline->push_message(std::move(read2));

pipeline->push_message(std::move(reader.record));
pipeline->terminate(DefaultFlushOptions());

CHECK(messages.size() == 1);
CHECK(messages.size() == 2);

auto read = std::get<BamPtr>(std::move(messages[0]));
auto seq = dorado::utils::extract_sequence(read.get());
read1 = std::get<BamPtr>(std::move(messages[0]));
read2 = std::get<BamPtr>(std::move(messages[1]));

// Reads may not come back in the same order.
std::string id_out1 = bam_get_qname(read1.get());
std::string id_out2 = bam_get_qname(read2.get());
if (id_out1 != id_in1) {
read1.swap(read2);
}

// We don't expect any trimming to happen, so the original and final sequence must match.
CHECK(seq == orig_seq);
// First read should be unclassified and untrimmed.
auto seq1 = dorado::utils::extract_sequence(read1.get());
CHECK(seq1 == orig_seq1);
CHECK_THAT(bam_aux2Z(bam_aux_get(read1.get(), "BC")), Equals("unclassified"));

// Second read should be classified and trimmed.
auto seq2 = dorado::utils::extract_sequence(read2.get());
CHECK(seq2 ==
seq1); // Sequence 2 is just sequence 1 plus the barcode, which should be trimmed.
CHECK_THAT(bam_aux2Z(bam_aux_get(read2.get(), "BC")), Equals("SQK-16S024_barcode01"));

// Check to make sure alignment data has been stripped from both reads.
CHECK(read1->core.tid == -1);
CHECK(bam_aux_get(read1.get(), "bh") == nullptr);
CHECK(read2->core.tid == -1);
CHECK(bam_aux_get(read2.get(), "bh") == nullptr);
}

struct CustomDoubleEndedKitInput {
Expand Down
6 changes: 6 additions & 0 deletions tests/data/barcode_demux/simple_mapped_reads.sam
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
@HD VN:1.6 SO:coordinate
@PG ID:aligner PN:dorado VN:0.6.0+437aca1b+dirty
@RG ID:0a73e955b30dc4b0182e1abb710bca268b16d689_dna_r10.4.1_e8.2_400bps_sup@v4.2.0 PU:PAO83395 PM:PC24B318 DT:2023-04-29T16:06:40.107+00:00 PL:ONT DS:basecall_model=dna_r10.4.1_e8.2_400bps_sup@v4.2.0 runid=0a73e955b30dc4b0182e1abb710bca268b16d689 LB:PrePCR SM:PrePCR
@SQ SN:ref LN:1000
799a14ce-a152-4d95-92fd-097d4c157fa5 2 ref 100 9 100M49S * 0 100 CTGGATCAGTATATGCCGGCGGTATCCCAACAAGTATTGGGACCCGTCGGTTTTAGCGTATAGTGAAGTGCTTTGAAGTGGTGTTTAACCAACAGTTGTCTTCGGAGCCCATAGAGTTCGCGTATAGTGAAACACTTTCGCGTTTTCGT &+00(,&&&&+*&&$##$%%',-))((+)*+&&%'%%%$()***2)(&&&&+00(()*,//+,-+++(%$$$'(%$&&(&&%(((((243,,,+('+('(()&'&&-%%&&'%&%(-.(''('''(((&&(-0+)(*1***3323@,&% bh:i:1
799a14ce-a152-4d95-92fd-097d4c157fa6 2 ref 100 9 51S100M49S * 0 100 CCGTGACAAGAAAGTTGTCGGTGTCTTTGTGAGAGTTTGATCATGGCTCAGCTGGATCAGTATATGCCGGCGGTATCCCAACAAGTATTGGGACCCGTCGGTTTTAGCGTATAGTGAAGTGCTTTGAAGTGGTGTTTAACCAACAGTTGTCTTCGGAGCCCATAGAGTTCGCGTATAGTGAAACACTTTCGCGTTTTCGT 23@,&%,&&&&+*&&$##$%%',-))((+)*+&&%'%%%$()***2)(&&&&+00(,&&&&+*&&$##$%%',-))((+)*+&&%'%%%$()***2)(&&&&+00(()*,//+,-+++(%$$$'(%$&&(&&%(((((243,,,+('+('(()&'&&-%%&&'%&%(-.(''('''(((&&(-0+)(*1***3323@,&% bh:i:1

0 comments on commit 1cc207a

Please sign in to comment.