Skip to content

Commit

Permalink
Added test for removal of alignment data. Fixed a bug.
Browse files Browse the repository at this point in the history
  • Loading branch information
kdolan1973 committed Apr 18, 2024
1 parent c6844ef commit a03c3d5
Show file tree
Hide file tree
Showing 4 changed files with 54 additions and 27 deletions.
16 changes: 5 additions & 11 deletions dorado/utils/bam_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -540,11 +540,7 @@ BamPtr new_unmapped_record(const BamPtr& record, std::string seq, std::vector<ui
bam1_t* input_record = record.get();
if (seq.empty()) {
seq = extract_sequence(input_record);
auto qptr = bam_get_qual(input_record);
if (qptr) {
qual.resize(seq.size());
std::copy(qptr, qptr + seq.size(), qual.begin());
}
qual = extract_quality(input_record);
}
bam1_t* out_record = bam_init1();
bam_set1(out_record, input_record->core.l_qname - input_record->core.l_extranul - 1,
Expand All @@ -560,17 +556,15 @@ BamPtr new_unmapped_record(const BamPtr& record, std::string seq, std::vector<ui
void remove_alignment_tags_from_record(bam1_t* record) {
// Iterate through all tags and check against known set
// of tags to remove.
static const std::set<std::pair<std::string, char>> tags_to_remove = {
{"SA", 'Z'}, {"NM", 'i'}, {"ms", 'i'}, {"AS", 'i'}, {"nn", 'i'}, {"ts", 'A'},
{"de", 'f'}, {"dv", 'f'}, {"tp", 'A'}, {"cm", 'i'}, {"s1", 'i'}, {"s2", 'i'},
{"MD", 'Z'}, {"zd", 'i'}, {"rl", 'i'}, {"bh", 'i'}};
static const std::set<std::string> tags_to_remove = {"SA", "NM", "ms", "AS", "nn", "ts",
"de", "dv", "tp", "cm", "s1", "s2",
"MD", "zd", "rl", "bh"};

uint8_t* aux_ptr = bam_aux_first(record);
while (aux_ptr != NULL) {
auto tag_ptr = bam_aux_tag(aux_ptr);
std::string tag = std::string(tag_ptr, tag_ptr + 2);
char type = bam_aux_type(aux_ptr);
if (tags_to_remove.find({tag, type}) != tags_to_remove.end()) {
if (tags_to_remove.find(tag) != tags_to_remove.end()) {
aux_ptr = bam_aux_remove(record, aux_ptr);
} else {
aux_ptr = bam_aux_next(record, aux_ptr);
Expand Down
5 changes: 2 additions & 3 deletions tests/AlignerTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -195,10 +195,9 @@ TEST_CASE_METHOD(AlignerNodeTestFixture, "AlignerTest: Check alignment with bed
CHECK_THAT(aux, Contains(tag));
}
auto bh_tag_ptr = bam_aux_get(rec, "bh");
auto bh_tag_type = *(char*)bh_tag_ptr;
auto bh_tag_type = bam_aux_type(bh_tag_ptr);
CHECK(bh_tag_type == 'i');
int32_t bh_tag_value = 0;
memcpy(&bh_tag_value, bh_tag_ptr + 1, 4);
auto bh_tag_value = bam_aux2i(bh_tag_ptr);
CHECK(bh_tag_value == 3);
}

Expand Down
54 changes: 41 additions & 13 deletions tests/BarcodeClassifierTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
#include <vector>

#define TEST_GROUP "[barcode_demux]"

namespace fs = std::filesystem;

using namespace dorado;
Expand Down Expand Up @@ -338,39 +337,68 @@ TEST_CASE(
}
}

TEST_CASE("BarcodeClassifierNode: test reads where trim length == read length", TEST_GROUP) {
TEST_CASE("BarcodeClassifierNode: test for proper trimming and alignment data stripping",
TEST_GROUP) {
using Catch::Matchers::Equals;

dorado::PipelineDescriptor pipeline_desc;
std::vector<dorado::Message> messages;
auto sink = pipeline_desc.add_node<MessageSinkToVector>({}, 100, messages);
std::vector<std::string> kits = {"SQK-RBK114-96"};
std::vector<std::string> kits = {"SQK-16S024"};
bool barcode_both_ends = false;
bool no_trim = false;
pipeline_desc.add_node<BarcodeClassifierNode>({sink}, 8, kits, barcode_both_ends, no_trim,
std::nullopt, std::nullopt, std::nullopt);

auto pipeline = dorado::Pipeline::create(std::move(pipeline_desc), nullptr);
fs::path data_dir = fs::path(get_data_dir("barcode_demux"));
auto bc_file = data_dir / "no_trim_expected.fastq";
auto bc_file = data_dir / "simple_mapped_reads.sam";

// Only one read in the file, so fetch that.
// First read should be unclassified.
HtsReader reader(bc_file.string(), std::nullopt);
reader.read();
BamPtr read1(bam_dup1(reader.record.get()));
std::string id_in1 = bam_get_qname(read1.get());
auto orig_seq1 = dorado::utils::extract_sequence(read1.get());
pipeline->push_message(std::move(read1));

// Fetch the original read before barcode trimming.
auto orig_seq = dorado::utils::extract_sequence(reader.record.get());
// Second read should be classified.
reader.read();
BamPtr read2(bam_dup1(reader.record.get()));
std::string id_in2 = bam_get_qname(read2.get());
auto orig_seq2 = dorado::utils::extract_sequence(read2.get());
pipeline->push_message(std::move(read2));

pipeline->push_message(std::move(reader.record));
pipeline->terminate(DefaultFlushOptions());

CHECK(messages.size() == 1);
CHECK(messages.size() == 2);

auto read = std::get<BamPtr>(std::move(messages[0]));
auto seq = dorado::utils::extract_sequence(read.get());
read1 = std::get<BamPtr>(std::move(messages[0]));
read2 = std::get<BamPtr>(std::move(messages[1]));

// Reads may not come back in the same order.
std::string id_out1 = bam_get_qname(read1.get());
std::string id_out2 = bam_get_qname(read2.get());
if (id_out1 != id_in1) {
read1.swap(read2);
}

// We don't expect any trimming to happen, so the original and final sequence must match.
CHECK(seq == orig_seq);
// First read should be unclassified and untrimmed.
auto seq1 = dorado::utils::extract_sequence(read1.get());
CHECK(seq1 == orig_seq1);
CHECK_THAT(bam_aux2Z(bam_aux_get(read1.get(), "BC")), Equals("unclassified"));

// Second read should be classified and trimmed.
auto seq2 = dorado::utils::extract_sequence(read2.get());
CHECK(seq2 ==
seq1); // Sequence 2 is just sequence 1 plus the barcode, which should be trimmed.
CHECK_THAT(bam_aux2Z(bam_aux_get(read2.get(), "BC")), Equals("SQK-16S024_barcode01"));

// Check to make sure alignment data has been stripped from both reads.
CHECK(read1->core.tid == -1);
CHECK(bam_aux_get(read1.get(), "bh") == nullptr);
CHECK(read2->core.tid == -1);
CHECK(bam_aux_get(read2.get(), "bh") == nullptr);
}

struct CustomDoubleEndedKitInput {
Expand Down
6 changes: 6 additions & 0 deletions tests/data/barcode_demux/simple_mapped_reads.sam
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
@HD VN:1.6 SO:coordinate
@PG ID:aligner PN:dorado VN:0.6.0+437aca1b+dirty
@RG ID:0a73e955b30dc4b0182e1abb710bca268b16d689_dna_r10.4.1_e8.2_400bps_sup@v4.2.0 PU:PAO83395 PM:PC24B318 DT:2023-04-29T16:06:40.107+00:00 PL:ONT DS:basecall_model=dna_r10.4.1_e8.2_400bps_sup@v4.2.0 runid=0a73e955b30dc4b0182e1abb710bca268b16d689 LB:PrePCR SM:PrePCR
@SQ SN:ref LN:1000
799a14ce-a152-4d95-92fd-097d4c157fa5 2 ref 100 9 100M49S * 0 100 CTGGATCAGTATATGCCGGCGGTATCCCAACAAGTATTGGGACCCGTCGGTTTTAGCGTATAGTGAAGTGCTTTGAAGTGGTGTTTAACCAACAGTTGTCTTCGGAGCCCATAGAGTTCGCGTATAGTGAAACACTTTCGCGTTTTCGT &+00(,&&&&+*&&$##$%%',-))((+)*+&&%'%%%$()***2)(&&&&+00(()*,//+,-+++(%$$$'(%$&&(&&%(((((243,,,+('+('(()&'&&-%%&&'%&%(-.(''('''(((&&(-0+)(*1***3323@,&% bh:i:1
799a14ce-a152-4d95-92fd-097d4c157fa6 2 ref 100 9 51S100M49S * 0 100 CCGTGACAAGAAAGTTGTCGGTGTCTTTGTGAGAGTTTGATCATGGCTCAGCTGGATCAGTATATGCCGGCGGTATCCCAACAAGTATTGGGACCCGTCGGTTTTAGCGTATAGTGAAGTGCTTTGAAGTGGTGTTTAACCAACAGTTGTCTTCGGAGCCCATAGAGTTCGCGTATAGTGAAACACTTTCGCGTTTTCGT 23@,&%,&&&&+*&&$##$%%',-))((+)*+&&%'%%%$()***2)(&&&&+00(,&&&&+*&&$##$%%',-))((+)*+&&%'%%%$()***2)(&&&&+00(()*,//+,-+++(%$$$'(%$&&(&&%(((((243,,,+('+('(()&'&&-%%&&'%&%(-.(''('''(((&&(-0+)(*1***3323@,&% bh:i:1

0 comments on commit a03c3d5

Please sign in to comment.