Skip to content

Commit

Permalink
Merge branch 'jdaw/remove-alignment-info-after-trim' into 'master'
Browse files Browse the repository at this point in the history
Strip a BAM record of any alignment information

Closes DOR-605

See merge request machine-learning/dorado!900
  • Loading branch information
tijyojwad committed Mar 21, 2024
2 parents a0f9462 + 61fcee6 commit bacd354
Show file tree
Hide file tree
Showing 10 changed files with 115 additions and 29 deletions.
12 changes: 9 additions & 3 deletions dorado/cli/demux.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@ int demuxer(int argc, char* argv[]) {
auto emit_summary = parser.visible.get<bool>("emit-summary");
auto threads(parser.visible.get<int>("threads"));
auto max_reads(parser.visible.get<int>("max-reads"));
auto no_trim(parser.visible.get<bool>("--no-trim"));

alignment::AlignmentProcessingItems processing_items{reads, recursive_input, output_dir, true};
if (!processing_items.initialise()) {
Expand Down Expand Up @@ -198,6 +199,12 @@ int demuxer(int argc, char* argv[]) {
}

add_pg_hdr(header.get());
if (!no_trim) {
// Remove SQ lines from header since alignment information
// is invalidated after trimming.
utils::strip_sq_hdr(header.get());
sam_hdr_remove_tag_id(header.get(), "HD", NULL, NULL, "SO");
}

auto barcode_sample_sheet = parser.visible.get<std::string>("--sample-sheet");
std::unique_ptr<const utils::SampleSheet> sample_sheet;
Expand All @@ -219,9 +226,8 @@ int demuxer(int argc, char* argv[]) {
}
pipeline_desc.add_node<BarcodeClassifierNode>(
{demux_writer}, demux_threads, kit_names,
parser.visible.get<bool>("--barcode-both-ends"),
parser.visible.get<bool>("--no-trim"), std::move(allowed_barcodes),
std::move(custom_kit), std::move(custom_barcode_file));
parser.visible.get<bool>("--barcode-both-ends"), no_trim,
std::move(allowed_barcodes), std::move(custom_kit), std::move(custom_barcode_file));
}

// Create the Pipeline from our description.
Expand Down
6 changes: 6 additions & 0 deletions dorado/cli/trim.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include "read_pipeline/HtsReader.h"
#include "read_pipeline/HtsWriter.h"
#include "read_pipeline/ProgressTracker.h"
#include "utils/bam_utils.h"
#include "utils/basecaller_utils.h"
#include "utils/log_utils.h"
#include "utils/stats.h"
Expand Down Expand Up @@ -116,6 +117,11 @@ int trim(int argc, char* argv[]) {
HtsReader reader(reads[0], read_list);
auto header = SamHdrPtr(sam_hdr_dup(reader.header));
add_pg_hdr(header.get());
// Always remove alignment information from input header
// because at minimum the adapters are trimmed, which
// invalidates the alignment record.
utils::strip_sq_hdr(header.get());
sam_hdr_remove_tag_id(header.get(), "HD", NULL, NULL, "SO");

auto output_mode = OutputMode::BAM;

Expand Down
49 changes: 24 additions & 25 deletions dorado/demux/Trimmer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -136,43 +136,38 @@ BamPtr Trimmer::trim_sequence(BamPtr input, std::pair<int, int> trim_interval) {
std::string seq = utils::extract_sequence(input_record);
std::vector<uint8_t> qual = utils::extract_quality(input_record);
auto [stride, move_vals] = utils::extract_move_table(input_record);
int ts = bam_aux_get(input_record, "ts") ? int(bam_aux2i(bam_aux_get(input_record, "ts"))) : 0;
int ns = bam_aux_get(input_record, "ns") ? int(bam_aux2i(bam_aux_get(input_record, "ns"))) : 0;
int ts = bam_aux_get(input_record, "ts") ? int(bam_aux2i(bam_aux_get(input_record, "ts"))) : -1;
int ns = bam_aux_get(input_record, "ns") ? int(bam_aux2i(bam_aux_get(input_record, "ns"))) : -1;
auto [modbase_str, modbase_probs] = utils::extract_modbase_info(input_record);

// Actually trim components.
auto trimmed_seq = utils::trim_sequence(seq, trim_interval);
auto trimmed_qual = utils::trim_quality(qual, trim_interval);
auto [positions_trimmed, trimmed_moves] = utils::trim_move_table(move_vals, trim_interval);
ts += positions_trimmed * stride;
// After sequence trimming, the number of samples corresponding to the sequence is the size of
// the new move table * stride. However, the ns tag includes the number of samples trimmed from the
// front of the read as well.
// |---------------------- ns ------------------|
// |----ts----|--------moves signal-------------|
ns = int(trimmed_moves.size() * stride) + ts;
if (ts >= 0) {
ts += positions_trimmed * stride;
}
if (ns >= 0) {
// After sequence trimming, the number of samples corresponding to the sequence is the size of
// the new move table * stride. However, the ns tag includes the number of samples trimmed from the
// front of the read as well.
// |---------------------- ns ------------------|
// |----ts----|--------moves signal-------------|
ns = int(trimmed_moves.size() * stride) + ts;
}
auto [trimmed_modbase_str, trimmed_modbase_probs] = utils::trim_modbase_info(
is_seq_reversed ? utils::reverse_complement(seq) : seq, modbase_str, modbase_probs,
is_seq_reversed ? reverse_complement_interval(trim_interval, int(seq.length()))
: trim_interval);
auto n_cigar = input_record->core.n_cigar;
std::vector<uint32_t> ops;
uint32_t ref_pos_consumed = 0;
if (n_cigar > 0) {
auto cigar_arr = bam_get_cigar(input_record);
ops = utils::trim_cigar(n_cigar, cigar_arr, trim_interval);
ref_pos_consumed =
ops.empty() ? 0 : utils::ref_pos_consumed(n_cigar, cigar_arr, trim_interval.first);
}

// Create a new bam record to hold the trimmed read.
bam1_t* out_record = bam_init1();
bam_set1(out_record, input_record->core.l_qname - input_record->core.l_extranul - 1,
bam_get_qname(input_record), input_record->core.flag, input_record->core.tid,
input_record->core.pos + ref_pos_consumed, input_record->core.qual, ops.size(),
ops.empty() ? NULL : ops.data(), input_record->core.mtid, input_record->core.mpos,
input_record->core.isize, trimmed_seq.size(), trimmed_seq.data(),
trimmed_qual.empty() ? NULL : (char*)trimmed_qual.data(), bam_get_l_aux(input_record));
bam_get_qname(input_record), 4 /*flag*/, -1 /*tid*/, -1 /*pos*/, 0 /*mapq*/,
0 /*n_cigar*/, nullptr /*cigar*/, -1 /*mtid*/, -1 /*mpos*/, 0 /*isize*/,
trimmed_seq.size(), trimmed_seq.data(),
trimmed_qual.empty() ? nullptr : (char*)trimmed_qual.data(),
bam_get_l_aux(input_record));
memcpy(bam_get_aux(out_record), bam_get_aux(input_record), bam_get_l_aux(input_record));
out_record->l_data += bam_get_l_aux(input_record);

Expand All @@ -195,8 +190,12 @@ BamPtr Trimmer::trim_sequence(BamPtr input, std::pair<int, int> trim_interval) {
bam_aux_update_int(out_record, "MN", trimmed_seq.length());
}

bam_aux_update_int(out_record, "ts", ts);
bam_aux_update_int(out_record, "ns", ns);
if (ts >= 0) {
bam_aux_update_int(out_record, "ts", ts);
}
if (ns >= 0) {
bam_aux_update_int(out_record, "ns", ns);
}

return BamPtr(out_record);
}
Expand Down
6 changes: 6 additions & 0 deletions dorado/read_pipeline/AdapterDetectorNode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,10 @@ void AdapterDetectorNode::input_thread_fn() {
while (get_input_message(message)) {
if (std::holds_alternative<BamPtr>(message)) {
auto read = std::get<BamPtr>(std::move(message));
// If the read is a secondary or supplementary read, ignore it.
if (read->core.flag & (BAM_FSUPPLEMENTARY | BAM_FSECONDARY)) {
continue;
}
process_read(read);
send_message_to_sink(std::move(read));
} else if (std::holds_alternative<SimplexReadPtr>(message)) {
Expand Down Expand Up @@ -79,6 +83,8 @@ void AdapterDetectorNode::process_read(BamPtr& read) {
return;
}
read = Trimmer::trim_sequence(std::move(read), trim_interval);

utils::remove_alignment_tags_from_record(read.get());
}
}

Expand Down
10 changes: 9 additions & 1 deletion dorado/read_pipeline/BarcodeClassifierNode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,12 @@ void BarcodeClassifierNode::input_thread_fn() {
while (get_input_message(message)) {
if (std::holds_alternative<BamPtr>(message)) {
auto read = std::get<BamPtr>(std::move(message));
// If the read is a secondary or supplementary read, ignore it if
// we're setup to trim reads.
if (m_default_barcoding_info->trim &&
(read->core.flag & (BAM_FSUPPLEMENTARY | BAM_FSECONDARY))) {
continue;
}
barcode(read);
send_message_to_sink(std::move(read));
} else if (std::holds_alternative<SimplexReadPtr>(message)) {
Expand Down Expand Up @@ -118,9 +124,11 @@ void BarcodeClassifierNode::barcode(BamPtr& read) {
int seqlen = irecord->core.l_qseq;
auto trim_interval = Trimmer::determine_trim_interval(bc_res, seqlen);

if (trim_interval.second - trim_interval.first < seqlen) {
if (trim_interval.second - trim_interval.first <= seqlen) {
read = Trimmer::trim_sequence(std::move(read), trim_interval);
}

utils::remove_alignment_tags_from_record(read.get());
}
}

Expand Down
22 changes: 22 additions & 0 deletions dorado/utils/bam_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include "SampleSheet.h"
#include "barcode_kits.h"
#include "sequence_utils.h"
#include "spdlog/spdlog.h"

#include <htslib/sam.h>

Expand Down Expand Up @@ -517,4 +518,25 @@ std::string cigar2str(uint32_t n_cigar, const uint32_t* cigar) {
return cigar_str;
}

void remove_alignment_tags_from_record(bam1_t* record) {
// Iterate through all tags and check against known set
// of tags to remove.
static const std::set<std::pair<std::string, char>> tags_to_remove = {
{"SA", 'Z'}, {"NM", 'i'}, {"ms", 'i'}, {"AS", 'i'}, {"nn", 'i'}, {"ts", 'A'},
{"de", 'f'}, {"dv", 'f'}, {"tp", 'A'}, {"cm", 'i'}, {"s1", 'i'}, {"s2", 'i'},
{"MD", 'Z'}, {"zd", 'i'}, {"rl", 'i'}, {"bh", 'i'}};

uint8_t* aux_ptr = bam_aux_first(record);
while (aux_ptr != NULL) {
auto tag_ptr = bam_aux_tag(aux_ptr);
std::string tag = std::string(tag_ptr, tag_ptr + 2);
char type = bam_aux_type(aux_ptr);
if (tags_to_remove.find({tag, type}) != tags_to_remove.end()) {
aux_ptr = bam_aux_remove(record, aux_ptr);
} else {
aux_ptr = bam_aux_next(record, aux_ptr);
}
}
}

} // namespace dorado::utils
7 changes: 7 additions & 0 deletions dorado/utils/bam_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -187,4 +187,11 @@ std::string cigar2str(uint32_t n_cigar, const uint32_t* cigar);
*/
kstring_t allocate_kstring();

/*
* Remove any alignment related tags from a BAM record.
*
* @param record BAM record.
*/
void remove_alignment_tags_from_record(bam1_t* record);

} // namespace dorado::utils
13 changes: 13 additions & 0 deletions tests/BamUtilsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -392,3 +392,16 @@ TEST_CASE("BamUtilsTest: test sam_hdr_merge refuses to merge incompatible SQ", T
CHECK(result == false);
CHECK(error_msg.size() != 0);
}

TEST_CASE("BamUtilsTest: Remove all alignment tags", TEST_GROUP) {
fs::path bam_utils_test_dir = fs::path(get_data_dir("bam_utils"));
auto sam = bam_utils_test_dir / "aligned_record.bam";

HtsReader reader(sam.string(), std::nullopt);
REQUIRE(reader.read()); // Parse first and only record.
auto record = reader.record.get();

utils::remove_alignment_tags_from_record(record);

CHECK(bam_aux_first(record) == nullptr);
}
19 changes: 19 additions & 0 deletions tests/TrimTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,25 @@ TEST_CASE("Test trim of reverse strand record in BAM", TEST_GROUP) {
Equals("C+h?,28,24;C+m?,28,24;"));
}

TEST_CASE("Test trim removes all alignment information", TEST_GROUP) {
const auto data_dir = fs::path(get_data_dir("trimmer"));
const auto bam_file = data_dir / "reverse_strand_record.bam";
HtsReader reader(bam_file.string(), std::nullopt);
reader.read();
auto &record = reader.record;

Trimmer trimmer;
const std::pair<int, int> trim_interval = {72, 647};
auto trimmed_record = trimmer.trim_sequence(std::move(record), trim_interval);

CHECK(trimmed_record->core.pos == -1);
CHECK(trimmed_record->core.tid == -1);
CHECK(trimmed_record->core.flag == 4);
CHECK(trimmed_record->core.n_cigar == 0);
CHECK(trimmed_record->core.mtid == -1);
CHECK(trimmed_record->core.mpos == -1);
}

std::string to_qstr(std::vector<int8_t> qscore) {
std::string qstr;
for (size_t i = 0; i < qscore.size(); ++i) {
Expand Down
Binary file added tests/data/bam_utils/aligned_record.bam
Binary file not shown.

0 comments on commit bacd354

Please sign in to comment.