Skip to content

Commit

Permalink
Merge branch 'jdaw/fix-cigar-secondary-hardclip' into 'master'
Browse files Browse the repository at this point in the history
fix negative numbers in SA tag

Closes DOR-504

See merge request machine-learning/dorado!807
  • Loading branch information
tijyojwad committed Jan 18, 2024
1 parent d453db2 commit 062e5e3
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 6 deletions.
15 changes: 9 additions & 6 deletions dorado/alignment/Minimap2Aligner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,7 @@ void add_sa_tag(bam1_t* record,
int32_t hits,
int32_t aln_idx,
int32_t l_seq,
const mm_idx_t* idx,
const bool use_hard_clip) {
const auto clip_char = use_hard_clip ? "H" : "S";
const mm_idx_t* idx) {
std::stringstream ss;
for (int i = 0; i < hits; i++) {
if (i == aln_idx) {
Expand Down Expand Up @@ -50,7 +48,7 @@ void add_sa_tag(bam1_t* record,
ss << r->rs + 1 << ",";
ss << "+-"[r->rev] << ",";
if (clip5) {
ss << clip5 << clip_char;
ss << clip5 << "S";
}
if (num_matches) {
ss << num_matches << "M";
Expand All @@ -62,7 +60,7 @@ void add_sa_tag(bam1_t* record,
ss << num_deletes << "D";
}
if (clip3) {
ss << clip3 << clip_char;
ss << clip3 << "S";
}
ss << "," << r->mapq << "," << (r->blen - r->mlen + r->p->n_ambi) << ";";
}
Expand Down Expand Up @@ -220,7 +218,12 @@ std::vector<BamPtr> Minimap2Aligner::align(bam1_t* irecord, mm_tbuf_t* buf) {

// Add new tags to match minimap2.
add_tags(record, aln, seq, buf);
add_sa_tag(record, reg, hits, j, static_cast<int>(l_seq), mm_index, use_hard_clip);
if (!skip_seq_qual) {
// Here pass the original query length before any hard clip because the
// the CIGAR string in SA tag only makes use of soft clip. And for that to be
// correct the unclipped query length is needed.
add_sa_tag(record, reg, hits, j, static_cast<int>(seq.size()), mm_index);
}

// Remove MM/ML/MN tags if secondary alignment and soft clipping is not enabled.
if ((flag & (BAM_FSUPPLEMENTARY | BAM_FSECONDARY)) && !(mm_map_opts.flag & MM_F_SOFTCLIP)) {
Expand Down
32 changes: 32 additions & 0 deletions tests/AlignerTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

#define TEST_GROUP "[bam_utils][aligner]"

using Catch::Matchers::Equals;
namespace fs = std::filesystem;

namespace {
Expand Down Expand Up @@ -434,3 +435,34 @@ SCENARIO("AlignerNode push SimplexRead", TEST_GROUP) {
}
}
}

TEST_CASE("AlignerTest: Check SA tag in non-primary alignments has correct CIGAR string",
TEST_GROUP) {
using Catch::Matchers::Contains;

fs::path aligner_test_dir = fs::path(get_aligner_data_dir());
auto ref = aligner_test_dir / "supplementary_basecall_target.fa";
auto query = aligner_test_dir / "basecall_target.fa";

auto options = dorado::alignment::dflt_options;
options.kmer_size = options.window_size = 15;
options.index_batch_size = 1'000'000'000ull;
options.soft_clipping = GENERATE(true, false);
dorado::HtsReader reader(query.string(), std::nullopt);
auto bam_records = RunAlignmentPipeline(reader, ref.string(), options, 1);
REQUIRE(bam_records.size() == 3);

bam1_t* primary_rec = bam_records[0].get();
bam1_t* secondary_rec = bam_records[1].get();
bam1_t* supplementary_rec = bam_records[2].get();

// Check aux tags.
CHECK_THAT(bam_aux2Z(bam_aux_get(primary_rec, "SA")), Equals("read2,1,+,999S899M,60,0;"));
if (options.soft_clipping) {
CHECK_THAT(bam_aux2Z(bam_aux_get(secondary_rec, "SA")),
Equals("read3,1,+,999M899S,0,0;read2,1,+,999S899M,60,0;"));
} else {
CHECK(bam_aux_get(secondary_rec, "SA") == nullptr);
}
CHECK_THAT(bam_aux2Z(bam_aux_get(supplementary_rec, "SA")), Equals("read3,1,+,999M899S,0,0;"));
}

0 comments on commit 062e5e3

Please sign in to comment.