diff --git a/dorado/alignment/Minimap2Aligner.cpp b/dorado/alignment/Minimap2Aligner.cpp index 9b5be64f..5aa57065 100644 --- a/dorado/alignment/Minimap2Aligner.cpp +++ b/dorado/alignment/Minimap2Aligner.cpp @@ -18,9 +18,7 @@ void add_sa_tag(bam1_t* record, int32_t hits, int32_t aln_idx, int32_t l_seq, - const mm_idx_t* idx, - const bool use_hard_clip) { - const auto clip_char = use_hard_clip ? "H" : "S"; + const mm_idx_t* idx) { std::stringstream ss; for (int i = 0; i < hits; i++) { if (i == aln_idx) { @@ -50,7 +48,7 @@ void add_sa_tag(bam1_t* record, ss << r->rs + 1 << ","; ss << "+-"[r->rev] << ","; if (clip5) { - ss << clip5 << clip_char; + ss << clip5 << "S"; } if (num_matches) { ss << num_matches << "M"; @@ -62,7 +60,7 @@ void add_sa_tag(bam1_t* record, ss << num_deletes << "D"; } if (clip3) { - ss << clip3 << clip_char; + ss << clip3 << "S"; } ss << "," << r->mapq << "," << (r->blen - r->mlen + r->p->n_ambi) << ";"; } @@ -223,7 +221,12 @@ std::vector Minimap2Aligner::align(bam1_t* irecord, mm_tbuf_t* buf) { // Add new tags to match minimap2. add_tags(record, aln, seq, buf); - add_sa_tag(record, reg, hits, j, static_cast(l_seq), mm_index, use_hard_clip); + if (!skip_seq_qual) { + // Here pass the original query length before any hard clip because the + // the CIGAR string in SA tag only makes use of soft clip. And for that to be + // correct the unclipped query length is needed. + add_sa_tag(record, reg, hits, j, static_cast(seq.size()), mm_index); + } // Remove MM/ML/MN tags if secondary alignment and soft clipping is not enabled. if ((flag & (BAM_FSUPPLEMENTARY | BAM_FSECONDARY)) && !(mm_map_opts.flag & MM_F_SOFTCLIP)) { diff --git a/tests/AlignerTest.cpp b/tests/AlignerTest.cpp index 79a6ab76..c0316a36 100644 --- a/tests/AlignerTest.cpp +++ b/tests/AlignerTest.cpp @@ -21,6 +21,7 @@ #define TEST_GROUP "[bam_utils][aligner]" +using Catch::Matchers::Equals; namespace fs = std::filesystem; namespace { @@ -531,4 +532,36 @@ TEST_CASE_METHOD(AlignerNodeTestFixture, CHECK(read_common_value == bam_value); } -} \ No newline at end of file +} + +TEST_CASE_METHOD(AlignerNodeTestFixture, + "AlignerTest: Check SA tag in non-primary alignments has correct CIGAR string", + TEST_GROUP) { + using Catch::Matchers::Contains; + + fs::path aligner_test_dir = fs::path(get_aligner_data_dir()); + auto ref = aligner_test_dir / "supplementary_basecall_target.fa"; + auto query = aligner_test_dir / "basecall_target.fa"; + + auto options = dorado::alignment::dflt_options; + options.kmer_size = options.window_size = 15; + options.index_batch_size = 1'000'000'000ull; + options.soft_clipping = GENERATE(true, false); + dorado::HtsReader reader(query.string(), std::nullopt); + auto bam_records = RunPipelineWithBamMessages(reader, ref.string(), options, 1); + REQUIRE(bam_records.size() == 3); + + bam1_t* primary_rec = bam_records[0].get(); + bam1_t* secondary_rec = bam_records[1].get(); + bam1_t* supplementary_rec = bam_records[2].get(); + + // Check aux tags. + CHECK_THAT(bam_aux2Z(bam_aux_get(primary_rec, "SA")), Equals("read2,1,+,999S899M,60,0;")); + if (options.soft_clipping) { + CHECK_THAT(bam_aux2Z(bam_aux_get(secondary_rec, "SA")), + Equals("read3,1,+,999M899S,0,0;read2,1,+,999S899M,60,0;")); + } else { + CHECK(bam_aux_get(secondary_rec, "SA") == nullptr); + } + CHECK_THAT(bam_aux2Z(bam_aux_get(supplementary_rec, "SA")), Equals("read3,1,+,999M899S,0,0;")); +}