diff --git a/CMakeLists.txt b/CMakeLists.txt index f5b1c2e8..40aa94eb 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -139,6 +139,8 @@ configure_file(dorado/Version.h.in dorado/Version.h) add_library(dorado_lib dorado/alignment/alignment_processing_items.cpp dorado/alignment/alignment_processing_items.h + dorado/alignment/BedFile.cpp + dorado/alignment/BedFile.h dorado/alignment/IndexFileAccess.cpp dorado/alignment/IndexFileAccess.h dorado/alignment/Minimap2Aligner.cpp diff --git a/documentation/SAM.md b/documentation/SAM.md index 9608d972..6246213d 100644 --- a/documentation/SAM.md +++ b/documentation/SAM.md @@ -42,6 +42,7 @@ | pi:Z: | parent read id for a split read | | sp:i: | start coordinate of split read in parent read signal | | pt:i: | estimated poly(A/T) tail length in cDNA and dRNA reads | +| bh:i: | number of detected bedfile hits _(only if alignment was performed with a specified bed-file)_ | | MN:i: | Length of sequence at the time MM and ML were produced | #### Modified Base Tags diff --git a/dorado/alignment/BedFile.cpp b/dorado/alignment/BedFile.cpp new file mode 100644 index 00000000..5a877cce --- /dev/null +++ b/dorado/alignment/BedFile.cpp @@ -0,0 +1,102 @@ +#include "BedFile.h" + +#include "utils/string_utils.h" + +#include + +#include +#include +#include + +namespace dorado::alignment { + +BedFile::Entries const BedFile::NO_ENTRIES{}; + +const std::string & BedFile::filename() const { return m_file_name; } + +bool BedFile::load(const std::string & bed_filename) { + m_file_name = bed_filename; + + // Read in each line and parse it into the BedFile structure + std::ifstream file_stream(m_file_name, std::ios::ate); + if (file_stream.fail()) { + spdlog::error("Failed to open Bed file for reading: '{}'", m_file_name); + return false; + } + auto file_size = file_stream.tellg(); + file_stream.seekg(0); + bool reading_header = true; + bool allow_column_headers = true; + while (!(file_stream.tellg() == (int32_t)file_size || file_stream.tellg() == -1)) { + std::string bed_line; + std::getline(file_stream, bed_line); + + if (utils::starts_with(bed_line, "#")) { + continue; + } + // Remove whitespace from end of the line + utils::rtrim(bed_line); + // check for header markers and ignore if present + if (reading_header) { + if (utils::starts_with(bed_line, "browser") || utils::starts_with(bed_line, "track")) { + continue; + } + reading_header = false; + } + + // required fields from BED line + std::string reference_name; + size_t start, end; + char strand = '.'; + std::istringstream bed_line_stream(bed_line); + bed_line_stream >> reference_name >> start >> end; + std::string next_col_str; + // Guards against only the required fields being present + if (!bed_line_stream.eof()) { + // Strand is the sixth column so track column index + int8_t c_index = 4; + while (bed_line_stream >> next_col_str) { + if (c_index < 6) { + c_index += 1; + } else if (next_col_str == "+") { + strand = '+'; + break; + } else if (next_col_str == "-") { + strand = '-'; + break; + // 6th column and not "+/-/." + } else if (next_col_str != ".") { + spdlog::error("Invalid data reading strand from bed file '{}'", m_file_name); + return false; + } + // No strand column present and we're at the end + if (bed_line_stream.eof()) { + break; + } + } + } + if (bed_line_stream.fail() && !bed_line_stream.eof()) { + if (allow_column_headers) { + allow_column_headers = false; + spdlog::info( + "Invalid data found reading bed file '{}'. Assuming column headers and " + "skipping line.", + m_file_name); + continue; + } + spdlog::error("Invalid data reading bed file '{}'", m_file_name); + return false; + } + allow_column_headers = false; + m_genomes[reference_name].push_back({bed_line, start, end, strand}); + } + + return true; +}; + +const BedFile::Entries & BedFile::entries(const std::string & genome) const { + auto it = m_genomes.find(genome); + return it != m_genomes.end() ? it->second : NO_ENTRIES; +} + +} // namespace dorado::alignment diff --git a/dorado/alignment/BedFile.h b/dorado/alignment/BedFile.h new file mode 100644 index 00000000..8d3c0aef --- /dev/null +++ b/dorado/alignment/BedFile.h @@ -0,0 +1,39 @@ +#pragma once + +#include +#include +#include + +namespace dorado::alignment { + +class BedFile { +public: + struct Entry { + std::string bed_line; + size_t start; + size_t end; + char strand; + }; + + using Entries = std::vector; + +private: + std::map m_genomes; + std::string m_file_name{}; + static const Entries NO_ENTRIES; + +public: + BedFile() = default; + BedFile(BedFile&& other) = delete; + BedFile(const BedFile&) = delete; + BedFile& operator=(const BedFile&) = delete; + ~BedFile() = default; + + bool load(const std::string& index_filename); + + const Entries& entries(const std::string& genome) const; + + const std::string& filename() const; +}; + +} // namespace dorado::alignment diff --git a/dorado/alignment/IndexFileAccess.cpp b/dorado/alignment/IndexFileAccess.cpp index d0906b69..e36e9ac8 100644 --- a/dorado/alignment/IndexFileAccess.cpp +++ b/dorado/alignment/IndexFileAccess.cpp @@ -8,9 +8,9 @@ namespace dorado::alignment { const Minimap2Index* IndexFileAccess::get_compatible_index( - const std::string& file, + const std::string& index_file, const Minimap2IndexOptions& indexing_options) { - auto& compatible_indices = m_index_lut[{file, indexing_options}]; + auto& compatible_indices = m_index_lut[{index_file, indexing_options}]; if (compatible_indices.empty()) { return nullptr; } @@ -18,9 +18,9 @@ const Minimap2Index* IndexFileAccess::get_compatible_index( } std::shared_ptr IndexFileAccess::get_exact_index_impl( - const std::string& file, + const std::string& index_file, const Minimap2Options& options) const { - auto compatible_indices = m_index_lut.find({file, options}); + auto compatible_indices = m_index_lut.find({index_file, options}); if (compatible_indices == m_index_lut.end()) { return nullptr; } @@ -29,17 +29,17 @@ std::shared_ptr IndexFileAccess::get_exact_index_impl( } std::shared_ptr IndexFileAccess::get_exact_index( - const std::string& file, + const std::string& index_file, const Minimap2Options& options) const { std::lock_guard lock(m_mutex); - auto exact_index = get_exact_index_impl(file, options); + auto exact_index = get_exact_index_impl(index_file, options); assert(exact_index && "Cannot access an index which has not been loaded"); return exact_index; } -bool IndexFileAccess::is_index_loaded_impl(const std::string& file, +bool IndexFileAccess::is_index_loaded_impl(const std::string& index_file, const Minimap2Options& options) const { - const auto compatible_indices = m_index_lut.find({file, options}); + const auto compatible_indices = m_index_lut.find({index_file, options}); if (compatible_indices == m_index_lut.end()) { return false; } @@ -47,13 +47,13 @@ bool IndexFileAccess::is_index_loaded_impl(const std::string& file, } std::shared_ptr IndexFileAccess::get_or_load_compatible_index( - const std::string& file, + const std::string& index_file, const Minimap2Options& options) { - auto index = get_exact_index_impl(file, options); + auto index = get_exact_index_impl(index_file, options); if (index) { return index; } - auto compatible_index = get_compatible_index(file, options); + auto compatible_index = get_compatible_index(index_file, options); if (!compatible_index) { return nullptr; } @@ -62,21 +62,21 @@ std::shared_ptr IndexFileAccess::get_or_load_compatible_index( if (!new_index) { return nullptr; } - m_index_lut[{file, options}][options] = new_index; + m_index_lut[{index_file, options}][options] = new_index; return new_index; } -bool IndexFileAccess::try_load_compatible_index(const std::string& file, +bool IndexFileAccess::try_load_compatible_index(const std::string& index_file, const Minimap2Options& options) { std::lock_guard lock(m_mutex); - auto index = get_or_load_compatible_index(file, options); + auto index = get_or_load_compatible_index(index_file, options); return index != nullptr; } -IndexLoadResult IndexFileAccess::load_index(const std::string& file, +IndexLoadResult IndexFileAccess::load_index(const std::string& index_file, const Minimap2Options& options, int num_threads) { - if (try_load_compatible_index(file, options)) { + if (try_load_compatible_index(index_file, options)) { return IndexLoadResult::success; } @@ -85,36 +85,36 @@ IndexLoadResult IndexFileAccess::load_index(const std::string& file, return IndexLoadResult::validation_error; } - auto load_result = new_index->load(file, num_threads); + auto load_result = new_index->load(index_file, num_threads); if (load_result != IndexLoadResult::success) { return load_result; } std::lock_guard lock(m_mutex); - m_index_lut[{file, options}][options] = std::move(new_index); + m_index_lut[{index_file, options}][options] = std::move(new_index); return IndexLoadResult::success; } -std::shared_ptr IndexFileAccess::get_index(const std::string& file, +std::shared_ptr IndexFileAccess::get_index(const std::string& index_file, const Minimap2Options& options) { std::lock_guard lock(m_mutex); // N.B. Although the index file for a client must be loaded before reads are added to the pipeline // it is still possible that the index for a read in the pipeline does not have its index loaded // if the client disconnected and caused the index to be unloaded while there were still reads in // the pipeline. For this reason we do not assert a non-null index. - return get_or_load_compatible_index(file, options); + return get_or_load_compatible_index(index_file, options); } -bool IndexFileAccess::is_index_loaded(const std::string& file, +bool IndexFileAccess::is_index_loaded(const std::string& index_file, const Minimap2Options& options) const { std::lock_guard lock(m_mutex); - return is_index_loaded_impl(file, options); + return is_index_loaded_impl(index_file, options); } std::string IndexFileAccess::generate_sequence_records_header( - const std::string& file, + const std::string& index_file, const Minimap2Options& options) const { - auto loaded_index = get_exact_index(file, options); + auto loaded_index = get_exact_index(index_file, options); assert(loaded_index && "Index must be loaded to generate header records"); auto sequence_records = loaded_index->get_sequence_records_for_header(); @@ -132,10 +132,10 @@ std::string IndexFileAccess::generate_sequence_records_header( return header_stream.str(); } -void IndexFileAccess::unload_index(const std::string& file, +void IndexFileAccess::unload_index(const std::string& index_file, const Minimap2IndexOptions& index_options) { std::lock_guard lock(m_mutex); - m_index_lut[{file, index_options}] = {}; + m_index_lut[{index_file, index_options}] = {}; } bool validate_options(const Minimap2Options& options) { diff --git a/dorado/alignment/IndexFileAccess.h b/dorado/alignment/IndexFileAccess.h index 737af8a9..9db98fb0 100644 --- a/dorado/alignment/IndexFileAccess.h +++ b/dorado/alignment/IndexFileAccess.h @@ -20,49 +20,49 @@ class IndexFileAccess { // Returns true if the index is loaded, will also create the index if a compatible // one is already loaded and return true. - bool try_load_compatible_index(const std::string& file, const Minimap2Options& options); + bool try_load_compatible_index(const std::string& index_file, const Minimap2Options& options); // By contract the index must be loaded (else assertion failure) - std::shared_ptr get_exact_index(const std::string& file, + std::shared_ptr get_exact_index(const std::string& index_file, const Minimap2Options& options) const; // Requires the mutex to be locked before calling. - const Minimap2Index* get_compatible_index(const std::string& file, + const Minimap2Index* get_compatible_index(const std::string& index_file, const Minimap2IndexOptions& indexing_options); // Requires the mutex to be locked before calling. - std::shared_ptr get_exact_index_impl(const std::string& file, + std::shared_ptr get_exact_index_impl(const std::string& index_file, const Minimap2Options& options) const; // Requires the mutex to be locked before calling. - bool is_index_loaded_impl(const std::string& file, const Minimap2Options& options) const; + bool is_index_loaded_impl(const std::string& index_file, const Minimap2Options& options) const; // Requires the mutex to be locked before calling. - std::shared_ptr get_or_load_compatible_index(const std::string& file, + std::shared_ptr get_or_load_compatible_index(const std::string& index_file, const Minimap2Options& options); public: - IndexLoadResult load_index(const std::string& file, + IndexLoadResult load_index(const std::string& index_file, const Minimap2Options& options, int num_threads); // Returns the index if already loaded, if not loaded will create an index from an // existing compatible one. - // By contract there must be a loaded index for the file with matching indexing + // By contract there must be a loaded index for the index file with matching indexing // options, if not there will be an assertion failure. - std::shared_ptr get_index(const std::string& file, + std::shared_ptr get_index(const std::string& index_file, const Minimap2Options& options); - // Unloads all compatible indices for the given file and indexing options. + // Unloads all compatible indices for the given index file and indexing options. // The underlying minimap index will be deallocated. - void unload_index(const std::string& file, const Minimap2IndexOptions& index_options); + void unload_index(const std::string& index_file, const Minimap2IndexOptions& index_options); // returns a string containing the sequence records for the requested index - std::string generate_sequence_records_header(const std::string& file, + std::string generate_sequence_records_header(const std::string& index_file, const Minimap2Options& options) const; // Testability. Method needed to support utests - bool is_index_loaded(const std::string& file, const Minimap2Options& options) const; + bool is_index_loaded(const std::string& index_file, const Minimap2Options& options) const; }; bool validate_options(const Minimap2Options& options); diff --git a/dorado/cli/aligner.cpp b/dorado/cli/aligner.cpp index 685f8cdc..5fa14daa 100644 --- a/dorado/cli/aligner.cpp +++ b/dorado/cli/aligner.cpp @@ -101,18 +101,19 @@ int aligner(int argc, char* argv[]) { .nargs(0); parser.visible.add_argument("-o", "--output-dir") .help("If specified output files will be written to the given folder, otherwise output " - "is to stdout. " - "Required if the 'reads' positional argument is a folder.") + "is to stdout. Required if the 'reads' positional argument is a folder.") .default_value(std::string{}); parser.visible.add_argument("--emit-summary") .help("If specified, a summary file containing the details of the primary alignments " - "for each " - "read will be emitted to the root of the output folder. This option requires " - "that the " - "'--output-dir' option is also set.") + "for each read will be emitted to the root of the output folder. This option " + "requires that the '--output-dir' option is also set.") .default_value(false) .implicit_value(true) .nargs(0); + parser.visible.add_argument("--bed-file") + .help("Optional bed-file. If specified, overlaps between the alignments and bed-file " + "entries will be counted, and recorded in BAM output using the 'bh' read tag.") + .default_value(std::string("")); parser.hidden.add_argument("--progress_stats_frequency") .help("Frequency in seconds in which to report progress statistics") .default_value(0) @@ -150,6 +151,7 @@ int aligner(int argc, char* argv[]) { } auto index(parser.visible.get("index")); + auto bed_file(parser.visible.get("bed-file")); auto reads(parser.visible.get("reads")); auto recursive_input = parser.visible.get("recursive"); auto output_folder = parser.visible.get("output-dir"); @@ -214,7 +216,7 @@ int aligner(int argc, char* argv[]) { auto hts_writer = pipeline_desc.add_node({}, file_info.output, file_info.output_mode, writer_threads); auto aligner = pipeline_desc.add_node({hts_writer}, index_file_access, index, - options, aligner_threads); + bed_file, options, aligner_threads); // Create the Pipeline from our description. std::vector stats_reporters; diff --git a/dorado/cli/basecaller.cpp b/dorado/cli/basecaller.cpp index 3cba885c..12909703 100644 --- a/dorado/cli/basecaller.cpp +++ b/dorado/cli/basecaller.cpp @@ -158,7 +158,7 @@ void setup(std::vector args, if (enable_aligner) { auto index_file_access = std::make_shared(); aligner = pipeline_desc.add_node({current_sink_node}, index_file_access, ref, - aligner_options, + "", aligner_options, thread_allocations.aligner_threads); current_sink_node = aligner; } diff --git a/dorado/cli/duplex.cpp b/dorado/cli/duplex.cpp index 5838f3c1..b2cc322b 100644 --- a/dorado/cli/duplex.cpp +++ b/dorado/cli/duplex.cpp @@ -369,7 +369,7 @@ int duplex(int argc, char* argv[]) { } else { auto options = cli::process_minimap2_arguments(parser, alignment::dflt_options); auto index_file_access = std::make_shared(); - aligner = pipeline_desc.add_node({}, index_file_access, ref, options, + aligner = pipeline_desc.add_node({}, index_file_access, ref, "", options, std::thread::hardware_concurrency()); hts_writer = pipeline_desc.add_node({}, "-", output_mode, 4); pipeline_desc.add_node_sink(aligner, hts_writer); diff --git a/dorado/read_pipeline/AlignerNode.cpp b/dorado/read_pipeline/AlignerNode.cpp index 16f42439..6adc4222 100644 --- a/dorado/read_pipeline/AlignerNode.cpp +++ b/dorado/read_pipeline/AlignerNode.cpp @@ -4,6 +4,7 @@ #include "alignment/Minimap2Aligner.h" #include "alignment/Minimap2Index.h" +#include #include #include @@ -16,13 +17,13 @@ namespace { std::shared_ptr load_and_get_index( dorado::alignment::IndexFileAccess& index_file_access, - const std::string& filename, + const std::string& index_file, const dorado::alignment::Minimap2Options& options, const int threads) { int num_index_construction_threads{options.print_aln_seq ? 1 : static_cast(threads)}; - switch (index_file_access.load_index(filename, options, num_index_construction_threads)) { + switch (index_file_access.load_index(index_file, options, num_index_construction_threads)) { case dorado::alignment::IndexLoadResult::reference_file_not_found: - throw std::runtime_error("AlignerNode reference path does not exist: " + filename); + throw std::runtime_error("AlignerNode reference path does not exist: " + index_file); case dorado::alignment::IndexLoadResult::validation_error: throw std::runtime_error("AlignerNode validation error checking minimap options"); case dorado::alignment::IndexLoadResult::split_index_not_supported: @@ -32,7 +33,7 @@ std::shared_ptr load_and_get_index( case dorado::alignment::IndexLoadResult::success: break; } - return index_file_access.get_index(filename, options); + return index_file_access.get_index(index_file, options); } } // namespace @@ -40,13 +41,21 @@ std::shared_ptr load_and_get_index( namespace dorado { AlignerNode::AlignerNode(std::shared_ptr index_file_access, - const std::string& filename, + const std::string& index_file, + const std::string& bed_file, const alignment::Minimap2Options& options, int threads) : MessageSink(10000, threads), m_index_for_bam_messages( - load_and_get_index(*index_file_access, filename, options, threads)), + load_and_get_index(*index_file_access, index_file, options, threads)), m_index_file_access(std::move(index_file_access)) { + auto header_sequence_records = m_index_for_bam_messages->get_sequence_records_for_header(); + if (!bed_file.empty()) { + m_bed_file_for_bam_messages.load(bed_file); + for (const auto& entry : header_sequence_records) { + m_header_sequences_for_bam_messages.emplace_back(entry.first); + } + } start_input_processing(&AlignerNode::input_thread_fn, this); } @@ -109,6 +118,11 @@ void AlignerNode::input_thread_fn() { auto records = alignment::Minimap2Aligner(m_index_for_bam_messages).align(read.get(), tbuf); for (auto& record : records) { + if (!m_bed_file_for_bam_messages.filename().empty() && + !(record->core.flag & BAM_FUNMAP)) { + auto ref_id = record->core.tid; + add_bed_hits_to_record(m_header_sequences_for_bam_messages.at(ref_id), record); + } send_message_to_sink(std::move(record)); } } else if (std::holds_alternative(message)) { @@ -125,4 +139,19 @@ void AlignerNode::input_thread_fn() { stats::NamedStats AlignerNode::sample_stats() const { return stats::from_obj(m_work_queue); } +void AlignerNode::add_bed_hits_to_record(const std::string& genome, BamPtr& record) { + size_t genome_start = record->core.pos; + size_t genome_end = bam_endpos(record.get()); + char direction = (bam_is_rev(record.get())) ? '-' : '+'; + int bed_hits = 0; + for (const auto& interval : m_bed_file_for_bam_messages.entries(genome)) { + if (!(interval.start >= genome_end || interval.end <= genome_start) && + (interval.strand == direction || interval.strand == '.')) { + bed_hits++; + } + } + // update the record. + bam_aux_append(record.get(), "bh", 'i', sizeof(bed_hits), (uint8_t*)&bed_hits); +} + } // namespace dorado diff --git a/dorado/read_pipeline/AlignerNode.h b/dorado/read_pipeline/AlignerNode.h index e7a7c911..133f6330 100644 --- a/dorado/read_pipeline/AlignerNode.h +++ b/dorado/read_pipeline/AlignerNode.h @@ -1,5 +1,6 @@ #pragma once +#include "alignment/BedFile.h" #include "alignment/IndexFileAccess.h" #include "alignment/Minimap2Options.h" #include "read_pipeline/MessageSink.h" @@ -22,7 +23,8 @@ class Minimap2Index; class AlignerNode : public MessageSink { public: AlignerNode(std::shared_ptr index_file_access, - const std::string& filename, + const std::string& index_file, + const std::string& bed_file, const alignment::Minimap2Options& options, int threads); AlignerNode(std::shared_ptr index_file_access, int threads); @@ -38,9 +40,12 @@ class AlignerNode : public MessageSink { void input_thread_fn(); std::shared_ptr get_index(const ReadCommon& read_common); void align_read_common(ReadCommon& read_common, mm_tbuf_t* tbuf); + void add_bed_hits_to_record(const std::string& genome, BamPtr& record); std::shared_ptr m_index_for_bam_messages{}; + std::vector m_header_sequences_for_bam_messages{}; std::shared_ptr m_index_file_access{}; + alignment::BedFile m_bed_file_for_bam_messages{}; }; } // namespace dorado diff --git a/dorado/summary/summary.cpp b/dorado/summary/summary.cpp index 6f00e956..687c7e4f 100644 --- a/dorado/summary/summary.cpp +++ b/dorado/summary/summary.cpp @@ -57,7 +57,7 @@ std::vector SummaryData::s_alignment_fields = { "alignment_length", "alignment_num_aligned", "alignment_num_correct", "alignment_num_insertions", "alignment_num_deletions", "alignment_num_substitutions", "alignment_mapq", "alignment_strand_coverage", "alignment_identity", - "alignment_accuracy"}; + "alignment_accuracy", "alignment_bed_hits"}; SummaryData::SummaryData() = default; @@ -218,6 +218,7 @@ bool SummaryData::write_rows_from_reader( float strand_coverage = 0.0; float alignment_identity = 0.0; float alignment_accurary = 0.0; + int alignment_bed_hits = 0; if (reader.is_aligned && !(reader.record->core.flag & BAM_FUNMAP)) { alignment_mapq = static_cast(reader.record->core.qual); @@ -244,6 +245,7 @@ bool SummaryData::write_rows_from_reader( alignment_identity = alignment_num_correct / static_cast(alignment_counts.matches); alignment_accurary = alignment_num_correct / static_cast(alignment_length); + alignment_bed_hits = reader.get_tag("bh"); } writer << m_separator << alignment_genome << m_separator << alignment_genome_start @@ -254,7 +256,8 @@ bool SummaryData::write_rows_from_reader( << alignment_num_insertions << m_separator << alignment_num_deletions << m_separator << alignment_num_substitutions << m_separator << alignment_mapq << m_separator << strand_coverage << m_separator << alignment_identity - << m_separator << alignment_accurary; + << m_separator << alignment_accurary << m_separator << alignment_bed_hits; + ; } writer << '\n'; } diff --git a/dorado/utils/string_utils.h b/dorado/utils/string_utils.h index be96a3a5..fae6d1c8 100644 --- a/dorado/utils/string_utils.h +++ b/dorado/utils/string_utils.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include @@ -40,4 +41,9 @@ inline bool ends_with(std::string_view str, std::string_view suffix) { return str.substr(str.length() - suffix.length()) == suffix; } +inline void rtrim(std::string& s) { + s.erase(std::find_if(s.rbegin(), s.rend(), [](int ch) { return !std::isspace(ch); }).base(), + s.end()); +} + } // namespace dorado::utils diff --git a/tests/AlignerTest.cpp b/tests/AlignerTest.cpp index c6ba4a41..4f6652d7 100644 --- a/tests/AlignerTest.cpp +++ b/tests/AlignerTest.cpp @@ -136,7 +136,7 @@ TEST_CASE_METHOD(AlignerNodeTestFixture, "AlignerTest: Check standard alignment" options.kmer_size = options.window_size = 15; options.index_batch_size = 1'000'000'000ull; dorado::HtsReader reader(query.string(), std::nullopt); - auto bam_records = RunPipelineWithBamMessages(reader, ref.string(), options, 10); + auto bam_records = RunPipelineWithBamMessages(reader, ref.string(), "", options, 10); REQUIRE(bam_records.size() == 1); bam1_t* rec = bam_records[0].get(); @@ -160,6 +160,48 @@ TEST_CASE_METHOD(AlignerNodeTestFixture, "AlignerTest: Check standard alignment" } } +TEST_CASE_METHOD(AlignerNodeTestFixture, "AlignerTest: Check alignment with bed file", TEST_GROUP) { + using Catch::Matchers::Contains; + + fs::path aligner_test_dir = fs::path(get_aligner_data_dir()); + auto ref = aligner_test_dir / "target.fq"; + auto query = aligner_test_dir / "target.fq"; + auto bed = aligner_test_dir / "target.bed"; + + auto options = dorado::alignment::dflt_options; + options.kmer_size = options.window_size = 15; + options.index_batch_size = 1'000'000'000ull; + dorado::HtsReader reader(query.string(), std::nullopt); + auto bam_records = RunPipelineWithBamMessages(reader, ref.string(), bed.string(), options, 10); + REQUIRE(bam_records.size() == 1); + + bam1_t* rec = bam_records[0].get(); + bam1_t* in_rec = reader.record.get(); + + // Check input/output reads are matching. + std::string orig_read = dorado::utils::extract_sequence(in_rec); + std::string aligned_read = dorado::utils::extract_sequence(rec); + + // Check quals are matching. + std::vector orig_qual = dorado::utils::extract_quality(in_rec); + std::vector aligned_qual = dorado::utils::extract_quality(rec); + CHECK(orig_qual == aligned_qual); + + // Check aux tags. + uint32_t l_aux = bam_get_l_aux(rec); + std::string aux((char*)bam_get_aux(rec), (char*)(bam_get_aux(rec) + l_aux)); + std::string tags[] = {"NMi", "msi", "ASi", "nni", "def", "tpA", "cmi", "s1i", "rli", "bhi"}; + for (auto tag : tags) { + CHECK_THAT(aux, Contains(tag)); + } + auto bh_tag_ptr = bam_aux_get(rec, "bh"); + auto bh_tag_type = *(char*)bh_tag_ptr; + CHECK(bh_tag_type == 'i'); + int32_t bh_tag_value = 0; + memcpy(&bh_tag_value, bh_tag_ptr + 1, 4); + CHECK(bh_tag_value == 3); +} + TEST_CASE_METHOD(AlignerNodeTestFixture, "AlignerTest: Check supplementary alignment", TEST_GROUP) { using Catch::Matchers::Contains; @@ -171,7 +213,7 @@ TEST_CASE_METHOD(AlignerNodeTestFixture, "AlignerTest: Check supplementary align options.kmer_size = options.window_size = 15; options.index_batch_size = 1'000'000'000ull; dorado::HtsReader reader(query.string(), std::nullopt); - auto bam_records = RunPipelineWithBamMessages(reader, ref.string(), options, 10); + auto bam_records = RunPipelineWithBamMessages(reader, ref.string(), "", options, 10); REQUIRE(bam_records.size() == 2); // Check first alignment is primary. @@ -210,7 +252,7 @@ TEST_CASE_METHOD(AlignerNodeTestFixture, options.kmer_size = options.window_size = 15; options.index_batch_size = 1'000'000'000ull; dorado::HtsReader reader(query.string(), std::nullopt); - auto bam_records = RunPipelineWithBamMessages(reader, ref.string(), options, 10); + auto bam_records = RunPipelineWithBamMessages(reader, ref.string(), "", options, 10); REQUIRE(bam_records.size() == 1); bam1_t* rec = bam_records[0].get(); @@ -244,7 +286,7 @@ TEST_CASE_METHOD(AlignerNodeTestFixture, options.kmer_size = options.window_size = 15; options.index_batch_size = 1'000'000'000ull; dorado::HtsReader reader(query.string(), std::nullopt); - auto bam_records = RunPipelineWithBamMessages(reader, ref.string(), options, 10); + auto bam_records = RunPipelineWithBamMessages(reader, ref.string(), "", options, 10); REQUIRE(bam_records.size() == 1); bam1_t* rec = bam_records[0].get(); @@ -272,7 +314,7 @@ TEST_CASE_METHOD(AlignerNodeTestFixture, options.index_batch_size = 1'000'000'000ull; options.soft_clipping = GENERATE(true, false); dorado::HtsReader reader(query.string(), std::nullopt); - auto bam_records = RunPipelineWithBamMessages(reader, ref.string(), options, 10); + auto bam_records = RunPipelineWithBamMessages(reader, ref.string(), "", options, 10); REQUIRE(bam_records.size() == 3); bam1_t* primary_rec = bam_records[0].get(); @@ -316,7 +358,7 @@ TEST_CASE_METHOD(AlignerNodeTestFixture, options.kmer_size = options.window_size = 28; options.index_batch_size = 1'000'000'000ull; dorado::HtsReader reader(query.string(), std::nullopt); - auto bam_records = RunPipelineWithBamMessages(reader, ref.string(), options, 2); + auto bam_records = RunPipelineWithBamMessages(reader, ref.string(), "", options, 2); CHECK(bam_records.size() == 2); // Generates 2 alignments. } @@ -326,7 +368,7 @@ TEST_CASE_METHOD(AlignerNodeTestFixture, options.kmer_size = options.window_size = 5; options.index_batch_size = 1'000'000'000ull; dorado::HtsReader reader(query.string(), std::nullopt); - auto bam_records = RunPipelineWithBamMessages(reader, ref.string(), options, 2); + auto bam_records = RunPipelineWithBamMessages(reader, ref.string(), "", options, 2); CHECK(bam_records.size() == 1); // Generates 1 alignment. } } @@ -339,7 +381,7 @@ TEST_CASE("AlignerTest: Check AlignerNode crashes if multi index encountered", T options.kmer_size = options.window_size = 5; options.index_batch_size = 1000ull; auto index_file_access = std::make_shared(); - CHECK_THROWS(dorado::AlignerNode(index_file_access, ref.string(), options, 1)); + CHECK_THROWS(dorado::AlignerNode(index_file_access, ref.string(), "", options, 1)); } SCENARIO_METHOD(AlignerNodeTestFixture, "AlignerNode push SimplexRead", TEST_GROUP) { @@ -483,7 +525,7 @@ TEST_CASE_METHOD(AlignerNodeTestFixture, // Get the sam line from BAM pipeline dorado::HtsReader bam_reader(query, std::nullopt); - auto bam_records = RunPipelineWithBamMessages(bam_reader, ref, options, 2); + auto bam_records = RunPipelineWithBamMessages(bam_reader, ref, "", options, 2); CHECK(bam_records.size() == 1); auto sam_line_from_bam_ptr = get_sam_line_from_bam(std::move(bam_records[0])); @@ -549,7 +591,7 @@ TEST_CASE_METHOD(AlignerNodeTestFixture, options.index_batch_size = 1'000'000'000ull; options.soft_clipping = GENERATE(true, false); dorado::HtsReader reader(query.string(), std::nullopt); - auto bam_records = RunPipelineWithBamMessages(reader, ref.string(), options, 1); + auto bam_records = RunPipelineWithBamMessages(reader, ref.string(), "", options, 1); REQUIRE(bam_records.size() == 3); bam1_t* primary_rec = bam_records[0].get(); diff --git a/tests/BedFileTest.cpp b/tests/BedFileTest.cpp new file mode 100644 index 00000000..d8180c4e --- /dev/null +++ b/tests/BedFileTest.cpp @@ -0,0 +1,24 @@ +#include "alignment/BedFile.h" + +#include "TestUtils.h" + +#include + +#define CUT_TAG "[BedFile]" + +TEST_CASE(CUT_TAG ": test bedfile loading", CUT_TAG) { + auto data_dir = get_data_dir("bedfile_test"); + auto test_file = (data_dir / "test_bed.bed").string(); + dorado::alignment::BedFile bed; + bed.load(test_file); + const auto& entries = bed.entries("Lambda"); + REQUIRE(entries.size() == size_t(4)); + std::vector expected_starts{40000, 41000, 80000, 81000}; + std::vector expected_dir{'+', '+', '-', '+'}; + size_t expected_length = 1000; + for (size_t i = 0; i < entries.size(); ++i) { + REQUIRE(entries[i].start == expected_starts[i]); + REQUIRE(entries[i].end == expected_starts[i] + expected_length); + REQUIRE(entries[i].strand == expected_dir[i]); + } +} diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 64bdc7ad..9adc8944 100755 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -17,6 +17,7 @@ add_executable(dorado_tests BarcodeClassifierSelectorTest.cpp BarcodeClassifierTest.cpp BarcodeDemuxerNodeTest.cpp + BedFileTest.cpp CliUtilsTest.cpp CRFModelConfigTest.cpp CustomBarcodeParserTest.cpp diff --git a/tests/data/aligner_test/target.bed b/tests/data/aligner_test/target.bed new file mode 100644 index 00000000..080c9836 --- /dev/null +++ b/tests/data/aligner_test/target.bed @@ -0,0 +1,6 @@ +read_0 100 200 COMMENTLINE1 100 + +read_0 200 300 COMMENTLINE2 200 + +read_0 300 400 COMMENTLINE3 300 - +read_0 400 500 COMMENTLINE4 400 + +BACON 40000 41000 COMMENTLINE5 500 + +BACON 81000 82000 COMMENTLINE6 600 - diff --git a/tests/data/bedfile_test/test_bed.bed b/tests/data/bedfile_test/test_bed.bed new file mode 100644 index 00000000..a6e5f84d --- /dev/null +++ b/tests/data/bedfile_test/test_bed.bed @@ -0,0 +1,6 @@ +Lambda 40000 41000 COMMENTLINE1 100 + +Lambda 41000 42000 COMMENTLINE2 200 + +Lambda 80000 81000 COMMENTLINE3 300 - +Lambda 81000 82000 COMMENTLINE4 400 + +BACON 40000 41000 COMMENTLINE5 500 + +BACON 81000 82000 COMMENTLINE6 600 -