Skip to content

Commit

Permalink
Merge branch 'DOR-586_bed_file_support' into 'master'
Browse files Browse the repository at this point in the history
DOR-586 Add bed file support to aligner

Closes DOR-586

See merge request machine-learning/dorado!881
  • Loading branch information
tijyojwad committed Mar 13, 2024
2 parents 572d231 + 0fb27ef commit 246b9b9
Show file tree
Hide file tree
Showing 18 changed files with 335 additions and 67 deletions.
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,8 @@ configure_file(dorado/Version.h.in dorado/Version.h)
add_library(dorado_lib
dorado/alignment/alignment_processing_items.cpp
dorado/alignment/alignment_processing_items.h
dorado/alignment/BedFile.cpp
dorado/alignment/BedFile.h
dorado/alignment/IndexFileAccess.cpp
dorado/alignment/IndexFileAccess.h
dorado/alignment/Minimap2Aligner.cpp
Expand Down
1 change: 1 addition & 0 deletions documentation/SAM.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
| pi:Z: | parent read id for a split read |
| sp:i: | start coordinate of split read in parent read signal |
| pt:i: | estimated poly(A/T) tail length in cDNA and dRNA reads |
| bh:i: | number of detected bedfile hits _(only if alignment was performed with a specified bed-file)_ |
| MN:i: | Length of sequence at the time MM and ML were produced |

#### Modified Base Tags
Expand Down
102 changes: 102 additions & 0 deletions dorado/alignment/BedFile.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
#include "BedFile.h"

#include "utils/string_utils.h"

#include <spdlog/spdlog.h>

#include <fstream>
#include <iomanip>
#include <sstream>

namespace dorado::alignment {

BedFile::Entries const BedFile::NO_ENTRIES{};

const std::string & BedFile::filename() const { return m_file_name; }

bool BedFile::load(const std::string & bed_filename) {
m_file_name = bed_filename;

// Read in each line and parse it into the BedFile structure
std::ifstream file_stream(m_file_name, std::ios::ate);
if (file_stream.fail()) {
spdlog::error("Failed to open Bed file for reading: '{}'", m_file_name);
return false;
}
auto file_size = file_stream.tellg();
file_stream.seekg(0);
bool reading_header = true;
bool allow_column_headers = true;
while (!(file_stream.tellg() == (int32_t)file_size || file_stream.tellg() == -1)) {
std::string bed_line;
std::getline(file_stream, bed_line);

if (utils::starts_with(bed_line, "#")) {
continue;
}
// Remove whitespace from end of the line
utils::rtrim(bed_line);
// check for header markers and ignore if present
if (reading_header) {
if (utils::starts_with(bed_line, "browser") || utils::starts_with(bed_line, "track")) {
continue;
}
reading_header = false;
}

// required fields from BED line
std::string reference_name;
size_t start, end;
char strand = '.';
std::istringstream bed_line_stream(bed_line);
bed_line_stream >> reference_name >> start >> end;
std::string next_col_str;
// Guards against only the required fields being present
if (!bed_line_stream.eof()) {
// Strand is the sixth column so track column index
int8_t c_index = 4;
while (bed_line_stream >> next_col_str) {
if (c_index < 6) {
c_index += 1;
} else if (next_col_str == "+") {
strand = '+';
break;
} else if (next_col_str == "-") {
strand = '-';
break;
// 6th column and not "+/-/."
} else if (next_col_str != ".") {
spdlog::error("Invalid data reading strand from bed file '{}'", m_file_name);
return false;
}
// No strand column present and we're at the end
if (bed_line_stream.eof()) {
break;
}
}
}
if (bed_line_stream.fail() && !bed_line_stream.eof()) {
if (allow_column_headers) {
allow_column_headers = false;
spdlog::info(
"Invalid data found reading bed file '{}'. Assuming column headers and "
"skipping line.",
m_file_name);
continue;
}
spdlog::error("Invalid data reading bed file '{}'", m_file_name);
return false;
}
allow_column_headers = false;
m_genomes[reference_name].push_back({bed_line, start, end, strand});
}

return true;
};

const BedFile::Entries & BedFile::entries(const std::string & genome) const {
auto it = m_genomes.find(genome);
return it != m_genomes.end() ? it->second : NO_ENTRIES;
}

} // namespace dorado::alignment
39 changes: 39 additions & 0 deletions dorado/alignment/BedFile.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#pragma once

#include <map>
#include <string>
#include <vector>

namespace dorado::alignment {

class BedFile {
public:
struct Entry {
std::string bed_line;
size_t start;
size_t end;
char strand;
};

using Entries = std::vector<Entry>;

private:
std::map<std::string, Entries> m_genomes;
std::string m_file_name{};
static const Entries NO_ENTRIES;

public:
BedFile() = default;
BedFile(BedFile&& other) = delete;
BedFile(const BedFile&) = delete;
BedFile& operator=(const BedFile&) = delete;
~BedFile() = default;

bool load(const std::string& index_filename);

const Entries& entries(const std::string& genome) const;

const std::string& filename() const;
};

} // namespace dorado::alignment
52 changes: 26 additions & 26 deletions dorado/alignment/IndexFileAccess.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,19 +8,19 @@
namespace dorado::alignment {

const Minimap2Index* IndexFileAccess::get_compatible_index(
const std::string& file,
const std::string& index_file,
const Minimap2IndexOptions& indexing_options) {
auto& compatible_indices = m_index_lut[{file, indexing_options}];
auto& compatible_indices = m_index_lut[{index_file, indexing_options}];
if (compatible_indices.empty()) {
return nullptr;
}
return compatible_indices.begin()->second.get();
}

std::shared_ptr<Minimap2Index> IndexFileAccess::get_exact_index_impl(
const std::string& file,
const std::string& index_file,
const Minimap2Options& options) const {
auto compatible_indices = m_index_lut.find({file, options});
auto compatible_indices = m_index_lut.find({index_file, options});
if (compatible_indices == m_index_lut.end()) {
return nullptr;
}
Expand All @@ -29,31 +29,31 @@ std::shared_ptr<Minimap2Index> IndexFileAccess::get_exact_index_impl(
}

std::shared_ptr<Minimap2Index> IndexFileAccess::get_exact_index(
const std::string& file,
const std::string& index_file,
const Minimap2Options& options) const {
std::lock_guard<std::mutex> lock(m_mutex);
auto exact_index = get_exact_index_impl(file, options);
auto exact_index = get_exact_index_impl(index_file, options);
assert(exact_index && "Cannot access an index which has not been loaded");
return exact_index;
}

bool IndexFileAccess::is_index_loaded_impl(const std::string& file,
bool IndexFileAccess::is_index_loaded_impl(const std::string& index_file,
const Minimap2Options& options) const {
const auto compatible_indices = m_index_lut.find({file, options});
const auto compatible_indices = m_index_lut.find({index_file, options});
if (compatible_indices == m_index_lut.end()) {
return false;
}
return compatible_indices->second.count(options) > 0;
}

std::shared_ptr<Minimap2Index> IndexFileAccess::get_or_load_compatible_index(
const std::string& file,
const std::string& index_file,
const Minimap2Options& options) {
auto index = get_exact_index_impl(file, options);
auto index = get_exact_index_impl(index_file, options);
if (index) {
return index;
}
auto compatible_index = get_compatible_index(file, options);
auto compatible_index = get_compatible_index(index_file, options);
if (!compatible_index) {
return nullptr;
}
Expand All @@ -62,21 +62,21 @@ std::shared_ptr<Minimap2Index> IndexFileAccess::get_or_load_compatible_index(
if (!new_index) {
return nullptr;
}
m_index_lut[{file, options}][options] = new_index;
m_index_lut[{index_file, options}][options] = new_index;
return new_index;
}

bool IndexFileAccess::try_load_compatible_index(const std::string& file,
bool IndexFileAccess::try_load_compatible_index(const std::string& index_file,
const Minimap2Options& options) {
std::lock_guard<std::mutex> lock(m_mutex);
auto index = get_or_load_compatible_index(file, options);
auto index = get_or_load_compatible_index(index_file, options);
return index != nullptr;
}

IndexLoadResult IndexFileAccess::load_index(const std::string& file,
IndexLoadResult IndexFileAccess::load_index(const std::string& index_file,
const Minimap2Options& options,
int num_threads) {
if (try_load_compatible_index(file, options)) {
if (try_load_compatible_index(index_file, options)) {
return IndexLoadResult::success;
}

Expand All @@ -85,36 +85,36 @@ IndexLoadResult IndexFileAccess::load_index(const std::string& file,
return IndexLoadResult::validation_error;
}

auto load_result = new_index->load(file, num_threads);
auto load_result = new_index->load(index_file, num_threads);
if (load_result != IndexLoadResult::success) {
return load_result;
}

std::lock_guard<std::mutex> lock(m_mutex);
m_index_lut[{file, options}][options] = std::move(new_index);
m_index_lut[{index_file, options}][options] = std::move(new_index);
return IndexLoadResult::success;
}

std::shared_ptr<const Minimap2Index> IndexFileAccess::get_index(const std::string& file,
std::shared_ptr<const Minimap2Index> IndexFileAccess::get_index(const std::string& index_file,
const Minimap2Options& options) {
std::lock_guard<std::mutex> lock(m_mutex);
// N.B. Although the index file for a client must be loaded before reads are added to the pipeline
// it is still possible that the index for a read in the pipeline does not have its index loaded
// if the client disconnected and caused the index to be unloaded while there were still reads in
// the pipeline. For this reason we do not assert a non-null index.
return get_or_load_compatible_index(file, options);
return get_or_load_compatible_index(index_file, options);
}

bool IndexFileAccess::is_index_loaded(const std::string& file,
bool IndexFileAccess::is_index_loaded(const std::string& index_file,
const Minimap2Options& options) const {
std::lock_guard<std::mutex> lock(m_mutex);
return is_index_loaded_impl(file, options);
return is_index_loaded_impl(index_file, options);
}

std::string IndexFileAccess::generate_sequence_records_header(
const std::string& file,
const std::string& index_file,
const Minimap2Options& options) const {
auto loaded_index = get_exact_index(file, options);
auto loaded_index = get_exact_index(index_file, options);
assert(loaded_index && "Index must be loaded to generate header records");
auto sequence_records = loaded_index->get_sequence_records_for_header();

Expand All @@ -132,10 +132,10 @@ std::string IndexFileAccess::generate_sequence_records_header(
return header_stream.str();
}

void IndexFileAccess::unload_index(const std::string& file,
void IndexFileAccess::unload_index(const std::string& index_file,
const Minimap2IndexOptions& index_options) {
std::lock_guard<std::mutex> lock(m_mutex);
m_index_lut[{file, index_options}] = {};
m_index_lut[{index_file, index_options}] = {};
}

bool validate_options(const Minimap2Options& options) {
Expand Down
26 changes: 13 additions & 13 deletions dorado/alignment/IndexFileAccess.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,49 +20,49 @@ class IndexFileAccess {

// Returns true if the index is loaded, will also create the index if a compatible
// one is already loaded and return true.
bool try_load_compatible_index(const std::string& file, const Minimap2Options& options);
bool try_load_compatible_index(const std::string& index_file, const Minimap2Options& options);

// By contract the index must be loaded (else assertion failure)
std::shared_ptr<Minimap2Index> get_exact_index(const std::string& file,
std::shared_ptr<Minimap2Index> get_exact_index(const std::string& index_file,
const Minimap2Options& options) const;

// Requires the mutex to be locked before calling.
const Minimap2Index* get_compatible_index(const std::string& file,
const Minimap2Index* get_compatible_index(const std::string& index_file,
const Minimap2IndexOptions& indexing_options);

// Requires the mutex to be locked before calling.
std::shared_ptr<Minimap2Index> get_exact_index_impl(const std::string& file,
std::shared_ptr<Minimap2Index> get_exact_index_impl(const std::string& index_file,
const Minimap2Options& options) const;

// Requires the mutex to be locked before calling.
bool is_index_loaded_impl(const std::string& file, const Minimap2Options& options) const;
bool is_index_loaded_impl(const std::string& index_file, const Minimap2Options& options) const;

// Requires the mutex to be locked before calling.
std::shared_ptr<Minimap2Index> get_or_load_compatible_index(const std::string& file,
std::shared_ptr<Minimap2Index> get_or_load_compatible_index(const std::string& index_file,
const Minimap2Options& options);

public:
IndexLoadResult load_index(const std::string& file,
IndexLoadResult load_index(const std::string& index_file,
const Minimap2Options& options,
int num_threads);

// Returns the index if already loaded, if not loaded will create an index from an
// existing compatible one.
// By contract there must be a loaded index for the file with matching indexing
// By contract there must be a loaded index for the index file with matching indexing
// options, if not there will be an assertion failure.
std::shared_ptr<const Minimap2Index> get_index(const std::string& file,
std::shared_ptr<const Minimap2Index> get_index(const std::string& index_file,
const Minimap2Options& options);

// Unloads all compatible indices for the given file and indexing options.
// Unloads all compatible indices for the given index file and indexing options.
// The underlying minimap index will be deallocated.
void unload_index(const std::string& file, const Minimap2IndexOptions& index_options);
void unload_index(const std::string& index_file, const Minimap2IndexOptions& index_options);

// returns a string containing the sequence records for the requested index
std::string generate_sequence_records_header(const std::string& file,
std::string generate_sequence_records_header(const std::string& index_file,
const Minimap2Options& options) const;

// Testability. Method needed to support utests
bool is_index_loaded(const std::string& file, const Minimap2Options& options) const;
bool is_index_loaded(const std::string& index_file, const Minimap2Options& options) const;
};

bool validate_options(const Minimap2Options& options);
Expand Down

0 comments on commit 246b9b9

Please sign in to comment.