Skip to content

Commit

Permalink
Merge branch 'cherry-pick-4eee9969' into 'release-v0.6'
Browse files Browse the repository at this point in the history
Cherry-pick 'DOR-662_mm2_optional_options' into 'release-v0.6'

See merge request machine-learning/dorado!960
  • Loading branch information
tijyojwad committed Apr 19, 2024
2 parents 1cc207a + 9d0e74d commit a2abf83
Show file tree
Hide file tree
Showing 11 changed files with 151 additions and 114 deletions.
7 changes: 3 additions & 4 deletions dorado/alignment/IndexFileAccess.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -111,10 +111,9 @@ bool IndexFileAccess::is_index_loaded(const std::string& index_file,
return is_index_loaded_impl(index_file, options);
}

std::string IndexFileAccess::generate_sequence_records_header(
const std::string& index_file,
const Minimap2Options& options) const {
auto loaded_index = get_exact_index(index_file, options);
std::string IndexFileAccess::generate_sequence_records_header(const std::string& index_file,
const Minimap2Options& options) {
auto loaded_index = get_index(index_file, options);
assert(loaded_index && "Index must be loaded to generate header records");
auto sequence_records = loaded_index->get_sequence_records_for_header();

Expand Down
5 changes: 4 additions & 1 deletion dorado/alignment/IndexFileAccess.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,11 @@ class IndexFileAccess {
void unload_index(const std::string& index_file, const Minimap2IndexOptions& index_options);

// returns a string containing the sequence records for the requested index
// Note, if not loaded will create an index from an existing compatible one.
// By contract there must be a loaded index for the index file with matching indexing
// options, if not there will be an assertion failure.
std::string generate_sequence_records_header(const std::string& index_file,
const Minimap2Options& options) const;
const Minimap2Options& options);

// Testability. Method needed to support utests
bool is_index_loaded(const std::string& index_file, const Minimap2Options& options) const;
Expand Down
25 changes: 14 additions & 11 deletions dorado/alignment/Minimap2Index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,33 +33,36 @@ IndexReaderPtr create_index_reader(const std::string& index_file,
namespace dorado::alignment {

void Minimap2Index::set_index_options(const Minimap2IndexOptions& index_options) {
m_index_options->k = index_options.kmer_size;
m_index_options->w = index_options.window_size;
m_index_options->k = index_options.kmer_size.value_or(m_index_options->k);
m_index_options->w = index_options.window_size.value_or(m_index_options->w);
spdlog::trace("> Index parameters input by user: kmer size={} and window size={}.",
m_index_options->k, m_index_options->w);

m_index_options->batch_size = index_options.index_batch_size;
m_index_options->mini_batch_size = index_options.index_batch_size;
//if not specified override the preset value of 8000000000 with 16000000000
m_index_options->batch_size = index_options.index_batch_size.value_or(16000000000);
m_index_options->mini_batch_size = m_index_options->batch_size;
spdlog::trace("> Index parameters input by user: batch size={} and mini batch size={}.",
m_index_options->batch_size, m_index_options->mini_batch_size);
}

void Minimap2Index::set_mapping_options(const Minimap2MappingOptions& mapping_options) {
m_mapping_options->bw = mapping_options.bandwidth;
m_mapping_options->bw_long = mapping_options.bandwidth_long;
spdlog::trace("> Map parameters input by user: bandwidth={} and bandwidth long={}.",
m_mapping_options->bw, m_mapping_options->bw_long);
m_mapping_options->bw = mapping_options.bandwidth.value_or(m_mapping_options->bw);
m_mapping_options->bw_long =
mapping_options.bandwidth_long.value_or(m_mapping_options->bw_long);
spdlog::trace("> Map parameters: bandwidth={} and bandwidth long={}.", m_mapping_options->bw,
m_mapping_options->bw_long);

if (!mapping_options.print_secondary) {
if (!mapping_options.print_secondary.value_or(true)) {
m_mapping_options->flag |= MM_F_NO_PRINT_2ND;
}
m_mapping_options->best_n = mapping_options.best_n_secondary;
m_mapping_options->best_n =
mapping_options.best_n_secondary.value_or(m_mapping_options->best_n);
spdlog::trace(
"> Map parameters input by user: don't print secondary={} and best n secondary={}.",
static_cast<bool>(m_mapping_options->flag & MM_F_NO_PRINT_2ND),
m_mapping_options->best_n);

if (mapping_options.soft_clipping) {
if (mapping_options.soft_clipping.value_or(false)) {
m_mapping_options->flag |= MM_F_SOFTCLIP;
}
if (mapping_options.secondary_seq) {
Expand Down
33 changes: 19 additions & 14 deletions dorado/alignment/Minimap2Options.h
Original file line number Diff line number Diff line change
@@ -1,16 +1,17 @@
#pragma once

#include <cstdint>
#include <optional>
#include <string>
#include <tuple>

namespace dorado::alignment {

struct Minimap2IndexOptions {
short kmer_size;
short window_size;
uint64_t index_batch_size;
std::string mm2_preset;
std::optional<short> kmer_size;
std::optional<short> window_size;
std::optional<uint64_t> index_batch_size;
std::string mm2_preset; // By default we use a preset, hence not an optional
};

inline bool operator<(const Minimap2IndexOptions& l, const Minimap2IndexOptions& r) {
Expand Down Expand Up @@ -40,12 +41,12 @@ inline bool operator!=(const Minimap2IndexOptions& l, const Minimap2IndexOptions
}

struct Minimap2MappingOptions {
int best_n_secondary;
int bandwidth;
int bandwidth_long;
bool soft_clipping;
bool secondary_seq;
bool print_secondary;
std::optional<int> best_n_secondary;
std::optional<int> bandwidth;
std::optional<int> bandwidth_long;
std::optional<bool> soft_clipping;
bool secondary_seq; // Not available to be set by the user, hence not optional
std::optional<bool> print_secondary;
};

inline bool operator<(const Minimap2MappingOptions& l, const Minimap2MappingOptions& r) {
Expand Down Expand Up @@ -79,7 +80,7 @@ inline bool operator!=(const Minimap2MappingOptions& l, const Minimap2MappingOpt
}

struct Minimap2Options : public Minimap2IndexOptions, public Minimap2MappingOptions {
bool print_aln_seq;
bool print_aln_seq; // Not available to be set by the user, hence not optional
};

inline bool operator==(const Minimap2Options& l, const Minimap2Options& r) {
Expand All @@ -89,7 +90,11 @@ inline bool operator==(const Minimap2Options& l, const Minimap2Options& r) {

inline bool operator!=(const Minimap2Options& l, const Minimap2Options& r) { return !(l == r); }

static const Minimap2Options dflt_options{{19, 19, 16000000000ull, "lr:hq"},
{5, 500, 20000, false, false, true},
false};
static const std::string DEFAULT_MM_PRESET{"lr:hq"};

static const Minimap2Options dflt_options{
{std::nullopt, std::nullopt, std::nullopt, DEFAULT_MM_PRESET},
{std::nullopt, std::nullopt, std::nullopt, std::nullopt, false, std::nullopt},
false};

} // namespace dorado::alignment
4 changes: 2 additions & 2 deletions dorado/cli/aligner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ int aligner(int argc, char* argv[]) {
.action([&](const auto&) { ++verbosity; })
.append();

cli::add_minimap2_arguments(parser, alignment::dflt_options);
cli::add_minimap2_arguments(parser, alignment::DEFAULT_MM_PRESET);

try {
cli::parse(parser, argc, argv);
Expand Down Expand Up @@ -169,7 +169,7 @@ int aligner(int argc, char* argv[]) {
auto threads(parser.visible.get<int>("threads"));

auto max_reads(parser.visible.get<int>("max-reads"));
auto options = cli::process_minimap2_arguments(parser, alignment::dflt_options);
auto options = cli::process_minimap2_arguments<alignment::Minimap2Options>(parser);

alignment::AlignmentProcessingItems processing_items{reads, recursive_input, output_folder,
false};
Expand Down
4 changes: 2 additions & 2 deletions dorado/cli/basecaller.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -500,7 +500,7 @@ int basecaller(int argc, char* argv[]) {
.help("Configuration file for PolyA estimation to change default behaviours")
.default_value(std::string(""));

cli::add_minimap2_arguments(parser, alignment::dflt_options);
cli::add_minimap2_arguments(parser, alignment::DEFAULT_MM_PRESET);
cli::add_internal_arguments(parser);

// Create a copy of the parser to use if the resume feature is enabled. Needed
Expand Down Expand Up @@ -673,7 +673,7 @@ int basecaller(int argc, char* argv[]) {
parser.visible.get<bool>("--emit-moves"), parser.visible.get<int>("--max-reads"),
parser.visible.get<int>("--min-qscore"),
parser.visible.get<std::string>("--read-ids"), recursive,
cli::process_minimap2_arguments(parser, alignment::dflt_options),
cli::process_minimap2_arguments<alignment::Minimap2Options>(parser),
parser.hidden.get<bool>("--skip-model-compatibility-check"),
parser.hidden.get<std::string>("--dump_stats_file"),
parser.hidden.get<std::string>("--dump_stats_filter"),
Expand Down
88 changes: 49 additions & 39 deletions dorado/cli/cli_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#include <cctype>
#include <cmath>
#include <iostream>
#include <optional>
#include <sstream>
#include <string>
#include <utility>
Expand Down Expand Up @@ -182,46 +183,36 @@ inline void add_internal_arguments(ArgParser& parser) {
.default_value(std::string(""));
}

template <class Options>
void add_minimap2_arguments(ArgParser& parser, const Options& dflt) {
inline void add_minimap2_arguments(ArgParser& parser, const std::string& default_preset) {
parser.visible.add_argument("-k")
.help("minimap2 k-mer size for alignment (maximum 28).")
.template default_value<int>(dflt.kmer_size)
.template scan<'i', int>();

parser.visible.add_argument("-w")
.help("minimap2 minimizer window size for alignment.")
.template default_value<int>(dflt.window_size)
.template scan<'i', int>();

parser.visible.add_argument("-I")
.help("minimap2 index batch size.")
.default_value(to_size(double(dflt.index_batch_size)));
parser.visible.add_argument("-I").help("minimap2 index batch size.");

parser.visible.add_argument("--secondary")
.help("minimap2 outputs secondary alignments")
.default_value(to_yes_or_no(dflt.print_secondary));
parser.visible.add_argument("--secondary").help("minimap2 outputs secondary alignments");

parser.visible.add_argument("-N")
.help("minimap2 retains at most INT secondary alignments")
.default_value(dflt.best_n_secondary)
.template scan<'i', int>();

parser.visible.add_argument("-Y")
.help("minimap2 uses soft clipping for supplementary alignments")
.default_value(false)
.implicit_value(true);

parser.visible.add_argument("--bandwidth")
.help("minimap2 chaining/alignment bandwidth and optionally long-join bandwidth "
"specified as NUM,[NUM]")
.default_value(to_size(dflt.bandwidth) + "," + to_size(dflt.bandwidth_long));
"specified as NUM,[NUM]");

// Setting options to lr:hq which is appropriate for high quality nanopore reads.
parser.visible.add_argument("--mm2-preset")
.help("minimap2 preset for indexing and mapping. Alias for the -x "
"option in minimap2.")
.default_value(dflt.mm2_preset);
.default_value(default_preset);

parser.hidden.add_argument("--secondary-seq")
.help("minimap2 output seq/qual for secondary and supplementary alignments")
Expand All @@ -244,33 +235,52 @@ inline void parse(ArgParser& parser, int argc, const char* const argv[]) {
utils::details::extract_dev_options(parser.hidden.get<std::string>("--devopts"));
}

template <typename TO, typename FROM>
std::optional<TO> get_optional_as(const std::optional<FROM>& from_optional) {
if (from_optional) {
return std::make_optional(static_cast<TO>(*from_optional));
} else {
return std::nullopt;
}
}

template <class Options>
Options process_minimap2_arguments(const ArgParser& parser, const Options& dflt) {
Options res;
res.kmer_size = short(parser.visible.get<int>("k"));
res.window_size = short(parser.visible.get<int>("w"));
res.index_batch_size = cli::parse_string_to_size(parser.visible.get<std::string>("I"));
res.print_secondary = cli::parse_yes_or_no(parser.visible.get<std::string>("secondary"));
res.best_n_secondary = parser.visible.get<int>("N");
if (res.best_n_secondary == 0) {
spdlog::warn("Changed '-N 0' to '-N {} --secondary=no'", dflt.best_n_secondary);
res.print_secondary = false;
res.best_n_secondary = dflt.best_n_secondary;
Options process_minimap2_arguments(const ArgParser& parser) {
Options res{};
res.kmer_size = get_optional_as<short>(parser.visible.present<int>("k"));
res.window_size = get_optional_as<short>(parser.visible.present<int>("w"));
auto index_batch_size = parser.visible.present<std::string>("I");
if (index_batch_size) {
res.index_batch_size =
std::make_optional(cli::parse_string_to_size<uint64_t>(*index_batch_size));
}
auto bandwidth = cli::parse_string_to_sizes(parser.visible.get<std::string>("--bandwidth"));
switch (bandwidth.size()) {
case 1:
res.bandwidth = int(bandwidth[0]);
res.bandwidth_long = dflt.bandwidth_long;
break;
case 2:
res.bandwidth = int(bandwidth[0]);
res.bandwidth_long = int(bandwidth[1]);
break;
default:
throw std::runtime_error("Wrong number of arguments for option '-r'.");
auto print_secondary = parser.visible.present<std::string>("--secondary");
if (print_secondary) {
res.print_secondary = std::make_optional(cli::parse_yes_or_no(*print_secondary));
}
res.best_n_secondary = parser.visible.present<int>("N");
if (res.best_n_secondary.value_or(1) == 0) {
spdlog::warn("Ignoring '-N 0', using preset default");
res.print_secondary = std::nullopt;
res.best_n_secondary = std::nullopt;
}

auto optional_bandwidth = parser.visible.present<std::string>("--bandwidth");
if (optional_bandwidth) {
auto bandwidth = cli::parse_string_to_sizes<int>(*optional_bandwidth);
switch (bandwidth.size()) {
case 1:
res.bandwidth = std::make_optional<int>(bandwidth[0]);
break;
case 2:
res.bandwidth = std::make_optional<int>(bandwidth[0]);
res.bandwidth_long = std::make_optional<int>(bandwidth[1]);
break;
default:
throw std::runtime_error("Wrong number of arguments for option '-r'.");
}
}
res.soft_clipping = parser.visible.get<bool>("Y");
res.soft_clipping = parser.visible.present<bool>("Y");
res.mm2_preset = parser.visible.get<std::string>("mm2-preset");
res.secondary_seq = parser.hidden.get<bool>("secondary-seq");
res.print_aln_seq = parser.hidden.get<bool>("print-aln-seq");
Expand Down
4 changes: 2 additions & 2 deletions dorado/cli/duplex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,7 @@ int duplex(int argc, char* argv[]) {
.help("the minimum predicted methylation probability for a modified base to be emitted "
"in an all-context model, [0, 1]");

cli::add_minimap2_arguments(parser, alignment::dflt_options);
cli::add_minimap2_arguments(parser, alignment::DEFAULT_MM_PRESET);
cli::add_internal_arguments(parser);

std::set<fs::path> temp_model_paths;
Expand Down Expand Up @@ -377,7 +377,7 @@ int duplex(int argc, char* argv[]) {
hts_writer = pipeline_desc.add_node<HtsWriter>({}, hts_file, gpu_names);
converted_reads_sink = hts_writer;
} else {
auto options = cli::process_minimap2_arguments(parser, alignment::dflt_options);
auto options = cli::process_minimap2_arguments<alignment::Minimap2Options>(parser);
auto index_file_access = std::make_shared<alignment::IndexFileAccess>();
aligner = pipeline_desc.add_node<AlignerNode>({}, index_file_access, ref, "", options,
std::thread::hardware_concurrency());
Expand Down
4 changes: 2 additions & 2 deletions tests/AlignerTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -321,7 +321,7 @@ TEST_CASE_METHOD(AlignerNodeTestFixture,
bam1_t* supplementary_rec = bam_records[2].get();

// Check aux tags.
if (options.soft_clipping) {
if (*options.soft_clipping) {
CHECK(bam_aux_get(primary_rec, "MM") != nullptr);
CHECK(bam_aux_get(primary_rec, "ML") != nullptr);
CHECK(bam_aux_get(primary_rec, "MN") != nullptr);
Expand Down Expand Up @@ -599,7 +599,7 @@ TEST_CASE_METHOD(AlignerNodeTestFixture,

// Check aux tags.
CHECK_THAT(bam_aux2Z(bam_aux_get(primary_rec, "SA")), Equals("read2,1,+,999S899M,60,0;"));
if (options.soft_clipping) {
if (*options.soft_clipping) {
CHECK_THAT(bam_aux2Z(bam_aux_get(secondary_rec, "SA")),
Equals("read3,1,+,999M899S,0,0;read2,1,+,999S899M,60,0;"));
} else {
Expand Down

0 comments on commit a2abf83

Please sign in to comment.