Merge branch 'jdaw/fix-correct-alignments' into 'release-v0.7'

Fix sub-par alignments in `dorado correct`. See merge request machine-learning/dorado!1054.
nanoporetech · Jun 12, 2024 · 3b51c1b · 3b51c1b
2 parents d0df79c + d956314
commit 3b51c1b
Show file tree

Hide file tree

Showing 6 changed files with 22 additions and 11 deletions.
diff --git a/dorado/alignment/Minimap2Index.cpp b/dorado/alignment/Minimap2Index.cpp
@@ -103,9 +103,12 @@ void Minimap2Index::set_mapping_options(const Minimap2MappingOptions& mapping_op
         }
     }
 
-    // Equivalent to "--cap-kalloc 100m --cap-sw-mem 50m"
-    m_mapping_options->cap_kalloc = 100'000'000;
-    m_mapping_options->max_sw_mat = 50'000'000;
+    // Take cap_kalloc and max_sw_mat from the dorado mapping options, which carry dorado's
+    // defaults unless overridden; if either is std::nullopt, keep the minimap2 library default.
+    m_mapping_options->cap_kalloc =
+            mapping_options.cap_kalloc.value_or(m_mapping_options->cap_kalloc);
+    m_mapping_options->max_sw_mat =
+            mapping_options.max_sw_mat.value_or(m_mapping_options->max_sw_mat);
 }
 
 std::shared_ptr<mm_idx_t> Minimap2Index::load_initial_index(const std::string& index_file,

diff --git a/dorado/alignment/Minimap2Options.h b/dorado/alignment/Minimap2Options.h
@@ -57,6 +57,9 @@ struct Minimap2MappingOptions {
     std::optional<std::string> cs;
     std::optional<std::string> dual;
     std::optional<uint64_t> mini_batch_size;
+    // Equivalent to "--cap-kalloc 100m --cap-sw-mem 50m"
+    std::optional<int64_t> cap_kalloc = 100'000'000;
+    std::optional<int64_t> max_sw_mat = 50'000'000;
 };
 
 inline bool operator<(const Minimap2MappingOptions& l, const Minimap2MappingOptions& r) {

diff --git a/dorado/correct/features.cpp b/dorado/correct/features.cpp
@@ -444,7 +444,13 @@ std::vector<WindowFeatures> extract_features(std::vector<std::vector<OverlapWind
             }
             // Sort the filtered overlaps by accuracy score
             std::sort(overlap_windows.begin(), overlap_windows.end(),
-                      [](const OverlapWindow& a, const OverlapWindow& b) {
+                      [&alignments](const OverlapWindow& a, const OverlapWindow& b) {
+                          if (std::fabs(a.accuracy - b.accuracy) < 1e-10) {
+                              const auto& a_qname = alignments.qnames[a.overlap_idx];
+                              const auto& b_qname = alignments.qnames[b.overlap_idx];
+                              return std::lexicographical_compare(a_qname.begin(), a_qname.end(),
+                                                                  b_qname.begin(), b_qname.end());
+                          }
                           return a.accuracy > b.accuracy;
                       });
         }

diff --git a/dorado/correct/infer.cpp b/dorado/correct/infer.cpp
@@ -19,8 +19,8 @@ namespace dorado::correction {
 
 int calculate_batch_size(const std::string& device, float memory_fraction) {
     // These sizes are currently hard coded for version 1 model.
-    const float model_mem = 1.f;        // GB
-    const float per_sample_mem = 0.9f;  // GB
+    const float model_mem = 1.f;       // GB
+    const float per_sample_mem = 1.f;  // GB
     float usable_memory = 0.f;
     if (device == "cpu") {
 #if DORADO_METAL_BUILD

diff --git a/dorado/read_pipeline/CorrectionNode.cpp b/dorado/read_pipeline/CorrectionNode.cpp
@@ -235,7 +235,7 @@ void CorrectionNode::infer_fn(const std::string& device_str, int mtx_idx, int ba
                 at::from_blob(lengths.data(), {(int)lengths.size()},
                               at::TensorOptions().dtype(torch::kInt32).device(torch::kCPU));
         auto batched_bases = collate<int>(bases_batch, (int)11, torch::kInt32);
-        auto batched_quals = collate<float>(quals_batch, 0.f, torch::kFloat32);
+        auto batched_quals = collate<float>(quals_batch, 1.f, torch::kFloat32);
 
         std::unique_lock<std::mutex> lock(m_gpu_mutexes[mtx_idx]);
         std::vector<torch::jit::IValue> inputs;

diff --git a/dorado/read_pipeline/ErrorCorrectionMapperNode.cpp b/dorado/read_pipeline/ErrorCorrectionMapperNode.cpp
@@ -56,6 +56,7 @@ void ErrorCorrectionMapperNode::extract_alignments(const mm_reg1_t* reg,
         if (m_read_mutex.find(tname) == m_read_mutex.end()) {
             m_read_mutex.emplace(tname, std::make_unique<std::mutex>());
             CorrectionAlignments new_aln;
+            new_aln.read_name = tname;
             m_correction_records.emplace(tname, std::move(new_aln));
             m_processed_queries_per_target.emplace(tname, std::unordered_set<std::string>());
         }
@@ -115,10 +116,6 @@ void ErrorCorrectionMapperNode::extract_alignments(const mm_reg1_t* reg,
                 continue;
             }
 
-            if (alignments.read_name.empty()) {
-                alignments.read_name = tname;
-            }
-
             alignments.qnames.push_back(qname);
 
             alignments.mm2_cigars.push_back(std::move(cigar));
@@ -255,6 +252,8 @@ ErrorCorrectionMapperNode::ErrorCorrectionMapperNode(const std::string& index_fi
     options.occ_dist = 200;
     options.cs = "short";
     options.dual = "yes";
+    options.cap_kalloc = std::nullopt;
+    options.max_sw_mat = std::nullopt;
 
     m_index = std::make_shared<alignment::Minimap2Index>();
     if (!m_index->initialise(options)) {