Merge branch 'smalton/DOR-499-basecall-deps' into 'master'

DOR-499: Break basecall<->models library dependency. Closes DOR-499. See merge request machine-learning/dorado!784.
nanoporetech · Dec 21, 2023 · 1893d69 · 1893d69
2 parents e42761c + 17999f6
commit 1893d69
Show file tree

Hide file tree

Showing 7 changed files with 30 additions and 57 deletions.
diff --git a/dorado/basecall/CMakeLists.txt b/dorado/basecall/CMakeLists.txt
@@ -47,7 +47,6 @@ target_link_libraries(dorado_basecall
         ${TORCH_LIBRARIES}
         dorado_utils
     PRIVATE
-        dorado_models_lib
         ${KOI_LIBRARIES}
         spdlog::spdlog
 )

diff --git a/dorado/basecall/CRFModelConfig.cpp b/dorado/basecall/CRFModelConfig.cpp
@@ -1,10 +1,7 @@
 #include "CRFModelConfig.h"
 
-#include "models/models.h"
-
 #include <spdlog/spdlog.h>
 #include <toml.hpp>
-#include <toml/value.hpp>
 
 #include <cstddef>
 #include <set>
@@ -31,6 +28,27 @@ SublayerType sublayer_type(const toml::value &segment) {
     return mapping_iter->second;
 }
 
+// The mean Q-score of short reads is artificially lowered by some
+// lower-quality bases at the beginning of the read. To correct for that,
+// the mean Q-score calculation should ignore the first few bases. The
+// number of bases to ignore depends on the model (default: 60).
+uint32_t get_mean_qscore_start_pos_by_model_name(const std::string &model_name) {
+    static const std::unordered_map<std::string, uint16_t> mean_qscore_start_pos_by_model = {
+            // To add a model-specific start position for an older model,
+            // create an entry keyed by the model name whose value is
+            // the desired start position,
+            // e.g. {"dna_r10.4.1_e8.2_5khz_400bps_fast@v4.2.0", 10}
+    };
+
+    auto iter = mean_qscore_start_pos_by_model.find(model_name);
+    if (iter != mean_qscore_start_pos_by_model.end()) {
+        return iter->second;
+    } else {
+        // No model-specific entry: fall back to the default start position of 60.
+        return 60;
+    }
+}
+
 }  // namespace
 namespace dorado::basecall {
 
@@ -217,6 +235,13 @@ CRFModelConfig load_crf_model_config(const std::filesystem::path &path) {
         config.qscale = toml::find<float>(qscore, "scale");
         if (qscore.contains("mean_qscore_start_pos")) {
             config.mean_qscore_start_pos = toml::find<int32_t>(qscore, "mean_qscore_start_pos");
+        } else {
+            // If information is not present in the config, find start position by model name.
+            std::string model_name = config.model_path.filename().string();
+            config.mean_qscore_start_pos = get_mean_qscore_start_pos_by_model_name(model_name);
+        }
+        if (config.mean_qscore_start_pos < 0) {
+            throw std::runtime_error("Mean q-score start position cannot be < 0");
         }
     } else {
         spdlog::debug("> no qscore calibration found");
@@ -307,19 +332,6 @@ CRFModelConfig load_crf_model_config(const std::filesystem::path &path) {
     return config;
 }
 
-int32_t get_model_mean_qscore_start_pos(const CRFModelConfig &model_config) {
-    int32_t mean_qscore_start_pos = model_config.mean_qscore_start_pos;
-    if (mean_qscore_start_pos < 0) {
-        // If unsuccessful, find start position by model name.
-        std::string model_name = model_config.model_path.filename().string();
-        mean_qscore_start_pos = models::get_mean_qscore_start_pos_by_model_name(model_name);
-    }
-    if (mean_qscore_start_pos < 0) {
-        throw std::runtime_error("Mean q-score start position cannot be < 0");
-    }
-    return mean_qscore_start_pos;
-}
-
 bool is_rna_model(const CRFModelConfig &model_config) {
     auto path = std::filesystem::canonical(model_config.model_path);
     auto filename = path.filename();

diff --git a/dorado/basecall/CRFModelConfig.h b/dorado/basecall/CRFModelConfig.h
@@ -108,8 +108,6 @@ struct CRFModelConfig {
 
 CRFModelConfig load_crf_model_config(const std::filesystem::path& path);
 
-int32_t get_model_mean_qscore_start_pos(const CRFModelConfig& model_config);
-
 bool is_rna_model(const CRFModelConfig& model_config);
 
 }  // namespace dorado::basecall
diff --git a/dorado/cli/basecaller.cpp b/dorado/cli/basecaller.cpp
@@ -181,12 +181,7 @@ void setup(std::vector<std::string> args,
             std::unordered_set<std::string>{}, thread_allocations.read_filter_threads);
 
     auto mean_qscore_start_pos = model_config.mean_qscore_start_pos;
-    if (mean_qscore_start_pos < 0) {
-        mean_qscore_start_pos = models::get_mean_qscore_start_pos_by_model_name(model_name);
-        if (mean_qscore_start_pos < 0) {
-            throw std::runtime_error("Mean q-score start position cannot be < 0");
-        }
-    }
+
     pipelines::create_simplex_pipeline(
             pipeline_desc, std::move(runners), std::move(remora_runners), overlap,
             mean_qscore_start_pos, !adapter_no_trim, thread_allocations.scaler_node_threads,

diff --git a/dorado/cli/duplex.cpp b/dorado/cli/duplex.cpp
@@ -512,13 +512,6 @@ int duplex(int argc, char* argv[]) {
             }
 
             auto mean_qscore_start_pos = models.model_config.mean_qscore_start_pos;
-            if (mean_qscore_start_pos < 0) {
-                mean_qscore_start_pos =
-                        models::get_mean_qscore_start_pos_by_model_name(models.stereo_model_name);
-                if (mean_qscore_start_pos < 0) {
-                    throw std::runtime_error("Mean q-score start position cannot be < 0");
-                }
-            }
 
             pipelines::create_stereo_duplex_pipeline(
                     pipeline_desc, std::move(runners), std::move(stereo_runners),

diff --git a/dorado/models/models.cpp b/dorado/models/models.cpp
@@ -770,14 +770,6 @@ const std::vector<ModelInfo> models = {
 
 }  // namespace modified
 
-const std::unordered_map<std::string, uint16_t> mean_qscore_start_pos_by_model = {
-
-        // To add model specific start positions for older models,
-        // create an entry keyed by model name with the value as
-        // the desired start position.
-        // e.g. {"dna_r10.4.1_e8.2_5khz_400bps_fast@v4.2.0", 10}
-};
-
 std::string calculate_checksum(std::string_view data) {
     // Hash the data.
     std::array<unsigned char, SHA256_DIGEST_LENGTH> hash{};
@@ -1090,17 +1082,7 @@ SamplingRate get_sample_rate_by_model_name(const std::string& model_name) {
         return iter->second.sampling_rate;
     } else {
         // This can only happen if a model_info.chemistry not in chemistries which should be impossible.
-        throw std::logic_error("Couldn't find chemsitry: " + to_string(model_info.chemistry));
-    }
-}
-
-uint32_t get_mean_qscore_start_pos_by_model_name(const std::string& model_name) {
-    auto iter = mean_qscore_start_pos_by_model.find(model_name);
-    if (iter != mean_qscore_start_pos_by_model.end()) {
-        return iter->second;
-    } else {
-        // Assume start position of 60 as default.
-        return 60;
+        throw std::logic_error("Couldn't find chemistry: " + to_string(model_info.chemistry));
     }
 }
 

diff --git a/dorado/models/models.h b/dorado/models/models.h
@@ -59,12 +59,6 @@ std::string get_modification_model(const std::string& simplex_model,
 // get the sampling rate that the model is compatible with
 SamplingRate get_sample_rate_by_model_name(const std::string& model_name);
 
-// the mean Q-score of short reads are artificially lowered because of
-// some lower quality bases at the beginning of the read. to correct for
-// that, mean Q-score calculation should ignore the first few bases. The
-// number of bases to ignore is dependent on the model.
-uint32_t get_mean_qscore_start_pos_by_model_name(const std::string& model_name);
-
 // Extract the model name from the model path.
 std::string extract_model_name_from_path(const std::filesystem::path& model_path);