Merge branch 'rh/auto_model' into 'master'

Auto model selection See merge request machine-learning/dorado!721
nanoporetech · Nov 29, 2023 · fb85a70 · fb85a70
2 parents 46bbfdd + 27e5387
commit fb85a70
Show file tree

Hide file tree

Showing 51 changed files with 3,267 additions and 329 deletions.
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
@@ -358,8 +358,11 @@ build_archive:macos:x64:
     GIT_STRATEGY: none
     MODEL: dna_r10.4.1_e8.2_400bps_hac@v4.1.0
     BATCH: 384
+    MODEL_SPEED: hac
+    MODEL_VERSION: v4.2.0
   script:
     - bash ./tests/test_simple_basecaller_execution.sh ./dist/bin/dorado ${MODEL} ${BATCH}
+    - bash ./tests/test_simple_auto_basecaller_execution.sh ./dist/bin/dorado ${MODEL_SPEED} ${MODEL_VERSION} ${BATCH}
     - bash ./tests/test_expected_logging.sh ./dist/bin/dorado ${MODEL} ${BATCH}
   interruptible: true
 
@@ -413,6 +416,8 @@ test:linux:arm64:bionic:
     GIT_STRATEGY: none
     MODEL: dna_r10.4.1_e8.2_400bps_hac@v4.1.0
     BATCH: 0
+    MODEL_SPEED: hac
+    MODEL_VERSION: v4.2.0
   needs:
     - build:linux:arm64:bionic
 
@@ -439,6 +444,8 @@ test:macos:x64:
     GIT_STRATEGY: none
     MODEL: dna_r10.4.1_e8.2_400bps_fast@v4.1.0
     BATCH: 0
+    MODEL_SPEED: hac
+    MODEL_VERSION: v4.2.0
 
 # Test that you can run dorado in a clean cuda 20.04 environment
 test_archive:linux:x86:20.04_nvidia:

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -159,8 +159,8 @@ add_subdirectory(${DORADO_3RD_PARTY_SOURCE}/date EXCLUDE_FROM_ALL)
 
 enable_testing()
 
-add_subdirectory(dorado/models)
 add_subdirectory(dorado/utils)
+add_subdirectory(dorado/models)
 
 if("${CUDA_VERSION}" STREQUAL "")
   set(CUDA_VERSION ${CUDAToolkit_VERSION})
@@ -508,11 +508,14 @@ if(NOT CMAKE_SYSTEM_NAME STREQUAL "iOS")
     add_library(dorado_io_lib
         dorado/data_loader/DataLoader.cpp
         dorado/data_loader/DataLoader.h
+        dorado/data_loader/ModelFinder.cpp
+        dorado/data_loader/ModelFinder.h
     )
 
     target_link_libraries(dorado_io_lib
         PUBLIC
-            dorado_lib    
+            dorado_lib
+            dorado_models_lib
         PRIVATE
             ${POD5_LIBRARIES}
             ${HDF5_C_LIBRARIES}

diff --git a/README.md b/README.md
@@ -64,21 +64,28 @@ Dorado is Oxford Nanopore's recommended basecaller for offline basecalling. We a
 The following are helpful commands for getting started with Dorado.
 To see all options and their defaults, run `dorado -h` and `dorado <subcommand> -h`.
 
+### Model selection foreword
+
+Dorado can automatically select a basecalling model using a selection of model speed (`fast`, `hac`, `sup`) and the pod5 data. This feature is **not** supported for fast5 data. If the model does not exist locally, dorado will automatically downloaded the model and delete it when finished. To re-use downloaded models, manually download models using `dorado download`.
+
+Dorado continues to support model paths.
+
+For details read [Automatic model selection complex](#automatic-model-selection-complex).
+
 ### Simplex basecalling
 
-To run Dorado basecalling, download a model and point it to either a directory of POD5 files or a single POD5 file _(.fast5 files are supported, but will not be as performant)_.
+To run Dorado basecalling, using the automatically downloaded `hac` model on a directory of POD5 files or a single POD5 file _(.fast5 files are supported, but will not be as performant)_.
 
 ```
-$ dorado download --model dna_r10.4.1_e8.2_400bps_hac@v4.1.0
-$ dorado basecaller dna_r10.4.1_e8.2_400bps_hac@v4.1.0 pod5s/ > calls.bam
+$ dorado basecaller hac pod5s/ > calls.bam
 ```
 
 To basecall a single file, simply replace the directory `pod5s/` with a path to your data file.
 
 If basecalling is interrupted, it is possible to resume basecalling from a BAM file. To do so, use the `--resume-from` flag to specify the path to the incomplete BAM file. For example:
 
 ```
-$ dorado basecaller dna_r10.4.1_e8.2_400bps_hac@v4.1.0 pod5s --resume-from incomplete.bam > calls.bam
+$ dorado basecaller hac pod5s --resume-from incomplete.bam > calls.bam
 ```
 
 `calls.bam` will contain all of the reads from `incomplete.bam` plus the new basecalls *(`incomplete.bam` can be discarded after basecalling is complete)*.
@@ -89,10 +96,10 @@ $ dorado basecaller dna_r10.4.1_e8.2_400bps_hac@v4.1.0 pod5s --resume-from incom
 
 Beyond the traditional A, T, C, and G basecalling, Dorado can also detect modified bases such as 5-methylcytosine (5mC), 5-hydroxymethylcytosine (5hmC), and N<sup>6</sup>-methyladenosine (6mA). These modified bases play crucial roles in epigenetic regulation.
 
-To call modifications, add `--modified-bases` to the basecaller command:
+To call modifications, extend the models argument with a comma-separated list of modifications:
 
 ```
-$ dorado basecaller dna_r10.4.1_e8.2_400bps_hac@v4.1.0 pod5s/ --modified-bases 5mCG_5hmCG > calls.bam
+$ dorado basecaller hac,5mCG_5hmCG pod5s/ > calls.bam
 ```
 
 Refer to the [DNA models](#dna-models) table's _Compatible Modifications_ column to see available modifications that can be called with the `--modified-bases` option.
@@ -102,7 +109,7 @@ Refer to the [DNA models](#dna-models) table's _Compatible Modifications_ column
 To run Duplex basecalling, run the command:
 
 ```
-$ dorado duplex dna_r10.4.1_e8.2_400bps_sup@v4.1.0 pod5s/ > duplex.bam
+$ dorado duplex sup pod5s/ > duplex.bam
 ```
 
 When using the `duplex` command, two types of DNA sequence results will be produced: 'simplex' and 'duplex'. Any specific position in the DNA which is in a duplex read is also seen in two simplex strands (the template and complement).  So, each DNA position which is duplex sequenced will be covered by a minimum of three separate readings in the output.
@@ -280,6 +287,35 @@ Below is a table of the available basecalling models and the modified basecallin
 | rna002_70bps_hac@v3 | N/A | N/A | 3 kHz |
 
 
+## Automatic model selection complex
+
+The `model` argument in dorado can specify either a model path or a model **_complex_**. A model complex must start with the **simplex model speed**, and follows this syntax:
+
+```
+(fast|hac|sup)[@(version|latest)][,modification[@(version|latest)]][,...]
+```
+
+Here are a few examples of model complexes:
+
+| Model Complex | Description |
+| :------------ | :---------- |
+| fast  | Latest compatible **fast** model |
+| hac  | Latest compatible **hac** model |
+| sup  | Latest compatible **sup** model |
+| hac@latest | Latest compatible **hac** model |
+| hac@v4.2.0  | Compatible **hac** model with version `v4.2.0`  |
+| hac@v3.5 | Compatible **hac** model with version `v3.5.0`  |
+| hac,5mCG_5hmCG  | Latest compatible **hac** model and latest **5mCG_5hmCG** modifications model |
+| hac,5mCG_5hmCG@v2  | Latest compatible **hac** model and **5mCG_5hmCG** modifications model with version `v2.0.0`  |
+| sup,5mCG_5hmCG,6mA  | Latest compatible **sup** model and both **5mCG_5hmCG** and **6mA** latest modifications models  |
+
+Automatically selected modification models will always match the base simplex model version selected. Noting the highlighted version changes, for example:
+
+| Model Complex | Description | Models |
+| :------------ | :---------- | :---------- |
+| sup,5mCG_5hmCG  | Latest compatible **sup** model and latest **5mCG_5hmCG** modifications model (`v3.1.0`) | dna_r10.4.1_e8.2_400bps_sup@v4.2.0 <br /> dna_r10.4.1_e8.2_400bps_sup@v4.2.0_5mCG_5hmCG@v3.1   |
+| sup@v4.1,5mCG_5hmCG  | Compatible **sup** model with version `v4.1.0` and latest **5mCG_5hmCG** modifications model (`v2.0.0`) |  dna_r10.4.1_e8.2_400bps_sup@`v4.1.0`<br />dna_r10.4.1_e8.2_400bps_sup@`v4.1.0`_5mCG_5hmCG@`v2`   |
+
 ## Developer quickstart
 
 ### Linux dependencies

diff --git a/dorado/cli/basecaller.cpp b/dorado/cli/basecaller.cpp
@@ -1,6 +1,7 @@
 #include "Version.h"
 #include "cli/cli_utils.h"
 #include "data_loader/DataLoader.h"
+#include "data_loader/ModelFinder.h"
 #include "models/models.h"
 #include "nn/CRFModelConfig.h"
 #include "nn/ModBaseRunner.h"
@@ -19,32 +20,39 @@
 #include "utils/bam_utils.h"
 #include "utils/barcode_kits.h"
 #include "utils/basecaller_utils.h"
+#include "utils/fs_utils.h"
 #include "utils/log_utils.h"
 #include "utils/parameters.h"
 #include "utils/stats.h"
+#include "utils/string_utils.h"
 #include "utils/sys_stats.h"
 #include "utils/torch_utils.h"
 
 #include <htslib/sam.h>
 #include <spdlog/spdlog.h>
 
 #include <algorithm>
+#include <cstdlib>
+#include <exception>
 #include <filesystem>
 #include <fstream>
 #include <iostream>
 #include <memory>
+#include <optional>
 #include <sstream>
 #include <thread>
 
 namespace dorado {
 
 using dorado::utils::default_parameters;
 using namespace std::chrono_literals;
+using namespace dorado::models;
+namespace fs = std::filesystem;
 
 void setup(std::vector<std::string> args,
-           const std::filesystem::path& model_path,
+           const fs::path& model_path,
            const std::string& data_path,
-           const std::string& remora_models,
+           const std::vector<fs::path>& remora_models,
            const std::string& device,
            const std::string& ref,
            size_t chunk_size,
@@ -70,10 +78,11 @@ void setup(std::vector<std::string> args,
            bool barcode_no_trim,
            const std::string& barcode_sample_sheet,
            argparse::ArgumentParser& resume_parser,
-           bool estimate_poly_a) {
-    auto model_config = load_crf_model_config(model_path);
-    std::string model_name = models::extract_model_from_model_path(model_path.string());
-    std::string modbase_model_names = models::extract_model_from_model_paths(remora_models);
+           bool estimate_poly_a,
+           const ModelSelection& model_selection) {
+    const auto model_config = load_crf_model_config(model_path);
+    const std::string model_name = models::extract_model_name_from_path(model_path);
+    const std::string modbase_model_names = models::extract_model_names_from_paths(remora_models);
 
     if (!DataLoader::is_read_data_present(data_path, recursive_file_loading)) {
         std::string err = "No POD5 or FAST5 data found in path: " + data_path;
@@ -213,13 +222,25 @@ void setup(std::vector<std::string> args,
         // sub parser only knows about the `basecaller` command.
         tokens.erase(tokens.begin());
         resume_parser.parse_args(tokens);
-        auto resume_model_name =
-                models::extract_model_from_model_path(resume_parser.get<std::string>("model"));
-        if (model_name != resume_model_name) {
+
+        const std::string model_arg = resume_parser.get<std::string>("model");
+        const ModelSelection resume_selection = ModelComplexParser::parse(model_arg);
+
+        if (resume_selection.is_path()) {
+            // If the model selection is a path, check it exists and matches
+            const auto resume_model_name =
+                    models::extract_model_name_from_path(fs::path(model_arg));
+            if (model_name != resume_model_name) {
+                throw std::runtime_error(
+                        "Resume only works if the same model is used. Resume model was " +
+                        resume_model_name + " and current model is " + model_name);
+            }
+        } else if (resume_selection != model_selection) {
             throw std::runtime_error(
-                    "Resume only works if the same model is used. Resume model was " +
-                    resume_model_name + " and current model is " + model_name);
+                    "Resume only works if the same model is used. Resume model complex was " +
+                    resume_selection.raw + " and current model is " + model_selection.raw);
         }
+
         // Resume functionality injects reads directly into the writer node.
         ResumeLoaderNode resume_loader(hts_writer_ref, resume_from_file);
         resume_loader.copy_completed_reads();
@@ -268,7 +289,9 @@ int basecaller(int argc, char* argv[]) {
 
     cli::ArgParser parser("dorado");
 
-    parser.visible.add_argument("model").help("the basecaller model to run.");
+    parser.visible.add_argument("model").help(
+            "model selection {fast,hac,sup}@v{version} for automatic model selection including "
+            "modbases, or path to existing model directory");
 
     parser.visible.add_argument("data").help("the data directory or file (POD5/FAST5 format).");
 
@@ -324,7 +347,7 @@ int basecaller(int argc, char* argv[]) {
     parser.visible.add_argument("--modified-bases")
             .nargs(argparse::nargs_pattern::at_least_one)
             .action([](const std::string& value) {
-                const auto& mods = models::modified_mods();
+                const auto& mods = models::modified_model_variants();
                 if (std::find(mods.begin(), mods.end(), value) == mods.end()) {
                     spdlog::error(
                             "'{}' is not a supported modification please select from {}", value,
@@ -405,24 +428,22 @@ int basecaller(int argc, char* argv[]) {
         utils::SetVerboseLogging(static_cast<dorado::utils::VerboseLogLevel>(verbosity));
     }
 
-    auto model = parser.visible.get<std::string>("model");
-    auto mod_bases = parser.visible.get<std::vector<std::string>>("--modified-bases");
-    auto mod_bases_models = parser.visible.get<std::string>("--modified-bases-models");
+    const auto model_arg = parser.visible.get<std::string>("model");
+    const auto data = parser.visible.get<std::string>("data");
+    const auto recursive = parser.visible.get<bool>("--recursive");
+    const auto mod_bases = parser.visible.get<std::vector<std::string>>("--modified-bases");
+    const auto mod_bases_models = parser.visible.get<std::string>("--modified-bases-models");
+
+    const ModelSelection model_selection = cli::parse_model_argument(model_arg);
 
-    if (!mod_bases.empty() && !mod_bases_models.empty()) {
+    auto ways = {model_selection.has_mods_variant(), !mod_bases.empty(), !mod_bases_models.empty()};
+    if (std::count(ways.begin(), ways.end(), true) > 1) {
         spdlog::error(
-                "only one of --modified-bases or --modified-bases-models should be specified.");
+                "only one of --modified-bases, --modified-bases-models, or modified models set "
+                "via models argument can be used at once",
+                model_arg);
         std::exit(EXIT_FAILURE);
-    } else if (mod_bases.size()) {
-        std::vector<std::string> m;
-        std::transform(
-                mod_bases.begin(), mod_bases.end(), std::back_inserter(m),
-                [&model](std::string m) { return models::get_modification_model(model, m); });
-
-        mod_bases_models =
-                std::accumulate(std::next(m.begin()), m.end(), m[0],
-                                [](std::string a, std::string b) { return a + "," + b; });
-    }
+    };
 
     auto methylation_threshold = parser.visible.get<float>("--modified-bases-threshold");
     if (methylation_threshold < 0.f || methylation_threshold > 1.f) {
@@ -436,11 +457,12 @@ int basecaller(int argc, char* argv[]) {
     auto emit_sam = parser.visible.get<bool>("--emit-sam");
 
     if (emit_fastq && emit_sam) {
-        throw std::runtime_error("Only one of --emit-{fastq, sam} can be set (or none).");
+        spdlog::error("Only one of --emit-{fastq, sam} can be set (or none).");
+        std::exit(EXIT_FAILURE);
     }
 
     if (emit_fastq) {
-        if (!mod_bases.empty() || !mod_bases_models.empty()) {
+        if (model_selection.has_mods_variant() || !mod_bases.empty() || !mod_bases_models.empty()) {
             spdlog::error(
                     "--emit-fastq cannot be used with modbase models as FASTQ cannot store modbase "
                     "results.");
@@ -460,18 +482,49 @@ int basecaller(int argc, char* argv[]) {
         output_mode = HtsWriter::OutputMode::UBAM;
     }
 
+    fs::path model_path;
+    std::vector<fs::path> mods_model_paths;
+    std::set<fs::path> temp_download_paths;
+
+    if (model_selection.is_path()) {
+        model_path = fs::path(model_arg);
+
+        if (mod_bases.size() > 0) {
+            std::transform(mod_bases.begin(), mod_bases.end(), std::back_inserter(mods_model_paths),
+                           [&model_arg](std::string m) {
+                               return fs::path(models::get_modification_model(model_arg, m));
+                           });
+        } else if (mod_bases_models.size() > 0) {
+            const auto split = utils::split(mod_bases_models, ',');
+            std::transform(split.begin(), split.end(), std::back_inserter(mods_model_paths),
+                           [&](std::string m) { return fs::path(m); });
+        }
+
+    } else {
+        auto model_finder = cli::model_finder(model_selection, data, recursive, true);
+        try {
+            model_path = model_finder.fetch_simplex_model();
+            if (model_selection.has_mods_variant()) {
+                mods_model_paths = model_finder.fetch_mods_models();
+            }
+            temp_download_paths = model_finder.downloaded_models();
+        } catch (std::exception& e) {
+            spdlog::error(e.what());
+            std::exit(EXIT_FAILURE);
+        }
+    }
+
     spdlog::info("> Creating basecall pipeline");
 
     try {
-        setup(args, model, parser.visible.get<std::string>("data"), mod_bases_models,
-              parser.visible.get<std::string>("-x"), parser.visible.get<std::string>("--reference"),
-              parser.visible.get<int>("-c"), parser.visible.get<int>("-o"),
-              parser.visible.get<int>("-b"), default_parameters.num_runners,
-              default_parameters.remora_batchsize, default_parameters.remora_threads,
-              methylation_threshold, output_mode, parser.visible.get<bool>("--emit-moves"),
-              parser.visible.get<int>("--max-reads"), parser.visible.get<int>("--min-qscore"),
-              parser.visible.get<std::string>("--read-ids"),
-              parser.visible.get<bool>("--recursive"),
+        setup(args, model_path, data, mods_model_paths, parser.visible.get<std::string>("-x"),
+              parser.visible.get<std::string>("--reference"), parser.visible.get<int>("-c"),
+              parser.visible.get<int>("-o"), parser.visible.get<int>("-b"),
+              default_parameters.num_runners, default_parameters.remora_batchsize,
+              default_parameters.remora_threads, methylation_threshold, output_mode,
+              parser.visible.get<bool>("--emit-moves"), parser.visible.get<int>("--max-reads"),
+              parser.visible.get<int>("--min-qscore"),
+              parser.visible.get<std::string>("--read-ids"), recursive,
               cli::process_minimap2_arguments(parser, alignment::dflt_options),
               parser.hidden.get<bool>("--skip-model-compatibility-check"),
               parser.hidden.get<std::string>("--dump_stats_file"),
@@ -481,12 +534,13 @@ int basecaller(int argc, char* argv[]) {
               parser.visible.get<bool>("--barcode-both-ends"),
               parser.visible.get<bool>("--no-trim"),
               parser.visible.get<std::string>("--sample-sheet"), resume_parser,
-              parser.visible.get<bool>("--estimate-poly-a"));
+              parser.visible.get<bool>("--estimate-poly-a"), model_selection);
     } catch (const std::exception& e) {
         spdlog::error("{}", e.what());
         return 1;
     }
 
+    utils::clean_temporary_models(temp_download_paths);
     spdlog::info("> Finished");
     return 0;
 }