From 24fb7adf406bc4523716cbde06858fde5e9438d7 Mon Sep 17 00:00:00 2001 From: Joyjit Daw Date: Tue, 19 Dec 2023 14:34:39 -0500 Subject: [PATCH 1/2] Prevent users from running models with overlapping mods this is not supported by dorado and in general discouraged because it leads to multiple models predicting mod base probabilities for the same methylation at the same location and there's no way to disambiguate which one is which. --- dorado/api/runner_creation.cpp | 3 +++ dorado/modbase/ModBaseModelConfig.cpp | 31 +++++++++++++++++++++++ dorado/modbase/ModBaseModelConfig.h | 7 +++-- tests/test_simple_basecaller_execution.sh | 7 +++++ 4 files changed, 46 insertions(+), 2 deletions(-) diff --git a/dorado/api/runner_creation.cpp b/dorado/api/runner_creation.cpp index 2fc55b9e..8d580c7a 100644 --- a/dorado/api/runner_creation.cpp +++ b/dorado/api/runner_creation.cpp @@ -1,6 +1,7 @@ #include "runner_creation.h" #include "basecall/crf_utils.h" +#include "modbase/ModBaseModelConfig.h" #if DORADO_GPU_BUILD #ifdef __APPLE__ @@ -142,6 +143,8 @@ std::vector create_modbase_runners( return {}; } + modbase::check_modbase_multi_model_compatibility(remora_models); + // generate model callers before nodes or it affects the speed calculations std::vector remora_runners; std::vector modbase_devices; diff --git a/dorado/modbase/ModBaseModelConfig.cpp b/dorado/modbase/ModBaseModelConfig.cpp index 1111c7fd..94473dd5 100644 --- a/dorado/modbase/ModBaseModelConfig.cpp +++ b/dorado/modbase/ModBaseModelConfig.cpp @@ -2,6 +2,7 @@ #include "utils/bam_utils.h" #include "utils/sequence_utils.h" +#include "utils/string_utils.h" #include "utils/tensor_utils.h" #include @@ -130,4 +131,34 @@ ModBaseInfo get_modbase_info( return result; } +void check_modbase_multi_model_compatibility( + const std::vector& modbase_models) { + std::string err_msg = ""; + for (size_t i = 0; i < modbase_models.size(); i++) { + auto ref_model = load_modbase_model_config(modbase_models[i]); + const auto& ref_model_mods = ref_model.mod_long_names; + for (size_t j = i + 1; j < modbase_models.size(); j++) { + auto query_model = load_modbase_model_config(modbase_models[j]); + const auto& query_model_mods = query_model.mod_long_names; + + std::vector overlap_mods; + std::set_intersection(ref_model_mods.begin(), ref_model_mods.end(), + query_model_mods.begin(), query_model_mods.end(), + std::back_inserter(overlap_mods)); + + if (!overlap_mods.empty()) { + err_msg += modbase_models[i].string() + " and " + modbase_models[j].string() + + " have overlapping mods: " + utils::join(overlap_mods, ",") + ";"; + } + } + } + + if (!err_msg.empty()) { + throw std::runtime_error( + "Following are incompatible modbase models. Please select only one of them to " + "run:\n" + + err_msg); + } +} + } // namespace dorado::modbase diff --git a/dorado/modbase/ModBaseModelConfig.h b/dorado/modbase/ModBaseModelConfig.h index 84fabbdc..3225c6ae 100644 --- a/dorado/modbase/ModBaseModelConfig.h +++ b/dorado/modbase/ModBaseModelConfig.h @@ -26,10 +26,13 @@ struct ModBaseModelConfig { bool reverse_signal{false}; ///< Reverse model data before processing (rna model) }; -ModBaseModelConfig load_modbase_model_config(const std::filesystem::path & model_path); +ModBaseModelConfig load_modbase_model_config(const std::filesystem::path& model_path); // Determine the modbase alphabet from parameters and calculate offset positions for the results ModBaseInfo get_modbase_info( - const std::vector> & base_mod_params); + const std::vector>& base_mod_params); + +void check_modbase_multi_model_compatibility( + const std::vector& modbase_models); } // namespace dorado::modbase diff --git a/tests/test_simple_basecaller_execution.sh b/tests/test_simple_basecaller_execution.sh index 44f7499a..c25c7c9d 100755 --- a/tests/test_simple_basecaller_execution.sh +++ b/tests/test_simple_basecaller_execution.sh @@ -15,6 +15,7 @@ dorado_bin=$(cd "$(dirname $1)"; pwd -P)/$(basename $1) model_name=${2:-dna_r10.4.1_e8.2_400bps_hac@v4.1.0} batch=${3:-384} model_name_5k=${4:-dna_r10.4.1_e8.2_400bps_hac@v4.2.0} +model_name_5k_v43=${4:-dna_r10.4.1_e8.2_400bps_hac@v4.3.0} data_dir=$test_dir/data output_dir_name=$(echo $RANDOM | head -c 10) output_dir=${test_dir}/${output_dir_name} @@ -27,6 +28,8 @@ $dorado_bin download --model ${model_name} --directory ${output_dir} model=${output_dir}/${model_name} $dorado_bin download --model ${model_name_5k} --directory ${output_dir} model_5k=${output_dir}/${model_name_5k} +$dorado_bin download --model ${model_name_5k_v43} --directory ${output_dir} +model_5k_v43=${output_dir}/${model_name_5k_v43} echo dorado basecaller test stage $dorado_bin basecaller ${model} $data_dir/pod5 -b ${batch} --emit-fastq > $output_dir/ref.fq @@ -56,6 +59,10 @@ if $dorado_bin basecaller ${model} $data_dir/pod5 -b ${batch} --emit-fastq --mod echo "Error: dorado basecaller should fail with combination of emit-fastq and modbase!" exit 1 fi +if $dorado_bin basecaller $model_5k_v43 $data_dir/duplex/pod5 --modified-bases 5mC_5hmC 5mCG_5hmCG > $output_dir/error_condition.fq; then + echo "Error: dorado basecaller should fail with multiple modbase configs having overlapping mods!" + exit 1 +fi set -e echo dorado summary test stage From a023984064ca6933032b6f0692f520c45b9f8bd4 Mon Sep 17 00:00:00 2001 From: Joyjit Daw Date: Wed, 20 Dec 2023 10:00:19 -0500 Subject: [PATCH 2/2] use motif/offset from config instead of the mod long names to determine the overlapping mods --- dorado/modbase/ModBaseModelConfig.cpp | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/dorado/modbase/ModBaseModelConfig.cpp b/dorado/modbase/ModBaseModelConfig.cpp index 94473dd5..0b2fb4c2 100644 --- a/dorado/modbase/ModBaseModelConfig.cpp +++ b/dorado/modbase/ModBaseModelConfig.cpp @@ -2,7 +2,6 @@ #include "utils/bam_utils.h" #include "utils/sequence_utils.h" -#include "utils/string_utils.h" #include "utils/tensor_utils.h" #include @@ -136,19 +135,14 @@ void check_modbase_multi_model_compatibility( std::string err_msg = ""; for (size_t i = 0; i < modbase_models.size(); i++) { auto ref_model = load_modbase_model_config(modbase_models[i]); - const auto& ref_model_mods = ref_model.mod_long_names; + const auto& ref_motif = ref_model.motif[ref_model.motif_offset]; for (size_t j = i + 1; j < modbase_models.size(); j++) { auto query_model = load_modbase_model_config(modbase_models[j]); - const auto& query_model_mods = query_model.mod_long_names; + const auto& query_motif = query_model.motif[query_model.motif_offset]; - std::vector overlap_mods; - std::set_intersection(ref_model_mods.begin(), ref_model_mods.end(), - query_model_mods.begin(), query_model_mods.end(), - std::back_inserter(overlap_mods)); - - if (!overlap_mods.empty()) { + if (ref_motif == query_motif) { err_msg += modbase_models[i].string() + " and " + modbase_models[j].string() + - " have overlapping mods: " + utils::join(overlap_mods, ",") + ";"; + " have overlapping canonical motif: " + ref_motif; } } }