From f7909572c9630875511982bb29b9888d73a732af Mon Sep 17 00:00:00 2001 From: Steve Malton Date: Fri, 15 Dec 2023 08:48:22 +0000 Subject: [PATCH 1/9] Untemplate ModelRunner as it always uses CPU decode --- dorado/api/pipeline_creation.cpp | 2 +- dorado/api/runner_creation.cpp | 3 +- dorado/basecall/CMakeLists.txt | 15 +++--- dorado/basecall/CudaCRFModel.h | 3 +- dorado/basecall/MetalCRFModel.h | 2 +- dorado/basecall/ModelRunner.cpp | 53 ++++++++++++++++++ dorado/basecall/ModelRunner.h | 71 +------------------------ dorado/basecall/ModelRunnerBase.h | 34 ++++++++++++ dorado/basecall/decode/CPUDecoder.h | 2 +- dorado/basecall/decode/GPUDecoder.h | 2 +- dorado/read_pipeline/BasecallerNode.cpp | 4 +- tests/NodeSmokeTest.cpp | 4 +- 12 files changed, 107 insertions(+), 88 deletions(-) create mode 100644 dorado/basecall/ModelRunner.cpp create mode 100644 dorado/basecall/ModelRunnerBase.h diff --git a/dorado/api/pipeline_creation.cpp b/dorado/api/pipeline_creation.cpp index b7a20896..9262bc1d 100644 --- a/dorado/api/pipeline_creation.cpp +++ b/dorado/api/pipeline_creation.cpp @@ -1,7 +1,7 @@ #include "pipeline_creation.h" #include "basecall/CRFModelConfig.h" -#include "basecall/ModelRunner.h" +#include "basecall/ModelRunnerBase.h" #include "modbase/ModBaseRunner.h" #include "read_pipeline/BasecallerNode.h" #include "read_pipeline/ModBaseCallerNode.h" diff --git a/dorado/api/runner_creation.cpp b/dorado/api/runner_creation.cpp index 80224986..2fc55b9e 100644 --- a/dorado/api/runner_creation.cpp +++ b/dorado/api/runner_creation.cpp @@ -1,7 +1,6 @@ #include "runner_creation.h" #include "basecall/crf_utils.h" -#include "basecall/decode/CPUDecoder.h" #if DORADO_GPU_BUILD #ifdef __APPLE__ @@ -53,7 +52,7 @@ std::pair, size_t> create_basecall_runners( num_cpu_runners); for (size_t i = 0; i < num_cpu_runners; i++) { - runners.push_back(std::make_unique>( + runners.push_back(std::make_unique( model_config, device, int(chunk_size), int(batch_size))); } } diff --git a/dorado/basecall/CMakeLists.txt b/dorado/basecall/CMakeLists.txt index 88bf5df9..a846c87e 100644 --- a/dorado/basecall/CMakeLists.txt +++ b/dorado/basecall/CMakeLists.txt @@ -1,11 +1,13 @@ add_library(dorado_basecall STATIC - crf_utils.h crf_utils.cpp - CRFModel.h + crf_utils.h CRFModel.cpp - CRFModelConfig.h + CRFModel.h CRFModelConfig.cpp + CRFModelConfig.h + ModelRunner.cpp ModelRunner.h + ModelRunnerBase.h decode/beam_search.cpp decode/beam_search.h decode/CPUDecoder.cpp @@ -16,14 +18,14 @@ if (DORADO_GPU_BUILD) if(APPLE) target_sources(dorado_basecall PRIVATE - MetalCRFModel.h MetalCRFModel.cpp + MetalCRFModel.h ) else() target_sources(dorado_basecall PRIVATE - CudaCRFModel.h CudaCRFModel.cpp + CudaCRFModel.h decode/GPUDecoder.cpp decode/GPUDecoder.h ) @@ -32,10 +34,9 @@ endif() target_include_directories(dorado_basecall SYSTEM - PUBLIC - ${DORADO_3RD_PARTY_SOURCE}/toml11 PRIVATE ${DORADO_3RD_PARTY_SOURCE}/NVTX/c/include + ${DORADO_3RD_PARTY_SOURCE}/toml11 ) diff --git a/dorado/basecall/CudaCRFModel.h b/dorado/basecall/CudaCRFModel.h index ce00ebfc..df0dc111 100644 --- a/dorado/basecall/CudaCRFModel.h +++ b/dorado/basecall/CudaCRFModel.h @@ -1,11 +1,12 @@ #pragma once #include "CRFModel.h" -#include "ModelRunner.h" +#include "ModelRunnerBase.h" #include #include +#include #include #include #include diff --git a/dorado/basecall/MetalCRFModel.h b/dorado/basecall/MetalCRFModel.h index 927faef5..d6029b77 100644 --- a/dorado/basecall/MetalCRFModel.h +++ b/dorado/basecall/MetalCRFModel.h @@ -1,6 +1,6 @@ #pragma once -#include "ModelRunner.h" +#include "ModelRunnerBase.h" #include diff --git a/dorado/basecall/ModelRunner.cpp b/dorado/basecall/ModelRunner.cpp new file mode 100644 index 00000000..e59c2adf --- /dev/null +++ b/dorado/basecall/ModelRunner.cpp @@ -0,0 +1,53 @@ +#include "ModelRunner.h" + +#include "CRFModel.h" +#include "decode/CPUDecoder.h" + +namespace dorado::basecall { + +ModelRunner::ModelRunner(const CRFModelConfig &model_config, + const std::string &device, + int chunk_size, + int batch_size) + : m_config(model_config) { + m_decoder_options = decode::DecoderOptions(); + m_decoder_options.q_shift = model_config.qbias; + m_decoder_options.q_scale = model_config.qscale; + m_decoder = std::make_unique(); + + m_options = at::TensorOptions().dtype(decode::CPUDecoder::dtype).device(device); + m_module = load_crf_model(model_config, m_options); + + // adjust chunk size to be a multiple of the stride + chunk_size -= chunk_size % model_config.stride; + + m_input = at::zeros({batch_size, model_config.num_features, chunk_size}, + at::TensorOptions().dtype(decode::CPUDecoder::dtype).device(at::kCPU)); +} + +std::vector ModelRunner::call_chunks(int num_chunks) { + at::InferenceMode guard; + dorado::stats::Timer timer; + auto scores = m_module->forward(m_input.to(m_options.device_opt().value())); + const auto forward_ms = timer.GetElapsedMS(); + auto decoded_chunks = m_decoder->beam_search(scores, num_chunks, m_decoder_options); + const auto forward_plus_decode_ms = timer.GetElapsedMS(); + ++m_num_batches_called; + m_model_ms += forward_ms; + m_decode_ms += forward_plus_decode_ms - forward_ms; + return decoded_chunks; +} + +void ModelRunner::accept_chunk(int chunk_idx, const at::Tensor &chunk) { + m_input.index_put_({chunk_idx, at::indexing::Ellipsis}, chunk); +} + +stats::NamedStats ModelRunner::sample_stats() const { + stats::NamedStats stats; + stats["batches_called"] = double(m_num_batches_called); + stats["model_ms"] = double(m_model_ms); + stats["decode_ms"] = double(m_decode_ms); + return stats; +} + +} // namespace dorado::basecall diff --git a/dorado/basecall/ModelRunner.h b/dorado/basecall/ModelRunner.h index a9dee625..7f1906ba 100644 --- a/dorado/basecall/ModelRunner.h +++ b/dorado/basecall/ModelRunner.h @@ -1,7 +1,7 @@ #pragma once -#include "CRFModel.h" #include "CRFModelConfig.h" +#include "ModelRunnerBase.h" #include "decode/Decoder.h" #include "utils/stats.h" @@ -12,24 +12,6 @@ namespace dorado::basecall { -class ModelRunnerBase { -public: - virtual ~ModelRunnerBase() = default; - virtual void accept_chunk(int chunk_idx, const at::Tensor &chunk) = 0; - virtual std::vector call_chunks(int num_chunks) = 0; - virtual const CRFModelConfig &config() const = 0; - virtual size_t model_stride() const = 0; - virtual size_t chunk_size() const = 0; - virtual size_t batch_size() const = 0; - virtual void terminate() = 0; - virtual void restart() = 0; - virtual std::string get_name() const = 0; - virtual stats::NamedStats sample_stats() const = 0; -}; - -using RunnerPtr = std::unique_ptr; - -template class ModelRunner final : public ModelRunnerBase { public: ModelRunner(const CRFModelConfig &model_config, @@ -51,7 +33,7 @@ class ModelRunner final : public ModelRunnerBase { const CRFModelConfig m_config; at::Tensor m_input; at::TensorOptions m_options; - std::unique_ptr m_decoder; + std::unique_ptr m_decoder; decode::DecoderOptions m_decoder_options; torch::nn::ModuleHolder m_module{nullptr}; @@ -61,53 +43,4 @@ class ModelRunner final : public ModelRunnerBase { std::atomic m_decode_ms = 0; }; -template -ModelRunner::ModelRunner(const CRFModelConfig &model_config, - const std::string &device, - int chunk_size, - int batch_size) - : m_config(model_config) { - m_decoder_options = decode::DecoderOptions(); - m_decoder_options.q_shift = model_config.qbias; - m_decoder_options.q_scale = model_config.qscale; - m_decoder = std::make_unique(); - - m_options = at::TensorOptions().dtype(T::dtype).device(device); - m_module = load_crf_model(model_config, m_options); - - // adjust chunk size to be a multiple of the stride - chunk_size -= chunk_size % model_config.stride; - - m_input = at::zeros({batch_size, model_config.num_features, chunk_size}, - at::TensorOptions().dtype(T::dtype).device(at::kCPU)); -} - -template -std::vector ModelRunner::call_chunks(int num_chunks) { - at::InferenceMode guard; - dorado::stats::Timer timer; - auto scores = m_module->forward(m_input.to(m_options.device_opt().value())); - const auto forward_ms = timer.GetElapsedMS(); - auto decoded_chunks = m_decoder->beam_search(scores, num_chunks, m_decoder_options); - const auto forward_plus_decode_ms = timer.GetElapsedMS(); - ++m_num_batches_called; - m_model_ms += forward_ms; - m_decode_ms += forward_plus_decode_ms - forward_ms; - return decoded_chunks; -} - -template -void ModelRunner::accept_chunk(int chunk_idx, const at::Tensor &chunk) { - m_input.index_put_({chunk_idx, at::indexing::Ellipsis}, chunk); -} - -template -stats::NamedStats ModelRunner::sample_stats() const { - stats::NamedStats stats; - stats["batches_called"] = double(m_num_batches_called); - stats["model_ms"] = double(m_model_ms); - stats["decode_ms"] = double(m_decode_ms); - return stats; -} - } // namespace dorado::basecall diff --git a/dorado/basecall/ModelRunnerBase.h b/dorado/basecall/ModelRunnerBase.h new file mode 100644 index 00000000..2aafc461 --- /dev/null +++ b/dorado/basecall/ModelRunnerBase.h @@ -0,0 +1,34 @@ +#pragma once + +#include "decode/Decoder.h" +#include "utils/stats.h" + +#include +#include + +namespace at { +class Tensor; +} + +namespace dorado::basecall { + +struct CRFModelConfig; + +class ModelRunnerBase { +public: + virtual ~ModelRunnerBase() = default; + virtual void accept_chunk(int chunk_idx, const at::Tensor &chunk) = 0; + virtual std::vector call_chunks(int num_chunks) = 0; + virtual const CRFModelConfig &config() const = 0; + virtual size_t model_stride() const = 0; + virtual size_t chunk_size() const = 0; + virtual size_t batch_size() const = 0; + virtual void terminate() = 0; + virtual void restart() = 0; + virtual std::string get_name() const = 0; + virtual stats::NamedStats sample_stats() const = 0; +}; + +using RunnerPtr = std::unique_ptr; + +} // namespace dorado::basecall diff --git a/dorado/basecall/decode/CPUDecoder.h b/dorado/basecall/decode/CPUDecoder.h index b2011d5c..c0e87bb4 100644 --- a/dorado/basecall/decode/CPUDecoder.h +++ b/dorado/basecall/decode/CPUDecoder.h @@ -6,7 +6,7 @@ namespace dorado::basecall::decode { -class CPUDecoder final : Decoder { +class CPUDecoder final : public Decoder { public: std::vector beam_search(const at::Tensor& scores, int num_chunks, diff --git a/dorado/basecall/decode/GPUDecoder.h b/dorado/basecall/decode/GPUDecoder.h index 46ff4d28..feaec8e0 100644 --- a/dorado/basecall/decode/GPUDecoder.h +++ b/dorado/basecall/decode/GPUDecoder.h @@ -6,7 +6,7 @@ namespace dorado::basecall::decode { -class GPUDecoder final : Decoder { +class GPUDecoder final : public Decoder { public: explicit GPUDecoder(float score_clamp_val) : m_score_clamp_val(score_clamp_val) {} diff --git a/dorado/read_pipeline/BasecallerNode.cpp b/dorado/read_pipeline/BasecallerNode.cpp index c39882f6..e8411f30 100644 --- a/dorado/read_pipeline/BasecallerNode.cpp +++ b/dorado/read_pipeline/BasecallerNode.cpp @@ -1,7 +1,7 @@ #include "BasecallerNode.h" -#include "basecall/ModelRunner.h" -#include "basecall/decode/CPUDecoder.h" +#include "basecall/CRFModelConfig.h" +#include "basecall/ModelRunnerBase.h" #include "stitch.h" #include "utils/stats.h" diff --git a/tests/NodeSmokeTest.cpp b/tests/NodeSmokeTest.cpp index ef86c218..9f8087d8 100644 --- a/tests/NodeSmokeTest.cpp +++ b/tests/NodeSmokeTest.cpp @@ -3,7 +3,6 @@ #include "basecall/CRFModel.h" #include "basecall/CRFModelConfig.h" #include "basecall/ModelRunner.h" -#include "basecall/decode/CPUDecoder.h" #include "modbase/ModBaseModel.h" #include "modbase/ModBaseRunner.h" #include "models/models.h" @@ -243,8 +242,7 @@ DEFINE_TEST(NodeSmokeTestRead, "BasecallerNode") { set_num_reads(5); set_expected_messages(5); batch_size = 8; - runners.push_back(std::make_unique< - dorado::basecall::ModelRunner>( + runners.push_back(std::make_unique( model_config, "cpu", default_params.chunksize, int(batch_size))); } From 7f680f3e8a7adbf2e8cb412884b9e35dd2b74d5a Mon Sep 17 00:00:00 2001 From: Steve Malton Date: Fri, 15 Dec 2023 08:48:22 +0000 Subject: [PATCH 2/9] Unify decoder interfaces --- dorado/basecall/CudaCRFModel.cpp | 16 +++++++++------- dorado/basecall/ModelRunner.cpp | 3 ++- dorado/basecall/decode/CPUDecoder.cpp | 10 ++++++---- dorado/basecall/decode/CPUDecoder.h | 6 +++--- dorado/basecall/decode/Decoder.h | 13 ++++++++++--- dorado/basecall/decode/GPUDecoder.cpp | 19 +++++++++---------- dorado/basecall/decode/GPUDecoder.h | 12 +++++------- 7 files changed, 44 insertions(+), 35 deletions(-) diff --git a/dorado/basecall/CudaCRFModel.cpp b/dorado/basecall/CudaCRFModel.cpp index 09bbc3a7..403ef0be 100644 --- a/dorado/basecall/CudaCRFModel.cpp +++ b/dorado/basecall/CudaCRFModel.cpp @@ -222,9 +222,10 @@ class CudaCaller { } struct NNTask { - NNTask(at::Tensor input_) : input(input_) {} + NNTask(at::Tensor input_, int num_chunks_) : input(input_), num_chunks(num_chunks_) {} at::Tensor input; - at::Tensor out; + int num_chunks; + decode::DecodeData out; std::mutex mut; std::condition_variable cv; bool done{false}; @@ -241,7 +242,7 @@ class CudaCaller { return std::vector(); } - auto task = std::make_shared(input.to(m_options.device())); + auto task = std::make_shared(input.to(m_options.device()), num_chunks); { std::lock_guard lock(m_input_lock); m_input_queue.push_front(task); @@ -253,8 +254,8 @@ class CudaCaller { task->cv.wait(lock); } - output.copy_(task->out); - return m_decoder->cpu_part(output); + output.copy_(task->out.data); + return m_decoder->beam_search_part_2({output, num_chunks, m_decoder_options}); } void cuda_thread_fn() { @@ -322,7 +323,8 @@ class CudaCaller { stats::Timer timer; auto scores = m_module->forward(task->input); const auto forward_ms = timer.GetElapsedMS(); - task->out = m_decoder->gpu_part(scores, m_decoder_options); + task->out = m_decoder->beam_search_part_1( + {scores, task->num_chunks, m_decoder_options}); stream.synchronize(); const auto forward_plus_decode_ms = timer.GetElapsedMS(); m_model_ms += forward_ms; @@ -373,7 +375,7 @@ class CudaCaller { const CRFModelConfig m_config; std::string m_device; at::TensorOptions m_options; - std::unique_ptr m_decoder; + std::unique_ptr m_decoder; decode::DecoderOptions m_decoder_options; torch::nn::ModuleHolder m_module{nullptr}; std::atomic m_terminate{false}; diff --git a/dorado/basecall/ModelRunner.cpp b/dorado/basecall/ModelRunner.cpp index e59c2adf..aa67d6c6 100644 --- a/dorado/basecall/ModelRunner.cpp +++ b/dorado/basecall/ModelRunner.cpp @@ -30,7 +30,8 @@ std::vector ModelRunner::call_chunks(int num_chunks) { dorado::stats::Timer timer; auto scores = m_module->forward(m_input.to(m_options.device_opt().value())); const auto forward_ms = timer.GetElapsedMS(); - auto decoded_chunks = m_decoder->beam_search(scores, num_chunks, m_decoder_options); + auto decoded_chunks = m_decoder->beam_search_part_2( + m_decoder->beam_search_part_1({scores, num_chunks, m_decoder_options})); const auto forward_plus_decode_ms = timer.GetElapsedMS(); ++m_num_batches_called; m_model_ms += forward_ms; diff --git a/dorado/basecall/decode/CPUDecoder.cpp b/dorado/basecall/decode/CPUDecoder.cpp index f405c003..69a3635c 100644 --- a/dorado/basecall/decode/CPUDecoder.cpp +++ b/dorado/basecall/decode/CPUDecoder.cpp @@ -88,10 +88,12 @@ at::Tensor backward_scores(const at::Tensor& scores, const float fixed_stay_scor namespace dorado::basecall::decode { -std::vector CPUDecoder::beam_search(const at::Tensor& scores, - const int num_chunks, - const DecoderOptions& options) { - const auto scores_cpu = scores.to(at::kCPU); +DecodeData CPUDecoder::beam_search_part_1(DecodeData data) { return data; } + +std::vector CPUDecoder::beam_search_part_2(DecodeData data) { + const auto scores_cpu = data.data.to(at::kCPU); + const auto num_chunks = data.num_chunks; + const auto& options = data.options; int num_threads = std::min(num_chunks, 4); int chunks_per_thread = num_chunks / num_threads; int num_threads_with_one_more_chunk = num_chunks % num_threads; diff --git a/dorado/basecall/decode/CPUDecoder.h b/dorado/basecall/decode/CPUDecoder.h index c0e87bb4..a730bf1b 100644 --- a/dorado/basecall/decode/CPUDecoder.h +++ b/dorado/basecall/decode/CPUDecoder.h @@ -8,9 +8,9 @@ namespace dorado::basecall::decode { class CPUDecoder final : public Decoder { public: - std::vector beam_search(const at::Tensor& scores, - int num_chunks, - const DecoderOptions& options) final; + DecodeData beam_search_part_1(DecodeData data); + std::vector beam_search_part_2(DecodeData data); + constexpr static at::ScalarType dtype = at::ScalarType::Float; }; diff --git a/dorado/basecall/decode/Decoder.h b/dorado/basecall/decode/Decoder.h index 09ef76bf..ee65395e 100644 --- a/dorado/basecall/decode/Decoder.h +++ b/dorado/basecall/decode/Decoder.h @@ -2,6 +2,7 @@ #include +#include #include #include @@ -23,11 +24,17 @@ struct DecoderOptions { bool move_pad = false; }; +struct DecodeData { + at::Tensor data; + int num_chunks; + DecoderOptions options; +}; + class Decoder { public: - virtual std::vector beam_search(const at::Tensor& scores, - int num_chunks, - const DecoderOptions& options) = 0; + virtual ~Decoder() = default; + virtual DecodeData beam_search_part_1(DecodeData data) = 0; + virtual std::vector beam_search_part_2(DecodeData data) = 0; }; } // namespace dorado::basecall::decode diff --git a/dorado/basecall/decode/GPUDecoder.cpp b/dorado/basecall/decode/GPUDecoder.cpp index 7561371b..6f523556 100644 --- a/dorado/basecall/decode/GPUDecoder.cpp +++ b/dorado/basecall/decode/GPUDecoder.cpp @@ -1,6 +1,5 @@ #include "GPUDecoder.h" -#include "Decoder.h" #include "utils/cuda_utils.h" #include "utils/gpu_profiling.h" @@ -13,7 +12,10 @@ extern "C" { namespace dorado::basecall::decode { -at::Tensor GPUDecoder::gpu_part(at::Tensor scores, DecoderOptions options) { +DecodeData GPUDecoder::beam_search_part_1(DecodeData data) { + auto scores = data.data; + auto &options = data.options; + c10::cuda::CUDAGuard device_guard(scores.device()); utils::ScopedProfileRange loop{"gpu_decode", 1}; long int N = (long int)(scores.sizes()[0]); @@ -78,10 +80,13 @@ at::Tensor GPUDecoder::gpu_part(at::Tensor scores, DecoderOptions options) { sequence.data_ptr(), qstring.data_ptr(), options.q_scale, options.q_shift, int(options.beam_width), options.beam_cut, options.blank_score, options.move_pad)); } - return moves_sequence_qstring.reshape({3, N, -1}); + + data.data = moves_sequence_qstring.reshape({3, N, -1}); + return data; } -std::vector GPUDecoder::cpu_part(at::Tensor moves_sequence_qstring_cpu) { +std::vector GPUDecoder::beam_search_part_2(DecodeData data) { + auto moves_sequence_qstring_cpu = data.data; nvtx3::scoped_range loop{"cpu_decode"}; assert(moves_sequence_qstring_cpu.device() == at::kCPU); auto moves_cpu = moves_sequence_qstring_cpu[0]; @@ -107,10 +112,4 @@ std::vector GPUDecoder::cpu_part(at::Tensor moves_sequence_qstring return called_chunks; } -std::vector GPUDecoder::beam_search(const at::Tensor &scores, - int, - const DecoderOptions &options) { - return cpu_part(gpu_part(scores, options)); -} - } // namespace dorado::basecall::decode diff --git a/dorado/basecall/decode/GPUDecoder.h b/dorado/basecall/decode/GPUDecoder.h index feaec8e0..7296c49c 100644 --- a/dorado/basecall/decode/GPUDecoder.h +++ b/dorado/basecall/decode/GPUDecoder.h @@ -10,17 +10,15 @@ class GPUDecoder final : public Decoder { public: explicit GPUDecoder(float score_clamp_val) : m_score_clamp_val(score_clamp_val) {} - std::vector beam_search(const at::Tensor& scores, - int num_chunks, - const DecoderOptions& options) final; - constexpr static at::ScalarType dtype = at::ScalarType::Half; - // We split beam_search into two parts, the first one running on the GPU and the second // one on the CPU. While the second part is running we can submit more commands to the GPU // on another thread. - at::Tensor gpu_part(at::Tensor scores, DecoderOptions options); - std::vector cpu_part(at::Tensor moves_sequence_qstring_cpu); + DecodeData beam_search_part_1(DecodeData data); + std::vector beam_search_part_2(DecodeData data); + + constexpr static at::ScalarType dtype = at::ScalarType::Half; +private: float m_score_clamp_val; }; From c8965d05a198b6371b2fa32ec1995e4fdadc1b1b Mon Sep 17 00:00:00 2001 From: Steve Malton Date: Fri, 15 Dec 2023 08:48:22 +0000 Subject: [PATCH 3/9] Create appropriate decoder based on the device --- dorado/basecall/CMakeLists.txt | 2 ++ dorado/basecall/CudaCRFModel.cpp | 16 +++++++------- dorado/basecall/ModelRunner.cpp | 14 ++++++------- dorado/basecall/ModelRunner.h | 4 ++-- dorado/basecall/decode/CPUDecoder.cpp | 4 ++-- dorado/basecall/decode/CPUDecoder.h | 6 +++--- dorado/basecall/decode/Decoder.cpp | 30 +++++++++++++++++++++++++++ dorado/basecall/decode/Decoder.h | 11 ++++++++-- dorado/basecall/decode/GPUDecoder.cpp | 4 ++-- dorado/basecall/decode/GPUDecoder.h | 6 +++--- 10 files changed, 66 insertions(+), 31 deletions(-) create mode 100644 dorado/basecall/decode/Decoder.cpp diff --git a/dorado/basecall/CMakeLists.txt b/dorado/basecall/CMakeLists.txt index a846c87e..f9d1621b 100644 --- a/dorado/basecall/CMakeLists.txt +++ b/dorado/basecall/CMakeLists.txt @@ -12,6 +12,8 @@ add_library(dorado_basecall STATIC decode/beam_search.h decode/CPUDecoder.cpp decode/CPUDecoder.h + decode/Decoder.cpp + decode/Decoder.h ) if (DORADO_GPU_BUILD) diff --git a/dorado/basecall/CudaCRFModel.cpp b/dorado/basecall/CudaCRFModel.cpp index 403ef0be..d3025783 100644 --- a/dorado/basecall/CudaCRFModel.cpp +++ b/dorado/basecall/CudaCRFModel.cpp @@ -1,7 +1,7 @@ #include "CudaCRFModel.h" #include "CRFModelConfig.h" -#include "decode/GPUDecoder.h" +#include "decode/Decoder.h" #include "utils/cuda_utils.h" #include "utils/math_utils.h" @@ -31,19 +31,18 @@ class CudaCaller { bool exclusive_gpu_access) : m_config(model_config), m_device(device), + m_decoder(decode::create_decoder(device, m_config)), + m_options(at::TensorOptions().dtype(m_decoder->dtype()).device(device)), m_exclusive_gpu_access(exclusive_gpu_access) { - m_decoder_options = decode::DecoderOptions(); + assert(m_options.device().is_cuda()); + m_decoder_options.q_shift = model_config.qbias; m_decoder_options.q_scale = model_config.qscale; - m_decoder = std::make_unique(model_config.clamp ? 5.f : 0.f); m_num_input_features = model_config.num_features; // adjust chunk size to be a multiple of the stride m_out_chunk_size = chunk_size / model_config.stride; m_in_chunk_size = m_out_chunk_size * model_config.stride; - m_options = at::TensorOptions().dtype(decode::GPUDecoder::dtype).device(device); - assert(m_options.device().is_cuda()); - at::InferenceMode guard; m_module = load_crf_model(model_config, m_options); @@ -155,8 +154,7 @@ class CudaCaller { // Determine size of working memory for decoder divided by (batch_size * chunk_size) // Decoder needs roughly (beam_width * 4) + num_states + 10 extra bytes // where num_states = 4^(state_len+1) - // See `dorado::basecall::decode::GPUDecoder::gpu_part()`, block beginning with `if (!initialized) {` - // for more details. + // See `dorado::basecall::decode::GPUDecoder::beam_search_part_1()` for more details. int64_t decode_bytes_per_chunk_timestep = 10 + m_decoder_options.beam_width * 4 + (1ull << (model_config.state_len * 2 + 2)); @@ -374,9 +372,9 @@ class CudaCaller { const CRFModelConfig m_config; std::string m_device; - at::TensorOptions m_options; std::unique_ptr m_decoder; decode::DecoderOptions m_decoder_options; + at::TensorOptions m_options; torch::nn::ModuleHolder m_module{nullptr}; std::atomic m_terminate{false}; std::deque> m_input_queue; diff --git a/dorado/basecall/ModelRunner.cpp b/dorado/basecall/ModelRunner.cpp index aa67d6c6..41395097 100644 --- a/dorado/basecall/ModelRunner.cpp +++ b/dorado/basecall/ModelRunner.cpp @@ -1,7 +1,7 @@ #include "ModelRunner.h" #include "CRFModel.h" -#include "decode/CPUDecoder.h" +#include "decode/Decoder.h" namespace dorado::basecall { @@ -9,20 +9,18 @@ ModelRunner::ModelRunner(const CRFModelConfig &model_config, const std::string &device, int chunk_size, int batch_size) - : m_config(model_config) { - m_decoder_options = decode::DecoderOptions(); + : m_config(model_config), + m_decoder(decode::create_decoder(device, model_config)), + m_options(at::TensorOptions().dtype(m_decoder->dtype()).device(device)), + m_module(load_crf_model(model_config, m_options)) { m_decoder_options.q_shift = model_config.qbias; m_decoder_options.q_scale = model_config.qscale; - m_decoder = std::make_unique(); - - m_options = at::TensorOptions().dtype(decode::CPUDecoder::dtype).device(device); - m_module = load_crf_model(model_config, m_options); // adjust chunk size to be a multiple of the stride chunk_size -= chunk_size % model_config.stride; m_input = at::zeros({batch_size, model_config.num_features, chunk_size}, - at::TensorOptions().dtype(decode::CPUDecoder::dtype).device(at::kCPU)); + at::TensorOptions().dtype(m_decoder->dtype()).device(at::kCPU)); } std::vector ModelRunner::call_chunks(int num_chunks) { diff --git a/dorado/basecall/ModelRunner.h b/dorado/basecall/ModelRunner.h index 7f1906ba..a3e16a0a 100644 --- a/dorado/basecall/ModelRunner.h +++ b/dorado/basecall/ModelRunner.h @@ -31,11 +31,11 @@ class ModelRunner final : public ModelRunnerBase { private: const CRFModelConfig m_config; - at::Tensor m_input; - at::TensorOptions m_options; std::unique_ptr m_decoder; + at::TensorOptions m_options; decode::DecoderOptions m_decoder_options; torch::nn::ModuleHolder m_module{nullptr}; + at::Tensor m_input; // Performance monitoring stats. std::atomic m_num_batches_called = 0; diff --git a/dorado/basecall/decode/CPUDecoder.cpp b/dorado/basecall/decode/CPUDecoder.cpp index 69a3635c..e138b7ab 100644 --- a/dorado/basecall/decode/CPUDecoder.cpp +++ b/dorado/basecall/decode/CPUDecoder.cpp @@ -88,9 +88,9 @@ at::Tensor backward_scores(const at::Tensor& scores, const float fixed_stay_scor namespace dorado::basecall::decode { -DecodeData CPUDecoder::beam_search_part_1(DecodeData data) { return data; } +DecodeData CPUDecoder::beam_search_part_1(DecodeData data) const { return data; } -std::vector CPUDecoder::beam_search_part_2(DecodeData data) { +std::vector CPUDecoder::beam_search_part_2(DecodeData data) const { const auto scores_cpu = data.data.to(at::kCPU); const auto num_chunks = data.num_chunks; const auto& options = data.options; diff --git a/dorado/basecall/decode/CPUDecoder.h b/dorado/basecall/decode/CPUDecoder.h index a730bf1b..80c947ff 100644 --- a/dorado/basecall/decode/CPUDecoder.h +++ b/dorado/basecall/decode/CPUDecoder.h @@ -8,10 +8,10 @@ namespace dorado::basecall::decode { class CPUDecoder final : public Decoder { public: - DecodeData beam_search_part_1(DecodeData data); - std::vector beam_search_part_2(DecodeData data); + DecodeData beam_search_part_1(DecodeData data) const; + std::vector beam_search_part_2(DecodeData data) const; - constexpr static at::ScalarType dtype = at::ScalarType::Float; + c10::ScalarType dtype() const { return at::ScalarType::Float; }; }; } // namespace dorado::basecall::decode diff --git a/dorado/basecall/decode/Decoder.cpp b/dorado/basecall/decode/Decoder.cpp new file mode 100644 index 00000000..66bebb4e --- /dev/null +++ b/dorado/basecall/decode/Decoder.cpp @@ -0,0 +1,30 @@ +#include "Decoder.h" + +#if DORADO_GPU_BUILD && !defined(__APPLE__) +#include "GPUDecoder.h" +#endif + +#include "CPUDecoder.h" +#include "basecall/CRFModelConfig.h" + +#include + +namespace dorado::basecall::decode { + +std::unique_ptr create_decoder(c10::Device device, + const CRFModelConfig& config) { +#if DORADO_GPU_BUILD && !defined(__APPLE__) + if (device.is_cuda()) { + return std::make_unique(config.clamp ? 5.f : 0.f); + } +#else + (void)config; // unused in other build types +#endif + if (device.is_cpu()) { + return std::make_unique(); + } + + throw std::runtime_error("Unsupported device type for decoder creation: " + device.str()); +} + +} // namespace dorado::basecall::decode diff --git a/dorado/basecall/decode/Decoder.h b/dorado/basecall/decode/Decoder.h index ee65395e..47f47421 100644 --- a/dorado/basecall/decode/Decoder.h +++ b/dorado/basecall/decode/Decoder.h @@ -6,6 +6,10 @@ #include #include +namespace dorado::basecall { +struct CRFModelConfig; +} + namespace dorado::basecall::decode { struct DecodedChunk { @@ -33,8 +37,11 @@ struct DecodeData { class Decoder { public: virtual ~Decoder() = default; - virtual DecodeData beam_search_part_1(DecodeData data) = 0; - virtual std::vector beam_search_part_2(DecodeData data) = 0; + virtual DecodeData beam_search_part_1(DecodeData data) const = 0; + virtual std::vector beam_search_part_2(DecodeData data) const = 0; + virtual c10::ScalarType dtype() const = 0; }; +std::unique_ptr create_decoder(c10::Device device, const CRFModelConfig& config); + } // namespace dorado::basecall::decode diff --git a/dorado/basecall/decode/GPUDecoder.cpp b/dorado/basecall/decode/GPUDecoder.cpp index 6f523556..944621bf 100644 --- a/dorado/basecall/decode/GPUDecoder.cpp +++ b/dorado/basecall/decode/GPUDecoder.cpp @@ -12,7 +12,7 @@ extern "C" { namespace dorado::basecall::decode { -DecodeData GPUDecoder::beam_search_part_1(DecodeData data) { +DecodeData GPUDecoder::beam_search_part_1(DecodeData data) const { auto scores = data.data; auto &options = data.options; @@ -85,7 +85,7 @@ DecodeData GPUDecoder::beam_search_part_1(DecodeData data) { return data; } -std::vector GPUDecoder::beam_search_part_2(DecodeData data) { +std::vector GPUDecoder::beam_search_part_2(DecodeData data) const { auto moves_sequence_qstring_cpu = data.data; nvtx3::scoped_range loop{"cpu_decode"}; assert(moves_sequence_qstring_cpu.device() == at::kCPU); diff --git a/dorado/basecall/decode/GPUDecoder.h b/dorado/basecall/decode/GPUDecoder.h index 7296c49c..9c2fba9d 100644 --- a/dorado/basecall/decode/GPUDecoder.h +++ b/dorado/basecall/decode/GPUDecoder.h @@ -13,10 +13,10 @@ class GPUDecoder final : public Decoder { // We split beam_search into two parts, the first one running on the GPU and the second // one on the CPU. While the second part is running we can submit more commands to the GPU // on another thread. - DecodeData beam_search_part_1(DecodeData data); - std::vector beam_search_part_2(DecodeData data); + DecodeData beam_search_part_1(DecodeData data) const; + std::vector beam_search_part_2(DecodeData data) const; - constexpr static at::ScalarType dtype = at::ScalarType::Half; + c10::ScalarType dtype() const { return at::ScalarType::Half; }; private: float m_score_clamp_val; From 7ab561db9e87482d3cc9ea0f9dc3f64e965329b5 Mon Sep 17 00:00:00 2001 From: Steve Malton Date: Fri, 15 Dec 2023 08:48:22 +0000 Subject: [PATCH 4/9] Fix basecaller node as it's no longer inheriting torch from elsewhere --- dorado/read_pipeline/BasecallerNode.cpp | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/dorado/read_pipeline/BasecallerNode.cpp b/dorado/read_pipeline/BasecallerNode.cpp index e8411f30..cc81b494 100644 --- a/dorado/read_pipeline/BasecallerNode.cpp +++ b/dorado/read_pipeline/BasecallerNode.cpp @@ -5,6 +5,7 @@ #include "stitch.h" #include "utils/stats.h" +#include #include #include @@ -15,7 +16,7 @@ #endif using namespace std::chrono_literals; -using namespace torch::indexing; +using namespace at::indexing; namespace dorado { @@ -242,15 +243,13 @@ void BasecallerNode::basecall_worker_thread(int worker_id) { if (slice_size != m_chunk_size) { if (input_slice.ndimension() == 1) { auto [n, overhang] = std::div((int)m_chunk_size, (int)slice_size); - input_slice = torch::concat( - {input_slice.repeat({n}), - input_slice.index({Ellipsis, torch::indexing::Slice(0, overhang)})}); + input_slice = at::concat({input_slice.repeat({n}), + input_slice.index({Ellipsis, Slice(0, overhang)})}); } else if (input_slice.ndimension() == 2) { auto [n, overhang] = std::div((int)m_chunk_size, (int)slice_size); - input_slice = torch::concat( - {input_slice.repeat({1, n}), - input_slice.index({Ellipsis, torch::indexing::Slice(0, overhang)})}, - 1); + input_slice = at::concat({input_slice.repeat({1, n}), + input_slice.index({Ellipsis, Slice(0, overhang)})}, + 1); } } From 8ce48c288c0c2f9f3aaa2b2ced13d35dbc03899d Mon Sep 17 00:00:00 2001 From: Steve Malton Date: Fri, 15 Dec 2023 09:25:15 +0000 Subject: [PATCH 5/9] Fix formatting --- dorado/basecall/decode/Decoder.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dorado/basecall/decode/Decoder.cpp b/dorado/basecall/decode/Decoder.cpp index 66bebb4e..321875bd 100644 --- a/dorado/basecall/decode/Decoder.cpp +++ b/dorado/basecall/decode/Decoder.cpp @@ -11,8 +11,7 @@ namespace dorado::basecall::decode { -std::unique_ptr create_decoder(c10::Device device, - const CRFModelConfig& config) { +std::unique_ptr create_decoder(c10::Device device, const CRFModelConfig& config) { #if DORADO_GPU_BUILD && !defined(__APPLE__) if (device.is_cuda()) { return std::make_unique(config.clamp ? 5.f : 0.f); From 87d8f7a60d9b3767cb671c54a52cf2fd887a87fd Mon Sep 17 00:00:00 2001 From: Steve Malton Date: Fri, 15 Dec 2023 15:45:42 +0000 Subject: [PATCH 6/9] Add missing header --- dorado/basecall/decode/Decoder.h | 1 + 1 file changed, 1 insertion(+) diff --git a/dorado/basecall/decode/Decoder.h b/dorado/basecall/decode/Decoder.h index 47f47421..2c04b5cf 100644 --- a/dorado/basecall/decode/Decoder.h +++ b/dorado/basecall/decode/Decoder.h @@ -3,6 +3,7 @@ #include #include +#include #include #include From 4ac99ee059634af0e70aaa941ff5b683216a9485 Mon Sep 17 00:00:00 2001 From: Steve Malton Date: Fri, 15 Dec 2023 15:46:02 +0000 Subject: [PATCH 7/9] Fix return type and add comment --- dorado/basecall/decode/CPUDecoder.h | 2 +- dorado/basecall/decode/Decoder.h | 3 ++- dorado/basecall/decode/GPUDecoder.h | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/dorado/basecall/decode/CPUDecoder.h b/dorado/basecall/decode/CPUDecoder.h index 80c947ff..d1c1924e 100644 --- a/dorado/basecall/decode/CPUDecoder.h +++ b/dorado/basecall/decode/CPUDecoder.h @@ -11,7 +11,7 @@ class CPUDecoder final : public Decoder { DecodeData beam_search_part_1(DecodeData data) const; std::vector beam_search_part_2(DecodeData data) const; - c10::ScalarType dtype() const { return at::ScalarType::Float; }; + at::ScalarType dtype() const { return at::ScalarType::Float; }; }; } // namespace dorado::basecall::decode diff --git a/dorado/basecall/decode/Decoder.h b/dorado/basecall/decode/Decoder.h index 2c04b5cf..f34b6bbc 100644 --- a/dorado/basecall/decode/Decoder.h +++ b/dorado/basecall/decode/Decoder.h @@ -40,7 +40,8 @@ class Decoder { virtual ~Decoder() = default; virtual DecodeData beam_search_part_1(DecodeData data) const = 0; virtual std::vector beam_search_part_2(DecodeData data) const = 0; - virtual c10::ScalarType dtype() const = 0; + // Returns the torch::TensorOptions::dtype to use for input data to models that use this decoder + virtual at::ScalarType dtype() const = 0; }; std::unique_ptr create_decoder(c10::Device device, const CRFModelConfig& config); diff --git a/dorado/basecall/decode/GPUDecoder.h b/dorado/basecall/decode/GPUDecoder.h index 9c2fba9d..67c8762d 100644 --- a/dorado/basecall/decode/GPUDecoder.h +++ b/dorado/basecall/decode/GPUDecoder.h @@ -16,7 +16,7 @@ class GPUDecoder final : public Decoder { DecodeData beam_search_part_1(DecodeData data) const; std::vector beam_search_part_2(DecodeData data) const; - c10::ScalarType dtype() const { return at::ScalarType::Half; }; + at::ScalarType dtype() const { return at::ScalarType::Half; }; private: float m_score_clamp_val; From f8a1d79fa68a6e461667f5cc448efc3417c3acb2 Mon Sep 17 00:00:00 2001 From: Steve Malton Date: Fri, 15 Dec 2023 15:46:58 +0000 Subject: [PATCH 8/9] Rename class --- dorado/basecall/CMakeLists.txt | 4 ++-- dorado/basecall/CRFModel.cpp | 2 +- dorado/basecall/CudaCRFModel.cpp | 2 +- dorado/basecall/decode/{GPUDecoder.cpp => CUDADecoder.cpp} | 6 +++--- dorado/basecall/decode/{GPUDecoder.h => CUDADecoder.h} | 4 ++-- dorado/basecall/decode/Decoder.cpp | 4 ++-- 6 files changed, 11 insertions(+), 11 deletions(-) rename dorado/basecall/decode/{GPUDecoder.cpp => CUDADecoder.cpp} (96%) rename dorado/basecall/decode/{GPUDecoder.h => CUDADecoder.h} (82%) diff --git a/dorado/basecall/CMakeLists.txt b/dorado/basecall/CMakeLists.txt index f9d1621b..813f203a 100644 --- a/dorado/basecall/CMakeLists.txt +++ b/dorado/basecall/CMakeLists.txt @@ -28,8 +28,8 @@ if (DORADO_GPU_BUILD) PRIVATE CudaCRFModel.cpp CudaCRFModel.h - decode/GPUDecoder.cpp - decode/GPUDecoder.h + decode/CUDADecoder.cpp + decode/CUDADecoder.h ) endif() endif() diff --git a/dorado/basecall/CRFModel.cpp b/dorado/basecall/CRFModel.cpp index 0d9e3a7a..e5dac7af 100644 --- a/dorado/basecall/CRFModel.cpp +++ b/dorado/basecall/CRFModel.cpp @@ -871,7 +871,7 @@ struct CRFModelImpl : Module { } // Clamping the scores to [-5, 5], if active (i.e. the role of `clamp1`), is performed by - // `GPUDecoder` on reading the scores. This eliminates the cost of a large matrix + // `CUDADecoder` on reading the scores. This eliminates the cost of a large matrix // read-modify-write operation. // Output is [N, T, C], F16, contiguous diff --git a/dorado/basecall/CudaCRFModel.cpp b/dorado/basecall/CudaCRFModel.cpp index d3025783..2205dddf 100644 --- a/dorado/basecall/CudaCRFModel.cpp +++ b/dorado/basecall/CudaCRFModel.cpp @@ -154,7 +154,7 @@ class CudaCaller { // Determine size of working memory for decoder divided by (batch_size * chunk_size) // Decoder needs roughly (beam_width * 4) + num_states + 10 extra bytes // where num_states = 4^(state_len+1) - // See `dorado::basecall::decode::GPUDecoder::beam_search_part_1()` for more details. + // See `dorado::basecall::decode::CUDADecoder::beam_search_part_1()` for more details. int64_t decode_bytes_per_chunk_timestep = 10 + m_decoder_options.beam_width * 4 + (1ull << (model_config.state_len * 2 + 2)); diff --git a/dorado/basecall/decode/GPUDecoder.cpp b/dorado/basecall/decode/CUDADecoder.cpp similarity index 96% rename from dorado/basecall/decode/GPUDecoder.cpp rename to dorado/basecall/decode/CUDADecoder.cpp index 944621bf..8db11227 100644 --- a/dorado/basecall/decode/GPUDecoder.cpp +++ b/dorado/basecall/decode/CUDADecoder.cpp @@ -1,4 +1,4 @@ -#include "GPUDecoder.h" +#include "CUDADecoder.h" #include "utils/cuda_utils.h" #include "utils/gpu_profiling.h" @@ -12,7 +12,7 @@ extern "C" { namespace dorado::basecall::decode { -DecodeData GPUDecoder::beam_search_part_1(DecodeData data) const { +DecodeData CUDADecoder::beam_search_part_1(DecodeData data) const { auto scores = data.data; auto &options = data.options; @@ -85,7 +85,7 @@ DecodeData GPUDecoder::beam_search_part_1(DecodeData data) const { return data; } -std::vector GPUDecoder::beam_search_part_2(DecodeData data) const { +std::vector CUDADecoder::beam_search_part_2(DecodeData data) const { auto moves_sequence_qstring_cpu = data.data; nvtx3::scoped_range loop{"cpu_decode"}; assert(moves_sequence_qstring_cpu.device() == at::kCPU); diff --git a/dorado/basecall/decode/GPUDecoder.h b/dorado/basecall/decode/CUDADecoder.h similarity index 82% rename from dorado/basecall/decode/GPUDecoder.h rename to dorado/basecall/decode/CUDADecoder.h index 67c8762d..6d5a7a8d 100644 --- a/dorado/basecall/decode/GPUDecoder.h +++ b/dorado/basecall/decode/CUDADecoder.h @@ -6,9 +6,9 @@ namespace dorado::basecall::decode { -class GPUDecoder final : public Decoder { +class CUDADecoder final : public Decoder { public: - explicit GPUDecoder(float score_clamp_val) : m_score_clamp_val(score_clamp_val) {} + explicit CUDADecoder(float score_clamp_val) : m_score_clamp_val(score_clamp_val) {} // We split beam_search into two parts, the first one running on the GPU and the second // one on the CPU. While the second part is running we can submit more commands to the GPU diff --git a/dorado/basecall/decode/Decoder.cpp b/dorado/basecall/decode/Decoder.cpp index 321875bd..b30bfb50 100644 --- a/dorado/basecall/decode/Decoder.cpp +++ b/dorado/basecall/decode/Decoder.cpp @@ -1,7 +1,7 @@ #include "Decoder.h" #if DORADO_GPU_BUILD && !defined(__APPLE__) -#include "GPUDecoder.h" +#include "CUDADecoder.h" #endif #include "CPUDecoder.h" @@ -14,7 +14,7 @@ namespace dorado::basecall::decode { std::unique_ptr create_decoder(c10::Device device, const CRFModelConfig& config) { #if DORADO_GPU_BUILD && !defined(__APPLE__) if (device.is_cuda()) { - return std::make_unique(config.clamp ? 5.f : 0.f); + return std::make_unique(config.clamp ? 5.f : 0.f); } #else (void)config; // unused in other build types From f9b0bb06c7f035b2d8fd314a5e4d35c807f4f973 Mon Sep 17 00:00:00 2001 From: Steve Malton Date: Fri, 15 Dec 2023 15:47:46 +0000 Subject: [PATCH 9/9] No need for optional --- dorado/basecall/ModelRunner.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dorado/basecall/ModelRunner.cpp b/dorado/basecall/ModelRunner.cpp index 41395097..22c6a331 100644 --- a/dorado/basecall/ModelRunner.cpp +++ b/dorado/basecall/ModelRunner.cpp @@ -26,7 +26,7 @@ ModelRunner::ModelRunner(const CRFModelConfig &model_config, std::vector ModelRunner::call_chunks(int num_chunks) { at::InferenceMode guard; dorado::stats::Timer timer; - auto scores = m_module->forward(m_input.to(m_options.device_opt().value())); + auto scores = m_module->forward(m_input.to(m_options.device())); const auto forward_ms = timer.GetElapsedMS(); auto decoded_chunks = m_decoder->beam_search_part_2( m_decoder->beam_search_part_1({scores, num_chunks, m_decoder_options}));