Skip to content

Commit

Permalink
Merge branch 'smalton/DOR-484-decoder' into 'master'
Browse files Browse the repository at this point in the history
DOR-484: Unify decoder interfaces

Closes DOR-484

See merge request machine-learning/dorado!770
  • Loading branch information
malton-ont committed Dec 18, 2023
2 parents a3dfc94 + f9b0bb0 commit 5fa4de7
Show file tree
Hide file tree
Showing 19 changed files with 227 additions and 164 deletions.
2 changes: 1 addition & 1 deletion dorado/api/pipeline_creation.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#include "pipeline_creation.h"

#include "basecall/CRFModelConfig.h"
#include "basecall/ModelRunner.h"
#include "basecall/ModelRunnerBase.h"
#include "modbase/ModBaseRunner.h"
#include "read_pipeline/BasecallerNode.h"
#include "read_pipeline/ModBaseCallerNode.h"
Expand Down
3 changes: 1 addition & 2 deletions dorado/api/runner_creation.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
#include "runner_creation.h"

#include "basecall/crf_utils.h"
#include "basecall/decode/CPUDecoder.h"

#if DORADO_GPU_BUILD
#ifdef __APPLE__
Expand Down Expand Up @@ -53,7 +52,7 @@ std::pair<std::vector<basecall::RunnerPtr>, size_t> create_basecall_runners(
num_cpu_runners);

for (size_t i = 0; i < num_cpu_runners; i++) {
runners.push_back(std::make_unique<basecall::ModelRunner<basecall::decode::CPUDecoder>>(
runners.push_back(std::make_unique<basecall::ModelRunner>(
model_config, device, int(chunk_size), int(batch_size)));
}
}
Expand Down
21 changes: 12 additions & 9 deletions dorado/basecall/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,41 +1,44 @@
add_library(dorado_basecall STATIC
crf_utils.h
crf_utils.cpp
CRFModel.h
crf_utils.h
CRFModel.cpp
CRFModelConfig.h
CRFModel.h
CRFModelConfig.cpp
CRFModelConfig.h
ModelRunner.cpp
ModelRunner.h
ModelRunnerBase.h
decode/beam_search.cpp
decode/beam_search.h
decode/CPUDecoder.cpp
decode/CPUDecoder.h
decode/Decoder.cpp
decode/Decoder.h
)

if (DORADO_GPU_BUILD)
if(APPLE)
target_sources(dorado_basecall
PRIVATE
MetalCRFModel.h
MetalCRFModel.cpp
MetalCRFModel.h
)
else()
target_sources(dorado_basecall
PRIVATE
CudaCRFModel.h
CudaCRFModel.cpp
decode/GPUDecoder.cpp
decode/GPUDecoder.h
CudaCRFModel.h
decode/CUDADecoder.cpp
decode/CUDADecoder.h
)
endif()
endif()

target_include_directories(dorado_basecall
SYSTEM
PUBLIC
${DORADO_3RD_PARTY_SOURCE}/toml11
PRIVATE
${DORADO_3RD_PARTY_SOURCE}/NVTX/c/include
${DORADO_3RD_PARTY_SOURCE}/toml11
)


Expand Down
2 changes: 1 addition & 1 deletion dorado/basecall/CRFModel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -871,7 +871,7 @@ struct CRFModelImpl : Module {
}

// Clamping the scores to [-5, 5], if active (i.e. the role of `clamp1`), is performed by
// `GPUDecoder` on reading the scores. This eliminates the cost of a large matrix
// `CUDADecoder` on reading the scores. This eliminates the cost of a large matrix
// read-modify-write operation.

// Output is [N, T, C], F16, contiguous
Expand Down
32 changes: 16 additions & 16 deletions dorado/basecall/CudaCRFModel.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#include "CudaCRFModel.h"

#include "CRFModelConfig.h"
#include "decode/GPUDecoder.h"
#include "decode/Decoder.h"
#include "utils/cuda_utils.h"
#include "utils/math_utils.h"

Expand Down Expand Up @@ -31,19 +31,18 @@ class CudaCaller {
bool exclusive_gpu_access)
: m_config(model_config),
m_device(device),
m_decoder(decode::create_decoder(device, m_config)),
m_options(at::TensorOptions().dtype(m_decoder->dtype()).device(device)),
m_exclusive_gpu_access(exclusive_gpu_access) {
m_decoder_options = decode::DecoderOptions();
assert(m_options.device().is_cuda());

m_decoder_options.q_shift = model_config.qbias;
m_decoder_options.q_scale = model_config.qscale;
m_decoder = std::make_unique<decode::GPUDecoder>(model_config.clamp ? 5.f : 0.f);
m_num_input_features = model_config.num_features;
// adjust chunk size to be a multiple of the stride
m_out_chunk_size = chunk_size / model_config.stride;
m_in_chunk_size = m_out_chunk_size * model_config.stride;

m_options = at::TensorOptions().dtype(decode::GPUDecoder::dtype).device(device);
assert(m_options.device().is_cuda());

at::InferenceMode guard;
m_module = load_crf_model(model_config, m_options);

Expand Down Expand Up @@ -155,8 +154,7 @@ class CudaCaller {
// Determine size of working memory for decoder divided by (batch_size * chunk_size)
// Decoder needs roughly (beam_width * 4) + num_states + 10 extra bytes
// where num_states = 4^(state_len+1)
// See `dorado::basecall::decode::GPUDecoder::gpu_part()`, block beginning with `if (!initialized) {`
// for more details.
// See `dorado::basecall::decode::CUDADecoder::beam_search_part_1()` for more details.
int64_t decode_bytes_per_chunk_timestep =
10 + m_decoder_options.beam_width * 4 + (1ull << (model_config.state_len * 2 + 2));

Expand Down Expand Up @@ -222,9 +220,10 @@ class CudaCaller {
}

struct NNTask {
NNTask(at::Tensor input_) : input(input_) {}
NNTask(at::Tensor input_, int num_chunks_) : input(input_), num_chunks(num_chunks_) {}
at::Tensor input;
at::Tensor out;
int num_chunks;
decode::DecodeData out;
std::mutex mut;
std::condition_variable cv;
bool done{false};
Expand All @@ -241,7 +240,7 @@ class CudaCaller {
return std::vector<decode::DecodedChunk>();
}

auto task = std::make_shared<NNTask>(input.to(m_options.device()));
auto task = std::make_shared<NNTask>(input.to(m_options.device()), num_chunks);
{
std::lock_guard<std::mutex> lock(m_input_lock);
m_input_queue.push_front(task);
Expand All @@ -253,8 +252,8 @@ class CudaCaller {
task->cv.wait(lock);
}

output.copy_(task->out);
return m_decoder->cpu_part(output);
output.copy_(task->out.data);
return m_decoder->beam_search_part_2({output, num_chunks, m_decoder_options});
}

void cuda_thread_fn() {
Expand Down Expand Up @@ -322,7 +321,8 @@ class CudaCaller {
stats::Timer timer;
auto scores = m_module->forward(task->input);
const auto forward_ms = timer.GetElapsedMS();
task->out = m_decoder->gpu_part(scores, m_decoder_options);
task->out = m_decoder->beam_search_part_1(
{scores, task->num_chunks, m_decoder_options});
stream.synchronize();
const auto forward_plus_decode_ms = timer.GetElapsedMS();
m_model_ms += forward_ms;
Expand Down Expand Up @@ -372,9 +372,9 @@ class CudaCaller {

const CRFModelConfig m_config;
std::string m_device;
at::TensorOptions m_options;
std::unique_ptr<decode::GPUDecoder> m_decoder;
std::unique_ptr<decode::Decoder> m_decoder;
decode::DecoderOptions m_decoder_options;
at::TensorOptions m_options;
torch::nn::ModuleHolder<torch::nn::AnyModule> m_module{nullptr};
std::atomic<bool> m_terminate{false};
std::deque<std::shared_ptr<NNTask>> m_input_queue;
Expand Down
3 changes: 2 additions & 1 deletion dorado/basecall/CudaCRFModel.h
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
#pragma once

#include "CRFModel.h"
#include "ModelRunner.h"
#include "ModelRunnerBase.h"

#include <ATen/core/TensorBody.h>
#include <c10/cuda/CUDAStream.h>

#include <atomic>
#include <filesystem>
#include <memory>
#include <vector>
Expand Down
2 changes: 1 addition & 1 deletion dorado/basecall/MetalCRFModel.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#pragma once

#include "ModelRunner.h"
#include "ModelRunnerBase.h"

#include <ATen/core/TensorBody.h>

Expand Down
52 changes: 52 additions & 0 deletions dorado/basecall/ModelRunner.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#include "ModelRunner.h"

#include "CRFModel.h"
#include "decode/Decoder.h"

namespace dorado::basecall {

ModelRunner::ModelRunner(const CRFModelConfig &model_config,
const std::string &device,
int chunk_size,
int batch_size)
: m_config(model_config),
m_decoder(decode::create_decoder(device, model_config)),
m_options(at::TensorOptions().dtype(m_decoder->dtype()).device(device)),
m_module(load_crf_model(model_config, m_options)) {
m_decoder_options.q_shift = model_config.qbias;
m_decoder_options.q_scale = model_config.qscale;

// adjust chunk size to be a multiple of the stride
chunk_size -= chunk_size % model_config.stride;

m_input = at::zeros({batch_size, model_config.num_features, chunk_size},
at::TensorOptions().dtype(m_decoder->dtype()).device(at::kCPU));
}

std::vector<decode::DecodedChunk> ModelRunner::call_chunks(int num_chunks) {
at::InferenceMode guard;
dorado::stats::Timer timer;
auto scores = m_module->forward(m_input.to(m_options.device()));
const auto forward_ms = timer.GetElapsedMS();
auto decoded_chunks = m_decoder->beam_search_part_2(
m_decoder->beam_search_part_1({scores, num_chunks, m_decoder_options}));
const auto forward_plus_decode_ms = timer.GetElapsedMS();
++m_num_batches_called;
m_model_ms += forward_ms;
m_decode_ms += forward_plus_decode_ms - forward_ms;
return decoded_chunks;
}

void ModelRunner::accept_chunk(int chunk_idx, const at::Tensor &chunk) {
m_input.index_put_({chunk_idx, at::indexing::Ellipsis}, chunk);
}

stats::NamedStats ModelRunner::sample_stats() const {
stats::NamedStats stats;
stats["batches_called"] = double(m_num_batches_called);
stats["model_ms"] = double(m_model_ms);
stats["decode_ms"] = double(m_decode_ms);
return stats;
}

} // namespace dorado::basecall
73 changes: 3 additions & 70 deletions dorado/basecall/ModelRunner.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#pragma once

#include "CRFModel.h"
#include "CRFModelConfig.h"
#include "ModelRunnerBase.h"
#include "decode/Decoder.h"
#include "utils/stats.h"

Expand All @@ -12,24 +12,6 @@

namespace dorado::basecall {

class ModelRunnerBase {
public:
virtual ~ModelRunnerBase() = default;
virtual void accept_chunk(int chunk_idx, const at::Tensor &chunk) = 0;
virtual std::vector<decode::DecodedChunk> call_chunks(int num_chunks) = 0;
virtual const CRFModelConfig &config() const = 0;
virtual size_t model_stride() const = 0;
virtual size_t chunk_size() const = 0;
virtual size_t batch_size() const = 0;
virtual void terminate() = 0;
virtual void restart() = 0;
virtual std::string get_name() const = 0;
virtual stats::NamedStats sample_stats() const = 0;
};

using RunnerPtr = std::unique_ptr<ModelRunnerBase>;

template <typename T>
class ModelRunner final : public ModelRunnerBase {
public:
ModelRunner(const CRFModelConfig &model_config,
Expand All @@ -49,65 +31,16 @@ class ModelRunner final : public ModelRunnerBase {

private:
const CRFModelConfig m_config;
at::Tensor m_input;
std::unique_ptr<decode::Decoder> m_decoder;
at::TensorOptions m_options;
std::unique_ptr<T> m_decoder;
decode::DecoderOptions m_decoder_options;
torch::nn::ModuleHolder<torch::nn::AnyModule> m_module{nullptr};
at::Tensor m_input;

// Performance monitoring stats.
std::atomic<int64_t> m_num_batches_called = 0;
std::atomic<int64_t> m_model_ms = 0;
std::atomic<int64_t> m_decode_ms = 0;
};

template <typename T>
ModelRunner<T>::ModelRunner(const CRFModelConfig &model_config,
const std::string &device,
int chunk_size,
int batch_size)
: m_config(model_config) {
m_decoder_options = decode::DecoderOptions();
m_decoder_options.q_shift = model_config.qbias;
m_decoder_options.q_scale = model_config.qscale;
m_decoder = std::make_unique<T>();

m_options = at::TensorOptions().dtype(T::dtype).device(device);
m_module = load_crf_model(model_config, m_options);

// adjust chunk size to be a multiple of the stride
chunk_size -= chunk_size % model_config.stride;

m_input = at::zeros({batch_size, model_config.num_features, chunk_size},
at::TensorOptions().dtype(T::dtype).device(at::kCPU));
}

template <typename T>
std::vector<decode::DecodedChunk> ModelRunner<T>::call_chunks(int num_chunks) {
at::InferenceMode guard;
dorado::stats::Timer timer;
auto scores = m_module->forward(m_input.to(m_options.device_opt().value()));
const auto forward_ms = timer.GetElapsedMS();
auto decoded_chunks = m_decoder->beam_search(scores, num_chunks, m_decoder_options);
const auto forward_plus_decode_ms = timer.GetElapsedMS();
++m_num_batches_called;
m_model_ms += forward_ms;
m_decode_ms += forward_plus_decode_ms - forward_ms;
return decoded_chunks;
}

template <typename T>
void ModelRunner<T>::accept_chunk(int chunk_idx, const at::Tensor &chunk) {
m_input.index_put_({chunk_idx, at::indexing::Ellipsis}, chunk);
}

template <typename T>
stats::NamedStats ModelRunner<T>::sample_stats() const {
stats::NamedStats stats;
stats["batches_called"] = double(m_num_batches_called);
stats["model_ms"] = double(m_model_ms);
stats["decode_ms"] = double(m_decode_ms);
return stats;
}

} // namespace dorado::basecall
34 changes: 34 additions & 0 deletions dorado/basecall/ModelRunnerBase.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#pragma once

#include "decode/Decoder.h"
#include "utils/stats.h"

#include <string>
#include <vector>

namespace at {
class Tensor;
}

namespace dorado::basecall {

struct CRFModelConfig;

class ModelRunnerBase {
public:
virtual ~ModelRunnerBase() = default;
virtual void accept_chunk(int chunk_idx, const at::Tensor &chunk) = 0;
virtual std::vector<decode::DecodedChunk> call_chunks(int num_chunks) = 0;
virtual const CRFModelConfig &config() const = 0;
virtual size_t model_stride() const = 0;
virtual size_t chunk_size() const = 0;
virtual size_t batch_size() const = 0;
virtual void terminate() = 0;
virtual void restart() = 0;
virtual std::string get_name() const = 0;
virtual stats::NamedStats sample_stats() const = 0;
};

using RunnerPtr = std::unique_ptr<ModelRunnerBase>;

} // namespace dorado::basecall

0 comments on commit 5fa4de7

Please sign in to comment.