diff --git a/dorado/api/pipeline_creation.cpp b/dorado/api/pipeline_creation.cpp
index b7a20896..9262bc1d 100644
--- a/dorado/api/pipeline_creation.cpp
+++ b/dorado/api/pipeline_creation.cpp
@@ -1,7 +1,7 @@
 #include "pipeline_creation.h"
 
 #include "basecall/CRFModelConfig.h"
-#include "basecall/ModelRunner.h"
+#include "basecall/ModelRunnerBase.h"
 #include "modbase/ModBaseRunner.h"
 #include "read_pipeline/BasecallerNode.h"
 #include "read_pipeline/ModBaseCallerNode.h"
diff --git a/dorado/api/runner_creation.cpp b/dorado/api/runner_creation.cpp
index 80224986..2fc55b9e 100644
--- a/dorado/api/runner_creation.cpp
+++ b/dorado/api/runner_creation.cpp
@@ -1,7 +1,6 @@
 #include "runner_creation.h"
 
 #include "basecall/crf_utils.h"
-#include "basecall/decode/CPUDecoder.h"
 
 #if DORADO_GPU_BUILD
 #ifdef __APPLE__
@@ -53,7 +52,7 @@ std::pair<std::vector<basecall::RunnerPtr>, size_t> create_basecall_runners(
                       num_cpu_runners);
 
         for (size_t i = 0; i < num_cpu_runners; i++) {
-            runners.push_back(std::make_unique<basecall::ModelRunner<basecall::decode::CPUDecoder>>(
+            runners.push_back(std::make_unique<basecall::ModelRunner>(
                     model_config, device, int(chunk_size), int(batch_size)));
         }
     }
diff --git a/dorado/basecall/CMakeLists.txt b/dorado/basecall/CMakeLists.txt
index 0bed0d3d..ed2b06f4 100644
--- a/dorado/basecall/CMakeLists.txt
+++ b/dorado/basecall/CMakeLists.txt
@@ -1,41 +1,44 @@
 add_library(dorado_basecall STATIC
-    crf_utils.h
     crf_utils.cpp
-    CRFModel.h
+    crf_utils.h
     CRFModel.cpp
-    CRFModelConfig.h
+    CRFModel.h
     CRFModelConfig.cpp
+    CRFModelConfig.h
+    ModelRunner.cpp
     ModelRunner.h
+    ModelRunnerBase.h
     decode/beam_search.cpp
     decode/beam_search.h
     decode/CPUDecoder.cpp
     decode/CPUDecoder.h
+    decode/Decoder.cpp
+    decode/Decoder.h
 )
 
 if (DORADO_GPU_BUILD)
     if(APPLE)
         target_sources(dorado_basecall
             PRIVATE
-                MetalCRFModel.h
                 MetalCRFModel.cpp
+                MetalCRFModel.h
         )
     else()
         target_sources(dorado_basecall
             PRIVATE
-                CudaCRFModel.h
                 CudaCRFModel.cpp
-                decode/GPUDecoder.cpp
-                decode/GPUDecoder.h
+                CudaCRFModel.h
+                decode/CUDADecoder.cpp
+                decode/CUDADecoder.h
         )
     endif()
 endif()
 
 target_include_directories(dorado_basecall
     SYSTEM
-    PUBLIC
-        ${DORADO_3RD_PARTY_SOURCE}/toml11
     PRIVATE
         ${DORADO_3RD_PARTY_SOURCE}/NVTX/c/include
+        ${DORADO_3RD_PARTY_SOURCE}/toml11
 )
diff --git a/dorado/basecall/CRFModel.cpp b/dorado/basecall/CRFModel.cpp
index 0d9e3a7a..e5dac7af 100644
--- a/dorado/basecall/CRFModel.cpp
+++ b/dorado/basecall/CRFModel.cpp
@@ -871,7 +871,7 @@ struct CRFModelImpl : Module {
         }
 
         // Clamping the scores to [-5, 5], if active (i.e. the role of `clamp1`), is performed by
-        // `GPUDecoder` on reading the scores. This eliminates the cost of a large matrix
+        // `CUDADecoder` on reading the scores. This eliminates the cost of a large matrix
         // read-modify-write operation.
         // Output is [N, T, C], F16, contiguous
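Note on the clamp comment above: fusing the clamp into the decoder's score load turns a full read-modify-write pass over the [N, T, C] tensor into a clamp applied per element as beam search consumes the scores. A minimal C++ sketch of the idea only (the helper name is mine; the real work happens inside the koi CUDA kernels, with a clamp value of 0 meaning clamping is disabled):

    #include <algorithm>
    #include <cstddef>

    // Clamp on read: the score tensor is only ever read, never written back.
    inline float load_score(const float *scores, std::size_t idx, float score_clamp_val) {
        const float s = scores[idx];
        return score_clamp_val > 0.f ? std::clamp(s, -score_clamp_val, score_clamp_val) : s;
    }
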
diff --git a/dorado/basecall/CudaCRFModel.cpp b/dorado/basecall/CudaCRFModel.cpp
index 09bbc3a7..2205dddf 100644
--- a/dorado/basecall/CudaCRFModel.cpp
+++ b/dorado/basecall/CudaCRFModel.cpp
@@ -1,7 +1,7 @@
 #include "CudaCRFModel.h"
 
 #include "CRFModelConfig.h"
-#include "decode/GPUDecoder.h"
+#include "decode/Decoder.h"
 #include "utils/cuda_utils.h"
 #include "utils/math_utils.h"
 
@@ -31,19 +31,18 @@ class CudaCaller {
                bool exclusive_gpu_access)
             : m_config(model_config),
               m_device(device),
+              m_decoder(decode::create_decoder(device, m_config)),
+              m_options(at::TensorOptions().dtype(m_decoder->dtype()).device(device)),
               m_exclusive_gpu_access(exclusive_gpu_access) {
-        m_decoder_options = decode::DecoderOptions();
+        assert(m_options.device().is_cuda());
+
         m_decoder_options.q_shift = model_config.qbias;
         m_decoder_options.q_scale = model_config.qscale;
-        m_decoder = std::make_unique<decode::GPUDecoder>(model_config.clamp ? 5.f : 0.f);
         m_num_input_features = model_config.num_features;
         // adjust chunk size to be a multiple of the stride
         m_out_chunk_size = chunk_size / model_config.stride;
         m_in_chunk_size = m_out_chunk_size * model_config.stride;
 
-        m_options = at::TensorOptions().dtype(decode::GPUDecoder::dtype).device(device);
-        assert(m_options.device().is_cuda());
-
         at::InferenceMode guard;
         m_module = load_crf_model(model_config, m_options);
 
@@ -155,8 +154,7 @@ class CudaCaller {
         // Determine size of working memory for decoder divided by (batch_size * chunk_size)
         // Decoder needs roughly (beam_width * 4) + num_states + 10 extra bytes
        // where num_states = 4^(state_len+1)
-        // See `dorado::basecall::decode::GPUDecoder::gpu_part()`, block beginning with `if (!initialized) {`
-        // for more details.
+        // See `dorado::basecall::decode::CUDADecoder::beam_search_part_1()` for more details.
         int64_t decode_bytes_per_chunk_timestep =
                 10 + m_decoder_options.beam_width * 4 + (1ull << (model_config.state_len * 2 + 2));
 
@@ -222,9 +220,10 @@ class CudaCaller {
     }
 
     struct NNTask {
-        NNTask(at::Tensor input_) : input(input_) {}
+        NNTask(at::Tensor input_, int num_chunks_) : input(input_), num_chunks(num_chunks_) {}
         at::Tensor input;
-        at::Tensor out;
+        int num_chunks;
+        decode::DecodeData out;
         std::mutex mut;
         std::condition_variable cv;
         bool done{false};
@@ -241,7 +240,7 @@ class CudaCaller {
             return std::vector<decode::DecodedChunk>();
         }
 
-        auto task = std::make_shared<NNTask>(input.to(m_options.device()));
+        auto task = std::make_shared<NNTask>(input.to(m_options.device()), num_chunks);
         {
             std::lock_guard lock(m_input_lock);
             m_input_queue.push_front(task);
@@ -253,8 +252,8 @@ class CudaCaller {
             task->cv.wait(lock);
         }
 
-        output.copy_(task->out);
-        return m_decoder->cpu_part(output);
+        output.copy_(task->out.data);
+        return m_decoder->beam_search_part_2({output, num_chunks, m_decoder_options});
     }
 
     void cuda_thread_fn() {
@@ -322,7 +321,8 @@ class CudaCaller {
             stats::Timer timer;
             auto scores = m_module->forward(task->input);
             const auto forward_ms = timer.GetElapsedMS();
-            task->out = m_decoder->gpu_part(scores, m_decoder_options);
+            task->out = m_decoder->beam_search_part_1(
+                    {scores, task->num_chunks, m_decoder_options});
             stream.synchronize();
             const auto forward_plus_decode_ms = timer.GetElapsedMS();
             m_model_ms += forward_ms;
@@ -372,9 +372,9 @@ class CudaCaller {
 
     const CRFModelConfig m_config;
     std::string m_device;
-    at::TensorOptions m_options;
-    std::unique_ptr<decode::GPUDecoder> m_decoder;
+    std::unique_ptr<decode::Decoder> m_decoder;
     decode::DecoderOptions m_decoder_options;
+    at::TensorOptions m_options;
     torch::nn::ModuleHolder<torch::nn::AnyModule> m_module{nullptr};
     std::atomic<bool> m_terminate{false};
     std::deque<std::shared_ptr<NNTask>> m_input_queue;
diff --git a/dorado/basecall/CudaCRFModel.h b/dorado/basecall/CudaCRFModel.h
index ce00ebfc..df0dc111 100644
--- a/dorado/basecall/CudaCRFModel.h
+++ b/dorado/basecall/CudaCRFModel.h
@@ -1,11 +1,12 @@
 #pragma once
 
 #include "CRFModel.h"
-#include "ModelRunner.h"
+#include "ModelRunnerBase.h"
 
 #include <c10/cuda/CUDAStream.h>
 #include <torch/nn.h>
+#include <memory>
 #include <atomic>
 #include <string>
 #include <vector>
 
diff --git a/dorado/basecall/MetalCRFModel.h b/dorado/basecall/MetalCRFModel.h
index 927faef5..d6029b77 100644
--- a/dorado/basecall/MetalCRFModel.h
+++ b/dorado/basecall/MetalCRFModel.h
@@ -1,6 +1,6 @@
 #pragma once
 
-#include "ModelRunner.h"
+#include "ModelRunnerBase.h"
 
 #include <torch/nn.h>
diff --git a/dorado/basecall/ModelRunner.cpp b/dorado/basecall/ModelRunner.cpp
new file mode 100644
index 00000000..22c6a331
--- /dev/null
+++ b/dorado/basecall/ModelRunner.cpp
@@ -0,0 +1,52 @@
+#include "ModelRunner.h"
+
+#include "CRFModel.h"
+#include "decode/Decoder.h"
+
+namespace dorado::basecall {
+
+ModelRunner::ModelRunner(const CRFModelConfig &model_config,
+                         const std::string &device,
+                         int chunk_size,
+                         int batch_size)
+        : m_config(model_config),
+          m_decoder(decode::create_decoder(device, model_config)),
+          m_options(at::TensorOptions().dtype(m_decoder->dtype()).device(device)),
+          m_module(load_crf_model(model_config, m_options)) {
+    m_decoder_options.q_shift = model_config.qbias;
+    m_decoder_options.q_scale = model_config.qscale;
+
+    // adjust chunk size to be a multiple of the stride
+    chunk_size -= chunk_size % model_config.stride;
+
+    m_input = at::zeros({batch_size, model_config.num_features, chunk_size},
+                        at::TensorOptions().dtype(m_decoder->dtype()).device(at::kCPU));
+}
+
+std::vector<decode::DecodedChunk> ModelRunner::call_chunks(int num_chunks) {
+    at::InferenceMode guard;
+    dorado::stats::Timer timer;
+    auto scores = m_module->forward(m_input.to(m_options.device()));
+    const auto forward_ms = timer.GetElapsedMS();
+    auto decoded_chunks = m_decoder->beam_search_part_2(
+            m_decoder->beam_search_part_1({scores, num_chunks, m_decoder_options}));
+    const auto forward_plus_decode_ms = timer.GetElapsedMS();
+    ++m_num_batches_called;
+    m_model_ms += forward_ms;
+    m_decode_ms += forward_plus_decode_ms - forward_ms;
+    return decoded_chunks;
+}
+
+void ModelRunner::accept_chunk(int chunk_idx, const at::Tensor &chunk) {
+    m_input.index_put_({chunk_idx, at::indexing::Ellipsis}, chunk);
+}
+
+stats::NamedStats ModelRunner::sample_stats() const {
+    stats::NamedStats stats;
+    stats["batches_called"] = double(m_num_batches_called);
+    stats["model_ms"] = double(m_model_ms);
+    stats["decode_ms"] = double(m_decode_ms);
+    return stats;
+}
+
+}  // namespace dorado::basecall
diff --git a/dorado/basecall/ModelRunner.h b/dorado/basecall/ModelRunner.h
index a9dee625..a3e16a0a 100644
--- a/dorado/basecall/ModelRunner.h
+++ b/dorado/basecall/ModelRunner.h
@@ -1,7 +1,7 @@
 #pragma once
 
-#include "CRFModel.h"
 #include "CRFModelConfig.h"
+#include "ModelRunnerBase.h"
 #include "decode/Decoder.h"
 #include "utils/stats.h"
 
@@ -12,24 +12,6 @@
 
 namespace dorado::basecall {
 
-class ModelRunnerBase {
-public:
-    virtual ~ModelRunnerBase() = default;
-    virtual void accept_chunk(int chunk_idx, const at::Tensor &chunk) = 0;
-    virtual std::vector<decode::DecodedChunk> call_chunks(int num_chunks) = 0;
-    virtual const CRFModelConfig &config() const = 0;
-    virtual size_t model_stride() const = 0;
-    virtual size_t chunk_size() const = 0;
-    virtual size_t batch_size() const = 0;
-    virtual void terminate() = 0;
-    virtual void restart() = 0;
-    virtual std::string get_name() const = 0;
-    virtual stats::NamedStats sample_stats() const = 0;
-};
-
-using RunnerPtr = std::unique_ptr<ModelRunnerBase>;
-
-template <typename T>
 class ModelRunner final : public ModelRunnerBase {
 public:
     ModelRunner(const CRFModelConfig &model_config,
@@ -49,11 +31,11 @@ class ModelRunner final : public ModelRunnerBase {
 
 private:
     const CRFModelConfig m_config;
-    at::Tensor m_input;
+    std::unique_ptr<decode::Decoder> m_decoder;
     at::TensorOptions m_options;
-    std::unique_ptr<T> m_decoder;
     decode::DecoderOptions m_decoder_options;
     torch::nn::ModuleHolder<torch::nn::AnyModule> m_module{nullptr};
+    at::Tensor m_input;
 
     // Performance monitoring stats.
    std::atomic<int64_t> m_num_batches_called = 0;
@@ -61,53 +43,4 @@ class ModelRunner final : public ModelRunnerBase {
     std::atomic<int64_t> m_decode_ms = 0;
 };
 
-template <typename T>
-ModelRunner<T>::ModelRunner(const CRFModelConfig &model_config,
-                            const std::string &device,
-                            int chunk_size,
-                            int batch_size)
-        : m_config(model_config) {
-    m_decoder_options = decode::DecoderOptions();
-    m_decoder_options.q_shift = model_config.qbias;
-    m_decoder_options.q_scale = model_config.qscale;
-    m_decoder = std::make_unique<T>();
-
-    m_options = at::TensorOptions().dtype(T::dtype).device(device);
-    m_module = load_crf_model(model_config, m_options);
-
-    // adjust chunk size to be a multiple of the stride
-    chunk_size -= chunk_size % model_config.stride;
-
-    m_input = at::zeros({batch_size, model_config.num_features, chunk_size},
-                        at::TensorOptions().dtype(T::dtype).device(at::kCPU));
-}
-
-template <typename T>
-std::vector<decode::DecodedChunk> ModelRunner<T>::call_chunks(int num_chunks) {
-    at::InferenceMode guard;
-    dorado::stats::Timer timer;
-    auto scores = m_module->forward(m_input.to(m_options.device_opt().value()));
-    const auto forward_ms = timer.GetElapsedMS();
-    auto decoded_chunks = m_decoder->beam_search(scores, num_chunks, m_decoder_options);
-    const auto forward_plus_decode_ms = timer.GetElapsedMS();
-    ++m_num_batches_called;
-    m_model_ms += forward_ms;
-    m_decode_ms += forward_plus_decode_ms - forward_ms;
-    return decoded_chunks;
-}
-
-template <typename T>
-void ModelRunner<T>::accept_chunk(int chunk_idx, const at::Tensor &chunk) {
-    m_input.index_put_({chunk_idx, at::indexing::Ellipsis}, chunk);
-}
-
-template <typename T>
-stats::NamedStats ModelRunner<T>::sample_stats() const {
-    stats::NamedStats stats;
-    stats["batches_called"] = double(m_num_batches_called);
-    stats["model_ms"] = double(m_model_ms);
-    stats["decode_ms"] = double(m_decode_ms);
-    return stats;
-}
-
 }  // namespace dorado::basecall
diff --git a/dorado/basecall/ModelRunnerBase.h b/dorado/basecall/ModelRunnerBase.h
new file mode 100644
index 00000000..2aafc461
--- /dev/null
+++ b/dorado/basecall/ModelRunnerBase.h
@@ -0,0 +1,34 @@
+#pragma once
+
+#include "decode/Decoder.h"
+#include "utils/stats.h"
+
+#include <memory>
+#include <string>
+
+namespace at {
+class Tensor;
+}
+
+namespace dorado::basecall {
+
+struct CRFModelConfig;
+
+class ModelRunnerBase {
+public:
+    virtual ~ModelRunnerBase() = default;
+    virtual void accept_chunk(int chunk_idx, const at::Tensor &chunk) = 0;
+    virtual std::vector<decode::DecodedChunk> call_chunks(int num_chunks) = 0;
+    virtual const CRFModelConfig &config() const = 0;
+    virtual size_t model_stride() const = 0;
+    virtual size_t chunk_size() const = 0;
+    virtual size_t batch_size() const = 0;
+    virtual void terminate() = 0;
+    virtual void restart() = 0;
+    virtual std::string get_name() const = 0;
+    virtual stats::NamedStats sample_stats() const = 0;
+};
+
+using RunnerPtr = std::unique_ptr<ModelRunnerBase>;
+
+}  // namespace dorado::basecall
diff --git a/dorado/basecall/decode/CPUDecoder.cpp b/dorado/basecall/decode/CPUDecoder.cpp
index f405c003..e138b7ab 100644
--- a/dorado/basecall/decode/CPUDecoder.cpp
+++ b/dorado/basecall/decode/CPUDecoder.cpp
@@ -88,10 +88,12 @@ at::Tensor backward_scores(const at::Tensor& scores, const float fixed_stay_score
 
 namespace dorado::basecall::decode {
 
-std::vector<DecodedChunk> CPUDecoder::beam_search(const at::Tensor& scores,
-                                                  const int num_chunks,
-                                                  const DecoderOptions& options) {
-    const auto scores_cpu = scores.to(at::kCPU);
+DecodeData CPUDecoder::beam_search_part_1(DecodeData data) const { return data; }
+
+std::vector<DecodedChunk> CPUDecoder::beam_search_part_2(DecodeData data) const {
+    const auto scores_cpu = data.data.to(at::kCPU);
+    const auto num_chunks = data.num_chunks;
+    const auto& options = data.options;
     int num_threads = std::min(num_chunks, 4);
     int chunks_per_thread = num_chunks / num_threads;
     int num_threads_with_one_more_chunk = num_chunks % num_threads;
diff --git a/dorado/basecall/decode/CPUDecoder.h b/dorado/basecall/decode/CPUDecoder.h
index b2011d5c..d1c1924e 100644
--- a/dorado/basecall/decode/CPUDecoder.h
+++ b/dorado/basecall/decode/CPUDecoder.h
@@ -6,12 +6,12 @@
 
 namespace dorado::basecall::decode {
 
-class CPUDecoder final : Decoder {
+class CPUDecoder final : public Decoder {
 public:
-    std::vector<DecodedChunk> beam_search(const at::Tensor& scores,
-                                          int num_chunks,
-                                          const DecoderOptions& options) final;
-    constexpr static at::ScalarType dtype = at::ScalarType::Float;
+    DecodeData beam_search_part_1(DecodeData data) const;
+    std::vector<DecodedChunk> beam_search_part_2(DecodeData data) const;
+
+    at::ScalarType dtype() const { return at::ScalarType::Float; };
 };
 
 }  // namespace dorado::basecall::decode
diff --git a/dorado/basecall/decode/GPUDecoder.cpp b/dorado/basecall/decode/CUDADecoder.cpp
similarity index 89%
rename from dorado/basecall/decode/GPUDecoder.cpp
rename to dorado/basecall/decode/CUDADecoder.cpp
index 7561371b..8db11227 100644
--- a/dorado/basecall/decode/GPUDecoder.cpp
+++ b/dorado/basecall/decode/CUDADecoder.cpp
@@ -1,6 +1,5 @@
-#include "GPUDecoder.h"
+#include "CUDADecoder.h"
 
-#include "Decoder.h"
 #include "utils/cuda_utils.h"
 #include "utils/gpu_profiling.h"
 
@@ -13,7 +12,10 @@ extern "C" {
 
 namespace dorado::basecall::decode {
 
-at::Tensor GPUDecoder::gpu_part(at::Tensor scores, DecoderOptions options) {
+DecodeData CUDADecoder::beam_search_part_1(DecodeData data) const {
+    auto scores = data.data;
+    auto &options = data.options;
+
     c10::cuda::CUDAGuard device_guard(scores.device());
     utils::ScopedProfileRange loop{"gpu_decode", 1};
     long int N = (long int)(scores.sizes()[0]);
@@ -78,10 +80,13 @@
                 sequence.data_ptr(), qstring.data_ptr(), options.q_scale, options.q_shift,
                 int(options.beam_width), options.beam_cut, options.blank_score, options.move_pad));
     }
-    return moves_sequence_qstring.reshape({3, N, -1});
+
+    data.data = moves_sequence_qstring.reshape({3, N, -1});
+    return data;
 }
 
-std::vector<DecodedChunk> GPUDecoder::cpu_part(at::Tensor moves_sequence_qstring_cpu) {
+std::vector<DecodedChunk> CUDADecoder::beam_search_part_2(DecodeData data) const {
+    auto moves_sequence_qstring_cpu = data.data;
     nvtx3::scoped_range loop{"cpu_decode"};
     assert(moves_sequence_qstring_cpu.device() == at::kCPU);
     auto moves_cpu = moves_sequence_qstring_cpu[0];
@@ -107,10 +112,4 @@
     return called_chunks;
 }
 
-std::vector<DecodedChunk> GPUDecoder::beam_search(const at::Tensor &scores,
-                                                  int,
-                                                  const DecoderOptions &options) {
-    return cpu_part(gpu_part(scores, options));
-}
-
 }  // namespace dorado::basecall::decode
diff --git a/dorado/basecall/decode/CUDADecoder.h b/dorado/basecall/decode/CUDADecoder.h
new file mode 100644
index 00000000..6d5a7a8d
--- /dev/null
+++ b/dorado/basecall/decode/CUDADecoder.h
@@ -0,0 +1,25 @@
+#pragma once
+
+#include "Decoder.h"
+
+#include <ATen/core/TensorBody.h>
+
+namespace dorado::basecall::decode {
+
+class CUDADecoder final : public Decoder {
+public:
+    explicit CUDADecoder(float score_clamp_val) : m_score_clamp_val(score_clamp_val) {}
+
+    // We split beam_search into two parts, the first one running on the GPU and the second
+    // one on the CPU. While the second part is running we can submit more commands to the GPU
+    // on another thread.
+    DecodeData beam_search_part_1(DecodeData data) const;
+    std::vector<DecodedChunk> beam_search_part_2(DecodeData data) const;
+
+    at::ScalarType dtype() const { return at::ScalarType::Half; };
+
+private:
+    float m_score_clamp_val;
+};
+
+}  // namespace dorado::basecall::decode
diff --git a/dorado/basecall/decode/Decoder.cpp b/dorado/basecall/decode/Decoder.cpp
new file mode 100644
index 00000000..b30bfb50
--- /dev/null
+++ b/dorado/basecall/decode/Decoder.cpp
@@ -0,0 +1,29 @@
+#include "Decoder.h"
+
+#if DORADO_GPU_BUILD && !defined(__APPLE__)
+#include "CUDADecoder.h"
+#endif
+
+#include "CPUDecoder.h"
+#include "basecall/CRFModelConfig.h"
+
+#include <stdexcept>
+
+namespace dorado::basecall::decode {
+
+std::unique_ptr<Decoder> create_decoder(c10::Device device, const CRFModelConfig& config) {
+#if DORADO_GPU_BUILD && !defined(__APPLE__)
+    if (device.is_cuda()) {
+        return std::make_unique<CUDADecoder>(config.clamp ? 5.f : 0.f);
+    }
+#else
+    (void)config;  // unused in other build types
+#endif
+    if (device.is_cpu()) {
+        return std::make_unique<CPUDecoder>();
+    }
+
+    throw std::runtime_error("Unsupported device type for decoder creation: " + device.str());
+}
+
+}  // namespace dorado::basecall::decode
diff --git a/dorado/basecall/decode/Decoder.h b/dorado/basecall/decode/Decoder.h
index 09ef76bf..f34b6bbc 100644
--- a/dorado/basecall/decode/Decoder.h
+++ b/dorado/basecall/decode/Decoder.h
@@ -2,9 +2,15 @@
 
 #include <ATen/core/TensorBody.h>
+#include <c10/core/Device.h>
+#include <memory>
 #include <string>
 #include <vector>
 
+namespace dorado::basecall {
+struct CRFModelConfig;
+}
+
 namespace dorado::basecall::decode {
 
 struct DecodedChunk {
@@ -23,11 +29,21 @@ struct DecoderOptions {
     bool move_pad = false;
 };
 
+struct DecodeData {
+    at::Tensor data;
+    int num_chunks;
+    DecoderOptions options;
+};
+
 class Decoder {
 public:
-    virtual std::vector<DecodedChunk> beam_search(const at::Tensor& scores,
-                                                  int num_chunks,
-                                                  const DecoderOptions& options) = 0;
+    virtual ~Decoder() = default;
+    virtual DecodeData beam_search_part_1(DecodeData data) const = 0;
+    virtual std::vector<DecodedChunk> beam_search_part_2(DecodeData data) const = 0;
+    // Returns the torch::TensorOptions::dtype to use for input data to models that use this decoder
+    virtual at::ScalarType dtype() const = 0;
 };
 
+std::unique_ptr<Decoder> create_decoder(c10::Device device, const CRFModelConfig& config);
+
 }  // namespace dorado::basecall::decode
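With the factory in place, both CudaCaller and ModelRunner select a decoder and its tensor dtype the same way; neither names a concrete decoder type any more. A sketch of the shared pattern (the function name is mine, assuming a valid config):

    #include "basecall/decode/Decoder.h"

    #include <string>
    #include <utility>

    auto make_decoder_and_options(const dorado::basecall::CRFModelConfig &config,
                                  const std::string &device) {
        using namespace dorado::basecall;
        // CUDA devices get CUDADecoder (F16 scores, clamp folded into the read);
        // "cpu" gets CPUDecoder (F32 scores); anything else throws std::runtime_error.
        auto decoder = decode::create_decoder(c10::Device(device), config);
        auto options = at::TensorOptions().dtype(decoder->dtype()).device(device);
        return std::make_pair(std::move(decoder), options);
    }
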
diff --git a/dorado/basecall/decode/GPUDecoder.h b/dorado/basecall/decode/GPUDecoder.h
deleted file mode 100644
index 46ff4d28..00000000
--- a/dorado/basecall/decode/GPUDecoder.h
+++ /dev/null
@@ -1,27 +0,0 @@
-#pragma once
-
-#include "Decoder.h"
-
-#include <ATen/core/TensorBody.h>
-
-namespace dorado::basecall::decode {
-
-class GPUDecoder final : Decoder {
-public:
-    explicit GPUDecoder(float score_clamp_val) : m_score_clamp_val(score_clamp_val) {}
-
-    std::vector<DecodedChunk> beam_search(const at::Tensor& scores,
-                                          int num_chunks,
-                                          const DecoderOptions& options) final;
-    constexpr static at::ScalarType dtype = at::ScalarType::Half;
-
-    // We split beam_search into two parts, the first one running on the GPU and the second
-    // one on the CPU. While the second part is running we can submit more commands to the GPU
-    // on another thread.
-    at::Tensor gpu_part(at::Tensor scores, DecoderOptions options);
-    std::vector<DecodedChunk> cpu_part(at::Tensor moves_sequence_qstring_cpu);
-
-    float m_score_clamp_val;
-};
-
-}  // namespace dorado::basecall::decode
diff --git a/dorado/read_pipeline/BasecallerNode.cpp b/dorado/read_pipeline/BasecallerNode.cpp
index c39882f6..cc81b494 100644
--- a/dorado/read_pipeline/BasecallerNode.cpp
+++ b/dorado/read_pipeline/BasecallerNode.cpp
@@ -1,10 +1,11 @@
 #include "BasecallerNode.h"
 
-#include "basecall/ModelRunner.h"
-#include "basecall/decode/CPUDecoder.h"
+#include "basecall/CRFModelConfig.h"
+#include "basecall/ModelRunnerBase.h"
 #include "stitch.h"
 #include "utils/stats.h"
 
+#include <ATen/ATen.h>
 #include <chrono>
 #include <cstdlib>
 
@@ -15,7 +16,7 @@
 #endif
 
 using namespace std::chrono_literals;
-using namespace torch::indexing;
+using namespace at::indexing;
 
 namespace dorado {
@@ -242,15 +243,13 @@ void BasecallerNode::basecall_worker_thread(int worker_id) {
         if (slice_size != m_chunk_size) {
             if (input_slice.ndimension() == 1) {
                 auto [n, overhang] = std::div((int)m_chunk_size, (int)slice_size);
-                input_slice = torch::concat(
-                        {input_slice.repeat({n}),
-                         input_slice.index({Ellipsis, torch::indexing::Slice(0, overhang)})});
+                input_slice = at::concat({input_slice.repeat({n}),
+                                          input_slice.index({Ellipsis, Slice(0, overhang)})});
             } else if (input_slice.ndimension() == 2) {
                 auto [n, overhang] = std::div((int)m_chunk_size, (int)slice_size);
-                input_slice = torch::concat(
-                        {input_slice.repeat({1, n}),
-                         input_slice.index({Ellipsis, torch::indexing::Slice(0, overhang)})},
-                        1);
+                input_slice = at::concat({input_slice.repeat({1, n}),
+                                          input_slice.index({Ellipsis, Slice(0, overhang)})},
+                                         1);
             }
         }
 
diff --git a/tests/NodeSmokeTest.cpp b/tests/NodeSmokeTest.cpp
index ef86c218..9f8087d8 100644
--- a/tests/NodeSmokeTest.cpp
+++ b/tests/NodeSmokeTest.cpp
@@ -3,7 +3,6 @@
 #include "basecall/CRFModel.h"
 #include "basecall/CRFModelConfig.h"
 #include "basecall/ModelRunner.h"
-#include "basecall/decode/CPUDecoder.h"
 #include "modbase/ModBaseModel.h"
 #include "modbase/ModBaseRunner.h"
 #include "models/models.h"
@@ -243,8 +242,7 @@ DEFINE_TEST(NodeSmokeTestRead, "BasecallerNode") {
     set_num_reads(5);
     set_expected_messages(5);
     batch_size = 8;
-    runners.push_back(std::make_unique<
-            dorado::basecall::ModelRunner<dorado::basecall::decode::CPUDecoder>>(
+    runners.push_back(std::make_unique<dorado::basecall::ModelRunner>(
             model_config, "cpu", default_params.chunksize, int(batch_size)));
 }