diff --git a/dorado/api/pipeline_creation.cpp b/dorado/api/pipeline_creation.cpp
index b7a20896..9262bc1d 100644
--- a/dorado/api/pipeline_creation.cpp
+++ b/dorado/api/pipeline_creation.cpp
@@ -1,7 +1,7 @@
 #include "pipeline_creation.h"
 
 #include "basecall/CRFModelConfig.h"
-#include "basecall/ModelRunner.h"
+#include "basecall/ModelRunnerBase.h"
 #include "modbase/ModBaseRunner.h"
 #include "read_pipeline/BasecallerNode.h"
 #include "read_pipeline/ModBaseCallerNode.h"
diff --git a/dorado/api/runner_creation.cpp b/dorado/api/runner_creation.cpp
index 80224986..2fc55b9e 100644
--- a/dorado/api/runner_creation.cpp
+++ b/dorado/api/runner_creation.cpp
@@ -1,7 +1,6 @@
 #include "runner_creation.h"
 
 #include "basecall/crf_utils.h"
-#include "basecall/decode/CPUDecoder.h"
 
 #if DORADO_GPU_BUILD
 #ifdef __APPLE__
@@ -53,7 +52,7 @@ std::pair<std::vector<basecall::RunnerPtr>, size_t> create_basecall_runners(
                       num_cpu_runners);
 
         for (size_t i = 0; i < num_cpu_runners; i++) {
-            runners.push_back(std::make_unique<basecall::ModelRunner<basecall::decode::CPUDecoder>>(
+            runners.push_back(std::make_unique<basecall::ModelRunner>(
                     model_config, device, int(chunk_size), int(batch_size)));
         }
     }
diff --git a/dorado/basecall/CMakeLists.txt b/dorado/basecall/CMakeLists.txt
index 0bed0d3d..ed2b06f4 100644
--- a/dorado/basecall/CMakeLists.txt
+++ b/dorado/basecall/CMakeLists.txt
@@ -1,41 +1,44 @@
 add_library(dorado_basecall STATIC
-    crf_utils.h
     crf_utils.cpp
-    CRFModel.h
+    crf_utils.h
     CRFModel.cpp
-    CRFModelConfig.h
+    CRFModel.h
     CRFModelConfig.cpp
+    CRFModelConfig.h
+    ModelRunner.cpp
     ModelRunner.h
+    ModelRunnerBase.h
     decode/beam_search.cpp
     decode/beam_search.h
     decode/CPUDecoder.cpp
     decode/CPUDecoder.h
+    decode/Decoder.cpp
+    decode/Decoder.h
 )
 
 if (DORADO_GPU_BUILD)
     if(APPLE)
         target_sources(dorado_basecall
             PRIVATE
-                MetalCRFModel.h
                 MetalCRFModel.cpp
+                MetalCRFModel.h
         )
     else()
         target_sources(dorado_basecall
             PRIVATE
-                CudaCRFModel.h
                 CudaCRFModel.cpp
-                decode/GPUDecoder.cpp
-                decode/GPUDecoder.h
+                CudaCRFModel.h
+                decode/CUDADecoder.cpp
+                decode/CUDADecoder.h
         )
     endif()
 endif()
 
 target_include_directories(dorado_basecall
     SYSTEM
-    PUBLIC
-        ${DORADO_3RD_PARTY_SOURCE}/toml11
     PRIVATE
         ${DORADO_3RD_PARTY_SOURCE}/NVTX/c/include
+        ${DORADO_3RD_PARTY_SOURCE}/toml11
 )
diff --git a/dorado/basecall/CRFModel.cpp b/dorado/basecall/CRFModel.cpp
index 0d9e3a7a..e5dac7af 100644
--- a/dorado/basecall/CRFModel.cpp
+++ b/dorado/basecall/CRFModel.cpp
@@ -871,7 +871,7 @@ struct CRFModelImpl : Module {
         }
 
         // Clamping the scores to [-5, 5], if active (i.e. the role of `clamp1`), is performed by
-        // `GPUDecoder` on reading the scores. This eliminates the cost of a large matrix
+        // `CUDADecoder` on reading the scores. This eliminates the cost of a large matrix
         // read-modify-write operation.
         // Output is [N, T, C], F16, contiguous
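Note on the clamp comment above: fusing the clamp into the decoder's score load turns a full read-modify-write pass over the [N, T, C] tensor into a clamp applied per element as beam search consumes the scores. A minimal C++ sketch of the idea only (the helper name is mine; the real work happens inside the koi CUDA kernels, with a clamp value of 0 meaning clamping is disabled):

    #include <algorithm>
    #include <cstddef>

    // Clamp on read: the score tensor is only ever read, never written back.
    inline float load_score(const float *scores, std::size_t idx, float score_clamp_val) {
        const float s = scores[idx];
        return score_clamp_val > 0.f ? std::clamp(s, -score_clamp_val, score_clamp_val) : s;
    }
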
diff --git a/dorado/basecall/CudaCRFModel.cpp b/dorado/basecall/CudaCRFModel.cpp
index 09bbc3a7..2205dddf 100644
--- a/dorado/basecall/CudaCRFModel.cpp
+++ b/dorado/basecall/CudaCRFModel.cpp
@@ -1,7 +1,7 @@
 #include "CudaCRFModel.h"
 
 #include "CRFModelConfig.h"
-#include "decode/GPUDecoder.h"
+#include "decode/Decoder.h"
 #include "utils/cuda_utils.h"
 #include "utils/math_utils.h"
 
@@ -31,19 +31,18 @@ class CudaCaller {
                bool exclusive_gpu_access)
             : m_config(model_config),
               m_device(device),
+              m_decoder(decode::create_decoder(device, m_config)),
+              m_options(at::TensorOptions().dtype(m_decoder->dtype()).device(device)),
               m_exclusive_gpu_access(exclusive_gpu_access) {
-        m_decoder_options = decode::DecoderOptions();
+        assert(m_options.device().is_cuda());
+
         m_decoder_options.q_shift = model_config.qbias;
         m_decoder_options.q_scale = model_config.qscale;
-        m_decoder = std::make_unique<decode::GPUDecoder>(model_config.clamp ? 5.f : 0.f);
         m_num_input_features = model_config.num_features;
         // adjust chunk size to be a multiple of the stride
         m_out_chunk_size = chunk_size / model_config.stride;
         m_in_chunk_size = m_out_chunk_size * model_config.stride;
 
-        m_options = at::TensorOptions().dtype(decode::GPUDecoder::dtype).device(device);
-        assert(m_options.device().is_cuda());
-
         at::InferenceMode guard;
         m_module = load_crf_model(model_config, m_options);
 
@@ -155,8 +154,7 @@ class CudaCaller {
         // Determine size of working memory for decoder divided by (batch_size * chunk_size)
         // Decoder needs roughly (beam_width * 4) + num_states + 10 extra bytes
        // where num_states = 4^(state_len+1)
-        // See `dorado::basecall::decode::GPUDecoder::gpu_part()`, block beginning with `if (!initialized) {`
-        // for more details.
+        // See `dorado::basecall::decode::CUDADecoder::beam_search_part_1()` for more details.
         int64_t decode_bytes_per_chunk_timestep =
                 10 + m_decoder_options.beam_width * 4 + (1ull << (model_config.state_len * 2 + 2));
 
@@ -222,9 +220,10 @@ class CudaCaller {
     }
 
     struct NNTask {
-        NNTask(at::Tensor input_) : input(input_) {}
+        NNTask(at::Tensor input_, int num_chunks_) : input(input_), num_chunks(num_chunks_) {}
         at::Tensor input;
-        at::Tensor out;
+        int num_chunks;
+        decode::DecodeData out;
         std::mutex mut;
         std::condition_variable cv;
         bool done{false};
@@ -241,7 +240,7 @@ class CudaCaller {
             return std::vector<decode::DecodedChunk>();
         }
 
-        auto task = std::make_shared<NNTask>(input.to(m_options.device()));
+        auto task = std::make_shared<NNTask>(input.to(m_options.device()), num_chunks);
         {
             std::lock_guard lock(m_input_lock);
             m_input_queue.push_front(task);
@@ -253,8 +252,8 @@ class CudaCaller {
             task->cv.wait(lock);
         }
 
-        output.copy_(task->out);
-        return m_decoder->cpu_part(output);
+        output.copy_(task->out.data);
+        return m_decoder->beam_search_part_2({output, num_chunks, m_decoder_options});
     }
 
     void cuda_thread_fn() {
@@ -322,7 +321,8 @@ class CudaCaller {
             stats::Timer timer;
             auto scores = m_module->forward(task->input);
             const auto forward_ms = timer.GetElapsedMS();
-            task->out = m_decoder->gpu_part(scores, m_decoder_options);
+            task->out = m_decoder->beam_search_part_1(
+                    {scores, task->num_chunks, m_decoder_options});
             stream.synchronize();
             const auto forward_plus_decode_ms = timer.GetElapsedMS();
             m_model_ms += forward_ms;
@@ -372,9 +372,9 @@ class CudaCaller {
 
     const CRFModelConfig m_config;
     std::string m_device;
-    at::TensorOptions m_options;
-    std::unique_ptr<decode::GPUDecoder> m_decoder;
+    std::unique_ptr<decode::Decoder> m_decoder;
     decode::DecoderOptions m_decoder_options;
+    at::TensorOptions m_options;
     torch::nn::ModuleHolder<torch::nn::AnyModule> m_module{nullptr};
     std::atomic<bool> m_terminate{false};
     std::deque<std::shared_ptr<NNTask>> m_input_queue;
diff --git a/dorado/basecall/CudaCRFModel.h b/dorado/basecall/CudaCRFModel.h
index ce00ebfc..df0dc111 100644
--- a/dorado/basecall/CudaCRFModel.h
+++ b/dorado/basecall/CudaCRFModel.h
@@ -1,11 +1,12 @@
 #pragma once
 
 #include "CRFModel.h"
-#include "ModelRunner.h"
+#include "ModelRunnerBase.h"
 
 #include <c10/cuda/CUDAStream.h>
 #include <torch/nn.h>
+#include <memory>
 #include <atomic>
 #include <string>
 #include <vector>
 
diff --git a/dorado/basecall/MetalCRFModel.h b/dorado/basecall/MetalCRFModel.h
index 927faef5..d6029b77 100644
--- a/dorado/basecall/MetalCRFModel.h
+++ b/dorado/basecall/MetalCRFModel.h
@@ -1,6 +1,6 @@
 #pragma once
 
-#include "ModelRunner.h"
+#include "ModelRunnerBase.h"
 
 #include <torch/nn.h>
diff --git a/dorado/basecall/ModelRunner.cpp b/dorado/basecall/ModelRunner.cpp
new file mode 100644
index 00000000..22c6a331
--- /dev/null
+++ b/dorado/basecall/ModelRunner.cpp
@@ -0,0 +1,52 @@
+#include "ModelRunner.h"
+
+#include "CRFModel.h"
+#include "decode/Decoder.h"
+
+namespace dorado::basecall {
+
+ModelRunner::ModelRunner(const CRFModelConfig &model_config,
+                         const std::string &device,
+                         int chunk_size,
+                         int batch_size)
+        : m_config(model_config),
+          m_decoder(decode::create_decoder(device, model_config)),
+          m_options(at::TensorOptions().dtype(m_decoder->dtype()).device(device)),
+          m_module(load_crf_model(model_config, m_options)) {
+    m_decoder_options.q_shift = model_config.qbias;
+    m_decoder_options.q_scale = model_config.qscale;
+
+    // adjust chunk size to be a multiple of the stride
+    chunk_size -= chunk_size % model_config.stride;
+
+    m_input = at::zeros({batch_size, model_config.num_features, chunk_size},
+                        at::TensorOptions().dtype(m_decoder->dtype()).device(at::kCPU));
+}
+
+std::vector<decode::DecodedChunk> ModelRunner::call_chunks(int num_chunks) {
+    at::InferenceMode guard;
+    dorado::stats::Timer timer;
+    auto scores = m_module->forward(m_input.to(m_options.device()));
+    const auto forward_ms = timer.GetElapsedMS();
+    auto decoded_chunks = m_decoder->beam_search_part_2(
+            m_decoder->beam_search_part_1({scores, num_chunks, m_decoder_options}));
+    const auto forward_plus_decode_ms = timer.GetElapsedMS();
+    ++m_num_batches_called;
+    m_model_ms += forward_ms;
+    m_decode_ms += forward_plus_decode_ms - forward_ms;
+    return decoded_chunks;
+}
+
+void ModelRunner::accept_chunk(int chunk_idx, const at::Tensor &chunk) {
+    m_input.index_put_({chunk_idx, at::indexing::Ellipsis}, chunk);
+}
+
+stats::NamedStats ModelRunner::sample_stats() const {
+    stats::NamedStats stats;
+    stats["batches_called"] = double(m_num_batches_called);
+    stats["model_ms"] = double(m_model_ms);
+    stats["decode_ms"] = double(m_decode_ms);
+    return stats;
+}
+
+}  // namespace dorado::basecall
diff --git a/dorado/basecall/ModelRunner.h b/dorado/basecall/ModelRunner.h
index a9dee625..a3e16a0a 100644
--- a/dorado/basecall/ModelRunner.h
+++ b/dorado/basecall/ModelRunner.h
@@ -1,7 +1,7 @@
 #pragma once
 
-#include "CRFModel.h"
 #include "CRFModelConfig.h"
+#include "ModelRunnerBase.h"
 #include "decode/Decoder.h"
 #include "utils/stats.h"
 
@@ -12,24 +12,6 @@
 
 namespace dorado::basecall {
 
-class ModelRunnerBase {
-public:
-    virtual ~ModelRunnerBase() = default;
-    virtual void accept_chunk(int chunk_idx, const at::Tensor &chunk) = 0;
-    virtual std::vector<decode::DecodedChunk> call_chunks(int num_chunks) = 0;
-    virtual const CRFModelConfig &config() const = 0;
-    virtual size_t model_stride() const = 0;
-    virtual size_t chunk_size() const = 0;
-    virtual size_t batch_size() const = 0;
-    virtual void terminate() = 0;
-    virtual void restart() = 0;
-    virtual std::string get_name() const = 0;
-    virtual stats::NamedStats sample_stats() const = 0;
-};
-
-using RunnerPtr = std::unique_ptr<ModelRunnerBase>;
-
-template <typename T>
 class ModelRunner final : public ModelRunnerBase {
 public:
     ModelRunner(const CRFModelConfig &model_config,
@@ -49,11 +31,11 @@ class ModelRunner final : public ModelRunnerBase {
 
 private:
     const CRFModelConfig m_config;
-    at::Tensor m_input;
+    std::unique_ptr<decode::Decoder> m_decoder;
     at::TensorOptions m_options;
-    std::unique_ptr<T> m_decoder;
     decode::DecoderOptions m_decoder_options;
     torch::nn::ModuleHolder<torch::nn::AnyModule> m_module{nullptr};
+    at::Tensor m_input;
 
     // Performance monitoring stats.
    std::atomic<int64_t> m_num_batches_called = 0;
@@ -61,53 +43,4 @@ class ModelRunner final : public ModelRunnerBase {
     std::atomic<int64_t> m_decode_ms = 0;
 };
 
-template <typename T>
-ModelRunner<T>::ModelRunner(const CRFModelConfig &model_config,
-                            const std::string &device,
-                            int chunk_size,
-                            int batch_size)
-        : m_config(model_config) {
-    m_decoder_options = decode::DecoderOptions();
-    m_decoder_options.q_shift = model_config.qbias;
-    m_decoder_options.q_scale = model_config.qscale;
-    m_decoder = std::make_unique<T>();
-
-    m_options = at::TensorOptions().dtype(T::dtype).device(device);
-    m_module = load_crf_model(model_config, m_options);
-
-    // adjust chunk size to be a multiple of the stride
-    chunk_size -= chunk_size % model_config.stride;
-
-    m_input = at::zeros({batch_size, model_config.num_features, chunk_size},
-                        at::TensorOptions().dtype(T::dtype).device(at::kCPU));
-}
-
-template <typename T>
-std::vector<decode::DecodedChunk> ModelRunner<T>::call_chunks(int num_chunks) {
-    at::InferenceMode guard;
-    dorado::stats::Timer timer;
-    auto scores = m_module->forward(m_input.to(m_options.device_opt().value()));
-    const auto forward_ms = timer.GetElapsedMS();
-    auto decoded_chunks = m_decoder->beam_search(scores, num_chunks, m_decoder_options);
-    const auto forward_plus_decode_ms = timer.GetElapsedMS();
-    ++m_num_batches_called;
-    m_model_ms += forward_ms;
-    m_decode_ms += forward_plus_decode_ms - forward_ms;
-    return decoded_chunks;
-}
-
-template <typename T>
-void ModelRunner<T>::accept_chunk(int chunk_idx, const at::Tensor &chunk) {
-    m_input.index_put_({chunk_idx, at::indexing::Ellipsis}, chunk);
-}
-
-template <typename T>
-stats::NamedStats ModelRunner<T>::sample_stats() const {
-    stats::NamedStats stats;
-    stats["batches_called"] = double(m_num_batches_called);
-    stats["model_ms"] = double(m_model_ms);
-    stats["decode_ms"] = double(m_decode_ms);
-    return stats;
-}
-
 }  // namespace dorado::basecall
diff --git a/dorado/basecall/ModelRunnerBase.h b/dorado/basecall/ModelRunnerBase.h
new file mode 100644
index 00000000..2aafc461
--- /dev/null
+++ b/dorado/basecall/ModelRunnerBase.h
@@ -0,0 +1,34 @@
+#pragma once
+
+#include "decode/Decoder.h"
+#include "utils/stats.h"
+
+#include <memory>
+#include <string>
+
+namespace at {
+class Tensor;
+}
+
+namespace dorado::basecall {
+
+struct CRFModelConfig;
+
+class ModelRunnerBase {
+public:
+    virtual ~ModelRunnerBase() = default;
+    virtual void accept_chunk(int chunk_idx, const at::Tensor &chunk) = 0;
+    virtual std::vector<decode::DecodedChunk> call_chunks(int num_chunks) = 0;
+    virtual const CRFModelConfig &config() const = 0;
+    virtual size_t model_stride() const = 0;
+    virtual size_t chunk_size() const = 0;
+    virtual size_t batch_size() const = 0;
+    virtual void terminate() = 0;
+    virtual void restart() = 0;
+    virtual std::string get_name() const = 0;
+    virtual stats::NamedStats sample_stats() const = 0;
+};
+
+using RunnerPtr = std::unique_ptr<ModelRunnerBase>;
+
+}  // namespace dorado::basecall
diff --git a/dorado/basecall/decode/CPUDecoder.cpp b/dorado/basecall/decode/CPUDecoder.cpp
index f405c003..e138b7ab 100644
--- a/dorado/basecall/decode/CPUDecoder.cpp
+++ b/dorado/basecall/decode/CPUDecoder.cpp
@@ -88,10 +88,12 @@ at::Tensor backward_scores(const at::Tensor& scores, const float fixed_stay_score
 
 namespace dorado::basecall::decode {
 
-std::vector<DecodedChunk> CPUDecoder::beam_search(const at::Tensor& scores,
-                                                  const int num_chunks,
-                                                  const DecoderOptions& options) {
-    const auto scores_cpu = scores.to(at::kCPU);
+DecodeData CPUDecoder::beam_search_part_1(DecodeData data) const { return data; }
+
+std::vector<DecodedChunk> CPUDecoder::beam_search_part_2(DecodeData data) const {
+    const auto scores_cpu = data.data.to(at::kCPU);
+    const auto num_chunks = data.num_chunks;
+    const auto& options = data.options;
     int num_threads = std::min(num_chunks, 4);
     int chunks_per_thread = num_chunks / num_threads;
     int num_threads_with_one_more_chunk = num_chunks % num_threads;
diff --git a/dorado/basecall/decode/CPUDecoder.h b/dorado/basecall/decode/CPUDecoder.h
index b2011d5c..d1c1924e 100644
--- a/dorado/basecall/decode/CPUDecoder.h
+++ b/dorado/basecall/decode/CPUDecoder.h
@@ -6,12 +6,12 @@
 
 namespace dorado::basecall::decode {
 
-class CPUDecoder final : Decoder {
+class CPUDecoder final : public Decoder {
 public:
-    std::vector<DecodedChunk> beam_search(const at::Tensor& scores,
-                                          int num_chunks,
-                                          const DecoderOptions& options) final;
-    constexpr static at::ScalarType dtype = at::ScalarType::Float;
+    DecodeData beam_search_part_1(DecodeData data) const;
+    std::vector<DecodedChunk> beam_search_part_2(DecodeData data) const;
+
+    at::ScalarType dtype() const { return at::ScalarType::Float; };
 };
 
 }  // namespace dorado::basecall::decode
diff --git a/dorado/basecall/decode/GPUDecoder.cpp b/dorado/basecall/decode/CUDADecoder.cpp
similarity index 89%
rename from dorado/basecall/decode/GPUDecoder.cpp
rename to dorado/basecall/decode/CUDADecoder.cpp
index 7561371b..8db11227 100644
--- a/dorado/basecall/decode/GPUDecoder.cpp
+++ b/dorado/basecall/decode/CUDADecoder.cpp
@@ -1,6 +1,5 @@
-#include "GPUDecoder.h"
+#include "CUDADecoder.h"
 
-#include "Decoder.h"
 #include "utils/cuda_utils.h"
 #include "utils/gpu_profiling.h"
 
@@ -13,7 +12,10 @@ extern "C" {
 
 namespace dorado::basecall::decode {
 
-at::Tensor GPUDecoder::gpu_part(at::Tensor scores, DecoderOptions options) {
+DecodeData CUDADecoder::beam_search_part_1(DecodeData data) const {
+    auto scores = data.data;
+    auto &options = data.options;
+
     c10::cuda::CUDAGuard device_guard(scores.device());
     utils::ScopedProfileRange loop{"gpu_decode", 1};
     long int N = (long int)(scores.sizes()[0]);
@@ -78,10 +80,13 @@
                 sequence.data_ptr(), qstring.data_ptr(), options.q_scale, options.q_shift,
                 int(options.beam_width), options.beam_cut, options.blank_score, options.move_pad));
     }
-    return moves_sequence_qstring.reshape({3, N, -1});
+
+    data.data = moves_sequence_qstring.reshape({3, N, -1});
+    return data;
 }
 
-std::vector<DecodedChunk> GPUDecoder::cpu_part(at::Tensor moves_sequence_qstring_cpu) {
+std::vector<DecodedChunk> CUDADecoder::beam_search_part_2(DecodeData data) const {
+    auto moves_sequence_qstring_cpu = data.data;
     nvtx3::scoped_range loop{"cpu_decode"};
     assert(moves_sequence_qstring_cpu.device() == at::kCPU);
     auto moves_cpu = moves_sequence_qstring_cpu[0];
@@ -107,10 +112,4 @@
     return called_chunks;
 }
 
-std::vector<DecodedChunk> GPUDecoder::beam_search(const at::Tensor &scores,
-                                                  int,
-                                                  const DecoderOptions &options) {
-    return cpu_part(gpu_part(scores, options));
-}
-
 }  // namespace dorado::basecall::decode
diff --git a/dorado/basecall/decode/CUDADecoder.h b/dorado/basecall/decode/CUDADecoder.h
new file mode 100644
index 00000000..6d5a7a8d
--- /dev/null
+++ b/dorado/basecall/decode/CUDADecoder.h
@@ -0,0 +1,25 @@
+#pragma once
+
+#include "Decoder.h"
+
+#include <ATen/core/TensorBody.h>
+
+namespace dorado::basecall::decode {
+
+class CUDADecoder final : public Decoder {
+public:
+    explicit CUDADecoder(float score_clamp_val) : m_score_clamp_val(score_clamp_val) {}
+
+    // We split beam_search into two parts, the first one running on the GPU and the second
+    // one on the CPU. While the second part is running we can submit more commands to the GPU
+    // on another thread.
+    DecodeData beam_search_part_1(DecodeData data) const;
+    std::vector<DecodedChunk> beam_search_part_2(DecodeData data) const;
+
+    at::ScalarType dtype() const { return at::ScalarType::Half; };
+
+private:
+    float m_score_clamp_val;
+};
+
+}  // namespace dorado::basecall::decode
diff --git a/dorado/basecall/decode/Decoder.cpp b/dorado/basecall/decode/Decoder.cpp
new file mode 100644
index 00000000..b30bfb50
--- /dev/null
+++ b/dorado/basecall/decode/Decoder.cpp
@@ -0,0 +1,29 @@
+#include "Decoder.h"
+
+#if DORADO_GPU_BUILD && !defined(__APPLE__)
+#include "CUDADecoder.h"
+#endif
+
+#include "CPUDecoder.h"
+#include "basecall/CRFModelConfig.h"
+
+#include <stdexcept>
+
+namespace dorado::basecall::decode {
+
+std::unique_ptr<Decoder> create_decoder(c10::Device device, const CRFModelConfig& config) {
+#if DORADO_GPU_BUILD && !defined(__APPLE__)
+    if (device.is_cuda()) {
+        return std::make_unique<CUDADecoder>(config.clamp ? 5.f : 0.f);
+    }
+#else
+    (void)config;  // unused in other build types
+#endif
+    if (device.is_cpu()) {
+        return std::make_unique<CPUDecoder>();
+    }
+
+    throw std::runtime_error("Unsupported device type for decoder creation: " + device.str());
+}
+
+}  // namespace dorado::basecall::decode
diff --git a/dorado/basecall/decode/Decoder.h b/dorado/basecall/decode/Decoder.h
index 09ef76bf..f34b6bbc 100644
--- a/dorado/basecall/decode/Decoder.h
+++ b/dorado/basecall/decode/Decoder.h
@@ -2,9 +2,15 @@
 
 #include <ATen/core/TensorBody.h>
+#include <c10/core/Device.h>
+#include <memory>
 #include <string>
 #include <vector>
 
+namespace dorado::basecall {
+struct CRFModelConfig;
+}
+
 namespace dorado::basecall::decode {
 
 struct DecodedChunk {
@@ -23,11 +29,21 @@ struct DecoderOptions {
     bool move_pad = false;
 };
 
+struct DecodeData {
+    at::Tensor data;
+    int num_chunks;
+    DecoderOptions options;
+};
+
 class Decoder {
 public:
-    virtual std::vector<DecodedChunk> beam_search(const at::Tensor& scores,
-                                                  int num_chunks,
-                                                  const DecoderOptions& options) = 0;
+    virtual ~Decoder() = default;
+    virtual DecodeData beam_search_part_1(DecodeData data) const = 0;
+    virtual std::vector<DecodedChunk> beam_search_part_2(DecodeData data) const = 0;
+    // Returns the torch::TensorOptions::dtype to use for input data to models that use this decoder
+    virtual at::ScalarType dtype() const = 0;
 };
 
+std::unique_ptr<Decoder> create_decoder(c10::Device device, const CRFModelConfig& config);
+
 }  // namespace dorado::basecall::decode
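With the factory in place, both CudaCaller and ModelRunner select a decoder and its tensor dtype the same way; neither names a concrete decoder type any more. A sketch of the shared pattern (the function name is mine, assuming a valid config):

    #include "basecall/decode/Decoder.h"

    #include <string>
    #include <utility>

    auto make_decoder_and_options(const dorado::basecall::CRFModelConfig &config,
                                  const std::string &device) {
        using namespace dorado::basecall;
        // CUDA devices get CUDADecoder (F16 scores, clamp folded into the read);
        // "cpu" gets CPUDecoder (F32 scores); anything else throws std::runtime_error.
        auto decoder = decode::create_decoder(c10::Device(device), config);
        auto options = at::TensorOptions().dtype(decoder->dtype()).device(device);
        return std::make_pair(std::move(decoder), options);
    }
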
diff --git a/dorado/basecall/decode/GPUDecoder.h b/dorado/basecall/decode/GPUDecoder.h
deleted file mode 100644
index 46ff4d28..00000000
--- a/dorado/basecall/decode/GPUDecoder.h
+++ /dev/null
@@ -1,27 +0,0 @@
-#pragma once
-
-#include "Decoder.h"
-
-#include <ATen/core/TensorBody.h>
-
-namespace dorado::basecall::decode {
-
-class GPUDecoder final : Decoder {
-public:
-    explicit GPUDecoder(float score_clamp_val) : m_score_clamp_val(score_clamp_val) {}
-
-    std::vector<DecodedChunk> beam_search(const at::Tensor& scores,
-                                          int num_chunks,
-                                          const DecoderOptions& options) final;
-    constexpr static at::ScalarType dtype = at::ScalarType::Half;
-
-    // We split beam_search into two parts, the first one running on the GPU and the second
-    // one on the CPU. While the second part is running we can submit more commands to the GPU
-    // on another thread.
-    at::Tensor gpu_part(at::Tensor scores, DecoderOptions options);
-    std::vector<DecodedChunk> cpu_part(at::Tensor moves_sequence_qstring_cpu);
-
-    float m_score_clamp_val;
-};
-
-}  // namespace dorado::basecall::decode
diff --git a/dorado/read_pipeline/BasecallerNode.cpp b/dorado/read_pipeline/BasecallerNode.cpp
index c39882f6..cc81b494 100644
--- a/dorado/read_pipeline/BasecallerNode.cpp
+++ b/dorado/read_pipeline/BasecallerNode.cpp
@@ -1,10 +1,11 @@
 #include "BasecallerNode.h"
 
-#include "basecall/ModelRunner.h"
-#include "basecall/decode/CPUDecoder.h"
+#include "basecall/CRFModelConfig.h"
+#include "basecall/ModelRunnerBase.h"
 #include "stitch.h"
 #include "utils/stats.h"
 
+#include <ATen/ATen.h>
 #include <chrono>
 #include <cstdlib>
 
@@ -15,7 +16,7 @@
 #endif
 
 using namespace std::chrono_literals;
-using namespace torch::indexing;
+using namespace at::indexing;
 
 namespace dorado {
@@ -242,15 +243,13 @@ void BasecallerNode::basecall_worker_thread(int worker_id) {
         if (slice_size != m_chunk_size) {
             if (input_slice.ndimension() == 1) {
                 auto [n, overhang] = std::div((int)m_chunk_size, (int)slice_size);
-                input_slice = torch::concat(
-                        {input_slice.repeat({n}),
-                         input_slice.index({Ellipsis, torch::indexing::Slice(0, overhang)})});
+                input_slice = at::concat({input_slice.repeat({n}),
+                                          input_slice.index({Ellipsis, Slice(0, overhang)})});
             } else if (input_slice.ndimension() == 2) {
                 auto [n, overhang] = std::div((int)m_chunk_size, (int)slice_size);
-                input_slice = torch::concat(
-                        {input_slice.repeat({1, n}),
-                         input_slice.index({Ellipsis, torch::indexing::Slice(0, overhang)})},
-                        1);
+                input_slice = at::concat({input_slice.repeat({1, n}),
+                                          input_slice.index({Ellipsis, Slice(0, overhang)})},
+                                         1);
             }
         }
 
diff --git a/tests/NodeSmokeTest.cpp b/tests/NodeSmokeTest.cpp
index ef86c218..9f8087d8 100644
--- a/tests/NodeSmokeTest.cpp
+++ b/tests/NodeSmokeTest.cpp
@@ -3,7 +3,6 @@
 #include "basecall/CRFModel.h"
 #include "basecall/CRFModelConfig.h"
 #include "basecall/ModelRunner.h"
-#include "basecall/decode/CPUDecoder.h"
 #include "modbase/ModBaseModel.h"
 #include "modbase/ModBaseRunner.h"
 #include "models/models.h"
@@ -243,8 +242,7 @@ DEFINE_TEST(NodeSmokeTestRead, "BasecallerNode") {
     set_num_reads(5);
     set_expected_messages(5);
     batch_size = 8;
-    runners.push_back(std::make_unique<
-            dorado::basecall::ModelRunner<dorado::basecall::decode::CPUDecoder>>(
+    runners.push_back(std::make_unique<dorado::basecall::ModelRunner>(
             model_config, "cpu", default_params.chunksize, int(batch_size)));
 }