From 6170326f7d3d5133136cc41780c50138b235c298 Mon Sep 17 00:00:00 2001
From: Steve Malton <stephen.malton@nanoporetech.com>
Date: Wed, 31 Jan 2024 15:32:56 +0000
Subject: [PATCH] Fix stream creation for modbase

The input tensors live in pinned CPU memory, but instead of failing, looking up a stream from them yields the default CUDA stream. We want a real stream on the real device, so we now fetch the device from the caller.
---
 dorado/modbase/ModBaseCaller.h   |  1 +
 dorado/modbase/ModBaseRunner.cpp | 12 ++++++------
 2 files changed, 7 insertions(+), 6 deletions(-)
diff --git a/dorado/modbase/ModBaseCaller.h b/dorado/modbase/ModBaseCaller.h
index 8a476215..c025b1c6 100644
--- a/dorado/modbase/ModBaseCaller.h
+++ b/dorado/modbase/ModBaseCaller.h
@@ -52,6 +52,7 @@ class ModBaseCaller {
 
     std::vector<at::Tensor> create_input_sig_tensors() const;
     std::vector<at::Tensor> create_input_seq_tensors() const;
+    c10::Device device() const { return m_options.device(); }
 
     at::Tensor call_chunks(size_t model_id,
                            at::Tensor& input_sigs,
diff --git a/dorado/modbase/ModBaseRunner.cpp b/dorado/modbase/ModBaseRunner.cpp
index ee6c6ee9..da4ea134 100644
--- a/dorado/modbase/ModBaseRunner.cpp
+++ b/dorado/modbase/ModBaseRunner.cpp
@@ -14,12 +14,12 @@
 
 namespace {
 #if DORADO_CUDA_BUILD
-std::vector<c10::optional<c10::Stream>> get_streams_from_tensors(
-        const std::vector<at::Tensor>& tensors) {
+std::vector<c10::optional<c10::Stream>> get_streams_from_caller(
+        const std::shared_ptr<dorado::modbase::ModBaseCaller>& caller) {
     std::vector<c10::optional<c10::Stream>> streams;
-    for (const auto& tensor : tensors) {
-        if (tensor.device().is_cuda()) {
-            streams.push_back(c10::cuda::getStreamFromPool(false, tensor.device().index()));
+    for (size_t i = 0; i < caller->num_model_callers(); ++i) {
+        if (caller->device().is_cuda()) {
+            streams.push_back(c10::cuda::getStreamFromPool(false, caller->device().index()));
         } else {
             streams.emplace_back();
         }
@@ -37,7 +37,7 @@ ModBaseRunner::ModBaseRunner(std::shared_ptr<ModBaseCaller> caller)
           m_input_seqs(m_caller->create_input_seq_tensors())
 #if DORADO_CUDA_BUILD
           ,
-          m_streams(get_streams_from_tensors(m_input_sigs))
+          m_streams(get_streams_from_caller(m_caller))
 #endif
 {
 }