Merge branch 'INSTX-3807-metal-batch-size-server' into 'master'

[INSTX-3807] Allow the memory fraction to be specified in the metal callers. See merge request machine-learning/dorado!843
nanoporetech · Feb 13, 2024 · 77c5599 · 77c5599
2 parents f844f35 + 7e7a6a7
commit 77c5599
Show file tree

Hide file tree

Showing 7 changed files with 34 additions and 17 deletions.
diff --git a/dorado/api/caller_creation.cpp b/dorado/api/caller_creation.cpp
@@ -22,9 +22,13 @@ std::shared_ptr<basecall::CudaCaller> create_cuda_caller(
                                                   memory_limit_fraction, exclusive_gpu_access);
 }
 #elif DORADO_METAL_BUILD
-std::shared_ptr<basecall::MetalCaller>
-create_metal_caller(const basecall::CRFModelConfig& model_config, int chunk_size, int batch_size) {
-    return std::make_shared<basecall::MetalCaller>(model_config, chunk_size, batch_size);
+std::shared_ptr<basecall::MetalCaller> create_metal_caller(
+        const basecall::CRFModelConfig& model_config,
+        int chunk_size,
+        int batch_size,
+        float memory_limit_fraction) {
+    return std::make_shared<basecall::MetalCaller>(model_config, chunk_size, batch_size,
+                                                   memory_limit_fraction);
 }
 #endif
 

diff --git a/dorado/api/caller_creation.h b/dorado/api/caller_creation.h
@@ -31,8 +31,11 @@ std::shared_ptr<basecall::CudaCaller> create_cuda_caller(
         float memory_limit_fraction,
         bool exclusive_gpu_access);
 #elif DORADO_METAL_BUILD
-std::shared_ptr<basecall::MetalCaller>
-create_metal_caller(const basecall::CRFModelConfig& model_config, int chunk_size, int batch_size);
+std::shared_ptr<basecall::MetalCaller> create_metal_caller(
+        const basecall::CRFModelConfig& model_config,
+        int chunk_size,
+        int batch_size,
+        float memory_limit_fraction);
 #endif
 
 std::shared_ptr<modbase::ModBaseCaller> create_modbase_caller(

diff --git a/dorado/api/runner_creation.cpp b/dorado/api/runner_creation.cpp
@@ -59,7 +59,8 @@ std::pair<std::vector<basecall::RunnerPtr>, size_t> create_basecall_runners(
     }
 #if DORADO_METAL_BUILD
     else if (device == "metal") {
-        auto caller = create_metal_caller(model_config, int(chunk_size), int(batch_size));
+        auto caller = create_metal_caller(model_config, int(chunk_size), int(batch_size),
+                                          memory_fraction);
         for (size_t i = 0; i < num_gpu_runners; i++) {
             runners.push_back(std::make_unique<basecall::MetalModelRunner>(caller));
         }

diff --git a/dorado/basecall/MetalCaller.cpp b/dorado/basecall/MetalCaller.cpp
@@ -40,7 +40,10 @@ struct MetalCaller::NNTask {
     uint64_t decode_complete_event_id = static_cast<uint64_t>(0);
 };
 
-MetalCaller::MetalCaller(const CRFModelConfig &model_config, int chunk_size, int batch_size)
+MetalCaller::MetalCaller(const CRFModelConfig &model_config,
+                         int chunk_size,
+                         int batch_size,
+                         float memory_limit_fraction)
         : m_config(model_config) {
     ScopedAutoReleasePool autorelease_pool;
 
@@ -76,7 +79,8 @@ MetalCaller::MetalCaller(const CRFModelConfig &model_config, int chunk_size, int
             model_config.model_path, model_config.out_features.has_value(), model_config.bias);
 
     auto selected_batch_size = (batch_size == 0)
-                                       ? benchmark_batch_sizes(model_config, state_dict, chunk_size)
+                                       ? benchmark_batch_sizes(model_config, state_dict, chunk_size,
+                                                               memory_limit_fraction)
                                        : utils::pad_to(batch_size, MTL_CORE_BATCH_SIZE);
     set_chunk_batch_size(model_config, state_dict, chunk_size, selected_batch_size);
 
@@ -231,9 +235,12 @@ void MetalCaller::set_chunk_batch_size(const CRFModelConfig &model_config,
 
 int MetalCaller::benchmark_batch_sizes(const CRFModelConfig &model_config,
                                        const std::vector<at::Tensor> &state_dict,
-                                       int chunk_size) {
+                                       int chunk_size,
+                                       float memory_limit_fraction) {
     const size_t physical_memory = get_apple_physical_memory_bytes();
-    spdlog::debug("Physical memory available {} GB", physical_memory / (size_t{1} << 30));
+    const size_t usable_memory = physical_memory * memory_limit_fraction;
+    spdlog::debug("Physical/Usable memory available: {}/{} GB", physical_memory / BYTES_PER_GB,
+                  usable_memory / BYTES_PER_GB);
 
     // Constrain the maximum batch size to use about half physical memory for decode buffers,
     // with neural network GPU buffers and CPU buffers assumed to occupy a subset of the
@@ -248,7 +255,7 @@ int MetalCaller::benchmark_batch_sizes(const CRFModelConfig &model_config,
              static_cast<size_t>(m_states) * sizeof(float));    // Back guides.
     spdlog::trace("decode_buffer_size_per_elem {}", decode_buffer_size_per_elem);
     const int max_batch_size = static_cast<int>(
-            std::clamp(utils::pad_to(physical_memory / (2 * decode_buffer_size_per_elem),
+            std::clamp(utils::pad_to(usable_memory / (2 * decode_buffer_size_per_elem),
                                      static_cast<size_t>(MTL_CORE_BATCH_SIZE)),
                        static_cast<size_t>(MTL_CORE_BATCH_SIZE),
                        static_cast<size_t>(MTL_CORE_BATCH_SIZE * get_mtl_device_core_count())));

diff --git a/dorado/basecall/MetalCaller.h b/dorado/basecall/MetalCaller.h
@@ -18,7 +18,10 @@ namespace dorado::basecall {
 
 class MetalCaller {
 public:
-    MetalCaller(const CRFModelConfig &model_config, int chunk_size, int batch_size);
+    MetalCaller(const CRFModelConfig &model_config,
+                int chunk_size,
+                int batch_size,
+                float memory_limit_fraction);
     ~MetalCaller();
 
     void call_chunks(at::Tensor &input,
@@ -40,7 +43,8 @@ class MetalCaller {
                               int batch_size);
     int benchmark_batch_sizes(const CRFModelConfig &model_config,
                               const std::vector<at::Tensor> &state_dict,
-                              int chunk_size);
+                              int chunk_size,
+                              float memory_limit_fraction);
     bool run_scan_kernels(MTL::CommandBuffer *const cb, int try_count);
 
     void start_threads();

diff --git a/dorado/utils/memory_utils.cpp b/dorado/utils/memory_utils.cpp
@@ -11,10 +11,6 @@
 
 #include <array>
 
-namespace {
-constexpr size_t BYTES_PER_GB{1024 * 1024 * 1024};
-}
-
 namespace dorado::utils {
 
 size_t available_host_memory_GB() {

diff --git a/dorado/utils/memory_utils.h b/dorado/utils/memory_utils.h
@@ -3,6 +3,8 @@
 
 namespace dorado::utils {
 
+inline constexpr size_t BYTES_PER_GB{1024 * 1024 * 1024};
+
 size_t available_host_memory_GB();
 size_t total_host_memory_GB();