
Merge branch 'smalton/INSTX-3448-OoM-max-split' into 'master'
INSTX-3448: OoM issues

See merge request machine-learning/dorado!809
malton-ont committed Jan 16, 2024
2 parents a001855 + 180a4de commit 6f283a5
Showing 2 changed files with 71 additions and 54 deletions.
118 changes: 68 additions & 50 deletions dorado/basecall/CudaCaller.cpp
@@ -56,29 +56,30 @@ CudaCaller::CudaCaller(const CRFModelConfig &model_config,
// Batch size will be rounded up to a multiple of batch_size_granularity, regardless of
// user choice. This makes sure batch size is compatible with GPU kernels.
if (batch_size == 0) {
- m_batch_size = determine_batch_size(model_config, chunk_size, memory_limit_fraction, true);
+ m_batch_size = determine_batch_size(memory_limit_fraction, true);
} else {
int batch_size_granularity = get_batch_size_granularity();
m_batch_size = utils::pad_to(batch_size, batch_size_granularity);
// Make sure the requested batch size doesn't exceed the maximum for the memory available.
- auto max_batch_size =
-         determine_batch_size(model_config, chunk_size, memory_limit_fraction, false);
+ auto max_batch_size = determine_batch_size(memory_limit_fraction, false);
if (m_batch_size > max_batch_size) {
spdlog::warn(
"Specified batch size {} exceeds maximum batch size based on available "
"memory. Using maximum safe batch size {}.",
m_batch_size, max_batch_size);
m_batch_size = max_batch_size;
}
- // Warmup
- auto input = torch::empty({m_batch_size, m_num_input_features, m_in_chunk_size}, m_options);
- m_module->forward(input);
- torch::cuda::synchronize(m_options.device().index());
}

c10::cuda::CUDAGuard device_guard(m_options.device());
c10::cuda::CUDACachingAllocator::emptyCache();

+ // Warmup
+ auto input = torch::empty({m_batch_size, m_num_input_features, m_in_chunk_size}, m_options);
+ auto scores = m_module->forward(input);
+ m_decoder->beam_search_part_1({scores, m_batch_size, m_decoder_options});
+ torch::cuda::synchronize(m_options.device().index());

start_threads();
}
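The rounding-and-capping behaviour in this constructor is easy to miss in the diff, so here is a minimal standalone sketch of it. The pad_to helper below is a plain reimplementation of the round-up behaviour the comment describes (not the dorado utility itself), the granularity of 64 matches what get_batch_size_granularity() appears to return in the header change further down, and the requested and maximum batch sizes are made-up values for illustration only.

```cpp
#include <cstdio>

// Round value up to the next multiple of `multiple` (mirrors the described utils::pad_to).
static int pad_to(int value, int multiple) {
    return ((value + multiple - 1) / multiple) * multiple;
}

int main() {
    const int granularity = 64;  // get_batch_size_granularity() returns 64 in CudaCaller.h
    const int requested = 1000;  // user-specified batch size (assumed)
    const int max_safe = 896;    // assumed result of determine_batch_size(memory_limit_fraction, false)

    int batch_size = pad_to(requested, granularity);  // 1000 -> 1024
    if (batch_size > max_safe) {
        std::printf("Specified batch size %d exceeds maximum safe batch size %d\n",
                    batch_size, max_safe);
        batch_size = max_safe;  // cap at the memory-derived limit
    }
    std::printf("final batch size: %d\n", batch_size);  // 896
    return 0;
}
```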

@@ -153,10 +154,43 @@ stats::NamedStats CudaCaller::sample_stats() const {
return stats;
}

- int CudaCaller::determine_batch_size(const CRFModelConfig &model_config,
- int chunk_size_in,
- float memory_limit_fraction,
- bool run_benchmark) {
std::pair<int64_t, int64_t> CudaCaller::calculate_memory_requirements() const {
// Determine size of working memory for CRFModel divided by (batch_size * chunk_size)
// These values have been determined by running dorado with different models and
// reporting the actual allocation size per chunk-timestep.
int64_t crfmodel_bytes_per_chunk_timestep;
if (m_config.out_features.has_value()) {
auto out_features = m_config.out_features.value();
std::unordered_map<int, int64_t> out_features_map{{128, 2312}, {256, 8712}};
crfmodel_bytes_per_chunk_timestep = out_features_map[out_features];
if (crfmodel_bytes_per_chunk_timestep == 0) {
spdlog::warn("Auto batchsize detection failed. Unexpected model out_features {}.",
out_features);
return {0, 0};
}
} else {
std::unordered_map<int, int64_t> insize_map{
{96, 960}, {128, 1280}, {384, 2816}, {768, 9728}, {1024, 10240}};
crfmodel_bytes_per_chunk_timestep = insize_map[m_config.lstm_size];
if (crfmodel_bytes_per_chunk_timestep == 0) {
spdlog::warn("Auto batchsize detection failed. Unexpected model insize {}.",
m_config.lstm_size);
return {0, 0};
}
}

// Determine size of working memory for decoder divided by (batch_size * chunk_size)
// Decoder needs roughly (beam_width * 4) + num_states + 10 extra bytes
// where num_states = 4^(state_len+1)
// See `dorado::basecall::decode::CUDADecoder::beam_search_part_1()` for more details.
int64_t decode_bytes_per_chunk_timestep =
10 + m_decoder_options.beam_width * 4 + (1ull << (m_config.state_len * 2 + 2));

return {crfmodel_bytes_per_chunk_timestep * m_out_chunk_size,
decode_bytes_per_chunk_timestep * m_out_chunk_size};
}
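To make the per-chunk estimate above concrete, here is a small worked example of the same arithmetic. The model and decoder parameters (beam_width 32, state_len 4, lstm_size 384, stride 6, 10000-sample input chunks) are assumptions chosen for illustration; only the insize_map entry of 2816 bytes for lstm_size 384 and the decoder formula come from the changed code.

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    // Assumed model/decoder parameters (not taken from the commit itself).
    const int64_t beam_width = 32;
    const int64_t state_len = 4;
    const int64_t stride = 6;
    const int64_t in_chunk_size = 10000;

    const int64_t out_chunk_size = in_chunk_size / stride;  // 1666 output timesteps per chunk
    const int64_t crf_bytes_per_timestep = 2816;            // insize_map entry for lstm_size 384

    // 10 + beam_width * 4 + num_states, where num_states = 4^(state_len + 1) = 1 << (2 * state_len + 2)
    const int64_t decode_bytes_per_timestep =
            10 + beam_width * 4 + (int64_t{1} << (state_len * 2 + 2));  // 10 + 128 + 1024 = 1162

    std::printf("crfmodel bytes per chunk: %lld\n",
                static_cast<long long>(crf_bytes_per_timestep * out_chunk_size));    // ~4.7 MB
    std::printf("decode bytes per chunk:   %lld\n",
                static_cast<long long>(decode_bytes_per_timestep * out_chunk_size)); // ~1.9 MB
    return 0;
}
```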

int CudaCaller::determine_batch_size(float memory_limit_fraction, bool run_benchmark) {
c10::cuda::CUDAGuard device_guard(m_options.device());
constexpr float GB = 1.0e9f;
int64_t available = utils::available_memory(m_options.device());
@@ -186,42 +220,13 @@ int CudaCaller::determine_batch_size(const CRFModelConfig &model_config,
}
spdlog::debug("Auto batchsize {}: memory limit {:.2f}GB", m_device, gpu_mem_limit / GB);

- // Determine size of working memory for CRFModel divided by (batch_size * chunk_size)
- // These values have been determined by running dorado with different models and
- // reporting the actual allocation size per chunk-timestep.
- int64_t crfmodel_bytes_per_chunk_timestep;
- if (model_config.out_features.has_value()) {
- auto out_features = model_config.out_features.value();
- std::unordered_map<int, int64_t> out_features_map{{128, 2312}, {256, 8712}};
- crfmodel_bytes_per_chunk_timestep = out_features_map[out_features];
- if (crfmodel_bytes_per_chunk_timestep == 0) {
- spdlog::warn("Auto batchsize detection failed. Unexpected model out_features {}.",
- out_features);
- return granularity;
- }
- } else {
- std::unordered_map<int, int64_t> insize_map{
- {96, 960}, {128, 1280}, {384, 2816}, {768, 9728}, {1024, 10240}};
- crfmodel_bytes_per_chunk_timestep = insize_map[model_config.lstm_size];
- if (crfmodel_bytes_per_chunk_timestep == 0) {
- spdlog::warn("Auto batchsize detection failed. Unexpected model insize {}.",
- model_config.lstm_size);
- return granularity;
- }
- }
+ auto [crfmodel_bytes_per_chunk, decode_bytes_per_chunk] = calculate_memory_requirements();
+ if (crfmodel_bytes_per_chunk == 0) {
+ return granularity;
+ }

- // Determine size of working memory for decoder divided by (batch_size * chunk_size)
- // Decoder needs roughly (beam_width * 4) + num_states + 10 extra bytes
- // where num_states = 4^(state_len+1)
- // See `dorado::basecall::decode::CUDADecoder::beam_search_part_1()` for more details.
- int64_t decode_bytes_per_chunk_timestep =
- 10 + m_decoder_options.beam_width * 4 + (1ull << (model_config.state_len * 2 + 2));
-
- auto bytes_per_chunk_timestep =
- decode_bytes_per_chunk_timestep + crfmodel_bytes_per_chunk_timestep;
- int64_t chunk_size_out = chunk_size_in / model_config.stride;
-
- int max_batch_size = int(gpu_mem_limit / (bytes_per_chunk_timestep * chunk_size_out));
+ auto bytes_per_chunk = crfmodel_bytes_per_chunk + decode_bytes_per_chunk;
+ int max_batch_size = int(gpu_mem_limit / bytes_per_chunk);
max_batch_size -= max_batch_size % granularity;
if (max_batch_size <= granularity) {
spdlog::warn("Maximum safe estimated batch size is only {}.", max_batch_size);
@@ -230,7 +235,7 @@ int CudaCaller::determine_batch_size(const CRFModelConfig &model_config,

int best_batch_size = granularity;
float best_time = std::numeric_limits<float>::max();
- const int chunk_size = std::min(chunk_size_in, model_config.stride * 300);
+ const int chunk_size = std::min(m_in_chunk_size, m_config.stride * 300);
if (run_benchmark) {
// We limit the maximum when doing benchmarking to avoid excessive startup time.
const int max_batch_size_limit = 10240;
@@ -239,8 +244,7 @@ int CudaCaller::determine_batch_size(const CRFModelConfig &model_config,
max_batch_size, granularity);
for (int batch_size = granularity; batch_size <= max_batch_size;
batch_size += granularity) {
- auto input =
-         torch::empty({batch_size, model_config.num_features, chunk_size}, m_options);
+ auto input = torch::empty({batch_size, m_config.num_features, chunk_size}, m_options);

float time = std::numeric_limits<float>::max();
for (int i = 0; i < 2; ++i) { // run twice to eliminate outliers
@@ -275,10 +279,24 @@ int CudaCaller::determine_batch_size(const CRFModelConfig &model_config,
best_batch_size = max_batch_size;
}

#if TORCH_VERSION_MAJOR >= 2
// Do not re-use smaller chunks of buffers larger than half the decode size
// This prevents small allocations from reusing large sections of cached allocated memory
// which can lead to OoM errors when the original large allocation is needed again
auto max_split_size_mb = decode_bytes_per_chunk * best_batch_size / (2 * 1024 * 1024);
auto device_stats = c10::cuda::CUDACachingAllocator::getDeviceStats(m_options.device().index());
if (max_split_size_mb < device_stats.max_split_size) {
std::string max_split_size_mb_settings =
"max_split_size_mb:" + std::to_string(max_split_size_mb);

c10::cuda::CUDACachingAllocator::setAllocatorSettings(max_split_size_mb_settings);
}
#endif

spdlog::debug("Device {} Model memory {:.2f}GB", m_device,
- (crfmodel_bytes_per_chunk_timestep * chunk_size_out * best_batch_size) / GB);
+ (crfmodel_bytes_per_chunk * best_batch_size) / GB);
spdlog::debug("Device {} Decode memory {:.2f}GB", m_device,
- (decode_bytes_per_chunk_timestep * chunk_size_out * best_batch_size) / GB);
+ (decode_bytes_per_chunk * best_batch_size) / GB);
return best_batch_size;
}
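The new max_split_size_mb logic is the part of this commit that most directly targets the OoM ticket: buffers larger than the threshold are no longer split to serve small allocations, so the large decode buffers stay reusable instead of fragmenting. Below is a sketch of the value the code would compute, again using the assumed figures from the earlier examples (decode bytes per chunk ~1.9 MB, a best_batch_size of 896); the real numbers depend on the model and GPU. As in the diff above, the setting is only applied on Torch >= 2 and only when it lowers the allocator's current max_split_size.

```cpp
#include <cstdint>
#include <cstdio>
#include <string>

int main() {
    // Assumed figures carried over from the earlier sketches.
    const int64_t decode_bytes_per_chunk = 1162 * 1666;  // ~1.9 MB per chunk
    const int best_batch_size = 896;                      // assumed benchmark winner

    // Half of the total decode allocation, expressed in MiB: cached blocks larger than
    // this threshold are not split to satisfy smaller allocations.
    const int64_t max_split_size_mb =
            decode_bytes_per_chunk * best_batch_size / (2 * 1024 * 1024);
    const std::string settings = "max_split_size_mb:" + std::to_string(max_split_size_mb);
    std::printf("%s\n", settings.c_str());  // max_split_size_mb:827 with these assumptions
    return 0;
}
```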

7 changes: 3 additions & 4 deletions dorado/basecall/CudaCaller.h
@@ -53,10 +53,9 @@ class CudaCaller {
return 64;
}

- int determine_batch_size(const CRFModelConfig &model_config,
- int chunk_size_in,
- float memory_limit_fraction,
- bool run_benchmark);
+ std::pair<int64_t, int64_t> calculate_memory_requirements() const;
+ int determine_batch_size(float memory_limit_fraction, bool run_benchmark);

void start_threads();
void cuda_thread_fn();

