diff --git a/dorado/basecall/CudaCaller.cpp b/dorado/basecall/CudaCaller.cpp
index ddd335b1..9d97d0b6 100644
--- a/dorado/basecall/CudaCaller.cpp
+++ b/dorado/basecall/CudaCaller.cpp
@@ -56,13 +56,12 @@ CudaCaller::CudaCaller(const CRFModelConfig &model_config,
     // Batch size will be rounded up to a multiple of batch_size_granularity, regardless of
     // user choice. This makes sure batch size is compatible with GPU kernels.
     if (batch_size == 0) {
-        m_batch_size = determine_batch_size(model_config, chunk_size, memory_limit_fraction, true);
+        m_batch_size = determine_batch_size(memory_limit_fraction, true);
     } else {
         int batch_size_granularity = get_batch_size_granularity();
         m_batch_size = utils::pad_to(batch_size, batch_size_granularity);
         // Make sure the requested batch size doesn't exceed the maximum for the memory available.
-        auto max_batch_size =
-                determine_batch_size(model_config, chunk_size, memory_limit_fraction, false);
+        auto max_batch_size = determine_batch_size(memory_limit_fraction, false);
         if (m_batch_size > max_batch_size) {
             spdlog::warn(
                     "Specified batch size {} exceeds maximum batch size based on available "
@@ -70,15 +69,17 @@ CudaCaller::CudaCaller(const CRFModelConfig &model_config,
                     m_batch_size, max_batch_size);
             m_batch_size = max_batch_size;
         }
-        // Warmup
-        auto input = torch::empty({m_batch_size, m_num_input_features, m_in_chunk_size}, m_options);
-        m_module->forward(input);
-        torch::cuda::synchronize(m_options.device().index());
     }
 
     c10::cuda::CUDAGuard device_guard(m_options.device());
     c10::cuda::CUDACachingAllocator::emptyCache();
 
+    // Warmup
+    auto input = torch::empty({m_batch_size, m_num_input_features, m_in_chunk_size}, m_options);
+    auto scores = m_module->forward(input);
+    m_decoder->beam_search_part_1({scores, m_batch_size, m_decoder_options});
+    torch::cuda::synchronize(m_options.device().index());
+
     start_threads();
 }
 
@@ -153,10 +154,43 @@ stats::NamedStats CudaCaller::sample_stats() const {
     return stats;
 }
 
-int CudaCaller::determine_batch_size(const CRFModelConfig &model_config,
-                                     int chunk_size_in,
-                                     float memory_limit_fraction,
-                                     bool run_benchmark) {
+std::pair<int64_t, int64_t> CudaCaller::calculate_memory_requirements() const {
+    // Determine size of working memory for CRFModel divided by (batch_size * chunk_size)
+    // These values have been determined by running dorado with different models and
+    // reporting the actual allocation size per chunk-timestep.
+    int64_t crfmodel_bytes_per_chunk_timestep;
+    if (m_config.out_features.has_value()) {
+        auto out_features = m_config.out_features.value();
+        std::unordered_map<int, int64_t> out_features_map{{128, 2312}, {256, 8712}};
+        crfmodel_bytes_per_chunk_timestep = out_features_map[out_features];
+        if (crfmodel_bytes_per_chunk_timestep == 0) {
+            spdlog::warn("Auto batchsize detection failed. Unexpected model out_features {}.",
+                         out_features);
+            return {0, 0};
+        }
+    } else {
+        std::unordered_map<int, int64_t> insize_map{
+                {96, 960}, {128, 1280}, {384, 2816}, {768, 9728}, {1024, 10240}};
+        crfmodel_bytes_per_chunk_timestep = insize_map[m_config.lstm_size];
+        if (crfmodel_bytes_per_chunk_timestep == 0) {
+            spdlog::warn("Auto batchsize detection failed. Unexpected model insize {}.",
+                         m_config.lstm_size);
+            return {0, 0};
+        }
+    }
+
+    // Determine size of working memory for decoder divided by (batch_size * chunk_size)
+    // Decoder needs roughly (beam_width * 4) + num_states + 10 extra bytes
+    // where num_states = 4^(state_len+1)
+    // See `dorado::basecall::decode::CUDADecoder::beam_search_part_1()` for more details.
+    int64_t decode_bytes_per_chunk_timestep =
+            10 + m_decoder_options.beam_width * 4 + (1ull << (m_config.state_len * 2 + 2));
+
+    return {crfmodel_bytes_per_chunk_timestep * m_out_chunk_size,
+            decode_bytes_per_chunk_timestep * m_out_chunk_size};
+}
+
+int CudaCaller::determine_batch_size(float memory_limit_fraction, bool run_benchmark) {
     c10::cuda::CUDAGuard device_guard(m_options.device());
     constexpr float GB = 1.0e9f;
     int64_t available = utils::available_memory(m_options.device());
@@ -186,42 +220,13 @@ int CudaCaller::determine_batch_size(const CRFModelConfig &model_config,
     }
     spdlog::debug("Auto batchsize {}: memory limit {:.2f}GB", m_device, gpu_mem_limit / GB);
 
-    // Determine size of working memory for CRFModel divided by (batch_size * chunk_size)
-    // These values have been determined by running dorado with different models and
-    // reporting the actual allocation size per chunk-timestep.
-    int64_t crfmodel_bytes_per_chunk_timestep;
-    if (model_config.out_features.has_value()) {
-        auto out_features = model_config.out_features.value();
-        std::unordered_map<int, int64_t> out_features_map{{128, 2312}, {256, 8712}};
-        crfmodel_bytes_per_chunk_timestep = out_features_map[out_features];
-        if (crfmodel_bytes_per_chunk_timestep == 0) {
-            spdlog::warn("Auto batchsize detection failed. Unexpected model out_features {}.",
-                         out_features);
-            return granularity;
-        }
-    } else {
-        std::unordered_map<int, int64_t> insize_map{
-                {96, 960}, {128, 1280}, {384, 2816}, {768, 9728}, {1024, 10240}};
-        crfmodel_bytes_per_chunk_timestep = insize_map[model_config.lstm_size];
-        if (crfmodel_bytes_per_chunk_timestep == 0) {
-            spdlog::warn("Auto batchsize detection failed. Unexpected model insize {}.",
-                         model_config.lstm_size);
-            return granularity;
-        }
+    auto [crfmodel_bytes_per_chunk, decode_bytes_per_chunk] = calculate_memory_requirements();
+    if (crfmodel_bytes_per_chunk == 0) {
+        return granularity;
     }
 
-    // Determine size of working memory for decoder divided by (batch_size * chunk_size)
-    // Decoder needs roughly (beam_width * 4) + num_states + 10 extra bytes
-    // where num_states = 4^(state_len+1)
-    // See `dorado::basecall::decode::CUDADecoder::beam_search_part_1()` for more details.
-    int64_t decode_bytes_per_chunk_timestep =
-            10 + m_decoder_options.beam_width * 4 + (1ull << (model_config.state_len * 2 + 2));
-
-    auto bytes_per_chunk_timestep =
-            decode_bytes_per_chunk_timestep + crfmodel_bytes_per_chunk_timestep;
-    int64_t chunk_size_out = chunk_size_in / model_config.stride;
-
-    int max_batch_size = int(gpu_mem_limit / (bytes_per_chunk_timestep * chunk_size_out));
+    auto bytes_per_chunk = crfmodel_bytes_per_chunk + decode_bytes_per_chunk;
+    int max_batch_size = int(gpu_mem_limit / bytes_per_chunk);
     max_batch_size -= max_batch_size % granularity;
     if (max_batch_size <= granularity) {
         spdlog::warn("Maximum safe estimated batch size is only {}.", max_batch_size);
@@ -230,7 +235,7 @@ int CudaCaller::determine_batch_size(const CRFModelConfig &model_config,
 
     int best_batch_size = granularity;
     float best_time = std::numeric_limits<float>::max();
-    const int chunk_size = std::min(chunk_size_in, model_config.stride * 300);
+    const int chunk_size = std::min(m_in_chunk_size, m_config.stride * 300);
     if (run_benchmark) {
         // We limit the maximum when doing benchmarking to avoid excessive startup time.
         const int max_batch_size_limit = 10240;
@@ -239,8 +244,7 @@ int CudaCaller::determine_batch_size(const CRFModelConfig &model_config,
                       max_batch_size, granularity);
         for (int batch_size = granularity; batch_size <= max_batch_size;
              batch_size += granularity) {
-            auto input =
-                    torch::empty({batch_size, model_config.num_features, chunk_size}, m_options);
+            auto input = torch::empty({batch_size, m_config.num_features, chunk_size}, m_options);
             float time = std::numeric_limits<float>::max();
             for (int i = 0; i < 2; ++i) {  // run twice to eliminate outliers
@@ -275,10 +279,24 @@ int CudaCaller::determine_batch_size(const CRFModelConfig &model_config,
         best_batch_size = max_batch_size;
     }
 
+#if TORCH_VERSION_MAJOR >= 2
+    // Do not re-use smaller chunks of buffers larger than half the decode size
+    // This prevents small allocations from reusing large sections of cached allocated memory
+    // which can lead to OoM errors when the original large allocation is needed again
+    auto max_split_size_mb = decode_bytes_per_chunk * best_batch_size / (2 * 1024 * 1024);
+    auto device_stats = c10::cuda::CUDACachingAllocator::getDeviceStats(m_options.device().index());
+    if (max_split_size_mb < device_stats.max_split_size) {
+        std::string max_split_size_mb_settings =
+                "max_split_size_mb:" + std::to_string(max_split_size_mb);
+
+        c10::cuda::CUDACachingAllocator::setAllocatorSettings(max_split_size_mb_settings);
+    }
+#endif
+
     spdlog::debug("Device {} Model memory {:.2f}GB", m_device,
-                  (crfmodel_bytes_per_chunk_timestep * chunk_size_out * best_batch_size) / GB);
+                  (crfmodel_bytes_per_chunk * best_batch_size) / GB);
     spdlog::debug("Device {} Decode memory {:.2f}GB", m_device,
-                  (decode_bytes_per_chunk_timestep * chunk_size_out * best_batch_size) / GB);
+                  (decode_bytes_per_chunk * best_batch_size) / GB);
 
     return best_batch_size;
 }
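For a concrete sense of the decode-memory formula above, the following standalone C++ sketch (not part of the patch) plugs in hypothetical values: a beam width of 32, a state_len of 4, an output chunk size of 1600, and the 2816 bytes-per-chunk-timestep figure that the patch's insize_map assigns to lstm_size 384.

#include <cstdint>
#include <cstdio>

int main() {
    // Hypothetical decoder/model parameters, chosen only for illustration.
    const int64_t beam_width = 32;
    const int64_t state_len = 4;
    const int64_t out_chunk_size = 1600;
    // Empirical per-chunk-timestep cost for lstm_size 384, from the patch's insize_map.
    const int64_t crfmodel_bytes_per_chunk_timestep = 2816;

    // Decoder working memory per chunk-timestep:
    //   10 + beam_width * 4 + num_states, where num_states = 4^(state_len + 1).
    // 4^(state_len + 1) is computed as 1 << (state_len * 2 + 2), as in the patch,
    // so here: 10 + 128 + 1024 = 1162 bytes.
    const int64_t decode_bytes_per_chunk_timestep =
            10 + beam_width * 4 + (int64_t{1} << (state_len * 2 + 2));

    // Scaling both by the output chunk size gives the pair the new helper returns:
    //   model:  2816 * 1600 = 4,505,600 bytes per chunk in the batch
    //   decode: 1162 * 1600 = 1,859,200 bytes per chunk
    std::printf("model bytes/chunk:  %lld\n",
                static_cast<long long>(crfmodel_bytes_per_chunk_timestep * out_chunk_size));
    std::printf("decode bytes/chunk: %lld\n",
                static_cast<long long>(decode_bytes_per_chunk_timestep * out_chunk_size));
}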
diff --git a/dorado/basecall/CudaCaller.h b/dorado/basecall/CudaCaller.h
index b5b855ac..c1a3dc55 100644
--- a/dorado/basecall/CudaCaller.h
+++ b/dorado/basecall/CudaCaller.h
@@ -53,10 +53,9 @@ class CudaCaller {
         return 64;
     }
 
-    int determine_batch_size(const CRFModelConfig &model_config,
-                             int chunk_size_in,
-                             float memory_limit_fraction,
-                             bool run_benchmark);
+    std::pair<int64_t, int64_t> calculate_memory_requirements() const;
+    int determine_batch_size(float memory_limit_fraction, bool run_benchmark);
+
     void start_threads();
     void cuda_thread_fn();
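To show how the pair returned by calculate_memory_requirements() bounds the batch size, here is a second standalone sketch (again with assumed numbers, not part of the patch) of the division and granularity rounding that determine_batch_size() performs. The patch's allocator tweak then caps max_split_size_mb, the same knob PyTorch exposes via the PYTORCH_CUDA_ALLOC_CONF environment variable, at half the total decode allocation, so that small allocations cannot split up cached blocks that the large decode buffers will need again.

#include <cstdint>
#include <cstdio>

int main() {
    // Assumed numbers: an 8 GB memory budget and the per-chunk costs from the
    // previous sketch; granularity 64 matches get_batch_size_granularity().
    const int64_t gpu_mem_limit = 8'000'000'000;
    const int64_t crfmodel_bytes_per_chunk = 4'505'600;
    const int64_t decode_bytes_per_chunk = 1'859'200;
    const int granularity = 64;

    // Same bound as determine_batch_size(): budget / combined per-chunk cost,
    // rounded down to a multiple of the kernel granularity.
    const int64_t bytes_per_chunk = crfmodel_bytes_per_chunk + decode_bytes_per_chunk;
    int max_batch_size = static_cast<int>(gpu_mem_limit / bytes_per_chunk);
    max_batch_size -= max_batch_size % granularity;
    std::printf("max batch size: %d\n", max_batch_size);  // 1256 rounded down to 1216
}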