From 76cd6eeddd210d02fa07255c9224a747d533dcbb Mon Sep 17 00:00:00 2001
From: Nico Bosshard
Date: Sat, 1 Mar 2025 02:11:36 +0100
Subject: [PATCH 1/4] Implemented a way for llama.cpp to go through all the
 steps of loading a model without actually loading it, in order to validate
 the model and compute the memory required to load it

---
 ggml/src/ggml-backend.cpp       |  5 +++++
 ggml/src/ggml-cuda/ggml-cuda.cu |  4 ++++
 src/llama-kv-cache.cpp          |  2 +-
 src/llama-model-loader.cpp      |  2 +-
 src/llama-model.cpp             | 21 +++++++++++++--------
 5 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index dba7be33b88c0..ca9760687c05e 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -1934,12 +1934,17 @@ static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_ty
 }
 
 static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    GGML_LOG_ERROR("%s: Skip allocateing buffer of size %zu\n", __func__, size);
+    return NULL;
+
     void * data = ggml_aligned_malloc(size);
 
     if (data == NULL) {
         GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
         return NULL;
     }
+    GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
+    return NULL;
 
     return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, size);
 }
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index ebb2ccae04065..46754d4969ed3 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -643,10 +643,14 @@ static bool ggml_backend_buft_is_cuda(ggml_backend_buffer_type_t buft) {
 }
 
 static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+
     ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
 
     ggml_cuda_set_device(buft_ctx->device);
 
+    GGML_LOG_ERROR("%s: Skipping allocating %ld bytes on device %d\n", __func__, size, buft_ctx->device);
+    return nullptr;
+
     void * dev_ptr;
     cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device);
     if (err != cudaSuccess) {
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index feffdf0de52cf..1e1e06120a7a8 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -107,7 +107,7 @@ bool llama_kv_cache_init(
         ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
         if (!buf) {
             LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__);
-            return false;
+            return true;
         }
         ggml_backend_buffer_clear(buf, 0);
         LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 05d58ad90eba9..13d81932c210d 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -813,7 +813,7 @@ void llama_model_loader::done_getting_tensors() const {
 }
 
 void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps) {
-    if (use_mmap) {
+    if (use_mmap && false) {
         mappings.reserve(files.size());
         mmaps_used.reserve(files.size());
         for (const auto & file : files) {
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 36a0a009c4567..85c704f63edae 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -3448,6 +3448,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     const size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
     pimpl->bufs.reserve(n_max_backend_buffer);
 
+
     for (auto & it : ctx_map) {
         ggml_backend_buffer_type_t buft = it.first;
         ggml_context * ctx = it.second;
@@ -3471,6 +3472,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
         bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev);
 
+
         if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
             for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
                 // only the mmap region containing the tensors in the model is mapped to the backend buffer
@@ -3493,9 +3495,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         }
         else {
             ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
-            if (buf == nullptr) {
-                throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
-            }
+            //if (buf == nullptr) {
+                //throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
+            //}
             pimpl->bufs.emplace_back(buf);
             if (use_mlock && ggml_backend_buffer_is_host(buf)) {
                 pimpl->mlock_bufs.emplace_back(new llama_mlock);
@@ -3508,6 +3510,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             }
         }
 
+
         if (pimpl->bufs.empty()) {
             throw std::runtime_error("failed to allocate buffer");
         }
@@ -3515,12 +3518,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         for (auto & buf : buf_map) {
             // indicate that this buffer contains weights
             // this is used by ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight
-            ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+            //ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
         }
 
         ctx_bufs.emplace_back(ctx, buf_map);
     }
 
+
     if (llama_supports_gpu_offload()) {
         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 
@@ -3535,9 +3539,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
     }
 
+
     // print memory requirements per buffer type
     for (auto & buf : pimpl->bufs) {
-        LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
+        //LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
     }
 
     // populate tensors_by_name
@@ -3551,9 +3556,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     for (auto & it : ctx_bufs) {
         ggml_context * ctx = it.first;
         auto & bufs = it.second;
-        if (!ml.load_all_data(ctx, bufs, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
-            return false;
-        }
+        //if (!ml.load_all_data(ctx, bufs, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
+            //return false;
+        //}
     }
 
     if (use_mmap_buffer) {

From 3a129a1c6b6d401ea0407ed071c7e6a6b153eace Mon Sep 17 00:00:00 2001
From: Nico Bosshard
Date: Sun, 9 Mar 2025 21:39:26 +0100
Subject: [PATCH 2/4] Introduced DRYRUN environment variable to toggle the
 dry-run functionality

---
 ggml/src/ggml-backend.cpp       |  8 +++---
 ggml/src/ggml-cuda/ggml-cuda.cu |  7 +++--
 src/llama-kv-cache.cpp          |  8 ++++--
 src/llama-model-loader.cpp      |  9 +++++-
 src/llama-model.cpp             | 51 ++++++++++++++++++---------------
 5 files changed, 50 insertions(+), 33 deletions(-)

diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index ca9760687c05e..3bf459ae709a4 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -1934,8 +1934,10 @@ static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_ty
 }
 
 static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    GGML_LOG_ERROR("%s: Skip allocateing buffer of size %zu\n", __func__, size);
-    return NULL;
+    if(getenv("DRYRUN")) {
+        GGML_LOG_ERROR("[DRYRUN][CPU]: %ld\n", size);
+        return NULL;
+    }
 
     void * data = ggml_aligned_malloc(size);
 
@@ -1943,8 +1945,6 @@ static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_back
         GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
         return NULL;
     }
-    GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
-    return NULL;
 
     return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, size);
 }
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 46754d4969ed3..6849ea256e1df 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -643,13 +643,14 @@ static bool ggml_backend_buft_is_cuda(ggml_backend_buffer_type_t buft) {
 }
 
 static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-
     ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
 
     ggml_cuda_set_device(buft_ctx->device);
 
-    GGML_LOG_ERROR("%s: Skipping allocating %ld bytes on device %d\n", __func__, size, buft_ctx->device);
-    return nullptr;
+	if(getenv("DRYRUN")) {
+		GGML_LOG_ERROR("[DRYRUN][GPU%d]: %ld\n", buft_ctx->device, size);
+		return nullptr;
+	}
 
     void * dev_ptr;
     cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device);
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index 1e1e06120a7a8..c982e6629125a 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -106,8 +106,12 @@ bool llama_kv_cache_init(
 
         ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
         if (!buf) {
-            LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__);
-            return true;
+			if(getenv("DRYRUN")) {
+				LLAMA_LOG_ERROR("%s: pretend allocating buffer for kv cache was successful due to dry-run being enabled\n", __func__);
+				return true;
+			}
+			LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__);
+            return false;
         }
         ggml_backend_buffer_clear(buf, 0);
         LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 13d81932c210d..7ef869ee95e97 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -681,6 +681,13 @@ llama_model_loader::llama_model_loader(
         use_mmap = false;
     }
 
+	if(getenv("DRYRUN")) {
+		if (use_mmap) {
+			LLAMA_LOG_WARN("%s: mmap is not supported for dry-run so it is now disabled\n", __func__);
+			use_mmap = false;
+		}
+	}
+
     this->use_mmap = use_mmap;
     this->check_tensors = check_tensors;
 }
@@ -813,7 +820,7 @@ void llama_model_loader::done_getting_tensors() const {
 }
 
 void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps) {
-    if (use_mmap && false) {
+    if (use_mmap) {
         mappings.reserve(files.size());
         mmaps_used.reserve(files.size());
         for (const auto & file : files) {
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 85c704f63edae..6ed34c040994c 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -3448,7 +3448,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     const size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
     pimpl->bufs.reserve(n_max_backend_buffer);
 
-
     for (auto & it : ctx_map) {
         ggml_backend_buffer_type_t buft = it.first;
         ggml_context * ctx = it.second;
@@ -3472,7 +3471,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
         bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev);
 
-
         if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
             for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
                 // only the mmap region containing the tensors in the model is mapped to the backend buffer
@@ -3495,9 +3493,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         }
         else {
             ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
-            //if (buf == nullptr) {
-                //throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
-            //}
+			if (buf == nullptr) {
+				if(getenv("DRYRUN")) {
+					LLAMA_LOG_WARN("%s: pretend allocating %s buffer was successful due to dry-run being enabled\n", __func__, ggml_backend_buft_name(buft));
+				} else {
+					throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
+				}
+			}
             pimpl->bufs.emplace_back(buf);
             if (use_mlock && ggml_backend_buffer_is_host(buf)) {
                 pimpl->mlock_bufs.emplace_back(new llama_mlock);
@@ -3512,21 +3510,21 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             }
         }
 
-
         if (pimpl->bufs.empty()) {
             throw std::runtime_error("failed to allocate buffer");
         }
 
-        for (auto & buf : buf_map) {
-            // indicate that this buffer contains weights
-            // this is used by ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight
-            //ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
-        }
+		if(!getenv("DRYRUN")) {
+			for (auto & buf : buf_map) {
+				// indicate that this buffer contains weights
+				// this is used by ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight
+				ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+			}
+		}
 
         ctx_bufs.emplace_back(ctx, buf_map);
     }
 
-
     if (llama_supports_gpu_offload()) {
         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 
@@ -3539,11 +3541,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
     }
 
-
     // print memory requirements per buffer type
-    for (auto & buf : pimpl->bufs) {
-        //LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
-    }
+	if(!getenv("DRYRUN")) {
+		for (auto & buf : pimpl->bufs) {
+			LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
+		}
+	}
 
     // populate tensors_by_name
     for (auto & ctx : pimpl->ctxs) {
@@ -3553,13 +3556,15 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     }
 
     // load tensor data
-    for (auto & it : ctx_bufs) {
-        ggml_context * ctx = it.first;
-        auto & bufs = it.second;
-        //if (!ml.load_all_data(ctx, bufs, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
-            //return false;
-        //}
-    }
+	if(!getenv("DRYRUN")) {
+		for (auto & it : ctx_bufs) {
+			ggml_context * ctx = it.first;
+			auto & bufs = it.second;
+			if (!ml.load_all_data(ctx, bufs, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
+				return false;
+			}
+		}
+	}
 
     if (use_mmap_buffer) {
         for (auto & mapping : ml.mappings) {

From e9988886db52892c04c0ac3cc5861575d7956bb8 Mon Sep 17 00:00:00 2001
From: Nico Bosshard
Date: Sun, 9 Mar 2025 23:47:45 +0100
Subject: [PATCH 3/4] Implemented dry-run support for
 ggml_backend_cuda_host_buffer_type_alloc_buffer to allow estimating the
 memory requirements of configurations where some or all layers stay in RAM
 but are processed on the GPU via pinned memory, and added still-untested
 dry-run support for ggml_backend_cuda_split_buffer_init_tensor

---
 examples/main/main.cpp          |  4 ++++
 ggml/src/ggml-cuda/ggml-cuda.cu | 18 ++++++++++++++----
 2 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index cf8659b037ee3..472224149c987 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -153,6 +153,10 @@ int main(int argc, char ** argv) {
     ctx = llama_init.context.get();
 
     if (model == NULL) {
+        if(getenv("DRYRUN")) {
+            LOG_ERR("%s: Dry run completed!\n", __func__);
+            return 0;
+        }
         LOG_ERR("%s: error: unable to load model\n", __func__);
         return 1;
     }
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 6849ea256e1df..7d285538efd00 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -828,12 +828,18 @@ static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buf
         // FIXME: do not crash if cudaMalloc fails
         // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
         ggml_cuda_set_device(id);
+
         char * buf;
-        CUDA_CHECK(ggml_cuda_device_malloc((void**)&buf, size, id));
+        if(getenv("DRYRUN")) {
+            GGML_LOG_ERROR("[DRYRUN][GPU%d]: %ld\n", id, size);
+            buf = nullptr;
+        } else {
+            CUDA_CHECK(ggml_cuda_device_malloc((void**)&buf, size, id));
 
-        // set padding to 0 to avoid possible NaN values
-        if (size > original_size) {
-            CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
+            // set padding to 0 to avoid possible NaN values
+            if (size > original_size) {
+                CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
+            }
         }
 
         extra->data_device[id] = buf;
@@ -1081,6 +1087,10 @@ static void * ggml_cuda_host_malloc(size_t size) {
 }
 
 static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    if(getenv("DRYRUN")) {
+        GGML_LOG_ERROR("[DRYRUN][PINNED]: %ld\n", size);
+        return nullptr;
+    }
     void * ptr = ggml_cuda_host_malloc(size);
 
     if (ptr == nullptr) {

From 5f6b9e7063fbd321c571d9347abb87148bac6a96 Mon Sep 17 00:00:00 2001
From: Nico Bosshard
Date: Sun, 9 Mar 2025 23:51:41 +0100
Subject: [PATCH 4/4] Fixed indentation consistency by always using spaces
 instead of tabs, following the llama.cpp code styling guidelines

---
 ggml/src/ggml-cuda/ggml-cuda.cu |  8 ++---
 src/llama-kv-cache.cpp          | 10 +++---
 src/llama-model-loader.cpp      | 12 +++---
 src/llama-model.cpp             | 56 ++++++++++++++++++-----------------
 4 files changed, 43 insertions(+), 43 deletions(-)

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 7d285538efd00..a9b3f229d5818 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -647,10 +647,10 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
 
     ggml_cuda_set_device(buft_ctx->device);
 
-	if(getenv("DRYRUN")) {
-		GGML_LOG_ERROR("[DRYRUN][GPU%d]: %ld\n", buft_ctx->device, size);
-		return nullptr;
-	}
+    if(getenv("DRYRUN")) {
+        GGML_LOG_ERROR("[DRYRUN][GPU%d]: %ld\n", buft_ctx->device, size);
+        return nullptr;
+    }
 
     void * dev_ptr;
     cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device);
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index c982e6629125a..afdde3c200df7 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -106,11 +106,11 @@ bool llama_kv_cache_init(
 
         ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
         if (!buf) {
-			if(getenv("DRYRUN")) {
-				LLAMA_LOG_ERROR("%s: pretend allocating buffer for kv cache was successful due to dry-run being enabled\n", __func__);
-				return true;
-			}
-			LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__);
+            if(getenv("DRYRUN")) {
+                LLAMA_LOG_ERROR("%s: pretend allocating buffer for kv cache was successful due to dry-run being enabled\n", __func__);
+                return true;
+            }
+            LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__);
             return false;
         }
         ggml_backend_buffer_clear(buf, 0);
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 7ef869ee95e97..911523f61df83 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -681,12 +681,12 @@ llama_model_loader::llama_model_loader(
         use_mmap = false;
     }
 
-	if(getenv("DRYRUN")) {
-		if (use_mmap) {
-			LLAMA_LOG_WARN("%s: mmap is not supported for dry-run so it is now disabled\n", __func__);
-			use_mmap = false;
-		}
-	}
+    if(getenv("DRYRUN")) {
+        if (use_mmap) {
+            LLAMA_LOG_WARN("%s: mmap is not supported for dry-run so it is now disabled\n", __func__);
+            use_mmap = false;
+        }
+    }
 
     this->use_mmap = use_mmap;
     this->check_tensors = check_tensors;
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 6ed34c040994c..c0604b7edaaf5 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -3493,13 +3493,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         }
         else {
             ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
-			if (buf == nullptr) {
-				if(getenv("DRYRUN")) {
-					LLAMA_LOG_WARN("%s: pretend allocating %s buffer was successful due to dry-run being enabled\n", __func__, ggml_backend_buft_name(buft));
-				} else {
-					throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
-				}
-			}
+            if (buf == nullptr) {
+                if(getenv("DRYRUN")) {
+                    LLAMA_LOG_WARN("%s: pretend allocating %s buffer was successful due to dry-run being enabled\n", __func__, ggml_backend_buft_name(buft));
+                } else {
+                    throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
+                }
+            }
             pimpl->bufs.emplace_back(buf);
             if (use_mlock && ggml_backend_buffer_is_host(buf)) {
                 pimpl->mlock_bufs.emplace_back(new llama_mlock);
@@ -3516,13 +3516,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             throw std::runtime_error("failed to allocate buffer");
         }
 
-		if(!getenv("DRYRUN")) {
-			for (auto & buf : buf_map) {
-				// indicate that this buffer contains weights
-				// this is used by ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight
-				ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
-			}
-		}
+        if(!getenv("DRYRUN")) {
+            for (auto & buf : buf_map) {
+                // indicate that this buffer contains weights
+                // this is used by ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight
+                ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+            }
+        }
 
         ctx_bufs.emplace_back(ctx, buf_map);
     }
@@ -3542,11 +3542,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     }
 
     // print memory requirements per buffer type
-	if(!getenv("DRYRUN")) {
-		for (auto & buf : pimpl->bufs) {
-			LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
-		}
-	}
+    if(!getenv("DRYRUN")) {
+        for (auto & buf : pimpl->bufs) {
+            LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
+        }
+    }
 
     // populate tensors_by_name
     for (auto & ctx : pimpl->ctxs) {
@@ -3556,15 +3556,15 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     }
 
     // load tensor data
-	if(!getenv("DRYRUN")) {
-		for (auto & it : ctx_bufs) {
-			ggml_context * ctx = it.first;
-			auto & bufs = it.second;
-			if (!ml.load_all_data(ctx, bufs, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
-				return false;
-			}
-		}
-	}
+    if(!getenv("DRYRUN")) {
+        for (auto & it : ctx_bufs) {
+            ggml_context * ctx = it.first;
+            auto & bufs = it.second;
+            if (!ml.load_all_data(ctx, bufs, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
+                return false;
+            }
+        }
+    }
 
     if (use_mmap_buffer) {
         for (auto & mapping : ml.mappings) {
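
Usage sketch (not part of the patch series): a minimal example of how an application embedding llama.cpp could drive the dry-run mode introduced above. It assumes the public llama.h C API (llama_backend_init, llama_model_default_params, llama_model_load_from_file, llama_model_free) and uses a hypothetical model path and layer count; the patches themselves only check for the presence of the DRYRUN environment variable via getenv.

// Sketch only, under the assumptions stated above: load a model with DRYRUN
// set so the patched backends skip real allocations and instead print lines
// such as [DRYRUN][CPU]: <bytes>, [DRYRUN][GPU0]: <bytes> and
// [DRYRUN][PINNED]: <bytes>, which can be summed per device.
#include <cstdio>
#include <cstdlib>

#include "llama.h"

int main() {
    // Equivalent to launching the process with DRYRUN=1 in the environment
    // (setenv is POSIX; on other platforms set the variable in the shell).
    setenv("DRYRUN", "1", 1);

    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 99; // hypothetical: measure a fully offloaded setup

    // "model.gguf" is a placeholder path. With DRYRUN set, buffer allocation
    // is skipped, so no usable model is expected back; the numbers of interest
    // are in the [DRYRUN] log lines emitted while loading.
    llama_model * model = llama_model_load_from_file("model.gguf", mparams);
    if (model == NULL) {
        fprintf(stderr, "dry run finished, see [DRYRUN] lines above\n");
    } else {
        llama_model_free(model);
    }

    llama_backend_free();
    return 0;
}

From a shell, the simpler route is to export DRYRUN (any value) before invoking the main example binary, e.g. DRYRUN=1 ./llama-cli -m model.gguf, which exits early with the "Dry run completed!" message added in patch 3.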