From 76cd6eeddd210d02fa07255c9224a747d533dcbb Mon Sep 17 00:00:00 2001
From: Nico Bosshard
Date: Sat, 1 Mar 2025 02:11:36 +0100
Subject: [PATCH 1/4] Implemented a way for llama.cpp to go through all the
 steps of loading a model without actually loading it, in order to validate
 the model and compute the memory required to load it

---
 ggml/src/ggml-backend.cpp       |  5 +++++
 ggml/src/ggml-cuda/ggml-cuda.cu |  4 ++++
 src/llama-kv-cache.cpp          |  2 +-
 src/llama-model-loader.cpp      |  2 +-
 src/llama-model.cpp             | 21 +++++++++++++--------
 5 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index dba7be33b88c0..ca9760687c05e 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -1934,12 +1934,17 @@ static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_ty
 }
 
 static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    GGML_LOG_ERROR("%s: Skip allocateing buffer of size %zu\n", __func__, size);
+    return NULL;
+
     void * data = ggml_aligned_malloc(size);
 
     if (data == NULL) {
         GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
         return NULL;
     }
+    GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
+    return NULL;
 
     return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, size);
 }
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index ebb2ccae04065..46754d4969ed3 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -643,10 +643,14 @@ static bool ggml_backend_buft_is_cuda(ggml_backend_buffer_type_t buft) {
 }
 
 static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+
     ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
 
     ggml_cuda_set_device(buft_ctx->device);
 
+    GGML_LOG_ERROR("%s: Skipping allocating %ld bytes on device %d\n", __func__, size, buft_ctx->device);
+    return nullptr;
+
     void * dev_ptr;
     cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device);
     if (err != cudaSuccess) {
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index feffdf0de52cf..1e1e06120a7a8 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -107,7 +107,7 @@ bool llama_kv_cache_init(
         ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
         if (!buf) {
             LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__);
-            return false;
+            return true;
         }
         ggml_backend_buffer_clear(buf, 0);
         LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 05d58ad90eba9..13d81932c210d 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -813,7 +813,7 @@ void llama_model_loader::done_getting_tensors() const {
 }
 
 void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps) {
-    if (use_mmap) {
+    if (use_mmap && false) {
         mappings.reserve(files.size());
         mmaps_used.reserve(files.size());
         for (const auto & file : files) {
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 36a0a009c4567..85c704f63edae 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -3448,6 +3448,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     const size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
     pimpl->bufs.reserve(n_max_backend_buffer);
 
+
     for (auto & it : ctx_map) {
         ggml_backend_buffer_type_t buft = it.first;
         ggml_context * ctx = it.second;
@@ -3471,6 +3472,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
         bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev);
 
+
         if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
             for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
                 // only the mmap region containing the tensors in the model is mapped to the backend buffer
@@ -3493,9 +3495,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         }
         else {
             ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
-            if (buf == nullptr) {
-                throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
-            }
+            //if (buf == nullptr) {
+                //throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
+            //}
             pimpl->bufs.emplace_back(buf);
             if (use_mlock && ggml_backend_buffer_is_host(buf)) {
                 pimpl->mlock_bufs.emplace_back(new llama_mlock);
@@ -3508,6 +3510,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             }
         }
 
+
         if (pimpl->bufs.empty()) {
             throw std::runtime_error("failed to allocate buffer");
         }
@@ -3515,12 +3518,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         for (auto & buf : buf_map) {
             // indicate that this buffer contains weights
             // this is used by ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight
-            ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+            //ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
         }
 
         ctx_bufs.emplace_back(ctx, buf_map);
     }
 
+
     if (llama_supports_gpu_offload()) {
         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 
@@ -3535,9 +3539,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
     }
 
+
     // print memory requirements per buffer type
     for (auto & buf : pimpl->bufs) {
-        LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
+        //LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
     }
 
     // populate tensors_by_name
@@ -3551,9 +3556,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     for (auto & it : ctx_bufs) {
         ggml_context * ctx = it.first;
         auto & bufs = it.second;
-        if (!ml.load_all_data(ctx, bufs, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
-            return false;
-        }
+        //if (!ml.load_all_data(ctx, bufs, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
+            //return false;
+        //}
     }
 
     if (use_mmap_buffer) {

From 3a129a1c6b6d401ea0407ed071c7e6a6b153eace Mon Sep 17 00:00:00 2001
From: Nico Bosshard
Date: Sun, 9 Mar 2025 21:39:26 +0100
Subject: [PATCH 2/4] Introduced DRYRUN environment variable to toggle the
 dry-run functionality

---
 ggml/src/ggml-backend.cpp       |  8 +++---
 ggml/src/ggml-cuda/ggml-cuda.cu |  7 +++--
 src/llama-kv-cache.cpp          |  8 ++++--
 src/llama-model-loader.cpp      |  9 +++++-
 src/llama-model.cpp             | 51 ++++++++++++++++++---------------
 5 files changed, 50 insertions(+), 33 deletions(-)

diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index ca9760687c05e..3bf459ae709a4 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -1934,8 +1934,10 @@ static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_ty
 }
 
 static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    GGML_LOG_ERROR("%s: Skip allocateing buffer of size %zu\n", __func__, size);
-    return NULL;
+    if(getenv("DRYRUN")) {
+        GGML_LOG_ERROR("[DRYRUN][CPU]: %ld\n", size);
+        return NULL;
+    }
 
     void * data = ggml_aligned_malloc(size);
 
@@ -1943,8 +1945,6 @@ static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_back
         GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
         return NULL;
     }
-    GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
-    return NULL;
 
     return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, size);
 }
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 46754d4969ed3..6849ea256e1df 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -643,13 +643,14 @@ static bool ggml_backend_buft_is_cuda(ggml_backend_buffer_type_t buft) {
 }
 
 static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-
     ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
 
     ggml_cuda_set_device(buft_ctx->device);
 
-    GGML_LOG_ERROR("%s: Skipping allocating %ld bytes on device %d\n", __func__, size, buft_ctx->device);
-    return nullptr;
+	if(getenv("DRYRUN")) {
+		GGML_LOG_ERROR("[DRYRUN][GPU%d]: %ld\n", buft_ctx->device, size);
+		return nullptr;
+	}
 
     void * dev_ptr;
     cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device);
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index 1e1e06120a7a8..c982e6629125a 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -106,8 +106,12 @@ bool llama_kv_cache_init(
 
         ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
         if (!buf) {
-            LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__);
-            return true;
+			if(getenv("DRYRUN")) {
+				LLAMA_LOG_ERROR("%s: pretend allocating buffer for kv cache was successful due to dry-run being enabled\n", __func__);
+				return true;
+			}
+			LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__);
+            return false;
         }
         ggml_backend_buffer_clear(buf, 0);
         LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 13d81932c210d..7ef869ee95e97 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -681,6 +681,13 @@ llama_model_loader::llama_model_loader(
         use_mmap = false;
     }
 
+	if(getenv("DRYRUN")) {
+		if (use_mmap) {
+			LLAMA_LOG_WARN("%s: mmap is not supported for dry-run so it is now disabled\n", __func__);
+			use_mmap = false;
+		}
+	}
+
     this->use_mmap = use_mmap;
     this->check_tensors = check_tensors;
 }
@@ -813,7 +820,7 @@ void llama_model_loader::done_getting_tensors() const {
 }
 
 void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps) {
-    if (use_mmap && false) {
+    if (use_mmap) {
         mappings.reserve(files.size());
         mmaps_used.reserve(files.size());
         for (const auto & file : files) {
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 85c704f63edae..6ed34c040994c 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -3448,7 +3448,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     const size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
     pimpl->bufs.reserve(n_max_backend_buffer);
 
-
     for (auto & it : ctx_map) {
         ggml_backend_buffer_type_t buft = it.first;
         ggml_context * ctx = it.second;
@@ -3472,7 +3471,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
         bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev);
 
-
         if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
             for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
                 // only the mmap region containing the tensors in the model is mapped to the backend buffer
@@ -3495,9 +3493,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         }
         else {
             ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
-            //if (buf == nullptr) {
-                //throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
-            //}
+			if (buf == nullptr) {
+				if(getenv("DRYRUN")) {
+					LLAMA_LOG_WARN("%s: pretend allocating %s buffer was successful due to dry-run being enabled\n", __func__, ggml_backend_buft_name(buft));
+				} else {
+					throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
+				}
+			}
             pimpl->bufs.emplace_back(buf);
             if (use_mlock && ggml_backend_buffer_is_host(buf)) {
                 pimpl->mlock_bufs.emplace_back(new llama_mlock);
@@ -3512,21 +3510,21 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             }
         }
 
-
         if (pimpl->bufs.empty()) {
             throw std::runtime_error("failed to allocate buffer");
         }
 
-        for (auto & buf : buf_map) {
-            // indicate that this buffer contains weights
-            // this is used by ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight
-            //ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
-        }
+		if(!getenv("DRYRUN")) {
+			for (auto & buf : buf_map) {
+				// indicate that this buffer contains weights
+				// this is used by ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight
+				ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+			}
+		}
 
         ctx_bufs.emplace_back(ctx, buf_map);
     }
 
-
     if (llama_supports_gpu_offload()) {
         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 
@@ -3539,11 +3541,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
     }
 
-
     // print memory requirements per buffer type
-    for (auto & buf : pimpl->bufs) {
-        //LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
-    }
+	if(!getenv("DRYRUN")) {
+		for (auto & buf : pimpl->bufs) {
+			LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
+		}
+	}
 
     // populate tensors_by_name
     for (auto & ctx : pimpl->ctxs) {
@@ -3553,13 +3556,15 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     }
 
     // load tensor data
-    for (auto & it : ctx_bufs) {
-        ggml_context * ctx = it.first;
-        auto & bufs = it.second;
-        //if (!ml.load_all_data(ctx, bufs, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
-            //return false;
-        //}
-    }
+	if(!getenv("DRYRUN")) {
+		for (auto & it : ctx_bufs) {
+			ggml_context * ctx = it.first;
+			auto & bufs = it.second;
+			if (!ml.load_all_data(ctx, bufs, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
+				return false;
+			}
+		}
+	}
 
     if (use_mmap_buffer) {
         for (auto & mapping : ml.mappings) {

From e9988886db52892c04c0ac3cc5861575d7956bb8 Mon Sep 17 00:00:00 2001
From: Nico Bosshard
Date: Sun, 9 Mar 2025 23:47:45 +0100
Subject: [PATCH 3/4] Implemented dry-run support for
 ggml_backend_cuda_host_buffer_type_alloc_buffer to allow estimating the
 memory requirements of configurations where some or all layers stay in RAM
 but are processed on the GPU via pinned memory, and added still-untested
 dry-run support for ggml_backend_cuda_split_buffer_init_tensor

---
 examples/main/main.cpp          |  4 ++++
 ggml/src/ggml-cuda/ggml-cuda.cu | 18 ++++++++++++++----
 2 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index cf8659b037ee3..472224149c987 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -153,6 +153,10 @@ int main(int argc, char ** argv) {
     ctx = llama_init.context.get();
 
     if (model == NULL) {
+        if(getenv("DRYRUN")) {
+            LOG_ERR("%s: Dry run completed!\n", __func__);
+            return 0;
+        }
         LOG_ERR("%s: error: unable to load model\n", __func__);
         return 1;
     }
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 6849ea256e1df..7d285538efd00 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -828,12 +828,18 @@ static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buf
         // FIXME: do not crash if cudaMalloc fails
         // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
         ggml_cuda_set_device(id);
+
         char * buf;
-        CUDA_CHECK(ggml_cuda_device_malloc((void**)&buf, size, id));
+        if(getenv("DRYRUN")) {
+            GGML_LOG_ERROR("[DRYRUN][GPU%d]: %ld\n", id, size);
+            buf = nullptr;
+        } else {
+            CUDA_CHECK(ggml_cuda_device_malloc((void**)&buf, size, id));
 
-        // set padding to 0 to avoid possible NaN values
-        if (size > original_size) {
-            CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
+            // set padding to 0 to avoid possible NaN values
+            if (size > original_size) {
+                CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
+            }
         }
 
         extra->data_device[id] = buf;
@@ -1081,6 +1087,10 @@ static void * ggml_cuda_host_malloc(size_t size) {
 }
 
 static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    if(getenv("DRYRUN")) {
+        GGML_LOG_ERROR("[DRYRUN][PINNED]: %ld\n", size);
+        return nullptr;
+    }
     void * ptr = ggml_cuda_host_malloc(size);
 
     if (ptr == nullptr) {

From 5f6b9e7063fbd321c571d9347abb87148bac6a96 Mon Sep 17 00:00:00 2001
From: Nico Bosshard
Date: Sun, 9 Mar 2025 23:51:41 +0100
Subject: [PATCH 4/4] Fixed indentation consistency by always using spaces
 instead of tabs, following the llama.cpp code styling guidelines

---
 ggml/src/ggml-cuda/ggml-cuda.cu |  8 ++---
 src/llama-kv-cache.cpp          | 10 +++---
 src/llama-model-loader.cpp      | 12 +++---
 src/llama-model.cpp             | 56 ++++++++++++++++++-----------------
 4 files changed, 43 insertions(+), 43 deletions(-)

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 7d285538efd00..a9b3f229d5818 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -647,10 +647,10 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
 
     ggml_cuda_set_device(buft_ctx->device);
 
-	if(getenv("DRYRUN")) {
-		GGML_LOG_ERROR("[DRYRUN][GPU%d]: %ld\n", buft_ctx->device, size);
-		return nullptr;
-	}
+    if(getenv("DRYRUN")) {
+        GGML_LOG_ERROR("[DRYRUN][GPU%d]: %ld\n", buft_ctx->device, size);
+        return nullptr;
+    }
 
     void * dev_ptr;
     cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device);
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index c982e6629125a..afdde3c200df7 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -106,11 +106,11 @@ bool llama_kv_cache_init(
 
         ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
         if (!buf) {
-			if(getenv("DRYRUN")) {
-				LLAMA_LOG_ERROR("%s: pretend allocating buffer for kv cache was successful due to dry-run being enabled\n", __func__);
-				return true;
-			}
-			LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__);
+            if(getenv("DRYRUN")) {
+                LLAMA_LOG_ERROR("%s: pretend allocating buffer for kv cache was successful due to dry-run being enabled\n", __func__);
+                return true;
+            }
+            LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__);
             return false;
         }
         ggml_backend_buffer_clear(buf, 0);
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 7ef869ee95e97..911523f61df83 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -681,12 +681,12 @@ llama_model_loader::llama_model_loader(
         use_mmap = false;
     }
 
-	if(getenv("DRYRUN")) {
-		if (use_mmap) {
-			LLAMA_LOG_WARN("%s: mmap is not supported for dry-run so it is now disabled\n", __func__);
-			use_mmap = false;
-		}
-	}
+    if(getenv("DRYRUN")) {
+        if (use_mmap) {
+            LLAMA_LOG_WARN("%s: mmap is not supported for dry-run so it is now disabled\n", __func__);
+            use_mmap = false;
+        }
+    }
 
     this->use_mmap = use_mmap;
     this->check_tensors = check_tensors;
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 6ed34c040994c..c0604b7edaaf5 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -3493,13 +3493,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         }
         else {
             ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
-			if (buf == nullptr) {
-				if(getenv("DRYRUN")) {
-					LLAMA_LOG_WARN("%s: pretend allocating %s buffer was successful due to dry-run being enabled\n", __func__, ggml_backend_buft_name(buft));
-				} else {
-					throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
-				}
-			}
+            if (buf == nullptr) {
+                if(getenv("DRYRUN")) {
+                    LLAMA_LOG_WARN("%s: pretend allocating %s buffer was successful due to dry-run being enabled\n", __func__, ggml_backend_buft_name(buft));
+                } else {
+                    throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
+                }
+            }
             pimpl->bufs.emplace_back(buf);
             if (use_mlock && ggml_backend_buffer_is_host(buf)) {
                 pimpl->mlock_bufs.emplace_back(new llama_mlock);
@@ -3516,13 +3516,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             throw std::runtime_error("failed to allocate buffer");
         }
 
-		if(!getenv("DRYRUN")) {
-			for (auto & buf : buf_map) {
-				// indicate that this buffer contains weights
-				// this is used by ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight
-				ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
-			}
-		}
+        if(!getenv("DRYRUN")) {
+            for (auto & buf : buf_map) {
+                // indicate that this buffer contains weights
+                // this is used by ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight
+                ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+            }
+        }
 
         ctx_bufs.emplace_back(ctx, buf_map);
     }
@@ -3542,11 +3542,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     }
 
     // print memory requirements per buffer type
-	if(!getenv("DRYRUN")) {
-		for (auto & buf : pimpl->bufs) {
-			LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
-		}
-	}
+    if(!getenv("DRYRUN")) {
+        for (auto & buf : pimpl->bufs) {
+            LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
+        }
+    }
 
     // populate tensors_by_name
     for (auto & ctx : pimpl->ctxs) {
@@ -3556,15 +3556,15 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     }
 
     // load tensor data
-	if(!getenv("DRYRUN")) {
-		for (auto & it : ctx_bufs) {
-			ggml_context * ctx = it.first;
-			auto & bufs = it.second;
-			if (!ml.load_all_data(ctx, bufs, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
-				return false;
-			}
-		}
-	}
+    if(!getenv("DRYRUN")) {
+        for (auto & it : ctx_bufs) {
+            ggml_context * ctx = it.first;
+            auto & bufs = it.second;
+            if (!ml.load_all_data(ctx, bufs, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
+                return false;
+            }
+        }
+    }
 
     if (use_mmap_buffer) {
         for (auto & mapping : ml.mappings) {
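
Usage sketch (not part of the patch series): a minimal example of how an application embedding llama.cpp could drive the dry-run mode introduced above. It assumes the public llama.h C API (llama_backend_init, llama_model_default_params, llama_model_load_from_file, llama_model_free) and uses a hypothetical model path and layer count; the patches themselves only check for the presence of the DRYRUN environment variable via getenv.

// Sketch only, under the assumptions stated above: load a model with DRYRUN
// set so the patched backends skip real allocations and instead print lines
// such as [DRYRUN][CPU]: <bytes>, [DRYRUN][GPU0]: <bytes> and
// [DRYRUN][PINNED]: <bytes>, which can be summed per device.
#include <cstdio>
#include <cstdlib>

#include "llama.h"

int main() {
    // Equivalent to launching the process with DRYRUN=1 in the environment
    // (setenv is POSIX; on other platforms set the variable in the shell).
    setenv("DRYRUN", "1", 1);

    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 99; // hypothetical: measure a fully offloaded setup

    // "model.gguf" is a placeholder path. With DRYRUN set, buffer allocation
    // is skipped, so no usable model is expected back; the numbers of interest
    // are in the [DRYRUN] log lines emitted while loading.
    llama_model * model = llama_model_load_from_file("model.gguf", mparams);
    if (model == NULL) {
        fprintf(stderr, "dry run finished, see [DRYRUN] lines above\n");
    } else {
        llama_model_free(model);
    }

    llama_backend_free();
    return 0;
}

From a shell, the simpler route is to export DRYRUN (any value) before invoking the main example binary, e.g. DRYRUN=1 ./llama-cli -m model.gguf, which exits early with the "Dry run completed!" message added in patch 3.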