examples/main/main.cpp (4 additions, 0 deletions)

@@ -153,6 +153,10 @@ int main(int argc, char ** argv) {
     ctx = llama_init.context.get();
 
     if (model == NULL) {
+        if (getenv("DRYRUN")) {
+            LOG_ERR("%s: Dryrun completed!\n", __func__);
+            return 0;
+        }
         LOG_ERR("%s: error: unable to load model\n", __func__);
         return 1;
     }
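Under DRYRUN, the backend allocations changed below intentionally fail, so the model load seen by main() returns NULL; the added check turns that expected failure into a clean exit once the requested sizes have been logged. Below is a minimal sketch of driving a dry run from another C++ program; it is not part of this PR, and the binary name (llama-cli), model path, flags, and the assumption that the log goes to stderr are all guesses about a local build.

    // Hypothetical driver (not part of this PR): run the main example with
    // DRYRUN set so it logs per-backend allocation sizes instead of loading.
    #include <cstdlib>
    #include <iostream>

    int main() {
        // POSIX setenv; the variable is inherited by the child process started
        // via std::system(). On Windows, _putenv_s would be used instead.
        setenv("DRYRUN", "1", /*overwrite=*/1);

        // Binary name, model path, and stderr redirection are assumptions.
        int rc = std::system("./llama-cli -m ./model.gguf 2> dryrun.log");
        if (rc != 0) {
            std::cerr << "dry run did not exit cleanly\n";
            return 1;
        }
        std::cout << "requested allocation sizes written to dryrun.log\n";
        return 0;
    }

The same effect can be had from a shell by prefixing the command with DRYRUN=1.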
ggml/src/ggml-backend.cpp (5 additions, 0 deletions)

@@ -1934,6 +1934,11 @@ static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_ty
 }
 
 static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    if (getenv("DRYRUN")) {
+        GGML_LOG_ERROR("[DRYRUN][CPU]: %zu\n", size);
+        return NULL;
+    }
+
     void * data = ggml_aligned_malloc(size);
 
     if (data == NULL) {
ggml/src/ggml-cuda/ggml-cuda.cu (19 additions, 4 deletions)

@@ -647,6 +647,11 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
 
     ggml_cuda_set_device(buft_ctx->device);
 
+    if (getenv("DRYRUN")) {
+        GGML_LOG_ERROR("[DRYRUN][GPU%d]: %zu\n", buft_ctx->device, size);
+        return nullptr;
+    }
+
     void * dev_ptr;
     cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device);
     if (err != cudaSuccess) {

@@ -823,12 +828,18 @@ static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buf
        // FIXME: do not crash if cudaMalloc fails
        // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
        ggml_cuda_set_device(id);
+
        char * buf;
-       CUDA_CHECK(ggml_cuda_device_malloc((void**)&buf, size, id));
+       if (getenv("DRYRUN")) {
+           GGML_LOG_ERROR("[DRYRUN][GPU%d]: %zu\n", id, size);
+           buf = nullptr;
+       } else {
+           CUDA_CHECK(ggml_cuda_device_malloc((void**)&buf, size, id));
 
-       // set padding to 0 to avoid possible NaN values
-       if (size > original_size) {
-           CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
+           // set padding to 0 to avoid possible NaN values
+           if (size > original_size) {
+               CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
+           }
        }
 
        extra->data_device[id] = buf;

@@ -1076,6 +1087,10 @@ static void * ggml_cuda_host_malloc(size_t size) {
 }
 
 static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    if (getenv("DRYRUN")) {
+        GGML_LOG_ERROR("[DRYRUN][PINNED]: %zu\n", size);
+        return nullptr;
+    }
    void * ptr = ggml_cuda_host_malloc(size);
 
    if (ptr == nullptr) {
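Each backend now reports the size it would have allocated, tagged [DRYRUN][CPU], [DRYRUN][GPU<n>], or [DRYRUN][PINNED]. The PR itself does not aggregate these numbers; below is a small, hypothetical post-processing helper (the tool name and I/O conventions are assumptions, not part of this change) that sums the logged bytes per tag from captured log output.

    // Hypothetical helper, not part of this change: read captured llama.cpp
    // log output from stdin, pick out the [DRYRUN][...] lines emitted above,
    // and report the total requested bytes per backend tag.
    #include <cstdint>
    #include <iostream>
    #include <map>
    #include <regex>
    #include <string>

    int main() {
        // Matches e.g. "[DRYRUN][GPU0]: 134217728" or "[DRYRUN][CPU]: 4096".
        const std::regex re(R"(\[DRYRUN\]\[([^\]]+)\]:\s*(\d+))");
        std::map<std::string, std::uint64_t> totals;

        std::string line;
        while (std::getline(std::cin, line)) {
            std::smatch m;
            if (std::regex_search(line, m, re)) {
                totals[m[1].str()] += std::stoull(m[2].str());
            }
        }
        for (const auto & [tag, bytes] : totals) {
            std::cout << tag << ": " << bytes / (1024.0 * 1024.0) << " MiB\n";
        }
        return 0;
    }

Usage would be along the lines of DRYRUN=1 ./llama-cli -m model.gguf 2>&1 | ./dryrun-sum, with both binary names assumed.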
src/llama-kv-cache.cpp (4 additions, 0 deletions)

@@ -106,6 +106,10 @@ bool llama_kv_cache_init(
 
        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
        if (!buf) {
+           if (getenv("DRYRUN")) {
+               LLAMA_LOG_ERROR("%s: pretending the kv cache buffer allocation succeeded because dry-run is enabled\n", __func__);
+               return true;
+           }
            LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__);
            return false;
        }
src/llama-model-loader.cpp (7 additions, 0 deletions)

@@ -681,6 +681,13 @@ llama_model_loader::llama_model_loader(
        use_mmap = false;
    }
 
+   if (getenv("DRYRUN")) {
+       if (use_mmap) {
+           LLAMA_LOG_WARN("%s: mmap is not supported for dry-run, so it has been disabled\n", __func__);
+           use_mmap = false;
+       }
+   }
+
    this->use_mmap = use_mmap;
    this->check_tensors = check_tensors;
}
src/llama-model.cpp (21 additions, 11 deletions)

@@ -3494,7 +3494,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
        else {
            ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
            if (buf == nullptr) {
-               throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
+               if (getenv("DRYRUN")) {
+                   LLAMA_LOG_WARN("%s: pretending the %s buffer allocation succeeded because dry-run is enabled\n", __func__, ggml_backend_buft_name(buft));
+               } else {
+                   throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
+               }
            }
            pimpl->bufs.emplace_back(buf);
            if (use_mlock && ggml_backend_buffer_is_host(buf)) {

@@ -3512,10 +3516,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
            throw std::runtime_error("failed to allocate buffer");
        }
 
-       for (auto & buf : buf_map) {
-           // indicate that this buffer contains weights
-           // this is used by ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight
-           ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+       if (!getenv("DRYRUN")) {
+           for (auto & buf : buf_map) {
+               // indicate that this buffer contains weights
+               // this is used by ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight
+               ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+           }
        }
 
        ctx_bufs.emplace_back(ctx, buf_map);

@@ -3536,8 +3542,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
    }
 
    // print memory requirements per buffer type
-   for (auto & buf : pimpl->bufs) {
-       LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
+   if (!getenv("DRYRUN")) {
+       for (auto & buf : pimpl->bufs) {
+           LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
+       }
    }
 
    // populate tensors_by_name

@@ -3548,11 +3556,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
    }
 
    // load tensor data
-   for (auto & it : ctx_bufs) {
-       ggml_context * ctx = it.first;
-       auto & bufs = it.second;
-       if (!ml.load_all_data(ctx, bufs, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
-           return false;
-       }
-   }
+   if (!getenv("DRYRUN")) {
+       for (auto & it : ctx_bufs) {
+           ggml_context * ctx = it.first;
+           auto & bufs = it.second;
+           if (!ml.load_all_data(ctx, bufs, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
+               return false;
+           }
+       }
+   }
 
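One possible follow-up, not part of this change: the environment variable is queried with getenv("DRYRUN") at every allocation and load site, so a tiny cached helper could centralize the check. A sketch under that assumption:

    // Hypothetical consolidation of the repeated getenv("DRYRUN") checks above.
    // The lookup is done once and reused for the lifetime of the process, which
    // matches how the variable is used by this change.
    #include <cstdlib>

    static bool dryrun_enabled() {
        static const bool enabled = std::getenv("DRYRUN") != nullptr;
        return enabled;
    }

    // usage at each call site: if (dryrun_enabled()) { ... }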