examples/main/main.cpp (4 additions, 0 deletions)

@@ -153,6 +153,10 @@ int main(int argc, char ** argv) {
     ctx = llama_init.context.get();
 
     if (model == NULL) {
+        if (getenv("DRYRUN")) {
+            LOG_ERR("%s: Dryrun completed!\n", __func__);
+            return 0;
+        }
         LOG_ERR("%s: error: unable to load model\n", __func__);
         return 1;
     }
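Under DRYRUN, the backend allocations changed below intentionally fail, so the model load seen by main() returns NULL; the added check turns that expected failure into a clean exit once the requested sizes have been logged. Below is a minimal sketch of driving a dry run from another C++ program; it is not part of this PR, and the binary name (llama-cli), model path, flags, and the assumption that the log goes to stderr are all guesses about a local build.

    // Hypothetical driver (not part of this PR): run the main example with
    // DRYRUN set so it logs per-backend allocation sizes instead of loading.
    #include <cstdlib>
    #include <iostream>

    int main() {
        // POSIX setenv; the variable is inherited by the child process started
        // via std::system(). On Windows, _putenv_s would be used instead.
        setenv("DRYRUN", "1", /*overwrite=*/1);

        // Binary name, model path, and stderr redirection are assumptions.
        int rc = std::system("./llama-cli -m ./model.gguf 2> dryrun.log");
        if (rc != 0) {
            std::cerr << "dry run did not exit cleanly\n";
            return 1;
        }
        std::cout << "requested allocation sizes written to dryrun.log\n";
        return 0;
    }

The same effect can be had from a shell by prefixing the command with DRYRUN=1.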
ggml/src/ggml-backend.cpp (5 additions, 0 deletions)

@@ -1934,6 +1934,11 @@ static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_ty
 }
 
 static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    if (getenv("DRYRUN")) {
+        GGML_LOG_ERROR("[DRYRUN][CPU]: %zu\n", size);
+        return NULL;
+    }
+
     void * data = ggml_aligned_malloc(size);
 
     if (data == NULL) {
ggml/src/ggml-cuda/ggml-cuda.cu (19 additions, 4 deletions)

@@ -647,6 +647,11 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
 
     ggml_cuda_set_device(buft_ctx->device);
 
+    if (getenv("DRYRUN")) {
+        GGML_LOG_ERROR("[DRYRUN][GPU%d]: %zu\n", buft_ctx->device, size);
+        return nullptr;
+    }
+
     void * dev_ptr;
     cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device);
     if (err != cudaSuccess) {

@@ -823,12 +828,18 @@ static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buf
        // FIXME: do not crash if cudaMalloc fails
        // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
        ggml_cuda_set_device(id);
+
        char * buf;
-       CUDA_CHECK(ggml_cuda_device_malloc((void**)&buf, size, id));
+       if (getenv("DRYRUN")) {
+           GGML_LOG_ERROR("[DRYRUN][GPU%d]: %zu\n", id, size);
+           buf = nullptr;
+       } else {
+           CUDA_CHECK(ggml_cuda_device_malloc((void**)&buf, size, id));
 
-       // set padding to 0 to avoid possible NaN values
-       if (size > original_size) {
-           CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
+           // set padding to 0 to avoid possible NaN values
+           if (size > original_size) {
+               CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
+           }
        }
 
        extra->data_device[id] = buf;

@@ -1076,6 +1087,10 @@ static void * ggml_cuda_host_malloc(size_t size) {
 }
 
 static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    if (getenv("DRYRUN")) {
+        GGML_LOG_ERROR("[DRYRUN][PINNED]: %zu\n", size);
+        return nullptr;
+    }
    void * ptr = ggml_cuda_host_malloc(size);
 
    if (ptr == nullptr) {
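Each backend now reports the size it would have allocated, tagged [DRYRUN][CPU], [DRYRUN][GPU<n>], or [DRYRUN][PINNED]. The PR itself does not aggregate these numbers; below is a small, hypothetical post-processing helper (the tool name and I/O conventions are assumptions, not part of this change) that sums the logged bytes per tag from captured log output.

    // Hypothetical helper, not part of this change: read captured llama.cpp
    // log output from stdin, pick out the [DRYRUN][...] lines emitted above,
    // and report the total requested bytes per backend tag.
    #include <cstdint>
    #include <iostream>
    #include <map>
    #include <regex>
    #include <string>

    int main() {
        // Matches e.g. "[DRYRUN][GPU0]: 134217728" or "[DRYRUN][CPU]: 4096".
        const std::regex re(R"(\[DRYRUN\]\[([^\]]+)\]:\s*(\d+))");
        std::map<std::string, std::uint64_t> totals;

        std::string line;
        while (std::getline(std::cin, line)) {
            std::smatch m;
            if (std::regex_search(line, m, re)) {
                totals[m[1].str()] += std::stoull(m[2].str());
            }
        }
        for (const auto & [tag, bytes] : totals) {
            std::cout << tag << ": " << bytes / (1024.0 * 1024.0) << " MiB\n";
        }
        return 0;
    }

Usage would be along the lines of DRYRUN=1 ./llama-cli -m model.gguf 2>&1 | ./dryrun-sum, with both binary names assumed.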
src/llama-kv-cache.cpp (4 additions, 0 deletions)

@@ -106,6 +106,10 @@ bool llama_kv_cache_init(
 
        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
        if (!buf) {
+           if (getenv("DRYRUN")) {
+               LLAMA_LOG_ERROR("%s: pretending the kv cache buffer allocation succeeded because dry-run is enabled\n", __func__);
+               return true;
+           }
            LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__);
            return false;
        }
src/llama-model-loader.cpp (7 additions, 0 deletions)

@@ -681,6 +681,13 @@ llama_model_loader::llama_model_loader(
        use_mmap = false;
    }
 
+   if (getenv("DRYRUN")) {
+       if (use_mmap) {
+           LLAMA_LOG_WARN("%s: mmap is not supported for dry-run, so it has been disabled\n", __func__);
+           use_mmap = false;
+       }
+   }
+
    this->use_mmap = use_mmap;
    this->check_tensors = check_tensors;
}
src/llama-model.cpp (21 additions, 11 deletions)

@@ -3494,7 +3494,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
        else {
            ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
            if (buf == nullptr) {
-               throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
+               if (getenv("DRYRUN")) {
+                   LLAMA_LOG_WARN("%s: pretending the %s buffer allocation succeeded because dry-run is enabled\n", __func__, ggml_backend_buft_name(buft));
+               } else {
+                   throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
+               }
            }
            pimpl->bufs.emplace_back(buf);
            if (use_mlock && ggml_backend_buffer_is_host(buf)) {

@@ -3512,10 +3516,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
            throw std::runtime_error("failed to allocate buffer");
        }
 
-       for (auto & buf : buf_map) {
-           // indicate that this buffer contains weights
-           // this is used by ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight
-           ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+       if (!getenv("DRYRUN")) {
+           for (auto & buf : buf_map) {
+               // indicate that this buffer contains weights
+               // this is used by ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight
+               ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+           }
        }
 
        ctx_bufs.emplace_back(ctx, buf_map);

@@ -3536,8 +3542,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
    }
 
    // print memory requirements per buffer type
-   for (auto & buf : pimpl->bufs) {
-       LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
+   if (!getenv("DRYRUN")) {
+       for (auto & buf : pimpl->bufs) {
+           LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
+       }
    }
 
    // populate tensors_by_name

@@ -3548,11 +3556,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
    }
 
    // load tensor data
-   for (auto & it : ctx_bufs) {
-       ggml_context * ctx = it.first;
-       auto & bufs = it.second;
-       if (!ml.load_all_data(ctx, bufs, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
-           return false;
-       }
-   }
+   if (!getenv("DRYRUN")) {
+       for (auto & it : ctx_bufs) {
+           ggml_context * ctx = it.first;
+           auto & bufs = it.second;
+           if (!ml.load_all_data(ctx, bufs, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
+               return false;
+           }
+       }
+   }
 
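One possible follow-up, not part of this change: the environment variable is queried with getenv("DRYRUN") at every allocation and load site, so a tiny cached helper could centralize the check. A sketch under that assumption:

    // Hypothetical consolidation of the repeated getenv("DRYRUN") checks above.
    // The lookup is done once and reused for the lifetime of the process, which
    // matches how the variable is used by this change.
    #include <cstdlib>

    static bool dryrun_enabled() {
        static const bool enabled = std::getenv("DRYRUN") != nullptr;
        return enabled;
    }

    // usage at each call site: if (dryrun_enabled()) { ... }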