update llama.cpp submodule to 66c1968f7 (#2618)
jmorganca committed Feb 20, 2024
1 parent ace2cdf · commit 4613a08
Showing 6 changed files with 39 additions and 130 deletions.
7 changes: 6 additions & 1 deletion llm/dyn_ext_server.go
@@ -106,7 +106,12 @@ func newDynExtServer(library, model string, adapters, projectors []string, opts
 	sparams.memory_f16 = C.bool(opts.F16KV)
 	sparams.use_mlock = C.bool(opts.UseMLock)
 	sparams.use_mmap = C.bool(opts.UseMMap)
-	sparams.numa = C.bool(opts.UseNUMA)
 
+	if opts.UseNUMA {
+		sparams.numa = C.int(1)
+	} else {
+		sparams.numa = C.int(0)
+	}
+
 	sparams.lora_adapters = nil
 	for i := 0; i < len(adapters); i++ {
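Note: the switch from C.bool to C.int above tracks an upstream llama.cpp change in which the NUMA setting became a strategy enum instead of a flag. For reference, the enum (sketched from ggml.h at roughly this revision; not text from this commit) looks approximately like:

enum ggml_numa_strategy {
    GGML_NUMA_STRATEGY_DISABLED   = 0,
    GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
    GGML_NUMA_STRATEGY_ISOLATE    = 2,
    GGML_NUMA_STRATEGY_NUMACTL    = 3,
    GGML_NUMA_STRATEGY_MIRROR     = 4,
    GGML_NUMA_STRATEGY_COUNT,
};

If the values are as above, C.int(0) keeps NUMA optimizations disabled and C.int(1) selects the "distribute" strategy.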
5 changes: 3 additions & 2 deletions llm/ext_server/ext_server.cpp
@@ -80,7 +80,7 @@ void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
   params.main_gpu = sparams->main_gpu;
   params.use_mlock = sparams->use_mlock;
   params.use_mmap = sparams->use_mmap;
-  params.numa = sparams->numa;
+  params.numa = (ggml_numa_strategy)sparams->numa;
   params.embedding = sparams->embedding;
   if (sparams->model != NULL) {
     params.model = sparams->model;
@@ -111,7 +111,8 @@ void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
   }
 #endif
 
-  llama_backend_init(params.numa);
+  llama_backend_init();
+  llama_numa_init(params.numa);
 
   // load the model
   if (!llama->load_model(params)) {
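For context, the updated llama.cpp also splits backend initialization from NUMA setup, which is what the second hunk reflects. A minimal caller-side sketch of the new sequence for code embedding llama.cpp (the strategy value here is an example, not from this commit; error handling omitted):

// One-time global setup: llama_backend_init() no longer takes a NUMA flag.
llama_backend_init();
// NUMA policy is now configured separately, with a ggml_numa_strategy value.
llama_numa_init(GGML_NUMA_STRATEGY_DISABLED);
// ... llama_load_model_from_file(...), llama_new_context_with_model(...), serve requests ...
// Paired teardown when the server shuts down.
llama_backend_free();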
2 changes: 1 addition & 1 deletion llm/ext_server/ext_server.h
@@ -41,7 +41,7 @@ typedef struct ext_server_params {
   int32_t main_gpu;  // the GPU that is used for scratch and small tensors
   bool use_mlock;    // force system to keep model in RAM
   bool use_mmap;     // use mmap if possible
-  bool numa;         // attempt optimizations that help on some NUMA systems
+  int numa;          // attempt optimizations that help on some NUMA systems
   bool embedding;    // get only sentence embedding
   ext_server_lora_adapter_t *lora_adapters;
   char *mmproj;
2 changes: 1 addition & 1 deletion llm/llama.cpp
Submodule llama.cpp updated 75 files
+7 −3 .github/workflows/build.yml
+2 −0 .gitignore
+223 −183 CMakeLists.txt
+29 −0 Makefile
+1 −1 README.md
+5 −1 ci/run.sh
+57 −29 common/common.cpp
+2 −2 common/common.h
+1 −1 common/sampling.cpp
+7 −7 common/sampling.h
+23 −1 convert-hf-to-gguf.py
+21 −16 convert.py
+3 −2 examples/batched-bench/batched-bench.cpp
+1 −1 examples/batched.swift/Sources/main.swift
+3 −2 examples/batched/batched.cpp
+2 −1 examples/beam-search/beam-search.cpp
+19 −19 examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+2 −1 examples/embedding/embedding.cpp
+0 −2 examples/export-lora/export-lora.cpp
+2 −1 examples/imatrix/imatrix.cpp
+2 −1 examples/infill/infill.cpp
+1 −2 examples/llama-bench/llama-bench.cpp
+2 −2 examples/llama.android/app/src/main/cpp/llama-android.cpp
+1 −1 examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
+49 −5 examples/llava/README.md
+642 −136 examples/llava/clip.cpp
+28 −21 examples/llava/clip.h
+59 −7 examples/llava/convert-image-encoder-to-gguf.py
+26 −3 examples/llava/llava-cli.cpp
+167 −0 examples/llava/llava-surgery-v2.py
+1 −5 examples/llava/llava-surgery.py
+280 −17 examples/llava/llava.cpp
+0 −2 examples/llava/llava.h
+2 −1 examples/lookahead/lookahead.cpp
+2 −1 examples/lookup/lookup.cpp
+5 −1 examples/main/README.md
+2 −1 examples/main/main.cpp
+2 −1 examples/parallel/parallel.cpp
+2 −1 examples/passkey/passkey.cpp
+3 −2 examples/perplexity/perplexity.cpp
+5 −3 examples/quantize/quantize.cpp
+9 −0 examples/server/README.md
+104 −33 examples/server/server.cpp
+17 −3 examples/server/utils.hpp
+2 −1 examples/simple/simple.cpp
+2 −1 examples/speculative/speculative.cpp
+1 −1 examples/tokenize/tokenize.cpp
+7 −7 examples/train-text-from-scratch/train-text-from-scratch.cpp
+3 −3 flake.lock
+3 −3 ggml-alloc.c
+18 −9 ggml-backend.c
+279 −203 ggml-cuda.cu
+55 −11 ggml-metal.m
+378 −6 ggml-metal.metal
+627 −30 ggml-quants.c
+12 −2 ggml-quants.h
+48 −29 ggml-vulkan.cpp
+196 −63 ggml.c
+22 −5 ggml.h
+11 −1 gguf-py/gguf/constants.py
+9 −2 gguf-py/gguf/gguf_writer.py
+1 −5 gguf-py/gguf/vocab.py
+158 −79 llama.cpp
+11 −1 llama.h
+37 −0 scripts/compare-commits.sh
+107 −0 scripts/hf.sh
+1 −1 tests/test-autorelease.cpp
+37 −39 tests/test-backend-ops.cpp
+10 −10 tests/test-grammar-parser.cpp
+2 −2 tests/test-llama-grammar.cpp
+1 −1 tests/test-model-load-cancel.cpp
+1 −1 tests/test-tokenizer-0-falcon.cpp
+1 −1 tests/test-tokenizer-0-llama.cpp
+1 −1 tests/test-tokenizer-1-bpe.cpp
+1 −1 tests/test-tokenizer-1-llama.cpp
57 changes: 28 additions & 29 deletions llm/patches/03-cudaleaks.diff → llm/patches/02-cudaleaks.diff
@@ -1,44 +1,38 @@
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 3102762c..568ac1d0 100644
index 7800c6e7..be30db23 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -307,6 +307,10 @@ struct llama_client_slot
}
};
@@ -30,6 +30,10 @@
#include <atomic>
#include <signal.h>

+#ifdef GGML_USE_CUBLAS
+extern "C" GGML_CALL void ggml_free_cublas(void);
+#endif
+
struct llama_server_context
{
llama_model *model = nullptr;
@@ -353,6 +357,10 @@ struct llama_server_context
using json = nlohmann::json;

struct server_params
@@ -353,6 +357,9 @@ struct llama_server_context
llama_free_model(model);
model = nullptr;
}
+#ifdef GGML_USE_CUBLAS
+ ggml_free_cublas();
+#endif
+
}

bool load_model(const gpt_params &params_)
@@ -3093,6 +3101,7 @@ int main(int argc, char **argv)
@@ -3143,6 +3150,7 @@ int main(int argc, char **argv)
sigemptyset (&sigint_action.sa_mask);
sigint_action.sa_flags = 0;
sigaction(SIGINT, &sigint_action, NULL);
+ sigaction(SIGUSR1, &sigint_action, NULL);
#elif defined (_WIN32)
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
@@ -3106,3 +3115,4 @@ int main(int argc, char **argv)
llama_backend_free();
return 0;
}
+
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 96976f24..3543920e 100644
index 933ebbc4..88a4f664 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -39,6 +39,7 @@
@@ -49,30 +43,30 @@ index 96976f24..3543920e 100644
#define cublasGemmEx hipblasGemmEx
#define cublasGemmBatchedEx hipblasGemmBatchedEx
#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
@@ -7928,10 +7929,11 @@ GGML_CALL bool ggml_cublas_loaded(void) {
@@ -7991,10 +7992,10 @@ GGML_CALL bool ggml_cublas_loaded(void) {
return g_cublas_loaded;
}

+static bool g_cublas_initialized = false;
+
GGML_CALL void ggml_init_cublas() {
-GGML_CALL void ggml_init_cublas() {
- static bool initialized = false;
+static bool g_cublas_initialized = false;

- if (!initialized) {
+GGML_CALL void ggml_init_cublas() {
+ if (!g_cublas_initialized) {

#ifdef __HIP_PLATFORM_AMD__
// Workaround for a rocBLAS bug when using multiple graphics cards:
@@ -7941,7 +7943,7 @@ GGML_CALL void ggml_init_cublas() {
@@ -8004,7 +8005,7 @@ GGML_CALL void ggml_init_cublas() {
#endif

if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
- initialized = true;
+ g_cublas_initialized = true;
g_cublas_loaded = false;
fprintf(stderr, "%s: no " GGML_CUDA_NAME " devices found, " GGML_CUDA_NAME " will be disabled\n", __func__);
return;
}
@@ -8011,7 +8013,7 @@ GGML_CALL void ggml_init_cublas() {
@@ -8075,7 +8076,7 @@ GGML_CALL void ggml_init_cublas() {
// configure logging to stdout
// CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));

@@ -81,25 +75,30 @@ index 96976f24..3543920e 100644
g_cublas_loaded = true;
}
}
@@ -11528,3 +11530,17 @@ GGML_CALL int ggml_backend_cuda_reg_devices() {
@@ -11604,3 +11605,23 @@ GGML_CALL int ggml_backend_cuda_reg_devices() {
}
return device_count;
}
+
+
+extern "C" GGML_CALL void ggml_free_cublas(void);
+GGML_CALL void ggml_free_cublas(void) {
+ for (int id = 0; id < g_device_count; ++id) {
+#if !defined(GGML_USE_HIPBLAS)
+ CU_CHECK(cuMemUnmap(g_cuda_pool_addr[id], g_cuda_pool_size[id]));
+ g_cuda_pool_size[id] = 0;
+ g_cuda_pool_addr[id] = 0;
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+ if (g_device_caps[id].vmm) {
+ CU_CHECK(cuMemUnmap(g_cuda_pool_addr[id], g_cuda_pool_size[id]));
+ g_cuda_pool_size[id] = 0;
+ g_cuda_pool_addr[id] = 0;
+ }
+#endif
+ // TODO: free legacy non-vmm memory
+ // destroy cublas handle
+ CUBLAS_CHECK(cublasDestroy(g_cublas_handles[id]));
+ g_cublas_handles[id] = nullptr;
+ }
+
+ g_cublas_initialized = false;
+}
\ No newline at end of file
diff --git a/ggml-cuda.h b/ggml-cuda.h
index b1ebd61d..b4c80c2c 100644
--- a/ggml-cuda.h
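For reference, the three pieces of the cudaleaks patch fit together as follows. The destructor body below is an abridged sketch of the surrounding upstream server.cpp (not text from the patch), shown only to make the call site clear:

// Forward declaration added near the top of examples/server/server.cpp:
#ifdef GGML_USE_CUBLAS
extern "C" GGML_CALL void ggml_free_cublas(void);   // implemented in ggml-cuda.cu by the patch
#endif

struct llama_server_context {
    // ... fields and methods as upstream ...
    ~llama_server_context() {
        if (ctx)   { llama_free(ctx);         ctx   = nullptr; }   // abridged upstream cleanup
        if (model) { llama_free_model(model); model = nullptr; }
#ifdef GGML_USE_CUBLAS
        ggml_free_cublas();   // patch: unmap CUDA VMM pools, destroy cuBLAS handles
#endif
    }
};

// In main(), SIGUSR1 is routed to the existing SIGINT handler, so an external
// supervisor (here, ollama) can trigger the same graceful-shutdown path that
// runs the destructor above:
//   sigaction(SIGUSR1, &sigint_action, NULL);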
96 changes: 0 additions & 96 deletions llm/patches/02-shutdown.diff

This file was deleted.
