Update llama.cpp submodule to 66c1968f7 #2618

Merged: 1 commit merged on Feb 20, 2024
7 changes: 6 additions & 1 deletion llm/dyn_ext_server.go
@@ -106,7 +106,12 @@ func newDynExtServer(library, model string, adapters, projectors []string, opts
 	sparams.memory_f16 = C.bool(opts.F16KV)
 	sparams.use_mlock = C.bool(opts.UseMLock)
 	sparams.use_mmap = C.bool(opts.UseMMap)
-	sparams.numa = C.bool(opts.UseNUMA)
+
+	if opts.UseNUMA {
+		sparams.numa = C.int(1)
+	} else {
+		sparams.numa = C.int(0)
+	}
 
 	sparams.lora_adapters = nil
 	for i := 0; i < len(adapters); i++ {
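Note on the change above: the Go wrapper now passes an integer instead of a bool because upstream llama.cpp replaced the boolean NUMA flag with a ggml_numa_strategy enum. A minimal sketch of that enum follows; the exact values are an assumption based on upstream ggml.h from this period and are not part of this PR's diff.

// ggml_numa_strategy as introduced upstream (sketch; values assumed, not
// verified against 66c1968f7). The Go side maps UseNUMA=false to 0
// (disabled) and UseNUMA=true to 1 (distribute).
enum ggml_numa_strategy {
    GGML_NUMA_STRATEGY_DISABLED   = 0,
    GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
    GGML_NUMA_STRATEGY_ISOLATE    = 2,
    GGML_NUMA_STRATEGY_NUMACTL    = 3,
    GGML_NUMA_STRATEGY_MIRROR     = 4,
    GGML_NUMA_STRATEGY_COUNT
};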
5 changes: 3 additions & 2 deletions llm/ext_server/ext_server.cpp
@@ -80,7 +80,7 @@ void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
   params.main_gpu = sparams->main_gpu;
   params.use_mlock = sparams->use_mlock;
   params.use_mmap = sparams->use_mmap;
-  params.numa = sparams->numa;
+  params.numa = (ggml_numa_strategy)sparams->numa;
   params.embedding = sparams->embedding;
   if (sparams->model != NULL) {
     params.model = sparams->model;
@@ -111,7 +111,8 @@ void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
   }
 #endif
 
-  llama_backend_init(params.numa);
+  llama_backend_init();
+  llama_numa_init(params.numa);
 
   // load the model
   if (!llama->load_model(params)) {
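Note on the two-call initialization above: upstream removed the NUMA argument from llama_backend_init and moved NUMA setup into a separate llama_numa_init call. A minimal sketch of the new sequence, assuming the llama.h declarations at this revision:

#include "llama.h"

// Backend init no longer takes the NUMA flag; NUMA configuration is a
// separate call that takes a ggml_numa_strategy value.
static void init_backend(enum ggml_numa_strategy numa) {
    llama_backend_init();   // was: llama_backend_init(use_numa_bool)
    llama_numa_init(numa);  // e.g. GGML_NUMA_STRATEGY_DISABLED or _DISTRIBUTE
}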
2 changes: 1 addition & 1 deletion llm/ext_server/ext_server.h
@@ -41,7 +41,7 @@ typedef struct ext_server_params {
   int32_t main_gpu;   // the GPU that is used for scratch and small tensors
   bool use_mlock;     // force system to keep model in RAM
   bool use_mmap;      // use mmap if possible
-  bool numa;          // attempt optimizations that help on some NUMA systems
+  int numa;           // attempt optimizations that help on some NUMA systems
   bool embedding;     // get only sentence embedding
   ext_server_lora_adapter_t *lora_adapters;
   char *mmproj;
2 changes: 1 addition & 1 deletion llm/llama.cpp
Submodule llama.cpp updated 75 files
+7 −3 .github/workflows/build.yml
+2 −0 .gitignore
+223 −183 CMakeLists.txt
+29 −0 Makefile
+1 −1 README.md
+5 −1 ci/run.sh
+57 −29 common/common.cpp
+2 −2 common/common.h
+1 −1 common/sampling.cpp
+7 −7 common/sampling.h
+23 −1 convert-hf-to-gguf.py
+21 −16 convert.py
+3 −2 examples/batched-bench/batched-bench.cpp
+1 −1 examples/batched.swift/Sources/main.swift
+3 −2 examples/batched/batched.cpp
+2 −1 examples/beam-search/beam-search.cpp
+19 −19 examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+2 −1 examples/embedding/embedding.cpp
+0 −2 examples/export-lora/export-lora.cpp
+2 −1 examples/imatrix/imatrix.cpp
+2 −1 examples/infill/infill.cpp
+1 −2 examples/llama-bench/llama-bench.cpp
+2 −2 examples/llama.android/app/src/main/cpp/llama-android.cpp
+1 −1 examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
+49 −5 examples/llava/README.md
+642 −136 examples/llava/clip.cpp
+28 −21 examples/llava/clip.h
+59 −7 examples/llava/convert-image-encoder-to-gguf.py
+26 −3 examples/llava/llava-cli.cpp
+167 −0 examples/llava/llava-surgery-v2.py
+1 −5 examples/llava/llava-surgery.py
+280 −17 examples/llava/llava.cpp
+0 −2 examples/llava/llava.h
+2 −1 examples/lookahead/lookahead.cpp
+2 −1 examples/lookup/lookup.cpp
+5 −1 examples/main/README.md
+2 −1 examples/main/main.cpp
+2 −1 examples/parallel/parallel.cpp
+2 −1 examples/passkey/passkey.cpp
+3 −2 examples/perplexity/perplexity.cpp
+5 −3 examples/quantize/quantize.cpp
+9 −0 examples/server/README.md
+104 −33 examples/server/server.cpp
+17 −3 examples/server/utils.hpp
+2 −1 examples/simple/simple.cpp
+2 −1 examples/speculative/speculative.cpp
+1 −1 examples/tokenize/tokenize.cpp
+7 −7 examples/train-text-from-scratch/train-text-from-scratch.cpp
+3 −3 flake.lock
+3 −3 ggml-alloc.c
+18 −9 ggml-backend.c
+279 −203 ggml-cuda.cu
+55 −11 ggml-metal.m
+378 −6 ggml-metal.metal
+627 −30 ggml-quants.c
+12 −2 ggml-quants.h
+48 −29 ggml-vulkan.cpp
+196 −63 ggml.c
+22 −5 ggml.h
+11 −1 gguf-py/gguf/constants.py
+9 −2 gguf-py/gguf/gguf_writer.py
+1 −5 gguf-py/gguf/vocab.py
+158 −79 llama.cpp
+11 −1 llama.h
+37 −0 scripts/compare-commits.sh
+107 −0 scripts/hf.sh
+1 −1 tests/test-autorelease.cpp
+37 −39 tests/test-backend-ops.cpp
+10 −10 tests/test-grammar-parser.cpp
+2 −2 tests/test-llama-grammar.cpp
+1 −1 tests/test-model-load-cancel.cpp
+1 −1 tests/test-tokenizer-0-falcon.cpp
+1 −1 tests/test-tokenizer-0-llama.cpp
+1 −1 tests/test-tokenizer-1-bpe.cpp
+1 −1 tests/test-tokenizer-1-llama.cpp
57 changes: 28 additions & 29 deletions llm/patches/03-cudaleaks.diff → llm/patches/02-cudaleaks.diff
@@ -1,44 +1,38 @@
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 3102762c..568ac1d0 100644
index 7800c6e7..be30db23 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -307,6 +307,10 @@ struct llama_client_slot
}
};
@@ -30,6 +30,10 @@
#include <atomic>
#include <signal.h>

+#ifdef GGML_USE_CUBLAS
+extern "C" GGML_CALL void ggml_free_cublas(void);
+#endif
+
struct llama_server_context
{
llama_model *model = nullptr;
@@ -353,6 +357,10 @@ struct llama_server_context
using json = nlohmann::json;

struct server_params
@@ -353,6 +357,9 @@ struct llama_server_context
llama_free_model(model);
model = nullptr;
}
+#ifdef GGML_USE_CUBLAS
+ ggml_free_cublas();
+#endif
+
}

bool load_model(const gpt_params &params_)
@@ -3093,6 +3101,7 @@ int main(int argc, char **argv)
@@ -3143,6 +3150,7 @@ int main(int argc, char **argv)
sigemptyset (&sigint_action.sa_mask);
sigint_action.sa_flags = 0;
sigaction(SIGINT, &sigint_action, NULL);
+ sigaction(SIGUSR1, &sigint_action, NULL);
#elif defined (_WIN32)
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
@@ -3106,3 +3115,4 @@ int main(int argc, char **argv)
llama_backend_free();
return 0;
}
+
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 96976f24..3543920e 100644
index 933ebbc4..88a4f664 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -39,6 +39,7 @@
@@ -49,30 +43,30 @@ index 96976f24..3543920e 100644
#define cublasGemmEx hipblasGemmEx
#define cublasGemmBatchedEx hipblasGemmBatchedEx
#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
@@ -7928,10 +7929,11 @@ GGML_CALL bool ggml_cublas_loaded(void) {
@@ -7991,10 +7992,10 @@ GGML_CALL bool ggml_cublas_loaded(void) {
return g_cublas_loaded;
}

+static bool g_cublas_initialized = false;
+
GGML_CALL void ggml_init_cublas() {
-GGML_CALL void ggml_init_cublas() {
- static bool initialized = false;
+static bool g_cublas_initialized = false;

- if (!initialized) {
+GGML_CALL void ggml_init_cublas() {
+ if (!g_cublas_initialized) {

#ifdef __HIP_PLATFORM_AMD__
// Workaround for a rocBLAS bug when using multiple graphics cards:
@@ -7941,7 +7943,7 @@ GGML_CALL void ggml_init_cublas() {
@@ -8004,7 +8005,7 @@ GGML_CALL void ggml_init_cublas() {
#endif

if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
- initialized = true;
+ g_cublas_initialized = true;
g_cublas_loaded = false;
fprintf(stderr, "%s: no " GGML_CUDA_NAME " devices found, " GGML_CUDA_NAME " will be disabled\n", __func__);
return;
}
@@ -8011,7 +8013,7 @@ GGML_CALL void ggml_init_cublas() {
@@ -8075,7 +8076,7 @@ GGML_CALL void ggml_init_cublas() {
// configure logging to stdout
// CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));

@@ -81,25 +75,30 @@ index 96976f24..3543920e 100644
g_cublas_loaded = true;
}
}
@@ -11528,3 +11530,17 @@ GGML_CALL int ggml_backend_cuda_reg_devices() {
@@ -11604,3 +11605,23 @@ GGML_CALL int ggml_backend_cuda_reg_devices() {
}
return device_count;
}
+
+
+extern "C" GGML_CALL void ggml_free_cublas(void);
+GGML_CALL void ggml_free_cublas(void) {
+ for (int id = 0; id < g_device_count; ++id) {
+#if !defined(GGML_USE_HIPBLAS)
+ CU_CHECK(cuMemUnmap(g_cuda_pool_addr[id], g_cuda_pool_size[id]));
+ g_cuda_pool_size[id] = 0;
+ g_cuda_pool_addr[id] = 0;
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+ if (g_device_caps[id].vmm) {
+ CU_CHECK(cuMemUnmap(g_cuda_pool_addr[id], g_cuda_pool_size[id]));
+ g_cuda_pool_size[id] = 0;
+ g_cuda_pool_addr[id] = 0;
+ }
+#endif
+ // TODO: free legacy non-vmm memory
+ // destroy cublas handle
+ CUBLAS_CHECK(cublasDestroy(g_cublas_handles[id]));
+ g_cublas_handles[id] = nullptr;
+ }
+
+ g_cublas_initialized = false;
+}
\ No newline at end of file
diff --git a/ggml-cuda.h b/ggml-cuda.h
index b1ebd61d..b4c80c2c 100644
--- a/ggml-cuda.h
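Context for the reworked 02-cudaleaks.diff: ggml_free_cublas unmaps the VMM pool allocations, destroys the cuBLAS handles, and clears the g_cublas_initialized flag, so the CUDA backend can be torn down and brought back up within one process. A rough sketch of that cycle, assuming the exported symbols shown in the patch; the reload helper itself is hypothetical and not part of this PR.

// Sketch only: ggml_init_cublas comes from ggml-cuda.h; ggml_free_cublas is
// added by the patch above. The wrapper function is illustrative.
extern "C" void ggml_init_cublas(void);
extern "C" void ggml_free_cublas(void);

static void reload_cuda_backend(void) {
    ggml_free_cublas();  // unmap VMM pools, destroy cuBLAS handles,
                         // and reset g_cublas_initialized
    ggml_init_cublas();  // runs device discovery again because the flag is clear
}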
96 changes: 0 additions & 96 deletions llm/patches/02-shutdown.diff

This file was deleted.