Update llama.cpp submodule to c29af7e #2868

Merged
merged 1 commit on Mar 1, 2024

6 changes: 3 additions & 3 deletions llm/ext_server/ext_server.cpp
@@ -146,9 +146,9 @@ void llama_server_start() {
     llama->queue_tasks.on_new_task(std::bind(
         &llama_server_context::process_single_task, llama, std::placeholders::_1));
     llama->queue_tasks.on_finish_multitask(std::bind(
-        &llama_server_context::on_finish_multitask, llama, std::placeholders::_1));
-    llama->queue_tasks.on_all_tasks_finished(std::bind(
-        &llama_server_context::run_on_all_tasks_finished, llama));
+        &llama_server_context::on_finish_multitask, llama, std::placeholders::_1));
+    llama->queue_tasks.on_run_slots(std::bind(
+        &llama_server_context::update_slots, llama));
     llama->queue_results.on_multitask_update(std::bind(
         &llama_server_queue::update_multitask,
         &llama->queue_tasks,
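Note: the hunk above tracks an upstream llama.cpp API rename in the server queue: the on_all_tasks_finished/run_on_all_tasks_finished pair is replaced by on_run_slots/update_slots. The sketch below is illustrative only (simplified stand-in types, not the upstream llama_server_queue or llama_server_context), and just shows the std::bind registration pattern this hunk updates.

#include <functional>
#include <iostream>

// Simplified stand-in for the server task queue: callbacks are stored as
// std::function and invoked by the queue's loop.
struct task_queue {
    std::function<void(int)> new_task_cb;
    std::function<void()>    run_slots_cb;

    void on_new_task(std::function<void(int)> cb) { new_task_cb = std::move(cb); }
    void on_run_slots(std::function<void()> cb)   { run_slots_cb = std::move(cb); }

    void run_one(int task_id) {
        if (new_task_cb)  new_task_cb(task_id);  // dispatch the queued task
        if (run_slots_cb) run_slots_cb();        // then let the server advance its slots
    }
};

// Simplified stand-in for llama_server_context.
struct server_context {
    task_queue queue_tasks;

    void process_single_task(int id) { std::cout << "task " << id << "\n"; }
    void update_slots()              { std::cout << "update slots\n"; }

    void start() {
        // Same registration pattern as the hunk: bind member functions to `this`.
        queue_tasks.on_new_task(std::bind(&server_context::process_single_task,
                                          this, std::placeholders::_1));
        queue_tasks.on_run_slots(std::bind(&server_context::update_slots, this));
    }
};

int main() {
    server_context ctx;
    ctx.start();
    ctx.queue_tasks.run_one(42);  // prints "task 42" then "update slots"
    return 0;
}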
6 changes: 3 additions & 3 deletions llm/patches/01-cache.diff
@@ -1,9 +1,9 @@
 diff --git a/examples/server/server.cpp b/examples/server/server.cpp
-index d86d7e04..2694e92e 100644
+index 2b2f4a0f..afac49af 100644
 --- a/examples/server/server.cpp
 +++ b/examples/server/server.cpp
-@@ -901,13 +901,15 @@ struct llama_server_context
-             slot.sent_count += result.text_to_send.size();
+@@ -997,13 +997,15 @@ struct llama_server_context
+             slot.n_sent_text += result.text_to_send.size();
              // add the token to slot queue and cache
          }
 -        slot.add_token_string(result);
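Note: the rebased 01-cache.diff hunk reflects an upstream rename of the slot field it touches, sent_count to n_sent_text; both count how many characters of the generated text have already been streamed to the client. A minimal, illustrative sketch of that bookkeeping (hypothetical slot_state type, not the llama.cpp server struct):

#include <cstddef>
#include <string>

// Hypothetical stand-in for a server slot; only the streaming bookkeeping is shown.
struct slot_state {
    std::string generated_text;   // full text generated so far for this slot
    size_t      n_sent_text = 0;  // characters already sent (upstream's rename of sent_count)

    // Return the not-yet-sent suffix and advance the counter, mirroring
    // `slot.n_sent_text += result.text_to_send.size();` in the patched hunk.
    std::string next_chunk() {
        std::string chunk = generated_text.substr(n_sent_text);
        n_sent_text += chunk.size();
        return chunk;
    }
};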
35 changes: 17 additions & 18 deletions llm/patches/02-cudaleaks.diff
@@ -1,8 +1,8 @@
 diff --git a/examples/server/server.cpp b/examples/server/server.cpp
-index 7800c6e7..be30db23 100644
+index 2b2f4a0f..25857bdd 100644
 --- a/examples/server/server.cpp
 +++ b/examples/server/server.cpp
-@@ -30,6 +30,10 @@
+@@ -31,6 +31,10 @@
  #include <atomic>
  #include <signal.h>
 
@@ -12,8 +12,8 @@ index 7800c6e7..be30db23 100644
 +
  using json = nlohmann::json;
 
- struct server_params
-@@ -353,6 +357,9 @@ struct llama_server_context
+ struct server_params {
+@@ -363,6 +367,9 @@ struct llama_server_context
      llama_free_model(model);
      model = nullptr;
  }
@@ -23,7 +23,7 @@ index 7800c6e7..be30db23 100644
  }
 
  bool load_model(const gpt_params &params_)
-@@ -3143,6 +3150,7 @@ int main(int argc, char **argv)
+@@ -3494,6 +3501,7 @@ int main(int argc, char **argv)
  sigemptyset (&sigint_action.sa_mask);
  sigint_action.sa_flags = 0;
  sigaction(SIGINT, &sigint_action, NULL);
@@ -32,18 +32,18 @@ index 933ebbc4..88a4f664 100644
  auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
      return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
 diff --git a/ggml-cuda.cu b/ggml-cuda.cu
-index 933ebbc4..88a4f664 100644
+index 0c6501e9..75c12723 100644
 --- a/ggml-cuda.cu
 +++ b/ggml-cuda.cu
-@@ -39,6 +39,7 @@
+@@ -43,6 +43,7 @@
  #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
  #define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
  #define cublasCreate hipblasCreate
 +#define cublasDestroy hipblasDestroy
  #define cublasGemmEx hipblasGemmEx
  #define cublasGemmBatchedEx hipblasGemmBatchedEx
  #define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
-@@ -7991,10 +7992,10 @@ GGML_CALL bool ggml_cublas_loaded(void) {
+@@ -8694,10 +8695,10 @@ GGML_CALL bool ggml_cublas_loaded(void) {
      return g_cublas_loaded;
  }
 
@@ -57,7 +57,7 @@ index 933ebbc4..88a4f664 100644
 
  #ifdef __HIP_PLATFORM_AMD__
      // Workaround for a rocBLAS bug when using multiple graphics cards:
-@@ -8004,7 +8005,7 @@ GGML_CALL void ggml_init_cublas() {
+@@ -8707,7 +8708,7 @@ GGML_CALL void ggml_init_cublas() {
  #endif
 
      if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
@@ -66,7 +66,7 @@ index 933ebbc4..88a4f664 100644
      g_cublas_loaded = false;
      fprintf(stderr, "%s: no " GGML_CUDA_NAME " devices found, " GGML_CUDA_NAME " will be disabled\n", __func__);
      return;
-@@ -8075,7 +8076,7 @@ GGML_CALL void ggml_init_cublas() {
+@@ -8778,7 +8779,7 @@ GGML_CALL void ggml_init_cublas() {
      // configure logging to stdout
      // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
 
@@ -75,12 +75,11 @@ index 933ebbc4..88a4f664 100644
      g_cublas_loaded = true;
  }
  }
-@@ -11604,3 +11605,23 @@ GGML_CALL int ggml_backend_cuda_reg_devices() {
+@@ -12345,3 +12346,22 @@ GGML_CALL int ggml_backend_cuda_reg_devices() {
  }
      return device_count;
  }
 +
-+
 +extern "C" GGML_CALL void ggml_free_cublas(void);
 +GGML_CALL void ggml_free_cublas(void) {
 +    for (int id = 0; id < g_device_count; ++id) {
@@ -100,16 +99,16 @@ index 933ebbc4..88a4f664 100644
 +    g_cublas_initialized = false;
 +}
 diff --git a/ggml-cuda.h b/ggml-cuda.h
-index b1ebd61d..b4c80c2c 100644
+index b1ebd61d..6dd58ddf 100644
 --- a/ggml-cuda.h
 +++ b/ggml-cuda.h
-@@ -20,6 +20,9 @@ extern "C" {
- // Always success. To check if CUDA is actually loaded, use `ggml_cublas_loaded`.
- GGML_API GGML_CALL void ggml_init_cublas(void);
+@@ -23,6 +23,9 @@ GGML_API GGML_CALL void ggml_init_cublas(void);
+ // Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
+ GGML_API GGML_CALL bool ggml_cublas_loaded(void);
 
 +// Release CUDA resources
 +GGML_API GGML_CALL void ggml_free_cublas(void);
 +
- // Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
- GGML_API GGML_CALL bool ggml_cublas_loaded(void);
+ GGML_API GGML_CALL void * ggml_cuda_host_malloc(size_t size);
+ GGML_API GGML_CALL void ggml_cuda_host_free(void * ptr);
 
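Note: the 02-cudaleaks.diff changes above only rebase ollama's CUDA-leak patch onto the new llama.cpp revision; the substance is unchanged. ggml gains a ggml_free_cublas() entry point (with a cublasDestroy -> hipblasDestroy mapping for ROCm) so the per-device cuBLAS handles created by ggml_init_cublas() can be released when a model is torn down. Below is a hedged, self-contained sketch of that create/destroy pattern; the globals and function names here are simplified stand-ins, not the actual ggml-cuda.cu code.

#include <cublas_v2.h>
#include <cuda_runtime.h>

#define MAX_DEVICES 16

// Simplified stand-ins for ggml's globals.
static bool           g_cublas_initialized   = false;
static int            g_device_count         = 0;
static cublasHandle_t g_handles[MAX_DEVICES] = {nullptr};

// Lazily create one cuBLAS handle per visible device, guarded by a global flag
// (the patch similarly replaces a function-local `static bool initialized` with
// a global so teardown can reset it).
void init_cublas_once(void) {
    if (g_cublas_initialized) return;
    if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) return;
    if (g_device_count > MAX_DEVICES) g_device_count = MAX_DEVICES;
    for (int id = 0; id < g_device_count; ++id) {
        cudaSetDevice(id);
        cublasCreate(&g_handles[id]);
    }
    g_cublas_initialized = true;
}

// Counterpart of the patch's ggml_free_cublas(): destroy every handle and clear
// the flag so a later init can run again without leaking GPU memory.
void free_cublas(void) {
    if (!g_cublas_initialized) return;
    for (int id = 0; id < g_device_count; ++id) {
        cudaSetDevice(id);
        cublasDestroy(g_handles[id]);
        g_handles[id] = nullptr;
    }
    g_cublas_initialized = false;
}

A host such as ollama's ext_server can then pair the init and free calls around a model's lifetime, so repeated model loads and unloads in a long-running process do not accumulate GPU allocations.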