update llama_cpp.jll (#12)
svilupp committed Mar 10, 2024
1 parent 70c84db commit f8c6a4d
Showing 3 changed files with 20 additions and 7 deletions.
11 changes: 9 additions & 2 deletions CHANGELOG.md
@@ -1,13 +1,20 @@
## [Unreleased]

### Added

### Fixed

## [0.3.0]

### Added
- Formatter spec (SciML) and a required CI check that all files are formatted. All future contributions are expected to follow this spec. For convenience, on Unix-based systems, you can run `make format` to format all files in the repository (requires JuliaFormatter to be installed).
- Convenience shortcut to start the llama.cpp server with `make server model=path/to/model` (works on Unix-based systems only).

### Fixed
### Updated
- Updated docstrings for `run_llama`, `run_chat`, and `run_server` to be more informative.
- Changed default `run_*` parameters to be more sensible for first-time users, including a unique `run_server` port number (`port=10897`) that does not clash with other services.
- Changed default `run_*` parameters to be more sensible for first-time users, including a unique `run_server` port number (`port=10897`) that does not clash with other services, and enabled `embeddings` by default.
- Updated the run context with the necessary files for fast inference on Metal GPUs (e.g., Apple M-series MacBooks).
- Updated `llama.cpp` to `0.0.16` (b2382) for better performance and stability.

## [0.2.0]

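Taken together, the new defaults above mean a freshly started server listens on port 10897 with embeddings already enabled. A minimal sketch of querying it, assuming HTTP.jl and JSON3.jl are available in the user's environment (they are not dependencies of this package) and that this llama.cpp build exposes an `/embedding` endpoint returning an `embedding` field:

```julia
using HTTP, JSON3

# The server started via `run_server(...)` or `make server model=...`
# now listens on port 10897 with embeddings enabled by default.
resp = HTTP.post("http://127.0.0.1:10897/embedding",
    ["Content-Type" => "application/json"],
    JSON3.write((; content = "Hello from Llama.jl")))

# llama.cpp returns a JSON object with an `embedding` array of floats.
embedding = JSON3.read(resp.body).embedding
@show length(embedding)
```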
4 changes: 2 additions & 2 deletions Project.toml
@@ -1,7 +1,7 @@
name = "Llama"
uuid = "882c185a-eef0-4636-aa0b-94c4dba13695"
authors = ["Marco Matthies <71844+marcom@users.noreply.github.com>"]
version = "0.2.0"
version = "0.3.0"

[deps]
CEnum = "fa961155-64e5-5f13-b03f-caf6b980ea82"
@@ -15,7 +15,7 @@ CEnum = "0.5"
Downloads = "1.5, 1.6"
ReplMaker = "0.2"
julia = "1.9"
llama_cpp_jll = "= 0.0.15"
llama_cpp_jll = "= 0.0.16"

[extras]
Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
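To pick up the tighter compat bound, users can update and inspect the resolved binary dependency; a small sketch using the standard package manager (package names as in this repository):

```julia
using Pkg

# Re-resolve the environment so Llama.jl pulls in llama_cpp_jll 0.0.16 (b2382).
Pkg.update("Llama")

# llama_cpp_jll is an indirect dependency, so check the manifest.
Pkg.status("llama_cpp_jll"; mode = Pkg.PKGMODE_MANIFEST)
```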
12 changes: 9 additions & 3 deletions src/run-programs.jl
@@ -102,6 +102,8 @@ Interrupt the server with `Ctrl+C`.
- `n_gpu_layers`: number of layers to offload to the GPU (a.k.a. `ngl` in llama.cpp). Requires more VRAM on your GPU but can speed up inference.
Set to 0 to run inference on the CPU only. Defaults to 99 (= practically all layers)
- `ctx_size`: context size, i.e., how large the prompt/inference can be. Defaults to 2048 (but most models allow 4,000 and more)
- `embeddings`: whether to allow generating embeddings. Defaults to `true`
- `args`: additional arguments to pass to the server
See the [full documentation](https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md) for more details.
@@ -128,10 +130,14 @@ function run_server(;
nthreads::Int = Threads.nthreads(),
n_gpu_layers::Int = 99,
ctx_size::Int = 2048,
embeddings::Bool = true,
args = ``)
cmd = `$(llama_cpp_jll.server()) --model $model --host $host --port $port --threads $nthreads --n-gpu-layers $n_gpu_layers --ctx-size $ctx_size $args`
embeddings_flag = embeddings ? `--embeddings` : `` # empty Cmd splices to no argument (an empty String would add a spurious "" argument)
cmd = `$(llama_cpp_jll.server()) --model $model --host $host --port $port --threads $nthreads --n-gpu-layers $n_gpu_layers --ctx-size $ctx_size $(embeddings_flag) $args`
# Provides the path to locate the ggml-metal.metal file (must be provided separately)
cmd = addenv(cmd,
"GGML_METAL_PATH_RESOURCES" => joinpath(llama_cpp_jll.artifact_dir, "bin"))
# ggml-metal then requires ggml-common.h, which is in a separate folder, so we need to add C_INCLUDE_PATH as well
cmd = addenv(
cmd, "GGML_METAL_PATH_RESOURCES" => joinpath(llama_cpp_jll.artifact_dir, "bin"),
"C_INCLUDE_PATH" => joinpath(llama_cpp_jll.artifact_dir, "include"))
run(cmd)
end
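A minimal usage sketch of the updated function, assuming `model` is accepted as a keyword argument (as the `--model $model` flag above suggests); the model path below is a placeholder, not part of this commit:

```julia
using Llama

# Start the llama.cpp server with the new defaults: embeddings enabled,
# all layers offloaded to the GPU (n_gpu_layers = 99), a 2048-token
# context, and the unique port 10897.
run_server(; model = "models/model.gguf")

# Opt out of the embeddings endpoint explicitly:
# run_server(; model = "models/model.gguf", embeddings = false)
```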
