diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4bcc94d..be0e676 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,13 +1,20 @@
 ## [Unreleased]
 
+### Added
+
+### Fixed
+
+## [0.3.0]
+
 ### Added
 - Formatter spec (SciML) and a required CI check whether all files are formatted. All future contributions are expected to follow this spec. For convenience, on Unix-based systems, you can run `make format` to format all files in the repository (requires having JuliaFormatter installed).
 - Convenience shortcut to start the llama.cpp server with `make server model=path/to/model` (works on Unix-based systems only).
 
-### Fixed
+### Updated
 - Updated docstrings for `run_llama`, `run_chat`, and `run_server` to be more informative.
-- Changed default `run_*` parameters to be more sensible for first-time users, including the `run_server` port number (`port=10897`) to be unique and not clash with other services.
+- Changed default `run_*` parameters to be more sensible for first-time users, including the `run_server` port number (`port=10897`) to be unique and not clash with other services, and for `embeddings` to be enabled by default.
 - Updated run context with the necessary files for fast inference on Metal GPUs (eg, Apple Macbooks M-series)
+- Updated `llama.cpp` to `0.0.16` (b2382) for better performance and stability.
 
 ## [0.2.0]
 
diff --git a/Project.toml b/Project.toml
index 0f6d5a0..a249ecc 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "Llama"
 uuid = "882c185a-eef0-4636-aa0b-94c4dba13695"
 authors = ["Marco Matthies <71844+marcom@users.noreply.github.com>"]
-version = "0.2.0"
+version = "0.3.0"
 
 [deps]
 CEnum = "fa961155-64e5-5f13-b03f-caf6b980ea82"
@@ -15,7 +15,7 @@ CEnum = "0.5"
 Downloads = "1.5, 1.6"
 ReplMaker = "0.2"
 julia = "1.9"
-llama_cpp_jll = "= 0.0.15"
+llama_cpp_jll = "= 0.0.16"
 
 [extras]
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
diff --git a/src/run-programs.jl b/src/run-programs.jl
index 220faf9..f017f3a 100644
--- a/src/run-programs.jl
+++ b/src/run-programs.jl
@@ -102,6 +102,8 @@ Interrupt the server with `Ctrl+C`.
 - `n_gpu_layers`: number of layers to offload on the GPU (a.k.a. `ngl` in llama.cpp). Requires more VRAM on your GPU but can speed up inference.
   Set to 0 to run inference on CPU-only. Defaults to 99 (=practically all layers)
 - `ctx_size`: context size, ie, how big can the prompt/inference be. Defaults to 2048 (but most models allow 4,000 and more)
+- `embeddings`: whether to allow generating embeddings. Defaults to `true`
+- `args`: additional arguments to pass to the server
 
 See the [full documentation](https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md) for more details.
 
@@ -128,10 +130,14 @@ function run_server(;
         nthreads::Int = Threads.nthreads(),
         n_gpu_layers::Int = 99,
         ctx_size::Int = 2048,
+        embeddings::Bool = true,
         args = ``)
-    cmd = `$(llama_cpp_jll.server()) --model $model --host $host --port $port --threads $nthreads --n-gpu-layers $n_gpu_layers --ctx-size $ctx_size $args`
+    embeddings_flag = embeddings ? `--embeddings` : ``
+    cmd = `$(llama_cpp_jll.server()) --model $model --host $host --port $port --threads $nthreads --n-gpu-layers $n_gpu_layers --ctx-size $ctx_size $(embeddings_flag) $args`
     # Provides the path to locate ggml-metal.metal file (must be provided separately)
-    cmd = addenv(cmd,
-        "GGML_METAL_PATH_RESOURCES" => joinpath(llama_cpp_jll.artifact_dir, "bin"))
+    # ggml-metal then requires ggml-common.h, which is in a separate folder, so we need to add C_INCLUDE_PATH as well
+    cmd = addenv(
+        cmd, "GGML_METAL_PATH_RESOURCES" => joinpath(llama_cpp_jll.artifact_dir, "bin"),
+        "C_INCLUDE_PATH" => joinpath(llama_cpp_jll.artifact_dir, "include"))
     run(cmd)
 end