Fixed inference on Metal GPU backend + updated docs for run_* programs
svilupp committed Jan 11, 2024
2 parents dffa93d + 8e72bfc commit a2c9848
Showing 3 changed files with 90 additions and 12 deletions.
16 changes: 16 additions & 0 deletions CHANGELOG.md
@@ -0,0 +1,16 @@
## [Unreleased]

### Added

### Fixed
- Updated docstrings for `run_llama`, `run_chat`, and `run_server` to be more informative.
- Changed default `run_*` parameters to be more sensible for first-time users, including a unique `run_server` port number (`port=10897`) that does not clash with other common services.
- Updated the run context with the files necessary for fast inference on Metal GPUs (eg, Apple M-series MacBooks).

## [0.2.0]

### Added
- Added `run_server` functionality that starts a simple HTTP server (interact with it either in your browser or via other LLM packages). It provides an OpenAI-compatible chat completion endpoint.

### Fixed
- Updated llama.cpp JLL bindings to "0.0.15" (llama.cpp b1796)
12 changes: 10 additions & 2 deletions README.md
@@ -30,8 +30,6 @@ LLaMA, are freely available. They can be downloaded here in GGML
format (choose one of the .bin files):
https://huggingface.co/SlyEcho/open_llama_3b_v2_ggml

Explore other models on the [HuggingFace Hub](https://huggingface.co).

Once you have a `url` link to a `.gguf` file, you can simply download it via:

```julia
@@ -44,6 +42,16 @@ model = download_model(url)
```
You can use the `model` variable directly in the `run_*` functions, like `run_server`.
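
For example, a quick single-turn generation could look like this (a minimal sketch; the prompt text is just an illustration and the ideal prompt format depends on the model you downloaded):

```julia
using Llama

# `model` is the file path returned by `download_model` above
result = Llama.run_llama(; model, prompt = "Hello! Please introduce yourself.")
print(result)
```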

## Simple HTTP Server

Given a `model` file, you can run a simple HTTP server that provides both an in-browser chat interface and an OpenAI-compatible chat completion endpoint.

```julia
using Llama
Llama.run_server(; model)
```
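
Once the server is running, you can also call the OpenAI-compatible endpoint programmatically. Below is a minimal sketch assuming the default host/port used above, HTTP.jl and JSON3.jl installed separately (they are not dependencies of this package), and the `/v1/chat/completions` route documented in the llama.cpp server README:

```julia
using HTTP, JSON3

resp = HTTP.post("http://127.0.0.1:10897/v1/chat/completions",
    ["Content-Type" => "application/json"],
    JSON3.write(Dict("messages" => [
        Dict("role" => "user", "content" => "Say hi!"),
    ])))
answer = JSON3.read(resp.body)
print(answer.choices[1].message.content)
```
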
Explore other models on the [HuggingFace Hub](https://huggingface.co).

## REPL mode

The REPL mode is currently non-functional, but stay tuned!
74 changes: 64 additions & 10 deletions src/run-programs.jl
@@ -1,17 +1,67 @@
# executables

function run_llama(; model::AbstractString, prompt::AbstractString="", nthreads::Int=1, args=``)
cmd = `$(llama_cpp_jll.main()) --model $model --prompt $prompt --threads $nthreads $args`
"""
run_llama(; model::AbstractString, prompt::AbstractString="", nthreads::Int=1, n_gpu_layers::Int=99, ctx_size::Int=2048, args=``)
Runs `prompt` through the `model` provided and returns the result. This is a single-turn version of `run_chat`.
See the [full documentation](https://github.com/ggerganov/llama.cpp/blob/master/examples/main/README.md) for more details.
# Arguments
- `model`: path to the model to be used
- `prompt`: prompt to be used. Most models expect prompts to be formatted in a specific way. Defaults to an empty string
- `nthreads`: number of threads to use. Defaults to the number of available threads
- `n_gpu_layers`: number of layers to offload on the GPU (a.k.a. `ngl` in llama.cpp). Requires more VRAM on your GPU but can speed up inference.
Set to 0 to run inference on the CPU only. Defaults to 99 (=practically all layers)
- `ctx_size`: context size, ie, how big the prompt/inference can be. Defaults to 2048 (but most models allow 4,000 and more)
Note: If you get odd responses AND you're using an instruction-tuned ("fine-tuned") model, it might be that the format of your prompt is not correct.
See HuggingFace's model documentation for the correct prompt format or use a library that will do this for you (eg, PromptingTools.jl)
See also: `run_chat`, `run_server`
"""
function run_llama(; model::AbstractString, prompt::AbstractString="", nthreads::Int=Threads.nthreads(), n_gpu_layers::Int=99, ctx_size::Int=2048, args=``)
cmd = `$(llama_cpp_jll.main()) --model $model --prompt $prompt --threads $nthreads --n-gpu-layers $n_gpu_layers --ctx-size $ctx_size $args`
# Provides the path to locate ggml-metal.metal file (must be provided separately)
cmd = addenv(cmd, "GGML_METAL_PATH_RESOURCES" => joinpath(llama_cpp_jll.artifact_dir, "bin"))
return read(cmd, String)
end
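
# Example (an illustrative sketch; "models/my-model.gguf" is a placeholder path, not a
# file shipped with this package):
#
#   txt = run_llama(; model = "models/my-model.gguf", prompt = "Say hi!")
#   print(txt)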

function run_chat(; model::AbstractString, prompt::AbstractString="", nthreads::Int=1, args=``)
cmd = `$(llama_cpp_jll.main()) --model $model --prompt $prompt --threads $nthreads $args -ins`
"""
run_chat(; model::AbstractString, prompt::AbstractString="", nthreads::Int=Threads.nthreads(), n_gpu_layers::Int=99, ctx_size::Int=2048, args=``)
Opens an interactive console for the `model` and runs in "instruction" mode (especially useful for Alpaca-based models).
`prompt`, as the first message, is often used to provide instruction about the upcoming interactions (eg, style, tone, roles).
Wait for the model to reply and then type your response. Press `Enter` to send the message to the model.
Interrupt the chat with `Ctrl+C`.
See the [full documentation](https://github.com/ggerganov/llama.cpp/blob/master/examples/main/README.md) for more details.
# Arguments
- `model`: path to the model to be used
- `prompt`: prompt to be used. Most models expect prompts to be formatted in a specific way. Defaults to an empty string
- `nthreads`: number of threads to use. Defaults to the number of available threads
- `n_gpu_layers`: number of layers to offload on the GPU (a.k.a. `ngl` in llama.cpp). Requires more VRAM on your GPU but can speed up inference.
Set to 0 to run inference on the CPU only. Defaults to 99 (=practically all layers)
- `ctx_size`: context size, ie, how big the prompt/inference can be. Defaults to 2048 (but most models allow 4,000 and more)
Note: If you get odd responses AND you're using an instruction-tuned ("fine-tuned") model, it might be that the format of your prompt is not correct.
See HuggingFace's model documentation for the correct prompt format or use a library that will do this for you (eg, PromptingTools.jl)
See also: `run_llama`, `run_server`
"""
function run_chat(; model::AbstractString, prompt::AbstractString="", nthreads::Int=Threads.nthreads(), n_gpu_layers::Int=99, ctx_size::Int=2048, args=``)
cmd = `$(llama_cpp_jll.main()) --model $model --prompt $prompt --threads $nthreads --n-gpu-layers $n_gpu_layers --ctx-size $ctx_size $args -ins`
# Provides the path to locate ggml-metal.metal file (must be provided separately)
cmd = addenv(cmd, "GGML_METAL_PATH_RESOURCES" => joinpath(llama_cpp_jll.artifact_dir, "bin"))
run(cmd)
end
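
# Example (an illustrative sketch; the model path is a placeholder):
#
#   run_chat(; model = "models/my-model.gguf",
#       prompt = "You are a helpful assistant. Answer concisely.")
#
# This opens an interactive chat session in the terminal; press Ctrl+C to exit.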

"""
run_server(; model::AbstractString, host::AbstractString="127.0.0.1", port::Int=8080, nthreads::Int=Threads.nthreads(), n_gpu_layers::Int=99, args=``)
run_server(; model::AbstractString, host::AbstractString="127.0.0.1", port::Int=10897, nthreads::Int=Threads.nthreads(),
n_gpu_layers::Int=99, ctx_size::Int=2048, args=``)
Starts a simple HTTP server with the `model` provided.
@@ -22,9 +72,11 @@ Interrupt the server with `Ctrl+C`.
# Arguments
- `model`: path to the model to be used
- `host`: host address to bind to. Defaults to "127.0.0.1"
- `port`: port to listen on. Defaults to 8080
- `port`: port to listen on. Defaults to 10897
- `nthreads`: number of threads to use. Defaults to the number of available threads
- `n_gpu_layers`: number of layers to offload on the GPU. Requires more VRAM on your GPU but can speed up inference. Defaults to 0 (=no layers)
- `n_gpu_layers`: number of layers to offload on the GPU (a.k.a. `ngl` in llama.cpp). Requires more VRAM on your GPU but can speed up inference.
Set to 0 to run inference on the CPU only. Defaults to 99 (=practically all layers)
- `ctx_size`: context size, ie, how big the prompt/inference can be. Defaults to 2048 (but most models allow 4,000 and more)
See the [full documentation](https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md) for more details.
@@ -37,14 +89,16 @@ using Llama
# See details [here](https://huggingface.co/TheBloke/dolphin-2_6-phi-2-GGUF)
using Downloads
model = joinpath("models", "dolphin-2_6-phi-2.Q6_K.gguf")
mkpath(dirname(model))
mkpath(dirname(model)) # ensure the folder exists
Downloads.download("https://huggingface.co/TheBloke/dolphin-2_6-phi-2-GGUF/resolve/main/dolphin-2_6-phi-2.Q6_K.gguf", model)
# go make a cup of tea while you wait... this is a 2.3GB download
# Start the server
run_server(; model)
"""
function run_server(; model::AbstractString, host::AbstractString="127.0.0.1", port::Int=8080, nthreads::Int=Threads.nthreads(), n_gpu_layers::Int=0, args=``)
cmd = `$(llama_cpp_jll.server()) --model $model --host $host --port $port --threads $nthreads --n-gpu-layers $n_gpu_layers $args`
function run_server(; model::AbstractString, host::AbstractString="127.0.0.1", port::Int=10897, nthreads::Int=Threads.nthreads(), n_gpu_layers::Int=99, ctx_size::Int=2048, args=``)
cmd = `$(llama_cpp_jll.server()) --model $model --host $host --port $port --threads $nthreads --n-gpu-layers $n_gpu_layers --ctx-size $ctx_size $args`
# Provides the path to locate ggml-metal.metal file (must be provided separately)
cmd = addenv(cmd, "GGML_METAL_PATH_RESOURCES" => joinpath(llama_cpp_jll.artifact_dir, "bin"))
run(cmd)
end
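
# Example (an illustrative sketch; the model path is a placeholder):
#
#   run_server(; model = "models/my-model.gguf", port = 10897)
#
# With these defaults, the in-browser chat UI should be reachable at
# http://127.0.0.1:10897, and the OpenAI-compatible chat completion endpoint is
# described in the llama.cpp server documentation linked above.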
