From f4833f2eee144989bab33531dcb4fbf37eecb1ba Mon Sep 17 00:00:00 2001
From: J S <49557684+svilupp@users.noreply.github.com>
Date: Thu, 11 Jan 2024 09:34:59 +0000
Subject: [PATCH 1/2] Clean up README and exported functions

---
 README.md    | 58 ++++++++++++++++++----------------------------------
 src/Llama.jl | 11 ++++++----
 src/repl.jl  |  9 ++++----
 3 files changed, 32 insertions(+), 46 deletions(-)

diff --git a/README.md b/README.md
index a200b46..4ca93e2 100644
--- a/README.md
+++ b/README.md
@@ -17,56 +17,30 @@ add https://github.com/marcom/Llama.jl
 
 The `llama_cpp_jll.jl` package used behind the scenes currently works
 on Linux, Mac, and FreeBSD on i686, x86_64, and aarch64 (note: only
-tested on x86_64-linux so far).
+tested on x86_64-linux and aarch64-macos so far).
 
 ## Downloading the model weights
 
-You will need a file with quantized model weights, see
-[llama.cpp](https://github.com/ggerganov/llama.cpp) for instructions.
+You will need a file with quantized model weights in the right format (GGUF).
 
-The weights for OpenLLaMA, an open-source reproduction of Meta AI's
-LLaMA, are freely available. They can be downloaded here in GGML
-format (choose one of the .bin files):
-https://huggingface.co/SlyEcho/open_llama_3b_v2_ggml
+You can either download the weights from the [HuggingFace Hub](https://huggingface.co) (search for "GGUF" to download the right format) or convert them from the original PyTorch weights (see [llama.cpp](https://github.com/ggerganov/llama.cpp) for instructions).
+Good weights to start with are the OpenChat weights, which are Apache 2.0 licensed and can be downloaded [here](https://huggingface.co/TheBloke/openchat-3.5-0106-GGUF). Click on the tab "Files" and download one of the `*.gguf` files. We recommend the Q4_K_M version (~4.4GB).
+In the future, there might be new releases of the OpenChat weights, so you might want to check for new versions.
 
-## REPL mode
-
-The REPL mode is currently non-functional, but stay tuned!
-
-## LibLlama
-
-```julia
-ctx = LlamaContext("./ggml-alpaca-7b-q4.bin")
-```
-
-### `generate`
-
-```julia
-generate(ctx, "Write me a hello world in python") # => currently prints text to screen
-```
-
-### `logits`
-
-```julia
-logits(ctx) # => Vector{Float32}, length ctx.n_vocab
-```
-
-### `tokenize`
-
-```julia
-tokenize(ctx, "Hello world") # => Vector{Int32} (token_ids), variable length
-```
+TODO: Add a note on how to download "https://huggingface.co/TheBloke/openchat-3.5-0106-GGUF/resolve/main/openchat-3.5-0106.Q4_K_M.gguf"
 
 ## Running example executables from llama.cpp
 
+### Llama Text Generation
+
 ```julia
 using Llama
 
-s = run_llama(model="./ggml-alpaca-7b-q4.bin", prompt="Hello", args=`-n 16`)
+s = run_llama(model="models/openchat-3.5-0106.Q4_K_M.gguf", prompt="Hello")
 
-# use more threads
-run_llama(model="./ggml-alpaca-7b-q4.bin", prompt="Hello", nthreads=4)
+# Provide additional arguments to llama.cpp (check the documentation for more details or the help text below)
+s = run_llama(model="models/openchat-3.5-0106.Q4_K_M.gguf", prompt="Hello", n_gpu_layers=0, args=`-n 16`)
 
 # print the help text with more options
 run_llama(model="", prompt="", args=`-h`)
@@ -75,5 +49,13 @@ run_llama(model="", prompt="", args=`-h`)
 ```
 
 ### Interactive chat mode
 
 ```julia
-run_chat(model="./ggml-alpaca-7b-q4.bin", prompt="Hello chat mode", nthreads=4)
+run_chat(model="models/openchat-3.5-0106.Q4_K_M.gguf", prompt="Hello chat mode")
 ```
+
+## REPL mode
+
+The REPL mode is currently non-functional, but stay tuned!
+
+## LibLlama
+
+The `libllama` bindings are currently non-functional, but stay tuned!
\ No newline at end of file
diff --git a/src/Llama.jl b/src/Llama.jl
index 2ae403b..76466c7 100644
--- a/src/Llama.jl
+++ b/src/Llama.jl
@@ -1,8 +1,11 @@
 module Llama
 
+# Use only these executables for now:
 export run_llama, run_chat
-export LlamaContext, embeddings, llama_eval, logits, tokenize,
-    token_to_str
+
+# Temporarily unexport as the low-level API is broken!
+# export LlamaContext, embeddings, llama_eval, logits, tokenize,
+#    token_to_str
 
 import llama_cpp_jll
 import ReplMaker
@@ -10,8 +13,8 @@ import ReplMaker
 include("../lib/LibLlama.jl")
 import .LibLlama
 
-__init__() = isdefined(Base, :active_repl) ? init_repl() : nothing
-
+# Temporarily disable as the low-level API is broken!
+# __init__() = isdefined(Base, :active_repl) ? init_repl() : nothing
 
 include("api.jl")
 include("run-programs.jl")
diff --git a/src/repl.jl b/src/repl.jl
index 5cf81dd..f8996ba 100644
--- a/src/repl.jl
+++ b/src/repl.jl
@@ -16,6 +16,7 @@ function set_repl_llama(ctx::LlamaContext)
 end
 
 function repl_llama(s)
+    @warn "REPL Llama is not yet implemented. Please use `run_*` functions instead. See `?Llama.run_server` for more information."
     # TODO
     return s
 end
@@ -26,9 +27,9 @@ function init_repl()
     end
     ReplMaker.initrepl(
         repl_llama,
-        prompt_text = "LLaMA> ",
-        prompt_color = :blue,
-        start_key = '}',
-        mode_name = "LLaMA_mode",
+        prompt_text="LLaMA> ",
+        prompt_color=:blue,
+        start_key='}',
+        mode_name="LLaMA_mode",
     )
 end

From ee3238a9c97fc20ce55b4f6304f78ac93cebfa0a Mon Sep 17 00:00:00 2001
From: J S <49557684+svilupp@users.noreply.github.com>
Date: Thu, 11 Jan 2024 10:07:39 +0000
Subject: [PATCH 2/2] Swap OpenChat for Dolphin

---
 README.md | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 4ca93e2..d5b0cb7 100644
--- a/README.md
+++ b/README.md
@@ -25,10 +25,10 @@ You will need a file with quantized model weights in the right format (GGUF).
 
 You can either download the weights from the [HuggingFace Hub](https://huggingface.co) (search for "GGUF" to download the right format) or convert them from the original PyTorch weights (see [llama.cpp](https://github.com/ggerganov/llama.cpp) for instructions).
 
-Good weights to start with are the OpenChat weights, which are Apache 2.0 licensed and can be downloaded [here](https://huggingface.co/TheBloke/openchat-3.5-0106-GGUF). Click on the tab "Files" and download one of the `*.gguf` files. We recommend the Q4_K_M version (~4.4GB).
-In the future, there might be new releases of the OpenChat weights, so you might want to check for new versions.
+Good weights to start with are the Dolphin-family fine-tuned weights, which are Apache 2.0 licensed and can be downloaded [here](https://huggingface.co/TheBloke/dolphin-2.6-mistral-7B-dpo-GGUF). Click on the tab "Files" and download one of the `*.gguf` files. We recommend the Q4_K_M version (~4.4GB).
+In the future, there might be new releases, so you might want to check for new versions.
 
-TODO: Add a note on how to download "https://huggingface.co/TheBloke/openchat-3.5-0106-GGUF/resolve/main/openchat-3.5-0106.Q4_K_M.gguf"
+TODO: Add a note on how to download "https://huggingface.co/TheBloke/dolphin-2.6-mistral-7B-dpo-GGUF/resolve/main/dolphin-2.6-mistral-7b-dpo.Q4_K_M.gguf"
 
 ## Running example executables from llama.cpp
 
@@ -37,19 +37,23 @@
 ```julia
 using Llama
 
-s = run_llama(model="models/openchat-3.5-0106.Q4_K_M.gguf", prompt="Hello")
+s = run_llama(model="models/dolphin-2.6-mistral-7b-dpo.Q4_K_M.gguf", prompt="Hello")
 
 # Provide additional arguments to llama.cpp (check the documentation for more details or the help text below)
-s = run_llama(model="models/openchat-3.5-0106.Q4_K_M.gguf", prompt="Hello", n_gpu_layers=0, args=`-n 16`)
+s = run_llama(model="models/dolphin-2.6-mistral-7b-dpo.Q4_K_M.gguf", prompt="Hello", n_gpu_layers=0, args=`-n 16`)
 
 # print the help text with more options
 run_llama(model="", prompt="", args=`-h`)
 ```
 
+> [!TIP]
+> If you're getting gibberish output, it's likely that the model requires a "prompt template" (i.e., a structure for how you provide your instructions). Review the model page on the HF Hub to see how to use your model, or use the server.
+
+
 ### Interactive chat mode
 
 ```julia
-run_chat(model="models/openchat-3.5-0106.Q4_K_M.gguf", prompt="Hello chat mode")
+run_chat(model="models/dolphin-2.6-mistral-7b-dpo.Q4_K_M.gguf", prompt="Hello chat mode")
 ```
 
 ## REPL mode
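
---

For reference, the GGUF file named in the TODO above can also be fetched directly from Julia; here is a minimal sketch using the `Downloads` standard library (the `models/` target directory is an assumption taken from the `run_llama` examples, not something the patches prescribe):

```julia
# Sketch only: fetch the GGUF weights referenced in the README into a local `models/` folder.
using Downloads

url  = "https://huggingface.co/TheBloke/dolphin-2.6-mistral-7B-dpo-GGUF/resolve/main/dolphin-2.6-mistral-7b-dpo.Q4_K_M.gguf"
dest = joinpath("models", basename(url))   # assumed target path, e.g. models/dolphin-2.6-mistral-7b-dpo.Q4_K_M.gguf

mkpath(dirname(dest))                        # create models/ if it does not exist
isfile(dest) || Downloads.download(url, dest)  # skip the ~4.4GB download if the file is already present
```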
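On the [!TIP] about prompt templates added in the second patch: Dolphin-family models are commonly trained on the ChatML template, though you should confirm the exact format on the model's HF Hub page. A hedged sketch of assembling such a prompt and passing it to `run_llama` (the template text and the `models/` path are assumptions, not part of the patches):

```julia
using Llama

# Illustrative only: ChatML-style prompt; verify the template on the model card before relying on it.
system = "You are a helpful assistant."
user_msg = "Write me a hello world in Julia."
prompt = """
<|im_start|>system
$system<|im_end|>
<|im_start|>user
$user_msg<|im_end|>
<|im_start|>assistant
"""

# run_llama accepts the model path and prompt as keyword arguments, as shown in the README examples above.
s = run_llama(model="models/dolphin-2.6-mistral-7b-dpo.Q4_K_M.gguf", prompt=prompt)
```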