Clean up README and exported functions #10

Merged · 3 commits · Jan 11, 2024
81 changes: 34 additions & 47 deletions README.md
@@ -8,6 +8,7 @@ Julia interface to
Meta's [LLaMA](https://arxiv.org/abs/2302.13971) (a large language
model).


## Installation

Press `]` at the Julia REPL to enter pkg mode, then:
@@ -18,84 +19,70 @@ add https://github.com/marcom/Llama.jl
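
If you prefer calling the Pkg API directly instead of pkg mode, the equivalent install is shown below (a minimal sketch using the same URL):

```julia
# Same installation via the Pkg API instead of pkg mode
using Pkg
Pkg.add(url="https://github.com/marcom/Llama.jl")
```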

The `llama_cpp_jll.jl` package used behind the scenes currently works
on Linux, Mac, and FreeBSD on i686, x86_64, and aarch64 (note: only
tested on x86_64-linux so far).
tested on x86_64-linux and aarch64-macos so far).

## Downloading the model weights

You will need a file with quantized model weights, see
[llama.cpp](https://github.com/ggerganov/llama.cpp) for instructions.
You will need a file with quantized model weights in the right format (GGUF).

You can either download the weights from the [HuggingFace Hub](https://huggingface.co) (search for "GGUF" to find weights in the right format) or convert them from the original PyTorch weights (see [llama.cpp](https://github.com/ggerganov/llama.cpp) for instructions).

Good weights to start with are the Dolphin-family fine-tuned weights, which are Apache 2.0 licensed and can be downloaded [here](https://huggingface.co/TheBloke/dolphin-2.6-mistral-7B-dpo-GGUF). Click on the tab "Files" and download one of the `*.gguf` files. We recommend the Q4_K_M version (~4.4GB).

The weights for OpenLLaMA, an open-source reproduction of Meta AI's
LLaMA, are freely available. They can be downloaded here in GGML
format (choose one of the .bin files):
https://huggingface.co/SlyEcho/open_llama_3b_v2_ggml
Newer releases may appear over time, so it is worth checking for updated versions.

Once you have a `url` link to a `.gguf` file, you can simply download it via:

```julia
using Llama

# Example for an Open-chat 7Bn parameter model (c. 4.4GB)
url = "https://huggingface.co/TheBloke/openchat-3.5-0106-GGUF/resolve/main/openchat-3.5-0106.Q4_K_M.gguf"
# Example for a 7Bn parameter model (c. 4.4GB)
url = "https://huggingface.co/TheBloke/dolphin-2.6-mistral-7B-dpo-GGUF/resolve/main/dolphin-2.6-mistral-7b-dpo.Q4_K_M.gguf"
model = download_model(url)
# Output: "models/openchat-3.5-0106.Q4_K_M.gguf"
# Output: "models/dolphin-2.6-mistral-7b-dpo.Q4_K_M.gguf"
```

You can use the `model` variable directly in the `run_*` functions, such as `run_server`.

## Simple HTTP Server
## Running example executables from llama.cpp

### Simple HTTP Server

Given a `model` file, you can run a simple HTTP server that provides both an in-browser chat interface and an OpenAI-compatible chat completion endpoint.
Server mode is the easiest way to get started with Llama.jl. It provides both an in-browser chat interface and an OpenAI-compatible chat completion endpoint (for packages like [PromptingTools.jl](https://github.com/svilupp/PromptingTools.jl)).

```julia
using Llama

# Use the `model` downloaded above
Llama.run_server(; model)
```
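
Once the server is running, the OpenAI-compatible endpoint can also be called programmatically. Below is a minimal sketch using HTTP.jl and JSON3.jl (neither is a dependency of Llama.jl, and the address is an assumption; use whatever host and port `run_server` reports on startup):

```julia
using HTTP, JSON3

# Assumed address; replace with the host/port printed by `run_server`
url = "http://127.0.0.1:8080/v1/chat/completions"

# Minimal OpenAI-style chat completion request
body = JSON3.write(Dict(
    "messages" => [Dict("role" => "user", "content" => "Say hello in one sentence.")],
))

resp = HTTP.post(url, ["Content-Type" => "application/json"], body)
reply = JSON3.read(resp.body)
println(reply.choices[1].message.content)
```
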
Explore other models on the [HuggingFace Hub](https://huggingface.co).

## REPL mode

The REPL mode is currently non-functional, but stay tuned!

## LibLlama
### Llama Text Generation

```julia
ctx = LlamaContext("./ggml-alpaca-7b-q4.bin")
```

### `generate`
using Llama

```julia
generate(ctx, "Write me a hello world in python") # => currently prints text to screen
```
s = run_llama(model="models/dolphin-2.6-mistral-7b-dpo.Q4_K_M.gguf", prompt="Hello")

### `logits`
# Provide additional arguments to llama.cpp (check the documentation for more details or the help text below)
s = run_llama(model="models/dolphin-2.6-mistral-7b-dpo.Q4_K_M.gguf", prompt="Hello", n_gpu_layers=0, args=`-n 16`)

```julia
logits(ctx) # => Vector{Float32}, length ctx.n_vocab
# print the help text with more options
run_llama(model="", prompt="", args=`-h`)
```

### `tokenize`
> [!TIP]
> If you're getting gibberish output, the model likely requires a "prompt template" (i.e., a specific structure for how you provide your instructions). Review the model's page on the HF Hub to see how to format prompts for it, or use the server instead.
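
For instance, Dolphin-family models are commonly described as using a ChatML-style template. A hypothetical sketch of wrapping an instruction before passing it to `run_llama` (verify the exact template on the model card):

```julia
using Llama

# ChatML-style wrapper (assumed template; confirm on the model's HF Hub page)
instruction = "Write a haiku about the Julia language."
prompt = """
<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
$instruction<|im_end|>
<|im_start|>assistant
"""

s = run_llama(model="models/dolphin-2.6-mistral-7b-dpo.Q4_K_M.gguf", prompt=prompt)
```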

```julia
tokenize(ctx, "Hello world") # => Vector{Int32} (token_ids), variable length
```

## Running example executables from llama.cpp
### Interactive chat mode

```julia
using Llama

s = run_llama(model="./ggml-alpaca-7b-q4.bin", prompt="Hello", args=`-n 16`)
run_chat(model="models/dolphin-2.6-mistral-7b-dpo.Q4_K_M.gguf", prompt="Hello chat mode")
```

# use more threads
run_llama(model="./ggml-alpaca-7b-q4.bin", prompt="Hello", nthreads=4)
## REPL mode

# print the help text with more options
run_llama(model="", prompt="", args=`-h`)
```
The REPL mode is currently non-functional, but stay tuned!

### Interactive chat mode
## LibLlama

```julia
run_chat(model="./ggml-alpaca-7b-q4.bin", prompt="Hello chat mode", nthreads=4)
```
The `libllama` bindings are currently non-functional, but stay tuned!
15 changes: 11 additions & 4 deletions src/Llama.jl
@@ -1,8 +1,14 @@
module Llama

export run_llama, run_chat, run_server, download_model
export LlamaContext, embeddings, llama_eval, logits, tokenize,
token_to_str
# Use only these executables for now:
export run_llama, run_chat, run_server

export download_model

# Temporarily unexport as the low-level API is broken!
# export LlamaContext, embeddings, llama_eval, logits, tokenize,
# token_to_str


import llama_cpp_jll
import ReplMaker
@@ -11,7 +17,8 @@ import Downloads
include("../lib/LibLlama.jl")
import .LibLlama

__init__() = isdefined(Base, :active_repl) ? init_repl() : nothing
# Temporarily disable as the low-level API is broken!
# __init__() = isdefined(Base, :active_repl) ? init_repl() : nothing

include("utils.jl")
include("api.jl")
9 changes: 5 additions & 4 deletions src/repl.jl
@@ -16,6 +16,7 @@ function set_repl_llama(ctx::LlamaContext)
end

function repl_llama(s)
@warn "REPL Llama is not yet implemented. Please use `run_*` functions instead. See `?Llama.run_server` for more information."
# TODO
return s
end
@@ -26,9 +27,9 @@ function init_repl()
end
ReplMaker.initrepl(
repl_llama,
prompt_text = "LLaMA> ",
prompt_color = :blue,
start_key = '}',
mode_name = "LLaMA_mode",
prompt_text="LLaMA> ",
prompt_color=:blue,
start_key='}',
mode_name="LLaMA_mode",
)
end