From f4833f2eee144989bab33531dcb4fbf37eecb1ba Mon Sep 17 00:00:00 2001
From: J S <49557684+svilupp@users.noreply.github.com>
Date: Thu, 11 Jan 2024 09:34:59 +0000
Subject: [PATCH 1/2] Clean up README and exported functions

---
 README.md    | 58 ++++++++++++++++++----------------------------------
 src/Llama.jl | 11 ++++++----
 src/repl.jl  |  9 ++++----
 3 files changed, 32 insertions(+), 46 deletions(-)

diff --git a/README.md b/README.md
index a200b46..4ca93e2 100644
--- a/README.md
+++ b/README.md
@@ -17,56 +17,30 @@ add https://github.com/marcom/Llama.jl
 
 The `llama_cpp_jll.jl` package used behind the scenes currently works
 on Linux, Mac, and FreeBSD on i686, x86_64, and aarch64 (note: only
-tested on x86_64-linux so far).
+tested on x86_64-linux and aarch64-macos so far).
 
 ## Downloading the model weights
 
-You will need a file with quantized model weights, see
-[llama.cpp](https://github.com/ggerganov/llama.cpp) for instructions.
+You will need a file with quantized model weights in the right format (GGUF).
 
-The weights for OpenLLaMA, an open-source reproduction of Meta AI's
-LLaMA, are freely available. They can be downloaded here in GGML
-format (choose one of the .bin files):
-https://huggingface.co/SlyEcho/open_llama_3b_v2_ggml
+You can either download the weights from the [HuggingFace Hub](https://huggingface.co) (search for "GGUF" to download the right format) or convert them from the original PyTorch weights (see [llama.cpp](https://github.com/ggerganov/llama.cpp) for instructions).
+Good weights to start with are the OpenChat weights, which are Apache 2.0 licensed and can be downloaded [here](https://huggingface.co/TheBloke/openchat-3.5-0106-GGUF). Click on the tab "Files" and download one of the `*.gguf` files. We recommend the Q4_K_M version (~4.4GB).
+In the future, there might be new releases of the OpenChat weights, so you might want to check for new versions.
 
-## REPL mode
-
-The REPL mode is currently non-functional, but stay tuned!
-
-## LibLlama
-
-```julia
-ctx = LlamaContext("./ggml-alpaca-7b-q4.bin")
-```
-
-### `generate`
-
-```julia
-generate(ctx, "Write me a hello world in python") # => currently prints text to screen
-```
-
-### `logits`
-
-```julia
-logits(ctx) # => Vector{Float32}, length ctx.n_vocab
-```
-
-### `tokenize`
-
-```julia
-tokenize(ctx, "Hello world") # => Vector{Int32} (token_ids), variable length
-```
+TODO: Add a note on how to download "https://huggingface.co/TheBloke/openchat-3.5-0106-GGUF/resolve/main/openchat-3.5-0106.Q4_K_M.gguf"
 
 ## Running example executables from llama.cpp
 
+### Llama Text Generation
+
 ```julia
 using Llama
 
-s = run_llama(model="./ggml-alpaca-7b-q4.bin", prompt="Hello", args=`-n 16`)
+s = run_llama(model="models/openchat-3.5-0106.Q4_K_M.gguf", prompt="Hello")
 
-# use more threads
-run_llama(model="./ggml-alpaca-7b-q4.bin", prompt="Hello", nthreads=4)
+# Provide additional arguments to llama.cpp (check the documentation for more details or the help text below)
+s = run_llama(model="models/openchat-3.5-0106.Q4_K_M.gguf", prompt="Hello", n_gpu_layers=0, args=`-n 16`)
 
 # print the help text with more options
 run_llama(model="", prompt="", args=`-h`)
@@ -75,5 +49,13 @@ run_llama(model="", prompt="", args=`-h`)
 ```
 
 ### Interactive chat mode
 
 ```julia
-run_chat(model="./ggml-alpaca-7b-q4.bin", prompt="Hello chat mode", nthreads=4)
+run_chat(model="models/openchat-3.5-0106.Q4_K_M.gguf", prompt="Hello chat mode")
 ```
+
+## REPL mode
+
+The REPL mode is currently non-functional, but stay tuned!
+
+## LibLlama
+
+The `libllama` bindings are currently non-functional, but stay tuned!
\ No newline at end of file
diff --git a/src/Llama.jl b/src/Llama.jl
index 2ae403b..76466c7 100644
--- a/src/Llama.jl
+++ b/src/Llama.jl
@@ -1,8 +1,11 @@
 module Llama
 
+# Use only these executables for now:
 export run_llama, run_chat
-export LlamaContext, embeddings, llama_eval, logits, tokenize,
-    token_to_str
+
+# Temporarily unexport as the low-level API is broken!
+# export LlamaContext, embeddings, llama_eval, logits, tokenize,
+#    token_to_str
 
 import llama_cpp_jll
 import ReplMaker
@@ -10,8 +13,8 @@ import ReplMaker
 include("../lib/LibLlama.jl")
 import .LibLlama
 
-__init__() = isdefined(Base, :active_repl) ? init_repl() : nothing
-
+# Temporarily disable as the low-level API is broken!
+# __init__() = isdefined(Base, :active_repl) ? init_repl() : nothing
 
 include("api.jl")
 include("run-programs.jl")
diff --git a/src/repl.jl b/src/repl.jl
index 5cf81dd..f8996ba 100644
--- a/src/repl.jl
+++ b/src/repl.jl
@@ -16,6 +16,7 @@ function set_repl_llama(ctx::LlamaContext)
 end
 
 function repl_llama(s)
+    @warn "REPL Llama is not yet implemented. Please use `run_*` functions instead. See `?Llama.run_server` for more information."
     # TODO
     return s
 end
@@ -26,9 +27,9 @@ function init_repl()
     end
     ReplMaker.initrepl(
         repl_llama,
-        prompt_text = "LLaMA> ",
-        prompt_color = :blue,
-        start_key = '}',
-        mode_name = "LLaMA_mode",
+        prompt_text="LLaMA> ",
+        prompt_color=:blue,
+        start_key='}',
+        mode_name="LLaMA_mode",
     )
 end

From ee3238a9c97fc20ce55b4f6304f78ac93cebfa0a Mon Sep 17 00:00:00 2001
From: J S <49557684+svilupp@users.noreply.github.com>
Date: Thu, 11 Jan 2024 10:07:39 +0000
Subject: [PATCH 2/2] Swap OpenChat for Dolphin

---
 README.md | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 4ca93e2..d5b0cb7 100644
--- a/README.md
+++ b/README.md
@@ -25,10 +25,10 @@ You will need a file with quantized model weights in the right format (GGUF).
 
 You can either download the weights from the [HuggingFace Hub](https://huggingface.co) (search for "GGUF" to download the right format) or convert them from the original PyTorch weights (see [llama.cpp](https://github.com/ggerganov/llama.cpp) for instructions).
 
-Good weights to start with are the OpenChat weights, which are Apache 2.0 licensed and can be downloaded [here](https://huggingface.co/TheBloke/openchat-3.5-0106-GGUF). Click on the tab "Files" and download one of the `*.gguf` files. We recommend the Q4_K_M version (~4.4GB).
-In the future, there might be new releases of the OpenChat weights, so you might want to check for new versions.
+Good weights to start with are the Dolphin-family fine-tuned weights, which are Apache 2.0 licensed and can be downloaded [here](https://huggingface.co/TheBloke/dolphin-2.6-mistral-7B-dpo-GGUF). Click on the tab "Files" and download one of the `*.gguf` files. We recommend the Q4_K_M version (~4.4GB).
+In the future, there might be new releases, so you might want to check for new versions.
 
-TODO: Add a note on how to download "https://huggingface.co/TheBloke/openchat-3.5-0106-GGUF/resolve/main/openchat-3.5-0106.Q4_K_M.gguf"
+TODO: Add a note on how to download "https://huggingface.co/TheBloke/dolphin-2.6-mistral-7B-dpo-GGUF/resolve/main/dolphin-2.6-mistral-7b-dpo.Q4_K_M.gguf"
 
 ## Running example executables from llama.cpp
 
@@ -37,19 +37,23 @@
 ```julia
 using Llama
 
-s = run_llama(model="models/openchat-3.5-0106.Q4_K_M.gguf", prompt="Hello")
+s = run_llama(model="models/dolphin-2.6-mistral-7b-dpo.Q4_K_M.gguf", prompt="Hello")
 
 # Provide additional arguments to llama.cpp (check the documentation for more details or the help text below)
-s = run_llama(model="models/openchat-3.5-0106.Q4_K_M.gguf", prompt="Hello", n_gpu_layers=0, args=`-n 16`)
+s = run_llama(model="models/dolphin-2.6-mistral-7b-dpo.Q4_K_M.gguf", prompt="Hello", n_gpu_layers=0, args=`-n 16`)
 
 # print the help text with more options
 run_llama(model="", prompt="", args=`-h`)
 ```
 
+> [!TIP]
+> If you're getting gibberish output, it's likely that the model requires a "prompt template" (i.e., a structure for how you provide your instructions). Review the model page on the HF Hub to see how to use your model, or use the server.
+
+
 ### Interactive chat mode
 
 ```julia
-run_chat(model="models/openchat-3.5-0106.Q4_K_M.gguf", prompt="Hello chat mode")
+run_chat(model="models/dolphin-2.6-mistral-7b-dpo.Q4_K_M.gguf", prompt="Hello chat mode")
 ```
 
 ## REPL mode
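
---

For reference, the GGUF file named in the TODO above can also be fetched directly from Julia; here is a minimal sketch using the `Downloads` standard library (the `models/` target directory is an assumption taken from the `run_llama` examples, not something the patches prescribe):

```julia
# Sketch only: fetch the GGUF weights referenced in the README into a local `models/` folder.
using Downloads

url  = "https://huggingface.co/TheBloke/dolphin-2.6-mistral-7B-dpo-GGUF/resolve/main/dolphin-2.6-mistral-7b-dpo.Q4_K_M.gguf"
dest = joinpath("models", basename(url))   # assumed target path, e.g. models/dolphin-2.6-mistral-7b-dpo.Q4_K_M.gguf

mkpath(dirname(dest))                        # create models/ if it does not exist
isfile(dest) || Downloads.download(url, dest)  # skip the ~4.4GB download if the file is already present
```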
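On the [!TIP] about prompt templates added in the second patch: Dolphin-family models are commonly trained on the ChatML template, though you should confirm the exact format on the model's HF Hub page. A hedged sketch of assembling such a prompt and passing it to `run_llama` (the template text and the `models/` path are assumptions, not part of the patches):

```julia
using Llama

# Illustrative only: ChatML-style prompt; verify the template on the model card before relying on it.
system = "You are a helpful assistant."
user_msg = "Write me a hello world in Julia."
prompt = """
<|im_start|>system
$system<|im_end|>
<|im_start|>user
$user_msg<|im_end|>
<|im_start|>assistant
"""

# run_llama accepts the model path and prompt as keyword arguments, as shown in the README examples above.
s = run_llama(model="models/dolphin-2.6-mistral-7b-dpo.Q4_K_M.gguf", prompt=prompt)
```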