llama.cpp: add 4-bit/8-bit kv cache options
oobabooga committed Jun 29, 2024
1 parent 220c179 commit 4ea2600
Showing 3 changed files with 18 additions and 0 deletions.
7 changes: 7 additions & 0 deletions modules/llamacpp_hf.py
@@ -221,6 +221,13 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
             'flash_attn': shared.args.flash_attn
         }
 
+        if shared.args.cache_4bit:
+            params["type_k"] = 2
+            params["type_v"] = 2
+        elif shared.args.cache_8bit:
+            params["type_k"] = 8
+            params["type_v"] = 8
+
         Llama = llama_cpp_lib().Llama
         model = Llama(**params)

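Note on the hard-coded values: they are ggml_type enum values from llama.cpp's ggml.h, where 2 is GGML_TYPE_Q4_0 and 8 is GGML_TYPE_Q8_0 (the default KV cache type is GGML_TYPE_F16, value 1). Since the check is an if/elif, cache_4bit wins when both flags are set. A minimal sketch of the same branch with the integers spelled out as named constants (the constant names are written out here for illustration; the diff itself uses the bare values):

# ggml_type enum values from llama.cpp's ggml.h:
GGML_TYPE_F16 = 1   # default KV cache type: 16-bit floats
GGML_TYPE_Q4_0 = 2  # 4-bit quantization
GGML_TYPE_Q8_0 = 8  # 8-bit quantization

# Equivalent to the branch added above (shared and params as in the diff):
if shared.args.cache_4bit:
    params["type_k"] = GGML_TYPE_Q4_0
    params["type_v"] = GGML_TYPE_Q4_0
elif shared.args.cache_8bit:
    params["type_k"] = GGML_TYPE_Q8_0
    params["type_v"] = GGML_TYPE_Q8_0
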
7 changes: 7 additions & 0 deletions modules/llamacpp_model.py
@@ -100,6 +100,13 @@ def from_pretrained(self, path):
             'flash_attn': shared.args.flash_attn
         }
 
+        if shared.args.cache_4bit:
+            params["type_k"] = 2
+            params["type_v"] = 2
+        elif shared.args.cache_8bit:
+            params["type_k"] = 8
+            params["type_v"] = 8
+
         result.model = Llama(**params)
         if cache_capacity > 0:
             result.model.set_cache(LlamaCache(capacity_bytes=cache_capacity))
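For context, llama-cpp-python forwards type_k and type_v to the underlying llama.cpp context parameters, so the same effect can be had with the library directly. A standalone usage sketch, assuming a llama-cpp-python build recent enough to expose these arguments (the model path is hypothetical):

from llama_cpp import Llama

llm = Llama(
    model_path="models/example-7b.Q4_K_M.gguf",  # hypothetical path
    n_ctx=4096,
    flash_attn=True,  # llama.cpp generally requires flash attention for a quantized V cache
    type_k=8,         # GGML_TYPE_Q8_0 keys
    type_v=8,         # GGML_TYPE_Q8_0 values
)
output = llm("The capital of France is", max_tokens=8)
print(output["choices"][0]["text"])
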
4 changes: 4 additions & 0 deletions modules/loaders.py
@@ -30,6 +30,8 @@
     'llama.cpp': [
         'n_ctx',
         'n_gpu_layers',
+        'cache_8bit',
+        'cache_4bit',
         'tensor_split',
         'n_batch',
         'threads',
@@ -51,6 +53,8 @@
     'llamacpp_HF': [
         'n_ctx',
         'n_gpu_layers',
+        'cache_8bit',
+        'cache_4bit',
         'tensor_split',
         'n_batch',
         'threads',
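The cache_8bit and cache_4bit entries make the two flags visible in the UI for both llama.cpp loaders. The flags themselves are defined elsewhere (modules/shared.py, untouched by this commit); a plausible sketch of their declaration, with illustrative help strings:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--cache_4bit', action='store_true',
                    help='Quantize the llama.cpp KV cache to 4 bits (Q4_0).')
parser.add_argument('--cache_8bit', action='store_true',
                    help='Quantize the llama.cpp KV cache to 8 bits (Q8_0).')

args = parser.parse_args(['--cache_8bit'])  # example: args.cache_8bit == True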

1 comment on commit 4ea2600

@oobabooga (Owner, Author) commented:

Credits to @GodEmperor785 for finding the right values to use #6168 (comment)
