llama.cpp: add 4-bit/8-bit kv cache options
oobabooga committed Jun 29, 2024
1 parent 220c179 commit 4ea2600
Showing 3 changed files with 18 additions and 0 deletions.
7 changes: 7 additions & 0 deletions modules/llamacpp_hf.py
@@ -221,6 +221,13 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
             'flash_attn': shared.args.flash_attn
         }
 
+        if shared.args.cache_4bit:
+            params["type_k"] = 2
+            params["type_v"] = 2
+        elif shared.args.cache_8bit:
+            params["type_k"] = 8
+            params["type_v"] = 8
+
         Llama = llama_cpp_lib().Llama
         model = Llama(**params)

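Note on the hard-coded values: they are ggml_type enum values from llama.cpp's ggml.h, where 2 is GGML_TYPE_Q4_0 and 8 is GGML_TYPE_Q8_0 (the default KV cache type is GGML_TYPE_F16, value 1). Since the check is an if/elif, cache_4bit wins when both flags are set. A minimal sketch of the same branch with the integers spelled out as named constants (the constant names are written out here for illustration; the diff itself uses the bare values):

# ggml_type enum values from llama.cpp's ggml.h:
GGML_TYPE_F16 = 1   # default KV cache type: 16-bit floats
GGML_TYPE_Q4_0 = 2  # 4-bit quantization
GGML_TYPE_Q8_0 = 8  # 8-bit quantization

# Equivalent to the branch added above (shared and params as in the diff):
if shared.args.cache_4bit:
    params["type_k"] = GGML_TYPE_Q4_0
    params["type_v"] = GGML_TYPE_Q4_0
elif shared.args.cache_8bit:
    params["type_k"] = GGML_TYPE_Q8_0
    params["type_v"] = GGML_TYPE_Q8_0
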
7 changes: 7 additions & 0 deletions modules/llamacpp_model.py
@@ -100,6 +100,13 @@ def from_pretrained(self, path):
             'flash_attn': shared.args.flash_attn
         }
 
+        if shared.args.cache_4bit:
+            params["type_k"] = 2
+            params["type_v"] = 2
+        elif shared.args.cache_8bit:
+            params["type_k"] = 8
+            params["type_v"] = 8
+
         result.model = Llama(**params)
         if cache_capacity > 0:
             result.model.set_cache(LlamaCache(capacity_bytes=cache_capacity))
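For context, llama-cpp-python forwards type_k and type_v to the underlying llama.cpp context parameters, so the same effect can be had with the library directly. A standalone usage sketch, assuming a llama-cpp-python build recent enough to expose these arguments (the model path is hypothetical):

from llama_cpp import Llama

llm = Llama(
    model_path="models/example-7b.Q4_K_M.gguf",  # hypothetical path
    n_ctx=4096,
    flash_attn=True,  # llama.cpp generally requires flash attention for a quantized V cache
    type_k=8,         # GGML_TYPE_Q8_0 keys
    type_v=8,         # GGML_TYPE_Q8_0 values
)
output = llm("The capital of France is", max_tokens=8)
print(output["choices"][0]["text"])
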
4 changes: 4 additions & 0 deletions modules/loaders.py
@@ -30,6 +30,8 @@
     'llama.cpp': [
         'n_ctx',
         'n_gpu_layers',
+        'cache_8bit',
+        'cache_4bit',
         'tensor_split',
         'n_batch',
         'threads',
@@ -51,6 +53,8 @@
     'llamacpp_HF': [
         'n_ctx',
         'n_gpu_layers',
+        'cache_8bit',
+        'cache_4bit',
         'tensor_split',
         'n_batch',
         'threads',
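The cache_8bit and cache_4bit entries make the two flags visible in the UI for both llama.cpp loaders. The flags themselves are defined elsewhere (modules/shared.py, untouched by this commit); a plausible sketch of their declaration, with illustrative help strings:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--cache_4bit', action='store_true',
                    help='Quantize the llama.cpp KV cache to 4 bits (Q4_0).')
parser.add_argument('--cache_8bit', action='store_true',
                    help='Quantize the llama.cpp KV cache to 8 bits (Q8_0).')

args = parser.parse_args(['--cache_8bit'])  # example: args.cache_8bit == True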

1 comment on commit 4ea2600

@oobabooga (Owner, Author) commented:

Credits to @GodEmperor785 for finding the right values to use #6168 (comment)
