allow LlamaHFTokenizer to enable workaround for broken llama.cpp pre-tokenizer
Andrew Lapp committed May 16, 2024
1 parent 784e65b commit 01b7390
Showing 3 changed files with 36 additions and 11 deletions.
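
The diffs below spell out the workaround: Outlines reads the vocabulary straight from a Hugging Face tokenizer when one is attached, and warns when none is. A minimal usage sketch, mirroring the updated regression test (the GGUF repo, filename glob, and tokenizer id are the test's values, not requirements):

import llama_cpp
from outlines import models, generate

# Attach the HF tokenizer so Outlines can read the exact vocabulary instead
# of reconstructing it through llama.cpp's broken pre-tokenizer.
model = models.llamacpp(
    "TheBloke/phi-2-GGUF",
    "*Q2*.gguf",
    tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(
        "microsoft/phi-2"
    ),
)
generator = generate.choice(model, ["skirt", "dress", "pen", "jacket"])
print(generator("Pick the odd word out: skirt, dress, pen, jacket"))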
16 changes: 12 additions & 4 deletions outlines/integrations/llamacpp.py
@@ -49,11 +49,19 @@ def __init__(self, model: "Llama"):
         self.special_tokens: Set[int] = set()
 
         self.vocabulary: Dict[str, int] = dict()
-        for t in range(model.n_vocab()):
-            token_piece = model.tokenizer().decode([t])
-            self.vocabulary[token_piece] = t
 
-        self.decode = model.tokenizer().decode
+        tokenizer = model.tokenizer()
+
+        self.decode = tokenizer.decode
+
+        # ### TODO: Remove when https://github.com/ggerganov/llama.cpp/pull/5613 is resolved
+        try:
+            self.vocabulary = model.tokenizer_.hf_tokenizer.get_vocab()
+        except AttributeError:
+            # ###
+            for t in range(model.n_vocab()):
+                token_piece = model.tokenizer().decode([t])
+                self.vocabulary[token_piece] = t
 
     def convert_token_to_string(self, token: str) -> str:
         return token
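The probe in the hunk above relies on llama-cpp-python storing a user-supplied tokenizer on `tokenizer_`: with a LlamaHFTokenizer attached, `hf_tokenizer.get_vocab()` returns the exact token-to-id map; without one, the AttributeError falls back to decoding each id, the path hit by the upstream bug. A standalone sketch of the same logic, assuming `model` is a llama_cpp.Llama instance:

# Prefer the exact HF vocabulary; fall back to per-token decoding.
try:
    vocabulary = model.tokenizer_.hf_tokenizer.get_vocab()
except AttributeError:
    # Round-trips every id through decode; affected by the bug tracked in
    # llama.cpp PR 5613.
    vocabulary = {
        model.tokenizer().decode([t]): t for t in range(model.n_vocab())
    }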
11 changes: 11 additions & 0 deletions outlines/models/llamacpp.py
@@ -1,4 +1,5 @@
 import dataclasses
+import warnings
 from typing import TYPE_CHECKING, Iterator, List, Optional, TypedDict, Union
 
 from typing_extensions import Unpack
@@ -288,6 +289,16 @@ def llamacpp(
     if "verbose" not in llamacpp_model_params:
         llamacpp_model_params["verbose"] = False
 
+    # TODO: Remove when https://github.com/ggerganov/llama.cpp/pull/5613 is resolved
+    if "tokenizer" not in llamacpp_model_params:
+        warnings.warn(
+            "llama.cpp pre-tokenizer is broken. "
+            + "You may receive an Outlines error during Regex index construction.\n"
+            + "To avoid this error when using `models.llamacpp` you may pass "
+            + "`tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(<hf_repo_id>)` "
+            + "to `models.llamacpp()`"
+        )
+
     model = Llama.from_pretrained(repo_id, filename, **llamacpp_model_params)
 
     return LlamaCpp(model)
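
Callers who deliberately keep llama.cpp's built-in tokenizer can silence this warning with the standard warnings machinery; a sketch (the message filter is an assumption matching the text added above, and the GGUF repo is illustrative):

import warnings

from outlines import models

with warnings.catch_warnings():
    # Match the start of the warning emitted by models.llamacpp above.
    warnings.filterwarnings("ignore", message="llama.cpp pre-tokenizer is broken")
    model = models.llamacpp("TheBloke/phi-2-GGUF", "*Q2*.gguf")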
20 changes: 13 additions & 7 deletions tests/generate/test_integration_llamacpp.py
@@ -250,16 +250,22 @@ def test_llamacpp_cfg(model):
 
 
 @pytest.mark.parametrize(
-    "repo,model_path",
+    "repo,model_path,hf_tokenizer_uri",
     [
-        ("Qwen/Qwen1.5-0.5B-Chat-GGUF", "*q2*.gguf"),
-        ("QuantFactory/Meta-Llama-3-8B-Instruct-GGUF", "*Q2*.gguf"),
-        ("TheBloke/phi-2-GGUF", "*Q2*.gguf"),
-        ("QuantFactory/Phi-3-mini-4k-instruct-GGUF-v2", "*Q2*.gguf"),
+        ("Qwen/Qwen1.5-0.5B-Chat-GGUF", "*q2*.gguf", "Qwen/Qwen1.5-0.5B-Chat"),
+        ("TheBloke/phi-2-GGUF", "*Q2*.gguf", "microsoft/phi-2"),
     ],
 )
-def test_byte_tokenizer_regression(repo, model_path):
+def test_byte_tokenizer_regression(repo, model_path, hf_tokenizer_uri):
     """Reproduce https://github.com/outlines-dev/outlines/issues/820"""
-    model = models.llamacpp(repo, model_path)
+    import llama_cpp
+
+    model = models.llamacpp(
+        repo,
+        model_path,
+        tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(
+            hf_tokenizer_uri
+        ),
+    )
     generator = generate.choice(model, ["skirt", "dress", "pen", "jacket"])
     generator("Pick the odd word out: skirt, dress, pen, jacket")
