Commit: Update tokenizer.py
astonzhang committed Apr 15, 2024
1 parent e2aab36 commit 71367a6
Showing 1 changed file with 7 additions and 8 deletions.
15 changes: 7 additions & 8 deletions llama/tokenizer.py
@@ -37,7 +37,7 @@ class Message(TypedDict):

class Tokenizer:
"""
- tokenizing and encoding/decoding text using the Tiktoken tokenizer.
+ Tokenizing and encoding/decoding text using the Tiktoken tokenizer.
"""

special_tokens: Dict[str, int]
@@ -53,7 +53,6 @@ def __init__(self, model_path: str):
Args:
model_path (str): The path to the Tiktoken model file.
"""
- # reload tokenizer
assert os.path.isfile(model_path), model_path

mergeable_ranks = load_tiktoken_bpe(model_path)
@@ -84,8 +83,8 @@ def __init__(self, model_path: str):
)
logger.info(f"Reloaded tiktoken model from {model_path}")

- # BOS / EOS token IDs
self.n_words: int = self.model.n_vocab
+ # BOS / EOS token IDs
self.bos_id: int = self.special_tokens["<|begin_of_text|>"]
self.eos_id: int = self.special_tokens["<|end_of_text|>"]
self.pad_id: int = -1
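
For context, here is a minimal sketch of how a constructor like this one wires tiktoken together, based only on the calls visible in the diff (load_tiktoken_bpe, the logger line, and the two special tokens referenced above). The pat_str is a simplified stand-in, not the actual Llama 3 regex, and the special-token table is abbreviated:

    import os

    import tiktoken
    from tiktoken.load import load_tiktoken_bpe

    def build_encoding(model_path: str) -> tiktoken.Encoding:
        assert os.path.isfile(model_path), model_path
        # Base BPE merges come from the model file on disk.
        mergeable_ranks = load_tiktoken_bpe(model_path)
        num_base_tokens = len(mergeable_ranks)
        # Abbreviated: the real tokenizer registers a larger special-token set.
        special_tokens = {
            "<|begin_of_text|>": num_base_tokens,
            "<|end_of_text|>": num_base_tokens + 1,
        }
        return tiktoken.Encoding(
            name=os.path.basename(model_path),
            pat_str=r"\S+|\s+",  # stand-in pattern; the repo defines the real one
            mergeable_ranks=mergeable_ranks,
            special_tokens=special_tokens,
        )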
@@ -130,7 +129,7 @@ def encode(
assert type(s) is str

# The tiktoken tokenizer can handle <=400k chars without
- # pyo3_runtime.PanicException (may go beyond 400k)
+ # pyo3_runtime.PanicException.
TIKTOKEN_MAX_ENCODE_CHARS = 400_000

# https://github.com/openai/tiktoken/issues/195
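
The surrounding encode body (elided by the diff) chunks the input before calling tiktoken, which is what the constant and the issue link above are for. A hedged sketch of that two-level chunking follows; the 25_000 run cap is an assumption about the repo's value, and split stands in for _split_whitespaces_or_nonwhitespaces:

    from typing import Callable, Iterator, List

    TIKTOKEN_MAX_ENCODE_CHARS = 400_000
    MAX_NO_WHITESPACES_CHARS = 25_000  # assumed cap on consecutive (non-)whitespace runs

    def chunked_encode(
        model: "tiktoken.Encoding",
        s: str,
        split: Callable[[str, int], Iterator[str]],
    ) -> List[int]:
        # First cut the input into fixed 400k-char windows, then break each
        # window on over-long same-class character runs, so no single tiktoken
        # call can trigger the pyo3 panic tracked in tiktoken issue #195.
        tokens: List[int] = []
        for i in range(0, len(s), TIKTOKEN_MAX_ENCODE_CHARS):
            window = s[i : i + TIKTOKEN_MAX_ENCODE_CHARS]
            for substr in split(window, MAX_NO_WHITESPACES_CHARS):
                # The real method also threads allowed/disallowed
                # special-token sets through to model.encode.
                tokens.extend(model.encode(substr))
        return tokens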
@@ -170,16 +169,16 @@ def decode(self, t: Sequence[int]) -> str:
Returns:
str: The decoded string.
"""
- # typecast is safe here, Tiktoken doesn't do anything list-related with the sequence.
+ # Typecast is safe here. Tiktoken doesn't do anything list-related with the sequence.
return self.model.decode(cast(List[int], t))

@staticmethod
def _split_whitespaces_or_nonwhitespaces(
s: str, max_consecutive_slice_len: int
) -> Iterator[str]:
"""
- Split the string `s` so that each substring contains no more than `max_consecutive_slice_len`
- consecutive whitespaces or consecutive non-whitespaces
+ Splits the string `s` so that each substring contains no more than `max_consecutive_slice_len`
+ consecutive whitespaces or consecutive non-whitespaces.
"""
current_slice_len = 0
current_slice_is_space = s[0].isspace() if len(s) > 0 else False
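
The rest of the generator is cut off by the diff view. Here is a sketch of one way to finish it, consistent with the docstring and the two lines visible above (a reconstruction, not necessarily the repo's exact code):

    from typing import Iterator

    def _split_whitespaces_or_nonwhitespaces(
        s: str, max_consecutive_slice_len: int
    ) -> Iterator[str]:
        current_slice_len = 0
        current_slice_is_space = s[0].isspace() if len(s) > 0 else False
        slice_start = 0
        for i in range(len(s)):
            is_now_space = s[i].isspace()
            if current_slice_is_space ^ is_now_space:
                # The character class flipped, so the run counter resets.
                current_slice_len = 1
                current_slice_is_space = is_now_space
            else:
                current_slice_len += 1
                if current_slice_len > max_consecutive_slice_len:
                    # The run exceeded the cap: emit everything before this char.
                    yield s[slice_start:i]
                    slice_start = i
                    current_slice_len = 1
        yield s[slice_start:]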
@@ -225,6 +224,6 @@ def encode_dialog_prompt(self, dialog: Dialog) -> List[int]:
tokens.append(self.tokenizer.special_tokens["<|begin_of_text|>"])
for message in dialog:
tokens.extend(self.encode_message(message))
- # Add the start of an assistant message for the model to complete
+ # Add the start of an assistant message for the model to complete.
tokens.extend(self.encode_header({"role": "assistant", "content": ""}))
return tokens
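
A hedged usage sketch of the dialog encoding above. The self.tokenizer access in this hunk suggests encode_dialog_prompt lives on a chat-format wrapper holding the Tokenizer; the wrapper's name, the model path, and the header/end-of-turn token names below are assumptions for illustration:

    from llama.tokenizer import ChatFormat, Tokenizer  # ChatFormat name assumed

    tokenizer = Tokenizer(model_path="tokenizer.model")  # hypothetical path
    chat = ChatFormat(tokenizer)  # the class owning encode_dialog_prompt

    dialog = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Hello!"},
    ]
    tokens = chat.encode_dialog_prompt(dialog)

    # Conceptual layout of the resulting token stream (token names assumed):
    #   <|begin_of_text|>
    #   <|start_header_id|>system<|end_header_id|>\n\n ...content... <|eot_id|>
    #   <|start_header_id|>user<|end_header_id|>\n\n ...content... <|eot_id|>
    #   <|start_header_id|>assistant<|end_header_id|>\n\n   <- left open for the model
    print(tokenizer.decode(tokens))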
