Add EOS token to rt_info #15

Merged: 4 commits, Feb 9, 2024
202 changes: 101 additions & 101 deletions README.md

Large diffs are not rendered by default.

10 changes: 7 additions & 3 deletions pyproject.toml
@@ -26,6 +26,7 @@ transformers = [
"transformers[sentencepiece] >= 4.36.0",
"tiktoken"
]
# chatglm2 custom tokenizer file imports torch; we have to add a torch dependency for tests
torch = [
'torch @ https://download.pytorch.org/whl/cpu-cxx11-abi/torch-2.0.1%2Bcpu.cxx11.abi-cp38-cp38-linux_x86_64.whl ; sys_platform=="linux" and python_version == "3.8"',
'torch @ https://download.pytorch.org/whl/cpu-cxx11-abi/torch-2.0.1%2Bcpu.cxx11.abi-cp39-cp39-linux_x86_64.whl ; sys_platform=="linux" and python_version == "3.9"',
@@ -38,6 +39,7 @@ dev = [
"bandit",
"pytest",
"pytest_harvest",
"pandas",
"openvino_tokenizers[transformers, torch]"
]
fuzzing = [
@@ -53,15 +55,17 @@ all = [
convert_tokenizer = "openvino_tokenizers.cli:convert_hf_tokenizer"

[tool.ruff]
line-length = 119

[tool.ruff.lint]
ignore = ["C901", "E501", "E741", "W605"]
select = ["C", "E", "F", "I", "W"]
line-length = 119

[tool.ruff.per-file-ignores]
[tool.ruff.lint.per-file-ignores]
"__init__.py" = ["F401"]
"openvino_tokenizers/hf_parser.py" = ["F821"]

[tool.ruff.isort]
[tool.ruff.lint.isort]
lines-after-imports = 2

[tool.bandit]
1 change: 1 addition & 0 deletions python/openvino_tokenizers/__init__.py
@@ -17,6 +17,7 @@
from .str_pack import pack_strings, unpack_strings
from .utils import add_greedy_decoding, connect_models


_ext_name = "openvino_tokenizers"
if sys.platform == "win32":
_ext_name = f"{_ext_name}.dll"
2 changes: 2 additions & 0 deletions python/openvino_tokenizers/constants.py
@@ -10,6 +10,8 @@
TOKEN_IDS_OUTPUT_NAME = "token_ids"
STRING_OUTPUT_NAME = "string_output"

EOS_TOKEN_ID_NAME = "eos_token_id"

GREEDY_DECODER_NAME = "greedy_decoder"

TOKENIZER_NAME = "tokenizer"
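`EOS_TOKEN_ID_NAME` is the rt_info key that the converters below write. A minimal sketch of the underlying `Model` rt_info round trip, on a toy model that is not part of this PR (API as in recent openvino releases):

```python
from openvino.runtime import Model, opset13 as ops

from openvino_tokenizers.constants import EOS_TOKEN_ID_NAME

# toy one-parameter model, just to demonstrate the rt_info API
param = ops.parameter([1], name="ids")
toy = Model([param.output(0)], [param], "toy")

toy.set_rt_info(2, EOS_TOKEN_ID_NAME)      # what the converters do on success
assert toy.has_rt_info(EOS_TOKEN_ID_NAME)  # consumers should check before reading
assert int(toy.get_rt_info(EOS_TOKEN_ID_NAME).value) == 2
```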
32 changes: 24 additions & 8 deletions python/openvino_tokenizers/hf_parser.py
@@ -16,12 +16,14 @@
from openvino.runtime import Node, op
from openvino.runtime.exceptions import OVTypeError
from openvino.runtime.utils.types import as_node, make_constant_node
from transformers import PreTrainedTokenizerBase
from transformers.convert_slow_tokenizer import import_protobuf

from . import _get_factory
from .constants import (
ATTENTION_MASK_INPUT_NAME,
DETOKENIZER_NAME,
EOS_TOKEN_ID_NAME,
STRING_OUTPUT_NAME,
TOKEN_IDS_INPUT_NAME,
TOKEN_TYPE_IDS_INPUT_NAME,
@@ -93,7 +95,7 @@ def parse_split_step(pretokenizer_dict: Dict[str, Any]) -> RegexSplitStep:


def parse_byte_level_pretokenization_step(
pretokenizer_dict: Dict[str, Any]
pretokenizer_dict: Dict[str, Any],
) -> List[Union[NormalizationStep, PreTokenizatinStep]]:
steps = []
if pretokenizer_dict.get("add_prefix_space"):
@@ -145,6 +147,7 @@ def parse(
),
]:
add_steps()
self.pipeline.eos_token_id = getattr(self.original_tokenizer, "eos_token_id", None)

return self.pipeline

@@ -298,7 +301,7 @@ def decoding(
return


def parse_special_tokens(hf_tokenizer: "PreTrainedTokenizerBase") -> Dict[int, str]:
def parse_special_tokens(hf_tokenizer: PreTrainedTokenizerBase) -> Dict[int, str]:
# the order matters
if getattr(hf_tokenizer, "added_tokens_decoder", False):
return {
@@ -315,7 +318,7 @@ def parse_special_tokens(hf_tokenizer: "PreTrainedTokenizerBase") -> Dict[int, str]:


def convert_fast_tokenizer(
hf_tokenizer: "PreTrainedTokenizerBase",
hf_tokenizer: PreTrainedTokenizerBase,
number_of_inputs: int = 1,
with_detokenizer: bool = False,
skip_special_tokens: bool = False,
@@ -348,13 +351,16 @@ def convert_fast_tokenizer(
filtered_outputs.append(ov_tokenizer.output(i))

tokenizer_model = Model(filtered_outputs, ov_tokenizer.get_parameters(), TOKENIZER_NAME)
for path, info in ov_tokenizer.get_rt_info().items():
tokenizer_model.set_rt_info(info.value, path)

if with_detokenizer:
return tokenizer_model, pipeline.get_detokenizer_ov_subgraph()

return tokenizer_model


def is_sentencepiece_model(hf_tokenizer: "PreTrainedTokenizerBase") -> bool:
def is_sentencepiece_model(hf_tokenizer: PreTrainedTokenizerBase) -> bool:
return getattr(hf_tokenizer, "vocab_files_names", {}).get("vocab_file", "").endswith(".model")


@@ -397,7 +403,7 @@ def modify_sentencepiece_model(


def convert_sentencepiece_model_tokenizer(
hf_tokenizer: "PreTrainedTokenizerBase",
hf_tokenizer: PreTrainedTokenizerBase,
add_attention_mask: bool = True,
with_detokenizer: bool = False,
streaming_detokenizer: bool = False,
@@ -491,18 +497,26 @@ def convert_sentencepiece_model_tokenizer(
tokenizer = Model(outputs, [input_node], TOKENIZER_NAME)
tokenizer.validate_nodes_and_infer_types()

if hf_tokenizer.eos_token_id is not None:
tokenizer.set_rt_info(hf_tokenizer.eos_token_id, EOS_TOKEN_ID_NAME)

if not with_detokenizer:
return tokenizer

if clean_up_tokenization_spaces is None:
clean_up_tokenization_spaces = hf_tokenizer.clean_up_tokenization_spaces

return tokenizer, get_sp_detokenizer(
detokenizer = get_sp_detokenizer(
sp_model_node,
streaming_detokenizer=streaming_detokenizer,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
)

if hf_tokenizer.eos_token_id is not None:
detokenizer.set_rt_info(hf_tokenizer.eos_token_id, EOS_TOKEN_ID_NAME)

return tokenizer, detokenizer


def get_sp_detokenizer(
sp_model_node: Node, streaming_detokenizer: bool = False, clean_up_tokenization_spaces: bool = False
@@ -531,7 +545,7 @@ def get_sp_detokenizer(
return tokenizer_detokenizer


def is_tiktoken_model(hf_tokenizer: "PreTrainedTokenizerBase") -> bool:
def is_tiktoken_model(hf_tokenizer: PreTrainedTokenizerBase) -> bool:
try:
from tiktoken import Encoding
except ImportError:
@@ -543,7 +557,7 @@ def is_tiktoken_model(hf_tokenizer: "PreTrainedTokenizerBase") -> bool:


def convert_tiktoken_model_tokenizer(
hf_tokenizer: "PreTrainedTokenizerBase",
hf_tokenizer: PreTrainedTokenizerBase,
with_detokenizer: bool = False,
skip_special_tokens: bool = False,
clean_up_tokenization_spaces: Optional[bool] = None,
@@ -577,4 +591,6 @@ def convert_tiktoken_model_tokenizer(
if not with_detokenizer:
return pipeline.get_tokenizer_ov_subgraph()

pipeline.eos_token_id = hf_tokenizer.eos_token_id

return pipeline.get_tokenizer_ov_subgraph(), pipeline.get_detokenizer_ov_subgraph()
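With all three converter paths updated, the EOS id surfaces the same way regardless of tokenizer type. A hedged end-to-end sketch (the model id is only an example; any HF tokenizer that defines `eos_token_id` works):

```python
from transformers import AutoTokenizer

from openvino_tokenizers import convert_tokenizer
from openvino_tokenizers.constants import EOS_TOKEN_ID_NAME

hf_tokenizer = AutoTokenizer.from_pretrained("codellama/CodeLlama-7b-hf")  # example model id
ov_tokenizer, ov_detokenizer = convert_tokenizer(hf_tokenizer, with_detokenizer=True)

# both subgraphs carry the same EOS id when the HF tokenizer defines one
for model in (ov_tokenizer, ov_detokenizer):
    if model.has_rt_info(EOS_TOKEN_ID_NAME):
        print(model.get_rt_info(EOS_TOKEN_ID_NAME).value)
```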
19 changes: 15 additions & 4 deletions python/openvino_tokenizers/tokenizer_pipeline.py
@@ -18,6 +18,7 @@
from .constants import (
ATTENTION_MASK_INPUT_NAME,
DETOKENIZER_NAME,
EOS_TOKEN_ID_NAME,
STRING_OUTPUT_NAME,
TOKEN_IDS_INPUT_NAME,
TOKEN_TYPE_IDS_INPUT_NAME,
@@ -26,8 +27,9 @@
from .str_pack import pack_string, pack_strings


@dataclass
class BasePipelineStep:
_pipeline = field(default=None, init=False, repr=False)
_pipeline: Optional[weakref.ReferenceType["TokenizerPipeline"]] = field(default=None, init=False, repr=False)

def __str__(self) -> str:
params_string = ", ".join(f"{key}={val!r}" for key, val in self.get_config().items())
@@ -44,7 +46,7 @@ def get_config(self) -> Dict[str, Any]:
return config

def get_pipeline(self) -> Optional["TokenizerPipeline"]:
return self._pipeline()
return self._pipeline() if self._pipeline is not None else None

def set_pipeline(self, pipeline: "TokenizerPipeline") -> None:
self._pipeline = weakref.ref(pipeline)
@@ -475,6 +477,9 @@ def set_token_id(self, vocab: Optional[List[str]]) -> None:
if vocab is not None and self.token in vocab:
self._token_id = vocab.index(self.token)

@property
def token_id(self) -> Optional[int]:
return self._token_id

@dataclass
class TokenWithTypeId:
@@ -658,7 +663,7 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
"RaggedToDense",
input_nodes[3 * i : 3 * (i + 1)]
+ max_length.outputs()
+ make_constant_node(0, Type.i32).outputs(),
+ make_constant_node(self.token_id or 0, Type.i32).outputs(),
)
.outputs()
)
@@ -753,6 +758,7 @@ class TokenizerPipeline:
skip_tokens: Optional[List[int]] = field(default=None, repr=False)
number_of_inputs: int = 1
vocab_node_outputs: Optional[List[Output]] = field(default=None, repr=False)
eos_token_id: Optional[int] = None

def get_config(self) -> Dict[str, Dict[str, Any]]:
return {type(step).__name__: step.get_config() for step in self.steps}
@@ -793,7 +799,10 @@ def get_tokenizer_ov_subgraph(self) -> Model:
for step in self.post_tokenization_steps:
processing_outputs = step.get_ov_subgraph(processing_outputs)

return Model(processing_outputs, string_inputs, name=TOKENIZER_NAME)
model = Model(processing_outputs, string_inputs, name=TOKENIZER_NAME)
if self.eos_token_id is not None:
model.set_rt_info(self.eos_token_id, EOS_TOKEN_ID_NAME)
return model

@property
def normalization_steps(self) -> List[NormalizationStep]:
@@ -841,4 +850,6 @@ def get_detokenizer_ov_subgraph(self) -> Model:
outputs = self.create_decoding_pipeline([token_ids])
model = Model(outputs, [input_node], name=DETOKENIZER_NAME)
model.output().tensor.add_names({STRING_OUTPUT_NAME})
if self.eos_token_id is not None:
model.set_rt_info(self.eos_token_id, EOS_TOKEN_ID_NAME)
return model
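Aside from the rt_info plumbing, `BasePipelineStep` becomes a real dataclass with a typed `_pipeline` field, and `get_pipeline` no longer dereferences an unset weakref. A standalone sketch of that back-reference pattern (class names here are illustrative, not the library's):

```python
from __future__ import annotations

import weakref
from dataclasses import dataclass, field
from typing import Optional


class Pipeline:
    pass


@dataclass
class Step:
    # weak back-reference avoids a reference cycle between pipeline and steps
    _pipeline: Optional[weakref.ReferenceType[Pipeline]] = field(default=None, init=False, repr=False)

    def get_pipeline(self) -> Optional[Pipeline]:
        # guard: before set_pipeline is called, _pipeline is None, not a weakref
        return self._pipeline() if self._pipeline is not None else None

    def set_pipeline(self, pipeline: Pipeline) -> None:
        self._pipeline = weakref.ref(pipeline)


step = Step()
assert step.get_pipeline() is None  # safe before attachment, the case this change fixes
pipeline = Pipeline()
step.set_pipeline(pipeline)
assert step.get_pipeline() is pipeline
```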
2 changes: 1 addition & 1 deletion tests/pass_rates.json
@@ -1,3 +1,3 @@
{
"tokenizers_test.py::test_": 0.9104394066610692
"tokenizers_test.py::test_": 0.9110740586355426
}
2 changes: 1 addition & 1 deletion tests/tokenizer_differential_fuzzing.py
@@ -1,11 +1,11 @@
import sys
import unicodedata
from functools import lru_cache

import atheris
import numpy as np
from openvino import compile_model
from transformers import AutoTokenizer
import unicodedata


with atheris.instrument_imports():
48 changes: 46 additions & 2 deletions tests/tokenizers_test.py
@@ -1,11 +1,13 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2018-2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
from typing import Optional

import numpy as np
import pytest
from openvino import Core
from openvino import Core, Model
from openvino_tokenizers import convert_tokenizer
from openvino_tokenizers.constants import EOS_TOKEN_ID_NAME
from transformers import AutoTokenizer


@@ -117,7 +119,7 @@ def unpack_strings(strings):
# "THUDM/chatglm-6b", # hf_tokenizer init error
"THUDM/chatglm2-6b", # detokenizer cannot filter special tokens
"THUDM/chatglm3-6b",
# "t5-base", # crashes tests
# "t5-base", # no <s> token in the vocab, sentencepiece check error
]
tiktiken_models = [
"stabilityai/stablelm-2-1_6b",
@@ -468,3 +470,45 @@ def test_detokenizer_results_align_with_hf_on_multitoken_symbols_for_streaming()
hf_detokenized_stream += hf_output

assert detokenized_stream == hf_detokenized_stream


def check_eos_id(eos_token_id: Optional[int], *models: Model) -> None:
for model in models:
if eos_token_id is None:
assert not model.has_rt_info(EOS_TOKEN_ID_NAME)
else:
assert model.has_rt_info(EOS_TOKEN_ID_NAME)
assert model.get_rt_info(EOS_TOKEN_ID_NAME).value == eos_token_id


def test_eos_token_id_rt_info_wordpiece(hf_wordpiece_tokenizers):
eos_token_id = hf_wordpiece_tokenizers.eos_token_id
ov_tokenizer = convert_tokenizer(hf_wordpiece_tokenizers)
check_eos_id(eos_token_id, ov_tokenizer)


def test_eos_token_id_rt_info_bpe(hf_bpe_tokenizers):
eos_token_id = hf_bpe_tokenizers.eos_token_id
ov_tokenizer, ov_detokenizer = convert_tokenizer(
hf_bpe_tokenizers,
with_detokenizer=True,
)
check_eos_id(eos_token_id, ov_tokenizer, ov_detokenizer)


def test_eos_token_id_rt_info_tiktoken(hf_tiktoken_tokenizers):
eos_token_id = hf_tiktoken_tokenizers.eos_token_id
ov_tokenizer, ov_detokenizer = convert_tokenizer(
hf_tiktoken_tokenizers,
with_detokenizer=True,
)
check_eos_id(eos_token_id, ov_tokenizer, ov_detokenizer)


def test_eos_token_id_rt_info_sentencepiece(hf_sentencepiece_tokenizers):
eos_token_id = hf_sentencepiece_tokenizers.eos_token_id
ov_tokenizer, ov_detokenizer = convert_tokenizer(
hf_sentencepiece_tokenizers,
with_detokenizer=True,
)
check_eos_id(eos_token_id, ov_tokenizer, ov_detokenizer)
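The practical payoff of the new rt_info entry is that generation code can stop or trim on EOS without re-loading the HF config. A hedged sketch (gpt2 is just an example model id; the generated ids are made up, except 50256, gpt2's EOS):

```python
from transformers import AutoTokenizer

from openvino_tokenizers import convert_tokenizer
from openvino_tokenizers.constants import EOS_TOKEN_ID_NAME

hf_tokenizer = AutoTokenizer.from_pretrained("gpt2")  # example model id
ov_tokenizer = convert_tokenizer(hf_tokenizer)

eos_id = None
if ov_tokenizer.has_rt_info(EOS_TOKEN_ID_NAME):
    eos_id = int(ov_tokenizer.get_rt_info(EOS_TOKEN_ID_NAME).value)

generated = [15496, 11, 50256]  # pretend LLM output; 50256 is gpt2's EOS
if eos_id is not None and generated and generated[-1] == eos_id:
    generated = generated[:-1]  # drop the trailing EOS before detokenization
```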