diff --git a/nemo/collections/common/tokenizers/__init__.py b/nemo/collections/common/tokenizers/__init__.py
index ae6f0950d6ac..44537eac882b 100644
--- a/nemo/collections/common/tokenizers/__init__.py
+++ b/nemo/collections/common/tokenizers/__init__.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from nemo.collections.common.tokenizers.bytelevel_tokenizers import ByteLevelTokenizer
 from nemo.collections.common.tokenizers.char_tokenizer import CharTokenizer
 from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
 from nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer
diff --git a/nemo/collections/common/tokenizers/bytelevel_tokenizers.py b/nemo/collections/common/tokenizers/bytelevel_tokenizers.py
new file mode 100644
index 000000000000..f834b7b97e92
--- /dev/null
+++ b/nemo/collections/common/tokenizers/bytelevel_tokenizers.py
@@ -0,0 +1,79 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+from pathlib import Path
+from typing import List
+from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec
+
+__all__ = ['ByteLevelProcessor', 'ByteLevelTokenizer']
+
+
+class ByteLevelProcessor:
+    """
+    A very basic tokenization and detokenization class for use with byte-level
+    tokenization.
+    """
+
+    def detokenize(self, tokens: List[str]) -> str:
+        return ' '.join(tokens)
+
+    def tokenize(self, text) -> str:
+        return text
+
+    def normalize(self, text) -> str:
+        return text
+
+
+class ByteLevelTokenizer(TokenizerSpec):
+    def __init__(self):
+        self.vocab_size = 259
+        self.special_tokens = [self.bos_id, self.eos_id, self.pad_id]
+
+    # no distinction between tokens and ids.
+    def text_to_tokens(self, text):
+        return self.text_to_ids(text)
+
+    def tokens_to_text(self, tokens):
+        return self.ids_to_text(tokens)
+
+    def text_to_ids(self, text):
+        return list(text.encode('utf-8'))
+
+    def ids_to_text(self, ids):
+        # remove special tokens.
+        ids = [x for x in ids if x < 256]
+        return bytes(ids).decode('utf-8', errors='ignore').rstrip()
+
+    def tokens_to_ids(self, tokens):
+        return tokens
+
+    def ids_to_tokens(self, ids):
+        return ids
+
+    @property
+    def pad_id(self):
+        return 256
+
+    @property
+    def bos_id(self):
+        return 257
+
+    @property
+    def eos_id(self):
+        return 258
+
+    @property
+    def unk_id(self):
+        return 259  # unused
diff --git a/nemo/collections/nlp/models/machine_translation/mt_enc_dec_model.py b/nemo/collections/nlp/models/machine_translation/mt_enc_dec_model.py
index b69d7f31bf91..b7fe492b396a 100644
--- a/nemo/collections/nlp/models/machine_translation/mt_enc_dec_model.py
+++ b/nemo/collections/nlp/models/machine_translation/mt_enc_dec_model.py
@@ -32,6 +32,7 @@
 from nemo.collections.common.losses import NLLLoss, SmoothedCrossEntropyLoss
 from nemo.collections.common.metrics import GlobalAverageLossMetric
 from nemo.collections.common.parts import transformer_weights_init
+from nemo.collections.common.tokenizers.bytelevel_tokenizers import ByteLevelProcessor
 from nemo.collections.common.tokenizers.chinese_tokenizers import ChineseProcessor
 from nemo.collections.common.tokenizers.en_ja_tokenizers import EnJaProcessor
 from nemo.collections.common.tokenizers.moses_tokenizers import MosesProcessor
@@ -70,17 +71,20 @@ def __init__(self, cfg: MTEncDecModelConfig, trainer: Trainer = None):
         self.multilingual = cfg.get("multilingual", False)
         self.multilingual_ids = []
 
+        self.encoder_tokenizer_library = cfg.encoder_tokenizer.get('library', 'yttm')
+        self.decoder_tokenizer_library = cfg.decoder_tokenizer.get('library', 'yttm')
+
         # Instantiates tokenizers and register to be saved with NeMo Model archive
         # After this call, ther will be self.encoder_tokenizer and self.decoder_tokenizer
         # Which can convert between tokens and token_ids for SRC and TGT languages correspondingly.
         self.setup_enc_dec_tokenizers(
-            encoder_tokenizer_library=cfg.encoder_tokenizer.get('library', 'yttm'),
+            encoder_tokenizer_library=self.encoder_tokenizer_library,
             encoder_tokenizer_model=cfg.encoder_tokenizer.get('tokenizer_model'),
             encoder_bpe_dropout=cfg.encoder_tokenizer.get('bpe_dropout', 0.0)
             if cfg.encoder_tokenizer.get('bpe_dropout', 0.0) is not None
             else 0.0,
             encoder_model_name=cfg.encoder.get('model_name') if hasattr(cfg.encoder, 'model_name') else None,
-            decoder_tokenizer_library=cfg.decoder_tokenizer.get('library', 'yttm'),
+            decoder_tokenizer_library=self.decoder_tokenizer_library,
             decoder_tokenizer_model=cfg.decoder_tokenizer.tokenizer_model,
             decoder_bpe_dropout=cfg.decoder_tokenizer.get('bpe_dropout', 0.0)
             if cfg.decoder_tokenizer.get('bpe_dropout', 0.0) is not None
@@ -112,15 +116,13 @@ def __init__(self, cfg: MTEncDecModelConfig, trainer: Trainer = None):
             self.source_processor_list = []
             self.target_processor_list = []
             for src_lng, tgt_lng in zip(self.src_language, self.tgt_language):
-                src_prcsr, tgt_prscr = self.setup_pre_and_post_processing_utils(
-                    source_lang=src_lng, target_lang=tgt_lng
-                )
+                src_prcsr, tgt_prscr = self.setup_pre_and_post_processing_utils(src_lng, tgt_lng)
                 self.source_processor_list.append(src_prcsr)
                 self.target_processor_list.append(tgt_prscr)
 
         else:
             # After this call, the model will have self.source_processor and self.target_processor objects
-            self.setup_pre_and_post_processing_utils(source_lang=self.src_language, target_lang=self.tgt_language)
+            self.setup_pre_and_post_processing_utils(self.src_language, self.tgt_language)
             self.multilingual_ids = [None]
 
         # TODO: Why is this base constructor call so late in the game?
@@ -385,7 +387,7 @@ def setup_enc_dec_tokenizers(
         decoder_model_name=None,
     ):
 
-        supported_tokenizers = ['yttm', 'huggingface', 'sentencepiece', 'megatron']
+        supported_tokenizers = ['yttm', 'huggingface', 'sentencepiece', 'megatron', 'byte-level']
         if (
             encoder_tokenizer_library not in supported_tokenizers
             or decoder_tokenizer_library not in supported_tokenizers
@@ -661,18 +663,24 @@ def setup_pre_and_post_processing_utils(self, source_lang, target_lang):
         Creates source and target processor objects for input and output pre/post-processing.
""" self.source_processor, self.target_processor = None, None - if (source_lang == 'en' and target_lang == 'ja') or (source_lang == 'ja' and target_lang == 'en'): + + if self.encoder_tokenizer_library == 'byte-level': + self.source_processor = ByteLevelProcessor() + elif (source_lang == 'en' and target_lang == 'ja') or (source_lang == 'ja' and target_lang == 'en'): self.source_processor = EnJaProcessor(source_lang) + elif source_lang == 'zh': + self.source_processor = ChineseProcessor() + elif source_lang is not None and source_lang not in ['ja', 'zh']: + self.source_processor = MosesProcessor(source_lang) + + if self.decoder_tokenizer_library == 'byte-level': + self.target_processor = ByteLevelProcessor() + elif (source_lang == 'en' and target_lang == 'ja') or (source_lang == 'ja' and target_lang == 'en'): self.target_processor = EnJaProcessor(target_lang) - else: - if source_lang == 'zh': - self.source_processor = ChineseProcessor() - if target_lang == 'zh': - self.target_processor = ChineseProcessor() - if source_lang is not None and source_lang not in ['ja', 'zh']: - self.source_processor = MosesProcessor(source_lang) - if target_lang is not None and target_lang not in ['ja', 'zh']: - self.target_processor = MosesProcessor(target_lang) + elif target_lang == 'zh': + self.target_processor = ChineseProcessor() + elif target_lang is not None and target_lang not in ['ja', 'zh']: + self.target_processor = MosesProcessor(target_lang) return self.source_processor, self.target_processor diff --git a/nemo/collections/nlp/modules/common/tokenizer_utils.py b/nemo/collections/nlp/modules/common/tokenizer_utils.py index 8eaa2b3f7707..36d953e807c0 100644 --- a/nemo/collections/nlp/modules/common/tokenizer_utils.py +++ b/nemo/collections/nlp/modules/common/tokenizer_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,6 +18,7 @@ from typing import Dict, List, Optional import nemo +from nemo.collections.common.tokenizers.bytelevel_tokenizers import ByteLevelTokenizer from nemo.collections.common.tokenizers.char_tokenizer import CharTokenizer from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer from nemo.collections.common.tokenizers.word_tokenizer import WordTokenizer @@ -138,6 +139,9 @@ def get_nmt_tokenizer( return nemo.collections.common.tokenizers.sentencepiece_tokenizer.SentencePieceTokenizer( model_path=tokenizer_model, special_tokens=special_tokens_dict ) + elif library == 'byte-level': + logging.info(f'Using byte-level tokenization') + return ByteLevelTokenizer() elif library == 'megatron': logging.info( f'Getting Megatron tokenizer with pretrained model name: {model_name} and custom vocab file: {vocab_file}' @@ -145,5 +149,5 @@ def get_nmt_tokenizer( return get_tokenizer(tokenizer_name=model_name, vocab_file=vocab_file) else: raise NotImplementedError( - 'Currently we only support "yttm", "huggingface", "megatron", and "sentencepiece" tokenizer library.' + 'Currently we only support "yttm", "huggingface", "sentencepiece", "megatron", and "byte-level" tokenizer libraries.' )