In [1]:
!pip install transformers sentencepiece google-api-python-client -qq

### Instead of using new tokenizer, we can merge llama or mistral tokenizer and extend with own trained tokenizer
### This way we can do transfer learning without training from scratch since we keep the index of original embeddings and just extend the new total index
### As gpu poor, this method can help reduce training time and save cost
### Based on https://github.com/ymcui/Chinese-LLaMA-Alpaca

In [2]:
import os
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"]="python"
from transformers import LlamaTokenizer,AutoTokenizer
from sentencepiece import sentencepiece_model_pb2 as sp_pb2_model
from tokenizers import Tokenizer
import sentencepiece as spm
import unicodedata

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [3]:
# load
# base_tokenizer_dir = "meta-llama/Llama-2-7b-hf" # base tokenizer
base_tokenizer_dir = "mistralai/Mistral-7B-v0.1" # base tokenizer

sp_model_file = "malaysia-ai/sentencepiece-tokenizer" # steal tokenizer from
# sp_model_file = "malaysia-ai/bpe-tokenizer" # steal tokenizer from

In [4]:
# Download model
from huggingface_hub import snapshot_download
snapshot_download(repo_id=sp_model_file, revision="main", local_dir=f"./{sp_model_file.replace('/', '_')}")

print(f"Model dir: './{sp_model_file.replace('/', '_')}'")

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Model dir: './malaysia-ai_sentencepiece-tokenizer'


In [5]:
base_tokenizer = AutoTokenizer.from_pretrained(base_tokenizer_dir,use_fast=False)

In [6]:
my_tokenizer = Tokenizer.from_file(f"./{sp_model_file.replace('/', '_')}/tokenizer.json")

In [7]:
llama_spm = sp_pb2_model.ModelProto()
llama_spm.ParseFromString(base_tokenizer.sp_model.serialized_model_proto())
my_spm = sp_pb2_model.ModelProto()

In [8]:
# print number of tokens
print(len(base_tokenizer))
print(base_tokenizer.all_special_tokens)
print(base_tokenizer.all_special_ids)
print(base_tokenizer.special_tokens_map)

32000
['<s>', '</s>', '<unk>']
[1, 2, 0]
{'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}


In [9]:
## Add Malaysia tokens to LLaMA tokenizer
llama_spm_tokens_set=set(p.piece for p in llama_spm.pieces)
print(f"Before:{len(llama_spm_tokens_set)}")

Before:32000


In [10]:
my_vocab = my_tokenizer.get_vocab() # Dict[str, int]

for vocab in my_vocab.keys():
    piece = vocab
    if piece not in llama_spm_tokens_set:
        new_p = sp_pb2_model.ModelProto().SentencePiece()
        new_p.piece = piece
        new_p.score = 0
        llama_spm.pieces.append(new_p)
print(f"New model pieces: {len(llama_spm.pieces)}")

New model pieces: 54877


In [11]:
output_sp_dir = 'merged_tokenizer_sp'
output_hf_dir = 'merged_tokenizer_hf' # the path to save Malaysia-LLaMA tokenizer
os.makedirs(output_sp_dir,exist_ok=True)
with open(output_sp_dir+'/malaysia_llama.model', 'wb') as f:
    f.write(llama_spm.SerializeToString())
tokenizer = type(base_tokenizer)(vocab_file=output_sp_dir+'/malaysia_llama.model')

tokenizer.save_pretrained(output_hf_dir)
print(f"Malaysia-LLaMA tokenizer has been saved to {output_hf_dir}")

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Malaysia-LLaMA tokenizer has been saved to merged_tokenizer_hf


In [12]:
base_tokenizer = AutoTokenizer.from_pretrained(base_tokenizer_dir)
my_base_tokenizer = AutoTokenizer.from_pretrained(output_hf_dir)
print(tokenizer.all_special_tokens)
print(tokenizer.all_special_ids)
print(tokenizer.special_tokens_map)

['<s>', '</s>', '<unk>']
[1, 2, 0]
{'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}


In [13]:
# len just to proof the table 1 from their paper

In [14]:
texts = [
    "husein comel",
    "husein cute",
    "حسين چوميل",
    "侯赛因很可爱",
    "ஹுசைன் அழகாக இருக்கிறார்"
]
for text in texts:
    print("Test text:\n",text)
    norm_text = unicodedata.normalize('NFKC', text)
    print("Normalized text:\n", norm_text)
    print("---"*20)
    og = base_tokenizer.tokenize(norm_text)
    og_ids = base_tokenizer.encode(norm_text)
    recoverog = base_tokenizer.decode(og_ids)
    
    nw = my_base_tokenizer.tokenize(norm_text)
    nw_ids = my_base_tokenizer.encode(norm_text)
    print(og_ids)
    print(nw_ids)
    recovernw = my_base_tokenizer.decode(nw_ids)
    print(f"Tokenized by LLaMA tokenizer({len(og)}):         {og}")
    print(f"Tokenized by Malaysia-LLaMA tokenizer({len(nw)}):{nw}")
    print(f"Recover llama tokenizer({len(recoverog)}):{recoverog}")
    print(f"Recover new   tokenizer({len(recovernw)}):{recovernw}")
    print("---"*20)

Test text:
 husein comel
Normalized text:
 husein comel
------------------------------------------------------------
[1, 295, 1730, 262, 432, 301]
[1, 4946, 38982, 35502]
Tokenized by LLaMA tokenizer(5):         ['▁h', 'use', 'in', '▁com', 'el']
Tokenized by Malaysia-LLaMA tokenizer(3):['▁hus', 'ein', '▁comel']
Recover llama tokenizer(16):<s> husein comel
Recover new   tokenizer(16):<s> husein comel
------------------------------------------------------------
Test text:
 husein cute
Normalized text:
 husein cute
------------------------------------------------------------
[1, 295, 1730, 262, 17949]
[1, 4946, 38982, 17949]
Tokenized by LLaMA tokenizer(4):         ['▁h', 'use', 'in', '▁cute']
Tokenized by Malaysia-LLaMA tokenizer(3):['▁hus', 'ein', '▁cute']
Recover llama tokenizer(15):<s> husein cute
Recover new   tokenizer(15):<s> husein cute
------------------------------------------------------------
Test text:
 حسين چوميل
Normalized text:
 حسين چوميل
---------------------------------