In [1]:
from transformers import LlamaTokenizer
from sentencepiece import sentencepiece_model_pb2 as sp_pb2_model
import sentencepiece as spm


None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [86]:
llama_tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-13b-hf")
han_sp_model = spm.SentencePieceProcessor()
han_sp_model.Load("./hanzi.model")

llama_spm = sp_pb2_model.ModelProto()
llama_spm.ParseFromString(llama_tokenizer.sp_model.serialized_model_proto())
chinese_spm = sp_pb2_model.ModelProto()
chinese_spm.ParseFromString(han_sp_model.serialized_model_proto())

# print number of tokens
print(len(llama_tokenizer),len(han_sp_model))
print(llama_tokenizer.all_special_tokens)
print(llama_tokenizer.all_special_ids)
print(llama_tokenizer.special_tokens_map)

32001 5456
['<s>', '</s>', '<unk>']
[1, 2, 0]
{'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}


In [87]:
## Add Chinese tokens to LLaMA tokenizer
llama_spm_tokens_set=set(p.piece for p in llama_spm.pieces)
print(len(llama_spm_tokens_set)+len(chinese_spm.pieces))
print(f"Before:{len(llama_spm_tokens_set)}")
for p in chinese_spm.pieces:
    piece = p.piece
    if piece == "▁":
        print("Skipping ▁")
        continue
    if piece not in llama_spm_tokens_set:
        new_p = sp_pb2_model.ModelProto().SentencePiece()
        new_p.piece = piece
        new_p.score = 0
        llama_spm.pieces.append(new_p)
print(f"New model pieces: {len(llama_spm.pieces)}")


37456
Before:32000
Skipping ▁
New model pieces: 36843


In [88]:
with open('./han_llama.model', 'wb') as f:
    f.write(llama_spm.SerializeToString())
tokenizer = LlamaTokenizer(vocab_file='./han_llama.model')
tokenizer.save_pretrained("./dist")


('./dist/tokenizer_config.json',
 './dist/special_tokens_map.json',
 './dist/tokenizer.model',
 './dist/added_tokens.json')

In [43]:
#llama_tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-13b-hf")
han_llama_tokenizer = LlamaTokenizer.from_pretrained("./dist")

text='新华社北京电中共中央国务院关于促进民营经济发展壮大的意见民营经济是推进中国式现代化的生力军是高质量发展的重要基础是推动我国全面建成社会主义现代化强国实现第二个百年奋斗目标的重要力量为促进民营经济发展壮大现提出如下意见一总体要求以习近平新时代中国特色社会主义思想为指导深入贯彻党的二十大精神坚持稳中求进工作总基调完整准确全面贯彻新发展理念加快构建新发展格局着力推动高质量发展坚持社会主义市场经济改革方向坚持两个毫不动摇加快营造市场化法治化国际化一流营商环境'
#la = llama_tokenizer.tokenize(text)
max_len=115
han_la = han_llama_tokenizer(text)
#print(f"Tokenized by LLaMA tokenizer:{la}")
print(f"Tokenized by Han-LLaMA tokenizer:{han_la}")


def chunk_with_overlap(input_list,chunk_size=max_len, overlap=2):
    assert chunk_size > overlap, "max_len should be larger than overlap size"
    
    chunks = []
    start_index = 0

    # 处理第一个chunk
    if len(input_list) >= chunk_size:
        chunks.append(input_list[:chunk_size])
        start_index += chunk_size

    # 当剩余的元素足够构成一个chunk时，循环继续
    while start_index + chunk_size - overlap <= len(input_list):
        chunk = input_list[start_index - overlap : start_index + chunk_size - overlap]
        chunks.append(chunk)
        start_index += chunk_size - overlap
    
    return chunks



for x in chunk_with_overlap(text):
    display(x)

Tokenized by Han-LLaMA tokenizer:{'input_ids': [1, 29871, 30374, 31266, 30564, 30662, 30675, 31679, 30275, 31611, 30275, 32809, 30356, 31358, 30963, 31057, 30909, 32173, 31174, 30855, 35496, 31412, 34105, 30910, 31599, 32799, 30257, 30210, 31474, 35761, 30855, 35496, 31412, 34105, 30392, 33545, 31174, 30275, 30356, 30607, 31424, 30690, 30705, 30210, 30486, 31074, 31867, 30392, 30528, 35930, 31180, 30910, 31599, 30210, 30908, 30698, 31359, 34753, 30392, 33545, 30846, 30672, 30356, 30753, 30806, 30886, 30494, 30564, 30437, 30888, 31349, 31424, 30690, 30705, 33185, 30356, 31195, 31424, 30622, 30685, 30502, 31047, 30470, 32818, 33652, 30895, 31062, 30210, 30908, 30698, 31074, 31180, 30573, 32173, 31174, 30855, 35496, 31412, 34105, 30910, 31599, 32799, 30257, 31424, 31302, 30544, 30847, 30557, 31474, 35761, 30287, 33266, 30988, 30698, 31376, 30651, 32045, 31830, 30606, 30374, 30594, 30690, 30275, 30356, 31141, 31085, 30564, 30437, 30888, 31349, 31579, 31522, 30573, 31084, 31943, 31947, 3075

'新华社北京电中共中央国务院关于促进民营经济发展壮大的意见民营经济是推进中国式现代化的生力军是高质量发展的重要基础是推动我国全面建成社会主义现代化强国实现第二个百年奋斗目标的重要力量为促进民营经济发展壮大现提出如下意见一总体要求以习'