In [1]:
import os
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"]="python"
from transformers import LlamaTokenizer
from sentencepiece import sentencepiece_model_pb2 as sp_pb2_model
import sentencepiece as spm

In [2]:
llama_tokenizer_dir = "dnahlm-merge-hf-4g"
dna_sp_model_file = "gene_llama_seg.model"

# load
llama_tokenizer = LlamaTokenizer.from_pretrained(llama_tokenizer_dir)
dna_sp_model = spm.SentencePieceProcessor()
dna_sp_model.Load(dna_sp_model_file)

llama_spm = sp_pb2_model.ModelProto()
llama_spm.ParseFromString(llama_tokenizer.sp_model.serialized_model_proto())
dna_spm = sp_pb2_model.ModelProto()
dna_spm.ParseFromString(dna_sp_model.serialized_model_proto())

# print number of tokens
print(len(llama_tokenizer),len(dna_sp_model))
print(llama_tokenizer.all_special_tokens)
print(llama_tokenizer.all_special_ids)
print(llama_tokenizer.special_tokens_map)

61972 30000
['<s>', '</s>', '<unk>']
[1, 2, 0]
{'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}


In [3]:
## Add dna tokens to LLaMA tokenizer
llama_spm_tokens_set=set(p.piece for p in llama_spm.pieces)
print(len(llama_spm_tokens_set))
print(f"Before:{len(llama_spm_tokens_set)}")
for p in dna_spm.pieces:
    piece = p.piece
    score = p.score
    if piece not in llama_spm_tokens_set:
        new_p = sp_pb2_model.ModelProto().SentencePiece()
        new_p.piece = piece
        new_p.score = score # 0?
        llama_spm.pieces.append(new_p)
print(f"New model pieces: {len(llama_spm.pieces)}")

61972
Before:61972
New model pieces: 91597


In [4]:
## Save
output_sp_dir = 'merged_protein_eng_tokenizer_sp'
output_hf_dir = 'merged_gene_eng_tokenizer_hf' # the path to save dna-LLaMA tokenizer
os.makedirs(output_sp_dir,exist_ok=True)
with open(output_sp_dir+'/protein_eng_llama_tokenizer.model', 'wb') as f:
    f.write(llama_spm.SerializeToString())

tokenizer = LlamaTokenizer(vocab_file=output_sp_dir+'/protein_eng_llama_tokenizer.model')
tokenizer.save_pretrained(output_hf_dir)
print(f"dna-LLaMA tokenizer has been saved to {output_hf_dir}")

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message


dna-LLaMA tokenizer has been saved to merged_protein_eng_tokenizer_hf


In [5]:
# Test
llama_tokenizer = LlamaTokenizer.from_pretrained(llama_tokenizer_dir)
protein_llama_tokenizer = LlamaTokenizer.from_pretrained(output_hf_dir)
print(tokenizer.all_special_tokens)
print(tokenizer.all_special_ids)
print(tokenizer.special_tokens_map)
text='''LPPYMVPIEPQVGKFYSPVALGAGAGSVLSVTFAALGCKLTWTYRWMAALMVWLRRCTHYLFIVVVAVSTLLTITGDYIFYTDWAWTSYTVFSIGILMISVGATYYLLFTGVPGTASYY,
The primary use of LLaMA is research on large language models, including'''
print("Test text:\n",text)
print(f"Tokenized by LLaMA tokenizer:{llama_tokenizer.tokenize(text)}")
print(f"Tokenized by LLaMA-gene tokenizer:{protein_llama_tokenizer.tokenize(text)}")

['<s>', '</s>', '<unk>']
[1, 2, 0]
{'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}
Test text:
 LPPYMVPIEPQVGKFYSPVALGAGAGSVLSVTFAALGCKLTWTYRWMAALMVWLRRCTHYLFIVVVAVSTLLTITGDYIFYTDWAWTSYTVFSIGILMISVGATYYLLFTGVPGTASYY,
The primary use of LLaMA is research on large language models, including
Tokenized by LLaMA tokenizer:['▁L', 'PP', 'Y', 'M', 'V', 'PI', 'EP', 'Q', 'V', 'G', 'K', 'F', 'Y', 'SP', 'VAL', 'GAGA', 'G', 'SV', 'L', 'SV', 'T', 'FA', 'AL', 'G', 'CK', 'LT', 'W', 'TY', 'R', 'W', 'MA', 'AL', 'M', 'V', 'W', 'LR', 'R', 'CT', 'H', 'Y', 'L', 'FI', 'V', 'V', 'V', 'AV', 'ST', 'LL', 'TI', 'TG', 'D', 'Y', 'IF', 'Y', 'TD', 'WA', 'W', 'TS', 'Y', 'TV', 'F', 'SI', 'G', 'IL', 'M', 'IS', 'V', 'GAT', 'YY', 'LL', 'F', 'TG', 'V', 'PG', 'T', 'AS', 'YY', ',', '<0x0A>', 'The', '▁primary', '▁use', '▁of', '▁L', 'La', 'MA', '▁is', '▁research', '▁on', '▁large', '▁language', '▁models', ',', '▁including']
Tokenized by DNA-LLaMA tokenizer:['▁L', 'PP', 'Y', 'MVP', 'IE', 'P', 'QVG', 'KF', 'YSPV', 