In [1]:
import os
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"]="python"
from transformers import LlamaTokenizer
from sentencepiece import sentencepiece_model_pb2 as sp_pb2_model
import sentencepiece as spm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Inputs: the base LLaMA tokenizer (a Hugging Face directory) and the
# SentencePiece model file holding the DNA vocabulary.
llama_tokenizer_dir = "llama-7b-hf"
dna_sp_model_file = "dna_llama_seg.model"

# Base LLaMA tokenizer, with its SentencePiece model parsed into an
# editable ModelProto.
llama_tokenizer = LlamaTokenizer.from_pretrained(llama_tokenizer_dir)
llama_spm = sp_pb2_model.ModelProto()
llama_spm.ParseFromString(llama_tokenizer.sp_model.serialized_model_proto())

# DNA SentencePiece model, parsed into a ModelProto the same way.
dna_sp_model = spm.SentencePieceProcessor()
dna_sp_model.Load(dna_sp_model_file)
dna_spm = sp_pb2_model.ModelProto()
dna_spm.ParseFromString(dna_sp_model.serialized_model_proto())

# Report both vocabulary sizes, then the special tokens (strings, ids and
# name -> token map) defined by the LLaMA tokenizer.
print(len(llama_tokenizer), len(dna_sp_model))
print(llama_tokenizer.all_special_tokens)
print(llama_tokenizer.all_special_ids)
print(llama_tokenizer.special_tokens_map)

32000 30000
['<s>', '</s>', '<unk>']
[1, 2, 0]
{'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}


In [3]:
## Add dna tokens to LLaMA tokenizer
# Every piece of the DNA model that the LLaMA vocabulary does not already
# contain is appended to the LLaMA ModelProto, so existing LLaMA pieces keep
# their position in `llama_spm.pieces`.
llama_spm_tokens_set = set(p.piece for p in llama_spm.pieces)
print(len(llama_spm_tokens_set))
print(f"Before:{len(llama_spm_tokens_set)}")
for p in dna_spm.pieces:
    piece = p.piece
    score = p.score
    if piece not in llama_spm_tokens_set:
        # Reach the nested SentencePiece message through the ModelProto class
        # itself; instantiating a throwaway ModelProto per piece is unnecessary.
        # The DNA model's score is carried over unchanged.
        # Open question kept from the original ("# 0?"): should merged pieces
        # get a score of 0 instead?
        new_p = sp_pb2_model.ModelProto.SentencePiece(piece=piece, score=score)
        llama_spm.pieces.append(new_p)
        # Keep the membership set in sync so a piece that occurs more than once
        # in the DNA model cannot be appended twice.
        llama_spm_tokens_set.add(piece)
print(f"New model pieces: {len(llama_spm.pieces)}")

32000
Before:32000
New model pieces: 61972


In [4]:
## Save
# Output locations: one directory for the raw merged SentencePiece model and
# one for the Hugging Face tokenizer built from it.
output_sp_dir = 'merged_dna_eng_tokenizer_sp'
output_hf_dir = 'merged_dna_eng_tokenizer_hf' # the path to save dna-LLaMA tokenizer
merged_sp_model_path = f"{output_sp_dir}/dna_eng_llama_tokenizer.model"

# Write the merged ModelProto to disk as a SentencePiece .model file.
os.makedirs(output_sp_dir, exist_ok=True)
with open(merged_sp_model_path, 'wb') as model_file:
    model_file.write(llama_spm.SerializeToString())

# Build a LlamaTokenizer from the merged model file and save it to the
# Hugging Face output directory.
tokenizer = LlamaTokenizer(vocab_file=merged_sp_model_path)
tokenizer.save_pretrained(output_hf_dir)
print(f"dna-LLaMA tokenizer has been saved to {output_hf_dir}")

dna-LLaMA tokenizer has been saved to merged_dna_eng_tokenizer_hf


In [5]:
# Test
# Reload both tokenizers from disk so the comparison exercises the saved
# artefacts rather than objects still held in memory from earlier cells.
llama_tokenizer = LlamaTokenizer.from_pretrained(llama_tokenizer_dir)
dna_llama_tokenizer = LlamaTokenizer.from_pretrained(output_hf_dir)

# Inspect the special tokens of the reloaded merged tokenizer. Printing the
# in-memory `tokenizer` from the save cell would not verify what was written
# to `output_hf_dir`.
print(dna_llama_tokenizer.all_special_tokens)
print(dna_llama_tokenizer.all_special_ids)
print(dna_llama_tokenizer.special_tokens_map)

# Sample mixing a DNA sequence with an English sentence.
text='''GCTGACTCTGCCAGGATGGAATGAAATTAGGTTGTTTTAATTATAATGTAAAGTCAGTTCTAGTCAGACATAGTCACATAGGCAAGTAAGGGAACCTAAAATTGCTTGGAAT,
The primary use of LLaMA is research on large language models, including'''
print("Test text:\n",text)
print(f"Tokenized by LLaMA tokenizer:{llama_tokenizer.tokenize(text)}")
print(f"Tokenized by DNA-LLaMA tokenizer:{dna_llama_tokenizer.tokenize(text)}")

['<s>', '</s>', '<unk>']
[1, 2, 0]
{'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}
Test text:
 GCTGACTCTGCCAGGATGGAATGAAATTAGGTTGTTTTAATTATAATGTAAAGTCAGTTCTAGTCAGACATAGTCACATAGGCAAGTAAGGGAACCTAAAATTGCTTGGAAT,
The primary use of LLaMA is research on large language models, including
Tokenized by LLaMA tokenizer:['▁G', 'CT', 'GA', 'CT', 'CT', 'G', 'CC', 'AG', 'G', 'AT', 'G', 'GA', 'AT', 'G', 'AA', 'AT', 'TAG', 'G', 'TT', 'G', 'TT', 'T', 'TA', 'AT', 'T', 'ATA', 'AT', 'G', 'TA', 'AA', 'GT', 'CA', 'G', 'TT', 'CT', 'AG', 'T', 'CA', 'G', 'AC', 'ATA', 'G', 'TC', 'AC', 'ATA', 'GG', 'CA', 'AG', 'TA', 'AG', 'G', 'GA', 'AC', 'CT', 'AA', 'A', 'AT', 'T', 'G', 'CT', 'T', 'G', 'GA', 'AT', ',', '<0x0A>', 'The', '▁primary', '▁use', '▁of', '▁L', 'La', 'MA', '▁is', '▁research', '▁on', '▁large', '▁language', '▁models', ',', '▁including']
Tokenized by DNA-LLaMA tokenizer:['▁GCTGA', 'CTCTG', 'CCAGGA', 'TGGAA', 'TGAAATTA', 'GGT', 'TG', 'TTTTA', 'AT', 'TATAATG', 'TAAAGTCA', 'GT', 'TCTAGTCA', 'GA