In [1]:
import os

# Route Hugging Face Hub traffic through the hf-mirror.com mirror.
# NOTE(review): presumably this has to run before `transformers` /
# `huggingface_hub` are imported for the setting to be picked up — confirm.
os.environ.update({'HF_ENDPOINT': 'https://hf-mirror.com'})

# Echo the variable back to confirm it is set.
print(os.environ['HF_ENDPOINT'])

https://hf-mirror.com


In [2]:
from tokenizers import Tokenizer

# Load the trained tokenizer directly from its JSON file using the low-level
# `tokenizers` API.
# It is bound to `raw_tokenizer` (not `tokenizer`) because a later cell binds
# `tokenizer` to the object returned by `AutoTokenizer.from_pretrained`, and
# the cells after that call `tokenizer.tokenize(...)`. Keeping the two objects
# under different names means re-running this cell cannot silently swap the
# object those later cells operate on.
raw_tokenizer = Tokenizer.from_file("gpt2_gene_eng_tokenizer.json")

# Smoke test: encode one English sentence and show both views of the result.
text = "This is a test sentence for GPT-2 tokenizer."
tokens = raw_tokenizer.encode(text)
print(tokens.ids)  # token IDs
print(tokens.tokens)  # token strings

[16854, 16069, 67, 17387, 29706, 16149, 16641, 54, 15, 20, 43291, 37903, 16]
['This', 'is', 'a', 'test', 'sentence', 'for', 'GP', 'T', '-', '2', 'token', 'izer', '.']


In [3]:
from transformers import PreTrainedTokenizerFast

# Wrap the trained tokenizer JSON in a transformers fast tokenizer and declare
# which strings act as the end-of-text, padding and unknown tokens.
hf_tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="gpt2_gene_eng_tokenizer.json",
    eos_token="<|endoftext|>",
    pad_token="<pad>",
    unk_token="<unk>",
)

# Quick check: encode a sentence and print the resulting token IDs.
text = "This is another test sentence for the tokenizer."
encoded_ids = hf_tokenizer.encode(text)
print(encoded_ids)

[16854, 16069, 17841, 17387, 29706, 16149, 16040, 43291, 37903, 16]


In [4]:
# Write the wrapped tokenizer to the "gpt2_gene_eng_tokenizer" directory.
# The call's return value is kept and shown as the cell output.
saved_files = hf_tokenizer.save_pretrained("gpt2_gene_eng_tokenizer")
saved_files

('gpt2_gene_eng_tokenizer/tokenizer_config.json',
 'gpt2_gene_eng_tokenizer/special_tokens_map.json',
 'gpt2_gene_eng_tokenizer/tokenizer.json')

In [5]:
from transformers import AutoTokenizer
# Reload the tokenizer from the "gpt2_gene_eng_tokenizer" directory that
# `save_pretrained` wrote to, this time through `AutoTokenizer`.
# The cells below call `.tokenize()` on this object.
tokenizer = AutoTokenizer.from_pretrained("gpt2_gene_eng_tokenizer")

In [9]:
# Tokenize a plain English sentence with the reloaded tokenizer.
english_sample = "This is a test sentence for GPT-2 tokenizer."
tokenizer.tokenize(english_sample)

['This',
 'is',
 'a',
 'test',
 'sentence',
 'for',
 'GP',
 'T',
 '-',
 '2',
 'token',
 'izer',
 '.']

In [7]:
# Tokenize a string made up only of the letters A/C/G/T
# (presumably a DNA sequence — confirm against the data source).
nucleotide_sample = "ATAGGAAAACTGTGTATCAATGACGTTTCTTGGATCAATGACTTTTCTTGGATACAGG"
tokenizer.tokenize(nucleotide_sample)

['ATAGG',
 'AAAACTGTG',
 'TATCAA',
 'TGACG',
 'TTTCTTGG',
 'ATCAA',
 'TGAC',
 'TTTTCTTGG',
 'ATACAGG']

In [8]:
# Tokenize an upper-case letter string
# (presumably a protein / amino-acid sequence — confirm against the data source).
protein_sample = "YPTTLEDHFGGSQRGTSLSAAAGSAVAIATGGAGLSGWYLCMYVHKEALGRLGFFGYDLQDQCGATVLSYQ"
tokenizer.tokenize(protein_sample)

['YP',
 'TTL',
 'ED',
 'HFGG',
 'SQRG',
 'TSL',
 'SAAAG',
 'SAV',
 'AI',
 'ATGG',
 'AGL',
 'SGW',
 'YL',
 'CM',
 'YVH',
 'KE',
 'ALG',
 'RLGF',
 'FG',
 'YDL',
 'QDQ',
 'CG',
 'ATVL',
 'SYQ']