custom_vocab.py
import os
from tokenizers import (
decoders,
models,
pre_tokenizers,
processors,
trainers,
Tokenizer,
)
from transformers import GPT2TokenizerFast
# Create a tokenizer backed by a BPE model
tokenizer = Tokenizer(models.BPE())
# GPT-2 does not use a normalizer, so only a byte-level pre-tokenizer is configured
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
# The only special token is the end-of-text token; cap the vocabulary at 65,536 entries
trainer = trainers.BpeTrainer(min_frequency=3, special_tokens=["<|endoftext|>"], vocab_size=65536)
# Train the tokenizer on the abstracts corpus
tokenizer.train(["abstracts.txt"], trainer=trainer)
# Print the number of tokens in the trained vocabulary
vocab = tokenizer.get_vocab()
print(f"Vocab size: {len(vocab)}")
# Optionally save the full tokenizer to a single JSON file
# tokenizer.save("exomachina_vocab.json")
# Make sure the output directory exists, then save the BPE model (vocab.json and merges.txt)
if not os.path.exists("models/exomachina"):
    os.makedirs("models/exomachina")
tokenizer.model.save("models/exomachina")
# apply the byte-level post-processing for the GPT-2 tokenizer
tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
# byte-level decoder
tokenizer.decoder = decoders.ByteLevel()
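# Not part of the original script: a quick round-trip sketch to check that byte-level
# decoding reconstructs the input text; the sample sentence is an arbitrary assumption.
sample = "Exoplanet atmospheres show water absorption."
encoding = tokenizer.encode(sample)
print(tokenizer.decode(encoding.ids))  # expected to print the original sentence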
# Wrap the trained tokenizer in a GPT2TokenizerFast so it can be used with transformers
wrapped_tokenizer = GPT2TokenizerFast(
    tokenizer_object=tokenizer,
    vocab_file="models/exomachina/vocab.json",
    merges_file="models/exomachina/merges.txt",
    model_max_length=1024,
)
# test the tokenizer
print(wrapped_tokenizer("This is a test"))
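# Not part of the original script: a sketch of persisting the wrapped tokenizer in the
# standard transformers layout and reloading it, reusing the directory created above.
wrapped_tokenizer.save_pretrained("models/exomachina")
reloaded = GPT2TokenizerFast.from_pretrained("models/exomachina")
print(reloaded("This is a test"))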
# Resources:
# https://huggingface.co/course/chapter6/8?fw=pt#building-a-bpe-tokenizer-from-scratch
# https://huggingface.co/docs/transformers/tokenizer_summary#bytepair-encoding-bpe