In [1]:
# Uncomment and run the following installation lines ONLY if you haven't already installed these libraries outside of the notebook
#!pip install ipdb -q
#!pip install sentencepiece -q

In [None]:
# This file will train a tokenizer on the small wiki.txt dataset
# and save the tokenizer in the files {model_prefix}.model and {model_prefix}.vocab

# Import libraries
import sentencepiece as spm
import os, sys

# Target vocabulary size: how many pieces the trained tokenizer should contain
vocab_size = 4096

# Official notebook #vj30

In [None]:
# If you are running this online (for example on Google Colab),
# make sure you have the support files in the same folder as this notebook.
# Otherwise, run this cell to download them.

# NOTE: Downloading will take a while, be patient. You can refresh your folder from time to time to see when the files
# have been created.

import os, requests, zipfile, io 

# Location of the zip archive that bundles the support files for this notebook
files_url = "https://ideami.com/llm_train"

# Downloading proceeds if we detect that one of the key files to download is not present
if not os.path.exists("encoded_data.pt"):
    print("Downloading files using Python")
    # (connect, read) timeouts: the read timeout bounds the wait between received
    # chunks, not the total transfer time, so a slow-but-alive download still completes
    # while a stalled connection no longer hangs the cell forever.
    response = requests.get(files_url, timeout=(10, 120))
    # Fail with a clear HTTP error instead of handing an error page to ZipFile,
    # which would otherwise surface as a confusing BadZipFile exception.
    response.raise_for_status()
    # The archive is held fully in memory; the context manager closes it after extraction.
    with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
        archive.extractall(".")
else:
    print("you seem to have already downloaded the files. If you wish to re-download them, delete the encoded_data.pt file")


In [2]:
# Training the Sentence Piece Tokenizer

# Train a BPE tokenizer on wiki.txt. The trainer writes its result to
# test_wiki_tokenizer.model and test_wiki_tokenizer.vocab in the working directory.
spm.SentencePieceTrainer.train(
    input='wiki.txt',
    model_prefix="test_wiki_tokenizer",  # pick the name for your trained tokenizer
    model_type="bpe",
    vocab_size=vocab_size,
    self_test_sample_size=0,
    input_format="text",
    character_coverage=0.995,
    # os.cpu_count() returns None when the CPU count cannot be determined;
    # fall back to a single thread so the trainer always receives an integer.
    num_threads=os.cpu_count() or 1,
    split_digits=True,
    allow_whitespace_only_pieces=True,
    byte_fallback=True,
    # NOTE(review): as a raw string this keeps the backslashes literally. The octal
    # escapes \342\201\207 are the UTF-8 bytes of U+2047 (double question mark);
    # confirm whether the literal text or that character is intended.
    unk_surface=r" \342\201\207 ",
    normalization_rule_name="identity",
)

print("Tokenizer training completed")

# The character_coverage parameter specifies the proportion of characters in the training corpus
# that are covered by the tokenizer model. This is important for languages with
# large character sets, such as Japanese or Chinese, where it is impractical to include all characters.
# A character_coverage value of 0.995 means that the tokenizer will include the most frequent characters
# that together account for 99.5% of all character occurrences in the training corpus. The remaining
# rare characters get no piece of their own; because byte_fallback=True is set in the call above, they
# are split into UTF-8 byte pieces instead of being mapped to the unknown token. This helps in managing
# the vocabulary size and ensures that the tokenizer focuses on the most common characters,
# improving efficiency and performance.

# The model_type parameter determines the algorithm used to create the tokenizer model.
# You can choose from several types, including bpe (Byte Pair Encoding), unigram, word, and char.
# BPE stands for Byte Pair Encoding. BPE is a subword tokenization algorithm that iteratively merges
# the most frequent pairs of bytes (or characters) in the corpus to form subwords. This process continues
# until the desired vocabulary size is reached. BPE can split rare words into more frequent subword units,
# improving the model's ability to generalize.

Tokenizer training completed


In [5]:
#########################
## Validate that training was successful

# Load the trained model
# Reload the tokenizer from the model file written by the training step
sp = spm.SentencePieceProcessor(model_file='test_wiki_tokenizer.model')

# Report how many pieces the loaded model contains
vocab_size = sp.get_piece_size()
print(f"SentencePiece vocab_size: {vocab_size}")


def encode(s):
    """Convert a string into a list of token ids using the loaded tokenizer."""
    return sp.Encode(s)


def decode(l):
    """Convert a list of token ids back into a string using the loaded tokenizer."""
    return sp.Decode(l)


# Round-trip a sample sentence: show its token ids, then the text decoded from them
sample_text = "What is a healthy dish that includes strawberry?"
sample_ids = encode(sample_text)
print(sample_ids)
print(decode(sample_ids))

SentencePiece vocab_size: 4096
[1233, 275, 299, 261, 2682, 4049, 297, 460, 392, 2126, 353, 2347, 382, 511, 66]
What is a healthy dish that includes strawberry?
