In [8]:
import sentencepiece as spm
import os

## SentencePiece algorithm
   * Unlike Byte-Level BPE, it deals with unicode codepoints directly
   * Takes in sentences, (vague) but sequence of unicode codepoints
   * Has an option for byte-fallback in case of rare tokens through, character_coverage parameter.
   * Has a lot of options i.e. settings can be overwhelming, all options [here](https://github.com/google/sentencepiece/blob/master/doc/options.md)

### Data

In [2]:
import requests

def get_wikipedia_page_text(page_title):
    url = f"https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "prop": "extracts",
        "explaintext": True,
        "titles": page_title
    }
    response = requests.get(url, params=params).json()
    page = next(iter(response['query']['pages'].values()))
    return page.get('extract', 'No content found')



page_title = "Andrej Karpathy"
text = get_wikipedia_page_text(page_title)
print(text)


Andrej Karpathy (born 23 October 1986) is a Slovak-Canadian computer scientist who served as the director of artificial intelligence and Autopilot Vision at Tesla. He co-founded and formerly worked at OpenAI, where he specialized in deep learning and computer vision.

Karpathy was born in Bratislava, Czechoslovakia (now Slovakia) and moved with his family to Toronto when he was 15. He completed his Computer Science and Physics bachelor's degrees at University of Toronto in 2009 and his master's degree at University of British Columbia in 2011, where he worked on physically-simulated figures (for example, a simulated runner or a simulated person in a crowd). 
Karpathy received a PhD from Stanford University in 2015 under the supervision of Fei-Fei Li, focusing on the intersection of natural language processing and computer vision, and deep learning models suited for this task.

He authored and was the primary instructor of the first deep learning course at Stanford, CS 231n: Convolution

In [3]:
with open("train_data.txt", "w") as fp:
    fp.write(text)

### Training
 * Let's try to replicate the llama-2 tokenizer for our little train_data.txt.
 * Keep the vocabulary ~= 500
 

In [19]:
# download the Llama 2 tokenizer file

# ! wget "https://download.llamameta.net/*?Policy=eyJTdGF0ZW1lbnQiOlt7InVuaXF1ZV9oYXNoIjoiemVyaHMzZ28zMzQ1ZXAxY2ljdWJodGRhIiwiUmVzb3VyY2UiOiJodHRwczpcL1wvZG93bmxvYWQubGxhbWFtZXRhLm5ldFwvKiIsIkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcyMTg4NjMwNn19fV19&Signature=Ot2xCJY%7EQ-ocr7fkf7IeuIncnhLaj-LH%7EC5N%7EuYSTQWzFeW-hVbe%7EqKCNvAt7NcFCosLWAjECitLyF%7Ej8w7nxQKEEdXcQCpZWqpTqpnGRYkVbKQKLhYY0OIq519SiomJQSw42bcYfISoAsoUTSiPnCq9sQcNAoXUVzNrfT1x8UQqGF8fMnHoULu2sDGuzbDjViWtTXDhkK4kvdNNJ7iAS99KK5n%7EM0AeXU3fgdM0FovwHAtyTl6UUxdOgykJqMPe1zzz1QN2AVC5SPvb5B8HWmm70Gn-CGIUKPyMTy8IuGH4gPKLzjP8jpOt5eRg7gF2NLWqhi-8RMC3hV8ezkBX5w__&Key-Pair-Id=K15QRJLYKIFSLZ&Download-Request-ID=408334535596825/tokenizer.model" -O "./llama2-tokenizer/tokenizer.model"

# DOESNT work for some reason


--2024-07-24 11:31:02--  https://download.llamameta.net/*?Policy=eyJTdGF0ZW1lbnQiOlt7InVuaXF1ZV9oYXNoIjoiemVyaHMzZ28zMzQ1ZXAxY2ljdWJodGRhIiwiUmVzb3VyY2UiOiJodHRwczpcL1wvZG93bmxvYWQubGxhbWFtZXRhLm5ldFwvKiIsIkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcyMTg4NjMwNn19fV19&Signature=Ot2xCJY%7EQ-ocr7fkf7IeuIncnhLaj-LH%7EC5N%7EuYSTQWzFeW-hVbe%7EqKCNvAt7NcFCosLWAjECitLyF%7Ej8w7nxQKEEdXcQCpZWqpTqpnGRYkVbKQKLhYY0OIq519SiomJQSw42bcYfISoAsoUTSiPnCq9sQcNAoXUVzNrfT1x8UQqGF8fMnHoULu2sDGuzbDjViWtTXDhkK4kvdNNJ7iAS99KK5n%7EM0AeXU3fgdM0FovwHAtyTl6UUxdOgykJqMPe1zzz1QN2AVC5SPvb5B8HWmm70Gn-CGIUKPyMTy8IuGH4gPKLzjP8jpOt5eRg7gF2NLWqhi-8RMC3hV8ezkBX5w__&Key-Pair-Id=K15QRJLYKIFSLZ&Download-Request-ID=408334535596825/tokenizer.model
Resolving download.llamameta.net (download.llamameta.net)... 108.159.61.83, 108.159.61.7, 108.159.61.34, ...
Connecting to download.llamameta.net (download.llamameta.net)|108.159.61.83|:443... connected.
HTTP request sent, awaiting response... 403 Forbidden
2024-07-24

In [7]:
# generate the protobuf python file from the sentencepiece_model.proto
# Need the protobuf compiler for this
# need version less than or equal to 3.20.x

# ! sudo apt install protobuf-compiler
%env PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
! protoc --python_out=. "./llama2_tokenizer/sentencepiece_model.proto"

env: PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python


In [4]:
import llama2_tokenizer.sentencepiece_model_pb2 as spm_pb2

def load_model(model_file):
    model = spm_pb2.ModelProto()
    with open(model_file, 'rb') as f:
        model.ParseFromString(f.read())
    return model


model = load_model("./llama2_tokenizer/tokenizer.model")

spec = model.trainer_spec

In [5]:
spec

input: "/large_experiments/theorem/datasets/MERGED/all.test1.merged"
model_prefix: "spm_model_32k_200M_charcov099995_allowWSO__v2"
model_type: BPE
vocab_size: 32000
self_test_sample_size: 0
input_format: "text"
character_coverage: 0.9999499917030334
input_sentence_size: 200000000
seed_sentencepiece_size: 1000000
shrinking_factor: 0.75
num_threads: 80
num_sub_iterations: 2
max_sentence_length: 4192
shuffle_input_sentence: true
max_sentencepiece_length: 16
split_by_unicode_script: true
split_by_whitespace: true
split_by_number: true
treat_whitespace_as_suffix: false
split_digits: true
allow_whitespace_only_pieces: true
vocabulary_output_piece_score: true
hard_vocab_limit: true
use_all_vocab: false
byte_fallback: true
required_chars: ""
unk_id: 0
bos_id: 1
eos_id: 2
pad_id: -1
unk_surface: " \342\201\207 "
unk_piece: "<unk>"
bos_piece: "<s>"
eos_piece: "</s>"
pad_piece: "<pad>"
train_extremely_large_corpus: false
enable_differential_privacy: false
differential_privacy_noise_level: 0.0
dif

In [11]:
# Will try to replicate the LLama2 tokenizer options with less vocab.
# Load the protobuf .model file of the LLama tokenizer.

options = dict(
    # data arguments
    input="train_data.txt",                             # our toy train_data
    input_format='text',
    model_prefix="spm_model_500_base",
    vocab_size=500,                                     # vocab size 
    # normalization parameters
    normalization_rule_name='identity',                 # don't use any normalization
    remove_extra_whitespaces=False,                     # Kind of like .strip(), don't do it
    input_sentence_size=200000000,
    max_sentence_length=4192,
    seed_sentencepiece_size=1000000,
    shuffle_input_sentence=True,
    # rare word treatment
    character_coverage=0.99995,
    byte_fallback=True,                                 # fallback to byte-level BPE for rare codepoints.
    # Rules like using a regex
    split_digits=True,                                  # split by individual digits i.e. 0, 1, 
    split_by_unicode_script=True,                       # split by unicode codepoints 
    split_by_whitespace=True,                           # good'ol whitespace split
    split_by_number=True,                               # kind of using like /p{N} in the regex?
    max_sentencepiece_length=16,
    add_dummy_prefix=True,                              # add a dummy `_` prefix to make `world` & ` world` the same.
    allow_whitespace_only_pieces=True,                  # allow whitespace only tokens.
    # special tokens
    unk_id=0,                                           # specify the unk_id (not used in this case cus of the byte-level fallback?)
    bos_id=1,                                           # begining of sentence. 
    eos_id=2,                                           # end of sentence.
    pad_id=-1,                                          # -1 means don't use.
    # system params
    num_threads=os.cpu_count() - 8                      # full util.                             
    )

# train on the options now
spm.SentencePieceTrainer.train(**options)

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: train_data.txt
  input_format: text
  model_prefix: spm_model_500_base
  model_type: UNIGRAM
  vocab_size: 500
  self_test_sample_size: 0
  character_coverage: 0.99995
  input_sentence_size: 200000000
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 24
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 1
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 1
  required_chars: 
  byte_fallback: 1
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differe

In [13]:
# use the tokenizer
sp = spm.SentencePieceProcessor()
sp.load("spm_model_500_base.model")

vocab = [[sp.id_to_piece(idx), idx] for idx in range(sp.get_piece_size())]


In [14]:
vocab

[['<unk>', 0],
 ['<s>', 1],
 ['</s>', 2],
 ['<0x00>', 3],
 ['<0x01>', 4],
 ['<0x02>', 5],
 ['<0x03>', 6],
 ['<0x04>', 7],
 ['<0x05>', 8],
 ['<0x06>', 9],
 ['<0x07>', 10],
 ['<0x08>', 11],
 ['<0x09>', 12],
 ['<0x0A>', 13],
 ['<0x0B>', 14],
 ['<0x0C>', 15],
 ['<0x0D>', 16],
 ['<0x0E>', 17],
 ['<0x0F>', 18],
 ['<0x10>', 19],
 ['<0x11>', 20],
 ['<0x12>', 21],
 ['<0x13>', 22],
 ['<0x14>', 23],
 ['<0x15>', 24],
 ['<0x16>', 25],
 ['<0x17>', 26],
 ['<0x18>', 27],
 ['<0x19>', 28],
 ['<0x1A>', 29],
 ['<0x1B>', 30],
 ['<0x1C>', 31],
 ['<0x1D>', 32],
 ['<0x1E>', 33],
 ['<0x1F>', 34],
 ['<0x20>', 35],
 ['<0x21>', 36],
 ['<0x22>', 37],
 ['<0x23>', 38],
 ['<0x24>', 39],
 ['<0x25>', 40],
 ['<0x26>', 41],
 ['<0x27>', 42],
 ['<0x28>', 43],
 ['<0x29>', 44],
 ['<0x2A>', 45],
 ['<0x2B>', 46],
 ['<0x2C>', 47],
 ['<0x2D>', 48],
 ['<0x2E>', 49],
 ['<0x2F>', 50],
 ['<0x30>', 51],
 ['<0x31>', 52],
 ['<0x32>', 53],
 ['<0x33>', 54],
 ['<0x34>', 55],
 ['<0x35>', 56],
 ['<0x36>', 57],
 ['<0x37>', 58],
 ['<0x38>', 5