## Using SentencePiece to Train a Tokenizer on a mini-batch of data from enwikisource

In [1]:
# Import dependencies
import sentencepiece as sp
import pandas as pd

In [2]:
# Paths used by the cells below: the prefix under which the trained tokenizer
# files are written, and the plain-text corpus the tokenizer is trained on.
model_prefix = '../models/sptokenizer_256'
input_file = '../../data/enwiki_20240320_minibatch.txt'

In [3]:
# Fit a BPE tokenizer with a 256-entry vocabulary on the corpus at `input_file`.
# The trainer is given `model_prefix` as the base path for the files it produces.
sp.SentencePieceTrainer.train(
    input=input_file,
    model_prefix=model_prefix,
    vocab_size=256,
    character_coverage=0.9995,
    model_type='bpe',
)

# Report where the artefacts were written.
print(f'Model and vocabulary have been generated: {model_prefix}.model and {model_prefix}.vocab')

Model and vocabulary have been generated: ../models/sptokenizer_256.model and ../models/sptokenizer_256.vocab


## Inspect Vocabulary
---

In [4]:
def load_and_print_vocab_samples(vocab_file, start_index=0, num_samples=10):
    """
    Load vocabulary from a SentencePiece .vocab file and print a specified number of samples
    starting from a specified index.

    Each line of the file is treated as tab-separated and only the first field
    (the token) is kept. Every printed line has the form ``Index <i>: <token>``,
    where ``<i>`` is the zero-based line number of the token in the file.

    :param vocab_file: Path to the SentencePiece .vocab file
    :param start_index: Index to start printing samples from. Negative values are
        treated as 0 so the output never wraps around to the end of the vocabulary.
        A value past the last entry prints nothing.
    :param num_samples: Number of vocabulary entries to print. Fewer are printed
        if the vocabulary ends first; a value <= 0 prints nothing.
    :return: None. The selected entries are written to stdout.
    """
    with open(vocab_file, 'r', encoding='utf-8') as f:
        # Iterate the file lazily instead of materializing readlines(), and strip
        # the newline so a line without a tab does not keep a trailing '\n'.
        vocab = [line.rstrip('\n').split('\t', 1)[0] for line in f]

    # Clamp the requested window to the valid index range [0, len(vocab)].
    # Without the lower clamp a negative start_index would index from the end.
    first = max(start_index, 0)
    last = min(first + num_samples, len(vocab))

    # Print specified samples (the range is empty when last <= first)
    for i in range(first, last):
        print(f'Index {i}: {vocab[i]}')

In [6]:
# Build the vocabulary path from `model_prefix` rather than repeating the literal:
# the training cell reports its output as `{model_prefix}.vocab`, so deriving the
# path keeps this cell pointing at the file that was just trained.
vocab_file = f'{model_prefix}.vocab'
load_and_print_vocab_samples(vocab_file, start_index=0, num_samples=256)

Index 0: <unk>
Index 1: <s>
Index 2: </s>
Index 3: ▁t
Index 4: he
Index 5: ▁a
Index 6: ▁the
Index 7: in
Index 8: ▁o
Index 9: re
Index 10: nd
Index 11: ▁s
Index 12: ▁w
Index 13: er
Index 14: at
Index 15: on
Index 16: ▁of
Index 17: it
Index 18: ▁b
Index 19: is
Index 20: en
Index 21: ▁and
Index 22: ▁c
Index 23: ▁f
Index 24: ▁m
Index 25: es
Index 26: or
Index 27: ed
Index 28: ▁p
Index 29: ▁h
Index 30: ou
Index 31: ▁th
Index 32: ▁to
Index 33: ▁in
Index 34: al
Index 35: an
Index 36: ing
Index 37: ar
Index 38: ▁d
Index 39: ion
Index 40: ic
Index 41: ▁n
Index 42: as
Index 43: le
Index 44: ▁be
Index 45: om
Index 46: ll
Index 47: ent
Index 48: ▁I
Index 49: ▁e
Index 50: ▁l
Index 51: ▁re
Index 52: ve
Index 53: ot
Index 54: ▁u
Index 55: st
Index 56: se
Index 57: ▁A
Index 58: ▁that
Index 59: ut
Index 60: ▁g
Index 61: ce
Index 62: ▁T
Index 63: ▁he
Index 64: ct
Index 65: ▁for
Index 66: id
Index 67: ▁wh
Index 68: ly
Index 69: im
Index 70: ro
Index 71: ur
Index 72: ▁ha
Index 73: ld
Index 74: ▁is
Index 7