## Using SentencePiece to Train a Tokenizer on a mini-batch of data from enwikisource

In [1]:
# Import dependencies
import sentencepiece as sp
import pandas as pd

In [2]:
# Notebook configuration.
# model_prefix: filename stem for the trained tokenizer artifacts
#               (reported later as <model_prefix>.model / <model_prefix>.vocab).
# input_file:   plain-text training corpus, given relative to this notebook.
model_prefix = 'sp_tokenizer_model'
input_file = '../data/enwiki_20240320_minibatch.txt'

In [3]:
# Train a BPE tokenizer on the corpus configured above.
# The options are collected in one mapping so they can be read (and tuned) at a
# glance, then unpacked as keyword arguments for the trainer.
training_options = {
    'input': input_file,
    'model_prefix': model_prefix,
    'vocab_size': 16000,
    'character_coverage': 0.9995,
    'model_type': 'bpe',
}
sp.SentencePieceTrainer.train(**training_options)

print(f'Model and vocabulary have been generated: {model_prefix}.model and {model_prefix}.vocab')

Model and vocabulary have been generated: sp_tokenizer_model.model and sp_tokenizer_model.vocab


## Inspect Vocabulary
---

In [4]:
def load_and_print_vocab_samples(vocab_file, start_index=0, num_samples=10):
    """
    Load vocabulary from a SentencePiece .vocab file and print a specified number of samples
    starting from a specified index.

    Each line of the file contributes one token: the text before the first tab
    character (or the whole line, if it contains no tab). Tokens are printed one
    per line as ``Index <i>: <token>``, where ``<i>`` is the zero-based line number.

    The requested window is clamped to the vocabulary: a negative ``start_index``
    is treated as 0 (it does NOT count from the end), and the window stops at the
    last entry. Nothing is printed when the window is empty (``start_index`` past
    the end, or ``num_samples`` <= 0).

    :param vocab_file: Path to the SentencePiece .vocab file
    :param start_index: Index to start printing samples from
    :param num_samples: Number of vocabulary entries to print
    :return: None; the samples are written to standard output
    :raises OSError: if ``vocab_file`` cannot be opened
    """
    with open(vocab_file, 'r', encoding='utf-8') as f:
        # Iterate the file directly instead of materialising readlines().
        # The newline is stripped first so that a line without a tab does not
        # leave a trailing '\n' inside the token.
        vocab = [line.rstrip('\n').split('\t')[0] for line in f]

    # Clamp both ends of the window. Without the lower clamp a negative
    # start_index would silently wrap around via negative list indexing.
    first = max(start_index, 0)
    last = min(first + num_samples, len(vocab))

    # Print specified samples
    for i in range(first, last):
        print(f'Index {i}: {vocab[i]}')

In [10]:
# Inspect the vocabulary produced by the training cell. The trainer saves its
# output under `model_prefix`, so the path is derived from that variable rather
# than from a separate hard-coded 'models/' location that this notebook never
# creates; this keeps the cell working after Restart Kernel -> Run All.
vocab_file = f'{model_prefix}.vocab'
load_and_print_vocab_samples(vocab_file, start_index=2000, num_samples=1000)

Index 2000: ait
Index 2001: ▁san
Index 2002: ▁wall
Index 2003: lement
Index 2004: ▁former
Index 2005: rodu
Index 2006: ▁hor
Index 2007: fully
Index 2008: ▁ways
Index 2009: ▁pursu
Index 2010: ▁understanding
Index 2011: iles
Index 2012: ▁leave
Index 2013: ▁often
Index 2014: ▁17
Index 2015: io
Index 2016: ijack
Index 2017: ns
Index 2018: ches
Index 2019: ▁dark
Index 2020: ▁move
Index 2021: ulation
Index 2022: ▁months
Index 2023: aff
Index 2024: raft
Index 2025: ▁Convention
Index 2026: ▁conditions
Index 2027: ption
Index 2028: ▁Parties
Index 2029: ▁vol
Index 2030: ource
Index 2031: ▁greater
Index 2032: ories
Index 2033: ▁stat
Index 2034: iron
Index 2035: ▁deliver
Index 2036: ▁commanded
Index 2037: ▁bread
Index 2038: ▁hijack
Index 2039: ▁legisl
Index 2040: ince
Index 2041: ▁places
Index 2042: ▁unless
Index 2043: 3,
Index 2044: gu
Index 2045: ▁women
Index 2046: ▁accordance
Index 2047: ▁Co
Index 2048: ▁pure
Index 2049: ▁receive
Index 2050: use
Index 2051: ires
Index 2052: ▁May
Index 2053: ▁ag