In [25]:
from tokenizers import Tokenizer, SentencePieceBPETokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
import unicodedata
import re
import pandas as pd

# Initialize a tokenizer: a plain BPE model whose unknown-token symbol is "[UNK]".
# NOTE(review): the BpeTrainer / train_from_iterator lines for this object are
# commented out further down, and the name `tokenizer` is rebound to a
# SentencePieceBPETokenizer later in this cell, so this instance is never
# trained or used within the visible code.
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

# Customize pre-tokenization and normalization
def unicodeToAscii(s):
    """Return ``s`` with combining accent marks removed.

    The string is decomposed with NFD normalization and every code point
    whose Unicode category is ``Mn`` is dropped. Characters that have no
    such decomposition are left untouched, so despite the name the result
    can still contain non-ASCII characters.
    """
    kept = []
    for ch in unicodedata.normalize('NFD', s):
        if unicodedata.category(ch) == 'Mn':
            # Combining mark (accent, diaeresis, ...): skip it.
            continue
        kept.append(ch)
    return ''.join(kept)

def normalizeString(s):
    """Lowercase, de-accent and simplify a sentence before tokenization.

    Steps, in order:
      1. lowercase the text and trim surrounding whitespace;
      2. drop combining accent marks via ``unicodeToAscii``;
      3. insert a space in front of every ``.``, ``!`` or ``?``;
      4. collapse every run of characters other than ASCII letters and
         ``.!?`` (digits, commas, whitespace, ...) into a single space;
      5. trim again, because steps 3 and 4 can themselves leave a space
         at either end (e.g. input that starts with punctuation or
         starts/ends with digits).

    Args:
        s: The input sentence as a ``str``.

    Returns:
        The normalized sentence, with no leading or trailing whitespace.
    """
    s = unicodeToAscii(s.lower().strip())
    # Detach sentence punctuation so it becomes its own whitespace-separated unit.
    s = re.sub(r"([.!?])", r" \1", s)
    # Everything that is not a letter or . ! ? becomes one space.
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    # The substitutions above may have produced edge whitespace; remove it.
    return s.strip()

# tokenizer.pre_tokenizer = Whitespace()

# Load the training data. quoting=3 is the value of csv.QUOTE_NONE, i.e. quote
# characters in the file are read as ordinary text.
df = pd.read_csv('train_2024.csv', quoting=3)

# Normalize every entry of the 'text' column; this is the training corpus.
sentences = list(map(normalizeString, df['text'].tolist()))
print(sentences[0])

# Optional: save the list of sentences into a file, one per line.
# with open('sentences.txt', 'w') as f:
# 	for sentence in sentences:
# 		f.write("%s\n" % sentence)

# Alternative (disabled): train the plain BPE tokenizer with special tokens.
# trainer = BpeTrainer(vocab_size=32000, show_progress=True, special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"])
# tokenizer.train_from_iterator(sentences, trainer)

# Rebind `tokenizer` to a SentencePiece-style BPE tokenizer and train it
# on the normalized sentences.
tokenizer = SentencePieceBPETokenizer()
tokenizer.train_from_iterator(
    sentences,
    vocab_size=30_000,
    min_frequency=2,
    limit_alphabet=500,
    show_progress=True,
)

except that desmond played first base last night . tapia was in lf and reynolds had a night off .





In [26]:
# Round-trip check: normalize a sample sentence the same way as the training
# data, encode it to token ids, then decode the ids back to text.
sentence = normalizeString(
    "Except that Desmond played first base last night. Tapia was in LF and Reynolds playing a night off."
)

encoded = tokenizer.encode(sentence)
decoded = tokenizer.decode(encoded.ids)

# Show the token ids first, then the reconstructed text.
print(encoded.ids)
print(decoded)

[1481, 87, 12549, 3059, 620, 1927, 690, 1869, 35, 29645, 157, 67, 61, 9, 66, 18483, 2353, 32, 1869, 309, 35]
except that desmond played first base last night . tapia was in lf and reynolds playing a night off .


In [5]:
# Round-trip check on a sample sentence that is only lowercased.
# NOTE(review): unlike the training sentences, this input is not passed through
# normalizeString, so punctuation stays attached to the preceding word. The
# recorded execution count of this cell (In [5]) is also lower than that of the
# training cell (In [25]), so its saved output presumably comes from an earlier
# tokenizer -- confirm, and re-run or remove.
sentence = "Except that Desmond played first base last night. Tapia was in LF and Reynolds playing a night off."

encoded = tokenizer.encode(sentence.lower())
decoded = tokenizer.decode(encoded.ids)

# Show the token ids first, then the reconstructed text.
print(encoded.ids)
print(decoded)

[2376, 522, 1095, 8463, 4079, 1206, 2842, 1212, 7621, 10454, 850, 595, 498, 494, 70, 496, 519, 27505, 3192, 463, 2580, 4817]
except that desmond played first base last night. tapia was in lf and reynolds playing a night off.
