#### Tokenizer Training

In [11]:
from tokenizers.implementations import ByteLevelBPETokenizer
import os
import json
from transformers import BertTokenizer, AutoModel, AutoTokenizer
import kagglehub
import pandas as pd
import tqdm

# Step 0: Hyperparameters
vocab_size = 30_000
min_frequency = 2

# Step 1: Download and load the dataset
print("Downloading dataset...")
path = kagglehub.dataset_download("rmisra/news-category-dataset")
dataset_path = f"{path}/News_Category_Dataset_v3.json"

print("Loading dataset...")
with open(dataset_path, "r", encoding="utf-8") as f:
    raw_data = [json.loads(line) for line in f]

# Preprocess the dataset
data_df = pd.DataFrame(raw_data)
# Filter to a specific category
data_df = data_df[data_df["category"] == "POLITICS"]
# data_df = data_df.sample(50000, random_state=42)
text_data = data_df["headline"].tolist() + data_df["short_description"].tolist()

print(f"Number of samples: {len(text_data)}")

# Save text data for tokenizer training
text_file_path = "tokenizer_training_corpus.txt"
with open(text_file_path, "w", encoding="utf-8") as f:
    for text in text_data:
        f.write(f"{text}\n")

# Step 2: Train the tokenizer
print("Training tokenizer...")
pretrained_model = "bert-large-uncased"  # Replace with your desired model
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)

def dataset_iterator(data):
    for line in data:
        yield line

new_tokenizer = tokenizer.train_new_from_iterator(
    dataset_iterator(text_data),
    vocab_size=vocab_size
)

# Save tokenizer files
tokenizer_dir = "archive/custom_tokenizer"
os.makedirs(tokenizer_dir, exist_ok=True)
new_tokenizer.save_pretrained(tokenizer_dir)

# Save config.json for BertTokenizer
config = {
    "model_type": "bert",
    "tokenizer_class": "BertTokenizer",
    "vocab_size": vocab_size,
    "do_lower_case": True,
}
with open(os.path.join(tokenizer_dir, "config.json"), "w") as f:
    json.dump(config, f, indent=4)

# Generate vocab.txt for BertTokenizer
vocab_path = os.path.join(tokenizer_dir, "vocab.txt")
with open(vocab_path, "w", encoding="utf-8") as f:
    vocab = tokenizer.get_vocab()
    for token, index in sorted(vocab.items(), key=lambda x: x[1]):
        f.write(f"{token}\n")

# Step 3: Load the tokenizer
print("Loading tokenizer...")
try:
    tokenizer = BertTokenizer.from_pretrained(tokenizer_dir)
    print("Tokenizer loaded successfully: ", tokenizer)
except Exception as e:
    print("Error loading tokenizer:", e)

# Step 4: Load SBERT model
model = AutoModel.from_pretrained("sentence-transformers/bert-base-nli-mean-tokens")
print("Model loaded successfully.")

Downloading dataset...
Loading dataset...
Number of samples: 71204
Training tokenizer...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Loading tokenizer...
Tokenizer loaded successfully:  BertTokenizer(name_or_path='archive/custom_tokenizer', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
Model loaded successfully.


#### Tokenizer Coverage

In [12]:
from collections import Counter

# Tokenize the dataset
all_text = " ".join(text_data)
tokenized_output = tokenizer.tokenize(all_text)

# Check unique tokens
unique_tokens = Counter(tokenized_output)
print(f"Number of unique tokens: {len(unique_tokens)}")

# Compare coverage
total_words = len(all_text.split())
coverage = len(tokenized_output) / total_words
print(f"Tokenizer coverage: {coverage:.2f}")

Number of unique tokens: 19600
Tokenizer coverage: 1.29


#### Sample Tokenization

In [13]:
sample_texts = text_data[:5]
for text in sample_texts:
    tokens = tokenizer.tokenize(text)
    print(f"Original: {text}")
    print(f"Tokens: {tokens}")

Original: Biden Says U.S. Forces Would Defend Taiwan If China Invaded
Tokens: ['bid', '##en', 'says', 'u', '.', 's', '.', 'forces', 'would', 'defend', 'taiwan', 'if', 'china', 'invaded']
Original: ‘Beautiful And Sad At The Same Time’: Ukrainian Cultural Festival Takes On A Deeper Meaning This Year
Tokens: ['‘', 'beautiful', 'and', 'sad', 'at', 'the', 'same', 'time', '’', ':', 'ukrainian', 'cultural', 'festival', 'takes', 'on', 'a', 'deeper', 'meaning', 'this', 'year']
Original: Biden Says Queen's Death Left 'Giant Hole' For Royal Family
Tokens: ['bid', '##en', 'says', 'queen', "'", 's', 'death', 'left', "'", 'giant', 'hole', "'", 'for', 'royal', 'family']
Original: Bill To Help Afghans Who Escaped Taliban Faces Long Odds In The Senate
Tokens: ['bill', 'to', 'help', 'afghan', '##s', 'who', 'escaped', 'taliban', 'faces', 'long', 'odds', 'in', 'the', 'senate']
Original: Mark Meadows Complies With Justice Dept. Subpoena: Report
Tokens: ['mark', 'meadows', 'com', '##pl', '##ies', 'with', 'j

#### Sentence-BERT Tokenizer

In [19]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sentence_transformers import SentenceTransformer

# Load Sentence-BERT and custom tokenizer
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
tokenizer = BertTokenizer.from_pretrained(tokenizer_dir)

# Example input using the custom tokenizer
text = "Donald Trump recently visited the UK."
inputs = tokenizer(text, return_tensors="pt")
print(inputs)

decoded_text = tokenizer.decode(inputs['input_ids'][0], skip_special_tokens=True)
print(decoded_text)


{'input_ids': tensor([[   2, 6221, 8398, 3728, 4716, 1996, 2866, 1012,    3]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}
donald trump recently visited the uk .
