# 🔤 ExtendedTokenizer - Train 300K BPE Tokenizer

This notebook trains a 300,000 token BPE tokenizer on Simple English Wikipedia.

**Estimated time:** ~1-2 hours on free Colab

**Requirements:** None (free Colab works fine)

## 1. Install Dependencies

In [None]:
!pip install -q git+https://github.com/marcelo-earth/extended-tokenizer.git
!pip install -q huggingface_hub

## 2. Download Simple English Wikipedia

In [None]:
import os
import requests
from tqdm import tqdm

# Simple English Wikipedia dump URL
DUMP_URL = "https://dumps.wikimedia.org/simplewiki/latest/simplewiki-latest-pages-articles.xml.bz2"
OUTPUT_DIR = "data"
DUMP_PATH = f"{OUTPUT_DIR}/simplewiki-latest-pages-articles.xml.bz2"
# The download is written here first and only renamed to DUMP_PATH once it
# has finished, so an interrupted run never leaves a truncated file that the
# "already exists" check below would mistake for a complete dump.
PARTIAL_PATH = f"{DUMP_PATH}.part"
# (connect, read) timeouts in seconds: a stalled connection raises instead of
# hanging the cell indefinitely.
REQUEST_TIMEOUT = (10, 60)

os.makedirs(OUTPUT_DIR, exist_ok=True)

if os.path.exists(DUMP_PATH):
    print(f"File already exists: {DUMP_PATH}")
else:
    print("Downloading Simple English Wikipedia...")
    print(f"URL: {DUMP_URL}")

    # The context manager releases the streamed connection even on error.
    with requests.get(DUMP_URL, stream=True, timeout=REQUEST_TIMEOUT) as response:
        response.raise_for_status()

        # A missing content-length header becomes None, which tqdm renders as
        # an open-ended counter instead of a bar with a bogus total of 0.
        total_size = int(response.headers.get("content-length", 0)) or None

        with open(PARTIAL_PATH, "wb") as f:
            with tqdm(total=total_size, unit="B", unit_scale=True, desc="Downloading") as pbar:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
                    pbar.update(len(chunk))

    # Publish the finished file under its final name in one step.
    os.replace(PARTIAL_PATH, DUMP_PATH)

    print(f"\nDownloaded to {DUMP_PATH}")

# Show file size
size_mb = os.path.getsize(DUMP_PATH) / (1024 * 1024)
print(f"File size: {size_mb:.1f} MB")

## 3. Train the Tokenizer

This will:
1. Extract and preprocess articles from the Wikipedia dump
2. Train BPE with 300,000 vocabulary size
3. Save the vocabulary files

In [None]:
from extended_tokenizer.trainer import BPETrainer
from extended_tokenizer.data.wikipedia import stream_wikipedia_texts

# Training settings. OUTPUT_PATH is also read by the save, test, upload and
# archive cells further down.
VOCAB_SIZE = 300_000
MIN_FREQUENCY = 2
OUTPUT_PATH = "vocab/bpe_300k"

# Banner, then the settings in use, then one blank line.
print("=" * 60, "BPE Tokenizer Training", "=" * 60, sep="\n")
print(
    f"\nVocab size: {VOCAB_SIZE:,}",
    f"Min frequency: {MIN_FREQUENCY}",
    f"Output: {OUTPUT_PATH}",
    "",
    sep="\n",
)

# Build the trainer from the settings above.
trainer = BPETrainer(vocab_size=VOCAB_SIZE, min_frequency=MIN_FREQUENCY)

# Read article texts from the dump fetched in the download cell.
print("Loading Wikipedia articles...")
corpus = stream_wikipedia_texts(
    DUMP_PATH,
    max_articles=None,  # no cap: use all articles
    min_length=200,  # presumably filters out short articles; confirm units in stream_wikipedia_texts
    show_progress=True,
)

# Run the training itself.
print("\nStarting BPE training...")
print("This may take 1-2 hours.\n")

vocab = trainer.train(corpus, show_progress=True)

# Closing banner and a summary of the resulting vocabulary.
print("\n" + "=" * 60, "Training Complete!", "=" * 60, sep="\n")
print(f"Final vocabulary size: {vocab.vocab_size:,}")
print(f"Number of merges: {vocab.num_merges:,}")

## 4. Save Vocabulary

In [None]:
import os

# Make sure the destination directory exists, then let the trainer write
# its vocabulary files into it.
os.makedirs(OUTPUT_PATH, exist_ok=True)
trainer.save(OUTPUT_PATH)

print(f"Vocabulary saved to {OUTPUT_PATH}/")
print()
print("Files created:")

# List every entry in the output directory with its size in MiB.
for filename in os.listdir(OUTPUT_PATH):
    file_bytes = os.path.getsize(os.path.join(OUTPUT_PATH, filename))
    file_mb = file_bytes / (1024 * 1024)
    print(f"  - {filename}: {file_mb:.2f} MB")

## 5. Test the Tokenizer

In [None]:
from extended_tokenizer import ExtendedTokenizer

# Load the trained tokenizer from the directory written by the save cell.
tokenizer = ExtendedTokenizer(vocab_path=OUTPUT_PATH)

print(f"Loaded tokenizer with {tokenizer.vocab_size:,} tokens")
print()

# Round-trip samples: plain ASCII plus non-Latin script and emoji, so that
# multi-byte UTF-8 input is exercised as well.
# The non-ASCII literals were previously stored as mojibake (UTF-8 bytes
# decoded as cp1252); they are restored to the intended characters here.
test_texts = [
    "Hello, world!",
    "The quick brown fox jumps over the lazy dog.",
    "Machine learning is a subset of artificial intelligence.",
    "Python is a programming language.",
    "日本語テスト",  # Japanese
    "🎉 Emoji test! 🚀",
]

print("Encoding tests:")
print("-" * 60)

for text in test_texts:
    tokens = tokenizer.encode(text)
    decoded = tokenizer.decode(tokens)
    # A check mark means decode(encode(text)) reproduced the input exactly.
    status = "✓" if decoded == text else "✗"
    print(f"{status} '{text}'")
    # Show at most the first 10 tokens, with an ellipsis when truncated.
    print(f"   Tokens: {len(tokens)} -> {tokens[:10]}{'...' if len(tokens) > 10 else ''}")
    print()

## 6. Upload to HuggingFace Hub (Optional)

Run this cell to upload the trained vocabulary to HuggingFace Hub.

In [None]:
# Authenticate with the HuggingFace Hub before the upload cell is run.
# NOTE(review): HfApi is not used in this cell; the upload cell imports it again.
from huggingface_hub import login, HfApi

# Called with no arguments; this is expected to prompt interactively for
# your HuggingFace access token.
login()

In [None]:
from huggingface_hub import HfApi

# Configuration - change this to a repository you have write access to
REPO_ID = "marcelo-earth/extended-tokenizer-300k"

api = HfApi()

# Create the repo if it doesn't exist. This step is best-effort: a failure is
# reported and the upload below is still attempted, in case the repository
# already exists and is writable.
try:
    api.create_repo(repo_id=REPO_ID, exist_ok=True)
    print(f"Repository ready: https://huggingface.co/{REPO_ID}")
except Exception as e:
    print(f"Note: {e}")

# Upload everything in the vocabulary directory to the model repository.
print("\nUploading files...")
api.upload_folder(
    folder_path=OUTPUT_PATH,
    repo_id=REPO_ID,
    repo_type="model",
)

# The check mark was previously stored as mojibake; restored to the intended character.
print("\n✓ Upload complete!")
print(f"View at: https://huggingface.co/{REPO_ID}")

## 7. Download Vocab Files (Alternative)

If you don't want to upload to HuggingFace, you can download the files directly.

In [None]:
from google.colab import files
import shutil

# Bundle the contents of the vocabulary directory into
# extended_tokenizer_300k.zip in the current working directory.
shutil.make_archive(
    base_name="extended_tokenizer_300k",
    format="zip",
    root_dir=OUTPUT_PATH,
)

# Hand the archive to the Colab file-download helper.
files.download("extended_tokenizer_300k.zip")
print("\nDownload started!")