# 🧮 Tokenizer Vocab Coverage Lab – Module 11

This notebook complements your tokenizer experiments with:
- UNK detection
- Vocabulary coverage scoring
- Visualization of token fragmentation

## 🔍 Step 1: Sample Up to 100 Prompt Lines

In [None]:
from pathlib import Path
import random
import json

# Collect the 'instruction' field from every record of the curated JSONL file
# (one JSON object per line).
samples = []
# Explicit UTF-8: without it the platform default encoding is used (for example
# cp1252 on Windows), which can mis-decode or reject non-ASCII prompt text.
with open("data/internal_curated/clean.jsonl", encoding="utf-8") as f:
    for line in f:
        # Tolerate blank lines (such as a trailing newline at end of file),
        # which json.loads would otherwise reject with a JSONDecodeError.
        if not line.strip():
            continue
        item = json.loads(line)
        # A record without an 'instruction' key still raises KeyError on
        # purpose, so malformed data is noticed rather than silently dropped.
        samples.append(item['instruction'])

# Seed the global RNG so the same subset is drawn on every run over the same
# file; take at most 100 prompts, or all of them when the file holds fewer.
random.seed(42)
samples = random.sample(samples, min(100, len(samples)))
print(samples[:2])

## 🔢 Step 2: Compare Token Count per Line

In [None]:
from transformers import AutoTokenizer

# Baseline tokenizer, loaded by its model id.
base_tok = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125M")

# To compare against your own vocabulary, uncomment and point this at your
# tokenizer file:
# from tokenizers import Tokenizer
# custom_tok = Tokenizer.from_file("tokenizers/custom-vocab/tokenizer.json")

# Number of token ids the baseline tokenizer produces for each sampled prompt,
# in the same order as `samples`.
base_counts = [len(token_ids) for token_ids in map(base_tok.encode, samples)]
# custom_counts = [len(custom_tok.encode(prompt).ids) for prompt in samples]

## 📊 Step 3: Visualize Token Counts

In [None]:
import matplotlib.pyplot as plt
# Histogram of per-prompt token counts, drawn on the current axes
# (plt.gca() creates a figure and axes if none exist yet).
ax = plt.gca()
ax.hist(base_counts, bins=20, alpha=0.7, label="Base tokenizer")
# ax.hist(custom_counts, bins=20, alpha=0.7, label="Custom tokenizer")
ax.legend()
ax.set_title("Token Distribution per Prompt")
ax.set_xlabel("Tokens")
ax.set_ylabel("Count")
ax.grid(True)
plt.show()

## 🧾 Step 4: Avg and Fragmentation Summary

In [None]:
def _format_count_summary(counts):
    """Return a one-line min / max / mean summary of per-prompt token counts.

    Args:
        counts: list of integer token counts, one per prompt.

    Returns:
        A string of the form "Min: a | Max: b | Avg: c.cc". When `counts` is
        empty a placeholder line is returned instead, because min()/max()
        raise ValueError and the mean divides by zero with no data.
    """
    if not counts:
        return "Min: n/a | Max: n/a | Avg: n/a (no prompts)"
    return f"Min: {min(counts)} | Max: {max(counts)} | Avg: {sum(counts)/len(counts):.2f}"


print("Base tokenizer:")
print(_format_count_summary(base_counts))

# Uncomment below if comparing
# print("Custom tokenizer:")
# print(_format_count_summary(custom_counts))