# Phase 1.2: Filter Tokens by Frequency

Filter Korean tokens by corpus frequency (EEVE method: keep only tokens that occur at least 6,000 times).

## Contents
1. Load Tokenizer and Corpus
2. Count Token Frequencies
3. Filter by Threshold
4. Analyze Filtered Tokens
5. Save Filtered Token List

In [None]:
# Setup
import sys
import os
sys.path.append("..")

import sentencepiece as spm
from collections import Counter
import json
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np

# Directories (relative paths, so they resolve against the notebook's working directory)
DATA_DIR = "../data"
RAW_DIR = f"{DATA_DIR}/raw"  # where the raw text corpus is read from
MODEL_DIR = "../models/tokenizer"  # tokenizer model location; outputs of this notebook are also written here

print(f"Model directory: {MODEL_DIR}")

---
## 1. Load Tokenizer and Corpus

In [None]:
# Load the trained SentencePiece tokenizer model stored under MODEL_DIR
model_path = f"{MODEL_DIR}/korean_sp.model"
sp = spm.SentencePieceProcessor()
sp.Load(model_path)

# Report which model file was loaded and how many pieces it defines
print(f"Loaded tokenizer: {model_path}")
print(f"Vocabulary size: {sp.GetPieceSize()}")

In [None]:
# Locate the tokenizer-training corpus and report its on-disk size
corpus_path = f"{RAW_DIR}/korean_corpus_for_tokenizer.txt"

if not os.path.exists(corpus_path):
    # Only a message is printed here; nothing is raised
    print(f"Corpus not found: {corpus_path}")
else:
    size_in_bytes = os.path.getsize(corpus_path)
    file_size_gb = size_in_bytes / (1024**3)
    print(f"Corpus: {corpus_path}")
    print(f"Size: {file_size_gb:.2f} GB")

---
## 2. Count Token Frequencies

In [None]:
# Count how often each SentencePiece piece occurs in the corpus.
# The file is streamed line by line, so the whole corpus is never held in memory.
print("Counting token frequencies...")
print("This may take several minutes for large corpora.")

token_counts = Counter()  # piece -> number of occurrences
total_tokens = 0          # total pieces produced over all non-empty lines
lines_processed = 0       # non-empty lines tokenized so far

with open(corpus_path, "r", encoding="utf-8") as f:
    for line in tqdm(f, desc="Processing corpus"):
        line = line.strip()
        if not line:
            # Blank lines are skipped and do not count as processed
            continue

        # Tokenize
        tokens = sp.EncodeAsPieces(line)
        token_counts.update(tokens)
        total_tokens += len(tokens)
        lines_processed += 1

        # Progress update
        if lines_processed % 500000 == 0:
            # tqdm.write prints through tqdm so the message does not
            # overlap the live progress bar the way a bare print() can
            tqdm.write(f"  Processed {lines_processed:,} lines, {total_tokens:,} tokens")

print(f"\nTotal lines processed: {lines_processed:,}")
print(f"Total tokens: {total_tokens:,}")
print(f"Unique tokens: {len(token_counts):,}")

In [None]:
# List the 50 most frequent pieces with their share of all corpus tokens
print("\nTop 50 most common tokens:")
top_tokens = token_counts.most_common(50)
for piece, occurrences in top_tokens:
    share = occurrences / total_tokens * 100
    print(f"  {piece}: {occurrences:,} ({share:.3f}%)")

In [None]:
# Summary statistics over the per-token occurrence counts
frequencies = list(token_counts.values())

print("\nFrequency distribution:")

# Integer extremes are printed with thousands separators only
for label, value in (("Min", min(frequencies)), ("Max", max(frequencies))):
    print(f"  {label}: {value:,}")

# NumPy statistics are printed with one decimal place
for label, statistic in (("Mean", np.mean), ("Median", np.median), ("Std", np.std)):
    print(f"  {label}: {statistic(frequencies):,.1f}")

---
## 3. Filter by Threshold (EEVE: >= 6000)

In [None]:
# Minimum occurrence count a token needs to be kept (EEVE threshold)
MIN_FREQUENCY = 6000

# Candidate cut-offs to compare against the EEVE value
thresholds = [1000, 3000, 6000, 10000, 20000, 50000]

print("Tokens remaining at different thresholds:")
print(f"{'Threshold':>10} | {'Tokens':>10} | {'% of Vocab':>12}")
print("-" * 40)

for threshold in thresholds:
    # Number of distinct tokens whose count reaches this cut-off
    surviving = len([freq for freq in frequencies if freq >= threshold])
    # Percentage is relative to the number of distinct tokens seen in the corpus
    share = surviving / len(frequencies) * 100
    if threshold == MIN_FREQUENCY:
        marker = " <-- EEVE"
    else:
        marker = ""
    print(f"{threshold:>10,} | {surviving:>10,} | {share:>11.1f}%{marker}")

In [None]:
# Keep every token whose corpus count reaches MIN_FREQUENCY,
# preserving the iteration order of token_counts
filtered_tokens = []
for token, count in token_counts.items():
    if count >= MIN_FREQUENCY:
        filtered_tokens.append(token)

original_total = len(token_counts)
kept_total = len(filtered_tokens)

print(f"\nFiltering with threshold >= {MIN_FREQUENCY:,}")
print(f"Original tokens: {original_total:,}")
print(f"Filtered tokens: {kept_total:,}")
print(f"Removed: {original_total - kept_total:,}")

In [None]:
# Separate Korean tokens from others
def is_korean_token(token, *, include_jamo=False):
    """Return True if the token contains at least one Korean (Hangul) character.

    By default only precomposed Hangul syllables (U+AC00-U+D7A3, "가"-"힣")
    count as Korean. The SentencePiece word-boundary marker "▁" is never
    treated as Korean, so a token consisting only of the marker returns False.

    Args:
        token: The token string to inspect; may be empty.
        include_jamo: When True, standalone Hangul jamo also count as Korean
            (Hangul Jamo, Hangul Compatibility Jamo, and the Jamo Extended-A/B
            blocks), so pieces such as "ㅋㅋ" match. Defaults to False, which
            keeps the syllables-only behaviour.

    Returns:
        True if any character of the token falls in an accepted Hangul range,
        otherwise False.
    """
    ranges = [("가", "힣")]  # Hangul Syllables, U+AC00-U+D7A3
    if include_jamo:
        ranges += [
            ("\u1100", "\u11ff"),  # Hangul Jamo
            ("\u3130", "\u318f"),  # Hangul Compatibility Jamo
            ("\ua960", "\ua97f"),  # Hangul Jamo Extended-A
            ("\ud7b0", "\ud7ff"),  # Hangul Jamo Extended-B
        ]
    # "▁" (U+2581) lies outside every range above, so it needs no stripping
    return any(low <= ch <= high for ch in token for low, high in ranges)

# Partition the filtered tokens in one pass, keeping their original order
filtered_korean_tokens = []
filtered_other_tokens = []
for candidate in filtered_tokens:
    if is_korean_token(candidate):
        filtered_korean_tokens.append(candidate)
    else:
        filtered_other_tokens.append(candidate)

print(f"\nFiltered token breakdown:")
print(f"  Korean tokens: {len(filtered_korean_tokens):,}")
print(f"  Other tokens: {len(filtered_other_tokens):,}")

---
## 4. Analyze Filtered Tokens

In [None]:
# Print the 50 most frequent filtered Korean tokens with their counts
print("Sample filtered Korean tokens (sorted by frequency):")

# Descending by count; the sort is stable, so ties keep their original order
korean_with_freq = sorted(
    ((tok, token_counts[tok]) for tok in filtered_korean_tokens),
    key=lambda pair: pair[1],
    reverse=True,
)

for token, freq in korean_with_freq[:50]:
    print(f"  {token}: {freq:,}")

In [None]:
# Visualize frequency distribution of the filtered Korean tokens.
# Both panels are drawn through pyplot's implicit current-figure/axes state,
# so the order of the statements below matters.
plt.figure(figsize=(12, 5))

# Left panel: histogram of per-token corpus frequencies (50 bins);
# the y-axis (number of tokens per bin) is log-scaled
plt.subplot(1, 2, 1)
plt.hist([token_counts[t] for t in filtered_korean_tokens], bins=50, color='#4ECDC4', edgecolor='white')
plt.xlabel('Token Frequency')
plt.ylabel('Count')
plt.title(f'Frequency Distribution of Filtered Korean Tokens\n(threshold >= {MIN_FREQUENCY:,})')
plt.yscale('log')

# Right panel: cumulative coverage, i.e. the percentage of all filtered-Korean-token
# occurrences accounted for by the k most frequent tokens
plt.subplot(1, 2, 2)
sorted_freqs = sorted([token_counts[t] for t in filtered_korean_tokens], reverse=True)
cumsum = np.cumsum(sorted_freqs) / sum(sorted_freqs) * 100
plt.plot(range(len(cumsum)), cumsum, color='#FF6B6B')
plt.xlabel('Number of Tokens (ranked by frequency)')
plt.ylabel('Cumulative Coverage (%)')
plt.title('Token Coverage')
# Horizontal reference line at 90% coverage
plt.axhline(y=90, color='gray', linestyle='--', label='90% coverage')
plt.legend()

# Write the figure to MODEL_DIR as a PNG, then display it
plt.tight_layout()
plt.savefig(f"{MODEL_DIR}/filtered_token_analysis.png", dpi=150)
plt.show()

print(f"Chart saved to {MODEL_DIR}/filtered_token_analysis.png")

In [None]:
# Spot-check whether common Korean medical terms appear in the filtered vocabulary
medical_terms_ko = [
    "의사", "환자", "병원", "치료", "진단", "증상", "질병", "약물",
    "수술", "검사", "혈액", "심장", "폐", "간", "신장", "뇌",
    "당뇨", "고혈압", "암", "감염", "염증", "통증", "발열", "기침",
]

print("Medical term coverage check:")

# A term counts as covered when it is a substring of at least one filtered
# Korean token (word-boundary marker removed). NOTE: this is a substring
# match, so a one-character term matches any token containing that character.
not_covered = [
    term
    for term in medical_terms_ko
    if not any(term in token.replace("▁", "") for token in filtered_korean_tokens)
]
covered = len(medical_terms_ko) - len(not_covered)

print(f"  Covered: {covered}/{len(medical_terms_ko)} ({covered/len(medical_terms_ko)*100:.1f}%)")
if not_covered:
    print(f"  Not covered: {not_covered}")

---
## 5. Save Filtered Token List

In [None]:
# Write the filtered Korean tokens to a text file, one token per line
filtered_tokens_path = f"{MODEL_DIR}/filtered_korean_tokens.txt"

with open(filtered_tokens_path, "w", encoding="utf-8") as token_file:
    token_file.writelines(f"{token}\n" for token in filtered_korean_tokens)

print(f"Saved {len(filtered_korean_tokens)} filtered Korean tokens to {filtered_tokens_path}")

In [None]:
# Save the corpus frequency of every token that passed the threshold
# (Korean and non-Korean alike) as a JSON object mapping token -> count.
frequencies_path = f"{MODEL_DIR}/token_frequencies.json"

# Membership is tested once per distinct corpus token. Testing against the
# `filtered_tokens` list scans it every time (quadratic overall), so build a
# set once for constant-time lookups. The resulting dict is unchanged.
filtered_token_set = set(filtered_tokens)

token_freq_data = {
    token: count for token, count in token_counts.items()
    if token in filtered_token_set
}

# ensure_ascii=False keeps Korean tokens human-readable in the file
with open(frequencies_path, "w", encoding="utf-8") as f:
    json.dump(token_freq_data, f, ensure_ascii=False, indent=2)

print(f"Saved token frequencies to {frequencies_path}")

In [None]:
# Record the filtering parameters, resulting counts, and output file paths
summary_path = f"{MODEL_DIR}/filter_summary.json"

output_files = {
    "filtered_tokens": filtered_tokens_path,
    "frequencies": frequencies_path,
}
filter_summary = {
    "min_frequency_threshold": MIN_FREQUENCY,
    "total_corpus_tokens": total_tokens,
    "unique_tokens_before": len(token_counts),
    "filtered_tokens_total": len(filtered_tokens),
    "filtered_korean_tokens": len(filtered_korean_tokens),
    "filtered_other_tokens": len(filtered_other_tokens),
    "files": output_files,
}

# Serialize once; the same text is written to disk and echoed below
summary_text = json.dumps(filter_summary, indent=2)
with open(summary_path, "w", encoding="utf-8") as summary_file:
    summary_file.write(summary_text)

divider = "=" * 60
print("\n" + divider)
print("Token Filtering Summary")
print(divider)
print(summary_text)

In [None]:
# Closing banner: headline result plus a pointer to the next notebook
banner = "=" * 60
print("\n" + banner)
print("Token Filtering Complete!")
print(banner)
print(f"\nFiltered Korean tokens: {len(filtered_korean_tokens):,}")
print(f"Saved to: {filtered_tokens_path}")
print("\nNext steps:")
print("  1. Run 03_merge_tokenizers.ipynb to merge with MedGemma tokenizer")