In [3]:
PRE_TOKENIZER = ["ws", "gpt2", "llama3"]
VOCAB_SIZE = [500, 1000, 2000, 4000, 8000, 16000, 32000, 64000, 128000, 256000, 512000]
FITTING_CORPORA = ["wikipedia", "mixed"]
OUT_PATH = "/shared/3/projects/hiatus/TOKENIZER_wegmann/tokenizer"

In [5]:
def get_name(corpus_name, pre_tokenizer, vocab_size):
    dir_name = f"{OUT_PATH}/{corpus_name}-{pre_tokenizer}-{vocab_size}/tokenizer.json"
    return dir_name

### For mixed, 32k, look at overlap vocabulary pre-tokenizer

In [28]:
tokenizer_paths = [get_name("mixed", "ws", 32000), get_name("mixed", "gpt2", 32000), get_name("mixed", "llama3", 32000)]

In [29]:
tokenizer_paths

['/shared/3/projects/hiatus/TOKENIZER_wegmann/tokenizer/mixed-ws-32000/tokenizer.json',
 '/shared/3/projects/hiatus/TOKENIZER_wegmann/tokenizer/mixed-gpt2-32000/tokenizer.json',
 '/shared/3/projects/hiatus/TOKENIZER_wegmann/tokenizer/mixed-llama3-32000/tokenizer.json']

In [40]:
tokenizer = Tokenizer.from_file(tokenizer_paths[0])
vocab = tokenizer.get_vocab()
sorted_tokens = sorted(vocab.items(), key=lambda item: item[1])

In [41]:
sorted_tokens

[('[UNK]', 0),
 ('[CLS]', 1),
 ('[SEP]', 2),
 ('[PAD]', 3),
 ('[MASK]', 4),
 ('!', 5),
 ('"', 6),
 ('#', 7),
 ('$', 8),
 ('%', 9),
 ('&', 10),
 ("'", 11),
 ('(', 12),
 (')', 13),
 ('*', 14),
 ('+', 15),
 (',', 16),
 ('-', 17),
 ('.', 18),
 ('/', 19),
 ('0', 20),
 ('1', 21),
 ('2', 22),
 ('3', 23),
 ('4', 24),
 ('5', 25),
 ('6', 26),
 ('7', 27),
 ('8', 28),
 ('9', 29),
 (':', 30),
 (';', 31),
 ('<', 32),
 ('=', 33),
 ('>', 34),
 ('?', 35),
 ('@', 36),
 ('A', 37),
 ('B', 38),
 ('C', 39),
 ('D', 40),
 ('E', 41),
 ('F', 42),
 ('G', 43),
 ('H', 44),
 ('I', 45),
 ('J', 46),
 ('K', 47),
 ('L', 48),
 ('M', 49),
 ('N', 50),
 ('O', 51),
 ('P', 52),
 ('Q', 53),
 ('R', 54),
 ('S', 55),
 ('T', 56),
 ('U', 57),
 ('V', 58),
 ('W', 59),
 ('X', 60),
 ('Y', 61),
 ('Z', 62),
 ('[', 63),
 ('\\', 64),
 (']', 65),
 ('^', 66),
 ('_', 67),
 ('`', 68),
 ('a', 69),
 ('b', 70),
 ('c', 71),
 ('d', 72),
 ('e', 73),
 ('f', 74),
 ('g', 75),
 ('h', 76),
 ('i', 77),
 ('j', 78),
 ('k', 79),
 ('l', 80),
 ('m', 81),
 ('n

In [47]:
from tokenizers import Tokenizer
import os

# Load tokenizers and extract vocabularies
vocabularies = {}
for path in tokenizer_paths:
    # Extract the last folder name from the path
    folder_name = os.path.basename(os.path.dirname(path))
    tokenizer = Tokenizer.from_file(path)
    vocab = tokenizer.get_vocab()
    
    # Sort tokens based on their frequency (ID)
    sorted_tokens = sorted(vocab.items(), key=lambda item: item[1])
    
    # Store sorted tokens (by frequency) as a list of token strings
    vocabularies[folder_name] = [token for token, _ in sorted_tokens]

# Calculate truly unique tokens for each tokenizer
unique_tokens = {}
special_unique_tokens = {}

for name, vocab_list in vocabularies.items():
    # Union of vocabularies from the other tokenizers
    others = set.union(*(set(vocabularies[other_name]) for other_name in vocabularies if other_name != name))
    
    # Subtract the other vocabularies from the current one to get truly unique tokens
    unique_tokens[name] = [token for token in vocab_list if token not in others]
    
    # Special condition for tokenizers with "ws" in their name
    if "ws" in name:
        # Tokens not present in others, and not present when prefixed with "Ġ"
        special_unique_tokens[name] = [token for token in unique_tokens[name] if f"Ġ{token}" not in others]

# Function to get examples from sorted tokens (assuming order by frequency)
def get_ordered_examples(tokens):
    n = len(tokens)
    return {
        "most_common": tokens[:10],
        "middle": tokens[n // 2 - 5: n // 2 + 4],
        "least_common": tokens[-10:]
    }

# Prepare the results including token examples with frequency considerations
results_with_special_unique = {
    'vocab_sizes': {name: len(vocab_list) for name, vocab_list in vocabularies.items()},
    'common_token_count_all': len(set.intersection(*(set(vocabularies[name]) for name in vocabularies))),
    'unique_tokens_count': {name: len(tokens) for name, tokens in unique_tokens.items()},
    'unique_tokens_examples': {name: get_ordered_examples(tokens) for name, tokens in unique_tokens.items()},
    'special_unique_tokens_count': {name: len(tokens) for name, tokens in special_unique_tokens.items()},
    'special_unique_tokens_examples': {name: get_ordered_examples(tokens) for name, tokens in special_unique_tokens.items()},
    'pairwise_common_tokens': {f"{folder_names[i]} & {folder_names[j]}": len(set(vocabularies[folder_names[i]]).intersection(set(vocabularies[folder_names[j]]))) for i in range(len(folder_names)) for j in range(i + 1, len(folder_names))}
}

In [48]:
# Print the results
print("Vocabulary Sizes:")
for name, size in results_with_special_unique['vocab_sizes'].items():
    print(f"{name}: {size} tokens")

print(f"\nCommon Tokens Across All: {results_with_special_unique['common_token_count_all']}")

print("\nTruly Unique Tokens with Frequency Examples:")
for name, tokens in unique_tokens.items():
    print(f"{name}: {len(tokens)} unique tokens")
    examples = results_with_special_unique['unique_tokens_examples'][name]
    print(f"Most Common: {examples['most_common']}")
    print(f"Middle: {examples['middle']}")
    print(f"Least Common: {examples['least_common']}")

print("\nSpecial Unique Tokens for 'ws' Tokenizers with Frequency Examples:")
for name, tokens in special_unique_tokens.items():
    print(f"{name}: {len(tokens)} special unique tokens")
    examples = results_with_special_unique['special_unique_tokens_examples'][name]
    print(f"Most Common: {examples['most_common']}")
    print(f"Middle: {examples['middle']}")
    print(f"Least Common: {examples['least_common']}")

print("\nPairwise Common Tokens:")
for pair, count in results_with_special_unique['pairwise_common_tokens'].items():
    print(f"{pair}: {count} common tokens")

Vocabulary Sizes:
mixed-ws-32000: 32000 tokens
mixed-gpt2-32000: 32000 tokens
mixed-llama3-32000: 32000 tokens

Common Tokens Across All: 9576

Truly Unique Tokens with Frequency Examples:
mixed-ws-32000: 20150 unique tokens
Most Common: ['wor', 'fir', 'dif', 'coun', 'htt', 'differ', 'stem', 'blic', 'proble', 'pon']
Middle: ['Holo', 'bridges', 'Bry', 'Proposition', 'mixer', 'rookie', 'Hebrew', 'Subscri', 'enhan']
Least Common: ['informing', 'Gmail', 'lebih', 'ulis', 'braw', 'feminists', "#'", 'leur', 'lya', 'unnoticed']
mixed-gpt2-32000: 1752 unique tokens
Most Common: ['Ġ1', 'Ġ2', 'ĊĠĠĠ', 'ĊĠĠĠĠĠĠĠĠ', 'ĊĠĠĠĠĠĠĠ', 'Ġ0', 'Ġ3', 'ĊĠ', 'Ġ4', 'Ġ5']
Middle: ['ĠMurphy', 'Ġpassions', 'ĠAppendix', 'Ġunsuccessful', 'Ġpsychologist', 'Ġhasht', 'irections', 'Ġdistancing', 'ĠSilicon']
Least Common: ['Ġwil', 'Ġyogurt', 'ĠHinata', 'Ġadminister', 'Ġuno', 'Ġslender', 'Ġomit', 'Ġchords', 'Ġamet', 'Ġurgency']
mixed-llama3-32000: 2954 unique tokens
Most Common: ['.Ċ', '.ĊĊ', ';Ċ', ',Ċ', ')Ċ', '>Ċ', ');Ċ', 

In [None]:
import matplotlib.pyplot as plt
from matplotlib_venn import venn3

# Set sizes and intersections
ws_only = 20150
gpt2_only = 1752
llama3_only = 2954

ws_gpt2_exclusive = 11314
ws_llama3_exclusive = 10112
gpt2_llama3_exclusive = 28510

common_all = 9576

# Create the Venn diagram
venn3(subsets=(ws_only, gpt2_only, ws_gpt2_exclusive, llama3_only, ws_llama3_exclusive, gpt2_llama3_exclusive, common_all),
      set_labels=('ws', 'gpt2', 'llama3'))

plt.title("Venn Diagram of Vocabulary Sizes")
plt.show()

### For gpt2, mixed, but varying vocab size

In [49]:
VOCAB_SIZE = [500, 1000, 2000, 4000, 8000, 16000, 32000, 64000, 128000, 256000, 512000]
tokenizer_paths = [get_name("mixed", "gpt2", v_size) for v_size in VOCAB_SIZE]

In [50]:
tokenizer_paths

['/shared/3/projects/hiatus/TOKENIZER_wegmann/tokenizer/mixed-gpt2-500/tokenizer.json',
 '/shared/3/projects/hiatus/TOKENIZER_wegmann/tokenizer/mixed-gpt2-1000/tokenizer.json',
 '/shared/3/projects/hiatus/TOKENIZER_wegmann/tokenizer/mixed-gpt2-2000/tokenizer.json',
 '/shared/3/projects/hiatus/TOKENIZER_wegmann/tokenizer/mixed-gpt2-4000/tokenizer.json',
 '/shared/3/projects/hiatus/TOKENIZER_wegmann/tokenizer/mixed-gpt2-8000/tokenizer.json',
 '/shared/3/projects/hiatus/TOKENIZER_wegmann/tokenizer/mixed-gpt2-16000/tokenizer.json',
 '/shared/3/projects/hiatus/TOKENIZER_wegmann/tokenizer/mixed-gpt2-32000/tokenizer.json',
 '/shared/3/projects/hiatus/TOKENIZER_wegmann/tokenizer/mixed-gpt2-64000/tokenizer.json',
 '/shared/3/projects/hiatus/TOKENIZER_wegmann/tokenizer/mixed-gpt2-128000/tokenizer.json',
 '/shared/3/projects/hiatus/TOKENIZER_wegmann/tokenizer/mixed-gpt2-256000/tokenizer.json',
 '/shared/3/projects/hiatus/TOKENIZER_wegmann/tokenizer/mixed-gpt2-512000/tokenizer.json']

In [None]:
# Load tokenizers and extract vocabularies
vocabularies = {}
for path in tokenizer_paths:
    # Extract the last folder name from the path
    folder_name = os.path.basename(os.path.dirname(path))
    tokenizer = Tokenizer.from_file(path)
    vocab = tokenizer.get_vocab()
    
    # Sort tokens based on their frequency (ID) and store them
    sorted_tokens = sorted(vocab.items(), key=lambda item: item[1])
    vocabularies[folder_name] = [token for token, _ in sorted_tokens]

# Calculate the overlap for consecutive tokenizers and find newly added tokens
overlap_ratios = {}
added_tokens_examples = {}
folder_names = list(vocabularies.keys())

for i in range(len(folder_names) - 1):
    name_1 = folder_names[i]
    name_2 = folder_names[i + 1]
    print(f"At pair {(name_1, name_2)}")
    
    # Get vocabularies as ordered lists
    vocab_1 = vocabularies[name_1]
    vocab_2 = vocabularies[name_2]
    
    # Calculate the overlap
    overlap = len(set(vocab_1).intersection(set(vocab_2)))
    smaller_vocab_size = min(len(vocab_1), len(vocab_2))
    
    # Calculate the overlap ratio
    overlap_ratio = overlap / smaller_vocab_size
    overlap_ratios[f"{name_1} & {name_2}"] = overlap_ratio
    
    print("Getting newly added tokens.")
    # Find newly added tokens
    added_tokens = [token for token in vocab_2 if token not in vocab_1]
    n = len(added_tokens)
    
    # Get examples of the most frequent, middle, and least frequent added tokens
    examples = {
        "most_frequent": added_tokens[:10],
        "middle": added_tokens[max(0, n//2 - 5):max(0, n//2 + 5)],
        "least_frequent": added_tokens[-10:]
    }
    added_tokens_examples[f"{name_1} -> {name_2}"] = examples

# Print the results
print("Overlap Ratios between Consecutive Tokenizers:")
for pair, ratio in overlap_ratios.items():
    print(f"{pair}: {ratio:.2f}")

print("\nExamples of Added Tokens:")
for transition, examples in added_tokens_examples.items():
    print(f"\nTransition: {transition}")
    print("Most Frequent Added Tokens:", examples['most_frequent'])
    print("Middle Added Tokens:", examples['middle'])
    print("Least Frequent Added Tokens:", examples['least_frequent'])

At pair ('mixed-gpt2-500', 'mixed-gpt2-1000')
Getting newly added tokens.
At pair ('mixed-gpt2-1000', 'mixed-gpt2-2000')
Getting newly added tokens.
At pair ('mixed-gpt2-2000', 'mixed-gpt2-4000')
Getting newly added tokens.
At pair ('mixed-gpt2-4000', 'mixed-gpt2-8000')
Getting newly added tokens.
At pair ('mixed-gpt2-8000', 'mixed-gpt2-16000')
Getting newly added tokens.
At pair ('mixed-gpt2-16000', 'mixed-gpt2-32000')
Getting newly added tokens.
At pair ('mixed-gpt2-32000', 'mixed-gpt2-64000')
Getting newly added tokens.
At pair ('mixed-gpt2-64000', 'mixed-gpt2-128000')
Getting newly added tokens.
At pair ('mixed-gpt2-128000', 'mixed-gpt2-256000')
Getting newly added tokens.
At pair ('mixed-gpt2-256000', 'mixed-gpt2-512000')
Getting newly added tokens.


In [None]:
# figure showing the % overlap w.r.t. the next tokenizer, expectation is at 100% always ??, i.e, larger vocabulary size "jsut adds more"