In [1]:
from transformers import AutoTokenizer

# Load the pretrained "gpt2" tokenizer; later cells only use its pre-tokenizer.
# The progress bars recorded below appear to be the tokenizer files being fetched.
tokenizer = AutoTokenizer.from_pretrained("gpt2")



tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [2]:
# Toy corpus: four English sentences that the following cells use to count
# words and to learn BPE merges.
corpus = [
    "This is the Hugging Face Course.",
    "This chapter is about tokenization.",
    "This section shows several tokenizer algorithms.",
    "Hopefully, you will be able to understand how they are trained and generate tokens.",
]

In [3]:
from collections import defaultdict

# Count how often each pre-tokenized word occurs across the whole corpus.
word_freqs = defaultdict(int)

pre_tokenize = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str
for sentence in corpus:
    # pre_tokenize yields (word, offsets) pairs; only the word is counted.
    for token, _span in pre_tokenize(sentence):
        word_freqs[token] += 1

print(word_freqs)

defaultdict(<class 'int'>, {'This': 3, 'Ġis': 2, 'Ġthe': 1, 'ĠHugging': 1, 'ĠFace': 1, 'ĠCourse': 1, '.': 4, 'Ġchapter': 1, 'Ġabout': 1, 'Ġtokenization': 1, 'Ġsection': 1, 'Ġshows': 1, 'Ġseveral': 1, 'Ġtokenizer': 1, 'Ġalgorithms': 1, 'Hopefully': 1, ',': 1, 'Ġyou': 1, 'Ġwill': 1, 'Ġbe': 1, 'Ġable': 1, 'Ġto': 1, 'Ġunderstand': 1, 'Ġhow': 1, 'Ġthey': 1, 'Ġare': 1, 'Ġtrained': 1, 'Ġand': 1, 'Ġgenerate': 1, 'Ġtokens': 1})


In [4]:
# Baseline for comparison: word counts from a plain whitespace split.
# Punctuation stays attached to the neighbouring word in this version.
kelime_frekanslari = {}

for sentence in corpus:
    for token in sentence.split():
        kelime_frekanslari.setdefault(token, 0)
        kelime_frekanslari[token] += 1

print(kelime_frekanslari)

{'This': 3, 'is': 2, 'the': 1, 'Hugging': 1, 'Face': 1, 'Course.': 1, 'chapter': 1, 'about': 1, 'tokenization.': 1, 'section': 1, 'shows': 1, 'several': 1, 'tokenizer': 1, 'algorithms.': 1, 'Hopefully,': 1, 'you': 1, 'will': 1, 'be': 1, 'able': 1, 'to': 1, 'understand': 1, 'how': 1, 'they': 1, 'are': 1, 'trained': 1, 'and': 1, 'generate': 1, 'tokens.': 1}


In [5]:
# Base alphabet: every distinct character seen in the pre-tokenized words,
# in sorted order.
alphabet = sorted({letter for word in word_freqs for letter in word})

print(alphabet)

[',', '.', 'C', 'F', 'H', 'T', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y', 'z', 'Ġ']


In [8]:
# Initial vocabulary: the special end-of-text token followed by the base alphabet.
vocab = ["<|endoftext|>", *alphabet]


In [11]:
# Start with every word split into its individual characters.
splits = {word: list(word) for word in word_freqs}


In [12]:
def compute_pair_freqs(splits):
    """Count adjacent symbol pairs, weighted by word frequency.

    Args:
        splits: Mapping from each word to its current list of symbols.

    Returns:
        A ``defaultdict(int)`` mapping ``(left, right)`` symbol pairs to the
        summed frequency of the words they occur in. Word frequencies are
        read from the module-level ``word_freqs``.
    """
    pair_freqs = defaultdict(int)
    for word, freq in word_freqs.items():
        symbols = splits[word]
        # A single-symbol word yields no pairs, so zip() is simply empty.
        for left, right in zip(symbols, symbols[1:]):
            pair_freqs[left, right] += freq
    return pair_freqs

In [13]:
pair_freqs = compute_pair_freqs(splits)

# Preview the first six pairs, in insertion order.
for pair, count in list(pair_freqs.items())[:6]:
    print(f"{pair}: {count}")

('T', 'h'): 3
('h', 'i'): 3
('i', 's'): 5
('Ġ', 'i'): 2
('Ġ', 't'): 7
('t', 'h'): 3


In [14]:
# Pick the most frequent pair; on ties the first pair encountered wins.
# With no pairs at all, the placeholders "" / None are kept.
best_pair = max(pair_freqs, key=pair_freqs.get, default="")
max_freq = pair_freqs[best_pair] if pair_freqs else None

print(best_pair, max_freq)

('Ġ', 't') 7


In [15]:
# Record the first merge rule (the most frequent pair found above) and add the
# merged token to the vocabulary.
# Note: re-running this cell appends "Ġt" to `vocab` again.
merges = {("Ġ", "t"): "Ġt"}
vocab.append("Ġt")

In [16]:
def merge_pair(a, b, splits):
    """Merge every adjacent ``a, b`` occurrence into the single symbol ``a + b``.

    Args:
        a: Left symbol of the pair.
        b: Right symbol of the pair.
        splits: Mapping from word to its list of symbols; updated in place
            for every word in the module-level ``word_freqs``.

    Returns:
        The same ``splits`` mapping, after the merge has been applied.
    """
    merged = a + b
    for word in word_freqs:
        symbols = splits[word]
        pos = 0
        # With fewer than two symbols the loop body never runs.
        while pos + 1 < len(symbols):
            if (symbols[pos], symbols[pos + 1]) == (a, b):
                # Replace the pair; stay at `pos` so the scan resumes there.
                symbols = [*symbols[:pos], merged, *symbols[pos + 2:]]
            else:
                pos += 1
        splits[word] = symbols
    return splits

In [17]:
# Apply the first merge to every word, then spot-check one word that starts
# with the merged pair.
splits = merge_pair("Ġ", "t", splits)
print(splits["Ġtrained"])

['Ġt', 'r', 'a', 'i', 'n', 'e', 'd']


In [43]:
vocab_size = 100

# Keep learning merges until the vocabulary reaches the target size.
while len(vocab) < vocab_size:
    pair_freqs = compute_pair_freqs(splits)
    if not pair_freqs:
        # No adjacent pairs are left to merge. Without this guard the loop
        # would call merge_pair(*"", splits) and fail with a TypeError.
        break
    # Most frequent pair; on ties max() keeps the first pair encountered.
    best_pair = max(pair_freqs, key=pair_freqs.get)
    max_freq = pair_freqs[best_pair]
    splits = merge_pair(*best_pair, splits)
    merges[best_pair] = best_pair[0] + best_pair[1]
    vocab.append(best_pair[0] + best_pair[1])

In [44]:
# Show every merge rule learned so far; dict order is the order they were added.
print(merges)


{('Ġ', 't'): 'Ġt', ('i', 's'): 'is', ('e', 'r'): 'er', ('Ġ', 'a'): 'Ġa', ('Ġt', 'o'): 'Ġto', ('e', 'n'): 'en', ('T', 'h'): 'Th', ('Th', 'is'): 'This', ('o', 'u'): 'ou', ('s', 'e'): 'se', ('Ġto', 'k'): 'Ġtok', ('Ġtok', 'en'): 'Ġtoken', ('n', 'd'): 'nd', ('Ġ', 'is'): 'Ġis', ('Ġt', 'h'): 'Ġth', ('Ġth', 'e'): 'Ġthe', ('i', 'n'): 'in', ('Ġa', 'b'): 'Ġab', ('Ġtoken', 'i'): 'Ġtokeni', ('Ġtokeni', 'z'): 'Ġtokeniz', ('a', 't'): 'at', ('i', 'o'): 'io', ('io', 'n'): 'ion', ('Ġ', 'se'): 'Ġse', ('h', 'o'): 'ho', ('ho', 'w'): 'how', ('l', 'l'): 'll', ('Ġ', 'H'): 'ĠH', ('ĠH', 'u'): 'ĠHu', ('ĠHu', 'g'): 'ĠHug', ('ĠHug', 'g'): 'ĠHugg', ('ĠHugg', 'in'): 'ĠHuggin', ('ĠHuggin', 'g'): 'ĠHugging', ('Ġ', 'F'): 'ĠF', ('ĠF', 'a'): 'ĠFa', ('ĠFa', 'c'): 'ĠFac', ('ĠFac', 'e'): 'ĠFace', ('Ġ', 'C'): 'ĠC', ('ĠC', 'ou'): 'ĠCou', ('ĠCou', 'r'): 'ĠCour', ('ĠCour', 'se'): 'ĠCourse', ('Ġ', 'c'): 'Ġc', ('Ġc', 'h'): 'Ġch', ('Ġch', 'a'): 'Ġcha', ('Ġcha', 'p'): 'Ġchap', ('Ġchap', 't'): 'Ġchapt', ('Ġchapt', 'er'): 'Ġchapter',

In [49]:
# Show the vocabulary: special token, base alphabet, then merged tokens in the
# order they were appended.
print(vocab)

['<|endoftext|>', ',', '.', 'C', 'F', 'H', 'T', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y', 'z', 'Ġ', 'Ġt', 'is', 'er', 'Ġa', 'Ġto', 'en', 'Th', 'This', 'ou', 'se', 'Ġtok', 'Ġtoken', 'nd', 'Ġis', 'Ġth', 'Ġthe', 'in', 'Ġab', 'Ġtokeni', 'Ġtokeniz', 'at', 'io', 'ion', 'Ġse', 'ho', 'how', 'll', 'ĠH', 'ĠHu', 'ĠHug', 'ĠHugg', 'ĠHuggin', 'ĠHugging', 'ĠF', 'ĠFa', 'ĠFac', 'ĠFace', 'ĠC', 'ĠCou', 'ĠCour', 'ĠCourse', 'Ġc', 'Ġch', 'Ġcha', 'Ġchap', 'Ġchapt', 'Ġchapter', 'Ġabou', 'Ġabout', 'Ġtokenizat', 'Ġtokenization', 'Ġsec', 'Ġsect', 'Ġsection', 'Ġs', 'Ġshow', 'Ġshows', 'Ġsev', 'Ġsever', 'Ġsevera', 'Ġseveral', 'Ġtokenizer', 'Ġal', 'Ġalg', 'Ġalgo', 'Ġalgor', 'Ġalgori', 'Ġalgorit', 'Ġalgorith', 'Ġalgorithm', 'Ġalgorithms', 'Ho', 'Hop', 'Hope', 'Hopef', 'Hopefu', 'Hopefull', 'Hopefully', 'Ġy', 'Ġyou', 'Ġw', 'Ġwi', 'Ġwill', 'Ġb', 'Ġbe', 'Ġabl', 'Ġable', 'Ġu', 'Ġund', 'Ġunder', 'Ġunders', 'Ġunderst', 'Ġundersta', 'Ġunderstand', 'Ġhow', '

In [48]:
# Inspect the pre-tokenizer output for a sentence that is not in the corpus,
# next to the learned merges and the current per-word splits.
# Uses the public `backend_tokenizer` accessor, as the word-counting cell does,
# instead of the private `_tokenizer` attribute.
pre_tokenize_result = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str("Ahmet nasilsin?")
pre_tokenized_text = [word for word, offset in pre_tokenize_result]

pre_tokenize_result, pre_tokenized_text, merges, splits

([('Ahmet', (0, 5)), ('Ġnasilsin', (5, 14)), ('?', (14, 15))],
 ['Ahmet', 'Ġnasilsin', '?'],
 {('Ġ', 't'): 'Ġt',
  ('i', 's'): 'is',
  ('e', 'r'): 'er',
  ('Ġ', 'a'): 'Ġa',
  ('Ġt', 'o'): 'Ġto',
  ('e', 'n'): 'en',
  ('T', 'h'): 'Th',
  ('Th', 'is'): 'This',
  ('o', 'u'): 'ou',
  ('s', 'e'): 'se',
  ('Ġto', 'k'): 'Ġtok',
  ('Ġtok', 'en'): 'Ġtoken',
  ('n', 'd'): 'nd',
  ('Ġ', 'is'): 'Ġis',
  ('Ġt', 'h'): 'Ġth',
  ('Ġth', 'e'): 'Ġthe',
  ('i', 'n'): 'in',
  ('Ġa', 'b'): 'Ġab',
  ('Ġtoken', 'i'): 'Ġtokeni',
  ('Ġtokeni', 'z'): 'Ġtokeniz',
  ('a', 't'): 'at',
  ('i', 'o'): 'io',
  ('io', 'n'): 'ion',
  ('Ġ', 'se'): 'Ġse',
  ('h', 'o'): 'ho',
  ('ho', 'w'): 'how',
  ('l', 'l'): 'll',
  ('Ġ', 'H'): 'ĠH',
  ('ĠH', 'u'): 'ĠHu',
  ('ĠHu', 'g'): 'ĠHug',
  ('ĠHug', 'g'): 'ĠHugg',
  ('ĠHugg', 'in'): 'ĠHuggin',
  ('ĠHuggin', 'g'): 'ĠHugging',
  ('Ġ', 'F'): 'ĠF',
  ('ĠF', 'a'): 'ĠFa',
  ('ĠFa', 'c'): 'ĠFac',
  ('ĠFac', 'e'): 'ĠFace',
  ('Ġ', 'C'): 'ĠC',
  ('ĠC', 'ou'): 'ĠCou',
  ('ĠCou', 'r'): 'ĠCo

In [61]:
# NOTE: `tokenize` is defined in a later cell of this notebook; that cell has to
# be run first for this call to work on a fresh kernel.
print(tokenize("nasilsin"))

['n', 'a', 's', 'i', 'l', 's', 'in']


In [51]:
# Iterating the `splits` dict yields its keys, i.e. the words themselves.
for word in splits:
  print(word)

This
Ġis
Ġthe
ĠHugging
ĠFace
ĠCourse
.
Ġchapter
Ġabout
Ġtokenization
Ġsection
Ġshows
Ġseveral
Ġtokenizer
Ġalgorithms
Hopefully
,
Ġyou
Ġwill
Ġbe
Ġable
Ġto
Ġunderstand
Ġhow
Ġthey
Ġare
Ġtrained
Ġand
Ġgenerate
Ġtokens


In [52]:
def tokenize(text):
    """Tokenize ``text`` with the learned BPE merges.

    The text is pre-tokenized with the module-level ``tokenizer``, each word
    is split into characters, and every rule in the module-level ``merges``
    dict is applied in insertion order.

    Args:
        text: The string to tokenize.

    Returns:
        A flat list of token strings covering all pre-tokenized words.
    """
    # Public accessor, consistent with the word-counting cell.
    pre_tokenize_result = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    # Named `word_splits` so it does not shadow the training-time global `splits`.
    word_splits = [list(word) for word, _offset in pre_tokenize_result]
    for pair, merge in merges.items():
        for idx, split in enumerate(word_splits):
            i = 0
            while i < len(split) - 1:
                if split[i] == pair[0] and split[i + 1] == pair[1]:
                    split = split[:i] + [merge] + split[i + 2 :]
                else:
                    i += 1
            word_splits[idx] = split

    # Flatten in one pass; sum(lists, []) re-copies the accumulator each step.
    return [token for split in word_splits for token in split]

In [28]:
# Demonstration: sum() over a dict iterates its keys, so the string key 'a'
# is added to the list start value and raises TypeError. The error is caught
# and displayed so this cell does not stop a "Run All".
try:
    print(sum({'a': []}, []))
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: can only concatenate list (not "str") to list

In [22]:
# Tokenize a sentence with the learned merges; in the recorded output the word
# "not" stays split into single characters.
tokenize("This is not a token.")

['This', 'Ġis', 'Ġ', 'n', 'o', 't', 'Ġa', 'Ġtoken', '.']

In [63]:
import re
import collections

# Read the news file; every "+" character is replaced with a space.
with open("trmor2006.train", encoding="utf-8") as corpus_file:
    text = corpus_file.read().replace("+", " ")

# Split the lower-cased text into words.
words = re.findall(r'\w+', text.lower())

# Represent each word as space-separated characters followed by the </w>
# end-of-word marker, and count how often each occurs.
# NOTE: this rebinds `vocab`, which earlier cells use for a list of tokens.
vocab = collections.Counter(' '.join(word) + ' </w>' for word in words)

# Helper functions for the BPE procedure
def get_stats(vocab):
    """Count adjacent symbol pairs across the vocabulary.

    Args:
        vocab: Mapping from a space-separated symbol string to its frequency.

    Returns:
        A ``defaultdict(int)`` mapping ``(left, right)`` symbol pairs to the
        summed frequency of the entries that contain them.
    """
    pairs = collections.defaultdict(int)
    for entry, freq in vocab.items():
        symbols = entry.split()
        for left, right in zip(symbols, symbols[1:]):
            pairs[left, right] += freq
    return pairs

def merge_vocab(pair, v_in):
    """Return a copy of the vocabulary with ``pair`` merged into one symbol.

    Args:
        pair: Two symbols that should be joined wherever they are adjacent.
        v_in: Mapping from a space-separated symbol string to its frequency.

    Returns:
        A new dict with the same frequencies, keyed by the rewritten strings.
    """
    merged = ''.join(pair)
    # The lookarounds make the match start and end on whole symbols only.
    pattern = re.compile(r'(?<!\S)' + re.escape(' '.join(pair)) + r'(?!\S)')
    return {pattern.sub(merged, entry): freq for entry, freq in v_in.items()}

# Learn up to `num_merges` BPE merge rules.
# NOTE: the original comment here described a 32-thousand-entry vocabulary,
# but the code performs at most 3200 merges.
num_merges = 3200
bpe_merges = []  # learned merges, in the order they were applied
for i in range(num_merges):
    pairs = get_stats(vocab)
    if not pairs:
        # No adjacent symbol pairs are left to merge.
        break
    best = max(pairs, key=pairs.get)
    vocab = merge_vocab(best, vocab)
    bpe_merges.append(best)
    if i % 200 == 0:
        print(f'Merge {i}: {best}')

# Build the set of tokens that remain in the merged vocabulary.
bpe_tokens = set()
for word in vocab:
    bpe_tokens.update(word.split())

print(f'\nToplam Token Sayısı: {len(bpe_tokens)}')
# Sort before slicing: set iteration order is arbitrary and can differ between
# kernel restarts, so list(bpe_tokens)[:100] was not reproducible.
print('İlk 100 token:', sorted(bpe_tokens)[:100])


Merge 0: ('n', 'o')
Merge 200: ('i', 'l</w>')


KeyboardInterrupt: 

In [None]:

# Encode and decode functions
def encode(text, merges):
    """Convert text into BPE tokens.

    Args:
        text: Input string. It is lower-cased and split into runs of word
            characters; everything else is dropped.
        merges: Ordered sequence of symbol pairs (tuples). A pair that
            appears earlier has higher priority.

    Returns:
        A flat list of tokens. Each word ends with the ``'</w>'`` marker,
        either as its own token or merged into the word's last token.
    """
    # Build the priority lookup once. `pair in merges` and `merges.index(pair)`
    # are linear scans when `merges` is a list. setdefault keeps the first
    # index of a repeated pair, matching list.index.
    ranks = {}
    for rank, pair in enumerate(merges):
        ranks.setdefault(pair, rank)

    tokens = []

    # Split each word into BPE tokens
    for raw_word in re.findall(r'\w+', text.lower()):
        symbols = list(raw_word) + ['</w>']
        while len(symbols) > 1:
            pairs = list(zip(symbols, symbols[1:]))
            known = [pair for pair in pairs if pair in ranks]
            if not known:
                break
            # Apply the highest-priority merge at its first occurrence; any
            # further occurrences are handled by later loop iterations.
            best_pair = min(known, key=ranks.__getitem__)
            i = pairs.index(best_pair)
            symbols = symbols[:i] + [''.join(best_pair)] + symbols[i + 2:]
        tokens.extend(symbols)

    return tokens

def decode(tokens):
    """Convert BPE tokens back into space-separated words.

    A word ends at any token that ends with the ``'</w>'`` marker. That covers
    the bare marker as well as merged tokens such as ``'il</w>'``; previously
    only the bare marker was recognised, so merged tokens leaked the marker
    into the output and fused neighbouring words.

    Args:
        tokens: Iterable of token strings, e.g. the output of ``encode``.

    Returns:
        The decoded words joined by single spaces. Trailing tokens without a
        closing marker are still emitted as a final word.
    """
    marker = '</w>'
    words = []
    current_word = []

    for token in tokens:
        if token.endswith(marker):
            # Strip the marker; a bare marker contributes an empty piece.
            current_word.append(token[:-len(marker)])
            words.append(''.join(current_word))
            current_word = []
        else:
            current_word.append(token)

    if current_word:
        words.append(''.join(current_word))

    return ' '.join(words)

# Example usage: encode a sample sentence with the learned merges and decode it
# again. encode() lower-cases its input, so the decoded text is lower-case.
sample_text = "Bu bir deneme metnidir"
encoded_tokens = encode(sample_text, bpe_merges)
print("\nEncoded Tokens:", encoded_tokens)

decoded_text = decode(encoded_tokens)
print("Decoded Text:", decoded_text)