# In [17]:
from collections import Counter

def bpe_tokenize(sentence, num_merges):
    """
    Perform Byte Pair Encoding (BPE) tokenization on a given sentence.

    The sentence is first split into individual characters. Each round then
    fuses the most frequent pair of adjacent tokens into a single token.
    Merging stops after ``num_merges`` rounds, or earlier as soon as no
    adjacent pair occurs more than once (or there are fewer than two tokens).

    Args:
        sentence: Text to tokenize; it is split into single characters.
        num_merges: Maximum number of merge rounds. Zero or a negative value
            performs no merges.

    Returns:
        A ``(tokenized_sentence, vocabulary)`` tuple. ``tokenized_sentence``
        is the final list of tokens. ``vocabulary`` contains every character
        of ``sentence`` in order (repeated characters are kept) followed by
        each merged token in the order it was created.
    """
    tokens = list(sentence)
    vocabulary = list(sentence)

    for _ in range(num_merges):
        # Count every adjacent pair; overlapping occurrences each count once.
        pair_counts = Counter(zip(tokens, tokens[1:]))

        # Fewer than two tokens means there is nothing to pair up. Without
        # this guard, most_common(1) returns [] and indexing it would raise.
        if not pair_counts:
            break

        # On ties, most_common keeps the pair that was encountered first.
        (left, right), count = pair_counts.most_common(1)[0]

        # A pair seen only once is not worth merging; stop early.
        if count <= 1:
            break

        new_token = left + right

        # Rewrite the token list left to right, greedily replacing each
        # non-overlapping occurrence of the pair with the merged token.
        merged = []
        position = 0
        last = len(tokens) - 1
        while position <= last:
            if (
                position < last
                and tokens[position] == left
                and tokens[position + 1] == right
            ):
                merged.append(new_token)
                position += 2
            else:
                merged.append(tokens[position])
                position += 1
        tokens = merged

        vocabulary.append(new_token)

    return tokens, vocabulary

# Demo input: a few words that share sub-word fragments
sentence = "veritably very super supper"
# Allow up to 15 merge rounds; only the token list is kept, the vocabulary is discarded
tokenized_sentence, _ = bpe_tokenize(sentence=sentence, num_merges=15)

print("Tokenized Sentence:", tokenized_sentence)


# Output: Tokenized Sentence: ['ver', 'i', 't', 'a', 'b', 'l', 'y ', 'ver', 'y ', 'sup', 'er', ' ', 'sup', 'p', 'er']
