<a href="https://colab.research.google.com/github/olgagasowska/Machine-Learning-for-Linguists/blob/main/Byte_Pair%20_Encoding_(BPE)_and%20_WordPiece2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
### Tokenization techniques: Byte Pair Encoding (BPE) and WordPiece

In [None]:
!pip install datasets unidecode

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting unidecode
  Downloading Unidecode-1.3.8-py3-none-any.whl.metadata (13 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m471.6/471.6 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?2

In [None]:
# exercise_2.py
# BPE and WordPiece Tokenization with Data Download and Preprocessing

from datasets import load_dataset
import string
from collections import Counter

# Step 1: Pre-implemented Byte Pair Encoding (BPE)
class BPEncoder:
    """Byte Pair Encoding (BPE) encoder driven by a fixed alphabet and merge rules.

    A string is first segmented into alphabet units (longest match first); the
    merge rules are then applied one after another, in the order given.

    Attributes:
        alphabet: list of base units (strings). Units may be longer than one
            character, e.g. "_t" or "[CLS]".
        merge_rules: ordered list of ``(left, right)`` tuples to merge.
        bpe_cache: dict mapping an already encoded string to its token list.
    """

    def __init__(self, alphabet, merge_rules, bpe_cache=None):
        """Store the alphabet, the merge rules and the encoding cache.

        Args:
            alphabet: list of base units.
            merge_rules: ordered list of ``(left, right)`` tuples.
            bpe_cache: optional dict to use as cache. When omitted, every
                instance gets its own empty dict.
        """
        self.alphabet = alphabet
        self.merge_rules = merge_rules
        # A ``dict()`` default in the signature is created once and shared by
        # all instances, so encoders with different merge rules would read each
        # other's cached results. Create a fresh dict per instance instead.
        self.bpe_cache = bpe_cache if bpe_cache is not None else {}

    def split_seq(self, s):
        """Split ``s`` into alphabet units, always taking the longest match.

        Args:
            s: string to segment.

        Returns:
            List of alphabet units whose concatenation equals ``s``; an empty
            list for an empty string.

        Raises:
            ValueError: if some position of ``s`` is not covered by any
                alphabet unit.
        """
        # Longest units first, so the first hit at a position is the longest
        # match. Empty units are skipped because they would never advance.
        units = sorted((u for u in self.alphabet if u), key=len, reverse=True)

        pieces = []
        pos = 0
        # Iterative on purpose: one recursion level per unit would exceed the
        # interpreter's recursion limit on long inputs.
        while pos < len(s):
            unit = next((u for u in units if s.startswith(u, pos)), None)
            if unit is None:
                raise ValueError(
                    f"character {s[pos]!r} at position {pos} is not covered by the alphabet"
                )
            pieces.append(unit)
            pos += len(unit)
        return pieces

    def apply_merge_rule(self, merge_rule, bpe_seq):
        """Merge every non-overlapping, left-to-right occurrence of one rule.

        Args:
            merge_rule: ``(left, right)`` tuple of adjacent tokens to join.
            bpe_seq: list of tokens.

        Returns:
            Tuple ``(new_sequence, Counter())``; the counter is always empty.
        """
        merged = []
        i = 0
        n = len(bpe_seq)
        while i < n:
            if i + 1 < n and merge_rule == (bpe_seq[i], bpe_seq[i + 1]):
                merged.append(bpe_seq[i] + bpe_seq[i + 1])
                i += 2
            else:
                merged.append(bpe_seq[i])
                i += 1
        return merged, Counter()

    def encode(self, s):
        """Encode ``s`` with BPE, reusing the cached result when available.

        Args:
            s: string to encode.

        Returns:
            List of tokens. A new list is returned on every call, so callers
            may modify it without corrupting the cache.

        Raises:
            ValueError: propagated from ``split_seq`` for uncovered characters.
        """
        cached = self.bpe_cache.get(s)
        if cached is None:
            cached = self.split_seq(s)
            for rule in self.merge_rules:
                cached, _ = self.apply_merge_rule(rule, cached)
            self.bpe_cache[s] = cached
        return list(cached)

    def token_mapping(self):
        """Map every token to an integer id.

        Alphabet units come first, followed by one merged token per merge
        rule, both in their original order.

        Returns:
            Dict ``{token: index}``.
        """
        tokens = list(self.alphabet) + [a + b for a, b in self.merge_rules]
        return {tok: i for i, tok in enumerate(tokens)}


# WordPiece Tokenizer (simplified)
class WordPieceTokenizer:
    """Simplified WordPiece tokenizer using greedy longest-match-first lookup.

    The first piece of a word is looked up as-is; every later piece is looked
    up with the continuation prefix (``"##"`` by default) in front of it.

    Attributes:
        vocab: collection of known pieces (anything supporting ``in``).
        unk_token: token emitted for a word that cannot be segmented.
        continuation_prefix: marker carried by non-initial pieces in ``vocab``.
    """

    def __init__(self, vocab, unk_token='[UNK]', continuation_prefix='##'):
        """Store the vocabulary and the special markers.

        Args:
            vocab: collection of known pieces.
            unk_token: token returned for words that cannot be segmented.
            continuation_prefix: prefix of non-initial pieces in ``vocab``.
        """
        self.vocab = vocab
        self.unk_token = unk_token
        self.continuation_prefix = continuation_prefix

    def tokenize(self, word):
        """Tokenize ``word`` into WordPiece tokens.

        Args:
            word: a single word. Text containing whitespace is also accepted
                and is tokenized word by word.

        Returns:
            List of tokens. An input that is itself in the vocabulary yields
            ``[word]``; a word that cannot be fully segmented yields
            ``[unk_token]``; an empty or whitespace-only input yields ``[]``.
        """
        if word in self.vocab:
            return [word]
        tokens = []
        for single_word in word.split():
            tokens.extend(self._tokenize_word(single_word))
        return tokens

    def _tokenize_word(self, word):
        """Greedily segment one whitespace-free word into vocabulary pieces."""
        pieces = []
        start = 0
        while start < len(word):
            match = None
            end = len(word)
            # Try the longest remaining substring first and shrink from the right.
            while end > start:
                candidate = word[start:end]
                if start > 0:
                    candidate = self.continuation_prefix + candidate
                if candidate in self.vocab:
                    match = candidate
                    break
                end -= 1
            if match is None:
                # One unmatched stretch makes the whole word unknown.
                return [self.unk_token]
            pieces.append(match)
            start = end
        return pieces


# Step 2: Download and Preprocess Data
def download_and_preprocess_data(num_articles=1000, sentences_per_article=5, preview=10):
    """Load Wikipedia articles and cut them into naive sentences.

    Each article text is split on '.', every fragment is stripped of
    surrounding whitespace, empty fragments are discarded, and the first
    ``sentences_per_article`` remaining fragments are kept.

    Note: splitting on '.' is only an approximation of sentence boundaries;
    abbreviations and decimal numbers are cut as well.

    Args:
        num_articles: number of articles taken from the start of the train split.
        sentences_per_article: maximum number of fragments kept per article.
        preview: how many sentences to print as a sample; ``None`` prints all.

    Returns:
        List of sentence strings (at most
        ``num_articles * sentences_per_article`` entries).
    """
    print("Downloading and preprocessing data...")

    dataset = load_dataset("wikipedia", "20220301.en", split=f"train[0:{num_articles}]")

    sentences = []
    for example in dataset:
        fragments = (fragment.strip() for fragment in example['text'].split('.'))
        # Whitespace-only fragments (e.g. after "...") are not sentences.
        non_empty = [fragment for fragment in fragments if fragment]
        sentences.extend(non_empty[:sentences_per_article])

    # Print only a sample: dumping thousands of lines floods the cell output.
    shown = sentences if preview is None else sentences[:preview]
    print("Sample sentences for tokenization:")
    for i, sentence in enumerate(shown):
        print(f"{i+1}. {sentence}")
    hidden = len(sentences) - len(shown)
    if hidden > 0:
        print(f"... ({hidden} more sentences not shown)")

    return sentences


# Step 3: Tokenization Process and Comparison (students will fill in parts here)
def tokenization_demo(sentences):
    """Print BPE and WordPiece tokenizations for each sentence.

    For BPE the sentence is lower-cased, spaces become '_', a leading '_' is
    added, and characters the alphabet does not cover are reduced to their
    covered components (accents are stripped) or dropped, so the encoder never
    meets an unknown character. For WordPiece the lower-cased sentence is
    tokenized word by word.

    Args:
        sentences: iterable of sentence strings.
    """
    # Function-scope import: needed only for the character clean-up below.
    import unicodedata

    # Special tokens, letters, word-initial letters ("_a" ... "_z"), digits,
    # ASCII punctuation and the newline character.
    alphabet = (["[CLS]", "[MASK]", "[SEP]", "[PAD]"] +
                list(string.ascii_lowercase) +
                [f"_{c}" for c in string.ascii_lowercase] +
                # Same characters, same order as the former literal, without
                # its invalid "\]" escape sequence.
                list(string.digits + string.punctuation) +
                ["\n"])

    # Merge rules are applied in this order, e.g. "h"+"e" -> "he", then "_t"+"he" -> "_the".
    merge_rules = [("h", "e"), ("_t", "he"), ("s", "a"), ("_s", "he")]

    # Explicit fresh cache, so results never depend on entries cached by an
    # encoder that was built earlier with different rules.
    bpe_encoder = BPEncoder(alphabet, merge_rules, bpe_cache={})
    # The mapping depends only on alphabet and rules: build it once.
    tok2idx_bpe = bpe_encoder.token_mapping()
    known_chars = {unit for unit in alphabet if len(unit) == 1}

    def to_alphabet_chars(text):
        """Keep covered characters; decompose the rest and keep what is covered."""
        kept = []
        for ch in text:
            if ch in known_chars:
                kept.append(ch)
            else:
                decomposed = unicodedata.normalize("NFKD", ch).lower()
                kept.extend(part for part in decomposed if part in known_chars)
        return ''.join(kept)

    # WordPiece vocabulary for the demo
    wordpiece_vocab = ["un", "happiness", "hap", "##pi", "##ness", "[UNK]"]
    wp_tokenizer = WordPieceTokenizer(vocab=wordpiece_vocab)

    print("\nTokenizing with BPE and WordPiece:\n")

    for sentence in sentences:
        print(f"Original sentence: {sentence}")

        # BPE tokenization
        bpe_input = '_' + to_alphabet_chars(sentence.lower().replace(' ', '_'))
        bpe_tokens = bpe_encoder.encode(bpe_input)
        bpe_numeric = [tok2idx_bpe[tok] for tok in bpe_tokens]

        print(f"BPE tokens: {bpe_tokens}")
        print(f"BPE numeric: {bpe_numeric}")

        # WordPiece tokenization: the tokenizer works on words, not sentences
        wp_tokens = [tok
                     for word in sentence.lower().split()
                     for tok in wp_tokenizer.tokenize(word)]
        print(f"WordPiece tokens: {wp_tokens}")
        print("-" * 40)


# Step 4: Run the full process
if __name__ == "__main__":
    # Load the Wikipedia excerpt and split it into sentences
    sentences = download_and_preprocess_data()

    # Tokenize each sentence with BPE and WordPiece and print both results
    tokenization_demo(sentences)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m

Biography
Agnes Maria was the daughter of Berthold, Duke of Merania, who was Count of Andechs, a castle and territory near Ammersee, Bavaria
2359.  Her mother was Agnes of Rochlitz
2360. 

In June 1196 Agnes married Philip II of France, who had repudiated his second wife Ingeborg of Denmark in 1193
2361. Agrippina "the Elder" (also, in Latin, , "Germanicus's Agrippina";  – AD 33) was a prominent member of the Julio-Claudian dynasty
2362.  She was the daughter of Marcus Vipsanius Agrippa (a close supporter of the first Roman emperor, Augustus) and Augustus' daughter, Julia the Elder
2363.  Her brothers Lucius and Gaius Caesar were the adoptive sons of Augustus, and were his heirs until their deaths in AD 2 and 4, respectively
2364.  Following their deaths, her second cousin Germanicus was made the adoptive son of Tiberius, Augustus' stepson, as part of Augustus' succession scheme in the adoptions of AD 4 (in which Tiber

IndexError: list index out of range

In [None]:
# exercise_2.py
# BPE and WordPiece Tokenization with Data Download and Preprocessing

from datasets import load_dataset
import string
from collections import Counter

class BPEncoder:
    """Byte Pair Encoding (BPE) encoder driven by a fixed alphabet and merge rules.

    A string is first segmented into alphabet units (longest match first); the
    merge rules are then applied one after another, in the order given.

    Attributes:
        alphabet: list of base units (strings). Units may be longer than one
            character, e.g. "_t" or "[CLS]".
        merge_rules: ordered list of ``(left, right)`` tuples to merge.
        bpe_cache: dict mapping an already encoded string to its token list.
    """

    def __init__(self, alphabet, merge_rules, bpe_cache=None):
        """Store the alphabet, the merge rules and the encoding cache.

        Args:
            alphabet: list of base units.
            merge_rules: ordered list of ``(left, right)`` tuples.
            bpe_cache: optional dict to use as cache. When omitted, every
                instance gets its own empty dict.
        """
        self.alphabet = alphabet
        self.merge_rules = merge_rules
        # A ``dict()`` default in the signature is created once and shared by
        # all instances, so encoders with different merge rules would read each
        # other's cached results. Create a fresh dict per instance instead.
        self.bpe_cache = bpe_cache if bpe_cache is not None else {}

    def split_seq(self, s):
        """Split ``s`` into alphabet units, always taking the longest match.

        Args:
            s: string to segment.

        Returns:
            List of alphabet units whose concatenation equals ``s``; an empty
            list for an empty string.

        Raises:
            ValueError: if some position of ``s`` is not covered by any
                alphabet unit.
        """
        # Longest units first, so the first hit at a position is the longest
        # match. Empty units are skipped because they would never advance.
        units = sorted((u for u in self.alphabet if u), key=len, reverse=True)

        pieces = []
        pos = 0
        # Iterative on purpose: one recursion level per unit would exceed the
        # interpreter's recursion limit on long inputs.
        while pos < len(s):
            unit = next((u for u in units if s.startswith(u, pos)), None)
            if unit is None:
                raise ValueError(
                    f"character {s[pos]!r} at position {pos} is not covered by the alphabet"
                )
            pieces.append(unit)
            pos += len(unit)
        return pieces

    def apply_merge_rule(self, merge_rule, bpe_seq):
        """Merge every non-overlapping, left-to-right occurrence of one rule.

        Args:
            merge_rule: ``(left, right)`` tuple of adjacent tokens to join.
            bpe_seq: list of tokens.

        Returns:
            Tuple ``(new_sequence, Counter())``; the counter is always empty.
        """
        merged = []
        i = 0
        n = len(bpe_seq)
        while i < n:
            if i + 1 < n and merge_rule == (bpe_seq[i], bpe_seq[i + 1]):
                merged.append(bpe_seq[i] + bpe_seq[i + 1])
                i += 2
            else:
                merged.append(bpe_seq[i])
                i += 1
        return merged, Counter()

    def encode(self, s):
        """Encode ``s`` with BPE, reusing the cached result when available.

        Args:
            s: string to encode.

        Returns:
            List of tokens. A new list is returned on every call, so callers
            may modify it without corrupting the cache.

        Raises:
            ValueError: propagated from ``split_seq`` for uncovered characters.
        """
        cached = self.bpe_cache.get(s)
        if cached is None:
            cached = self.split_seq(s)
            for rule in self.merge_rules:
                cached, _ = self.apply_merge_rule(rule, cached)
            self.bpe_cache[s] = cached
        return list(cached)

    def token_mapping(self):
        """Map every token to an integer id.

        Alphabet units come first, followed by one merged token per merge
        rule, both in their original order.

        Returns:
            Dict ``{token: index}``.
        """
        tokens = list(self.alphabet) + [a + b for a, b in self.merge_rules]
        return {tok: i for i, tok in enumerate(tokens)}



class WordPieceTokenizer:
    """Simplified WordPiece tokenizer using greedy longest-match-first lookup.

    The first piece of a word is looked up as-is; every later piece is looked
    up with the continuation prefix (``"##"`` by default) in front of it.

    Attributes:
        vocab: collection of known pieces (anything supporting ``in``).
        unk_token: token emitted for a word that cannot be segmented.
        continuation_prefix: marker carried by non-initial pieces in ``vocab``.
    """

    def __init__(self, vocab, unk_token='[UNK]', continuation_prefix='##'):
        """Store the vocabulary and the special markers.

        Args:
            vocab: collection of known pieces.
            unk_token: token returned for words that cannot be segmented.
            continuation_prefix: prefix of non-initial pieces in ``vocab``.
        """
        self.vocab = vocab
        self.unk_token = unk_token
        self.continuation_prefix = continuation_prefix

    def tokenize(self, word):
        """Tokenize ``word`` into WordPiece tokens.

        Args:
            word: a single word. Text containing whitespace is also accepted
                and is tokenized word by word.

        Returns:
            List of tokens. An input that is itself in the vocabulary yields
            ``[word]``; a word that cannot be fully segmented yields
            ``[unk_token]``; an empty or whitespace-only input yields ``[]``.
        """
        if word in self.vocab:
            return [word]
        tokens = []
        for single_word in word.split():
            tokens.extend(self._tokenize_word(single_word))
        return tokens

    def _tokenize_word(self, word):
        """Greedily segment one whitespace-free word into vocabulary pieces."""
        pieces = []
        start = 0
        while start < len(word):
            match = None
            end = len(word)
            # Try the longest remaining substring first and shrink from the right.
            while end > start:
                candidate = word[start:end]
                if start > 0:
                    candidate = self.continuation_prefix + candidate
                if candidate in self.vocab:
                    match = candidate
                    break
                end -= 1
            if match is None:
                # One unmatched stretch makes the whole word unknown.
                return [self.unk_token]
            pieces.append(match)
            start = end
        return pieces



def download_and_preprocess_data(num_articles=1000, sentences_per_article=5, preview=10):
    """Load Wikipedia articles and cut them into naive sentences.

    Each article text is split on '.', every fragment is stripped of
    surrounding whitespace, empty fragments are discarded, and the first
    ``sentences_per_article`` remaining fragments are kept.

    Note: splitting on '.' is only an approximation of sentence boundaries;
    abbreviations and decimal numbers are cut as well.

    Args:
        num_articles: number of articles taken from the start of the train split.
        sentences_per_article: maximum number of fragments kept per article.
        preview: how many sentences to print as a sample; ``None`` prints all.

    Returns:
        List of sentence strings (at most
        ``num_articles * sentences_per_article`` entries).
    """
    print("Downloading and preprocessing data...")

    dataset = load_dataset("wikipedia", "20220301.en", split=f"train[0:{num_articles}]")

    sentences = []
    for example in dataset:
        fragments = (fragment.strip() for fragment in example['text'].split('.'))
        # Whitespace-only fragments (e.g. after "...") are not sentences.
        non_empty = [fragment for fragment in fragments if fragment]
        sentences.extend(non_empty[:sentences_per_article])

    # Print only a sample: dumping thousands of lines floods the cell output.
    shown = sentences if preview is None else sentences[:preview]
    print("Sample sentences for tokenization:")
    for i, sentence in enumerate(shown):
        print(f"{i+1}. {sentence}")
    hidden = len(sentences) - len(shown)
    if hidden > 0:
        print(f"... ({hidden} more sentences not shown)")

    return sentences

def tokenization_demo(sentences):
    """Print BPE and WordPiece tokenizations for each sentence.

    For BPE the sentence is lower-cased, spaces become '_', a leading '_' is
    added, and characters the alphabet does not cover are reduced to their
    covered components (accents are stripped) or dropped, so the encoder never
    meets an unknown character. For WordPiece the lower-cased sentence is
    tokenized word by word.

    Args:
        sentences: iterable of sentence strings.
    """
    # Function-scope import: needed only for the character clean-up below.
    import unicodedata

    # Special tokens, letters, word-initial letters ("_a" ... "_z"), digits,
    # ASCII punctuation, two extra symbols and the newline character.
    alphabet = (["[CLS]", "[MASK]", "[SEP]", "[PAD]"] +
                list(string.ascii_lowercase) +
                [f"_{c}" for c in string.ascii_lowercase] +
                # Same characters, same order as the former literal, without
                # its invalid "\]" escape sequence. The two trailing symbols
                # are kept exactly as they were so token ids do not shift.
                list(string.digits + string.punctuation + "…ë") +
                ["\n"])

    # Merge rules are applied in this order, e.g. "h"+"e" -> "he", then "_t"+"he" -> "_the".
    merge_rules = [("h", "e"), ("_t", "he"), ("s", "a"), ("_s", "he")]

    # Explicit fresh cache, so results never depend on entries cached by an
    # encoder that was built earlier with different rules.
    bpe_encoder = BPEncoder(alphabet, merge_rules, bpe_cache={})
    # The mapping depends only on alphabet and rules: build it once.
    tok2idx_bpe = bpe_encoder.token_mapping()
    known_chars = {unit for unit in alphabet if len(unit) == 1}

    def to_alphabet_chars(text):
        """Keep covered characters; decompose the rest and keep what is covered."""
        kept = []
        for ch in text:
            if ch in known_chars:
                kept.append(ch)
            else:
                decomposed = unicodedata.normalize("NFKD", ch).lower()
                kept.extend(part for part in decomposed if part in known_chars)
        return ''.join(kept)

    # WordPiece vocabulary for the demo
    wordpiece_vocab = ["un", "happiness", "hap", "##pi", "##ness", "[UNK]"]
    wp_tokenizer = WordPieceTokenizer(vocab=wordpiece_vocab)

    print("\nTokenizing with BPE and WordPiece:\n")

    for sentence in sentences:
        print(f"Original sentence: {sentence}")

        # BPE tokenization
        bpe_input = '_' + to_alphabet_chars(sentence.lower().replace(' ', '_'))
        bpe_tokens = bpe_encoder.encode(bpe_input)
        bpe_numeric = [tok2idx_bpe[tok] for tok in bpe_tokens]

        print(f"BPE tokens: {bpe_tokens}")
        print(f"BPE numeric: {bpe_numeric}")

        # WordPiece tokenization: the tokenizer works on words, not sentences
        wp_tokens = [tok
                     for word in sentence.lower().split()
                     for tok in wp_tokenizer.tokenize(word)]
        print(f"WordPiece tokens: {wp_tokens}")
        print("-" * 40)



if __name__ == "__main__":

    # Load the Wikipedia excerpt and split it into sentences
    sentences = download_and_preprocess_data()

    # Tokenize each sentence with BPE and WordPiece and print both results
    tokenization_demo(sentences)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
2391.  The genus is native from east Asia south to northern Australasia
2392. Aimoin of Fleury (; ), French chronicler, was born at Villefranche-de-Longchat about 960, and in early life entered the monastery of Fleury, where he became a monk and passed the greater part of his life
2393.  Between c
2394.  980 and 985 Aimoin wrote about St
2395.  Benedict in Abbey of Fleury-sur-Loire
2396.   His chief work is a Historia Francorum, or Libri V
2397. The Akkadian Empire () was the first ancient empire of Mesopotamia after the long-lived civilization of Sumer
2398.  It was centered in the city of Akkad () and its surrounding region
2399.  The empire united Akkadian and Sumerian speakers under one rule
2400.  The Akkadian Empire exercised influence across Mesopotamia, the Levant, and Anatolia, sending military expeditions as far south as Dilmun and Magan (modern Saudi Arabia, Bahrain, and Oman) in the Arabian Peninsula
2401. 

T

IndexError: list index out of range