# GPT Vs. BERT Tokenizers
* **Name:** Mohammad Mahdi Salmani

In [None]:
import collections
import re
from google.colab import drive
import os

In [None]:
# Mount Google Drive into the Colab runtime, then make the dataset folder the
# working directory so the relative `./data/...` path used below resolves
# against it.
drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/dataset')

Mounted at /content/drive


In [None]:
# Load the text file
def load_text_file(file_path):
    """Return the full contents of the text file at ``file_path``.

    The file is opened read-only in text mode and decoded as UTF-8; the
    whole content is returned as a single string.
    """
    with open(file_path, encoding='utf-8') as handle:
        return handle.read()

# Path is relative to the current working directory.
file_path = "./data/All_Around_the_Moon.txt"
# Whole file content as one string; reused below as tokenizer training text.
corpus = load_text_file(file_path)

In [None]:
def preprocess_corpus(corpus):
    """Normalize raw text for tokenizer experiments.

    The text is lowercased, then every punctuation character (including the
    underscore) and every digit is removed. Whitespace is left untouched, so
    removing a token can leave consecutive or trailing spaces behind.

    Args:
        corpus: The raw text as a single string.

    Returns:
        The normalized text.
    """
    # Convert to lowercase
    corpus = corpus.lower()
    # Remove punctuation. `\w` treats "_" as a word character, so `[^\w\s]`
    # alone would keep underscores (e.g. `_italic_` markers in plain-text
    # books); strip them explicitly.
    corpus = re.sub(r'[^\w\s]|_', '', corpus)
    # Remove numbers
    corpus = re.sub(r'\d+', '', corpus)
    return corpus

In [None]:
# Normalized copy of the corpus produced by preprocess_corpus().
# NOTE(review): none of the later cells visible here read `processed_corpus`;
# the tokenizers below are trained on the raw `corpus` / `file_path` instead.
# Confirm that this is intended.
processed_corpus = preprocess_corpus(corpus)

In [None]:
# Sample inputs for comparing tokenizer output. `test_sentence1` is passed to
# every tokenizer below; `test_sentence2` is only passed to the two
# library-trained tokenizers.
test_sentence1 = "This darkness is absolutely killing! If we ever take this trip again, it must be about the time of the sNew Moon!"
test_sentence2 = "This is a tokenization task. Tokenization is the first step in a NLP pipeline. We will be comparing the tokens generated by each tokenization model."

## BERT Tokenizer

In [None]:
import collections
import re

class WordPieceTokenizer:
    """Simplified WordPiece-style subword tokenizer.

    Training starts from the individual characters of the corpus and keeps
    merging the most frequent pair of adjacent symbols into a new vocabulary
    entry. Tokenization is greedy longest-match-first against that
    vocabulary.

    Simplifications compared with the tokenizer used by BERT: merges are
    ranked by raw pair frequency, and word-internal pieces carry no "##"
    continuation marker.
    """

    UNK_TOKEN = "[UNK]"

    def __init__(self, vocab_size=10000):
        """Create an untrained tokenizer.

        Args:
            vocab_size: Target number of vocabulary entries; training stops
                merging once this many entries exist.
        """
        self.vocab_size = vocab_size
        # Maps token -> frequency recorded for it during training.
        self.vocab = {self.UNK_TOKEN: 0}

    def train(self, corpus):
        """Learn a subword vocabulary from ``corpus``.

        Words are the ``\\w+`` runs of the text. The vocabulary is seeded
        with ``[UNK]`` and every character occurring in a word, so it can be
        larger than ``vocab_size`` when the alphabet alone exceeds it. Any
        previously learned vocabulary is discarded.

        Args:
            corpus: Training text as a single string.
        """
        word_freq = collections.Counter(re.findall(r'\w+', corpus))

        # Seed with every character so that any word made of known letters
        # can be segmented without falling back to [UNK].
        self.vocab = {self.UNK_TOKEN: 0}
        for word, freq in word_freq.items():
            for char in word:
                self.vocab[char] = self.vocab.get(char, 0) + freq

        # Current segmentation of each distinct word into symbols.
        segmentations = {word: tuple(word) for word in word_freq}

        # Every merge removes at least one symbol from some word, so the loop
        # ends even if a merged string happens to be in the vocabulary
        # already.
        while len(self.vocab) < self.vocab_size:
            pair_freq = collections.Counter()
            for word, symbols in segmentations.items():
                freq = word_freq[word]
                for pair in zip(symbols, symbols[1:]):
                    pair_freq[pair] += freq

            if not pair_freq:
                # Every word is already a single symbol.
                break

            (left, right), count = pair_freq.most_common(1)[0]
            merged = left + right
            self.vocab[merged] = self.vocab.get(merged, 0) + count

            segmentations = {
                word: self._merge_pair(symbols, left, right)
                for word, symbols in segmentations.items()
            }

    @staticmethod
    def _merge_pair(symbols, left, right):
        """Return ``symbols`` with each adjacent ``left, right`` pair fused."""
        merged = []
        i = 0
        while i < len(symbols):
            if (i + 1 < len(symbols)
                    and symbols[i] == left and symbols[i + 1] == right):
                merged.append(left + right)
                i += 2
            else:
                merged.append(symbols[i])
                i += 1
        return tuple(merged)

    def tokenize(self, text):
        """Split ``text`` into subword tokens.

        The text is split on whitespace and each chunk is consumed from the
        left using the longest vocabulary entry that matches. When no entry
        matches at the current position, ``[UNK]`` is emitted and the rest of
        that chunk is dropped.

        Args:
            text: Text to tokenize.

        Returns:
            A list of token strings.
        """
        tokens = []
        for word in text.split():
            word_tokens = []
            while word:
                subword = self._find_longest_subword(word)
                if subword is None:
                    word_tokens.append(self.UNK_TOKEN)
                    break
                word_tokens.append(subword)
                word = word[len(subword):]
            tokens.extend(word_tokens)
        return tokens

    def _find_longest_subword(self, text):
        """Return the longest prefix of ``text`` that is in the vocabulary.

        Returns ``None`` when not even the first character is known.
        """
        for length in range(len(text), 0, -1):
            subword = text[:length]
            if subword in self.vocab:
                return subword
        return None

In [None]:
# Train the from-scratch WordPiece tokenizer (vocab_size=100) on the raw
# corpus, then print the tokens it produces for the first test sentence.
tokenizer = WordPieceTokenizer(vocab_size=100)
tokenizer.train(corpus)
print(tokenizer.tokenize(test_sentence1))

## GPT Tokenizer

In [None]:
from collections import defaultdict

class BPETokenizer:
    """Minimal byte-pair-encoding (BPE) tokenizer working on characters.

    Training starts with one symbol per character and repeatedly fuses the
    most frequent pair of adjacent symbols. The ordered list of fused pairs
    is replayed at tokenization time to segment new text.
    """

    UNK_TOKEN = "[UNK]"

    def __init__(self, vocab_size=10000):
        """Create an untrained tokenizer.

        Args:
            vocab_size: Target number of vocabulary entries; training stops
                merging once this many entries exist.
        """
        self.vocab_size = vocab_size
        # Maps token string -> integer id; id 0 is reserved for [UNK].
        self.vocab = {self.UNK_TOKEN: 0}
        # Learned merges as (left, right) pairs, in the order they were made.
        self.merges = []

    def train(self, texts):
        """Learn merges and the token-to-id vocabulary.

        The vocabulary is seeded with ``[UNK]`` and every character seen in a
        whitespace-separated word, so it can be larger than ``vocab_size``
        when the alphabet alone exceeds it. Previously learned state is
        discarded.

        Args:
            texts: A single string or an iterable of strings.
        """
        if isinstance(texts, str):
            # Iterating a bare string would yield single characters.
            texts = [texts]

        # Each distinct word as a tuple of symbols -> occurrence count.
        word_freq = defaultdict(int)
        for text in texts:
            for word in text.split():
                word_freq[tuple(word)] += 1

        vocab = {self.UNK_TOKEN: 0}
        for symbols in word_freq:
            for char in symbols:
                vocab.setdefault(char, len(vocab))

        merges = []
        # Every merge removes at least one symbol from some word, so the loop
        # ends even if a merged string is already in the vocabulary.
        while len(vocab) < self.vocab_size:
            pairs = defaultdict(int)
            for symbols, freq in word_freq.items():
                for pair in zip(symbols, symbols[1:]):
                    pairs[pair] += freq
            if not pairs:
                # Every word is already a single symbol.
                break

            best = max(pairs, key=pairs.get)
            merges.append(best)
            vocab.setdefault(''.join(best), len(vocab))

            merged_freq = defaultdict(int)
            for symbols, freq in word_freq.items():
                merged_freq[self._merge_pair(symbols, *best)] += freq
            word_freq = merged_freq

        self.vocab = vocab
        self.merges = merges

    @staticmethod
    def _merge_pair(symbols, left, right):
        """Return ``symbols`` with each adjacent ``left, right`` pair fused."""
        merged = []
        i = 0
        while i < len(symbols):
            if (i + 1 < len(symbols)
                    and symbols[i] == left and symbols[i + 1] == right):
                merged.append(left + right)
                i += 2
            else:
                merged.append(symbols[i])
                i += 1
        return tuple(merged)

    def tokenize(self, text):
        """Encode ``text`` as a list of token ids.

        The text is split on whitespace, each word is broken into characters,
        and the learned merges are replayed in training order. Symbols that
        are not in the vocabulary map to the ``[UNK]`` id.

        Args:
            text: Text to tokenize.

        Returns:
            A list of integer token ids.
        """
        unk_id = self.vocab.get(self.UNK_TOKEN, 0)
        tokens = []
        for word in text.split():
            symbols = tuple(word)
            for left, right in self.merges:
                if len(symbols) < 2:
                    break
                symbols = self._merge_pair(symbols, left, right)
            tokens.extend(self.vocab.get(symbol, unk_id) for symbol in symbols)
        return tokens

In [None]:
# Train the from-scratch BPE tokenizer (vocab_size=100) on the raw corpus and
# encode the first test sentence. `corpus` is one string here; the result of
# the last expression is shown as the cell output.
tokenizer = BPETokenizer(vocab_size=100)
tokenizer.train(corpus)
tokenizer.tokenize(test_sentence1)

## Implementing the tokenizers using libraries

In [None]:
!pip install tokenizers

In [None]:
from tokenizers import Tokenizer, models, trainers
from tokenizers.pre_tokenizers import Whitespace

In [None]:
# Initialize WordPiece tokenizer
# No vocabulary is passed to the model here; train() below is what learns one.
wp_tokenizer = Tokenizer(models.WordPiece())

# Pre-tokenize the raw text before the subword model sees it; in the captured
# output below, punctuation such as "!" and "," comes out as separate tokens.
wp_tokenizer.pre_tokenizer = Whitespace()

# Train WordPiece tokenizer
# Only the special tokens are configured; no vocab_size is passed, so the
# trainer's own default applies.
wp_trainer = trainers.WordPieceTrainer(special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"])
wp_tokenizer.train(files=[file_path], trainer=wp_trainer)

In [None]:
# Report how many entries the trained WordPiece vocabulary contains.
print("WordPiece vocab size:", wp_tokenizer.get_vocab_size())

WordPiece vocab size: 17557


In [None]:
# Initialize BPE tokenizer
# The unknown token is named explicitly here and matches the "[UNK]" entry in
# the trainer's special tokens below.
bpe_tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))

# Same pre-tokenizer as the WordPiece setup, so both models receive identical
# input pieces.
bpe_tokenizer.pre_tokenizer = Whitespace()

# Train BPE tokenizer
# Only the special tokens are configured; no vocab_size is passed, so the
# trainer's own default applies.
bpe_trainer = trainers.BpeTrainer(special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"])
bpe_tokenizer.train(files=[file_path], trainer=bpe_trainer)

In [None]:
# Report how many entries the trained BPE vocabulary contains.
print("BPE vocab size:", bpe_tokenizer.get_vocab_size())

BPE vocab size: 16553


In [None]:
# Encode the first test sentence with both library-trained tokenizers and
# print the resulting token strings for a side-by-side comparison.
# Test WordPiece
wp_encoded = wp_tokenizer.encode(test_sentence1)
print("WordPiece tokens:\n", wp_encoded.tokens)
# Test BPE
bpe_encoded = bpe_tokenizer.encode(test_sentence1)
print("BPE tokens:\n", bpe_encoded.tokens)

WordPiece tokens:
 ['This', 'darkness', 'is', 'absolutely', 'killing', '!', 'If', 'we', 'ever', 'take', 'this', 'trip', 'again', ',', 'it', 'must', 'be', 'about', 'the', 'time', 'of', 'the', 's', '##N', '##ew', 'Moon', '!']
BPE tokens:
 ['This', 'darkness', 'is', 'absolutely', 'killing', '!', 'If', 'we', 'ever', 'take', 'this', 'trip', 'again', ',', 'it', 'must', 'be', 'about', 'the', 'time', 'of', 'the', 's', 'New', 'Moon', '!']


In [None]:
# Same comparison for the second test sentence; `wp_encoded` and
# `bpe_encoded` from the previous cell are overwritten.
wp_encoded = wp_tokenizer.encode(test_sentence2)
print("WordPiece tokens:\n", wp_encoded.tokens)

bpe_encoded = bpe_tokenizer.encode(test_sentence2)
print("BPE tokens:\n", bpe_encoded.tokens)

WordPiece tokens:
 ['This', 'is', 'a', 'to', '##ken', '##ization', 'task', '.', 'To', '##ken', '##ization', 'is', 'the', 'first', 'step', 'in', 'a', 'N', '##L', '##P', 'pip', '##el', '##ine', '.', 'We', 'will', 'be', 'comparing', 'the', 'to', '##ken', '##s', 'generated', 'by', 'each', 'to', '##ken', '##ization', 'model', '.']
BPE tokens:
 ['This', 'is', 'a', 'to', 'ken', 'ization', 'task', '.', 'T', 'ok', 'en', 'ization', 'is', 'the', 'first', 'step', 'in', 'a', 'N', 'L', 'P', 'pi', 'pe', 'line', '.', 'We', 'will', 'be', 'comparing', 'the', 'to', 'k', 'ens', 'generated', 'by', 'each', 'to', 'ken', 'ization', 'model', '.']
