In [None]:
import json
import os
import shutil
import tempfile
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import torch
from tokenizers import ByteLevelBPETokenizer

In [None]:
### Load Corpus (very large, so it is streamed instead of downloaded up front)

from datasets import load_dataset
from huggingface_hub import login

# Never paste the access token into the notebook. It is read from the HF_TOKEN
# environment variable; when that is unset or empty, login() receives None and
# falls back to its interactive prompt.
login(token = os.environ.get("HF_TOKEN") or None)

# Huge corpus of English text. token = True makes load_dataset use the
# credentials stored by login() above; streaming = True yields records lazily.
dataset = load_dataset("oscar-corpus/OSCAR-2301",
                       token = True,
                       language = "en",
                       streaming = True,
                       split = "train")

In [None]:
### Create BPE Tokenizer from RoBERTa Paper

class Tokenizer:
    """Byte-level BPE tokenizer that is loaded from disk or trained on demand.

    On construction the tokenizer is read from the given vocab/merges files
    when both exist. Otherwise a new tokenizer is trained and then written to
    exactly those two paths, so the next construction with the same paths
    loads it instead of retraining.

    Args:
        tokenizer_vocab_path: Path of the vocabulary JSON file.
        tokenizer_merges_path: Path of the merges text file.
        corpus: Optional iterable of records that each expose a ``"text"``
            entry, used only when training is needed. When omitted, the
            notebook-level ``dataset`` variable is used.
    """

    # Training hyper-parameters passed to train_from_iterator.
    VOCAB_SIZE = 50265
    MIN_FREQUENCY = 2

    def __init__(self, tokenizer_vocab_path, tokenizer_merges_path, corpus=None):
        self.tokenizer_vocab = tokenizer_vocab_path
        self.tokenizer_merges = tokenizer_merges_path
        self.special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
        self.corpus = corpus
        self.tokenizer = self._load_tokenizer()

    def _load_tokenizer(self):
        """Return a ready tokenizer, training and saving one if none is on disk."""
        vocab_path = Path(self.tokenizer_vocab)
        merges_path = Path(self.tokenizer_merges)

        # Load existing tokenizer if available
        if vocab_path.exists() and merges_path.exists():
            return ByteLevelBPETokenizer.from_file(str(vocab_path), str(merges_path))

        # Fall back to the notebook-level `dataset` only when no corpus was
        # injected; the name is resolved here, at training time.
        corpus = dataset if self.corpus is None else self.corpus

        tokenizer = ByteLevelBPETokenizer()
        tokenizer.train_from_iterator(
            (item["text"] for item in corpus),
            vocab_size=self.VOCAB_SIZE,
            min_frequency=self.MIN_FREQUENCY,
            special_tokens=self.special_tokens,
        )

        # save_model(directory, prefix=None) writes fixed file names
        # ("vocab.json" and "merges.txt") into an existing directory. Save
        # into a scratch directory, then move each file to its configured
        # path so the load branch above finds them on the next run.
        with tempfile.TemporaryDirectory() as scratch_dir:
            tokenizer.save_model(scratch_dir)
            for produced_name, target in (("vocab.json", vocab_path),
                                          ("merges.txt", merges_path)):
                target.parent.mkdir(parents=True, exist_ok=True)
                shutil.move(str(Path(scratch_dir) / produced_name), str(target))

        return tokenizer

    # Encode function
    def encode(self, text):
        """Encode ``text`` with the underlying tokenizer and return its result."""
        return self.tokenizer.encode(text)

    # Decode function
    def decode(self, tokens):
        """Decode a sequence of token ids back to text.

        Prints the offending tokens and re-raises if the underlying tokenizer
        fails to decode them.
        """
        try:
            return self.tokenizer.decode(tokens)
        except Exception:
            print(f"Error decoding tokens: {tokens}")
            # Bare raise keeps the original traceback intact.
            raise

In [None]:
### Load Tokenizer or Create New

# Files the tokenizer is read from when both of them already exist.
tokenizer_vocab_path, tokenizer_merges_path = ".json", ".txt"

tokenizer = Tokenizer(
    tokenizer_vocab_path,
    tokenizer_merges_path,
)

In [None]:
### Encode Text

text = pd.Series(["This is a tokenizer.", "This hopefully works.", "It should?"])

# One encoding per sentence, in the same order as `text`.
encoded_corpus = [tokenizer.encode(sentence) for sentence in text]

encoded_corpus[1]

In [None]:
### Decode Tokens Back to Text

# Iterate over the encodings themselves instead of indexing up to len(text),
# so this cell cannot fall out of step with what the encode cell produced.
decoded_text = [tokenizer.decode(encoding.ids) for encoding in encoded_corpus]

decoded_text[1]