In [None]:
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer
import torch
import json
import matplotlib.pyplot as plt
import pandas as pd

# Load Corpus

In [None]:
import os

from datasets import load_dataset
from huggingface_hub import login

# Never paste an access token into the notebook: it ends up in version control
# and in shared copies. Read it from the HF_TOKEN environment variable instead.
# When the variable is unset (or empty) `token` is None, in which case
# huggingface_hub is expected to prompt for the token interactively.
login(token=os.environ.get("HF_TOKEN") or None)

# Huge corpus of English text.
# streaming=True makes the dataset an iterable that is consumed lazily instead
# of being downloaded up front.
dataset = load_dataset("oscar-corpus/OSCAR-2301",
                       token=True, # required
                       language="en",
                       streaming=True, # optional
                       split="train") # optional

# Create/Train Tokenizer 

In [None]:
class Tokenizer:
    """Byte-level BPE tokenizer wrapper, configured for RoBERTa.

    On construction the tokenizer is loaded from the given vocab/merges files
    when both exist; otherwise a new tokenizer is trained and written to those
    two paths so that the next construction can load it.
    """

    def __init__(self, tokenizer_vocab_path, tokenizer_merges_path, text_corpus=None):
        """Load the tokenizer from disk, or train and persist a new one.

        Args:
            tokenizer_vocab_path: Path of the vocabulary (JSON) file.
            tokenizer_merges_path: Path of the merges (text) file.
            text_corpus: Optional iterable of mappings with a ``'text'`` key,
                used only when a new tokenizer has to be trained. When omitted,
                the module-level ``dataset`` is used.
        """
        self.tokenizer_vocab = tokenizer_vocab_path
        self.tokenizer_merges = tokenizer_merges_path
        self.special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"] # Same tokens used in RoBERTa; these vary by model
        self.text_corpus = text_corpus
        self.tokenizer = self._load_tokenizer()

    def _load_tokenizer(self):
        """Return a ready-to-use ``ByteLevelBPETokenizer``.

        Loads from ``self.tokenizer_vocab`` / ``self.tokenizer_merges`` when
        BOTH files exist. Otherwise trains a new tokenizer on the corpus and
        saves its vocab and merges files to exactly those two paths.
        """
        vocab_path = Path(self.tokenizer_vocab)
        merges_path = Path(self.tokenizer_merges)

        # Both files are needed to restore the tokenizer, so check each one
        # (checking `Path(a and b)` would only ever look at the second path).
        if vocab_path.is_file() and merges_path.is_file():
            return ByteLevelBPETokenizer.from_file(str(vocab_path), str(merges_path))

        # Fall back to the notebook-level `dataset` when no corpus was given.
        corpus = self.text_corpus if self.text_corpus is not None else dataset

        # Create a generator that yields text from the corpus
        def text_iterator():
            for item in corpus:
                yield item['text']

        # Train new tokenizer
        tokenizer = ByteLevelBPETokenizer()
        tokenizer.train_from_iterator(
            text_iterator(),
            vocab_size=50265, # RoBERTa Vocab Size
            min_frequency=2,
            special_tokens=self.special_tokens
        )

        self._save_tokenizer(tokenizer, vocab_path, merges_path)
        return tokenizer

    @staticmethod
    def _save_tokenizer(tokenizer, vocab_path, merges_path):
        """Write the trained model's files to ``vocab_path`` and ``merges_path``."""
        import shutil
        import tempfile

        # save_model(directory, prefix=None) writes `vocab.json` and
        # `merges.txt` into `directory`. Write them to a scratch directory
        # first, then copy each file to its configured location so unrelated
        # files next to the targets are never clobbered.
        with tempfile.TemporaryDirectory() as scratch_dir:
            tokenizer.save_model(scratch_dir)
            for produced_name, target in (("vocab.json", vocab_path),
                                          ("merges.txt", merges_path)):
                target.parent.mkdir(parents=True, exist_ok=True)
                shutil.copyfile(str(Path(scratch_dir) / produced_name), str(target))

    def encode(self, text):
        """Encode ``text`` and return the underlying tokenizer's encoding object."""
        return self.tokenizer.encode(text)

    def decode(self, tokens):
        """Decode token ids back into text.

        Logs the offending tokens and re-raises if the underlying tokenizer
        fails to decode them.
        """
        try:
            return self.tokenizer.decode(tokens)
        except Exception:
            print(f"Error decoding tokens: {tokens}")
            raise

# Load Tokenizer

In [None]:
# Locations of the tokenizer's vocabulary and merges files.
tokenizer_vocab_path, tokenizer_merges_path = ".json", ".txt"

# Build the wrapper from the two paths above (load tokenizer or create new).
tokenizer = Tokenizer(
    tokenizer_vocab_path=tokenizer_vocab_path,
    tokenizer_merges_path=tokenizer_merges_path,
)

# Encode Text

In [None]:
# Three short sample sentences, held in a pandas Series.
text = pd.Series(["This is a tokenizer.", "This hopefully works.", "It should?"])

# Encode every sentence, in order, keeping the full encoding objects.
encoded_corpus = [tokenizer.encode(sentence) for sentence in text]

# Show the encoding of the second sentence.
encoded_corpus[1]

# Decode Text

In [None]:
# Decode each encoding back into text.
# Iterate over `encoded_corpus` itself rather than counting up to len(text):
# the list being indexed should define the loop bound, otherwise a changed or
# stale `text` leads to an IndexError or to silently skipped encodings.
decoded_text = [tokenizer.decode(encoding.ids) for encoding in encoded_corpus]

# Show the round-tripped second sentence.
decoded_text[1]