In [3]:
import os, json
from datasets import load_dataset


In [4]:
class Vocabulary:
    """Maps the integer codes found in a Hugging Face dataset to Llama 3 (8B) token strings.

    Both mappings start empty and are filled in by ``build_vocabulary``:
        itos: integer token id -> token string
        stoi: token string -> integer token id
    """

    def __init__(self):
        self.stoi = {}
        self.itos = {}

    def build_vocabulary(self, parquet_files, tokenizer_file="tokenizer.json"):
        '''
        Creates the vocabulary from the Llama 3 tokenizer and a Hugging Face dataset.

        Only the ids that actually occur in the "txt" column of the dataset's
        "train" split are kept. Ids are stored in ascending order so that the
        mappings (and the file written by ``save``) are identical from run to run.

        Args:
            parquet_files: parquet data file(s) of the dataset; forwarded unchanged
                as ``data_files`` to ``load_dataset``
            tokenizer_file(str): file downloaded from Llama 3 (8B) which contains the
                vocabulary for the model under the ``model`` -> ``vocab`` keys

        Raises:
            ValueError: if a whitespace-separated item of a "txt" entry is not an integer
            KeyError: if an id from the dataset is not present in the tokenizer vocabulary
        '''
        # Read as UTF-8 explicitly: without it open() falls back to the platform's
        # locale encoding, which can mis-decode non-ASCII token strings.
        with open(tokenizer_file, 'r', encoding='utf-8') as file:
            data = json.load(file)

        # The tokenizer stores token -> id; invert it to look tokens up by id.
        # NOTE(review): assumes the ids in the file are integers — confirm.
        llama_stoi = data['model']['vocab']
        llama_itos = {token_id: token for token, token_id in llama_stoi.items()}

        # Load the Hugging Face data
        dataset = load_dataset('parquet', data_files=parquet_files)

        # Collect the distinct textual codes first so each one is parsed only once.
        codes = set()
        for sent in dataset["train"]["txt"]:
            codes.update(sent.split())

        # Sorting removes the run-to-run ordering differences of string sets.
        token_ids = sorted({int(code) for code in codes})

        self.itos = {token_id: llama_itos[token_id] for token_id in token_ids}
        self.stoi = {token: token_id for token_id, token in self.itos.items()}

    def save(self, file_path):
        """Writes ``itos`` to ``file_path`` as JSON; integer ids become string keys in the file."""
        with open(file_path, "w", encoding="utf-8") as file:
            json.dump(self.itos, file)

In [5]:
# Move the working directory up one level; the relative "dataset/..." paths used
# afterwards are resolved against it.
# NOTE(review): this is not idempotent — every re-run of the cell climbs one more
# level, so run it exactly once per kernel session.
os.chdir("..")

In [7]:
# Despite the name, this is a list of ten parquet file paths (0000 .. 0009),
# relative to the current working directory.
train_dir = [f"dataset/default/partial-train/000{i}.parquet" for i in range(10)]
dataset = load_dataset('parquet', data_files=train_dir)
# Pull the whole "txt" column of the "train" split.
txt = dataset["train"]["txt"]


In [9]:
# Check which container type the "txt" column was returned as.
type(txt)

list

In [11]:
# https://www.geeksforgeeks.org/byte-pair-encoding-bpe-in-nlp/

import re
from collections import defaultdict

def get_stats(vocab):
    """
    Count how often each pair of neighbouring symbols occurs in a vocabulary.

    Args:
        vocab: dict mapping a space-separated symbol sequence to its frequency.

    Returns:
        A defaultdict(int) keyed by (left, right) symbol tuples. Every
        side-by-side occurrence of a pair adds the frequency of the word
        it was found in.
    """
    pair_counts = defaultdict(int)
    for entry, count in vocab.items():
        tokens = entry.split()
        # Pair each symbol with its right-hand neighbour.
        for left, right in zip(tokens, tokens[1:]):
            pair_counts[left, right] += count
    return pair_counts

def merge_vocab(pair, v_in):
    """
    Merge one symbol pair everywhere it occurs in a vocabulary.

    Args:
        pair: tuple (left, right) of symbols to fuse into a single symbol.
        v_in: dict mapping a space-separated symbol sequence to its frequency.

    Returns:
        A new dict with the same frequencies, in which every occurrence of
        "left right" as two whole, adjacent symbols is replaced by "leftright".
        ``v_in`` itself is not modified.
    """
    merged = ''.join(pair)
    bigram = re.escape(' '.join(pair))
    # The lookarounds pin the match to whole symbols: it must start after
    # whitespace (or at the start) and end before whitespace (or at the end).
    p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')

    # A function replacement inserts `merged` literally. Passing it as a string
    # would make re.sub treat it as a template, so a symbol containing a
    # backslash would be altered or rejected as a bad escape.
    def literal_merge(_match):
        return merged

    v_out = {}
    for word, freq in v_in.items():
        v_out[p.sub(literal_merge, word)] = freq
    return v_out

def get_vocab(data):
    """
    Build the initial character-level vocabulary from a list of strings.

    Each whitespace-separated word is spelled out as its characters joined by
    single spaces, followed by the end-of-word marker ' </w>'.

    Args:
        data: iterable of strings.

    Returns:
        A defaultdict(int) mapping each spelled-out word to the number of
        times the word occurs in ``data``.
    """
    word_counts = defaultdict(int)  # word vocabulary
    all_words = (word for line in data for word in line.split())
    for word in all_words:
        spelled = ' '.join(word) + ' </w>'
        word_counts[spelled] += 1
    return word_counts

def byte_pair_encoding(data, n):
    """
    Apply up to n byte-pair-encoding merges to the vocabulary of the input data.

    Each round fuses the most frequent pair of adjacent symbols. The loop ends
    early once no adjacent pair is left (every word is already a single symbol,
    or ``data`` holds no words) instead of failing on an empty pair table.

    Args:
        data: list of strings to build the vocabulary from.
        n: maximum number of merge rounds.

    Returns:
        The vocabulary after merging: a dict mapping each word, written as a
        space-separated sequence of learned symbols, to its frequency.
    """
    vocab = get_vocab(data)
    for _ in range(n):
        pairs = get_stats(vocab)
        if not pairs:
            # Nothing left to merge; max() would raise ValueError here.
            break
        best = max(pairs, key=pairs.get)
        vocab = merge_vocab(best, vocab)
    return vocab

# Example usage:
corpus = '''Tokenization is the process of breaking down 
a sequence of text into smaller units called tokens,
which can be words, phrases, or even individual characters.
Tokenization is often the first step in natural languages processing tasks 
such as text classification, named entity recognition, and sentiment analysis.
The resulting tokens are typically used as input to further processing steps,
such as vectorization, where the tokens are converted
into numerical representations for machine learning models to use.'''
# Split on '.' into rough sentence chunks; the chunks keep their line breaks and
# the last one is empty because the corpus ends with a period.
data = corpus.split('.')

# Number of merge rounds to run.
n = 100
# Despite the name, this holds the merged vocabulary (symbol sequence -> count)
# returned by byte_pair_encoding, not a list of pairs.
bpe_pairs = byte_pair_encoding(data, n)
# Bare last expression so the notebook displays the result.
bpe_pairs


{'Tokenization</w>': 2,
 'is</w>': 2,
 'the</w>': 3,
 'process</w>': 1,
 'of</w>': 2,
 'breaking</w>': 1,
 'down</w>': 1,
 'a</w>': 1,
 'sequence</w>': 1,
 'text</w>': 2,
 'into</w>': 2,
 'smaller</w>': 1,
 'units</w>': 1,
 'called</w>': 1,
 'tokens,</w>': 1,
 'which</w>': 1,
 'can</w>': 1,
 'be</w>': 1,
 'w or d s,</w>': 1,
 'p h r as es ,</w>': 1,
 'or</w>': 1,
 'e v en</w>': 1,
 'in d i v i d u al</w>': 1,
 'ch ar a c te r s</w>': 1,
 'of t en</w>': 1,
 'f i r s t</w>': 1,
 'step </w>': 1,
 'in </w>': 1,
 'na t ur al</w>': 1,
 'l an g u a g e s</w>': 1,
 'processing</w>': 2,
 't as k s</w>': 1,
 'such</w>': 2,
 'as</w>': 3,
 'c l as s i f ic ation ,</w>': 1,
 'na m ed</w>': 1,
 'enti ty </w>': 1,
 're c o g ni tion ,</w>': 1,
 'an d</w>': 1,
 's enti m en t</w>': 1,
 'an al y s is</w>': 1,
 'T h e</w>': 1,
 'r es u l t ing</w>': 1,
 'tokens</w>': 2,
 'are</w>': 2,
 'ty p ic all y </w>': 1,
 'us ed</w>': 1,
 'in p u t</w>': 1,
 'to</w>': 2,
 'f ur th er</w>': 1,
 'step s,</w>': 1,
 '