In [3]:
import os, json
from datasets import load_dataset


In [4]:
class Vocabulary:
    """Class to map codes from huggingface dataset to tokens in Llama 3-8B token"""

    def __init__(self):
        self.stoi = {}
        self.itos = {}
    
    def build_vocabulary(self, parquet_files, tokenizer_file="tokenizer.json"):
        '''
        creates the vocabulary from the Llama 3 tokenizer and hugging face dataset
        Args:
            tokenizer_file(str): file downloaded from Llama 3(8B) which contains the vocabulary for the model
            parquet_files(list): director with the dataset from hugging face in parquet format

        '''
        # Open the JSON file
        with open(tokenizer_file, 'r') as file:
            # Load the JSON data
            data = json.load(file)
        
        llama_stoi = data['model']['vocab']
        llama_itos = {value:key for key,value in llama_stoi.items()}

        #load hugging face data
        dataset = load_dataset('parquet', data_files=parquet_files)
        vocabulary = set()

        for sent in dataset["train"]["txt"]:
            for word in sent.split():
                vocabulary.add(word)
        
        self.itos = {int(value):llama_itos[int(value)] for value in vocabulary}
        self.stoi = {value:key for key,value in self.itos.items()}
    
    def save(self, file_path):
        with open(file_path, "w") as file:
            json.dump(self.itos, file)

In [5]:
os.chdir("..")

In [7]:
train_dir = [f"dataset/default/partial-train/000{i}.parquet" for i in range(10)]
dataset = load_dataset('parquet', data_files=train_dir)
txt = dataset["train"]["txt"]


In [9]:
type(txt)

list

## Applying BPE

using the following link for implimentation: https://huggingface.co/learn/nlp-course/en/chapter6/5

In [14]:
type(txt[0])

str

In [15]:
def add_special_character(corpus, special_char='▁'):
    modified_corpus = []
    for sentence in corpus:
        # Split the sentence into words
        words = sentence.split()
        # Add the special character to the beginning of each word
        modified_words = [special_char + word for word in words]
        # Join the modified words back into a sentence
        modified_sentence = ' '.join(modified_words)
        # Append the modified sentence to the new corpus
        modified_corpus.append(modified_sentence)
    return modified_corpus

# Example usage
corpus = ["this is a test", "another example sentence"]
modified_corpus = add_special_character(corpus)
for sentence in modified_corpus:
    print(sentence)


▁this ▁is ▁a ▁test
▁another ▁example ▁sentence


In [25]:
from collections import defaultdict

def add_special_character(corpus, special_char='▁'):
    modified_corpus = []
    for sentence in corpus:
        modified_sentence = ''
        words = []
        previous_char_is_space = False
        
        for char in sentence:
            if char == ' ':
                previous_char_is_space = True
                words.append(modified_sentence)
                modified_sentence = ''
            elif previous_char_is_space:
                modified_sentence += special_char + char
                previous_char_is_space = False
            else:
                modified_sentence += char
        
        modified_corpus.extend(words)
    
    return modified_corpus

# Example usage
corpus = [txt[0], txt[1], txt[2]]
modified_corpus = add_special_character(corpus)
word_freqs = defaultdict(int)
for word in modified_corpus:
    word_freqs[word] += 1


▁2029


In [26]:
word_freqs

defaultdict(int,
            {'896': 1,
             '▁2029': 4,
             '▁935': 2,
             '▁679': 4,
             '▁1115': 1,
             '▁3601': 1,
             '▁3000': 1,
             '▁222': 21,
             '▁3446': 2,
             '▁2218': 1,
             '▁3072': 1,
             '▁550': 3,
             '▁3652': 1,
             '▁665': 8,
             '▁2596': 5,
             '▁2809': 2,
             '▁3649': 1,
             '▁251': 6,
             '▁2610': 3,
             '▁2536': 2,
             '▁47': 1,
             '▁2852': 2,
             '▁2940': 2,
             '▁3353': 3,
             '▁3400': 1,
             '▁3336': 1,
             '▁325': 1,
             '▁2647': 4,
             '▁4076': 8,
             '▁3653': 1,
             '▁3253': 2,
             '▁58': 1,
             '▁3664': 2,
             '▁1424': 3,
             '▁1388': 1,
             '▁278': 3,
             '▁897': 1,
             '▁447': 4,
             '▁2355': 1,
             '▁2453': 4