# TRAIN SOME TOKENIZER

### Wikipedia (20GB)

In [1]:
from tqdm import tqdm
import os
cache_dir = "/shared/3/projects/hiatus/EVAL_wegmann/cache/huggingface"
os.environ["TRANSFORMERS_CACHE"] = cache_dir
os.environ["HF_DATASETS_CACHE"] = cache_dir
from transformers import AutoTokenizer




In [2]:
from datasets import load_dataset

In [3]:
dataset = load_dataset("wikipedia", "20220301.en", split="train")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Loading dataset shards:   0%|          | 0/41 [00:00<?, ?it/s]

In [4]:
# Calculate the total number of words in the dataset
total_words = 0
for article in dataset:
    total_words += len(article['text'].split())

print(f"Total number of words: {total_words}")

Total number of words: 3100865347


In [9]:
def get_training_corpus(text_handle="text"):
    train_data = dataset
    for i in tqdm(range(0, len(train_data), 1000), desc="Generating training corpus"):
        yield train_data[i: i + 1000][text_handle]


def fit_tokenizer(huggingface_dataset, vocab_size):
    old_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
    training_corpus = get_training_corpus()
    tokenizer = old_tokenizer.train_new_from_iterator(training_corpus, vocab_size=vocab_size, length=6459000)
    dir_name = f"./llama3-tokenizer-wikitext-raw/{vocab_size}"
    os.makedirs(dir_name, exist_ok=True)
    tokenizer.save_pretrained(f"{dir_name}")

In [14]:
fit_tokenizer(get_training_corpus(), 1000)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Generating training corpus: 100%|███████████████████████████████████████████████████████████████| 6459/6459 [17:35<00:00,  6.12it/s]







### "Messy" data: Twitter (10GB)

In [6]:
import bz2
import json
from tqdm import tqdm
file_path = '/nfs/locker/twitter-decahose-locker/2021/decahose.2021-12-01.p2.bz2'

In [7]:
def load_bz2_json_sample(file_path, num_lines=2):
    """
    Load the first few lines of a bz2 compressed JSON file.
    
    :param file_path: Path to the bz2 compressed JSON file.
    :param num_lines: Number of lines to read from the file.
    :return: A list of JSON objects from the file.
    """
    lines = []
    with bz2.open(file_path, 'rt') as f:
        for _ in range(num_lines):
            line = f.readline()
            if not line:
                break
            lines.append(json.loads(line))
    return lines
sample_data = load_bz2_json_sample(file_path)

# Display the first few lines of the JSON data
for line in sample_data:
    print(json.dumps(line, indent=4))

{
    "created_at": "Wed Dec 01 04:53:30 +0000 2021",
    "id": 1465906883013713921,
    "id_str": "1465906883013713921",
    "text": "@kozoudazou_ \u3081\u3063\u3061\u3083\u8912\u3081\u3066\u304f\u308c\u308b\u2026\uff84\uff69\uff9d\uff78(\uff8a\uff85\uff8e\uff7c\uff9e",
    "display_text_range": [
        13,
        34
    ],
    "source": "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>",
    "truncated": false,
    "in_reply_to_status_id": 1465905183993384966,
    "in_reply_to_status_id_str": "1465905183993384966",
    "in_reply_to_user_id": 846039760204288000,
    "in_reply_to_user_id_str": "846039760204288000",
    "in_reply_to_screen_name": "kozoudazou_",
    "user": {
        "id": 1301322691010392064,
        "id_str": "1301322691010392064",
        "name": "\u306f\u3044\u3060\u3057\u3087\u3046\u30b1\u30c4\u304a\u3070\u3055\u3093\u306f\u3071\u304a\u306e\u4e0b\u50d5",
        "screen_name": "ukyo3satomin",
        "location": null,
       

In [8]:
def load_bz2_json_batch(file_path, batch_size=1000, total_lines=6459000):
    """
    Load a bz2 compressed JSON file in batches.
    
    :param file_path: Path to the bz2 compressed JSON file.
    :param batch_size: Number of lines to read in each batch.
    :param total_lines: Total number of lines to read from the file.
    :return: A generator yielding batches of JSON objects.
    """
    with bz2.open(file_path, 'rt') as f:
        batch = []
        for i, line in enumerate(f):
            if i >= total_lines:
                break
            batch.append(json.loads(line))
            if len(batch) == batch_size:
                yield batch
                batch = []
        if batch:
            yield batch


In [9]:
def get_training_corpus_bz2(file_path, batch_size=1000, text_handle="text"):
    """
    Generate training corpus from the dataset in batches.
    
    :param file_path: Path to the bz2 compressed JSON file.
    :param batch_size: Number of lines to read in each batch.
    :param text_handle: Key to access text data in JSON objects.
    :return: A generator yielding text data in chunks.
    """
    for batch in load_bz2_json_batch(file_path, batch_size):
        for item in batch:
            yield item[text_handle]

In [5]:
from transformers import AutoTokenizer
def fit_tokenizer(file_path, vocab_size, batch_size=1000, text_handle="text"):
    """
    Train a new tokenizer on the given dataset.
    
    :param file_path: Path to the bz2 compressed JSON file.
    :param vocab_size: Size of the vocabulary.
    :param batch_size: Number of lines to read in each batch.
    :param text_handle: Key to access text data in JSON objects.
    """
    old_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
    training_corpus = get_training_corpus_bz2(file_path, batch_size, text_handle)
    tokenizer = old_tokenizer.train_new_from_iterator(tqdm(training_corpus, desc="Fitting Tokenizer", unit="batch"), vocab_size=vocab_size)
    dir_name = f"./llama3-tokenizer-twitter-raw/{vocab_size}"
    os.makedirs(dir_name, exist_ok=True)
    tokenizer.save_pretrained(f"{dir_name}")

In [11]:
# Calculate the total number of words in the dataset
def calculate_total_words(file_path):
    total_words = 0
    for text in get_training_corpus_bz2(file_path):
        total_words += len(text.split())
    return total_words
print(calculate_total_words(file_path))

66339470


In [None]:
# Example usage

# Fit the tokenizer on the loaded dataset
fit_tokenizer(file_path, vocab_size=30522, batch_size=1000, text_handle="text")  # Adjust vocab_size and text_handle as needed


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Fitting Tokenizer: 0batch [00:00, ?batch/s]

### SADIRI dataset

In [12]:
# check the overlap between candidate and query files
from datasets import load_from_disk
directory_path = '/shared/3/projects/hiatus/aggregated_trainset_v2/content_masking_research/down_1_shuffle/train'

In [None]:
# Load the dataset
dataset = load_from_disk(directory_path)

# Display the first few rows of the train split
print(dataset['train'].head())