In [1]:
# Dataset configurations to process. Each entry is passed verbatim as the
# config name to load_dataset("allenai/paloma", ...) in the loop that builds
# the tokenized datasets, and the list order is the order in which the
# per-config results are concatenated.
config_names = [
    "4chan_meta_sep",
    "c4_100_domains",
    "c4_en",
    # NOTE(review): "programing" spelling is used verbatim as the config
    # name — presumably it matches the name on the Hub; confirm before "fixing".
    "dolma_100_programing_languages",
    "dolma_100_subreddits",
    "dolma-v1_5",
    "falcon-refinedweb",
    "gab",
    "m2d2_s2orc_unsplit",
    "m2d2_wikipedia_unsplit",
    "manosphere_meta_sep",
    "mc4",
    "ptb",
    "redpajama",
    "twitterAAE_HELM_fixed",
    "wikitext_103"
]

In [14]:
# Number of token ids per chunk produced by tokenize_and_chunk.
SEQ_LEN = 2048

In [27]:
from datasets import load_dataset, concatenate_datasets
import os

from transformers import AutoTokenizer
# Shared tokenizer used by tokenize_and_chunk to encode the raw text and to
# decode each chunk back to text; loaded from "allenai/OLMo-7b-0724-hf".
tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-7b-0724-hf")


In [8]:
# Read the .env file and set environment variables
def load_env_file(path='.env'):
    """Copy KEY=VALUE pairs from a dotenv-style file into ``os.environ``.

    Parsing rules:
      * blank lines and comment lines (``#``, optionally indented) are skipped;
      * lines without ``=`` or with an empty key are skipped instead of
        raising ``ValueError``;
      * the line is split on the first ``=`` only, so values may contain ``=``;
      * whitespace around the key and the value is removed;
      * one pair of matching surrounding quotes (``"`` or ``'``) is removed
        from the value.

    Existing environment variables with the same name are overwritten.
    If ``path`` does not exist nothing is changed, so variables that were
    exported in the shell are still usable.

    Args:
        path: Location of the env file. Defaults to ``.env`` in the current
            working directory.

    Returns:
        None. The loaded values are deliberately not returned so that a
        notebook cell ending in this call does not display secrets.
    """
    if not os.path.isfile(path):
        return

    with open(path, 'r') as f:
        for raw_line in f:
            line = raw_line.strip()
            # Test the stripped line so indented comments are ignored too.
            if not line or line.startswith('#'):
                continue

            key, sep, value = line.partition('=')
            key = key.strip()
            if not sep or not key:
                # Not a KEY=VALUE line; ignore it rather than crash.
                continue

            value = value.strip()
            # Drop one pair of matching quotes: KEY="abc" -> abc
            if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
                value = value[1:-1]

            os.environ[key] = value


load_env_file()

In [22]:
def tokenize_and_chunk(examples):
    """Tokenizes a batch of texts and packs them into SEQ_LEN-token chunks.

    Each text is encoded with the module-level ``tokenizer`` and followed by
    the tokenizer's EOS token id. The ids of all texts in the batch are
    concatenated into one stream, which is cut into consecutive windows of
    exactly ``SEQ_LEN`` ids. A trailing remainder shorter than ``SEQ_LEN`` is
    dropped, so the tail of every batch is discarded.

    Args:
        examples: Batch mapping with a 'text' list and a 'source' list, as
            passed by ``Dataset.map(..., batched=True)``.

    Returns:
        A dict of three equally long lists:
          * 'input_ids': the full-length chunks of token ids,
          * 'text': each chunk decoded back to a string,
          * 'source': the batch's source value repeated once per chunk.
        All three lists are empty when the batch is empty or holds fewer
        than ``SEQ_LEN`` tokens.
    """
    batch_texts = examples['text']
    if len(batch_texts) == 0:
        # Nothing to tokenize; avoid indexing into empty columns below.
        return {'input_ids': [], 'text': [], 'source': []}

    source = examples['source'][0] # NOTE: only one source per dataset

    tokens = []
    for text in batch_texts:
        tokens.extend(tokenizer.encode(text))
        tokens.append(tokenizer.eos_token_id)

    # Build only the complete chunks, so the incomplete tail is never
    # sliced or decoded just to be thrown away.
    num_full_chunks = len(tokens) // SEQ_LEN
    chunks = [
        tokens[start:start + SEQ_LEN]
        for start in range(0, num_full_chunks * SEQ_LEN, SEQ_LEN)
    ]
    texts = [tokenizer.decode(chunk) for chunk in chunks]
    sources = [source] * len(chunks)

    return {'input_ids': chunks, 'text': texts, 'source': sources}

In [33]:
# Per-config results: the full tokenized datasets and their capped subsamples
all_datasets = []
subsampled_datasets = []

for config_name in config_names:
    sub_dataset = load_dataset(
        "allenai/paloma",
        config_name,
        split='val',
        token=os.environ["HF_TOKEN"],
    )
    # Seeded shuffle so the row order is the same on every run
    shuffled_sub_dataset = sub_dataset.shuffle(seed=42)

    # Replace the original columns with the output of tokenize_and_chunk
    tokenized_sub_dataset = shuffled_sub_dataset.map(
        tokenize_and_chunk,
        batched=True,
        batch_size=100,
        num_proc=70,
        keep_in_memory=True,
        remove_columns=shuffled_sub_dataset.column_names,
    )
    all_datasets.append(tokenized_sub_dataset)

    # Keep at most 100 rows of this config, drawn after another seeded shuffle
    n_keep = min(100, len(tokenized_sub_dataset))
    subsampled_dataset = tokenized_sub_dataset.shuffle(seed=42).select(range(n_keep))
    subsampled_datasets.append(subsampled_dataset)

# Merge the per-config pieces into the two final datasets
combined_all_datasets = concatenate_datasets(all_datasets)
combined_subsampled_dataset = concatenate_datasets(subsampled_datasets)

Using the latest cached version of the dataset since allenai/paloma couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration '4chan_meta_sep' at /home/rd654/.cache/huggingface/datasets/allenai___paloma/4chan_meta_sep/0.0.0/65cd6fc59dba021b21db414fa5e8d7765ffbe5e6 (last modified on Sun Dec  1 16:30:34 2024).
Map (num_proc=70): 100%|██████████| 523/523 [00:02<00:00, 261.03 examples/s]
Using the latest cached version of the dataset since allenai/paloma couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'c4_100_domains' at /home/rd654/.cache/huggingface/datasets/allenai___paloma/c4_100_domains/0.0.0/65cd6fc59dba021b21db414fa5e8d7765ffbe5e6 (last modified on Sun Dec  1 16:30:36 2024).
Map (num_proc=70): 100%|██████████| 14059/14059 [00:14<00:00, 937.92 examples/s] 
Using the latest cached version of the dataset since allenai/paloma couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'c4_e

Dataset({
    features: ['text', 'source', 'input_ids'],
    num_rows: 1435
})
Dataset({
    features: ['text', 'source', 'input_ids'],
    num_rows: 29016
})


In [34]:
# Upload both datasets to the Hub under the "val" split
hf_token = os.environ["HF_TOKEN"]
combined_subsampled_dataset.push_to_hub(
    "pico-lm/pretokenized-paloma-tinsy",
    token=hf_token,
    split="val",
)
combined_all_datasets.push_to_hub(
    "pico-lm/pretokenized-paloma",
    token=hf_token,
    split="val",
)

Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 11.05ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.65s/it]
Creating parquet from Arrow format: 100%|██████████| 30/30 [00:02<00:00, 10.02ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:11<00:00, 11.59s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/pico-lm/pretokenized-paloma/commit/83e6d221ec4f08ad4c7b1f8fe80a61b0f1e370c8', commit_message='Upload dataset', commit_description='', oid='83e6d221ec4f08ad4c7b1f8fe80a61b0f1e370c8', pr_url=None, pr_revision=None, pr_num=None)