### Paloma Dataset Creation for PICO-LM

This notebook processes the validation (`val`) split of the Paloma dataset from Allen AI to create two tokenized versions, each packed into fixed-length sequences of `SEQ_LEN` tokens:
1. `pretokenized-paloma`: The complete tokenized dataset
2. `pretokenized-paloma-tinsy`: A smaller subset containing up to 100 examples from each source

In [1]:
# Defining basic constants: names of the datasets and the sequence length

# One entry per Paloma config; each name is passed to
# `load_dataset("allenai/paloma", <name>, ...)` in the processing loop.
# NOTE(review): "dolma_100_programing_languages" is spelled with a single "m";
# presumably this mirrors the upstream config name -- confirm before "fixing" it.
config_names = [
    "4chan_meta_sep",
    "c4_100_domains",
    "c4_en",
    "dolma_100_programing_languages",
    "dolma_100_subreddits",
    "dolma-v1_5",
    "falcon-refinedweb",
    "gab",
    "m2d2_s2orc_unsplit",
    "m2d2_wikipedia_unsplit",
    "manosphere_meta_sep",
    "mc4",
    "ptb",
    "redpajama",
    "twitterAAE_HELM_fixed",
    "wikitext_103"
]

# Length, in tokens, of every chunk emitted by `tokenize_and_chunk`.
SEQ_LEN = 2048

In [27]:
# Importing necessary libraries
from datasets import load_dataset, concatenate_datasets
import os

from transformers import AutoTokenizer

# NOTE: this is the same tokenizer we use for the dolma preprocessing
# The instance lives at module level because `tokenize_and_chunk` reads the
# global name `tokenizer` directly (encode / decode / eos_token_id).
tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-7b-0724-hf")


In [8]:
# Setting up environment variables for HuggingFace token
def load_env_file(path='.env'):
    """Load ``KEY=VALUE`` pairs from a dotenv-style file into ``os.environ``.

    Parsing rules:
      * blank lines and comment lines (``#``, even when indented) are skipped;
      * lines without an ``=`` are skipped instead of raising ``ValueError``;
      * whitespace around the key and the value is removed;
      * an optional leading ``export `` on the key is dropped;
      * one pair of matching surrounding quotes (``"`` or ``'``) is removed
        from the value, so ``HF_TOKEN="abc"`` yields ``abc``.

    Existing environment variables with the same name are overwritten.

    Args:
        path: Location of the env file. If it does not exist nothing is
            loaded, so variables already exported in the shell keep working.

    Returns:
        list[str]: Names of the variables that were set, in file order.
        Values are deliberately not returned so that a token cannot end up
        in the notebook's cell output.
    """
    loaded_keys = []
    if not os.path.isfile(path):
        return loaded_keys

    with open(path, 'r', encoding='utf-8') as env_file:
        for raw_line in env_file:
            line = raw_line.strip()
            # Test the *stripped* line so indented comments are recognised.
            if not line or line.startswith('#') or '=' not in line:
                continue

            key, value = line.split('=', 1)
            key, value = key.strip(), value.strip()
            if key.startswith('export '):
                key = key[len('export '):].strip()
            if not key:
                continue

            # Drop one pair of matching surrounding quotes.
            if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
                value = value[1:-1]

            os.environ[key] = value
            loaded_keys.append(key)

    return loaded_keys


# Assigning the result keeps the cell from echoing anything.
loaded_env_keys = load_env_file('.env')

In [22]:
def tokenize_and_chunk(examples, seq_len=None, tok=None):
    """Tokenize a batch of documents and pack them into fixed-length chunks.

    Each text in the batch is encoded and followed by one EOS token. The
    per-document token lists are concatenated into a single stream, which is
    cut into consecutive, non-overlapping windows of ``seq_len`` tokens. A
    trailing window shorter than ``seq_len`` is dropped, so every returned
    chunk has exactly ``seq_len`` tokens. Chunks can span document
    boundaries within the batch.

    Args:
        examples: Batch dict holding parallel lists under ``'text'`` and
            ``'source'``. Only the first ``'source'`` entry is used; it is
            applied to every chunk produced from the batch.
        seq_len: Chunk length in tokens. Defaults to the module-level
            ``SEQ_LEN``.
        tok: Object providing ``encode(text)``, ``decode(ids)`` and
            ``eos_token_id``. Defaults to the module-level ``tokenizer``.

    Returns:
        dict with three parallel lists: ``'input_ids'`` (the token chunks),
        ``'text'`` (each chunk decoded back to a string) and ``'source'``.
        All three are empty when the batch is empty or holds fewer than
        ``seq_len`` tokens.

    Raises:
        ValueError: If ``seq_len`` is not a positive integer.
    """
    # Resolve defaults at call time so the function can be defined before the
    # globals exist and can be called with explicit overrides.
    seq_len = SEQ_LEN if seq_len is None else seq_len
    tok = tokenizer if tok is None else tok
    if seq_len <= 0:
        raise ValueError(f"seq_len must be a positive integer, got {seq_len!r}")

    batch_texts = examples['text']
    if len(batch_texts) == 0:
        # An empty batch has no source to read; return empty columns rather
        # than failing on examples['source'][0].
        return {'input_ids': [], 'text': [], 'source': []}

    source = examples['source'][0]  # NOTE: only one source per dataset

    tokens = []
    for text in batch_texts:
        tokens.extend(tok.encode(text))
        tokens.append(tok.eos_token_id)

    # Only build (and decode) the full windows; the short tail is never
    # materialised, so no decode work is spent on a chunk that is discarded.
    n_full = len(tokens) // seq_len
    chunks = [tokens[i * seq_len:(i + 1) * seq_len] for i in range(n_full)]
    texts = [tok.decode(chunk) for chunk in chunks]
    sources = [source] * n_full

    return {'input_ids': chunks, 'text': texts, 'source': sources}

In [None]:
# Two accumulators, one per published dataset:
#   all_datasets        -> every tokenized chunk (pretokenized-paloma)
#   subsampled_datasets -> at most 100 chunks per config (pretokenized-paloma-tinsy)
all_datasets, subsampled_datasets = [], []

# NOTE: Paloma is split across several configs, so each one is loaded separately
for config_name in config_names:
    val_split = load_dataset(
        "allenai/paloma",
        config_name,
        split='val',
        token=os.environ["HF_TOKEN"],
    )
    # Fixed seed so the document order is reproducible
    val_split_shuffled = val_split.shuffle(seed=42)

    # Tokenize and pack into chunks; the original columns are removed and
    # replaced by the columns returned from tokenize_and_chunk.
    # NOTE(review): batch boundaries presumably depend on batch_size and on how
    # num_proc shards the data, so changing either may change the packed output.
    chunked = val_split_shuffled.map(
        tokenize_and_chunk,
        batched=True,
        batch_size=100,
        num_proc=70,
        keep_in_memory=True,
        remove_columns=val_split_shuffled.column_names,
    )
    all_datasets.append(chunked)

    # Keep at most 100 chunks, drawn after a seeded reshuffle, for the small variant
    n_keep = min(100, len(chunked))
    reshuffled = chunked.shuffle(seed=42)
    subsampled_datasets.append(reshuffled.select(range(n_keep)))

# Merge the per-config pieces into the two final datasets
combined_all_datasets = concatenate_datasets(all_datasets)
combined_subsampled_dataset = concatenate_datasets(subsampled_datasets)

In [None]:
# Pushing the datasets to the hub (small subset first, then the full dataset)
for repo_id, dataset in (
    ("pico-lm/pretokenized-paloma-tinsy", combined_subsampled_dataset),
    ("pico-lm/pretokenized-paloma", combined_all_datasets),
):
    dataset.push_to_hub(repo_id, token=os.environ["HF_TOKEN"], split="val")