# 📚 Data

This notebook downloads, cleans, subsamples and publishes the datasets used in this experiment suite.

## Setup 

In [None]:
import autorootcwd

In [None]:
from typing import Dict, Tuple, Any
from tqdm import tqdm

import pandas as pd
from transformers import AutoTokenizer
from datasets import DatasetDict, Dataset, load_dataset
from src.utils import format_int

## WikiText 2

For now, we will use a tiny dataset, the `wikitext-2-raw-v1` configuration of `Salesforce/wikitext`. It has train, validation and test splits that consist of 36.7K, 3.8K and 4.4K examples, respectively.

In [None]:
# Load the raw WikiText 2 splits (downloads are cached under /workspace/huggingface)
wiki = load_dataset(
    "Salesforce/wikitext",
    "wikitext-2-raw-v1",
    cache_dir="/workspace/huggingface",
)
train_wiki = wiki["train"]
val_wiki = wiki["validation"]
test_wiki = wiki["test"]

print(f"Loaded {len(train_wiki)/1e3:.1f}K training, {len(val_wiki)/1e3:.1f}K validation and {len(test_wiki)/1e3:.1f}K test examples.")

A single example just has a `text` field, which contains a single line of text. The lines are parsed from high-quality Wikipedia articles. We can already see that there are loads of empty lines and other artifacts like headlines.

In [None]:
# Peek at the first five raw training examples
for raw_example in train_wiki.take(5):
    print(raw_example)

We are going to remove empty lines and headlines, and strip leading and trailing whitespace from the remaining lines.

In [None]:
def non_empty_text(examples: Dict[str, Any]) -> bool:
    """Return True when the example's `text` field has non-whitespace content.

    The text is stripped before the comparison so that whitespace-only lines
    are rejected as well; otherwise they would pass this filter and become
    empty strings once surrounding whitespace is stripped later on.

    Args:
        examples: A single example with a string `text` field.

    Returns:
        False for empty or whitespace-only text, True otherwise.
    """
    return examples["text"].strip() != ""

def non_headline(examples: Dict[str, Any]) -> bool:
    """Return True unless the example's `text` begins with the marker " = "."""
    text = examples["text"]
    is_headline = text.startswith(" = ")
    return not is_headline

def strip_headline(examples: Dict[str, Any]) -> Dict[str, Any]:
    """Strip leading and trailing whitespace from the example's `text` field.

    The example is updated in place and the same mapping is returned.
    """
    stripped = examples["text"].strip()
    examples["text"] = stripped
    return examples

In [None]:
# Run the same cleaning pipeline over every split: drop empty lines, drop
# headlines, then strip surrounding whitespace from what remains.
train_wiki_processed, val_wiki_processed, test_wiki_processed = (
    raw_split.filter(non_empty_text).filter(non_headline).map(strip_headline)
    for raw_split in (train_wiki, val_wiki, test_wiki)
)

print(f"Processed {len(train_wiki_processed)/1e3:.1f}K training, {len(val_wiki_processed)/1e3:.1f}K validation and {len(test_wiki_processed)/1e3:.1f}K test examples.")

In [None]:
# Peek at the first five cleaned training examples
for processed_example in train_wiki_processed.take(5):
    print(processed_example)

Looks good! Let's get some statistics on the processed dataset.

In [None]:
# Dataset statistics
def get_num_examples(dataset):
    """Return the number of examples in `dataset`."""
    return len(dataset)


def get_num_tokens(dataset, tokenizer):
    """Return the summed length of `tokenizer.encode(...)` over every example's `text`."""
    return sum(len(tokenizer.encode(example['text'])) for example in dataset)


# GPT-2 and Llama 3 tokenizers
gpt2_tokenizer = AutoTokenizer.from_pretrained("gpt2")
llama3_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")

# Display label -> processed split, in the row order of the table below
wiki_splits = {
    'Train': train_wiki_processed,
    'Validation': val_wiki_processed,
    'Test': test_wiki_processed,
}

# Columns are built as concrete lists rather than lazy `map` iterators
stats = pd.DataFrame({
    'Split': list(wiki_splits),
    'Examples': [format_int(get_num_examples(split)) for split in wiki_splits.values()],
    'GPT-2 Tokens': [format_int(get_num_tokens(split, gpt2_tokenizer)) for split in wiki_splits.values()],
    'Llama-3 Tokens': [format_int(get_num_tokens(split, llama3_tokenizer)) for split in wiki_splits.values()],
}).set_index('Split')

stats

Finally, let's push the processed datasets to the Hugging Face Hub.

In [None]:
# Push to Hugging Face Hub
repo_name = "wikitext-2"

# Bundle the three cleaned splits under their conventional split names
data = DatasetDict({
    'train': train_wiki_processed,
    'validation': val_wiki_processed,
    'test': test_wiki_processed,
})
data.push_to_hub(repo_id=repo_name)

print(f"Pushed to https://huggingface.co/datasets/mikasenghaas/{repo_name}")

## FinewebEdu

The [FinewebEdu](https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu) dataset is a large-scale pre-training dataset developed by the Hugging Face team. The smaller version consists of 1.3T high-quality tokens that have been filtered for educational quality using a classifier trained on annotations generated by Llama 3 70B Instruct.

We are going to use the 10BT version of the dataset:
- 9.67M Examples
- 10.1B (GPT-2) Tokens

In [None]:
# Load the train split of the FinewebEdu 10BT sample (cached under /workspace/huggingface)
finewebedu_10bt = load_dataset(
    "HuggingFaceFW/fineweb-edu",
    "sample-10BT",
    split="train",
    cache_dir="/workspace/huggingface",
)

print(f"Loaded {len(finewebedu_10bt)/1e6:.1f}M training examples.")

In [None]:
def train_val_test_split(dataset: Dataset) -> Tuple[Dataset, Dataset, Dataset]:
    """Partition `dataset` into train (80%), validation (10%) and test (10%) parts.

    The dataset is first split 80/20; the 20% hold-out is then halved into the
    validation and test parts. Both splits shuffle with the fixed seed 42.

    Returns:
        A `(train, validation, test)` tuple.
    """
    split_options = dict(shuffle=True, seed=42)
    outer = dataset.train_test_split(test_size=0.2, **split_options)
    inner = outer['test'].train_test_split(test_size=0.5, **split_options)
    return outer['train'], inner['train'], inner['test']

In [None]:
# Randomly sample 12.5% of FinewebEdu 10BT (1.25BT); after the 80/10/10 split
# the training part is ~10% of 10BT, hence the "1bt" name.
finewebedu_125pct = finewebedu_10bt.shuffle(seed=42).select(range(int(len(finewebedu_10bt) * 0.125)))
train_dataset, eval_dataset, test_dataset = train_val_test_split(finewebedu_125pct)

finewebedu_1bt = DatasetDict({
    'train': train_dataset,
    'validation': eval_dataset,
    'test': test_dataset
})

# The reported fraction matches the 0.125 sampling ratio used above
print(f"Sampled {len(finewebedu_1bt['train'])/1e3:.1f}K training, {len(finewebedu_1bt['validation'])/1e3:.1f}K validation and {len(finewebedu_1bt['test'])/1e3:.1f}K test examples in 12.5% of 10BT.")

In [None]:
# Randomly sample 10% of FinewebEdu 1.25BT, i.e. 1.25% of 10BT; after the
# 80/10/10 split the training part is ~1% of 10BT, hence the "100mt" name.
finewebedu_1_25pct = finewebedu_125pct.shuffle(seed=42).select(range(int(len(finewebedu_125pct) * 0.1)))
train_dataset, eval_dataset, test_dataset = train_val_test_split(finewebedu_1_25pct)

finewebedu_100mt = DatasetDict({
    'train': train_dataset,
    'validation': eval_dataset,
    'test': test_dataset
})

# 10% of the 12.5% sample is 1.25% of the 10BT dataset
print(f"Sampled {len(finewebedu_100mt['train'])/1e3:.1f}K training, {len(finewebedu_100mt['validation'])/1e3:.1f}K validation and {len(finewebedu_100mt['test'])/1e3:.1f}K test examples in 1.25% of 10BT.")

In [None]:
# Dataset statistics
def get_num_examples(dataset):
    """Return the number of examples in `dataset`."""
    return len(dataset)


def get_num_tokens(dataset, tokenizer):
    """Return the summed length of `tokenizer.encode(...)` over every example's `text`."""
    return sum(len(tokenizer.encode(example['text'])) for example in dataset)


# GPT-2 and Llama 3 tokenizers
gpt_tokenizer = AutoTokenizer.from_pretrained("gpt2")
llama3_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")

# Function to calculate average tokens per example
def calc_avg_tokens(dataset, tokenizer, subset=0.1):
    """Estimate the mean number of tokens per example from a random sample.

    A fraction `subset` of `dataset` is drawn (shuffled with seed 42),
    tokenized with `tokenizer`, and the token total is divided by the
    number of sampled examples.

    Args:
        dataset: Dataset whose examples have a `text` field.
        tokenizer: Tokenizer exposing `encode`.
        subset: Fraction of the dataset to sample.

    Returns:
        Average tokens per sampled example, or 0.0 for an empty dataset.
    """
    total_examples = len(dataset)
    if total_examples == 0:
        # Nothing to sample from; avoid dividing by zero below
        return 0.0

    # Keep at least one example (small datasets would otherwise round down to
    # zero) and never request more examples than exist.
    subset_size = min(total_examples, max(1, int(total_examples * subset)))
    subset_data = dataset.shuffle(seed=42).select(range(subset_size))
    return get_num_tokens(subset_data, tokenizer) / subset_size

# Calculate average tokens for each dataset
datasets = {'1BT': finewebedu_1bt, '100MT': finewebedu_100mt}

# Tokens-per-example is estimated once, from the 100MT training split, and
# reused to extrapolate token counts for every split of both datasets.
avg_gpt2_tokens = calc_avg_tokens(finewebedu_100mt['train'], gpt_tokenizer)
avg_llama3_tokens = calc_avg_tokens(finewebedu_100mt['train'], llama3_tokenizer)

stats_data = []
for name, dataset in datasets.items():
    for split in tqdm(dataset.keys()):
        num_examples = get_num_examples(dataset[split])
        row = {
            'Dataset': name,
            'Split': split,
            'Examples': format_int(num_examples),
            'GPT-2 Tokens': format_int(num_examples * avg_gpt2_tokens),
            'Llama-3 Tokens': format_int(num_examples * avg_llama3_tokens),
        }
        stats_data.append(row)

stats = pd.DataFrame(stats_data).set_index(['Dataset', 'Split'])
stats

Nice, the training splits contain roughly 1B and 100M GPT-2 tokens, i.e. about 10% and 1% of the 10BT sample, respectively. Let's upload the processed datasets to the Hugging Face Hub.

In [None]:
# Upload the 100MT subset to the Hugging Face Hub
repo_name = "fineweb-edu-100mt"
finewebedu_100mt.push_to_hub(repo_id=repo_name)

print(f"Pushed to https://huggingface.co/datasets/mikasenghaas/{repo_name}")

In [None]:
# Upload the 1BT subset to the Hugging Face Hub
repo_name = "fineweb-edu-1bt"
finewebedu_1bt.push_to_hub(repo_id=repo_name)

print(f"Pushed to https://huggingface.co/datasets/mikasenghaas/{repo_name}")