# Basic data cleaning and tokenization

## Cleaning

Some simple, regex-based cleaning is performed on train and dev datasets, e.g. to remove HTML tags from Wikipedia articles, non-verbal cues from subtitles, or even to correct I’s that were incorrectly recognized as l’s in OCR’ed uppercase text.

In [1]:
from pathlib import Path
from mrclean import *
from mrclean import cleanup_tinystories

In [2]:
DATA_ROOT = Path("./")
SEQ_LENGTH = 128 # this is a legacy parameter, it does not affect cleaning
DATA_SPLITS = ['babylm_10M', 'babylm_dev']

CLEANUP_FUNCTIONS = {
    # 'aochildes': cleanup_aochildes,
    'childes': cleanup_childes,
    'bnc_spoken': cleanup_bnc_spoken,
    'cbt': cleanup_cbt,
    'tinystories': cleanup_tinystories,
    'children_stories': cleanup_qed,
    'gutenberg': cleanup_gutenberg,
    'open_subtitles': cleanup_open_subtitles,
    # 'qed': cleanup_qed,
    'simple_wiki': cleanup_simple_wikipedia,
    'switchboard': cleanup_switchboard,
    'wikipedia': cleanup_wikipedia,
}


In [3]:
for split in DATA_SPLITS:
    INPUT_DIR = DATA_ROOT / 'data_v2' / split
    OUTPUT_DIR = DATA_ROOT / 'data_v2' / f'{split}_clean'
    
    OUTPUT_DIR.mkdir(exist_ok=True)

    train_files = [f for f in INPUT_DIR.iterdir() if f.is_file() and f.suffix in ['.train', '.dev']]
    
    for file in train_files:
        if file.name == "aochildes.train":
            print("skip AOCHILDES")
            continue
        text = file.read_text()
        cleaned_text = CLEANUP_FUNCTIONS[file.stem](text, SEQ_LENGTH)
        (OUTPUT_DIR / file.name).write_text(cleaned_text)
        print(f"🧹 Cleaned '{file.name}' (size {len(text)} -> {len(cleaned_text)}) in {split}")


🧹 Cleaned 'open_subtitles.train' (size 10806305 -> 10804026) in babylm_10M
🧹 Cleaned 'bnc_spoken.train' (size 4883879 -> 4851676) in babylm_10M
🧹 Cleaned 'gutenberg.train' (size 13910986 -> 13910986) in babylm_10M
🧹 Cleaned 'childes.train' (size 15482927 -> 10730733) in babylm_10M
🧹 Cleaned 'simple_wiki.train' (size 8411630 -> 8387062) in babylm_10M
🧹 Cleaned 'tinystories.train' (size 50673985 -> 49807604) in babylm_10M
🧹 Cleaned 'switchboard.train' (size 719322 -> 719322) in babylm_10M
🧹 Cleaned 'simple_wiki.dev' (size 8149513 -> 8128239) in babylm_dev
🧹 Cleaned 'childes.dev' (size 14638378 -> 10348627) in babylm_dev
🧹 Cleaned 'switchboard.dev' (size 724013 -> 724013) in babylm_dev
🧹 Cleaned 'open_subtitles.dev' (size 11016133 -> 11014854) in babylm_dev
🧹 Cleaned 'gutenberg.dev' (size 15490473 -> 15490473) in babylm_dev
🧹 Cleaned 'bnc_spoken.dev' (size 6538139 -> 6503778) in babylm_dev


## Training a tokenizer

In [4]:
from pathlib import Path
from tokenizers import (Tokenizer, decoders, models, pre_tokenizers,
                        processors, trainers)
from tokenizers.normalizers import NFKC

In [5]:
# We train the tokenizer on the train data only
data_dir = Path("./data_hm/babylm_10M_clean/tiny_stories_10M_clean")

paths = [str(f) for f in data_dir.glob("*") if f.is_file() and not f.name.endswith(".DS_Store") and f.suffix in [".train"]]

# paths
print(len(paths))
assert len(paths) > 0, 'No data files found'

1


In [6]:
tokenizer = Tokenizer(models.BPE())

tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
tokenizer.decoder = decoders.ByteLevel()
tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
tokenizer.normalizer = NFKC()

In [7]:
trainer = trainers.BpeTrainer(vocab_size=16000, min_frequency=2, special_tokens=["<pad>", "<s>", "</s>"])
tokenizer.train(paths, trainer)






In [112]:
tokenizer_path =  DATA_ROOT / "data_hm/gpt-clean-16000-tiny-stories-attempt3.json"
tokenizer.save(str(tokenizer_path), pretty=True)

## Testing the tokenizer

In [54]:

tokenizer = Tokenizer.from_file(str(tokenizer_path))


# text = 'Shiro Okada (岡田志郎, "Okada Shirō", June 9, 1949; Hirakata, Osaka {age 71} - ) is a Japanese guitarist who participate in the Group Sound band, the Ox. His nickname was Shiro (シロー) and his real name is Shiro Okamoto (岡田史郎).'
text = "The quick brown fox jumps over the lazy dog."

encoded = tokenizer.encode(text)
print(f"Encoded String: {encoded.tokens}")

print(f"Encoded IDs: {encoded.ids}")

decoded = tokenizer.decode(encoded.ids)
print(f"Decoded String: {decoded}")


Encoded String: ['ĠThe', 'Ġquick', 'Ġbrown', 'Ġfox', 'Ġj', 'umps', 'Ġover', 'Ġthe', 'Ġlazy', 'Ġdog', '.']
Encoded IDs: [279, 1752, 3702, 5473, 349, 7450, 543, 187, 11684, 1996, 16]
Decoded String:  The quick brown fox jumps over the lazy dog.


In [38]:
file_path = 'data/babylm_10M_clean/aochildes.train'

with open(file_path, 'r') as file:
    text = file.read()

cleaned_text = cleanup_aochildes(text, SEQ_LENGTH)

encoded = tokenizer.encode(cleaned_text)

num_tokens = len(encoded.tokens)

print(num_tokens)

581299


In [36]:
import re

def cleanup_childes(text, seq_length):
    text = re.sub(r'\*\w+:\s*', '', text)
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'\]\[.*?\]', '', text)
    text = cleanup_extra_spaces(text)
    pad_seq = ''.join([END_TOKEN] + seq_length * [PADDING_TOKEN])
    text = START_TOKEN + text.strip() + pad_seq
    return text

file_path = 'data/babylm_10M_clean/childes.train'

with open(file_path, 'r') as file:
    text = file.read()

cleaned_text = cleanup_childes(text, SEQ_LENGTH)

encoded = tokenizer.encode(cleaned_text)

num_tokens = len(encoded.tokens)

print(num_tokens)

3896529


In [41]:
DATA_ROOT = Path("data/babylm_10M_clean")

total_tokens = 0

for train_file in DATA_ROOT.glob("*.train"):
    if train_file.name == "childes.train":
        continue
    
    print(f"Processing {train_file.name}...")
    text = train_file.read_text()
    dataset_name = train_file.stem
    if dataset_name in CLEANUP_FUNCTIONS:
        cleaned_text = CLEANUP_FUNCTIONS[dataset_name](text, SEQ_LENGTH)
    else:
        cleaned_text = text
    encoded = tokenizer.encode(text)
    token_count = len(encoded.tokens)
    total_tokens += token_count
    
    print(f"{train_file.name}: {token_count} tokens")

print(total_tokens)

Processing open_subtitles.train...
open_subtitles.train: 3387309 tokens
Processing bnc_spoken.train...
bnc_spoken.train: 1277255 tokens
Processing gutenberg.train...
gutenberg.train: 3465292 tokens
Processing aochildes.train...
aochildes.train: 581299 tokens
Processing simple_wiki.train...
simple_wiki.train: 2181787 tokens
Processing switchboard.train...
switchboard.train: 249441 tokens
11142383


In [70]:
import pathlib
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

DATA_ROOT = Path("./")
DATA_SPLITS = ['babylm_10M']

def count_words(text, remove_punctuation=True, remove_stopwords=False, lowercase=True):
    if lowercase:
        text = text.lower()
    tokens = word_tokenize(text)
    
    if remove_punctuation:
        tokens = [token for token in tokens if token not in string.punctuation]
    
    if remove_stopwords:
        stop_words = set(stopwords.words('english'))
        tokens = [token for token in tokens if token not in stop_words]
    
    return len(tokens)

total_word_count = 0
dataset_word_counts = Counter()

for split in DATA_SPLITS:
    INPUT_DIR = DATA_ROOT / 'data_v2' / split
    
    train_files = [f for f in INPUT_DIR.iterdir() if f.is_file() and f.suffix in ['.train', '.dev']]
    
    for file in train_files:
        text = file.read_text()
        word_count = count_words(text)
        total_word_count += word_count
        dataset_word_counts[file.stem] += word_count
        print(f"📊 Counted {word_count} words in '{file.name}' from {split}")

print("\nWord count per dataset:")
for dataset, count in dataset_word_counts.items():
    percentage = (count / total_word_count) * 100
    print(f"{dataset}: {count} words ({percentage:.2f}%)")

print(f"\nTotal word count across all datasets: {total_word_count}")

📊 Counted 2150257 words in 'open_subtitles.train' from babylm_10M
📊 Counted 971613 words in 'bnc_spoken.train' from babylm_10M
📊 Counted 2716885 words in 'gutenberg.train' from babylm_10M
📊 Counted 2994607 words in 'childes.train' from babylm_10M
📊 Counted 1418689 words in 'simple_wiki.train' from babylm_10M
📊 Counted 152603 words in 'switchboard.train' from babylm_10M

Word count per dataset:
open_subtitles: 2150257 words (20.67%)
bnc_spoken: 971613 words (9.34%)
gutenberg: 2716885 words (26.11%)
childes: 2994607 words (28.78%)
simple_wiki: 1418689 words (13.64%)
switchboard: 152603 words (1.47%)

Total word count across all datasets: 10404654


In [72]:
DATA_ROOT = Path("./")
SEQ_LENGTH = 128 # this is a legacy parameter, it does not affect cleaning
DATA_SPLITS = ['babylm_10M']

CLEANUP_FUNCTIONS = {
    # 'aochildes': cleanup_aochildes,
    'childes_10M': cleanup_childes,
    'childes_10M_last': cleanup_childes,
    'gutenberg_10M': cleanup_gutenberg,
    'simple_wiki_10M': cleanup_simple_wikipedia
}


In [73]:
for split in DATA_SPLITS:
    INPUT_DIR = DATA_ROOT / 'data_hm' / split
    OUTPUT_DIR = DATA_ROOT / 'data_hm' / f'{split}_clean'
    
    OUTPUT_DIR.mkdir(exist_ok=True)

    train_files = [f for f in INPUT_DIR.iterdir() if f.is_file() and f.suffix in ['.train', '.dev']]
    
    for file in train_files:
        if file.name == "aochildes.train":
            print("skip AOCHILDES")
            continue
        text = file.read_text()
        cleaned_text = CLEANUP_FUNCTIONS[file.stem](text, SEQ_LENGTH)
        (OUTPUT_DIR / file.name).write_text(cleaned_text)
        print(f"🧹 Cleaned '{file.name}' (size {len(text)} -> {len(cleaned_text)}) in {split}")


🧹 Cleaned 'simple_wiki_10M.train' (size 57849323 -> 57674157) in babylm_10M
🧹 Cleaned 'childes_10M_last.train' (size 53726989 -> 39457609) in babylm_10M
🧹 Cleaned 'childes_10M.train' (size 53944457 -> 37164144) in babylm_10M
🧹 Cleaned 'gutenberg_10M.train' (size 54808759 -> 54808759) in babylm_10M


In [7]:
import readability
import nltk

nltk.download('punkt')

def analyze_readability(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        text = file.read()

    sentences = nltk.sent_tokenize(text)
    tokenized_text = "\n".join([" ".join(nltk.word_tokenize(sentence)) for sentence in sentences])

    results = readability.getmeasures(tokenized_text, lang='en')

    readability_scores = {
        "Flesch Reading Ease": results['readability grades']['FleschReadingEase'],
        "Gunning Fog Index": results['readability grades']['GunningFogIndex'],
        "ARI": results['readability grades']['ARI']
    }

    return readability_scores

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nathanpaek/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
scores = analyze_readability('data_hm/babylm_10M_clean/childes_10M_clean/childes_10M.train')
print(scores)

{'Flesch Reading Ease': 115.7017835360794, 'Gunning Fog Index': 2.8426439083926325, 'ARI': -2.1995914383563218}


In [9]:
scores = analyze_readability('data_hm/babylm_10M_clean/gutenberg_10M_clean/gutenberg_10M.train')
print(scores)

{'Flesch Reading Ease': 87.49324180574916, 'Gunning Fog Index': 9.89000879078063, 'ARI': 7.122197415060434}


In [10]:
scores = analyze_readability('data_hm/babylm_10M_clean/simple_wiki_10M_clean/simple_wiki_10M.train')
print(scores)

{'Flesch Reading Ease': 68.13464284909669, 'Gunning Fog Index': 10.310200051239534, 'ARI': 9.457072550485279}


In [12]:
scores = analyze_readability('data_hm/babylm_10M_clean/tiny_stories_10M_clean/tinystories.train')
print(scores)

{'Flesch Reading Ease': 105.19229007159716, 'Gunning Fog Index': 4.826088797503238, 'ARI': 0.8503273834313667}


In [13]:
def analyze_readability_folder(folder_path):
    all_text = ""
    
    for filename in os.listdir(folder_path):
        if filename.endswith(".train"):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, "r", encoding="utf-8") as file:
                all_text += file.read() + "\n"  
    
    sentences = nltk.sent_tokenize(all_text)
    tokenized_text = "\n".join([" ".join(nltk.word_tokenize(sentence)) for sentence in sentences])

    results = readability.getmeasures(tokenized_text, lang='en')

    readability_scores = {
        "Flesch Reading Ease": results['readability grades']['FleschReadingEase'],
        "Gunning Fog Index": results['readability grades']['GunningFogIndex'],
        "ARI": results['readability grades']['ARI']
    }

    return readability_scores

folder_path = "data_v2/babylm_10M"
scores = analyze_readability_folder(folder_path)
print(scores)

{'Flesch Reading Ease': 105.89468505278845, 'Gunning Fog Index': 5.621648960021815, 'ARI': 1.5911814143108174}
