This notebook runs the decontamination code from EleutherAI's `janitor.py` on a small section of Dolma to estimate how long decontaminating the full dataset would take.

In [1]:
import pyarrow.parquet as pq
from pathlib import Path
import sys
import datetime

# Locate the lm-evaluation-harness checkout, expected inside the directory
# three levels above the notebook's working directory. `__file__` is not
# defined inside a notebook, so the working directory is the anchor.
harness_path = Path.cwd().resolve().parents[2] / "lm-evaluation-harness"
harness_dir = str(harness_path)

# To run this with C++ on Linux:
# At lm-evaluation-harness/scripts/clean_training_data,
# run c++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) janitor_util.cpp -o janitor_util$(python3-config --extension-suffix)
# Rename the resulting .so file to janitor_util.so
# Tell Python the location of janitor_util.so when it looks for janitor_util
janitor_util_dir = str(harness_path / "scripts" / "clean_training_data")

# Append only the entries that are missing, so re-running this cell does not
# keep growing sys.path with duplicates.
for search_dir in (harness_dir, janitor_util_dir):
    if search_dir not in sys.path:
        sys.path.append(search_dir)

from lm_eval.decontamination.janitor import Janitor

In [2]:
# Read the whole of `arithmetic.txt` (resolved against the working directory)
# into a single string; it is registered as the contaminant further down.
arithmetic = Path('./arithmetic.txt').read_text()

In [3]:
# Load c4 part 1 (described by the author as 1M lines of c4).
# NOTE(review): the file has an .arrow extension but is read with the Parquet
# reader — confirm the on-disk format.
arrow_path = "/data/tir/projects/tir7/user_data/mchen5/dolma_100B/c4/part_1.arrow"
data = pq.read_table(arrow_path)
# Join documents with a newline rather than "" so the last word of one
# document and the first word of the next are not fused into a single token.
data_string = "\n".join(data.column("text").to_pandas())
print("Loaded c4/part_1.arrow")

# Time decontaminating c4 part 1 against the arithmetic contaminant. The timer
# covers Janitor construction, contaminant registration and the cleaning call.
pre_decontaminate = datetime.datetime.now()
janitor = Janitor(delete_chars="")
janitor.register_contaminant(arithmetic)
clean_pieces = janitor.clean_python(data_string)

print("Decontaminated c4 part 1 of arithmetic in " + str(datetime.datetime.now() - pre_decontaminate))

# Keep the pieces returned by clean_python under their own name so `result`
# is only ever the joined, cleaned text.
result = "".join(clean_pieces)

# Show a short preview only: echoing the entire cleaned text would bloat the
# notebook output.
PREVIEW_CHARS = 1000
result[:PREVIEW_CHARS]

Loaded c4/part_1.arrow


For estimating runtimes, the number of arrow files in each `dolma_100B` folder:

| Folder | # of arrow files |
|--------|-------|
| c4 | 4213499 |
| common-crawl | 510983 |
| gutenberg-books | 1178 |
| peS2o | 20803 |
| stack-code | 103818 |
| wiki-en-simple | 2785999 |
| dolma_100B (total) | 7636280 |
