This file tests the deduplication code from EleutherAI's `janitor.py` file on small section(s) of Dolma to estimate how long full deduplication would take.

To run ``janitor.py`` with C++ on Linux:
1. At ``lm-evaluation-harness/scripts/clean_training_data``, run ``c++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) janitor_util.cpp -o janitor_util$(python3-config --extension-suffix)``
2. Rename the resulting ``.so`` file to ``janitor_util.so``
3. Tell Python where to find ``janitor_util.so`` when it imports ``janitor_util``: ``sys.path.append(harness_dir + "/scripts/clean_training_data")``

In [1]:
import pyarrow.parquet as pq
from pathlib import Path
import pandas as pd
import sys
import datetime
import os
import pyarrow
from tqdm import tqdm
import copy

# NOTE: "__file__" is a string literal here, not the module attribute, so it
# is resolved against the current working directory; parents[3] is therefore
# the directory three levels above the working directory.
harness_dir = str(Path("__file__").resolve().parents[3] / "lm-evaluation-harness")
# Must be on sys.path before the `lm_eval` import below.
sys.path.append(harness_dir)

# Directory that holds the compiled `janitor_util` extension (see the build
# steps at the top of this notebook).
sys.path.append(harness_dir + "/scripts/clean_training_data")
from lm_eval.decontamination.janitor import Janitor

# Set before importing numexpr — presumably so the thread limits are picked up
# at import time; TODO confirm against the numexpr documentation.
os.environ['NUMEXPR_MAX_THREADS'] = '256'
os.environ['NUMEXPR_NUM_THREADS'] = '128'
import numexpr as ne

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [2]:
# with open("./tasks_txt_files/arithmetic.txt", "r") as file:
#     arithmetic: str = file.read() # 339K
# with open("./contaminant.txt", "r") as file:
#     contaminant: str = file.read() # 3.4G
# contaminant = contaminant.encode("utf-8", "ignore").decode("utf-8", "ignore")

In [21]:
# contaminant_mini = contaminant[:20000].encode('utf-8', 'ignore').decode('utf-8', 'ignore')
# with open("./contaminant_mini.txt", "w") as file:
#     file.write(contaminant_mini)

In [22]:
# Load the small contaminant sample. The encoding is given explicitly so the
# read does not depend on the platform/locale default.
with open("./contaminant_mini.txt", "r", encoding="utf-8") as file:
    contaminant_mini: str = file.read()

In [4]:
# Load the sample shard and report its size as measured by sys.getsizeof.
data_mini: pyarrow.lib.Table = pq.read_table("data_mini.arrow")
data_mini_size = sys.getsizeof(data_mini)
print(f"Size of data_mini: {data_mini_size} bytes")

Size of data_mini: 112133501 bytes


In [5]:
# Convert the Arrow table to pandas, then round-trip the "text" column through
# UTF-8 with errors ignored so characters that cannot be encoded are dropped.
df: pd.DataFrame = data_mini.to_pandas()
text_bytes = df["text"].str.encode("utf-8", errors="ignore")
df["text"] = text_bytes.str.decode("utf-8", errors="ignore")
df.head(5)

Unnamed: 0,id,text
0,09c6eceb562caeba5b94489087fb1e8d,"TAMPA, Fla., Nov. 03, 2016 (GLOBE NEWSWIRE) --..."
1,7378e5a823604985555d1d9267827368,"It was brimming with midges. Everywhere, these..."
2,43088e9ab3bdb2236fc493594b99f72f,We encourage all our employees to be ambitious...
3,14b802b07c5b0685470f5c87fc60e394,The first road assignment is coming this weeke...
4,954f973826676c5a9421c0286f964bd3,Course to upgrade skills for experienced Hr pr...


In [10]:
def decontaminate(contaminant: str, df: pd.DataFrame) -> "tuple[Janitor, pd.DataFrame]":
    """Remove registered contaminant n-grams from every row of ``df["text"]``.

    Args:
        contaminant: Text registered with the janitor as contamination.
        df: Frame with a ``"text"`` column of strings. It is not modified.

    Returns:
        A ``(janitor, result)`` pair. ``result`` is a copy of ``df`` with an
        added ``"num_contaminated"`` column; rows whose count is non-zero have
        their ``"text"`` replaced by the joined cleaned chunks.
    """
    janitor = Janitor()
    result = df.copy()

    print("Registering contaminant")
    pre_register = datetime.datetime.now()
    janitor.register_contaminant(contaminant)
    print(f"Registered in {str(datetime.datetime.now() - pre_register)}")

    print("Decontaminating")
    contaminated_counts = []
    cleaned_texts = []
    for text in tqdm(df["text"], total=len(df)):
        # Round-trip through UTF-8 so characters that cannot be encoded are
        # dropped before the string is handed to the C++ extension.
        # TODO: Why is there a UnicodeDecodeError????
        sanitized = text.encode("utf-8", "ignore").decode("utf-8", "ignore")
        # NOTE(review): this expects clean_cpp to return a
        # (chunks, num_contaminated) pair — confirm against the Janitor
        # version in use.
        cleaned, num_contaminated = janitor.clean_cpp(sanitized)
        contaminated_counts.append(num_contaminated)
        cleaned_texts.append("".join(cleaned) if num_contaminated != 0 else text)

    # Assign whole columns at once. The previous per-row
    # `result.iloc[index][column] = value` was chained indexing, which writes
    # to a temporary object, and it used iterrows() labels as positions.
    result["num_contaminated"] = contaminated_counts
    result["text"] = cleaned_texts

    return (janitor, result)

    

In [10]:
"""
def test_decontaminate(contaminant: str, output_filename: str):
    print(f"Contaminant size {len(contaminant)}")
    janitor = Janitor(delete_chars="")

    registration_time = datetime.timedelta(hours=0)
    pre_register = datetime.datetime.now()
    print("Registering contaminant")
    janitor.register_contaminant(contaminant)
    registration_time += datetime.datetime.now() - pre_register
    print(f"Registered in {str(registration_time)}")

    decontamination_time = datetime.timedelta(hours=0)
    pre_decontaminate = datetime.datetime.now()
    print("Decontaminating")
    # NOTE: Running clean_cpp throws unicodedecode error; maybe sort this out later
    result = janitor.clean_python(data_string)
    
    decontamination_time += datetime.datetime.now() - pre_decontaminate
    print(f"Decontaminated in {str(decontamination_time)}")

    print(f"Total time: {str(registration_time + decontamination_time)}")
    return janitor
"""

In [11]:
# Run decontamination of the sample frame against the small contaminant text.
janitor, df_dedup = decontaminate(contaminant_mini, df)

Registering contaminant
Registered in 0:00:00.001266
Decontaminating


  0%|          | 0/50000 [00:00<?, ?it/s]


UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe2 in position 86: unexpected end of data

In [None]:
# Save df_dedup to a new arrow file
def pandas_to_arrow(df: pd.DataFrame, output_file: str) -> None:
    """Write ``df`` to ``output_file`` in the Arrow IPC file format.

    Args:
        df: Frame to serialise; its index is not written.
        output_file: Destination path for the Arrow file.
    """
    table = pyarrow.Table.from_pandas(df, preserve_index=False)

    # Use the table's own schema so writer and data cannot disagree, and a
    # context manager so the writer is closed even if the write fails.
    with pyarrow.ipc.new_file(output_file, table.schema) as writer:
        writer.write(table)

In [None]:
# Write the decontaminated frame to disk and report how long the write took.
# pandas_to_arrow requires a destination path; the original call omitted it.
# NOTE(review): the output filename below is a placeholder — adjust as needed.
output_path = "data_mini_dedup.arrow"
pre_write = datetime.datetime.now()
pandas_to_arrow(df_dedup, output_path)
print(f"Finished writing in {datetime.datetime.now() - pre_write}")

Testing results:
- Deduplicating full part 1 of c4 against arithmetic (C++):
    - 512G RAM (43 G used), 1 GPU, 4 CPUs (169% efficiency) - 15 mins 40 sec
- Deduplicating 1/10 of part 1 of c4 against arithmetic (Python):
    - 512G RAM (43 G used), 1 GPU, 4 CPUs (169% efficiency) - 1 min 35 sec
- Deduplicating full part 2 of c4 against arithmetic (C++):
    - 512G RAM (26 G used), 1 GPU, 16 CPUs (14% efficiency) - 16 min 16 sec
- Deduplicating parts 5 - 12 of c4 against arithmetic (C++):
    - 512G RAM (43 G used), 1 GPU, 16 CPUs (66% efficiency) - 140 min
        - Average 17.5 min per part

To do:
- Multithread(?) by splitting data into chunks and deduplicating each chunk in parallel