In [None]:
#| hide
from hf_clean_benchmarks.core import *

# hf_clean_benchmarks

> This repository contains code for cleaning your training data of benchmark data to help combat data snooping.

This repository is heavily inspired by the [BigCode repository](https://github.com/bigcode-project/bigcode-analysis/tree/main/data_analysis/decontamination) and is mostly a refactoring of their code.

## Install

```sh
pip install hf_clean_benchmarks
```

## How to use

First you need to specify which benchmarks you want to clean your data of. You can do this by creating a list of dictionaries, one per benchmark. Each dictionary gives the benchmark's name in Hugging Face's datasets repository (`name`), the splits to use (`splits`), and the names of the columns containing the benchmark data (`columns`). For example, if you want to clean your data of the `HumanEval` and `LAMBADA` benchmarks, you would do the following:

In [None]:
# Benchmarks whose data should be removed from the training set.
# Each entry gives the benchmark's name in the Hugging Face datasets
# repository, the splits to use, and the columns holding benchmark data.
benchmarks = [
    dict(
        name="openai_humaneval",
        splits=["test"],
        columns=["prompt", "canonical_solution", "test"],
    ),
    dict(
        name="lambada",
        splits=["test"],
        columns=["text"],
    ),
]

You then pass this list to the `BenchmarkCleaner` class. This class will download the benchmarks and construct the suffix array for each benchmark. You can then use the `clean` method to clean a Hugging Face dataset. For example:

In [None]:
from datasets import load_dataset
from hf_clean_benchmarks.core import BenchmarkCleaner

# Build the cleaner for the benchmarks specified earlier
cleaner = BenchmarkCleaner(
    benchmarks,
    threshold=0.1,
    num_perm=128,
)

# The dataset to clean (swap in your own here)
dataset = load_dataset(
    "bigcode/the-stack-smol",
    data_dir="data/python",
    split="train",
)

# Clean the dataset, checking its "content" column
cleaned_dataset = cleaner.clean(dataset, column="content")

Using custom data configuration bigcode--the-stack-smol-7b51f8bde3058781
Found cached dataset json (/home/nathan/.cache/huggingface/datasets/bigcode___json/bigcode--the-stack-smol-7b51f8bde3058781/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)
Loading cached processed dataset at /home/nathan/.cache/huggingface/datasets/bigcode___json/bigcode--the-stack-smol-7b51f8bde3058781/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-4fd2cfc3b4de5200.arrow
Loading cached processed dataset at /home/nathan/.cache/huggingface/datasets/bigcode___json/bigcode--the-stack-smol-7b51f8bde3058781/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-fc2542430addd587.arrow
Loading cached processed dataset at /home/nathan/.cache/huggingface/datasets/bigcode___json/bigcode--the-stack-smol-7b51f8bde3058781/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-6392d69100e23b4d.arrow
Loading cached processed dataset

In [None]:
# Display the cleaned dataset's summary (its features and row count)
cleaned_dataset

Dataset({
    features: ['content', 'avg_line_length', 'max_line_length', 'alphanum_fraction', 'licenses', 'repository_name', 'path', 'size', 'lang', '__id__'],
    num_rows: 6052
})