# cli

> Command-line entry point for cleaning a Hugging Face dataset against a set of benchmark datasets

In [None]:
#| default_exp cli

In [None]:
#| export
import json

from datasets import load_dataset
from fastcore.script import *
from hf_clean_benchmarks.core import BenchmarkCleaner

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| hide
import json
import tempfile

# Benchmark configurations used as a fixture for the CLI example below:
# one entry per benchmark, listing the splits and columns to consider.
humaneval_cfg = {
    "name": "openai_humaneval",
    "splits": ["test"],
    "columns": ["prompt", "canonical_solution", "test"],
}
mbpp_cfg = {
    "name": "mbpp",
    "splits": ["validation", "test"],
    "columns": ["text", "code", "test_list"],
}
benchmarks = [humaneval_cfg, mbpp_cfg]

# Serialize the configurations into a named temporary file so that they can
# be handed to the CLI as a path.
temp = tempfile.NamedTemporaryFile(prefix='pre_', suffix='_suf.json')
payload = json.dumps(benchmarks).encode()
temp.write(payload)
print(temp.name)

# Rewind and read the file back to check that it round-trips.
temp.seek(0)
bm = json.load(temp)
print(bm)

/tmp/pre_e14t99ij_suf.json
[{'name': 'openai_humaneval', 'splits': ['test'], 'columns': ['prompt', 'canonical_solution', 'test']}, {'name': 'mbpp', 'splits': ['validation', 'test'], 'columns': ['text', 'code', 'test_list']}]


In [None]:
#| export
@call_parse
def clean_dataset(
    dataset_name: str, # Name of the dataset to clean
    column_name: str, # Name of the column to clean
    benchmark_configs_path: str, # Path to the benchmark configuration file
    output_path: str, # Path to where the cleaned dataset will be saved
    dataset_config_name: str = None, # Name of the dataset configuration to use
    data_dir: str = None, # Path to the data files to use
    dataset_split: str = "train", # Name of the dataset split to clean
    save_json: bool = False, # Whether to save the cleaned dataset as a JSON file
    threshold: float = 0.1, # `threshold` value forwarded to `BenchmarkCleaner`
    num_perm: int = 128, # `num_perm` value forwarded to `BenchmarkCleaner`
    ):
    """
    Clean a dataset using a benchmark configuration file.

    The benchmark configurations are read from the JSON file at
    `benchmark_configs_path`, the requested split of `dataset_name` is loaded
    with `load_dataset`, `BenchmarkCleaner.clean` is applied to `column_name`,
    and the result is written to `output_path` (as JSON Lines when `save_json`
    is set, otherwise with `save_to_disk`).
    """
    # Read the configuration first so that a missing or malformed file fails
    # before the (potentially expensive) dataset load. The context manager
    # guarantees the file handle is closed.
    with open(benchmark_configs_path, encoding="utf-8") as config_file:
        benchmarks = json.load(config_file)

    ds = load_dataset(
        dataset_name,
        name=dataset_config_name,
        data_dir=data_dir,
        split=dataset_split,
    )
    bench_cleaner = BenchmarkCleaner(benchmarks, threshold=threshold, num_perm=num_perm)
    ds = bench_cleaner.clean(ds, column_name)
    if save_json:
        # One JSON record per line.
        ds.to_json(output_path, orient="records", lines=True)
    else:
        ds.save_to_disk(output_path)

In [None]:
#| exec_doc
# Example run: clean the `content` column of the `train` split of
# bigcode/the-stack-smol (Python subset) against the benchmark configurations
# stored in the temporary file, writing the result as JSON Lines.
example_output_path = "/tmp/test.jsonl"
clean_dataset(
    dataset_name="bigcode/the-stack-smol",
    data_dir="data/python",
    dataset_split="train",
    column_name="content",
    benchmark_configs_path=temp.name,
    output_path=example_output_path,
    save_json=True,
)

Using custom data configuration bigcode--the-stack-smol-7b51f8bde3058781
Found cached dataset json (/home/nathan/.cache/huggingface/datasets/bigcode___json/bigcode--the-stack-smol-7b51f8bde3058781/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)
Loading cached processed dataset at /home/nathan/.cache/huggingface/datasets/bigcode___json/bigcode--the-stack-smol-7b51f8bde3058781/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-4fd2cfc3b4de5200.arrow
Loading cached processed dataset at /home/nathan/.cache/huggingface/datasets/bigcode___json/bigcode--the-stack-smol-7b51f8bde3058781/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-fc2542430addd587.arrow
Loading cached processed dataset at /home/nathan/.cache/huggingface/datasets/bigcode___json/bigcode--the-stack-smol-7b51f8bde3058781/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-6392d69100e23b4d.arrow
Loading cached processed dataset

Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  1.84ba/s]


In [None]:
#| hide
# Close the temporary benchmark configuration file now that the example run
# above no longer needs it; with NamedTemporaryFile's default settings this
# also removes the file from disk.
temp.close()

In [None]:
#| hide
# Export the cells marked with `#| export` from this notebook.
import nbdev
nbdev.nbdev_export()