# Data Preparation

In [None]:
# | default_exp utils/data

We focus our analysis on three datasets used in prior papers and provide a helper function to download each of them.

| **Dataset** | **Tool Used** | **Dataset Explanation** |
|-------------|---------------|-------------------------|
| **GSM8k**   | Code interpreter for mathematical reasoning | **Purpose:** Mathematical word problem-solving for arithmetic reasoning.<br>**Size:** ~8,500 questions (7,473 training / 1,319 test).<br>**Download:** Available at the [GSM8k GitHub repository](https://github.com/openai/grade-school-math). |
| **AmbigNQ** | Wikipedia search and Google for fact-checking | **Purpose:** Open-domain QA for ambiguous questions with multiple possible answers.<br>**Size:** 14,042 examples.<br>**Download:** Available from the [AmbigQA project page](https://nlp.cs.washington.edu/ambigqa/); the questions are drawn from Google's [Natural Questions](https://ai.google.com/research/NaturalQuestions) dataset. |
| **HumanEval** | Code interpreter for code generation evaluation | **Purpose:** Evaluating functional correctness in code generation from docstrings.<br>**Size:** 164 programming problems.<br>**Download:** Available via the [Hugging Face Datasets library](https://huggingface.co/datasets/openai/openai_humaneval). |

In [None]:
# | export

from pathlib import Path
import requests
import zipfile
import io

In [None]:
# | hide

# Data directory: the "data" folder inside the parent of the current working directory.
path = Path.cwd().parent.joinpath("data")
path

PosixPath('/Users/oliverpfante/Documents/agentic/nbs/data')

In [None]:
# | export


def download_file(
    url: str,
    filepath: Path,
    timeout: float = 30.0,
):
    """
    Download a file from a URL unless it already exists locally.

    The response body is streamed into a temporary ``<name>.part`` file next to
    `filepath` and only renamed to `filepath` once the transfer has completed.
    A failed or interrupted download therefore never leaves a truncated file
    behind that a later call would mistake for a finished download.

    Parameters
    ----------
    url
        URL of the file to download.
    filepath
        Path where the downloaded file will be saved.
    timeout
        Seconds to wait for the server to connect or to send data before the
        request is aborted.

    Returns
    -------
    None
        This function performs file download but does not return any value.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    """
    if filepath.exists():
        print(f"File {filepath} already exists. Skipping download.")
        return

    print(f"Downloading file from {url}...")
    # Write to a sibling temp file first: the existence check above treats any
    # file at `filepath` as complete, so partial data must never land there.
    tmp_path = filepath.with_name(filepath.name + ".part")
    try:
        with requests.get(url, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            with open(tmp_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        # Same-directory rename, performed only after the body was fully written.
        tmp_path.replace(filepath)
    except BaseException:
        # Also covers KeyboardInterrupt: remove the partial file, then re-raise.
        tmp_path.unlink(missing_ok=True)
        raise
    print(f"File downloaded and saved to {filepath}.")

In [None]:
# | export


def download_gsm8k(
    path: Path,
):
    """
    Fetch the GSM8k train and test splits into ``<path>/gsm8k``.

    Parameters
    ----------
    path
        Parent directory for the dataset. The ``gsm8k`` subdirectory (and any
        missing parents) is created when absent.

    Returns
    -------
    None
        The files are written to disk via `download_file`; nothing is returned.
    """
    # The target folder must exist before any file is written into it.
    target_dir = path / "gsm8k"
    target_dir.mkdir(parents=True, exist_ok=True)

    # (local file name, remote URL) pairs, fetched in this order.
    sources = (
        (
            "gsm8k_train.jsonl",
            "https://raw.githubusercontent.com/openai/grade-school-math/refs/heads/master/grade_school_math/data/train.jsonl",
        ),
        (
            "gsm8k_test.jsonl",
            "https://raw.githubusercontent.com/openai/grade-school-math/refs/heads/master/grade_school_math/data/test.jsonl",
        ),
    )
    for file_name, source_url in sources:
        download_file(source_url, target_dir / file_name)

In [None]:
# Fetch the GSM8k train/test files into the `path` data directory.
download_gsm8k(path)

File /Users/oliverpfante/Documents/agentic/nbs/data/gsm8k/gsm8k_train.jsonl already exists. Skipping download.
File /Users/oliverpfante/Documents/agentic/nbs/data/gsm8k/gsm8k_test.jsonl already exists. Skipping download.


In [None]:
# | export


def download_ambignq(
    path: Path,
    timeout: float = 60.0,
):
    """
    Download the AmbigNQ dataset into a specified folder.

    The ZIP archive is fetched completely into memory and then unpacked into
    ``<path>/ambignq``. The archive is downloaded on every call.

    Parameters
    ----------
    path
        Directory path where the AmbigNQ dataset will be saved. If the directory does not exist, it will be created.
    timeout
        Seconds to wait for the server to connect or to send data before the
        request is aborted.

    Returns
    -------
    None
        This function downloads the dataset, extracts its contents, and saves them locally but does not return any value.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    """
    # URL for AmbigNQ dataset
    url = "https://nlp.cs.washington.edu/ambigqa/data/ambignq_light.zip"

    # Create the target directory if it doesn't exist
    ambignq_path = path / "ambignq"
    ambignq_path.mkdir(parents=True, exist_ok=True)

    # Download the ZIP file. The whole body is needed in memory for ZipFile,
    # so no streaming; the `with` block closes the connection even when
    # raise_for_status() fails.
    print(f"Downloading ZIP file from {url}...")
    with requests.get(url, timeout=timeout) as response:
        response.raise_for_status()
        archive_bytes = response.content

    # Open the ZIP file in memory and extract its contents
    with zipfile.ZipFile(io.BytesIO(archive_bytes)) as zip_file:
        print("Unpacking ZIP file...")
        zip_file.extractall(ambignq_path)
    print(f"Unpacked contents to {ambignq_path}")

In [None]:
# Fetch and unpack the AmbigNQ archive into the `path` data directory.
download_ambignq(path)

Downloading ZIP file from https://nlp.cs.washington.edu/ambigqa/data/ambignq_light.zip...
Unpacking ZIP file...
Unpacked contents to /Users/oliverpfante/Documents/agentic/nbs/data/ambignq


In [None]:
# | export

# load_dataset from the datasets library: Facilitates loading datasets from the Hugging Face Hub. (pip install datasets)
from datasets import load_dataset


def download_humaneval(path: Path):
    """
    Fetch the HumanEval dataset and write every split to ``<path>/humaneval``.

    Parameters
    ----------
    path
        Parent directory for the dataset. The ``humaneval`` subdirectory (and
        any missing parents) is created when absent.

    Returns
    -------
    None
        Each split is written to ``<split>.jsonl`` through the split's
        ``to_json`` method; nothing is returned.
    """
    # Prepare the output folder before loading anything.
    target_dir = path / "humaneval"
    target_dir.mkdir(parents=True, exist_ok=True)

    # Mapping of split name -> split data, as returned by `load_dataset`.
    splits = load_dataset("openai_humaneval")

    # One output file per split, named after the split.
    for split_name, split_data in splits.items():
        out_file = target_dir / f"{split_name}.jsonl"
        split_data.to_json(out_file)
        print(f"Saved {split_name} split to {out_file}")

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Fetch HumanEval and write its splits into the `path` data directory.
download_humaneval(path)

Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 105.94ba/s]

Saved test split to /Users/oliverpfante/Documents/agentic/nbs/data/humaneval/test.jsonl





In [None]:
# | hide
# Sanity check: reload the saved HumanEval file and show the reference
# solution of the first task. Loading a bare JSON file via `data_files`
# is indexed under the 'train' split here.
dataset = load_dataset('json', data_files=str(path/'humaneval'/'test.jsonl'))
dataset['train'][0]['canonical_solution']

{'task_id': 'HumanEval/0',
 'prompt': 'from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n    """ Check if in given list of numbers, are any two numbers closer to each other than\n    given threshold.\n    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n    False\n    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n    True\n    """\n',
 'canonical_solution': '    for idx, elem in enumerate(numbers):\n        for idx2, elem2 in enumerate(numbers):\n            if idx != idx2:\n                distance = abs(elem - elem2)\n                if distance < threshold:\n                    return True\n\n    return False\n',
 'test': "\n\nMETADATA = {\n    'author': 'jt',\n    'dataset': 'test'\n}\n\n\ndef check(candidate):\n    assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n    assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n    assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n    assert 