# Data Preparation

In [None]:
# | default_exp utils/data

We focus our analysis on two of the datasets used in the CRITIC paper and provide functions to download them.

| **Dataset** | **Tool Used**                         | **Dataset Explanation**                                                                                          |
|-------------|--------------------------------------|-------------------------------------------------------------------------------------------------------------------|
| **GSM8k**   | Code interpreter for mathematical reasoning | **Purpose:** Mathematical word problem-solving for arithmetic reasoning. <br>**Size:** ~8,500 questions (7,473 train / 1,319 test). <br>**Download:** Available at [GSM8k GitHub](https://github.com/openai/grade-school-math) repository. |
| **AmbigNQ** | Wikipedia search and Google for fact-checking | **Purpose:** Open-domain QA for ambiguous questions with multiple possible answers. <br>**Size:** ~14,042 examples. <br>**Download:** The light version used here is fetched from the [AmbigQA project page](https://nlp.cs.washington.edu/ambigqa/); the underlying questions come from the [Google NQ dataset page](https://ai.google.com/research/NaturalQuestions). |

In [None]:
# | export

from pathlib import Path
import requests
import zipfile
import io

In [None]:
# | hide

# Root folder for the downloaded datasets: <parent of the current working directory>/data
path = Path.cwd().parent.joinpath("data")
path

PosixPath('/Users/oliverpfante/Documents/agentic/nbs/data')

In [None]:
# | export


def download_file(
    url: str,
    filepath: Path,
    timeout: float = 30.0,
):
    """
    Download a file from a URL unless it already exists locally.

    The response is streamed into a temporary ``<name>.part`` file next to
    ``filepath`` and renamed to ``filepath`` only after the whole body has been
    written. A failed or interrupted download therefore never leaves a
    truncated file at ``filepath`` that a later call would skip as "already
    downloaded".

    Parameters
    ----------
    url
        URL of the file to download.
    filepath
        Path where the downloaded file will be saved.
    timeout
        Seconds to wait for the server to connect / send data before giving up.

    Returns
    -------
    None
        This function performs file download but does not return any value.

    Raises
    ------
    requests.HTTPError
        If the server responds with an error status code.
    """
    if filepath.exists():
        print(f"File {filepath} already exists. Skipping download.")
        return

    print(f"Downloading file from {url}...")
    # Write to a sibling temporary file first; it is promoted to `filepath`
    # only once the download has completed successfully.
    tmp_path = filepath.with_name(filepath.name + ".part")
    try:
        with requests.get(url, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            with open(tmp_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        tmp_path.replace(filepath)
    finally:
        # After a successful rename the temporary file is gone; anything still
        # present here is a partial download and must not be kept.
        if tmp_path.exists():
            tmp_path.unlink()
    print(f"File downloaded and saved to {filepath}.")

In [None]:
# | export


def download_gsm8k(
    path: Path,
):
    """
    Fetch the GSM8k train and test splits into a target folder.

    Parameters
    ----------
    path
        Folder that will hold the GSM8k files. It is created (including any missing parents) when absent.

    Returns
    -------
    None
        The files are written to disk; nothing is returned.
    """
    # (remote URL, local file name) for every split, in download order
    splits = (
        (
            "https://raw.githubusercontent.com/openai/grade-school-math/refs/heads/master/grade_school_math/data/train.jsonl",
            "gsm8k_train.jsonl",
        ),
        (
            "https://raw.githubusercontent.com/openai/grade-school-math/refs/heads/master/grade_school_math/data/test.jsonl",
            "gsm8k_test.jsonl",
        ),
    )

    # Make sure the destination folder is there before anything is written
    path.mkdir(parents=True, exist_ok=True)

    for split_url, file_name in splits:
        download_file(split_url, path / file_name)

In [None]:
# Fetch the GSM8k files into the "gsm8k" subfolder of the data directory
gsm8k_path = path.joinpath("gsm8k")
download_gsm8k(gsm8k_path)

Downloading file from https://raw.githubusercontent.com/openai/grade-school-math/refs/heads/master/grade_school_math/data/train.jsonl...
File downloaded and saved to /Users/oliverpfante/Documents/agentic/nbs/data/gsm8k/gsm8k_train.jsonl.
Downloading file from https://raw.githubusercontent.com/openai/grade-school-math/refs/heads/master/grade_school_math/data/test.jsonl...
File downloaded and saved to /Users/oliverpfante/Documents/agentic/nbs/data/gsm8k/gsm8k_test.jsonl.


In [None]:
# | export


def download_ambignq(
    path: Path,
    timeout: float = 60.0,
):
    """
    Download the AmbigNQ dataset into a specified folder.

    The ZIP archive is downloaded fully into memory and then extracted into
    ``path``; the archive itself is not stored on disk.

    Parameters
    ----------
    path
        Directory path where the AmbigNQ dataset will be saved. If the directory does not exist, it will be created.
    timeout
        Seconds to wait for the server to connect / send data before giving up.

    Returns
    -------
    None
        This function downloads the dataset, extracts its contents, and saves them locally but does not return any value.

    Raises
    ------
    requests.HTTPError
        If the server responds with an error status code.
    zipfile.BadZipFile
        If the downloaded payload is not a valid ZIP archive.
    """
    # URL for AmbigNQ dataset
    url = "https://nlp.cs.washington.edu/ambigqa/data/ambignq_light.zip"

    # Create the target directory if it doesn't exist
    path.mkdir(parents=True, exist_ok=True)

    # Download the ZIP file. The whole body is needed before unpacking, so the
    # response is not streamed; the context manager releases the connection.
    print(f"Downloading ZIP file from {url}...")
    with requests.get(url, timeout=timeout) as response:
        response.raise_for_status()
        archive_bytes = response.content

    # Open the ZIP file in memory and extract its contents
    with zipfile.ZipFile(io.BytesIO(archive_bytes)) as zip_file:
        print("Unpacking ZIP file...")
        zip_file.extractall(path)
    print(f"Unpacked contents to {path}")

In [None]:
# Fetch and unpack AmbigNQ into the "ambignq" subfolder of the data directory
ambignq_path = path.joinpath("ambignq")
download_ambignq(ambignq_path)

Downloading ZIP file from https://nlp.cs.washington.edu/ambigqa/data/ambignq_light.zip...
Unpacking ZIP file...
Unpacked contents to /Users/oliverpfante/Documents/agentic/nbs/data/ambignq
