##### Imports

In [1]:
from pathlib import Path
from typing import Callable

import numpy as np
import requests
import yaml
from pmlb import classification_dataset_names, fetch_data, regression_dataset_names
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Number of decimal places kept by the rounding helper below.
round_digits = 3


def r(x):
    """Round ``x`` to ``round_digits`` decimal places."""
    return round(x, round_digits)


# URL template for a dataset's PMLB metadata file; fill in the dataset name.
METADATA_URL = "https://raw.githubusercontent.com/EpistasisLab/pmlb/master/datasets/{}/metadata.yaml"

In [2]:
# Combined count of the PMLB classification and regression datasets
sum(map(len, (classification_dataset_names, regression_dataset_names)))

284

##### Function definitions

In [3]:
# Fractions of each dataset assigned to the train / validation / test splits
train_frac, val_frac, test_frac = 0.7, 0.2, 0.1

# Round before comparing so floating-point noise in the sum is ignored
sum_frac = round(train_frac + val_frac + test_frac, 10)
assert sum_frac == 1.0, sum_frac

# Note that this single random state is used for all splits, so its internal
# state changes over time
random_state = np.random.RandomState(0)

shared_params = dict(shuffle=True)

# First split: (train + val) versus test, as fractions of the full dataset
train_test_params = dict(
    shared_params,
    train_size=r(train_frac + val_frac),
    test_size=r(test_frac),
)

# Second split: train versus val, as fractions of what remains once the test
# portion has been removed
train_val_test_params = dict(
    shared_params,
    train_size=r(train_frac / (1 - test_frac)),
    test_size=r(val_frac / (1 - test_frac)),
)

print(train_test_params, train_val_test_params, sep="\n")

{'shuffle': True, 'train_size': 0.9, 'test_size': 0.1}
{'shuffle': True, 'train_size': 0.778, 'test_size': 0.222}


In [4]:
def get_state(random_state: np.random.RandomState) -> list:
    """Snapshot a ``RandomState`` as a list of plain Python values.

    The five entries returned by ``random_state.get_state()`` are converted,
    in order, to ``str``, ``list[int]``, ``int``, ``int`` and ``float`` so the
    result is yaml serializable.
    """
    kind, keys, pos, has_gauss, cached_gaussian = random_state.get_state()

    return [
        str(kind),
        [int(key) for key in keys],
        int(pos),
        int(has_gauss),
        float(cached_gaussian),
    ]


def preprocess(
    x: np.ndarray,
    y: np.ndarray,
    path: Path,
    stratify: bool,
) -> None:
    """Split a dataset into train/val/test parts and write them to ``path``.

    Two consecutive ``train_test_split`` calls are made: the first separates
    the test set from the rest using ``train_test_params``, the second splits
    the rest into train and validation using ``train_val_test_params``. Both
    draw from the module-level ``random_state``; its state is captured just
    before each split and stored in the metadata file.

    Args:
        x: Feature matrix; ``x.shape[0]`` and ``x.shape[1]`` are recorded as
            the total sample count and the feature count.
        y: Targets aligned with ``x``.
        path: Output directory, created if missing. Receives
            ``x_/y_{train,val,test}.csv`` and ``metadata.yaml``.
        stratify: When true, both splits are stratified on the targets.
    """
    # Snapshot the generator before it is consumed by the first split.
    tts_random_state = get_state(random_state)
    x_train_val, x_test, y_train_val, y_test = train_test_split(
        x,
        y,
        **train_test_params,
        stratify=y if stratify else None,
        random_state=random_state,
    )

    # The generator has advanced, so take a second snapshot for the next split.
    tvs_random_state = get_state(random_state)
    x_train, x_val, y_train, y_val = train_test_split(
        x_train_val,
        y_train_val,
        **train_val_test_params,
        stratify=y_train_val if stratify else None,
        random_state=random_state,
    )

    path.mkdir(parents=True, exist_ok=True)

    splits = {
        "train": (x_train, y_train),
        "val": (x_val, y_val),
        "test": (x_test, y_test),
    }
    for split, (x_split, y_split) in splits.items():
        np.savetxt(path / f"x_{split}.csv", x_split, delimiter=",")
        np.savetxt(path / f"y_{split}.csv", y_split, delimiter=",")

    with open(path / "metadata.yaml", "w") as f:
        yaml.dump(
            {
                "train_split": train_test_params,
                "val_split": train_val_test_params,
                "scaler": None,
                "stratify": stratify,
                # Sample counts per split. Counting distinct target values
                # (len(set(...))) would report e.g. 2 for every binary dataset.
                "n_train": len(y_train),
                "n_val": len(y_val),
                "n_test": len(y_test),
                "n_total": x.shape[0],
                "n_features": x.shape[1],
                "tts_random_state": tts_random_state,
                "tvs_random_state": tvs_random_state,
            },
            f,
        )


def cache_data(
    pmlb_path: Path,
    bl_class: bool,
    request_timeout: float = 30.0,
) -> None:
    """Fetch PMLB datasets, record their metadata, and write processed splits.

    For every dataset name in the selected PMLB list, the data is fetched
    (cached under ``pmlb_path / "raw" / <method>``), summary statistics and
    the dataset's PMLB ``metadata.yaml`` are collected, and the dataset is
    passed to ``preprocess``. Classification datasets are only preprocessed
    when they have exactly two classes; regression datasets always are.
    The collected metadata is written to ``pmlb_path / "<method>_metadata.yaml"``,
    where ``<method>`` is ``"class"`` or ``"reg"``.

    Args:
        pmlb_path: Root directory for the raw cache, processed splits and the
            metadata file.
        bl_class: ``True`` for the classification datasets, ``False`` for the
            regression datasets.
        request_timeout: Seconds to wait for each metadata HTTP request before
            giving up, so a stalled connection cannot block the loop forever.

    Raises:
        requests.HTTPError: If a metadata request returns an error status.
        requests.Timeout: If a metadata request exceeds ``request_timeout``.
    """
    metadata = {}
    method_name = "class" if bl_class else "reg"

    dataset_names = (
        classification_dataset_names if bl_class else regression_dataset_names
    )

    for dataset in tqdm(dataset_names):
        x, y = fetch_data(
            dataset,
            return_X_y=True,
            local_cache_dir=pmlb_path / "raw" / method_name,
        )

        entry = {
            "n_samples": x.shape[0],
            "n_features": x.shape[1],
        }

        # Only meaningful for classification; computed once and reused below.
        n_classes = None
        if bl_class:
            n_classes = len(set(y))
            entry["n_classes"] = n_classes
        else:
            entry["y_range"] = [float(min(y)), float(max(y))]

        response = requests.get(
            METADATA_URL.format(dataset),
            timeout=request_timeout,
        )
        # Fail loudly on 4xx/5xx instead of parsing an error page as YAML.
        response.raise_for_status()

        pmlb_metadata = yaml.safe_load(response.text)
        entry["pmlb_metadata"] = pmlb_metadata
        entry["feature_type"] = [f["type"] for f in pmlb_metadata["features"]]

        metadata[dataset] = entry

        if not bl_class or n_classes == 2:
            preprocess(
                x, y, pmlb_path / "processed" / method_name / dataset, stratify=bl_class
            )

    with open(pmlb_path / f"{method_name}_metadata.yaml", "w") as f:
        yaml.dump(metadata, f)


def load_and_callback(
    pmlb_path: Path,
    metadata_path: Path,
    skip_callback: Callable[[dict], bool],
) -> dict[str, np.ndarray | dict]:
    """Load the cached train/val/test CSVs for every dataset that is not skipped.

    Args:
        pmlb_path: Directory containing one sub-directory per dataset, each
            holding ``x_{split}.csv`` and ``y_{split}.csv`` for the splits
            ``train``, ``val`` and ``test``.
        metadata_path: YAML file mapping dataset name to its metadata dict.
        skip_callback: Called with a dataset's metadata dict; the dataset is
            skipped when it returns a truthy value.

    Returns:
        A dict keyed by dataset name. Each value is a dict with the keys
        ``x_train``, ``y_train``, ``x_val``, ``y_val``, ``x_test``, ``y_test``
        (arrays read with ``np.loadtxt``) and ``metadata`` (the dataset's
        metadata dict).
    """
    # Use a context manager so the metadata file handle is always closed.
    # NOTE(review): FullLoader is kept as in the original; only point this at
    # metadata files from a trusted source.
    with open(metadata_path, "r") as f:
        metadata = yaml.load(f, Loader=yaml.FullLoader)

    datasets = {}

    for dataset, dataset_metadata in tqdm(metadata.items()):
        if skip_callback(dataset_metadata):
            continue

        dataset_dir = pmlb_path / dataset
        entry = {}

        for split in ("train", "val", "test"):
            entry[f"x_{split}"] = np.loadtxt(
                dataset_dir / f"x_{split}.csv",
                delimiter=",",
            )
            entry[f"y_{split}"] = np.loadtxt(
                dataset_dir / f"y_{split}.csv",
                delimiter=",",
            )

        entry["metadata"] = dataset_metadata
        datasets[dataset] = entry

    return datasets


def load(pmlb_path: Path, bl_class: bool) -> dict[str, np.ndarray | dict]:
    """
    Load the cached, processed datasets of one kind.

    With ``bl_class=True`` the classification datasets are loaded and every
    dataset whose metadata reports ``n_classes != 2`` is skipped. With
    ``bl_class=False`` all regression datasets are loaded.
    """
    if bl_class:
        method_name = "class"

        def skip(dataset_metadata: dict) -> bool:
            # Keep binary classification problems only.
            return dataset_metadata["n_classes"] != 2

    else:
        method_name = "reg"

        def skip(dataset_metadata: dict) -> bool:
            # Regression datasets are never filtered out.
            return False

    return load_and_callback(
        pmlb_path / "processed" / method_name,
        pmlb_path / f"{method_name}_metadata.yaml",
        skip,
    )

##### Preprocess datasets

In [5]:
# Root folder for the raw PMLB cache, the processed splits and the metadata files
pmlb_path = Path("..", "datasets", "pmlb")

# Classification datasets first, then regression datasets
cache_data(pmlb_path, bl_class=True)
cache_data(pmlb_path, bl_class=False)

 38%|███▊      | 61/162 [01:34<02:36,  1.55s/it]


KeyboardInterrupt: 

In [None]:
# Read the processed splits back from disk
class_datasets = load(pmlb_path, bl_class=True)
reg_datasets = load(pmlb_path, bl_class=False)

# These numbers do not account for datasets that may be too large for our GPU
print(
    f"Number of binary classification datasets: {len(class_datasets)}",
    f"Number of regression datasets: {len(reg_datasets)}",
    sep="\n",
)

100%|██████████| 162/162 [00:00<00:00, 209.23it/s]
100%|██████████| 122/122 [00:05<00:00, 21.14it/s]

Number of binary classification datasets: 90
Number of regression datasets: 122



