In [5]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import numpy as np
import yaml
from sklearn.datasets import load_diabetes

# Root directory that holds every dataset, relative to this notebook.
DATA_PATH = Path("..", "datasets")

# Directory of the riboflavin dataset.
RIBO_PATH = DATA_PATH.joinpath("riboflavin")

# Number of decimal places kept when split fractions are rounded.
round_digits = 3


def r(x):
    """Round ``x`` to ``round_digits`` decimal places."""
    return round(x, round_digits)


In [6]:
# Target share of the full dataset assigned to each partition.
train_frac, val_frac, test_frac = 0.7, 0.2, 0.1

# The three shares must account for the whole dataset; the rounding absorbs
# floating-point noise in the sum.
sum_frac = round(train_frac + val_frac + test_frac, 10)
assert sum_frac == 1.0, sum_frac

# One generator is shared by every split, so its internal state advances each
# time a split draws from it.
random_state = np.random.RandomState(0)

# Keyword arguments common to both splitting stages.
shared_params = dict(shuffle=True)

# Stage 1: separate the test set from the combined train+validation pool.
train_test_params = dict(
    shared_params,
    train_size=r(train_frac + val_frac),
    test_size=r(test_frac),
)

# Stage 2: divide the train+validation pool. The fractions are re-expressed
# relative to that pool instead of the full dataset.
train_val_test_params = dict(
    shared_params,
    train_size=r(train_frac / (1 - test_frac)),
    test_size=r(val_frac / (1 - test_frac)),
)

print(train_test_params, train_val_test_params, sep="\n")


{'shuffle': True, 'train_size': 0.9, 'test_size': 0.1}
{'shuffle': True, 'train_size': 0.778, 'test_size': 0.222}


In [7]:
def load_data(url, save_path, delimiter=",", timeout=60):
    """Load a delimited text file from ``url``, caching the download on disk.

    The file is stored as ``save_path / <last path component of url>``. If that
    file already exists it is read directly and no network request is made.

    Parameters
    ----------
    url : str
        Location of the remote file. Its last path component is used as the
        name of the cached file.
    save_path : str or Path
        Directory in which the downloaded file is cached. It is created on a
        cache miss if it does not exist yet.
    delimiter : str, default ","
        Field delimiter forwarded to ``pandas.read_csv``.
    timeout : float, default 60
        Seconds to wait for the server before the request is aborted.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the (cached) file.

    Raises
    ------
    Exception
        If the server answers with a status code other than 200.
    """
    cache_file = Path(save_path) / Path(url).name

    # Only "zip" and "bz2" suffixes are passed to pandas as a compression
    # scheme; every other suffix is read as uncompressed text.
    ext = cache_file.suffix[1:]
    compression = ext if ext in ["zip", "bz2"] else None

    if not cache_file.exists():
        # Imported here because only a cache miss needs the HTTP client, and
        # the notebook's import cell does not bring `requests` into scope.
        import requests

        # Browser-like headers. Note that the HTTP header field is spelled
        # "Referer" (single "r").
        headers = {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
            "Referer": url,
        }

        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code != 200:
            raise Exception(
                f"Failed to retrieve the data. Status code: {response.status_code}"
            )

        # Make sure the cache directory exists before writing into it.
        cache_file.parent.mkdir(parents=True, exist_ok=True)
        cache_file.write_bytes(response.content)

    return pd.read_csv(cache_file, compression=compression, delimiter=delimiter)


In [8]:
# TODO: These should be combined with process_pmlb.ipynb


def get_state(random_state: np.random.RandomState) -> list:
    """Return the generator's current state as plain Python values.

    The tuple reported by ``random_state.get_state()`` is converted field by
    field to built-in ``str``/``int``/``float`` values (and a list of ints)
    so that the result can be serialized to YAML.
    """
    algorithm, keys, position, has_gauss, cached_gaussian = random_state.get_state()

    return [
        str(algorithm),
        [int(key) for key in keys],
        int(position),
        int(has_gauss),
        float(cached_gaussian),
    ]


def preprocess(
    x: np.ndarray,
    y: np.ndarray,
    path: Path,
    stratify: bool,
) -> None:
    """Split a dataset into train/val/test parts and write them under ``path``.

    The data is split in two stages: first the test set is separated using
    the module-level ``train_test_params``, then the remainder is divided into
    train and validation sets using ``train_val_test_params``. Both stages
    draw from the module-level ``random_state``, so the outcome depends on how
    much that generator has already been used.

    Parameters
    ----------
    x : np.ndarray
        Feature matrix; ``x.shape[0]`` and ``x.shape[1]`` are recorded as the
        number of samples and features.
    y : np.ndarray
        Targets, aligned with the rows of ``x``.
    path : Path
        Output directory; created (with parents) if it does not exist.
    stratify : bool
        If true, each split is stratified on the targets.

    Writes
    ------
    ``{x,y}_{train,val,test}.csv`` and ``metadata.yaml`` inside ``path``. The
    metadata holds the split parameters, the sample counts of every part and
    the generator state captured immediately before each split.
    """
    # Capture the generator state *before* each split so that the split can be
    # reproduced later from the metadata alone.
    tts_random_state = get_state(random_state)
    x_train, x_test, y_train, y_test = train_test_split(
        x,
        y,
        **train_test_params,
        stratify=y if stratify else None,
        random_state=random_state,
    )

    tvs_random_state = get_state(random_state)
    x_train, x_val, y_train, y_val = train_test_split(
        x_train,
        y_train,
        **train_val_test_params,
        stratify=y_train if stratify else None,
        random_state=random_state,
    )

    path.mkdir(parents=True, exist_ok=True)

    arrays = {
        "x_train": x_train,
        "y_train": y_train,
        "x_val": x_val,
        "y_val": y_val,
        "x_test": x_test,
        "y_test": y_test,
    }
    for name, values in arrays.items():
        np.savetxt(path / f"{name}.csv", values, delimiter=",")

    with open(path / "metadata.yaml", "w") as f:
        yaml.dump(
            {
                "train_split": train_test_params,
                "val_split": train_val_test_params,
                "scaler": None,
                "stratify": stratify,
                # Sample counts per part. Counting distinct target values
                # (len(set(...))) would under-report whenever targets repeat.
                "n_train": len(y_train),
                "n_val": len(y_val),
                "n_test": len(y_test),
                "n_total": x.shape[0],
                "n_features": x.shape[1],
                "tts_random_state": tts_random_state,
                "tvs_random_state": tvs_random_state,
            },
            f,
        )


In [9]:
# Remote location of the riboflavin CSV (a "suppl_file" path on
# annualreviews.org).
# Open question from the original author: it is unclear why this file cannot
# be loaded using urllib.
ribo_url = "https://www.annualreviews.org/doi/suppl/10.1146/annurev-statistics-022513-115545/suppl_file/riboflavin.csv"

# The download below is disabled; the next cell reads a local copy from
# RIBO_PATH / "raw" / "riboflavin.csv" instead.
# NOTE(review): load_data would cache to RIBO_PATH / "riboflavin.csv" (no
# "raw" sub-directory) -- confirm the intended location before re-enabling.
# df = load_data(
#     ribo_url,
#     RIBO_PATH,
# )


In [10]:
# Read the locally stored raw table (first column as index, first row as
# header) and transpose it so that the former columns become rows.
riboflavin = pd.read_csv(
    RIBO_PATH / "raw" / "riboflavin.csv", delimiter=",", index_col=0, header=0
).transpose()

# Detach the "q_RIBFLV" column as the target; everything left is a feature.
y = riboflavin.pop("q_RIBFLV").to_numpy()
x = riboflavin.to_numpy()

preprocess(x=x, y=y, path=RIBO_PATH / "processed", stratify=False)


In [11]:
# Diabetes dataset bundled with scikit-learn, fetched as a (features, target)
# pair and written out without stratification.
x, y = load_diabetes(return_X_y=True)
preprocess(x=x, y=y, path=DATA_PATH / "diabetes" / "processed", stratify=False)


In [12]:
# NOTE: The SKLearn team advises against using this dataset. Many papers still
# use it, so it is processed here in case reviewers specifically request it.
# Otherwise it will not be included in the paper.

# SOURCE: SKLearn
data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None, engine="python")

# Rows are consumed in pairs: every even-indexed row supplies a full row of
# features, and the odd-indexed row after it supplies two more features
# followed by the target in its third field.
even_rows = raw_df.values[::2, :]
odd_rows = raw_df.values[1::2, :]
x = np.hstack([even_rows, odd_rows[:, :2]])
y = odd_rows[:, 2]

preprocess(x=x, y=y, path=DATA_PATH / "boston" / "processed", stratify=False)
