In [1]:
from pathlib import Path
from tempfile import TemporaryDirectory

import dotenv
import pandas as pd
from car_prices.dataset import (ExperimentConfig, load_car_dataset,
                                load_datasets, save_datasets, split_train_test)

In [2]:
# Read key/value settings from a `.env` file. `dotenv_values` returns them as a
# dict and leaves `os.environ` untouched.
env = dotenv.dotenv_values()
data_dir = env.get('DATA_DIR')
if data_dir is None:
    # Covers both a missing key and a bare `DATA_DIR` line with no value, which
    # dotenv reports as None. Fail here with an actionable message instead of
    # passing None on to the dataset loader.
    raise KeyError(
        "DATA_DIR is not set; add a line such as "
        "DATA_DIR=/path/to/datasets to your .env file"
    )
print(f"Data dir: {data_dir}")

Data dir: /home/fjayres/dev/ml_labs/datasets


In [3]:
# Load the car dataset from the configured data directory, then split it into
# train and test parts with a fixed seed so the split is repeatable.
dataset = load_car_dataset(data_dir)
train_dataset, test_dataset = split_train_test(dataset, test_size=0.2, random_state=42)

# Show the shapes of the full, train and test sets side by side.
print(*(frame.shape for frame in (dataset, train_dataset, test_dataset)))

(10000, 10) (8000, 10) (2000, 10)


In [4]:
def do_test(
    basepath: Path,
    train_dataset: pd.DataFrame,
    test_dataset: pd.DataFrame,
) -> None:
    """Round-trip the train/test splits through disk and compare shapes.

    Saves both datasets under ``basepath`` with ``save_datasets``, reads them
    back with ``load_datasets``, and checks that each reloaded dataset has the
    same shape as the one that was saved.

    Args:
        basepath: Directory passed to ``save_datasets`` / ``load_datasets``.
        train_dataset: Training split to save and compare against.
        test_dataset: Test split to save and compare against.

    Raises:
        AssertionError: If a reloaded dataset's shape differs from the
            original's. The message names the split and both shapes.
    """
    save_datasets(train_dataset, test_dataset, basepath)
    reloaded_train, reloaded_test = load_datasets(basepath)

    # Check train first, then test, reporting which split diverged and how.
    for label, original, reloaded in (
        ("train", train_dataset, reloaded_train),
        ("test", test_dataset, reloaded_test),
    ):
        assert reloaded.shape == original.shape, (
            f"{label} dataset shape changed after save/load: "
            f"expected {original.shape}, got {reloaded.shape}"
        )


In [5]:
# Experiment settings, using the same test_size / random_state values as the
# split earlier in the notebook. Not consumed by the round-trip check below.
metadata = ExperimentConfig(test_size=0.2, random_state=42)

# Run the save/load round trip inside a temporary directory, which is removed
# when the `with` block exits.
with TemporaryDirectory() as tmpdir:
    basepath = Path(tmpdir, "car_prices")
    basepath.mkdir(parents=True, exist_ok=True)
    do_test(basepath, train_dataset, test_dataset)
    print("All tests passed!")

All tests passed!
