In [1]:
import shutil
from pathlib import Path
from tempfile import TemporaryDirectory

import dotenv
import pandas as pd
from car_prices.dataset import (ExperimentConfig, load_car_dataset,
                                load_car_dataset_split,
                                split_train_test_and_save)

In [2]:
# Read the notebook's settings via python-dotenv.
env = dotenv.dotenv_values()

# Fail right here with an actionable message when DATA_DIR is missing or
# empty, instead of a bare KeyError (or a confusing failure further down when
# the value is finally turned into a path).
original_data_dir = env.get('DATA_DIR')
if not original_data_dir:
    raise KeyError(
        "DATA_DIR is missing or empty in the dotenv configuration; "
        "set it to the directory that contains the car dataset."
    )
print(f"Data dir: {original_data_dir}")

Data dir: /home/fjayres/dev/ml_labs/datasets


In [3]:
def do_test(data_dir, metadata):
    """Round-trip the car dataset through split -> save -> load in ``data_dir``.

    The original data (module-level ``original_data_dir``) is copied into
    ``data_dir``; the dataset is loaded from that copy, split and saved with
    ``split_train_test_and_save``, then read back with
    ``load_car_dataset_split``. The reloaded metadata and the shapes of the
    full/train/test datasets are printed.

    Args:
        data_dir: Scratch directory to work in (created if missing). The
            caller owns its lifetime, e.g. via ``TemporaryDirectory``.
        metadata: Split configuration forwarded unchanged to
            ``split_train_test_and_save``.

    Raises:
        ValueError: If ``data_dir`` resolves to the original data directory,
            which this test must never write into.
        AssertionError: If the reloaded train and test sets do not add up to
            the number of rows in the full dataset.
    """
    original_data_path = Path(original_data_dir)
    new_data_path = Path(data_dir)

    # The split files are written into data_dir, so never run against the
    # source data itself.
    if new_data_path.resolve() == original_data_path.resolve():
        raise ValueError(
            "data_dir must be a scratch directory, not the original data directory"
        )

    # Copy the data into the caller-provided scratch directory. Previously a
    # nested TemporaryDirectory shadowed the `data_dir` argument, so the
    # caller's directory was silently ignored.
    new_data_path.mkdir(exist_ok=True, parents=True)
    shutil.copytree(original_data_path, new_data_path, dirs_exist_ok=True)

    # Load the dataset from the scratch copy
    dataset = load_car_dataset(data_dir)

    # Exercise the save/load round trip
    split_train_test_and_save(
        dataset=dataset,
        metadata=metadata,
        data_dir=data_dir,
    )

    (
        loaded_train_dataset,
        loaded_test_dataset,
        loaded_metadata,
    ) = load_car_dataset_split(data_dir)

    print(loaded_metadata)
    print(
        dataset.shape,
        loaded_train_dataset.shape,
        loaded_test_dataset.shape,
    )

    # Actually verify something, so that a caller reporting success is
    # justified: the split must neither drop nor duplicate rows.
    n_train = loaded_train_dataset.shape[0]
    n_test = loaded_test_dataset.shape[0]
    assert n_train + n_test == dataset.shape[0], (
        f"train ({n_train}) + test ({n_test}) rows != dataset rows ({dataset.shape[0]})"
    )

In [4]:
# Split configuration used for this run: a fixed test fraction and seed.
metadata = ExperimentConfig(test_size=0.2, random_state=42)

# Hand do_test a throwaway directory; it is removed when the block exits.
with TemporaryDirectory() as data_dir:
    do_test(data_dir, metadata)
    print("All tests passed!")

ExperimentConfig(test_size=0.2, random_state=42)
(10000, 10) (8000, 10) (2000, 10)
All tests passed!
