add AutoML regression testing mechanism (#1966)

* add AutoML regression testing mechanism
* move automl tests into subfolder
* get dataset name from ludwig dataset registry
* add shared helper for creating dataset object
* address review comments
ludwig-ai · May 9, 2022 · a01336d · a01336d
1 parent 1da6536
commit a01336d
Show file tree

Hide file tree

Showing 5 changed files with 164 additions and 0 deletions.
diff --git a/tests/regression_tests/automl/golden/adult_census_income.types.json b/tests/regression_tests/automl/golden/adult_census_income.types.json
@@ -0,0 +1,77 @@
+[
+    {
+        "column": "age",
+        "name": "age",
+        "type": "number"
+    },
+    {
+        "column": "workclass",
+        "name": "workclass",
+        "type": "category"
+    },
+    {
+        "column": "fnlwgt",
+        "name": "fnlwgt",
+        "type": "number"
+    },
+    {
+        "column": "education",
+        "name": "education",
+        "type": "category"
+    },
+    {
+        "column": "education-num",
+        "name": "education-num",
+        "type": "number"
+    },
+    {
+        "column": "marital-status",
+        "name": "marital-status",
+        "type": "category"
+    },
+    {
+        "column": "occupation",
+        "name": "occupation",
+        "type": "category"
+    },
+    {
+        "column": "relationship",
+        "name": "relationship",
+        "type": "category"
+    },
+    {
+        "column": "race",
+        "name": "race",
+        "type": "category"
+    },
+    {
+        "column": "sex",
+        "name": "sex",
+        "type": "category"
+    },
+    {
+        "column": "capital-gain",
+        "name": "capital-gain",
+        "type": "number"
+    },
+    {
+        "column": "capital-loss",
+        "name": "capital-loss",
+        "type": "number"
+    },
+    {
+        "column": "hours-per-week",
+        "name": "hours-per-week",
+        "type": "number"
+    },
+    {
+        "column": "native-country",
+        "name": "native-country",
+        "type": "category"
+    },
+    {
+        "column": "income",
+        "name": "income",
+        "type": "category"
+    }
+]
diff --git a/tests/regression_tests/automl/golden/mnist.types.json b/tests/regression_tests/automl/golden/mnist.types.json
@@ -0,0 +1,13 @@
+[
+    {
+        "column": "image_path",
+        "encoder": "stacked_cnn",
+        "name": "image_path",
+        "type": "image"
+    },
+    {
+        "column": "label",
+        "name": "label",
+        "type": "category"
+    }
+]
diff --git a/tests/regression_tests/automl/scripts/update_golden_types.py b/tests/regression_tests/automl/scripts/update_golden_types.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python
+"""This script updates all golden JSON files containing expected data types."""
+import json
+
+from ludwig.automl.automl import create_auto_config
+from tests.regression_tests.automl.utils import get_dataset_golden_types_path, get_dataset_object, TEST_DATASET_REGISTRY
+
+
+def write_json_files():
+    """Regenerates the golden types JSON file of every dataset in TEST_DATASET_REGISTRY.
+
+    For each dataset, the dataset is loaded unsplit, an AutoML config is created for it, and the
+    config's "input_features" section is written to the dataset's golden types path.
+    """
+    # TEST_DATASET_REGISTRY is an unordered set; sort it so files are regenerated in a stable order.
+    for dataset_name in sorted(TEST_DATASET_REGISTRY):
+        dataset_obj = get_dataset_object(dataset_name)
+        dataset = dataset_obj.load(split=False)
+
+        # NOTE: assuming type inference for input and output features is the same
+        config = create_auto_config(
+            dataset=dataset,
+            target=[],
+            time_limit_s=3600,
+            tune_for_memory=False,
+        )
+
+        golden_types_path = get_dataset_golden_types_path(dataset_name)
+        # Pin the encoding so the golden files do not depend on the platform locale.
+        with open(golden_types_path, "w", encoding="utf-8") as f:
+            # sort_keys keeps the key order stable, and the trailing newline ends the file cleanly.
+            json.dump(config["input_features"], f, indent=4, sort_keys=True)
+            f.write("\n")
+
+
+# Regenerate the golden files only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    write_json_files()
diff --git a/tests/regression_tests/automl/test_type_inference.py b/tests/regression_tests/automl/test_type_inference.py
@@ -0,0 +1,28 @@
+import json
+
+import pytest
+
+from ludwig.automl.automl import create_auto_config
+from tests.integration_tests.utils import slow
+from tests.regression_tests.automl.utils import get_dataset_golden_types_path, get_dataset_object, TEST_DATASET_REGISTRY
+
+
+@slow
+# TEST_DATASET_REGISTRY is an unordered set; sort it so test collection order is deterministic
+# across processes (required by distributed runners such as pytest-xdist).
+@pytest.mark.parametrize("dataset_name", sorted(TEST_DATASET_REGISTRY))
+def test_auto_type_inference_regression(dataset_name):
+    """Checks that the AutoML-inferred input features for a dataset match its golden types file."""
+    golden_types_path = get_dataset_golden_types_path(dataset_name)
+    with open(golden_types_path, encoding="utf-8") as f:
+        golden_types = json.load(f)
+
+    dataset_obj = get_dataset_object(dataset_name)
+    dataset = dataset_obj.load(split=False)
+
+    # NOTE: assuming type inference for input and output features is the same
+    config = create_auto_config(
+        dataset=dataset,
+        target=[],
+        time_limit_s=3600,
+        tune_for_memory=False,
+    )
+
+    assert golden_types == config["input_features"], (
+        f"Inferred feature types for '{dataset_name}' differ from {golden_types_path}; if the change is "
+        "intended, regenerate the golden files with tests/regression_tests/automl/scripts/update_golden_types.py"
+    )
diff --git a/tests/regression_tests/automl/utils.py b/tests/regression_tests/automl/utils.py
@@ -0,0 +1,17 @@
+from pathlib import Path
+
+from ludwig.datasets import dataset_registry
+from ludwig.datasets.base_dataset import BaseDataset
+
+# Names of the Ludwig Dataset Zoo datasets exercised by the AutoML type inference regression tests.
+# This is a set, so iteration order is not guaranteed; sort it wherever a stable order matters.
+TEST_DATASET_REGISTRY = {
+    "adult_census_income",
+    "mnist",
+}
+
+
+def get_dataset_golden_types_path(dataset_name: str) -> str:
+    """Builds the absolute path, as a string, of the `<dataset_name>.types.json` golden file.
+
+    Golden files are looked up in the `golden` directory located next to this module.
+    """
+    golden_dir = Path(__file__).resolve().parent / "golden"
+    return str(golden_dir / f"{dataset_name}.types.json")
+
+
+def get_dataset_object(dataset_name: str) -> BaseDataset:
+    """Looks up `dataset_name` in the Ludwig dataset registry and instantiates the entry with no arguments."""
+    dataset_factory = dataset_registry[dataset_name]
+    return dataset_factory()