Skip to content

Commit

Permalink
Automatically infer vector_size for vector features when not provided
Browse files Browse the repository at this point in the history
  • Loading branch information
tgaddair committed Dec 28, 2022
1 parent 0d5d338 commit b320bec
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 6 deletions.
10 changes: 7 additions & 3 deletions ludwig/features/vector_feature.py
Expand Up @@ -118,14 +118,18 @@ def add_feature_data(

# Determine vector size
vector_size = backend.df_engine.compute(proc_df[feature_config[PROC_COLUMN]].map(len).max())
if "vector_size" in preprocessing_parameters:
if vector_size != preprocessing_parameters["vector_size"]:
vector_size_param = preprocessing_parameters.get("vector_size")
if vector_size_param is not None:
# TODO(travis): do we even need a user param for vector size if we're going to auto-infer it in all
# cases? Is this only useful as a sanity check for the user to make sure their data conforms to
# expectations?
if vector_size != vector_size_param:
raise ValueError(
"The user provided value for vector size ({}) does not "
"match the value observed in the data: {}".format(preprocessing_parameters, vector_size)
)
else:
logger.debug(f"Observed vector size: {vector_size}")
logger.debug(f"Detected vector size: {vector_size}")

metadata[feature_config[NAME]]["vector_size"] = vector_size
return proc_df
Expand Down
10 changes: 7 additions & 3 deletions tests/integration_tests/test_experiment.py
Expand Up @@ -25,7 +25,7 @@

from ludwig.api import LudwigModel
from ludwig.backend import LOCAL_BACKEND
from ludwig.constants import ENCODER, H3, TRAINER, TYPE
from ludwig.constants import ENCODER, H3, PREPROCESSING, TRAINER, TYPE
from ludwig.data.concatenate_datasets import concatenate_df
from ludwig.data.preprocessing import preprocess_for_training
from ludwig.encoders.registry import get_encoder_classes
Expand Down Expand Up @@ -821,7 +821,7 @@ def test_experiment_h3(encoder, csv_filename):
run_experiment(input_features, output_features, dataset=rel_path)


def test_experiment_vector_feature_1(csv_filename):
def test_experiment_vector_feature(csv_filename):
input_features = [vector_feature()]
output_features = [binary_feature()]
# Generate test data
Expand All @@ -830,10 +830,14 @@ def test_experiment_vector_feature_1(csv_filename):
run_experiment(input_features, output_features, dataset=rel_path)


def test_experiment_vector_feature_2(csv_filename):
def test_experiment_vector_feature_infer_size(csv_filename):
    """Run an experiment with vector in/out features whose vector_size must be inferred.

    The preprocessing section (which carries vector_size) is stripped from both
    features after data generation, forcing size inference from the data itself.
    """
    input_features = [vector_feature()]
    output_features = [vector_feature()]

    # Generate the dataset first, while the configured vector size is still present.
    rel_path = generate_data(input_features, output_features, csv_filename)

    # Unset vector_size so it needs to be inferred
    for feature in (input_features[0], output_features[0]):
        del feature[PREPROCESSING]

    run_experiment(input_features, output_features, dataset=rel_path)

0 comments on commit b320bec

Please sign in to comment.