Skip to content

Commit

Permalink
Automatically infer vector_size for vector features when not provided
Browse files Browse the repository at this point in the history
  • Loading branch information
tgaddair committed Dec 28, 2022
1 parent 0d5d338 commit b320bec
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 6 deletions.
10 changes: 7 additions & 3 deletions ludwig/features/vector_feature.py
Expand Up @@ -118,14 +118,18 @@ def add_feature_data(

# Determine vector size
vector_size = backend.df_engine.compute(proc_df[feature_config[PROC_COLUMN]].map(len).max())
if "vector_size" in preprocessing_parameters:
if vector_size != preprocessing_parameters["vector_size"]:
vector_size_param = preprocessing_parameters.get("vector_size")
if vector_size_param is not None:
# TODO(travis): do we even need a user param for vector size if we're going to auto-infer it in all
# cases? Is this only useful as a sanity check for the user to make sure their data conforms to
# expectations?
if vector_size != vector_size_param:
raise ValueError(
"The user provided value for vector size ({}) does not "
"match the value observed in the data: {}".format(preprocessing_parameters, vector_size)
)
else:
logger.debug(f"Observed vector size: {vector_size}")
logger.debug(f"Detected vector size: {vector_size}")

metadata[feature_config[NAME]]["vector_size"] = vector_size
return proc_df
Expand Down
10 changes: 7 additions & 3 deletions tests/integration_tests/test_experiment.py
Expand Up @@ -25,7 +25,7 @@

from ludwig.api import LudwigModel
from ludwig.backend import LOCAL_BACKEND
from ludwig.constants import ENCODER, H3, TRAINER, TYPE
from ludwig.constants import ENCODER, H3, PREPROCESSING, TRAINER, TYPE
from ludwig.data.concatenate_datasets import concatenate_df
from ludwig.data.preprocessing import preprocess_for_training
from ludwig.encoders.registry import get_encoder_classes
Expand Down Expand Up @@ -821,7 +821,7 @@ def test_experiment_h3(encoder, csv_filename):
run_experiment(input_features, output_features, dataset=rel_path)


def test_experiment_vector_feature_1(csv_filename):
def test_experiment_vector_feature(csv_filename):
input_features = [vector_feature()]
output_features = [binary_feature()]
# Generate test data
Expand All @@ -830,10 +830,14 @@ def test_experiment_vector_feature_1(csv_filename):
run_experiment(input_features, output_features, dataset=rel_path)


def test_experiment_vector_feature_2(csv_filename):
def test_experiment_vector_feature_infer_size(csv_filename):
    """Run an experiment with vector in/out features whose vector_size must be inferred.

    The preprocessing section (which carries vector_size) is stripped from both
    features after data generation, forcing size inference from the data itself.
    """
    input_features = [vector_feature()]
    output_features = [vector_feature()]

    # Generate the dataset first, while the configured vector size is still present.
    rel_path = generate_data(input_features, output_features, csv_filename)

    # Unset vector_size so it needs to be inferred
    for feature in (input_features[0], output_features[0]):
        del feature[PREPROCESSING]

    run_experiment(input_features, output_features, dataset=rel_path)

0 comments on commit b320bec

Please sign in to comment.