Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Torchscript] Add Vector preprocessing and postprocessing #2160

Merged
merged 13 commits into from
Jun 21, 2022
32 changes: 31 additions & 1 deletion ludwig/features/vector_feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
# limitations under the License.
# ==============================================================================
import logging
from typing import Any, Dict, Optional
from typing import Any, Dict, List, Optional

import numpy as np
import torch
Expand All @@ -41,10 +41,32 @@
from ludwig.utils import output_feature_utils
from ludwig.utils.misc_utils import set_default_value
from ludwig.utils.torch_utils import LudwigModule
from ludwig.utils.types import TorchscriptPreprocessingInput

logger = logging.getLogger(__name__)


class _VectorPreprocessing(torch.nn.Module):
    """Torchscript-compatible preprocessing module for vector features.

    Converts a batch of whitespace-delimited numeric strings (one string per
    sample, e.g. ``"1.0 2.5 3.0"``) into a single float32 tensor of shape
    ``(batch_size, vector_dim)``.
    """

    def forward(self, v: TorchscriptPreprocessingInput) -> torch.Tensor:
        # torch.jit.isinstance is the TorchScript-compatible way to refine a
        # union type; only List[str] inputs are supported for vector features.
        if not torch.jit.isinstance(v, List[str]):
            raise ValueError(f"Unsupported input: {v}")

        vectors = []
        for sample in v:
            # Each sample is parsed element-wise into a 1-D float32 tensor.
            # NOTE(review): assumes every sample has the same number of
            # elements — torch.stack raises otherwise.
            vector = torch.tensor([float(x) for x in sample.split()], dtype=torch.float32)
            vectors.append(vector)
        # Stack the per-sample vectors into a (batch_size, vector_dim) tensor.
        return torch.stack(vectors)


class _VectorPostprocessing(torch.nn.Module):
def forward(self, preds: Dict[str, torch.Tensor]) -> Dict[str, Any]:
# Workaround to convert type annotation from Dict[str, torch.Tensor] to Dict[str, Any]
preds_any: Dict[str, Any] = {}
for k, v in preds.items():
preds_any[k] = v
return preds_any


class _VectorPredict(PredictModule):
def forward(self, inputs: Dict[str, torch.Tensor], feature_name: str) -> Dict[str, torch.Tensor]:
logits = output_feature_utils.get_output_feature_tensor(inputs, feature_name, self.logits_key)
Expand Down Expand Up @@ -159,6 +181,10 @@ def populate_defaults(input_feature):
set_default_value(input_feature, TIED, None)
set_default_value(input_feature, "preprocessing", {})

@staticmethod
def create_preproc_module(metadata: Dict[str, Any]) -> torch.nn.Module:
    """Return a Torchscript-compatible preprocessing module for this feature.

    The ``metadata`` argument is accepted for interface uniformity with other
    feature types but is not needed by vector preprocessing.
    """
    module = _VectorPreprocessing()
    return module


class VectorOutputFeature(VectorFeatureMixin, OutputFeature):
decoder = "projector"
Expand Down Expand Up @@ -234,3 +260,7 @@ def populate_defaults(output_feature):
set_default_value(output_feature, "reduce_dependencies", None)
set_default_value(output_feature, "decoder", "projector")
set_default_value(output_feature, "dependencies", [])

@staticmethod
def create_postproc_module(metadata: Dict[str, Any]) -> torch.nn.Module:
    """Return a Torchscript-compatible postprocessing module for this feature.

    The ``metadata`` argument is accepted for interface uniformity with other
    feature types but is not needed by vector postprocessing.
    """
    module = _VectorPostprocessing()
    return module
8 changes: 5 additions & 3 deletions ludwig/models/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import torch
from torch import nn

from ludwig.constants import COLUMN, NAME, TYPE
from ludwig.constants import BAG, BINARY, CATEGORY, COLUMN, NAME, SEQUENCE, SET, TEXT, TIMESERIES, TYPE, VECTOR
from ludwig.data.postprocessing import convert_dict_to_df
from ludwig.data.preprocessing import load_metadata
from ludwig.features.feature_registries import input_type_registry, output_type_registry
Expand All @@ -22,6 +22,8 @@
from ludwig.utils.data_utils import load_json
from ludwig.utils.misc_utils import get_from_registry

FEATURES_TO_CAST_AS_STRINGS = {BINARY, CATEGORY, BAG, SET, TEXT, SEQUENCE, TIMESERIES, VECTOR}


class InferenceModule(nn.Module):
"""Wraps preprocessing, model forward pass, and postprocessing into a single module.
Expand Down Expand Up @@ -62,7 +64,7 @@ def __init__(self, model: "ECD", config: Dict[str, Any], training_set_metadata:
module_dict_key = get_module_dict_key_from_name(feature_name)
self.postproc_modules[module_dict_key] = feature.create_postproc_module(training_set_metadata[feature_name])

def forward(self, inputs: Dict[str, TorchscriptPreprocessingInput]):
def forward(self, inputs: Dict[str, TorchscriptPreprocessingInput]) -> Dict[str, Dict[str, Any]]:
with torch.no_grad():
preproc_inputs = {}
for module_dict_key, preproc in self.preproc_modules.items():
Expand Down Expand Up @@ -121,6 +123,6 @@ def to_inference_module_input(s: pd.Series, feature_type: str, load_paths=False)
elif feature_type == "audio":
if load_paths:
return [read_audio_from_path(v) if isinstance(v, str) else v for v in s]
if feature_type in {"binary", "category", "bag", "set", "text", "sequence", "timeseries"}:
if feature_type in FEATURES_TO_CAST_AS_STRINGS:
return s.astype(str).to_list()
return torch.from_numpy(s.to_numpy())
4 changes: 2 additions & 2 deletions tests/integration_tests/test_torchscript.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,8 +212,8 @@ def test_torchscript_e2e_tabular(csv_filename, tmpdir):
category_feature(vocab_size=3),
bag_feature(vocab_size=3),
set_feature(vocab_size=3),
vector_feature(),
# TODO: future support
# vector_feature(),
# date_feature(),
# h3_feature(),
]
Expand All @@ -222,11 +222,11 @@ def test_torchscript_e2e_tabular(csv_filename, tmpdir):
binary_feature(),
number_feature(),
category_feature(vocab_size=3),
vector_feature()
# TODO: future support
# sequence_feature(vocab_size=3),
# text_feature(vocab_size=3),
# set_feature(vocab_size=3),
# vector_feature()
]
backend = LocalTestBackend()
config = {"input_features": input_features, "output_features": output_features, TRAINER: {"epochs": 2}}
Expand Down