Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add mechanic to override default values for generation during model.predict() #3520

Merged
merged 2 commits into from
Aug 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 37 additions & 36 deletions ludwig/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -831,6 +831,7 @@ def predict(
data_format: str = None,
split: str = FULL,
batch_size: int = 128,
generation_config: Optional[Dict] = None,
skip_save_unprocessed_output: bool = True,
skip_save_predictions: bool = True,
output_directory: str = "results",
Expand All @@ -840,43 +841,34 @@ def predict(
) -> Tuple[Union[dict, pd.DataFrame], str]:
"""Using a trained model, make predictions from the provided dataset.

# Inputs
:param dataset: (Union[str, dict, pandas.DataFrame]) source containing
the entire dataset to be evaluated.
:param data_format: (str, default: `None`) format to interpret data
sources. Will be inferred automatically if not specified. Valid
formats are `'auto'`, `'csv'`, `'df'`, `'dict'`, `'excel'`, `'feather'`,
`'fwf'`, `'hdf5'` (cache file produced during previous training),
`'html'` (file containing a single HTML `<table>`), `'json'`, `'jsonl'`,
`'parquet'`, `'pickle'` (pickled Pandas DataFrame), `'sas'`, `'spss'`,
`'stata'`, `'tsv'`.
:param: split: (str, default= `'full'`): if the input dataset contains
a split column, this parameter indicates which split of the data
to use. Possible values are `'full'`, `'training'`, `'validation'`, `'test'`.
:param batch_size: (int, default: 128) size of batch to use when making
predictions.
:param skip_save_unprocessed_output: (bool, default: `True`) if this
parameter is `False`, predictions and their probabilities are saved
in both raw unprocessed numpy files containing tensors and as
postprocessed CSV files (one for each output feature).
If this parameter is `True`, only the CSV ones are saved and the
numpy ones are skipped.
:param skip_save_predictions: (bool, default: `True`) skips saving
test predictions CSV files.
:param output_directory: (str, default: `'results'`) the directory that
will contain the training statistics, TensorBoard logs, the saved
model and the training progress files.
:param return_type: (Union[str, dict, pandas.DataFrame], default: pd.DataFrame)
indicates the format of the returned predictions.
:param callbacks: (Optional[List[Callback]], default: None)
optional list of callbacks to use during this predict operation. Any callbacks
already registered to the model will be preserved.

# Return
Args:
dataset: (Union[str, dict, pandas.DataFrame]): source containing the entire dataset to be evaluated.
data_format: (str, default: `None`) format to interpret data sources. Will be inferred automatically if not
specified. Valid formats are `'auto'`, `'csv'`, `'df'`, `'dict'`, `'excel'`, `'feather'`, `'fwf'`,
`'hdf5'` (cache file produced during previous training), `'html'` (file containing a single HTML
`<table>`), `'json'`, `'jsonl'`, `'parquet'`, `'pickle'` (pickled Pandas DataFrame), `'sas'`, `'spss'`,
`'stata'`, `'tsv'`.
split: (str, default= `'full'`): if the input dataset contains a split column, this parameter indicates
which split of the data to use. Possible values are `'full'`, `'training'`, `'validation'`, `'test'`.
batch_size: (int, default: 128) size of batch to use when making predictions.
generation_config: (Dict, default: `None`) config for the generation of the predictions. If `None`, the
config that was used during model training is used.
skip_save_unprocessed_output: (bool, default: `True`) if this parameter is `False`, predictions and their
probabilities are saved in both raw unprocessed numpy files containing tensors and as postprocessed CSV
files (one for each output feature). If this parameter is `True`, only the CSV ones are saved and the
numpy ones are skipped.
skip_save_predictions: (bool, default: `True`) skips saving test predictions CSV files.
output_directory: (str, default: `'results'`) the directory that will contain the training statistics,
TensorBoard logs, the saved model and the training progress files.
return_type: (Union[str, dict, pandas.DataFrame], default: pd.DataFrame) indicates the format of the
returned predictions.
callbacks: (Optional[List[Callback]], default: None) optional list of callbacks to use during this predict
operation. Any callbacks already registered to the model will be preserved.

:return: (Tuple[Union[dict, pd.DataFrame], str]) `(predictions, output_directory)`
`predictions` predictions from the provided dataset,
`output_directory` filepath string to where data was stored.
Returns:
`(predictions, output_directory)`: (Tuple[Union[dict, pd.DataFrame], str])
`predictions` predictions from the provided dataset,
`output_directory` filepath string to where data was stored.
"""
self._check_initialization()

Expand All @@ -893,12 +885,21 @@ def predict(
callbacks=self.callbacks + (callbacks or []),
)

# Set the generation config if it exists.
# model.reset_generation_config() is called after batch prediction.
if generation_config is not None:
self.model.set_generation_config(generation_config)

logger.debug("Predicting")
with self.backend.create_predictor(self.model, batch_size=batch_size) as predictor:
predictions = predictor.batch_predict(
dataset,
)

# If there was a generation config set prior to batch prediction, reset it.
if generation_config is not None:
self.model.reset_generation_config()

if self.backend.is_coordinator():
# if we are skipping all saving,
# there is no need to create a directory that will remain empty
Expand Down
19 changes: 18 additions & 1 deletion ludwig/models/llm.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import contextlib
import copy
import logging
import os
import tempfile
Expand Down Expand Up @@ -160,6 +161,10 @@ def __init__(

self.generation = GenerationConfig(**self.config_obj.generation.to_dict())

# Save the original generation config so that we can reset it if/when self.generation is
# dynamically mutated during one-off predict calls after fine-tuning.
self.original_generation_config = copy.deepcopy(self.generation)

# ================ Inputs ================
try:
self.input_features.update(self.build_inputs(input_feature_configs=self.config_obj.input_features))
Expand Down Expand Up @@ -195,6 +200,14 @@ def __init__(
def create_feature_dict(self) -> LudwigFeatureDict:
    """Return a freshly constructed feature dict, wrapped for module registration."""
    feature_dict = LudwigFeatureDict()
    return DictWrapper(feature_dict)

def set_generation_config(self, generation_config_dict):
    """Replace the model's active generation config with one built from the given dict."""
    new_generation_config = GenerationConfig(**generation_config_dict)
    self.generation = new_generation_config

def reset_generation_config(self):
    """Restores the generation config that the model was originally initialized with.

    Undoes any override applied via `set_generation_config` by restoring the saved
    original config (captured at construction time).
    """
    self.generation = self.original_generation_config

@property
def output_feature_decoder(self) -> OutputFeature:
return self._output_feature_decoder.module
Expand Down Expand Up @@ -375,14 +388,18 @@ def generate(
mask=None,
) -> Dict[str, torch.Tensor]:
"""Generates tokens using the model."""

logger.info(f"For generating text, using: {self.generation}")
input_ids, _ = self._unpack_inputs(inputs)

with torch.no_grad():
input_lengths = []
sequences_list = []
for input_ids_sample in input_ids:
input_ids_sample_no_padding = remove_left_padding(input_ids_sample, self.tokenizer)
logger.info(
"Decoded text inputs for the first example in batch: "
f"{self.tokenizer.decode(input_ids_sample_no_padding[0])}"
)

if input_ids_sample_no_padding.shape[1] > self.max_input_length:
logger.warning(
Expand Down
Loading