From 930994f08ab25e55f77b5703db381c2051a84c9f Mon Sep 17 00:00:00 2001
From: Justin Zhao
Date: Fri, 11 Aug 2023 10:05:39 -0400
Subject: [PATCH 1/2] Add mechanism to override default values for generation
 during model.predict().

---
 ludwig/api.py        | 73 ++++++++++++++++++++++----------------------
 ludwig/models/llm.py | 19 +++++++++++-
 2 files changed, 55 insertions(+), 37 deletions(-)

diff --git a/ludwig/api.py b/ludwig/api.py
index d1c5e2a3fe1..06323195715 100644
--- a/ludwig/api.py
+++ b/ludwig/api.py
@@ -831,6 +831,7 @@ def predict(
         data_format: str = None,
         split: str = FULL,
         batch_size: int = 128,
+        generation_config: Optional[Dict] = None,
         skip_save_unprocessed_output: bool = True,
         skip_save_predictions: bool = True,
         output_directory: str = "results",
@@ -840,46 +841,42 @@
     ) -> Tuple[Union[dict, pd.DataFrame], str]:
         """Using a trained model, make predictions from the provided dataset.
 
-        # Inputs
-        :param dataset: (Union[str, dict, pandas.DataFrame]) source containing
-            the entire dataset to be evaluated.
-        :param data_format: (str, default: `None`) format to interpret data
-            sources. Will be inferred automatically if not specified. Valid
-            formats are `'auto'`, `'csv'`, `'df'`, `'dict'`, `'excel'`, `'feather'`,
-            `'fwf'`, `'hdf5'` (cache file produced during previous training),
-            `'html'` (file containing a single HTML `<table>`), `'json'`, `'jsonl'`,
-            `'parquet'`, `'pickle'` (pickled Pandas DataFrame), `'sas'`, `'spss'`,
-            `'stata'`, `'tsv'`.
-        :param: split: (str, default= `'full'`): if the input dataset contains
-            a split column, this parameter indicates which split of the data
-            to use. Possible values are `'full'`, `'training'`, `'validation'`, `'test'`.
-        :param batch_size: (int, default: 128) size of batch to use when making
-            predictions.
-        :param skip_save_unprocessed_output: (bool, default: `True`) if this
-            parameter is `False`, predictions and their probabilities are saved
-            in both raw unprocessed numpy files containing tensors and as
-            postprocessed CSV files (one for each output feature).
-            If this parameter is `True`, only the CSV ones are saved and the
-            numpy ones are skipped.
-        :param skip_save_predictions: (bool, default: `True`) skips saving
-            test predictions CSV files.
-        :param output_directory: (str, default: `'results'`) the directory that
-            will contain the training statistics, TensorBoard logs, the saved
-            model and the training progress files.
-        :param return_type: (Union[str, dict, pandas.DataFrame], default: pd.DataFrame)
-            indicates the format of the returned predictions.
-        :param callbacks: (Optional[List[Callback]], default: None)
-            optional list of callbacks to use during this predict operation. Any callbacks
-            already registered to the model will be preserved.
-
-        # Return
+        Args:
+            dataset: (Union[str, dict, pandas.DataFrame]) source containing the entire dataset to be evaluated.
+            data_format: (str, default: `None`) format to interpret data sources. Will be inferred automatically if not
+                specified. Valid formats are `'auto'`, `'csv'`, `'df'`, `'dict'`, `'excel'`, `'feather'`, `'fwf'`,
+                `'hdf5'` (cache file produced during previous training), `'html'` (file containing a single HTML
+                `<table>`), `'json'`, `'jsonl'`, `'parquet'`, `'pickle'` (pickled Pandas DataFrame), `'sas'`, `'spss'`,
+                `'stata'`, `'tsv'`.
+            split: (str, default: `'full'`) if the input dataset contains a split column, this parameter indicates
+                which split of the data to use. Possible values are `'full'`, `'training'`, `'validation'`, `'test'`.
+            batch_size: (int, default: 128) size of batch to use when making predictions.
+            generation_config: (Dict, default: `None`) config to use for generating predictions. If `None`, the
+                generation config that was used during model training is used.
+            skip_save_unprocessed_output: (bool, default: `True`) if this parameter is `False`, predictions and their
+                probabilities are saved in both raw unprocessed numpy files containing tensors and as postprocessed CSV
+                files (one for each output feature). If this parameter is `True`, only the CSV ones are saved and the
+                numpy ones are skipped.
+            skip_save_predictions: (bool, default: `True`) skips saving test predictions CSV files.
+            output_directory: (str, default: `'results'`) the directory that will contain the training statistics,
+                TensorBoard logs, the saved model and the training progress files.
+            return_type: (Union[str, dict, pandas.DataFrame], default: pd.DataFrame) indicates the format of the
+                returned predictions.
+            callbacks: (Optional[List[Callback]], default: None) optional list of callbacks to use during this predict
+                operation. Any callbacks already registered to the model will be preserved.
 
-        :return: (Tuple[Union[dict, pd.DataFrame], str]) `(predictions, output_directory)`
-            `predictions` predictions from the provided dataset,
-            `output_directory` filepath string to where data was stored.
+        Returns:
+            `(predictions, output_directory)`: (Tuple[Union[dict, pd.DataFrame], str])
+                `predictions` predictions from the provided dataset,
+                `output_directory` filepath string to where data was stored.
         """
         self._check_initialization()
 
+        # Set the generation config if it exists.
+        # model.reset_generation_config() is called after batch prediction.
+        if generation_config is not None:
+            self.model.set_generation_config(generation_config)
+
         # preprocessing
         logger.debug("Preprocessing")
         dataset, _ = preprocess_for_prediction(  # TODO (Connor): Refactor to use self.config_obj
@@ -899,6 +896,10 @@
                 dataset,
             )
 
+            # If there was a generation config, reset it.
+            if generation_config is not None:
+                self.model.reset_generation_config()
+
             if self.backend.is_coordinator():
                 # if we are skipping all saving,
                 # there is no need to create a directory that will remain empty
diff --git a/ludwig/models/llm.py b/ludwig/models/llm.py
index 0ef2c9e3200..f3263564755 100644
--- a/ludwig/models/llm.py
+++ b/ludwig/models/llm.py
@@ -1,4 +1,5 @@
 import contextlib
+import copy
 import logging
 import os
 import tempfile
@@ -160,6 +161,10 @@ def __init__(
 
         self.generation = GenerationConfig(**self.config_obj.generation.to_dict())
 
+        # Save the original generation config so that we can reset it if/when self.generation is dynamically
+        # mutated during one-off predict calls after fine-tuning.
+        self.original_generation_config = copy.deepcopy(self.generation)
+
         # ================ Inputs ================
         try:
             self.input_features.update(self.build_inputs(input_feature_configs=self.config_obj.input_features))
@@ -195,6 +200,14 @@
     def create_feature_dict(self) -> LudwigFeatureDict:
         return DictWrapper(LudwigFeatureDict())
 
+    def set_generation_config(self, generation_config_dict):
+        """Sets the generation config for the model."""
+        self.generation = GenerationConfig(**generation_config_dict)
+
+    def reset_generation_config(self):
+        """Resets the generation config to the original config the model was initialized with."""
+        self.generation = self.original_generation_config
+
     @property
     def output_feature_decoder(self) -> OutputFeature:
         return self._output_feature_decoder.module
@@ -375,7 +388,7 @@ def generate(
         mask=None,
     ) -> Dict[str, torch.Tensor]:
         """Generates tokens using the model."""
-
+        logger.info(f"For generating text, using: {self.generation}")
         input_ids, _ = self._unpack_inputs(inputs)
 
         with torch.no_grad():
@@ -383,6 +396,10 @@ def generate(
             sequences_list = []
             for input_ids_sample in input_ids:
                 input_ids_sample_no_padding = remove_left_padding(input_ids_sample, self.tokenizer)
+                logger.info(
+                    "Decoded text inputs for the first example in batch: "
+                    f"{self.tokenizer.decode(input_ids_sample_no_padding[0])}"
+                )
 
                 if input_ids_sample_no_padding.shape[1] > self.max_input_length:
                     logger.warning(

From 1b6ee20c7216f9d22f536d3ed620bfdcb87b729e Mon Sep 17 00:00:00 2001
From: Justin Zhao
Date: Fri, 11 Aug 2023 10:08:03 -0400
Subject: [PATCH 2/2] Move config set call to after pre-processing.

---
 ludwig/api.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/ludwig/api.py b/ludwig/api.py
index 06323195715..3302f51d4d8 100644
--- a/ludwig/api.py
+++ b/ludwig/api.py
@@ -872,11 +872,6 @@ def predict(
         """
         self._check_initialization()
 
-        # Set the generation config if it exists.
-        # model.reset_generation_config() is called after batch prediction.
-        if generation_config is not None:
-            self.model.set_generation_config(generation_config)
-
         # preprocessing
         logger.debug("Preprocessing")
         dataset, _ = preprocess_for_prediction(  # TODO (Connor): Refactor to use self.config_obj
@@ -890,13 +885,18 @@ def predict(
             callbacks=self.callbacks + (callbacks or []),
         )
 
+        # Set the generation config if it exists.
+        # model.reset_generation_config() is called after batch prediction.
+        if generation_config is not None:
+            self.model.set_generation_config(generation_config)
+
         logger.debug("Predicting")
         with self.backend.create_predictor(self.model, batch_size=batch_size) as predictor:
             predictions = predictor.batch_predict(
                 dataset,
             )
 
-            # If there was a generation config, reset it.
+            # If there was a generation config set prior to batch prediction, reset it.
             if generation_config is not None:
                 self.model.reset_generation_config()
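
A quick usage sketch of the override mechanism these two patches introduce (illustrative only, not part of the patch series): `model` and `df` below are hypothetical stand-ins for a fine-tuned LLM `LudwigModel` and a pandas DataFrame of inputs. The dict keys mirror HuggingFace `transformers.GenerationConfig` fields, since `set_generation_config()` passes them straight into `GenerationConfig(**generation_config_dict)`.

    # Hypothetical example; `model` and `df` are stand-ins, not names defined by the patches.
    predictions, output_directory = model.predict(
        dataset=df,
        generation_config={
            "max_new_tokens": 64,  # forwarded verbatim to transformers.GenerationConfig
            "temperature": 0.1,
            "do_sample": True,
        },
    )

    # The override lasts only for the duration of the call: predict() invokes
    # model.reset_generation_config() after batch prediction, restoring the deep
    # copy captured in LLM.__init__. A later predict() without the argument
    # therefore generates with the original training-time config.
    predictions, _ = model.predict(dataset=df)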