Fix for 'upload_to_hf_hub()' path mismatch with 'save()' (#3977)
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
sanjaydasgupta and pre-commit-ci[bot] authored Mar 24, 2024
1 parent b2cb8b0 commit 4b07ce4
Showing 38 changed files with 192 additions and 121 deletions.
40 changes: 28 additions & 12 deletions ludwig/api.py
@@ -64,7 +64,9 @@
from ludwig.features.feature_registries import update_config_with_metadata, update_config_with_model
from ludwig.globals import (
LUDWIG_VERSION,
MODEL_FILE_NAME,
MODEL_HYPERPARAMETERS_FILE_NAME,
MODEL_WEIGHTS_FILE_NAME,
set_disable_progressbar,
TRAIN_SET_METADATA_FILE_NAME,
TRAINING_CHECKPOINTS_DIR_PATH,
@@ -110,6 +112,7 @@
from ludwig.utils.torch_utils import DEVICE
from ludwig.utils.trainer_utils import get_training_report
from ludwig.utils.types import DataFrame, TorchDevice
from ludwig.utils.upload_utils import HuggingFaceHub

logger = logging.getLogger(__name__)

@@ -1938,35 +1941,48 @@ def upload_to_hf_hub(
# Inputs
:param repo_id (`str`):
:param repo_id: (`str`)
A namespace (user or an organization) and a repo name separated
by a `/`.
:param model_path (`str`):
The path of the saved model. This is the top level directory where
the models weights as well as other associated training artifacts
are saved.
:param private (`bool`, *optional*, defaults to `False`):
:param model_path: (`str`)
The path of the saved model. This is either (a) the folder where
the 'model_weights' folder and the 'model_hyperparameters.json' file
are stored, or (b) the parent of that folder.
:param private: (`bool`, *optional*, defaults to `False`)
Whether the model repo should be private.
:param repo_type (`str`, *optional*):
:param repo_type: (`str`, *optional*)
Set to `"dataset"` or `"space"` if uploading to a dataset or
space, `None` or `"model"` if uploading to a model. Default is
`None`.
:param commit_message (`str`, *optional*):
:param commit_message: (`str`, *optional*)
The summary / title / first line of the generated commit. Defaults to:
`f"Upload {path_in_repo} with huggingface_hub"`
:param commit_description (`str` *optional*):
:param commit_description: (`str` *optional*)
The description of the generated commit
# Returns
:return: (bool) True for success, False for failure.
"""
if os.path.exists(os.path.join(model_path, MODEL_FILE_NAME, MODEL_WEIGHTS_FILE_NAME)) and os.path.exists(
os.path.join(model_path, MODEL_FILE_NAME, MODEL_HYPERPARAMETERS_FILE_NAME)
):
experiment_path = model_path
elif os.path.exists(os.path.join(model_path, MODEL_WEIGHTS_FILE_NAME)) and os.path.exists(
os.path.join(model_path, MODEL_HYPERPARAMETERS_FILE_NAME)
):
experiment_path = os.path.dirname(model_path)
else:
raise ValueError(
f"Can't find 'model_weights' and '{MODEL_HYPERPARAMETERS_FILE_NAME}' either at "
f"'{model_path}' or at '{model_path}/model'"
)
model_service = get_upload_registry()["hf_hub"]
hub = model_service()
hub: HuggingFaceHub = model_service()
hub.login()
upload_status = hub.upload(
upload_status: bool = hub.upload(
repo_id=repo_id,
model_path=model_path,
model_path=experiment_path,
repo_type=repo_type,
private=private,
commit_message=commit_message,
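The net effect of this hunk: upload_to_hf_hub() now accepts either the experiment directory produced by train() (the folder containing the 'model' subfolder) or that 'model' folder itself as written by save(), and resolves both to the same experiment directory before uploading. A minimal usage sketch under that assumption; the repo id and output paths below are hypothetical:

from ludwig.api import LudwigModel

model = LudwigModel(config="config.yaml")
model.train(dataset="train.csv", output_directory="results")

# (a) the experiment directory, which contains the 'model' subfolder
model.upload_to_hf_hub(repo_id="my-org/my-ludwig-model", model_path="results/experiment_run")

# (b) the 'model' directory itself, as written by save(); before this fix,
#     this form pointed the upload at the wrong directory level
model.upload_to_hf_hub(repo_id="my-org/my-ludwig-model", model_path="results/experiment_run/model")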
6 changes: 3 additions & 3 deletions ludwig/automl/automl.py
@@ -50,7 +50,7 @@
from ludwig.contrib import add_contrib_callback_args
from ludwig.data.cache.types import CacheableDataset
from ludwig.datasets import load_dataset_uris
from ludwig.globals import LUDWIG_VERSION
from ludwig.globals import LUDWIG_VERSION, MODEL_FILE_NAME
from ludwig.hyperopt.run import hyperopt
from ludwig.schema.model_config import ModelConfig
from ludwig.types import ModelConfigDict
@@ -101,10 +101,10 @@ def best_model(self) -> Optional[LudwigModel]:
# Read remote URIs using Ludwig's internal remote file loading APIs, as
# Ray's do not handle custom credentials at the moment.
with use_credentials(self._creds):
return LudwigModel.load(os.path.join(ckpt_path, "model"))
return LudwigModel.load(os.path.join(ckpt_path, MODEL_FILE_NAME))
else:
with checkpoint.as_directory() as ckpt_path:
return LudwigModel.load(os.path.join(ckpt_path, "model"))
return LudwigModel.load(os.path.join(ckpt_path, MODEL_FILE_NAME))


@PublicAPI
11 changes: 8 additions & 3 deletions ludwig/benchmarking/artifacts.py
@@ -2,6 +2,7 @@
from dataclasses import dataclass
from typing import Any, Dict

from ludwig.globals import MODEL_FILE_NAME
from ludwig.types import ModelConfigDict, TrainingSetMetadataDict
from ludwig.utils.data_utils import load_json, load_yaml

@@ -55,7 +56,11 @@ def build_benchmarking_result(benchmarking_config: dict, experiment_idx: int):
description=load_json(os.path.join(experiment_run_path, "description.json")),
test_statistics=load_json(os.path.join(experiment_run_path, "test_statistics.json")),
training_statistics=load_json(os.path.join(experiment_run_path, "training_statistics.json")),
model_hyperparameters=load_json(os.path.join(experiment_run_path, "model", "model_hyperparameters.json")),
training_progress=load_json(os.path.join(experiment_run_path, "model", "training_progress.json")),
training_set_metadata=load_json(os.path.join(experiment_run_path, "model", "training_set_metadata.json")),
model_hyperparameters=load_json(
os.path.join(experiment_run_path, MODEL_FILE_NAME, "model_hyperparameters.json")
),
training_progress=load_json(os.path.join(experiment_run_path, MODEL_FILE_NAME, "training_progress.json")),
training_set_metadata=load_json(
os.path.join(experiment_run_path, MODEL_FILE_NAME, "training_set_metadata.json")
),
)
4 changes: 2 additions & 2 deletions ludwig/benchmarking/summary_dataclasses.py
@@ -7,7 +7,7 @@

import ludwig.modules.metric_modules # noqa: F401
from ludwig.benchmarking.utils import format_memory, format_time
from ludwig.globals import MODEL_HYPERPARAMETERS_FILE_NAME
from ludwig.globals import MODEL_FILE_NAME, MODEL_HYPERPARAMETERS_FILE_NAME
from ludwig.modules.metric_registry import get_metric_classes, metric_feature_type_registry # noqa: F401
from ludwig.types import ModelConfigDict
from ludwig.utils.data_utils import load_json
@@ -209,7 +209,7 @@ def build_metrics_summary(experiment_local_directory: str) -> MetricsSummary:
e.g. local_experiment_repo/ames_housing/some_experiment/
"""
config = load_json(
os.path.join(experiment_local_directory, "experiment_run", "model", MODEL_HYPERPARAMETERS_FILE_NAME)
os.path.join(experiment_local_directory, "experiment_run", MODEL_FILE_NAME, MODEL_HYPERPARAMETERS_FILE_NAME)
)
report = load_json(os.path.join(experiment_local_directory, "experiment_run", "test_statistics.json"))
output_feature_type: str = config["output_features"][0]["type"]
8 changes: 4 additions & 4 deletions ludwig/benchmarking/utils.py
@@ -16,7 +16,7 @@
from ludwig.constants import BINARY, CATEGORY
from ludwig.datasets import model_configs_for_dataset
from ludwig.datasets.loaders.dataset_loader import DatasetLoader
from ludwig.globals import CONFIG_YAML
from ludwig.globals import CONFIG_YAML, MODEL_FILE_NAME, MODEL_WEIGHTS_FILE_NAME
from ludwig.utils.data_utils import load_yaml
from ludwig.utils.dataset_utils import get_repeatable_train_val_test_split
from ludwig.utils.defaults import default_random_seed
@@ -251,9 +251,9 @@ def delete_model_checkpoints(output_directory: str):
Args:
output_directory: output directory of the hyperopt run.
"""
shutil.rmtree(os.path.join(output_directory, "model", "training_checkpoints"), ignore_errors=True)
if os.path.isfile(os.path.join(output_directory, "model", "model_weights")):
os.remove(os.path.join(output_directory, "model", "model_weights"))
shutil.rmtree(os.path.join(output_directory, MODEL_FILE_NAME, "training_checkpoints"), ignore_errors=True)
if os.path.isfile(os.path.join(output_directory, MODEL_FILE_NAME, MODEL_WEIGHTS_FILE_NAME)):
os.remove(os.path.join(output_directory, MODEL_FILE_NAME, MODEL_WEIGHTS_FILE_NAME))


def delete_hyperopt_outputs(output_directory: str):
4 changes: 2 additions & 2 deletions ludwig/contribs/mlflow/__init__.py
@@ -6,7 +6,7 @@
from ludwig.api_annotations import DeveloperAPI, PublicAPI
from ludwig.callbacks import Callback
from ludwig.constants import TRAINER
from ludwig.globals import MODEL_HYPERPARAMETERS_FILE_NAME, TRAIN_SET_METADATA_FILE_NAME
from ludwig.globals import MODEL_FILE_NAME, MODEL_HYPERPARAMETERS_FILE_NAME, TRAIN_SET_METADATA_FILE_NAME
from ludwig.types import TrainingSetMetadataDict
from ludwig.utils.data_utils import chunk_dict, flatten_dict, save_json, to_json_dict
from ludwig.utils.package_utils import LazyLoader
@@ -258,7 +258,7 @@ def _log_mlflow(log_metrics, steps, save_path, should_continue, log_artifacts: b
def _log_artifacts(output_directory):
for fname in os.listdir(output_directory):
lpath = os.path.join(output_directory, fname)
if fname == "model":
if fname == MODEL_FILE_NAME:
_log_model(lpath)
else:
mlflow.log_artifact(lpath)
8 changes: 4 additions & 4 deletions ludwig/contribs/mlflow/model.py
@@ -16,7 +16,7 @@
from mlflow.utils.model_utils import _get_flavor_configuration

from ludwig.api_annotations import DeveloperAPI
from ludwig.globals import MODEL_HYPERPARAMETERS_FILE_NAME
from ludwig.globals import MODEL_FILE_NAME, MODEL_HYPERPARAMETERS_FILE_NAME
from ludwig.utils.data_utils import load_json

FLAVOR_NAME = "ludwig"
@@ -97,7 +97,7 @@ def save_model(
path = os.path.abspath(path)
if os.path.exists(path):
raise MlflowException(f"Path '{path}' already exists")
model_data_subpath = "model"
model_data_subpath = MODEL_FILE_NAME
model_data_path = os.path.join(path, model_data_subpath)
os.makedirs(path)
if mlflow_model is None:
@@ -267,7 +267,7 @@ def export_model(model_path, output_path, registered_model_name=None):
if not model_path.startswith("runs:/") or output_path is not None:
# No run specified, so in order to register the model in mlflow, we need
# to create a new run and upload the model as an artifact first
output_path = output_path or "model"
output_path = output_path or MODEL_FILE_NAME
log_model(
_CopyModel(model_path),
artifact_path=output_path,
@@ -295,7 +295,7 @@ def log_saved_model(lpath):
"""
log_model(
_CopyModel(lpath),
artifact_path="model",
artifact_path=MODEL_FILE_NAME,
)


1 change: 1 addition & 0 deletions ludwig/globals.py
@@ -16,6 +16,7 @@

LUDWIG_VERSION = "0.10.2.dev"

MODEL_FILE_NAME = "model"
MODEL_WEIGHTS_FILE_NAME = "model_weights"
MODEL_HYPERPARAMETERS_FILE_NAME = "model_hyperparameters.json"
TRAIN_SET_METADATA_FILE_NAME = "training_set_metadata.json"
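For reference, the on-disk layout these constants name (and that the path checks in api.py above and upload.py below rely on) looks roughly like this; a sketch, with 'results/experiment_run' as a hypothetical experiment path:

import os

from ludwig.globals import MODEL_FILE_NAME, MODEL_HYPERPARAMETERS_FILE_NAME, MODEL_WEIGHTS_FILE_NAME

# <experiment_path>/
#     model/                          <- MODEL_FILE_NAME
#         model_weights               <- MODEL_WEIGHTS_FILE_NAME
#         model_hyperparameters.json  <- MODEL_HYPERPARAMETERS_FILE_NAME
experiment_path = "results/experiment_run"  # hypothetical path
weights = os.path.join(experiment_path, MODEL_FILE_NAME, MODEL_WEIGHTS_FILE_NAME)
hyperparameters = os.path.join(experiment_path, MODEL_FILE_NAME, MODEL_HYPERPARAMETERS_FILE_NAME)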
7 changes: 4 additions & 3 deletions ludwig/hyperopt/execution.py
@@ -35,6 +35,7 @@
from ludwig.backend.ray import initialize_ray
from ludwig.callbacks import Callback
from ludwig.constants import MAXIMIZE, TEST, TRAINER, TRAINING, TYPE, VALIDATION
from ludwig.globals import MODEL_FILE_NAME
from ludwig.hyperopt.results import HyperoptResults, TrialResults
from ludwig.hyperopt.syncer import RemoteSyncer
from ludwig.hyperopt.utils import load_json_values, substitute_parameters
@@ -125,7 +126,7 @@ def ignore_dot_files(src, files):
return [f for f in files if f.startswith(".")]

with tune.checkpoint_dir(step=progress_tracker.tune_checkpoint_num) as checkpoint_dir:
checkpoint_model = os.path.join(checkpoint_dir, "model")
checkpoint_model = os.path.join(checkpoint_dir, MODEL_FILE_NAME)
# Atomic copying of the checkpoints
if not os.path.isdir(checkpoint_model):
copy_id = uuid.uuid4()
@@ -411,7 +412,7 @@ def _evaluate_best_model(
debug,
):
best_model = LudwigModel.load(
os.path.join(best_model_path, "model"),
os.path.join(best_model_path, MODEL_FILE_NAME),
backend=backend,
gpus=gpus,
gpu_memory_limit=gpu_memory_limit,
@@ -549,7 +550,7 @@ def on_trainer_train_setup(self, trainer, save_path, is_coordinator):
os.rename(save_path, tmp_path)

try:
safe_move_file(os.path.join(ckpt_path, "model"), save_path)
safe_move_file(os.path.join(ckpt_path, MODEL_FILE_NAME), save_path)
except Exception:
# Rollback from partial changes. Remove the save_path
# and move the original save_path back.
3 changes: 2 additions & 1 deletion ludwig/trainers/base.py
@@ -1,14 +1,15 @@
from abc import ABC, abstractmethod

from ludwig.data.dataset.base import Dataset
from ludwig.globals import MODEL_FILE_NAME
from ludwig.schema.trainer import BaseTrainerConfig
from ludwig.types import ModelConfigDict
from ludwig.utils.defaults import default_random_seed


class BaseTrainer(ABC):
@abstractmethod
def train(self, training_set, validation_set=None, test_set=None, save_path="model", **kwargs):
def train(self, training_set, validation_set=None, test_set=None, save_path=MODEL_FILE_NAME, **kwargs):
raise NotImplementedError()

@abstractmethod
3 changes: 2 additions & 1 deletion ludwig/trainers/trainer.py
@@ -50,6 +50,7 @@
from ludwig.distributed.base import DistributedStrategy, LocalStrategy
from ludwig.globals import (
is_progressbar_disabled,
MODEL_FILE_NAME,
MODEL_HYPERPARAMETERS_FILE_NAME,
TRAINING_CHECKPOINTS_DIR_PATH,
TRAINING_PROGRESS_TRACKER_FILE_NAME,
@@ -831,7 +832,7 @@ def train(
training_set,
validation_set=None,
test_set=None,
save_path="model",
save_path=MODEL_FILE_NAME,
return_state_dict: bool = False,
**kwargs,
):
9 changes: 7 additions & 2 deletions ludwig/trainers/trainer_lightgbm.py
@@ -15,7 +15,12 @@
from ludwig.distributed import init_dist_strategy
from ludwig.distributed.base import DistributedStrategy, LocalStrategy
from ludwig.features.feature_utils import LudwigFeatureDict
from ludwig.globals import is_progressbar_disabled, TRAINING_CHECKPOINTS_DIR_PATH, TRAINING_PROGRESS_TRACKER_FILE_NAME
from ludwig.globals import (
is_progressbar_disabled,
MODEL_FILE_NAME,
TRAINING_CHECKPOINTS_DIR_PATH,
TRAINING_PROGRESS_TRACKER_FILE_NAME,
)
from ludwig.models.gbm import GBM
from ludwig.models.predictor import Predictor
from ludwig.modules.metric_modules import get_improved_fn, get_initial_validation_value
@@ -562,7 +567,7 @@ def train(
training_set: Union["Dataset", "RayDataset"], # noqa: F821
validation_set: Optional[Union["Dataset", "RayDataset"]], # noqa: F821
test_set: Optional[Union["Dataset", "RayDataset"]], # noqa: F821
save_path="model",
save_path=MODEL_FILE_NAME,
**kwargs,
):
# ====== General setup =======
3 changes: 2 additions & 1 deletion ludwig/trainers/trainer_llm.py
@@ -9,6 +9,7 @@
from ludwig.data.dataset.base import Dataset
from ludwig.distributed.base import DistributedStrategy, LocalStrategy
from ludwig.features.feature_utils import LudwigFeatureDict
from ludwig.globals import MODEL_FILE_NAME
from ludwig.models.llm import LLM
from ludwig.models.predictor import LlmFineTunePredictor, LlmPredictor
from ludwig.modules.metric_modules import get_initial_validation_value
@@ -140,7 +141,7 @@ def train(
training_set: Dataset,
validation_set: Optional[Dataset] = None,
test_set: Optional[Dataset] = None,
save_path: str = "model",
save_path: str = MODEL_FILE_NAME,
return_state_dict: bool = False,
**kwargs,
):
25 changes: 20 additions & 5 deletions ludwig/upload.py
@@ -1,8 +1,10 @@
import argparse
import logging
import os
import sys
from typing import Optional

from ludwig.globals import MODEL_FILE_NAME, MODEL_HYPERPARAMETERS_FILE_NAME, MODEL_WEIGHTS_FILE_NAME
from ludwig.utils.print_utils import get_logging_level_registry
from ludwig.utils.upload_utils import HuggingFaceHub, Predibase

@@ -38,9 +40,9 @@ def upload_cli(
A namespace (user or an organization) and a repo name separated
by a `/`.
model_path (`str`):
The path of the saved model. This is the top level directory where
the models weights as well as other associated training artifacts
are saved.
The path of the saved model. This is the parent-folder of the folder
where the 'model_weights' folder and the 'model_hyperparameters.json' file
are stored.
private (`bool`, *optional*, defaults to `False`):
Whether the model repo should be private.
repo_type (`str`, *optional*):
@@ -60,10 +62,23 @@
`"predibase"`.
"""
model_service = get_upload_registry().get(service, "hf_hub")
hub = model_service()
hub: HuggingFaceHub = model_service()
if os.path.exists(os.path.join(model_path, MODEL_FILE_NAME, MODEL_WEIGHTS_FILE_NAME)) and os.path.exists(
os.path.join(model_path, MODEL_FILE_NAME, MODEL_HYPERPARAMETERS_FILE_NAME)
):
experiment_path = model_path
elif os.path.exists(os.path.join(model_path, MODEL_WEIGHTS_FILE_NAME)) and os.path.exists(
os.path.join(model_path, MODEL_HYPERPARAMETERS_FILE_NAME)
):
experiment_path = os.path.normpath(os.path.join(model_path, ".."))
else:
raise ValueError(
f"Can't find 'model_weights' and '{MODEL_HYPERPARAMETERS_FILE_NAME}' either at "
f"'{model_path}' or at '{model_path}/model'"
)
hub.upload(
repo_id=repo_id,
model_path=model_path,
model_path=experiment_path,
repo_type=repo_type,
private=private,
commit_message=commit_message,
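upload_cli() mirrors the same two-form fallback, so both path shapes should work from the programmatic entry point as well. A hedged sketch; the keyword names are assumed from the parameters shown above, and the repo id and paths are placeholders:

from ludwig.upload import upload_cli

# (a) the experiment directory containing the 'model' subfolder
upload_cli(service="hf_hub", repo_id="my-org/my-ludwig-model", model_path="results/experiment_run")

# (b) the 'model' directory itself; upload_cli resolves it to its parent
upload_cli(service="hf_hub", repo_id="my-org/my-ludwig-model", model_path="results/experiment_run/model")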