mlflow · jwyyy · Oct 12, 2021 · Oct 12, 2021 · Oct 12, 2021 · Oct 12, 2021
diff --git a/dev-requirements.txt b/dev-requirements.txt
@@ -6,4 +6,5 @@ sphinx-autobuild
 sphinx-click
 scikit-learn
 scipy
+xgboost
 kubernetes
diff --git a/examples/xgboost_sklearn/train_sklearn.py b/examples/xgboost_sklearn/train_sklearn.py
@@ -0,0 +1,55 @@
+from pprint import pprint
+
+import pandas as pd
+import xgboost as xgb
+from sklearn.datasets import load_boston
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import mean_squared_error
+
+import numpy as np
+import mlflow
+import mlflow.xgboost
+
+from utils import fetch_logged_data
+
+
+def main():
+    """Train an XGBRegressor with autologging, print the logged run data, and reload the model."""
+    import os
+    import tempfile
+
+    # prepare example dataset
+    boston = load_boston()
+    X = pd.DataFrame(boston.data, columns=boston.feature_names)
+    y = pd.Series(boston.target)
+    X_train, X_test, y_train, y_test = train_test_split(X, y)
+
+    # enable auto logging; this includes xgboost.sklearn estimators
+    mlflow.xgboost.autolog()
+    with mlflow.start_run() as run:
+        run_id = run.info.run_id
+        regressor = xgb.XGBRegressor(n_estimators=100, reg_lambda=1, gamma=0, max_depth=3)
+        regressor.fit(X_train, y_train)
+        y_pred = regressor.predict(X_test)
+        mlflow.log_metrics({"mse": mean_squared_error(y_test, y_pred)})
+        print("Logged data and model in run {}".format(run_id))
+        mlflow.xgboost.log_model(regressor, artifact_path="log_model")
+
+    # show logged data
+    for key, data in fetch_logged_data(run_id).items():
+        print("\n---------- logged {} ----------".format(key))
+        pprint(data)
+
+    # save_model refuses to write to an already-populated path, so save into a fresh
+    # temporary directory instead of a fixed path; this keeps the example re-runnable.
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        model_path = os.path.join(tmp_dir, "trained_model")
+        mlflow.xgboost.save_model(regressor, model_path)
+        reload_model = mlflow.pyfunc.load_model(model_path)
+        # the reloaded pyfunc model must reproduce the original predictions
+        assert np.array_equal(y_pred, reload_model.predict(X_test))
+
+
+# Run the example only when this file is executed as a script, not when it is imported.
+if __name__ == "__main__":
+    main()
diff --git a/examples/xgboost_sklearn/utils.py b/examples/xgboost_sklearn/utils.py
@@ -0,0 +1,26 @@
+import mlflow
+
+
+def yield_artifacts(run_id, path=None):
+    """Recursively yield the path of every artifact file found under ``path`` in the run."""
+    tracking_client = mlflow.tracking.MlflowClient()
+    for entry in tracking_client.list_artifacts(run_id, path):
+        if not entry.is_dir:
+            yield entry.path
+            continue
+        yield from yield_artifacts(run_id, entry.path)
+
+
+def fetch_logged_data(run_id):
+    """Return the run's params, metrics, non-system tags and artifact paths as a dict."""
+    run_data = mlflow.tracking.MlflowClient().get_run(run_id).data
+    logged = {"params": run_data.params, "metrics": run_data.metrics}
+    # Keys prefixed with "mlflow." are system tags and are left out:
+    # https://www.mlflow.org/docs/latest/tracking.html#system-tags
+    logged["tags"] = {
+        name: value
+        for name, value in run_data.tags.items()
+        if not name.startswith("mlflow.")
+    }
+    logged["artifacts"] = list(yield_artifacts(run_id))
+    return logged
diff --git a/mlflow/pyfunc/__init__.py b/mlflow/pyfunc/__init__.py
@@ -257,6 +257,7 @@
 DATA = "data"
 ENV = "env"
 PY_VERSION = "python_version"
+MODEL_CLASS = "model_class"
 
 _logger = logging.getLogger(__name__)
 PyFuncInput = Union[pandas.DataFrame, np.ndarray, List[Any], Dict[str, Any]]
@@ -664,7 +665,11 @@ def load_model(model_uri: str, suppress_warnings: bool = True) -> PyFuncModel:
         code_path = os.path.join(local_path, conf[CODE])
         mlflow.pyfunc.utils._add_code_to_system_path(code_path=code_path)
     data_path = os.path.join(local_path, conf[DATA]) if (DATA in conf) else local_path
-    model_impl = importlib.import_module(conf[MAIN])._load_pyfunc(data_path)
+    module = importlib.import_module(conf[MAIN])
+    # Add model_class information if it is logged
+    if MODEL_CLASS in conf:
+        setattr(module, MODEL_CLASS, conf[MODEL_CLASS])
+    model_impl = module._load_pyfunc(data_path)
     return PyFuncModel(model_meta=model_meta, model_impl=model_impl)