Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] [Draft] Autologging functionality for scikit-learn integration with XGBoost and LightGBM #4885

Closed
wants to merge 16 commits into from
1 change: 1 addition & 0 deletions dev-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,5 @@ sphinx-autobuild
sphinx-click
scikit-learn
scipy
xgboost
kubernetes
55 changes: 55 additions & 0 deletions examples/xgboost_sklearn/train_sklearn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
from pprint import pprint

import pandas as pd
import xgboost as xgb
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import numpy as np
import mlflow
import mlflow.xgboost

from utils import fetch_logged_data


def main():
    """Train an XGBoost regressor with MLflow autologging, then round-trip the model.

    The example runs these steps in order:

    1. Load the dataset returned by ``load_boston`` and split it into
       train and test parts.
    2. Turn on ``mlflow.xgboost.autolog()`` and, inside one MLflow run, fit an
       ``XGBRegressor``, log the test-set MSE as the ``mse`` metric and log the
       model under the ``log_model`` artifact path.
    3. Pretty-print everything ``fetch_logged_data`` returns for that run.
    4. Save the model to ``trained_model/``, reload it through
       ``mlflow.pyfunc.load_model`` and assert that the reloaded model
       reproduces the original predictions exactly.
    """
    # prepare example dataset
    dataset = load_boston()
    features = pd.DataFrame(dataset.data, columns=dataset.feature_names)
    target = pd.Series(dataset.target)
    X_train, X_test, y_train, y_test = train_test_split(features, target)

    # enable auto logging
    # this includes xgboost.sklearn estimators
    mlflow.xgboost.autolog()

    # NOTE(review): the run scope is assumed to extend through log_model()
    # so that the model is attached to this same run -- confirm.
    with mlflow.start_run() as run:
        model = xgb.XGBRegressor(n_estimators=100, reg_lambda=1, gamma=0, max_depth=3)
        model.fit(X_train, y_train)

        # Evaluate on the held-out split and record the score explicitly.
        y_pred = model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        mlflow.log_metrics({"mse": mse})

        run_id = run.info.run_id
        print("Logged data and model in run {}".format(run_id))
        mlflow.xgboost.log_model(model, artifact_path="log_model")

    # show logged data
    logged = fetch_logged_data(run_id)
    for section, payload in logged.items():
        print("\n---------- logged {} ----------".format(section))
        pprint(payload)

    # Persist the model locally and check that the generic pyfunc loader
    # yields the same predictions as the in-memory estimator.
    # NOTE(review): save_model presumably refuses a target path that already
    # exists, so a second run from the same directory may fail -- confirm.
    mlflow.xgboost.save_model(model, "trained_model/")
    restored = mlflow.pyfunc.load_model("trained_model/")
    restored_pred = restored.predict(X_test)
    assert np.array_equal(y_pred, restored_pred)


# Run the example only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

26 changes: 26 additions & 0 deletions examples/xgboost_sklearn/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import mlflow


def yield_artifacts(run_id, path=None):
    """Walk a run's artifact tree depth-first and yield each file's path.

    :param run_id: ID of the run whose artifacts are listed.
    :param path: Artifact sub-path to list; ``None`` is passed through to
                 ``MlflowClient.list_artifacts`` unchanged.
    :return: Generator of ``path`` attributes of every listed entry that is
             not a directory, with directories expanded recursively in
             listing order.
    """
    tracking_client = mlflow.tracking.MlflowClient()
    entries = tracking_client.list_artifacts(run_id, path)
    for entry in entries:
        if not entry.is_dir:
            yield entry.path
            continue
        # Directory entry: descend and re-emit everything found beneath it.
        for nested_path in yield_artifacts(run_id, entry.path):
            yield nested_path


def fetch_logged_data(run_id):
    """Gather what was recorded for a run into one plain dictionary.

    :param run_id: ID of the run to query.
    :return: Dict with four keys: ``"params"`` and ``"metrics"`` taken
             directly from the run's data, ``"tags"`` holding only the tags
             whose key does not start with ``"mlflow."``, and ``"artifacts"``
             holding the list of paths produced by ``yield_artifacts``.
    """
    run_data = mlflow.tracking.MlflowClient().get_run(run_id).data

    # Exclude system tags: https://www.mlflow.org/docs/latest/tracking.html#system-tags
    user_tags = {}
    for tag_name, tag_value in run_data.tags.items():
        if tag_name.startswith("mlflow."):
            continue
        user_tags[tag_name] = tag_value

    # Materialise the generator so the caller gets a reusable list.
    artifact_paths = [*yield_artifacts(run_id)]

    logged = {"params": run_data.params, "metrics": run_data.metrics}
    logged["tags"] = user_tags
    logged["artifacts"] = artifact_paths
    return logged
7 changes: 6 additions & 1 deletion mlflow/pyfunc/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,7 @@
# Keys looked up in the pyfunc flavor configuration (`conf`).
# DATA: when present, conf[DATA] is joined onto the local model path to form
# the data path handed to the loader module's `_load_pyfunc`.
DATA = "data"
# NOTE(review): presumably the conda environment entry of the flavor conf -- confirm.
ENV = "env"
# NOTE(review): presumably the Python version recorded with the model -- confirm.
PY_VERSION = "python_version"
# MODEL_CLASS: optional key; when present in conf, load_model copies its value
# onto the loader module as the attribute `model_class` before `_load_pyfunc`
# is called.
MODEL_CLASS = "model_class"

# Module-level logger named after this module.
_logger = logging.getLogger(__name__)
# Type alias covering DataFrame, ndarray, list and str-keyed dict values.
# NOTE(review): presumably the accepted input types for prediction -- confirm.
PyFuncInput = Union[pandas.DataFrame, np.ndarray, List[Any], Dict[str, Any]]
Expand Down Expand Up @@ -664,7 +665,11 @@ def load_model(model_uri: str, suppress_warnings: bool = True) -> PyFuncModel:
code_path = os.path.join(local_path, conf[CODE])
mlflow.pyfunc.utils._add_code_to_system_path(code_path=code_path)
data_path = os.path.join(local_path, conf[DATA]) if (DATA in conf) else local_path
model_impl = importlib.import_module(conf[MAIN])._load_pyfunc(data_path)
module = importlib.import_module(conf[MAIN])
# Add model_class information if it is logged
if MODEL_CLASS in conf:
setattr(module, MODEL_CLASS, conf[MODEL_CLASS])
model_impl = module._load_pyfunc(data_path)
return PyFuncModel(model_meta=model_meta, model_impl=model_impl)


Expand Down