From 35a19d20f8d6b85a9540dc87100f88871c2af03d Mon Sep 17 00:00:00 2001
From: "Kamil A. Kaczmarek"
Date: Fri, 11 Jun 2021 10:11:53 +0200
Subject: [PATCH] Docstrings, corrected namespaces, control tree visuals size (#4)

* standardize code <'> is now <">
* fixed problem with small trees visuals
* Added docstrings
* corrections in namespaces
---
 neptune_xgboost/impl/__init__.py | 220 ++++++++++++++++++++++++-------
 1 file changed, 174 insertions(+), 46 deletions(-)

diff --git a/neptune_xgboost/impl/__init__.py b/neptune_xgboost/impl/__init__.py
index 9b64248..bddae8e 100644
--- a/neptune_xgboost/impl/__init__.py
+++ b/neptune_xgboost/impl/__init__.py
@@ -15,15 +15,17 @@
 #
 
 __all__ = [
-    'NeptuneCallback',
+    "NeptuneCallback",
 ]
 
 import json
 import subprocess
 import warnings
+from io import BytesIO
 
 import matplotlib.pyplot as plt
 import xgboost as xgb
+from matplotlib import image
 
 from neptune_xgboost import __version__
 
@@ -36,25 +38,138 @@
     import neptune
     from neptune.internal.utils import verify_type
 
-INTEGRATION_VERSION_KEY = 'source_code/integrations/neptune-xgboost'
+INTEGRATION_VERSION_KEY = "source_code/integrations/neptune-xgboost"
 
 
 class NeptuneCallback(xgb.callback.TrainingCallback):
+    """Neptune callback for logging metadata during XGBoost model training.
+
+    See the guide with examples in the `Neptune-XGBoost docs`_.
+
+    This callback logs metrics, all parameters, the learning rate, the pickled model, and visualizations.
+    If early stopping is activated, "best_score" and "best_iteration" are also logged.
+
+    All metadata are collected under a common namespace that you can specify.
+    See the ``base_namespace`` argument (defaults to "training").
+
+    Metrics are logged for every dataset in the ``evals`` list and for every metric specified.
+    For example, with ``evals = [(dtrain, "train"), (dval, "valid")]`` and ``"eval_metric": ["mae", "rmse"]``,
+    four metrics are created::
+
+        "train/mae"
+        "train/rmse"
+        "valid/mae"
+        "valid/rmse"
+
+    The logged visualizations are feature importance charts and tree plots.
+
+    The callback works with the ``xgboost.train()`` and ``xgboost.cv()`` functions, and with the sklearn API ``model.fit()``.
+
+    Note:
+        This callback requires ``xgboost>=1.3.0``, the release that introduced the new-style Python callback API.
+
+    Note:
+        You can use the public ``api_token="ANONYMOUS"`` and set ``project="common/xgboost-integration"``
+        for testing without registration.
+
+    Args:
+        run (:obj:`neptune.new.run.Run`): Neptune run object.
+            A run in Neptune is a representation of all metadata that you log to Neptune.
+            Learn more in the `run docs`_.
+        base_namespace (:obj:`str`, optional): Defaults to "training".
+            Root namespace. All metadata will be logged inside it.
+        log_model (bool): Defaults to True. Log the model as a pickled file at the end of training.
+        log_importance (bool): Defaults to True. Log feature importance charts at the end of training.
+        max_num_features (int): Defaults to None. Max number of top features on the importance charts.
+            Works only if ``log_importance`` is set to ``True``. If None, all features will be displayed.
+            See `xgboost.plot_importance`_ for details.
+        log_tree (list): Defaults to None. Indices of the target trees to log as charts.
+            This requires graphviz to work. Learn about the setup in the `Neptune-XGBoost installation`_ docs.
+            See `xgboost.to_graphviz`_ for details.
+        tree_figsize (int): Defaults to 30. Controls the size of the visualized tree image.
+            Increase this value if you work with large trees. Works only if ``log_tree`` is a list.
+
+    Examples:
+        For more examples visit `example scripts`_.
+
+        Full script that does model training and logging of the metadata::
+
+            import neptune.new as neptune
+            import xgboost as xgb
+            from neptune.new.integrations.xgboost import NeptuneCallback
+            from sklearn.datasets import load_boston
+            from sklearn.model_selection import train_test_split
+
+            # Create run
+            run = neptune.init(
+                project="common/xgboost-integration",
+                api_token="ANONYMOUS",
+                name="xgb-train",
+                tags=["xgb-integration", "train"]
+            )
+
+            # Create neptune callback
+            neptune_callback = NeptuneCallback(run=run, log_tree=[0, 1, 2, 3])
+
+            # Prepare data
+            X, y = load_boston(return_X_y=True)
+            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
+            dtrain = xgb.DMatrix(X_train, label=y_train)
+            dval = xgb.DMatrix(X_test, label=y_test)
+
+            # Define parameters
+            model_params = {
+                "eta": 0.7,
+                "gamma": 0.001,
+                "max_depth": 9,
+                "objective": "reg:squarederror",
+                "eval_metric": ["mae", "rmse"]
+            }
+            evals = [(dtrain, "train"), (dval, "valid")]
+            num_round = 57
+
+            # Train the model and log metadata to the run in Neptune
+            xgb.train(
+                params=model_params,
+                dtrain=dtrain,
+                num_boost_round=num_round,
+                evals=evals,
+                callbacks=[
+                    neptune_callback,
+                    xgb.callback.LearningRateScheduler(lambda epoch: 0.99**epoch),
+                    xgb.callback.EarlyStopping(rounds=30)
+                ],
+            )
+
+    .. _Neptune-XGBoost docs:
+        https://docs.neptune.ai/integrations-and-supported-tools/model-training/xgboost
+    .. _Neptune-XGBoost installation:
+        https://docs.neptune.ai/integrations-and-supported-tools/model-training/xgboost#install-requirements
+    .. _run docs:
+        https://docs.neptune.ai/api-reference/run
+    .. _xgboost.plot_importance:
+        https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.plot_importance
+    .. _xgboost.to_graphviz:
+        https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.to_graphviz
+    .. _example scripts:
+        https://github.com/neptune-ai/examples/tree/main/integrations-and-supported-tools/xgboost/scripts
+    """
     def __init__(self,
-                 run,  # Neptune run, required
-                 base_namespace='training',  # if none we apply 'training' by default
-                 log_model=True,  # log model as pickled object at the end of training
-                 log_importance=True,  # requires matplotlib, log feature importance chart at the end of training
-                 max_num_features=None,  # requires matplotlib, number of top features on the feature importance chart
-                 log_tree=None):  # requires graphviz, indices of trained trees to log as chart, i.e. [0, 1, 2]
-
-        verify_type('run', run, neptune.Run)
-        verify_type('base_namespace', base_namespace, str)
-        log_model is not None and verify_type('log_model', log_model, bool)
-        log_importance is not None and verify_type('log_importance', log_importance, bool)
-        max_num_features is not None and verify_type('max_num_features', max_num_features, int)
-        log_tree is not None and verify_type('log_tree', log_tree, list)
+                 run,
+                 base_namespace="training",
+                 log_model=True,
+                 log_importance=True,
+                 max_num_features=None,
+                 log_tree=None,
+                 tree_figsize=30):
+
+        verify_type("run", run, neptune.Run)
+        verify_type("base_namespace", base_namespace, str)
+        log_model is not None and verify_type("log_model", log_model, bool)
+        log_importance is not None and verify_type("log_importance", log_importance, bool)
+        max_num_features is not None and verify_type("max_num_features", max_num_features, int)
+        log_tree is not None and verify_type("log_tree", log_tree, list)
+        verify_type("tree_figsize", tree_figsize, int)
 
         self.run = run[base_namespace]
         self.log_model = log_model
@@ -62,10 +177,11 @@ def __init__(self,
         self.max_num_features = max_num_features
         self.log_tree = log_tree
         self.cv = False
+        self.tree_figsize = tree_figsize
 
         if self.log_tree:
             try:
-                subprocess.call(['dot', '-V'], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+                subprocess.call(["dot", "-V"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
             except OSError:
                 self.log_tree = None
                 message = "Graphviz executables not found, so trees will not be logged. " \
@@ -75,21 +191,21 @@ def __init__(self,
         run[INTEGRATION_VERSION_KEY] = __version__
 
     def before_training(self, model):
-        if hasattr(model, 'cvfolds'):
+        if hasattr(model, "cvfolds"):
             self.cv = True
         return model
 
     def after_training(self, model):
-        # model structure is different for 'cv' and 'train' functions that you use to train xgb model
+        # model structure is different for "cv" and "train" functions that you use to train xgb model
         if self.cv:
             for i, fold in enumerate(model.cvfolds):
-                self.run[f'fold_{i}/booster_config'] = json.loads(fold.bst.save_config())
+                self.run[f"fold_{i}/booster_config"] = json.loads(fold.bst.save_config())
         else:
-            self.run['booster_config'] = json.loads(model.save_config())
-            if 'best_score' in model.attributes().keys():
-                self.run['best_score'] = model.attributes()['best_score']
-            if 'best_iteration' in model.attributes().keys():
-                self.run['best_iteration'] = model.attributes()['best_iteration']
+            self.run["booster_config"] = json.loads(model.save_config())
+            if "best_score" in model.attributes().keys():
+                self.run["early_stopping/best_score"] = model.attributes()["best_score"]
+            if "best_iteration" in model.attributes().keys():
+                self.run["early_stopping/best_iteration"] = model.attributes()["best_iteration"]
 
         self._log_importance(model)
         self._log_trees(model)
@@ -98,51 +214,63 @@ def after_training(self, model):
 
     def _log_importance(self, model):
         if self.log_importance:
-            # for 'cv' log importance chart per fold
+            # for "cv" log importance chart per fold
             if self.cv:
                 for i, fold in enumerate(model.cvfolds):
                     importance = xgb.plot_importance(fold.bst, max_num_features=self.max_num_features)
-                    self.run[f'fold_{i}/plots/importance'].upload(neptune.types.File.as_image(importance.figure))
-                    plt.close('all')
+                    self.run[f"fold_{i}/plots/importance"].upload(neptune.types.File.as_image(importance.figure))
+                    plt.close("all")
             else:
                 importance = xgb.plot_importance(model, max_num_features=self.max_num_features)
-                self.run['plots/importance'].upload(neptune.types.File.as_image(importance.figure))
-                plt.close('all')
+                self.run["plots/importance"].upload(neptune.types.File.as_image(importance.figure))
+                plt.close("all")
 
     def _log_trees(self, model):
         if self.log_tree is not None:
-            # for 'cv' log trees for each cv fold (different model is trained on each fold)
+            # for "cv" log trees for each cv fold (different model is trained on each fold)
             if self.cv:
                 for i, fold in enumerate(model.cvfolds):
                     trees = []
                     for j in self.log_tree:
-                        tree = xgb.plot_tree(fold.bst, num_trees=j)
-                        trees.append(neptune.types.File.as_image(tree.figure))
-                    self.run[f'fold_{i}/plots/trees'] = neptune.types.FileSeries(trees)
-                    plt.close('all')
+                        tree = xgb.to_graphviz(fold.bst, num_trees=j)
+                        _, ax = plt.subplots(1, 1, figsize=(self.tree_figsize, self.tree_figsize))
+                        s = BytesIO()
+                        s.write(tree.pipe(format="png"))
+                        s.seek(0)
+                        ax.imshow(image.imread(s))
+                        ax.axis("off")
+                        trees.append(neptune.types.File.as_image(ax.figure))
+                    self.run[f"fold_{i}/plots/trees"] = neptune.types.FileSeries(trees)
+                    plt.close("all")
             else:
                 trees = []
                 for j in self.log_tree:
-                    tree = xgb.plot_tree(model, num_trees=j)
-                    trees.append(neptune.types.File.as_image(tree.figure))
-                self.run['plots/trees'] = neptune.types.FileSeries(trees)
-                plt.close('all')
+                    tree = xgb.to_graphviz(model, num_trees=j)
+                    _, ax = plt.subplots(1, 1, figsize=(self.tree_figsize, self.tree_figsize))
+                    s = BytesIO()
+                    s.write(tree.pipe(format="png"))
+                    s.seek(0)
+                    ax.imshow(image.imread(s))
+                    ax.axis("off")
+                    trees.append(neptune.types.File.as_image(ax.figure))
+                self.run["plots/trees"] = neptune.types.FileSeries(trees)
+                plt.close("all")
 
     def _log_model(self, model):
         if self.log_model:
-            # for 'cv' log model per fold
+            # for "cv" log model per fold
             if self.cv:
                 for i, fold in enumerate(model.cvfolds):
-                    self.run[f'fold_{i}/model_pickle'].upload(neptune.types.File.as_pickle(fold.bst))
+                    self.run[f"fold_{i}/pickled_model"].upload(neptune.types.File.as_pickle(fold.bst))
             else:
-                self.run['model_pickle'].upload(neptune.types.File.as_pickle(model))
+                self.run["pickled_model"].upload(neptune.types.File.as_pickle(model))
 
     def before_iteration(self, model, epoch: int, evals_log) -> bool:
         # False to indicate training should not stop.
         return False
 
     def after_iteration(self, model, epoch: int, evals_log) -> bool:
-        self.run['epoch'].log(epoch)
+        self.run["epoch"].log(epoch)
         self._log_metrics(evals_log)
         self._log_learning_rate(model)
         return False
@@ -152,8 +280,8 @@ def _log_metrics(self, evals_log):
             for metric_name, metric_values in evals_log[stage].items():
                 if self.cv:
                     mean, std = metric_values[-1]
-                    self.run[stage][metric_name]['mean'].log(mean)
-                    self.run[stage][metric_name]['std'].log(std)
+                    self.run[stage][metric_name]["mean"].log(mean)
+                    self.run[stage][metric_name]["std"].log(std)
                 else:
                     self.run[stage][metric_name].log(metric_values[-1])
 
@@ -162,5 +290,5 @@ def _log_learning_rate(self, model):
            config = json.loads(model.cvfolds[0].bst.save_config())
         else:
             config = json.loads(model.save_config())
-        lr = config['learner']['gradient_booster']['updater']['grow_colmaker']['train_param']['learning_rate']
-        self.run['learning_rate'].log(float(lr))
+        lr = config["learner"]["gradient_booster"]["updater"]["grow_colmaker"]["train_param"]["learning_rate"]
+        self.run["learning_rate"].log(float(lr))
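
The docstring example above covers ``xgboost.train()`` only. Below is a minimal sketch of the ``xgboost.cv()`` path that the callback also supports, assuming the same Boston-housing data, booster parameters, and anonymous-run setup as in the docstring example (the run name, tags, and ``nfold`` value here are illustrative only). With cv, per-fold metrics land under ``<stage>/<metric>/mean`` and ``<stage>/<metric>/std``, and booster configs, importance charts, trees, and pickled models are logged per fold under ``fold_<i>/...``::

    import neptune.new as neptune
    import xgboost as xgb
    from neptune.new.integrations.xgboost import NeptuneCallback
    from sklearn.datasets import load_boston

    # Create an anonymous run in the shared example project
    run = neptune.init(
        project="common/xgboost-integration",
        api_token="ANONYMOUS",
        name="xgb-cv",
        tags=["xgb-integration", "cv"],
    )

    neptune_callback = NeptuneCallback(run=run, log_tree=[0, 1])

    # Prepare data as a single DMatrix; xgb.cv handles the fold splitting itself
    X, y = load_boston(return_X_y=True)
    dtrain = xgb.DMatrix(X, label=y)

    model_params = {
        "eta": 0.7,
        "gamma": 0.001,
        "max_depth": 9,
        "objective": "reg:squarederror",
        "eval_metric": ["mae", "rmse"],
    }

    # Cross-validated training; the callback detects cv mode via model.cvfolds
    # and logs metrics, configs, charts, and pickled models per fold
    xgb.cv(
        params=model_params,
        dtrain=dtrain,
        num_boost_round=57,
        nfold=5,
        callbacks=[neptune_callback],
    )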