From e9e6ad7b7036301ff303b2b4855f35a40101121f Mon Sep 17 00:00:00 2001 From: Jakub Date: Fri, 1 Mar 2019 17:27:51 +0100 Subject: [PATCH] Reporting (#24) (#25) * added model diagnostic charts * added results conversion for BayesOptimization and Optuna libraries --- docs/examples/examples_index.rst | 1 + docs/examples/explore_hyperparams_skopt.ipynb | 6 +- docs/examples/log_matplotlib.ipynb | 2 +- docs/examples/log_model_diagnostics.ipynb | 188 +++++++++++ docs/user_guide/monitoring/reporting.rst | 6 + docs/user_guide/viz/utils.rst | 6 - neptunecontrib/hpo/utils.py | 140 ++++++++ neptunecontrib/monitoring/reporting.py | 298 ++++++++++++++++++ neptunecontrib/monitoring/skopt.py | 3 +- neptunecontrib/monitoring/utils.py | 44 +++ neptunecontrib/viz/utils.py | 136 -------- requirements.txt | 1 + 12 files changed, 683 insertions(+), 148 deletions(-) create mode 100644 docs/examples/log_model_diagnostics.ipynb create mode 100644 docs/user_guide/monitoring/reporting.rst delete mode 100644 docs/user_guide/viz/utils.rst create mode 100644 neptunecontrib/monitoring/reporting.py delete mode 100644 neptunecontrib/viz/utils.py diff --git a/docs/examples/examples_index.rst b/docs/examples/examples_index.rst index 8a8f5bd..2d9a873 100644 --- a/docs/examples/examples_index.rst +++ b/docs/examples/examples_index.rst @@ -2,6 +2,7 @@ Interactive experiment run comparison Hyper parameter comparison Run skopt/hyperopt hyperparameter sweep + Log model diagnostics Monitor lightGBM training Monitor fast.ai training Log matplotlib charts to neptune diff --git a/docs/examples/explore_hyperparams_skopt.ipynb b/docs/examples/explore_hyperparams_skopt.ipynb index 396a758..6055cf5 100644 --- a/docs/examples/explore_hyperparams_skopt.ipynb +++ b/docs/examples/explore_hyperparams_skopt.ipynb @@ -104,7 +104,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "You can use `df2result` helper function from `neptunecontrib.viz`." 
+ "You can use `df2result` helper function from `neptunecontrib.hpo.utils`." ] }, { @@ -125,7 +125,7 @@ } ], "source": [ - "from neptunecontrib.viz.utils import df2result\n", + "from neptunecontrib.hpo.utils import df2result\n", "\n", "result = df2result(hyper_df, \n", " metric_col='ROC_AUC', \n", @@ -209,7 +209,7 @@ ], "metadata": { "kernelspec": { - "display_name": "neptunecontrib_py36", + "display_name": "scraping", "language": "python", "name": "python3" }, diff --git a/docs/examples/log_matplotlib.ipynb b/docs/examples/log_matplotlib.ipynb index 964bc44..d006a05 100644 --- a/docs/examples/log_matplotlib.ipynb +++ b/docs/examples/log_matplotlib.ipynb @@ -105,7 +105,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "scraping", "language": "python", "name": "python3" }, diff --git a/docs/examples/log_model_diagnostics.ipynb b/docs/examples/log_model_diagnostics.ipynb new file mode 100644 index 0000000..a76273e --- /dev/null +++ b/docs/examples/log_model_diagnostics.ipynb @@ -0,0 +1,188 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Log model diagnostics to Neptune\n", + "## Train your model and run predictions\n", + "Let's train a model on a synthetic problem and predict on test data."
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.datasets import make_classification\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import classification_report\n", + "\n", + "X, y = make_classification(n_samples=2000)\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)\n", + "\n", + "model = RandomForestClassifier()\n", + "model.fit(X_train, y_train)\n", + "\n", + "y_test_pred = model.predict_proba(X_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Instantiate Neptune" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import neptune\n", + "\n", + "ctx = neptune.Context()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Send classification report to Neptune" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from neptunecontrib.monitoring.reporting import send_binary_classification_report\n", + "\n", + "send_binary_classification_report(ctx, y_test, y_test_pred, threshold=0.5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It is now safely logged in Neptune\n", + "\n", + "![image1](https://gist.githubusercontent.com/jakubczakon/f754769a39ea6b8fa9728ede49b9165c/raw/a1386b3a5edddc0eecb478a81d497336156b5b19/clf_report1.png)\n", + "\n", + "## Send confusion matrix to Neptune" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from neptunecontrib.monitoring.reporting import send_confusion_matrix\n", + "\n", + "send_confusion_matrix(ctx, y_test, y_test_pred[:, 1] > 0.5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It is now safely logged in Neptune\n", 
+ "\n", + "![image2](https://gist.githubusercontent.com/jakubczakon/f754769a39ea6b8fa9728ede49b9165c/raw/a1386b3a5edddc0eecb478a81d497336156b5b19/clf_report4.png)\n", + "\n", + "## Send ROC AUC curve to Neptune" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from neptunecontrib.monitoring.reporting import send_roc_auc_curve\n", + "\n", + "send_roc_auc_curve(ctx, y_test, y_test_pred)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It is now safely logged in Neptune\n", + "\n", + "![image3](https://gist.githubusercontent.com/jakubczakon/f754769a39ea6b8fa9728ede49b9165c/raw/a1386b3a5edddc0eecb478a81d497336156b5b19/clf_report3.png)\n", + "\n", + "## Send Precision-Recall curve to Neptune" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from neptunecontrib.monitoring.reporting import send_precision_recall\n", + "\n", + "send_prediction_distribution(y_test, y_test_pred)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It is now safely logged in Neptune\n", + "\n", + "![image4](https://gist.githubusercontent.com/jakubczakon/f754769a39ea6b8fa9728ede49b9165c/raw/a1386b3a5edddc0eecb478a81d497336156b5b19/clf_report5.png)\n", + "\n", + "## Send prediction distribution to Neptune" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from neptunecontrib.monitoring.reporting import send_prediction_distribution\n", + "\n", + "send_prediction_distribution(y_test, y_test_pred[:, 1])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It is now safely logged in Neptune\n", + "\n", + "![image5](https://gist.githubusercontent.com/jakubczakon/f754769a39ea6b8fa9728ede49b9165c/raw/a1386b3a5edddc0eecb478a81d497336156b5b19/clf_report2.png)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "santander",
"language": "python", + "name": "santander" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/user_guide/monitoring/reporting.rst b/docs/user_guide/monitoring/reporting.rst new file mode 100644 index 0000000..9059988 --- /dev/null +++ b/docs/user_guide/monitoring/reporting.rst @@ -0,0 +1,6 @@ +Reporting +=========== + +.. automodule:: neptunecontrib.monitoring.reporting + :members: + :show-inheritance: diff --git a/docs/user_guide/viz/utils.rst b/docs/user_guide/viz/utils.rst deleted file mode 100644 index dfe80a9..0000000 --- a/docs/user_guide/viz/utils.rst +++ /dev/null @@ -1,6 +0,0 @@ -Visualization utils -=========== - -.. automodule:: neptunecontrib.viz.utils - :members: - :show-inheritance: diff --git a/neptunecontrib/hpo/utils.py b/neptunecontrib/hpo/utils.py index a54d819..7decdc5 100644 --- a/neptunecontrib/hpo/utils.py +++ b/neptunecontrib/hpo/utils.py @@ -19,6 +19,7 @@ import subprocess from neptunelib.session import Session +import pandas as pd from scipy.optimize import OptimizeResult import skopt from retrying import retry @@ -133,6 +134,145 @@ def hyperopt2skopt(trials, space): return optimize_results +def df2result(df, metric_col, param_cols, param_types=None): + """Converts dataframe with metrics and hyperparameters to the OptimizeResults format. + + It is a helper function that lets you use all the tools that expect OptimizeResult object + like for example scikit-optimize plot_evaluations function. + + Args: + df(`pandas.DataFrame`): Dataframe containing metric and hyperparameters. + metric_col(str): Name of the metric column. + param_cols(list): Names of the hyperparameter columns. + param_types(list or None): Optional list of hyperparameter column types. 
+ By default it will treat all the columns as float but you can also pass str + for categorical channels. E.g param_types=[float, str, float, float] + + Returns: + `scipy.optimize.OptimizeResult`: Results object that contains the hyperparameter and metric + information. + + Examples: + Instantiate a session. + + >>> from neptunelib.api.session import Session + >>> session = Session() + + Fetch a project and a list of experiments. + + >>> project = session.get_projects('neptune-ml')['neptune-ml/Home-Credit-Default-Risk'] + >>> leaderboard = project.get_leaderboard(state=['succeeded'], owner=['czakon']) + + Convert the leaderboard dataframe to the `OptimizeResult` instance taking only the parameters and + metric that you care about. + + >>> result = df2result(leaderboard, + metric_col='channel_ROC_AUC', + param_cols=['parameter_lgbm__max_depth', 'parameter_lgbm__num_leaves', 'parameter_lgbm__min_child_samples']) + + """ + + if not param_types: + param_types = [float for _ in param_cols] + + df = _prep_df(df, param_cols, param_types) + df = df.sort_values(metric_col, ascending=False) + param_space = _convert_to_param_space(df, param_cols, param_types) + + results = OptimizeResult() + results.x_iters = df[param_cols].values + results.func_vals = df[metric_col].to_list() + results.x = results.x_iters[0] + results.fun = results.func_vals[0] + results.space = param_space + return results + + +def optuna2skopt(results): + """Converts optuna results to scipy OptimizeResult. + + Helper function that converts the optuna Trials instance into scipy OptimizeResult + format. + + Args: + results(`pandas.DataFrame`): Dataframe containing scores and hyperparameters. + It is the output of running study.trials_dataframe(). + + Returns: + `scipy.optimize.optimize.OptimizeResult`: Converted OptimizeResult. + + Examples: + Run your optuna study.
+ + >>> study = optuna.create_study() + >>> study.optimize(objective, n_trials=100) + + Convert trials_dataframe object to the OptimizeResult object. + + >>> import neptunecontrib.hpo.utils as hp_utils + >>> results = hp_utils.optuna2skopt(study.trials_dataframe()) + """ + + results_ = results['params'] + results_['target'] = -1.0 * results['value'] + return df2result(results_, + metric_col='target', + param_cols=[col for col in results_.columns if col != 'target']) + + +def bayes2skopt(results): + """Converts BayesOptimization results to scipy OptimizeResult. + + Helper function that converts the BayesOptimization results into scipy OptimizeResult + format. + + Args: + results(list): List of trial dicts, each with 'target' and 'params' keys. + It is the output of running bayes_optimization.space.res(). + + Returns: + `scipy.optimize.optimize.OptimizeResult`: Converted OptimizeResult. + + Examples: + Run BayesOptimize maximization. + + >>> bayes_optimization = BayesianOptimization(objective, space) + >>> bayes_optimization.maximize(init_points=10, n_iter=100, xi=0.06) + + Convert bayes.space.res() object to the OptimizeResult object.
+ + >>> import neptunecontrib.hpo.utils as hp_utils + >>> results = hp_utils.bayes2skopt(bayes_optimization.space.res()) + """ + + results = [{'target': trial['target'], **trial['params']} for trial in results] + results_df = pd.DataFrame(results) + return df2result(results_df, + metric_col='target', + param_cols=[col for col in results_df.columns if col != 'target']) + + +def _prep_df(df, param_cols, param_types): + for col, col_type in zip(param_cols, param_types): + df[col] = df[col].astype(col_type) + return df + + +def _convert_to_param_space(df, param_cols, param_types): + dimensions = [] + for colname, col_type in zip(param_cols, param_types): + if col_type == str: + dimensions.append(skopt.space.Categorical(categories=df[colname].unique(), + name=colname)) + elif col_type == float: + low, high = df[colname].min(), df[colname].max() + dimensions.append(skopt.space.Real(low, high, name=colname)) + else: + raise NotImplementedError + skopt_space = skopt.Space(dimensions) + return skopt_space + + def _convert_space_hop_skopt(space): dimensions = [] for name, specs in space.items(): diff --git a/neptunecontrib/monitoring/reporting.py b/neptunecontrib/monitoring/reporting.py new file mode 100644 index 0000000..ba1c904 --- /dev/null +++ b/neptunecontrib/monitoring/reporting.py @@ -0,0 +1,298 @@ +# +# Copyright (c) 2019, Neptune Labs Sp. z o.o. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import matplotlib.pyplot as plt +import neptune +import pandas as pd +import seaborn as sns +from scikitplot.metrics import plot_roc, plot_precision_recall, plot_confusion_matrix + +from neptunecontrib.monitoring.utils import fig2pil + + +def send_binary_classification_report(ctx, y_true, y_pred, + threshold=0.5, + figsize=(16, 12), + channel_name='classification report'): + """Creates binary classification report and logs it in Neptune. + + This function creates ROC AUC curve, confusion matrix, precision recall curve and + prediction distribution charts and logs them to the 'classification report' channel in Neptune. + + Args: + ctx(`neptune.Context`): Neptune context. + y_true (array-like, shape (n_samples)): Ground truth (correct) target values. + y_pred (array-like, shape (n_samples, 2)): Predictions both for negative and positive class + in the float format. + threshold(float): threshold to be applied for the class assignment. + figsize(tuple): size of the matplotlib.pyplot figure object + channel_name(str): name of the neptune channel. Default is 'classification report'. + + Examples: + Train the model and make predictions on test. + + >>> from sklearn.datasets import make_classification + >>> from sklearn.ensemble import RandomForestClassifier + >>> from sklearn.model_selection import train_test_split + >>> from sklearn.metrics import classification_report + >>> + >>> X, y = make_classification(n_samples=2000) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) + >>> + >>> model = RandomForestClassifier() + >>> model.fit(X_train, y_train) + >>> + >>> y_test_pred = model.predict_proba(X_test) + + Log classification report to Neptune.
+ + >>> import neptune + >>> from neptunecontrib.monitoring.reporting import send_binary_classification_report + >>> + >>> ctx = neptune.Context() + >>> send_binary_classification_report(ctx, y_test, y_test_pred) + + """ + fig, axs = plt.subplots(2, 2, figsize=figsize) + plot_roc(y_true, y_pred, ax=axs[0, 0]) + plot_precision_recall(y_true, y_pred, ax=axs[0, 1]) + plot_prediction_distribution(y_true, y_pred[:, 1], ax=axs[1, 0]) + plot_confusion_matrix(y_true, y_pred[:, 1] > threshold, ax=axs[1, 1]) + fig.tight_layout() + npt_pred_dist = neptune.Image(name='chart', description='', data=fig2pil(fig)) + ctx.channel_send(channel_name, npt_pred_dist) + + +def send_prediction_distribution(ctx, y_true, y_pred, figsize=(16, 12), channel_name='prediction distribution'): + """Creates prediction distribution chart and logs it in Neptune. + + Args: + ctx(`neptune.Context`): Neptune context. + y_true (array-like, shape (n_samples)): Ground truth (correct) target values. + y_pred (array-like, shape (n_samples)): Predictions for the positive class in the float format. + figsize(tuple): size of the matplotlib.pyplot figure object + channel_name(str): name of the neptune channel. Default is 'prediction distribution'. + + Examples: + Train the model and make predictions on test. + + >>> from sklearn.datasets import make_classification + >>> from sklearn.ensemble import RandomForestClassifier + >>> from sklearn.model_selection import train_test_split + >>> from sklearn.metrics import classification_report + >>> + >>> X, y = make_classification(n_samples=2000) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) + >>> + >>> model = RandomForestClassifier() + >>> model.fit(X_train, y_train) + >>> + >>> y_test_pred = model.predict_proba(X_test) + + Log prediction distribution to Neptune.
+ + >>> import neptune + >>> from neptunecontrib.monitoring.reporting import send_prediction_distribution + >>> + >>> ctx = neptune.Context() + >>> send_prediction_distribution(ctx, y_test, y_test_pred[:, 1]) + + """ + fig, ax = plt.subplots(figsize=figsize) + plot_prediction_distribution(y_true, y_pred, ax=ax) + npt_pred_dist = neptune.Image(name='chart', description='', data=fig2pil(fig)) + ctx.channel_send(channel_name, npt_pred_dist) + + +def send_roc_auc_curve(ctx, y_true, y_pred, figsize=(16, 12), channel_name='ROC AUC curve'): + """Creates ROC AUC curve and logs it in Neptune. + + Args: + ctx(`neptune.Context`): Neptune context. + y_true (array-like, shape (n_samples)): Ground truth (correct) target values. + y_pred (array-like, shape (n_samples, 2)): Predictions both for negative and positive class + in the float format. + figsize(tuple): size of the matplotlib.pyplot figure object + channel_name(str): name of the neptune channel. Default is 'ROC AUC curve'. + + Examples: + Train the model and make predictions on test. + + >>> from sklearn.datasets import make_classification + >>> from sklearn.ensemble import RandomForestClassifier + >>> from sklearn.model_selection import train_test_split + >>> from sklearn.metrics import classification_report + >>> + >>> X, y = make_classification(n_samples=2000) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) + >>> + >>> model = RandomForestClassifier() + >>> model.fit(X_train, y_train) + >>> + >>> y_test_pred = model.predict_proba(X_test) + + Log ROC AUC curve to Neptune.
+ + >>> import neptune + >>> from neptunecontrib.monitoring.reporting import send_roc_auc_curve + >>> + >>> ctx = neptune.Context() + >>> send_roc_auc_curve(ctx, y_test, y_test_pred) + + """ + fig, ax = plt.subplots(figsize=figsize) + plot_roc(y_true, y_pred, ax=ax) + npt_roc_auc = neptune.Image(name='chart', description='', data=fig2pil(fig)) + ctx.channel_send(channel_name, npt_roc_auc) + + +def send_confusion_matrix(ctx, y_true, y_pred, figsize=(16, 12), channel_name='confusion_matrix'): + """Creates confusion matrix and logs it in Neptune. + + Args: + ctx(`neptune.Context`): Neptune context. + y_true (array-like, shape (n_samples)): Ground truth (correct) target values. + y_pred (array-like, shape (n_samples)): Positive class predictions in the binary format. + figsize(tuple): size of the matplotlib.pyplot figure object + channel_name(str): name of the neptune channel. Default is 'confusion_matrix'. + + Examples: + Train the model and make predictions on test. + + >>> from sklearn.datasets import make_classification + >>> from sklearn.ensemble import RandomForestClassifier + >>> from sklearn.model_selection import train_test_split + >>> from sklearn.metrics import classification_report + >>> + >>> X, y = make_classification(n_samples=2000) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) + >>> + >>> model = RandomForestClassifier() + >>> model.fit(X_train, y_train) + >>> + >>> y_test_pred = model.predict_proba(X_test) + + Log confusion matrix to Neptune.
+ + >>> import neptune + >>> from neptunecontrib.monitoring.reporting import send_confusion_matrix + >>> + >>> ctx = neptune.Context() + >>> send_confusion_matrix(ctx, y_test, y_test_pred[:, 1] > 0.5) + + """ + fig, ax = plt.subplots(figsize=figsize) + plot_confusion_matrix(y_true, y_pred, ax=ax) + npt_conf_matrix = neptune.Image(name='chart', description='', data=fig2pil(fig)) + ctx.channel_send(channel_name, npt_conf_matrix) + + +def send_precision_recall(ctx, y_true, y_pred, figsize=(16, 12), channel_name='precision_recall_curve'): + """Creates precision recall curve and logs it in Neptune. + + Args: + ctx(`neptune.Context`): Neptune context. + y_true (array-like, shape (n_samples)): Ground truth (correct) target values. + y_pred (array-like, shape (n_samples, 2)): Predictions both for negative and positive class + in the float format. + figsize(tuple): size of the matplotlib.pyplot figure object + channel_name(str): name of the neptune channel. Default is 'precision_recall_curve'. + + Examples: + Train the model and make predictions on test. + + >>> from sklearn.datasets import make_classification + >>> from sklearn.ensemble import RandomForestClassifier + >>> from sklearn.model_selection import train_test_split + >>> from sklearn.metrics import classification_report + >>> + >>> X, y = make_classification(n_samples=2000) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) + >>> + >>> model = RandomForestClassifier() + >>> model.fit(X_train, y_train) + >>> + >>> y_test_pred = model.predict_proba(X_test) + + Log precision recall curve to Neptune.
+ + >>> import neptune + >>> from neptunecontrib.monitoring.reporting import send_precision_recall + >>> + >>> ctx = neptune.Context() + >>> send_precision_recall(ctx, y_test, y_test_pred) + + """ + fig, ax = plt.subplots(figsize=figsize) + plot_precision_recall(y_true, y_pred, ax=ax) + npt_roc_auc = neptune.Image(name='chart', description='', data=fig2pil(fig)) + ctx.channel_send(channel_name, npt_roc_auc) + + +def plot_prediction_distribution(y_true, y_pred, ax=None, figsize=None): + """Generates prediction distribution plot from predictions and true labels. + + Args: + y_true (array-like, shape (n_samples)): + Ground truth (correct) target values. + y_pred (array-like, shape (n_samples)): + Estimated targets as returned by a classifier. + ax (:class:`matplotlib.axes.Axes`, optional): The axes upon which to + plot the curve. If None, the plot is drawn on a new set of axes. + figsize (2-tuple, optional): Tuple denoting figure size of the plot + e.g. (6, 6). Defaults to ``None``. + Returns: + ax (:class:`matplotlib.axes.Axes`): The axes on which the plot was + drawn. + + Examples: + Train the model and make predictions on test. + + >>> from sklearn.datasets import make_classification + >>> from sklearn.ensemble import RandomForestClassifier + >>> from sklearn.model_selection import train_test_split + >>> from sklearn.metrics import classification_report + >>> + >>> X, y = make_classification(n_samples=2000) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) + >>> + >>> model = RandomForestClassifier() + >>> model.fit(X_train, y_train) + >>> + >>> y_test_pred = model.predict_proba(X_test) + + Plot prediction distribution. 
+ + >>> import neptune + >>> from neptunecontrib.monitoring.reporting import plot_prediction_distribution + >>> + >>> plot_prediction_distribution(y_test, y_test_pred[:, 1]) + """ + + if ax is None: + _, ax = plt.subplots(1, 1, figsize=figsize) + + ax.set_title('Prediction Distribution', fontsize='large') + + df = pd.DataFrame({'Prediction': y_pred, + 'True label': y_true}) + + sns.distplot(df[df['True label'] == 0]['Prediction'], label='negative', ax=ax) + sns.distplot(df[df['True label'] == 1]['Prediction'], label='positive', ax=ax) + + ax.legend(prop={'size': 16}, title='Labels') + + return ax diff --git a/neptunecontrib/monitoring/skopt.py b/neptunecontrib/monitoring/skopt.py index e293259..10e3089 100644 --- a/neptunecontrib/monitoring/skopt.py +++ b/neptunecontrib/monitoring/skopt.py @@ -17,8 +17,7 @@ import neptune import skopt.plots as sk_plots -from neptunecontrib.monitoring.utils import fig2pil -from neptunecontrib.viz.utils import axes2fig +from neptunecontrib.monitoring.utils import fig2pil, axes2fig class NeptuneMonitor: diff --git a/neptunecontrib/monitoring/utils.py b/neptunecontrib/monitoring/utils.py index d2078a4..1716d5d 100644 --- a/neptunecontrib/monitoring/utils.py +++ b/neptunecontrib/monitoring/utils.py @@ -14,6 +14,9 @@ # limitations under the License. # +from itertools import product + +import matplotlib.pyplot as plt import numpy as np from PIL import Image @@ -80,3 +83,44 @@ def is_offline_context(context): True """ return context.params.__class__.__name__ == 'OfflineContextParams' + + +def axes2fig(axes, fig=None): + """Converts ndarray of matplotlib object to matplotlib figure. + + Scikit-optimize plotting functions return ndarray of axes. This can be tricky + to work with so you can use this function to convert it to the standard figure format. + + Args: + axes(`numpy.ndarray`): Array of matplotlib axes objects. + fig('matplotlib.figure.Figure'): Matplotlib figure on which you may want to plot + your axes. Default None. 
+ + Returns: + 'matplotlib.figure.Figure': Matplotlib figure with axes objects as subplots. + + Examples: + Assuming you have a `scipy.optimize.OptimizeResult` object you want to plot. + + >>> from skopt.plots import plot_evaluations + >>> eval_plot = plot_evaluations(result, bins=20) + >>> type(eval_plot) + numpy.ndarray + + >>> from neptunecontrib.monitoring.utils import axes2fig + >>> fig = axes2fig(eval_plot) + >>> fig + matplotlib.figure.Figure + + """ + try: + h, w = axes.shape + if not fig: + fig = plt.figure(figsize=(h * 3, w * 3)) + for i, j in product(range(h), range(w)): + fig._axstack.add(fig._make_key(axes[i, j]), axes[i, j]) + except AttributeError: + if not fig: + fig = plt.figure(figsize=(6, 6)) + fig._axstack.add(fig._make_key(axes), axes) + return fig diff --git a/neptunecontrib/viz/utils.py b/neptunecontrib/viz/utils.py deleted file mode 100644 index 14ccb7e..0000000 --- a/neptunecontrib/viz/utils.py +++ /dev/null @@ -1,136 +0,0 @@ -# -# Copyright (c) 2019, Neptune Labs Sp. z o.o. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -from itertools import product - -import matplotlib.pyplot as plt -from scipy.optimize import OptimizeResult -import skopt - - -def df2result(df, metric_col, param_cols, param_types=None): - """Converts dataframe with metrics and hyperparameters to the OptimizeResults format. - - It is a helper function that lets you use all the tools that expect OptimizeResult object - like for example scikit-optimize plot_evaluations function.
- - Args: - df(`pandas.DataFrame`): Dataframe containing metric and hyperparameters. - metric_col(str): Name of the metric column. - param_cols(list): Names of the hyperparameter columns. - param_types(list or None): Optional list of hyperparameter column types. - By default it will treat all the columns as float but you can also pass str - for categorical channels. E.g param_types=[float, str, float, float] - - Returns: - `scipy.optimize.OptimizeResult`: Results object that contains the hyperparameter and metric - information. - - Examples: - Instantiate a session. - - >>> from neptunelib.api.session import Session - >>> session = Session() - - Fetch a project and a list of experiments. - - >>> project = session.get_projects('neptune-ml')['neptune-ml/Home-Credit-Default-Risk'] - >>> leaderboard = project.get_leaderboard(state=['succeeded'], owner=['czakon']) - - Comvert the leaderboard dataframe to the `ResultOptimize` instance taking only the parameters and - metric that you care about. - - >>> result = df2result(leaderboard, - metric_col='channel_ROC_AUC', - param_cols=['parameter_lgbm__max_depth', 'parameter_lgbm__num_leaves', 'parameter_lgbm__min_child_samples']) - - """ - - if not param_types: - param_types = [float for _ in param_cols] - - df = _prep_df(df, param_cols, param_types) - df = df.sort_values(metric_col, ascending=False) - param_space = _convert_to_param_space(df, param_cols, param_types) - - results = OptimizeResult() - results.x_iters = df[param_cols].values - results.func_vals = df[metric_col].to_list() - results.x = results.x_iters[0] - results.fun = results.func_vals[0] - results.space = param_space - return results - - -def axes2fig(axes, fig=None): - """Converts ndarray of matplotlib object to matplotlib figure. - - Scikit-optimize plotting functions return ndarray of axes. This can be tricky - to work with so you can use this function to convert it to the standard figure format. 
- - Args: - axes(`numpy.ndarray`): Array of matplotlib axes objects. - fig('matplotlib.figure.Figure'): Matplotlib figure on which you may want to plot - your axes. Default None. - - Returns: - 'matplotlib.figure.Figure': Matplotlib figure with axes objects as subplots. - - Examples: - Assuming you have a `scipy.optimize.OptimizeResult` object you want to plot. - - >>> from skopt.plots import plot_evaluations - >>> eval_plot = plot_evaluations(result, bins=20) - >>> type(eval_plot) - numpy.ndarray - - >>> from neptunecontrib.viz.utils import axes2fig - >>> fig = axes2fig(eval_plot) - >>> fig - matplotlib.figure.Figure - - """ - try: - h, w = axes.shape - if not fig: - fig = plt.figure(figsize=(h * 3, w * 3)) - for i, j in product(range(h), range(w)): - fig._axstack.add(fig._make_key(axes[i, j]), axes[i, j]) - except AttributeError: - if not fig: - fig = plt.figure(figsize=(6, 6)) - fig._axstack.add(fig._make_key(axes), axes) - return fig - - -def _prep_df(df, param_cols, param_types): - for col, col_type in zip(param_cols, param_types): - df[col] = df[col].astype(col_type) - return df - - -def _convert_to_param_space(df, param_cols, param_types): - dimensions = [] - for colname, col_type in zip(param_cols, param_types): - if col_type == str: - dimensions.append(skopt.space.Categorical(categories=df[colname].unique(), - name=colname)) - elif col_type == float: - low, high = df[colname].min(), df[colname].max() - dimensions.append(skopt.space.Real(low, high, name=colname)) - else: - raise NotImplementedError - skopt_space = skopt.Space(dimensions) - return skopt_space diff --git a/requirements.txt b/requirements.txt index 1b6866d..a53a1de 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,4 +15,5 @@ scikit-optimize==0.5.2 retrying==1.3.3 hyperopt==0.1.1 python-telegram-bot==11.1.0 +scikit-plot==0.3.7 vega