From 31cabd6484eff8bb3398b2e566123e01ae29367f Mon Sep 17 00:00:00 2001
From: Dan
Date: Wed, 17 Mar 2021 16:11:34 +0100
Subject: [PATCH 01/10] add long description, prepare bump

---
 README.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/README.md b/README.md
index dc3ddff..a1b97fe 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,9 @@
+[![PyPi Version](https://img.shields.io/pypi/pyversions/bokbokbok)](#)
+[![PyPI](https://img.shields.io/pypi/v/bokbokbok)](#)
+[![PyPI - Downloads](https://img.shields.io/pypi/dm/bokbokbok)](#)
+
 # bokbokbok
 
 
 
 

From aceab94021e285c4ab2681f69360e196bbbd98f1 Mon Sep 17 00:00:00 2001
From: Dan
Date: Wed, 17 Mar 2021 16:11:48 +0100
Subject: [PATCH 02/10] correct Gradint

---
 docs/derivations/note.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/derivations/note.md b/docs/derivations/note.md
index 191a45f..f4c3ce5 100644
--- a/docs/derivations/note.md
+++ b/docs/derivations/note.md
@@ -17,6 +17,6 @@ The Hessian is similarly calculated:
 
 
 
-We will make use of the following property for the calculations of the Losses and Hessians:
+We will make use of the following property for the calculations of the Gradients and Hessians:
 
 
\ No newline at end of file

From b3acd6fb17f0346dfe70cef157156cc9bf280372 Mon Sep 17 00:00:00 2001
From: Dan
Date: Wed, 17 Mar 2021 16:12:23 +0100
Subject: [PATCH 03/10] add working init

---
 bokbokbok/eval_metrics/regression/__init__.py   | 14 ++++++++++++++
 bokbokbok/loss_functions/regression/__init__.py | 12 ++++++++++++
 2 files changed, 26 insertions(+)

diff --git a/bokbokbok/eval_metrics/regression/__init__.py b/bokbokbok/eval_metrics/regression/__init__.py
index e69de29..ead2072 100644
--- a/bokbokbok/eval_metrics/regression/__init__.py
+++ b/bokbokbok/eval_metrics/regression/__init__.py
@@ -0,0 +1,14 @@
+"""Import required metrics."""
+
+
+from .regression_eval_metrics import (
+    SquaredLogErrorMetric,
+    RootMeanSquaredLogErrorMetric,
+    LogCoshMetric,
+)
+
+__all__ = [
+    "SquaredLogErrorMetric",
+    "RootMeanSquaredLogErrorMetric",
+    "LogCoshMetric"
+]
diff --git a/bokbokbok/loss_functions/regression/__init__.py b/bokbokbok/loss_functions/regression/__init__.py
index e69de29..0034830 100644
--- a/bokbokbok/loss_functions/regression/__init__.py
+++ b/bokbokbok/loss_functions/regression/__init__.py
@@ -0,0 +1,12 @@
+"""Import required losses."""
+
+
+from .regression_loss_functions import (
+    SquaredLogErrorLoss,
+    LogCoshLoss,
+)
+
+__all__ = [
+    "SquaredLogErrorLoss",
+    "LogCoshLoss"
+]
\ No newline at end of file

From 8080f77b7228b4613f23bd1309bc4263e37cd2e9 Mon Sep 17 00:00:00 2001
From: Dan
Date: Wed, 17 Mar 2021 16:12:43 +0100
Subject: [PATCH 04/10] add docstring

---
 bokbokbok/loss_functions/regression/regression_loss_functions.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/bokbokbok/loss_functions/regression/regression_loss_functions.py b/bokbokbok/loss_functions/regression/regression_loss_functions.py
index 803ebf7..886e388 100644
--- a/bokbokbok/loss_functions/regression/regression_loss_functions.py
+++ b/bokbokbok/loss_functions/regression/regression_loss_functions.py
@@ -62,6 +62,7 @@ def squared_log_loss(
 
 def LogCoshLoss():
     """
+    An alternative to Mean Absolute Error.
     """
 
     def _gradient(yhat, dtrain):

From 363c9799f9f63c5abb29efbceeaeefacb232d82c Mon Sep 17 00:00:00 2001
From: Dan
Date: Wed, 17 Mar 2021 18:43:49 +0100
Subject: [PATCH 05/10] add f1score

---
 .../eval_metrics/classification/__init__.py |  6 ++-
 .../classification_eval_metrics.py          | 38 ++++++++++++++++++-
 2 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/bokbokbok/eval_metrics/classification/__init__.py b/bokbokbok/eval_metrics/classification/__init__.py
index 7752d34..f77d8b3 100644
--- a/bokbokbok/eval_metrics/classification/__init__.py
+++ b/bokbokbok/eval_metrics/classification/__init__.py
@@ -1,12 +1,14 @@
 """Import required metrics."""
 
 
-from .classification_eval_metrics import(
+from .classification_eval_metrics import (
     WeightedCrossEntropyMetric,
     FocalMetric,
+    F1_Score_Binary,
 )
 
 __all__ = [
     "WeightedCrossEntropyMetric",
-    "FocalMetric"
+    "FocalMetric",
+    "F1_Score_Binary"
 ]
\ No newline at end of file
diff --git a/bokbokbok/eval_metrics/classification/classification_eval_metrics.py b/bokbokbok/eval_metrics/classification/classification_eval_metrics.py
index 415fdcd..2527790 100644
--- a/bokbokbok/eval_metrics/classification/classification_eval_metrics.py
+++ b/bokbokbok/eval_metrics/classification/classification_eval_metrics.py
@@ -1,4 +1,5 @@
 import numpy as np
+from sklearn.metrics import f1_score
 
 from bokbokbok.utils import clip_sigmoid
 
@@ -69,11 +70,44 @@ def focal_metric(yhat, dtrain, alpha=alpha, gamma=gamma, XGBoost=XGBoost):
 
         yhat = clip_sigmoid(yhat)
 
         elements = (- alpha * y * np.log(yhat) * np.power(1 - yhat, gamma) -
-                     (1 - y) * np.log(1 - yhat) * np.power(yhat, gamma))
+                    (1 - y) * np.log(1 - yhat) * np.power(yhat, gamma))
 
         if XGBoost:
             return f'Focal_alpha{alpha}_gamma{gamma}', (np.sum(elements) / len(y))
         else:
-            return f'Focal_alpha{alpha}_gamma{gamma}', (np.sum(elements)/ len(y)), False
+            return f'Focal_alpha{alpha}_gamma{gamma}', (np.sum(elements) / len(y)), False
 
     return focal_metric
+
+
+def F1_Score_Binary(XGBoost=False, *args, **kwargs):
+    """
+    Implements the f1_score metric from scikit learn:
+    https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html#sklearn-metrics-f1-score
+
+    Args:
+        *args: The arguments to be fed into the scikit learn metric.
+        XGBoost (Bool): Set to True if using XGBoost. We assume LightGBM as default use.
+                        Note that you should also set `maximize=True` in the XGBoost train function
+
+    """
+    def binary_f1_score(yhat, data, XGBoost=XGBoost):
+        """
+        F1 Score.
+
+        Args:
+            yhat: Predictions
+            dtrain: The XGBoost / LightGBM dataset
+            XGBoost (Bool): If XGBoost is to be implemented
+
+        Returns:
+            Name of the eval metric, Eval score, Bool to maximise function
+        """
+        y_true = data.get_label()
+        yhat = np.round(yhat)
+        if XGBoost:
+            return 'F1', f1_score(y_true, yhat, *args, **kwargs)
+        else:
+            return 'F1', f1_score(y_true, yhat, *args, **kwargs), True
+
+    return binary_f1_score

From 3cccb5885f58c08c7ceff3cadebde59a1b8956ed Mon Sep 17 00:00:00 2001
From: Dan
Date: Wed, 17 Mar 2021 18:44:13 +0100
Subject: [PATCH 06/10] clean

---
 .../classification/classification_loss_functions.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bokbokbok/loss_functions/classification/classification_loss_functions.py b/bokbokbok/loss_functions/classification/classification_loss_functions.py
index 8d2e467..fa6d90f 100644
--- a/bokbokbok/loss_functions/classification/classification_loss_functions.py
+++ b/bokbokbok/loss_functions/classification/classification_loss_functions.py
@@ -23,7 +23,7 @@ def _gradient(yhat, dtrain, alpha):
 
         yhat = clip_sigmoid(yhat)
 
-        grad = y * yhat * (alpha - 1) + yhat - alpha * y
+        grad = (y * yhat * (alpha - 1)) + yhat - (alpha * y)
 
         return grad
 

From a4f7b0179f60fe5af0a2ecc3da9969c785044ca8 Mon Sep 17 00:00:00 2001
From: Dan
Date: Wed, 17 Mar 2021 18:44:57 +0100
Subject: [PATCH 07/10] add sigmoid score

---
 docs/tutorials/focal_loss.ipynb             | 10 ++++++++--
 docs/tutorials/weighted_cross_entropy.ipynb | 10 ++++++++--
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/docs/tutorials/focal_loss.ipynb b/docs/tutorials/focal_loss.ipynb
index 06a50a1..934b39a 100644
--- a/docs/tutorials/focal_loss.ipynb
+++ b/docs/tutorials/focal_loss.ipynb
@@ -8,8 +8,10 @@ "source": [
     "from sklearn.datasets import make_classification\n",
     "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.metrics import roc_auc_score\n",
     "from bokbokbok.loss_functions.classification import FocalLoss\n",
     "from bokbokbok.eval_metrics.classification import FocalMetric\n",
+    "from bokbokbok.utils import clip_sigmoid\n",
     "\n",
     "X, y = make_classification(n_samples=1000, \n",
     "                            n_features=10, \n",
     "                            random_state=41114)\n",
@@ -54,7 +56,9 @@
     "                valid_names=['train','valid'],\n",
     "                fobj=FocalLoss(alpha=alpha, gamma=gamma),\n",
     "                feval=FocalMetric(alpha=alpha, gamma=gamma),\n",
-    "                early_stopping_rounds=100)"
+    "                early_stopping_rounds=100)\n",
+    "\n",
+    "roc_auc_score(y_valid, clip_sigmoid(clf.predict(X_valid)))"
    ]
   },
   {
@@ -89,7 +93,9 @@
     "                obj=FocalLoss(alpha=alpha, gamma=gamma),\n",
     "                maximize=False,\n",
     "                feval=FocalMetric(alpha=alpha, gamma=gamma, XGBoost=True),\n",
-    "                evals=[(dtrain, 'dtrain'), (dvalid, 'dvalid')])"
+    "                evals=[(dtrain, 'dtrain'), (dvalid, 'dvalid')])\n",
+    "\n",
+    "roc_auc_score(y_valid, clip_sigmoid(bst.predict(dvalid)))"
    ]
   }
  ],
diff --git a/docs/tutorials/weighted_cross_entropy.ipynb b/docs/tutorials/weighted_cross_entropy.ipynb
index 249fc35..aa66fe6 100644
--- a/docs/tutorials/weighted_cross_entropy.ipynb
+++ b/docs/tutorials/weighted_cross_entropy.ipynb
@@ -8,8 +8,10 @@ "source": [
     "from sklearn.datasets import make_classification\n",
     "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.metrics import roc_auc_score\n",
     "from bokbokbok.loss_functions.classification import WeightedCrossEntropyLoss\n",
     "from bokbokbok.eval_metrics.classification import WeightedCrossEntropyMetric\n",
+    "from bokbokbok.utils import clip_sigmoid\n",
     "\n",
     "X, y = make_classification(n_samples=1000, \n",
     "                            n_features=10, \n",
     "                            random_state=41114)\n",
@@ -53,7 +55,9 @@
     "                valid_names=['train','valid'],\n",
     "                fobj=WeightedCrossEntropyLoss(alpha=alpha),\n",
     "                feval=WeightedCrossEntropyMetric(alpha=alpha),\n",
-    "                early_stopping_rounds=100)"
+    "                early_stopping_rounds=100)\n",
+    "\n",
+    "roc_auc_score(y_valid, clip_sigmoid(clf.predict(X_valid)))"
    ]
   },
   {
@@ -88,7 +92,9 @@
     "                obj=WeightedCrossEntropyLoss(alpha=alpha),\n",
     "                maximize=False,\n",
     "                feval=WeightedCrossEntropyMetric(alpha=alpha, XGBoost=True),\n",
-    "                evals=[(dtrain, 'dtrain'), (dvalid, 'dvalid')])"
+    "                evals=[(dtrain, 'dtrain'), (dvalid, 'dvalid')])\n",
+    "\n",
+    "roc_auc_score(y_valid, clip_sigmoid(bst.predict(dvalid)))"
    ]
   }
  ],

From ba31867e50147c827e33b35b5ea916d4a8bee08f Mon Sep 17 00:00:00 2001
From: Dan
Date: Wed, 17 Mar 2021 18:45:28 +0100
Subject: [PATCH 08/10] add f1 score tutorial

---
 docs/tutorials/F1_score.ipynb | 126 ++++++++++++++++++++++++++++++++++
 1 file changed, 126 insertions(+)
 create mode 100644 docs/tutorials/F1_score.ipynb

diff --git a/docs/tutorials/F1_score.ipynb b/docs/tutorials/F1_score.ipynb
new file mode 100644
index 0000000..ef415a7
--- /dev/null
+++ b/docs/tutorials/F1_score.ipynb
@@ -0,0 +1,126 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.datasets import make_classification\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.metrics import roc_auc_score\n",
+    "from bokbokbok.eval_metrics.classification import F1_Score_Binary\n",
+    "from bokbokbok.utils import clip_sigmoid\n",
+    "\n",
+    "X, y = make_classification(n_samples=1000, \n",
+    "                            n_features=10, \n",
+    "                            random_state=41114)\n",
+    "\n",
+    "X_train, X_valid, y_train, y_valid = train_test_split(X, \n",
+    "                                                      y, \n",
+    "                                                      test_size=0.25, \n",
+    "                                                      random_state=41114)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Usage in LightGBM"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import lightgbm as lgb\n",
+    "\n",
+    "train = lgb.Dataset(X_train, y_train)\n",
+    "valid = lgb.Dataset(X_valid, y_valid, reference=train)\n",
+    "params = {\n",
+    "     'n_estimators': 300,\n",
+    "     'objective': 'binary',\n",
+    "     'seed': 41114,\n",
+    "     'n_jobs': 8,\n",
+    "     'learning_rate': 0.1,\n",
+    "   }\n",
+    "\n",
+    "clf = lgb.train(params=params,\n",
+    "                train_set=train,\n",
+    "                valid_sets=[train, valid],\n",
+    "                valid_names=['train','valid'],\n",
+    "                feval=F1_Score_Binary(average='micro'),\n",
+    "                early_stopping_rounds=100)\n",
+    "\n",
+    "roc_auc_score(y_valid, clf.predict(X_valid))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Usage in XGBoost"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import xgboost as xgb\n",
+    "\n",
+    "dtrain = xgb.DMatrix(X_train, y_train)\n",
+    "dvalid = xgb.DMatrix(X_valid, y_valid)\n",
+    "\n",
+    "params = {\n",
+    "     'seed': 41114,\n",
+    "     'objective':'binary:logistic',\n",
+    "     'learning_rate': 0.1,\n",
+    "     'disable_default_eval_metric': 1\n",
+    "   }\n",
+    "\n",
+    "bst = xgb.train(params,\n",
+    "                dtrain=dtrain,\n",
+    "                num_boost_round=300,\n",
+    "                early_stopping_rounds=10,\n",
+    "                verbose_eval=10,\n",
+    "                maximize=True,\n",
+    "                feval=F1_Score_Binary(average='micro', XGBoost=True),\n",
+    "                evals=[(dtrain, 'dtrain'), (dvalid, 'dvalid')])\n",
+    "\n",
+    "roc_auc_score(y_valid, clip_sigmoid(bst.predict(dvalid)))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python [conda env:skorecard_py37]",
+   "language": "python",
+   "name": "conda-env-skorecard_py37-py"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

From a53feb8119d7ab94bcb9a3b0b66fc8416257ae2f Mon Sep 17 00:00:00 2001
From: Dan
Date: Wed, 17 Mar 2021 18:46:26 +0100
Subject: [PATCH 09/10] add f1 score

---
 README.md  | 1 +
 mkdocs.yml | 1 +
 2 files changed, 2 insertions(+)

diff --git a/README.md b/README.md
index a1b97fe..37e67ea 100644
--- a/README.md
+++ b/README.md
@@ -16,6 +16,7 @@ Main features:
 - Focal Loss
 - Squared Log Error
 - Log Cosh Loss
+- F1 score
 
 ## Installation
 
diff --git a/mkdocs.yml b/mkdocs.yml
index dfb98ee..ef6baec 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -12,6 +12,7 @@
   - Tutorials:
     - Weighted Cross Entropy: tutorials/weighted_cross_entropy.ipynb
     - Focal Loss: tutorials/focal_loss.ipynb
+    - F1 Score: tutorials/F1_score.ipynb
   - Derivations:
     - General Remarks: derivations/note.md
     - Weighted Cross Entropy: derivations/wce.md

From 46978fd54c011275757540fa24634804ec761d61 Mon Sep 17 00:00:00 2001
From: Dan
Date: Wed, 17 Mar 2021 18:46:37 +0100
Subject: [PATCH 10/10] bump and long description

---
 setup.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/setup.py b/setup.py
index 4c53bc8..27b219a 100644
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,7 @@
 from setuptools import setup, find_packages
 
-# with open("README.md", "r", encoding="UTF-8") as fh:
-#     long_description = fh.read()
+with open("README.md", "r", encoding="UTF-8") as fh:
+    long_description = fh.read()
 
 base_packages = [
     "numpy>=1.19.2",
@@ -33,9 +33,9 @@
 setup(
     name="bokbokbok",
-    version="0.1",
+    version="0.2",
     description="Custom Losses and Metrics for XGBoost, LightGBM, CatBoost",
-    #long_description=long_description,
+    long_description=long_description,
     long_description_content_type="text/markdown",
     author="Daniel Timbrell",
     author_email="dantimbrell@gmail.com",