Remove the scikit learn restriction and bump minimal python version to 3.8 (#233)

* Remove the scikit learn restriction

* Set minimal version to 3.7

* Fix linter and swap the type check in the metalearners

* Replace boston dataset with california

* Rollback the type check change, linter will break

* Remove list accessor of the california dataset

* Reformat imports

* Change accessors

* Fix feature name

* Remove trailing space

* Put the correct test value

* Change test value

* Change test value

* Fix test pd extractors

* Fix transformation

* Fix type annotations

* Lint fix

* Lint fix

* Put the correct version

* Add changelog

* Bump lightgbm

* Add upper limits to deps

* Bump major

* Increase major constraint of pandas

* Remove upper limitation on xgboost

* Remove unused silent keyword

* Update a few types

* Lint fix

* Add typing extensions for python 3.7 support

* Trick to avoid type checking for lists

* Fix classification tests

* Try to replace ndarrays by numpy typing NDArrays

* Change back ndarray

* Reduce type list

* Add one more type

* Add other types

* Remove all other types

* Try to use numpy typing

* Drop python 3.7 support

* Swap utils for testing in pandas assertion functions

* In order to support pandas 2, bump xgboost up to version 2

* Fix xgboost dmatrix tests

* Fix rank categorical

* Solve pd extractors test

* Fix hash eval test

* Fix lookup in ensemble learner

* Add type annotation to the new functions

* Create conditional assertions based on python version

* Remove necessity for typing extension and fix hash values

* Lint fix

* Fix mypy for multiclass classification for lgbm classifier

* Bump catboost and joblib

* Bump pytest

* Bump coverage packages

* Bump xdist

* Bump mypy

* Bump hypothesis

* Rollback coverage bumps

* Update changelog

* Update changelog

* Change hash test to match exactly 8 minor version
otaviocv committed Nov 8, 2023
1 parent 054d319 commit a302d91
Showing 22 changed files with 348 additions and 228 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/push.yaml
@@ -50,7 +50,7 @@ jobs:
     runs-on: ubuntu-20.04
     strategy:
       matrix:
-        python-version: ["3.6", "3.7", "3.8", "3.9"]
+        python-version: ["3.8", "3.9"]
     steps:
       - uses: actions/checkout@v3
       - name: Set up Python ${{ matrix.python-version }}
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,11 @@
 # Changelog

+## [3.0.0] - 2023-11-08
+- **Enhancement**
+    - Remove support for python 3.6 and 3.7.
+    - Bumps in joblib, numpy, pandas, scikit-learn, statsmodels, toolz, catboost, lightgbm, shap, xgboost
+      and test auxiliary packages.
+
 ## [2.3.1] - 2023-04-11
 - **Bugfix**
     - Remove incorrect `lightgbm` import from common paths
12 changes: 6 additions & 6 deletions requirements.txt
@@ -1,6 +1,6 @@
-joblib>=0.13.2,<2
-numpy>=1.16.4,<2
-pandas>=0.24.1,<2
-scikit-learn>=0.21.2,<0.25.0
-statsmodels>=0.9.0,<1
-toolz>=0.9.0,<1
+joblib>=1.3.2,<2
+numpy>=1.24.4,<2
+pandas>=2,<3
+scikit-learn>=1,<2
+statsmodels>=0.14.0,<1
+toolz>=0.12.0,<1
2 changes: 1 addition & 1 deletion requirements_catboost.txt
@@ -1 +1 @@
-catboost>=0.14.2,<2
+catboost>=1.2.2,<2
2 changes: 1 addition & 1 deletion requirements_lgbm.txt
@@ -1 +1 @@
-lightgbm>=2.2.2,<4
+lightgbm>=4,<5
8 changes: 4 additions & 4 deletions requirements_test.txt
@@ -1,7 +1,7 @@
-pytest>=4.2.1,<7
+pytest>=7.4.3,<8
 pytest-cov>=2.6.1,<3
-pytest-xdist>=1.26.1,<3
-mypy>=0.670,<1
+pytest-xdist>=3.3.1,<4
+mypy>=1.6.1,<2
 coverage<5
 codecov>=2.0,<3
-hypothesis>=5.5.4,<7
+hypothesis>=6.88.3,<7
4 changes: 2 additions & 2 deletions requirements_tools.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
shap>=0.31.0,<=0.40
swifter>=0.284,<2
shap>=0.43,<1
swifter>=0.24,<2
2 changes: 1 addition & 1 deletion requirements_xgboost.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
xgboost>=0.81,<1.5
xgboost>=2,<3
4 changes: 1 addition & 3 deletions setup.py
@@ -34,7 +34,7 @@ def requirements_from_pip(filename='requirements.txt'):
     long_description=long_description,
     long_description_content_type="text/markdown",
     url='https://github.com/nubank/{:s}'.format(REPO_NAME),
-    python_requires='>=3.6.2,<3.10',
+    python_requires='>=3.8,<3.10',
     author="Nubank",
     package_dir={'': 'src'},
     packages=find_packages('src'),
@@ -52,8 +52,6 @@ def requirements_from_pip(filename='requirements.txt'):
     include_package_data=True,
     zip_safe=False,
     classifiers=[
-        'Programming Language :: Python :: 3.6',
-        'Programming Language :: Python :: 3.7',
         'Programming Language :: Python :: 3.8',
         'Programming Language :: Python :: 3.9'
     ])
2 changes: 1 addition & 1 deletion src/fklearn/resources/VERSION
@@ -1 +1 @@
-2.3.1
+3.0.0
194 changes: 118 additions & 76 deletions src/fklearn/training/classification.py
@@ -1,14 +1,15 @@
-from typing import List, Any, Optional, Callable, Tuple, Union, TYPE_CHECKING
+from typing import List, Any, Optional, Callable, Tuple, Union, TYPE_CHECKING, Literal

 import numpy as np
+import numpy.typing as npt
 import pandas as pd
 from pathlib import Path
 from toolz import curry, merge, assoc
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.linear_model import LogisticRegression
 from sklearn import __version__ as sk_version

-from fklearn.types import LearnerReturnType, LogType
+from fklearn.types import LearnerReturnType, LearnerLogType, LogType
 from fklearn.common_docstrings import learner_return_docstring, learner_pred_fn_docstring
 from fklearn.training.utils import log_learner_time, expand_features_encoded
@@ -83,16 +84,19 @@ def p(new_df: pd.DataFrame) -> pd.DataFrame:

     p.__doc__ = learner_pred_fn_docstring("logistic_classification_learner")

-    log = {'logistic_classification_learner': {
-        'features': features,
-        'target': target,
-        'parameters': merged_params,
-        'prediction_column': prediction_column,
-        'package': "sklearn",
-        'package_version': sk_version,
-        'feature_importance': dict(zip(features, clf.coef_.flatten())),
-        'training_samples': len(df)},
-        'object': clf}
+    log = {
+        'logistic_classification_learner': {
+            'features': features,
+            'target': target,
+            'parameters': merged_params,
+            'prediction_column': prediction_column,
+            'package': "sklearn",
+            'package_version': sk_version,
+            'feature_importance': dict(zip(features, clf.coef_.flatten())),
+            'training_samples': len(df)
+        },
+        'object': clf
+    }

     return p, p(df), log

@@ -174,13 +178,21 @@ def xgb_classification_learner(df: pd.DataFrame,

     features = features if not encode_extra_cols else expand_features_encoded(df, features)

-    dtrain = xgb.DMatrix(df[features].values, label=df[target].values, feature_names=map(str, features), weight=weights)
+    dtrain = xgb.DMatrix(
+        df[features].values,
+        label=df[target].values,
+        feature_names=list(map(str, features)),
+        weight=weights
+    )

     bst = xgb.train(params, dtrain, num_estimators)

     def p(new_df: pd.DataFrame, apply_shap: bool = False) -> pd.DataFrame:

-        dtest = xgb.DMatrix(new_df[features].values, feature_names=map(str, features))
+        dtest = xgb.DMatrix(
+            new_df[features].values,
+            feature_names=list(map(str, features))
+        )

         pred = bst.predict(dtest)
         if params["objective"] == "multi:softprob":
@@ -218,16 +230,19 @@ def p(new_df: pd.DataFrame, apply_shap: bool = False) -> pd.DataFrame:

     p.__doc__ = learner_pred_fn_docstring("xgb_classification_learner", shap=True)

-    log = {'xgb_classification_learner': {
-        'features': features,
-        'target': target,
-        'prediction_column': prediction_column,
-        'package': "xgboost",
-        'package_version': xgb.__version__,
-        'parameters': assoc(params, "num_estimators", num_estimators),
-        'feature_importance': bst.get_score(),
-        'training_samples': len(df)},
-        'object': bst}
+    log = {
+        'xgb_classification_learner': {
+            'features': features,
+            'target': target,
+            'prediction_column': prediction_column,
+            'package': "xgboost",
+            'package_version': xgb.__version__,
+            'parameters': assoc(params, "num_estimators", num_estimators),
+            'feature_importance': bst.get_score(),
+            'training_samples': len(df)
+        },
+        'object': bst
+    }

     return p, p(df), log

@@ -393,16 +408,19 @@ def p(new_df: pd.DataFrame, apply_shap: bool = False) -> pd.DataFrame:

     p.__doc__ = learner_pred_fn_docstring("catboost_classification_learner", shap=True)

-    log = {'catboost_classification_learner': {
-        'features': features,
-        'target': target,
-        'prediction_column': prediction_column,
-        'package': "catboost",
-        'package_version': catboost.__version__,
-        'parameters': assoc(params, "num_estimators", num_estimators),
-        'feature_importance': cbr.feature_importances_,
-        'training_samples': len(df)},
-        'object': cbr}
+    log = {
+        'catboost_classification_learner': {
+            'features': features,
+            'target': target,
+            'prediction_column': prediction_column,
+            'package': "catboost",
+            'package_version': catboost.__version__,
+            'parameters': assoc(params, "num_estimators", num_estimators),
+            'feature_importance': cbr.feature_importances_,
+            'training_samples': len(df)
+        },
+        'object': cbr
+    }

     return p, p(df), log

@@ -501,29 +519,34 @@ def p(new_df: pd.DataFrame) -> pd.DataFrame:

 @curry
 @log_learner_time(learner_name='lgbm_classification_learner')
-def lgbm_classification_learner(df: pd.DataFrame,
-                                features: List[str],
-                                target: str,
-                                learning_rate: float = 0.1,
-                                num_estimators: int = 100,
-                                extra_params: Optional[LogType] = None,
-                                prediction_column: str = "prediction",
-                                weight_column: Optional[str] = None,
-                                encode_extra_cols: bool = True,
-                                valid_sets: Optional[List[pd.DataFrame]] = None,
-                                valid_names: Optional[List[str]] = None,
-                                feval: Optional[Union[
-                                    Callable[[np.ndarray, pd.DataFrame], Tuple[str, float, bool]],
-                                    List[Callable[[np.ndarray, pd.DataFrame], Tuple[str, float, bool]]]]
-                                ] = None,
-                                init_model: Optional[Union[str, Path, 'Booster']] = None,
-                                feature_name: Union[List[str], str] = 'auto',
-                                categorical_feature: Union[List[str], List[int], str] = 'auto',
-                                keep_training_booster: bool = False,
-                                callbacks: Optional[List[Callable]] = None,
-                                dataset_init_score: Optional[Union[
-                                    List, List[List], np.ndarray, pd.Series, pd.DataFrame]
-                                ] = None) -> LearnerReturnType:
+def lgbm_classification_learner(
+    df: pd.DataFrame,
+    features: List[str],
+    target: str,
+    learning_rate: float = 0.1,
+    num_estimators: int = 100,
+    extra_params: Optional[LogType] = None,
+    prediction_column: str = "prediction",
+    weight_column: Optional[str] = None,
+    encode_extra_cols: bool = True,
+    valid_sets: Optional[List[pd.DataFrame]] = None,
+    valid_names: Optional[List[str]] = None,
+    feval: Optional[Union[
+        Union[Callable[[npt.NDArray, Any], Tuple[str, float, bool]],
+              Callable[[npt.NDArray, Any], List[Tuple[str, float, bool]]]],
+        List[Union[Callable[[npt.NDArray, Any],
+                            Tuple[str, float, bool]],
+                   Callable[[npt.NDArray, Any],
+                            List[Tuple[str, float, bool]]]]],
+        None
+    ]] = None,
+    init_model: Optional[Union[str, Path, 'Booster']] = None,
+    feature_name: Union[List[str], Literal['auto']] = 'auto',
+    categorical_feature: Union[List[str], List[int], Literal['auto']] = 'auto',
+    keep_training_booster: bool = False,
+    callbacks: Optional[List[Callable]] = None,
+    dataset_init_score: Optional[Union[List, List[List], npt.NDArray, pd.Series, pd.DataFrame]] = None
+) -> LearnerReturnType:
     """
     Fits an LGBM classifier to the dataset.

@@ -632,20 +655,37 @@ def lgbm_classification_learner(df: pd.DataFrame,

     features = features if not encode_extra_cols else expand_features_encoded(df, features)

-    dtrain = lgbm.Dataset(df[features].values, label=df[target], feature_name=list(map(str, features)), weight=weights,
-                          silent=True, init_score=dataset_init_score)
-
-    bst = lgbm.train(params=params, train_set=dtrain, num_boost_round=num_estimators, valid_sets=valid_sets,
-                     valid_names=valid_names, feval=feval, init_model=init_model, feature_name=feature_name,
-                     categorical_feature=categorical_feature, keep_training_booster=keep_training_booster,
-                     callbacks=callbacks)
+    dtrain = lgbm.Dataset(
+        df[features].values,
+        label=df[target],
+        feature_name=list(map(str, features)),
+        weight=weights,
+        init_score=dataset_init_score
+    )
+
+    bst = lgbm.train(
+        params=params,
+        train_set=dtrain,
+        num_boost_round=num_estimators,
+        valid_sets=valid_sets,
+        valid_names=valid_names,
+        feval=feval,
+        init_model=init_model,
+        feature_name=feature_name,
+        categorical_feature=categorical_feature,
+        keep_training_booster=keep_training_booster,
+        callbacks=callbacks
+    )

     def p(new_df: pd.DataFrame, apply_shap: bool = False) -> pd.DataFrame:
+        predictions = bst.predict(new_df[features].values)
+        if isinstance(predictions, List):
+            predictions = np.ndarray(predictions)
         if is_multiclass_classification:
             col_dict = {prediction_column + "_" + str(key): value
-                        for (key, value) in enumerate(bst.predict(new_df[features].values).T)}
+                        for (key, value) in enumerate(predictions.T)}
         else:
-            col_dict = {prediction_column: bst.predict(new_df[features].values)}
+            col_dict = {prediction_column: predictions}

         if apply_shap:
             import shap
@@ -675,16 +715,18 @@ def p(new_df: pd.DataFrame, apply_shap: bool = False) -> pd.DataFrame:

     p.__doc__ = learner_pred_fn_docstring("lgbm_classification_learner", shap=True)

-    log = {'lgbm_classification_learner': {
-        'features': features,
-        'target': target,
-        'prediction_column': prediction_column,
-        'package': "lightgbm",
-        'package_version': lgbm.__version__,
-        'parameters': assoc(params, "num_estimators", num_estimators),
-        'feature_importance': dict(zip(features, bst.feature_importance().tolist())),
-        'training_samples': len(df)},
-        'object': bst}
+    log: LearnerLogType = {
+        'lgbm_classification_learner': {
+            'features': features,
+            'target': target,
+            'prediction_column': prediction_column,
+            'package': "lightgbm",
+            'package_version': lgbm.__version__,
+            'parameters': assoc(params, "num_estimators", num_estimators),
+            'feature_importance': dict(zip(features, bst.feature_importance().tolist())),
+            'training_samples': len(df)},
+        'object': bst
+    }

     return p, p(df), log

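The widened `feval` annotation above matches LightGBM's custom-metric protocol: each callable receives the raw predictions and the validation `Dataset` and returns a `(name, value, is_higher_better)` tuple, or a list of such tuples; a list of callables is also accepted. A minimal sketch of a conforming metric follows — the `binary_error` name and the 0.5 threshold are illustrative, not part of this commit:

```python
from typing import Tuple

import lightgbm as lgbm
import numpy as np
import numpy.typing as npt


def binary_error(preds: npt.NDArray, eval_data: lgbm.Dataset) -> Tuple[str, float, bool]:
    """Custom metric: fraction of misclassified rows at a 0.5 threshold."""
    labels = eval_data.get_label()
    error = float(np.mean((preds > 0.5).astype(int) != labels))
    # LightGBM expects (name, value, is_higher_better).
    return "binary_error", error, False
```

Such a function would be passed as `feval=binary_error`, or as `feval=[binary_error]` for the list form the annotation also admits.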
10 changes: 8 additions & 2 deletions src/fklearn/training/ensemble.py
@@ -1,5 +1,7 @@
 from typing import Any, Dict, List, TypeVar

+import numpy as np
+import numpy.typing as npt
 import pandas as pd
 from toolz import curry, assoc, compose

@@ -136,10 +138,14 @@ def xgb_octopus_classification_learner(train_set: pd.DataFrame,
     def p(df: pd.DataFrame) -> pd.DataFrame:
         pred_fn = compose(*pred_fns.values())

+        def lookup(df: pd.DataFrame) -> npt.NDArray:
+            idx, cols = pd.factorize(df.pred_bin.values.squeeze())
+            output = df.reindex(cols, axis=1).to_numpy()[np.arange(len(df)), idx]
+            return output
+
         return (pred_fn(df)
                 .assign(pred_bin=prediction_column + "_bin_" + df[train_split_col].astype(str))
-                .assign(prediction=lambda d: d.lookup(d.index.values,
-                                                      d.pred_bin.values.squeeze()))
+                .assign(prediction=lookup)
                 .rename(index=str, columns={"prediction": prediction_column})
                 .drop("pred_bin", axis=1))
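The new `lookup` helper replaces `DataFrame.lookup`, which pandas deprecated in 1.2 and removed in 2.0, using the `factorize`/`reindex` recipe from the pandas deprecation notes. A toy sketch of the equivalence — the column names and values are illustrative, not from this commit:

```python
import numpy as np
import pandas as pd

# One prediction column per bin, plus a per-row label naming the bin to pick.
df = pd.DataFrame({
    "prediction_bin_0": [0.1, 0.2, 0.3],
    "prediction_bin_1": [0.9, 0.8, 0.7],
    "pred_bin": ["prediction_bin_0", "prediction_bin_1", "prediction_bin_0"],
})

# pandas < 2.0 equivalent: df.lookup(df.index, df["pred_bin"])
idx, cols = pd.factorize(df["pred_bin"].to_numpy().squeeze())
picked = df.reindex(cols, axis=1).to_numpy()[np.arange(len(df)), idx]
print(picked)  # [0.1 0.8 0.3]
```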
