Remove the scikit learn restriction and bump minimal python version to 3.8 (#233)

* Remove the scikit learn restriction

* Set minimal version to 3.7

* Fix linter and swap the type check in the metalearners

* Replace boston dataset with california

* Rollback the type check change, linter will break

* Remove list accessor of the california dataset

* Reformat imports

* Change accessors

* Fix feature name

* Remove trailing space

* Put the correct test value

* Change test value

* Change test value

* Fix test pd extractors

* Fix transformation

* Fix type annotations

* Lint fix

* Lint fix

* Put the correct version

* Add changelog

* Bump lightgbm

* Add upper limits to deps

* Bump major

* Increase major constraint of pandas

* Remove upper limitation on xgboost

* Remove unused silent keyword

* Update a few types

* Lint fix

* Add typing extensions for python 3.7 support

* Trick to avoid type checking for lists

* Fix classification tests

* Try to replace ndarrays by numpy typing NDArrays

* Change back ndarray

* Reduce type list

* Add one more type

* Add other types

* Remove all other types

* Try to use numpy typing

* Drop python 3.7 support

* Swap utils for testing in pandas assertion functions

* In order to support pandas 2, bump xgboost up to version 2

* Fix xgboost dmatrix tests

* Fix rank categorical

* Solve pd extractors test

* Fix hash eval test

* Fix lookup in ensemble learner

* Add type annotation to the new functions

* Create conditional assertions based on python version

* Remove necessity for typing extension and fix hash values

* Lint fix

* Fix mypy for multiclass classification for lgbm classifier

* Bump catboost and joblib

* Bump pytest

* Bump coverage packages

* Bump xdist

* Bump mypy

* Bump hypothesis

* Rollback coverage bumps

* Update changelog

* Update changelog

* Change hash test to match exactly 8 minor version
otaviocv committed Nov 8, 2023
1 parent 054d319 commit a302d91
Showing 22 changed files with 348 additions and 228 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/push.yaml
@@ -50,7 +50,7 @@ jobs:
     runs-on: ubuntu-20.04
     strategy:
       matrix:
-        python-version: ["3.6", "3.7", "3.8", "3.9"]
+        python-version: ["3.8", "3.9"]
     steps:
       - uses: actions/checkout@v3
       - name: Set up Python ${{ matrix.python-version }}
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,11 @@
 # Changelog

+## [3.0.0] - 2023-11-08
+- **Enhancement**
+    - Remove support for python 3.6 and 3.7.
+    - Bumps in joblib, numpy, pandas, scikit-learn, statsmodels, toolz, catboost, lightgbm, shap, xgboost
+      and test auxiliary packages.
+
 ## [2.3.1] - 2023-04-11
 - **Bugfix**
     - Remove incorrect `lightgbm` import from common paths
12 changes: 6 additions & 6 deletions requirements.txt
@@ -1,6 +1,6 @@
-joblib>=0.13.2,<2
-numpy>=1.16.4,<2
-pandas>=0.24.1,<2
-scikit-learn>=0.21.2,<0.25.0
-statsmodels>=0.9.0,<1
-toolz>=0.9.0,<1
+joblib>=1.3.2,<2
+numpy>=1.24.4,<2
+pandas>=2,<3
+scikit-learn>=1,<2
+statsmodels>=0.14.0,<1
+toolz>=0.12.0,<1
2 changes: 1 addition & 1 deletion requirements_catboost.txt
@@ -1 +1 @@
-catboost>=0.14.2,<2
+catboost>=1.2.2,<2
2 changes: 1 addition & 1 deletion requirements_lgbm.txt
@@ -1 +1 @@
-lightgbm>=2.2.2,<4
+lightgbm>=4,<5
8 changes: 4 additions & 4 deletions requirements_test.txt
@@ -1,7 +1,7 @@
-pytest>=4.2.1,<7
+pytest>=7.4.3,<8
 pytest-cov>=2.6.1,<3
-pytest-xdist>=1.26.1,<3
-mypy>=0.670,<1
+pytest-xdist>=3.3.1,<4
+mypy>=1.6.1,<2
 coverage<5
 codecov>=2.0,<3
-hypothesis>=5.5.4,<7
+hypothesis>=6.88.3,<7
4 changes: 2 additions & 2 deletions requirements_tools.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
shap>=0.31.0,<=0.40
swifter>=0.284,<2
shap>=0.43,<1
swifter>=0.24,<2
2 changes: 1 addition & 1 deletion requirements_xgboost.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
xgboost>=0.81,<1.5
xgboost>=2,<3
4 changes: 1 addition & 3 deletions setup.py
@@ -34,7 +34,7 @@ def requirements_from_pip(filename='requirements.txt'):
     long_description=long_description,
     long_description_content_type="text/markdown",
     url='https://github.com/nubank/{:s}'.format(REPO_NAME),
-    python_requires='>=3.6.2,<3.10',
+    python_requires='>=3.8,<3.10',
     author="Nubank",
     package_dir={'': 'src'},
     packages=find_packages('src'),
@@ -52,8 +52,6 @@ def requirements_from_pip(filename='requirements.txt'):
     include_package_data=True,
     zip_safe=False,
     classifiers=[
-        'Programming Language :: Python :: 3.6',
-        'Programming Language :: Python :: 3.7',
         'Programming Language :: Python :: 3.8',
         'Programming Language :: Python :: 3.9'
     ])
2 changes: 1 addition & 1 deletion src/fklearn/resources/VERSION
@@ -1 +1 @@
-2.3.1
+3.0.0
194 changes: 118 additions & 76 deletions src/fklearn/training/classification.py
@@ -1,14 +1,15 @@
-from typing import List, Any, Optional, Callable, Tuple, Union, TYPE_CHECKING
+from typing import List, Any, Optional, Callable, Tuple, Union, TYPE_CHECKING, Literal

 import numpy as np
+import numpy.typing as npt
 import pandas as pd
 from pathlib import Path
 from toolz import curry, merge, assoc
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.linear_model import LogisticRegression
 from sklearn import __version__ as sk_version

-from fklearn.types import LearnerReturnType, LogType
+from fklearn.types import LearnerReturnType, LearnerLogType, LogType
 from fklearn.common_docstrings import learner_return_docstring, learner_pred_fn_docstring
 from fklearn.training.utils import log_learner_time, expand_features_encoded
@@ -83,16 +84,19 @@ def p(new_df: pd.DataFrame) -> pd.DataFrame:

     p.__doc__ = learner_pred_fn_docstring("logistic_classification_learner")

-    log = {'logistic_classification_learner': {
-        'features': features,
-        'target': target,
-        'parameters': merged_params,
-        'prediction_column': prediction_column,
-        'package': "sklearn",
-        'package_version': sk_version,
-        'feature_importance': dict(zip(features, clf.coef_.flatten())),
-        'training_samples': len(df)},
-        'object': clf}
+    log = {
+        'logistic_classification_learner': {
+            'features': features,
+            'target': target,
+            'parameters': merged_params,
+            'prediction_column': prediction_column,
+            'package': "sklearn",
+            'package_version': sk_version,
+            'feature_importance': dict(zip(features, clf.coef_.flatten())),
+            'training_samples': len(df)
+        },
+        'object': clf
+    }

     return p, p(df), log

@@ -174,13 +178,21 @@ def xgb_classification_learner(df: pd.DataFrame,

     features = features if not encode_extra_cols else expand_features_encoded(df, features)

-    dtrain = xgb.DMatrix(df[features].values, label=df[target].values, feature_names=map(str, features), weight=weights)
+    dtrain = xgb.DMatrix(
+        df[features].values,
+        label=df[target].values,
+        feature_names=list(map(str, features)),
+        weight=weights
+    )

     bst = xgb.train(params, dtrain, num_estimators)

     def p(new_df: pd.DataFrame, apply_shap: bool = False) -> pd.DataFrame:

-        dtest = xgb.DMatrix(new_df[features].values, feature_names=map(str, features))
+        dtest = xgb.DMatrix(
+            new_df[features].values,
+            feature_names=list(map(str, features))
+        )

         pred = bst.predict(dtest)
         if params["objective"] == "multi:softprob":
@@ -218,16 +230,19 @@ def p(new_df: pd.DataFrame, apply_shap: bool = False) -> pd.DataFrame:

     p.__doc__ = learner_pred_fn_docstring("xgb_classification_learner", shap=True)

-    log = {'xgb_classification_learner': {
-        'features': features,
-        'target': target,
-        'prediction_column': prediction_column,
-        'package': "xgboost",
-        'package_version': xgb.__version__,
-        'parameters': assoc(params, "num_estimators", num_estimators),
-        'feature_importance': bst.get_score(),
-        'training_samples': len(df)},
-        'object': bst}
+    log = {
+        'xgb_classification_learner': {
+            'features': features,
+            'target': target,
+            'prediction_column': prediction_column,
+            'package': "xgboost",
+            'package_version': xgb.__version__,
+            'parameters': assoc(params, "num_estimators", num_estimators),
+            'feature_importance': bst.get_score(),
+            'training_samples': len(df)
+        },
+        'object': bst
+    }

     return p, p(df), log

@@ -393,16 +408,19 @@ def p(new_df: pd.DataFrame, apply_shap: bool = False) -> pd.DataFrame:

     p.__doc__ = learner_pred_fn_docstring("catboost_classification_learner", shap=True)

-    log = {'catboost_classification_learner': {
-        'features': features,
-        'target': target,
-        'prediction_column': prediction_column,
-        'package': "catboost",
-        'package_version': catboost.__version__,
-        'parameters': assoc(params, "num_estimators", num_estimators),
-        'feature_importance': cbr.feature_importances_,
-        'training_samples': len(df)},
-        'object': cbr}
+    log = {
+        'catboost_classification_learner': {
+            'features': features,
+            'target': target,
+            'prediction_column': prediction_column,
+            'package': "catboost",
+            'package_version': catboost.__version__,
+            'parameters': assoc(params, "num_estimators", num_estimators),
+            'feature_importance': cbr.feature_importances_,
+            'training_samples': len(df)
+        },
+        'object': cbr
+    }

     return p, p(df), log

@@ -501,29 +519,34 @@ def p(new_df: pd.DataFrame) -> pd.DataFrame:

 @curry
 @log_learner_time(learner_name='lgbm_classification_learner')
-def lgbm_classification_learner(df: pd.DataFrame,
-                                features: List[str],
-                                target: str,
-                                learning_rate: float = 0.1,
-                                num_estimators: int = 100,
-                                extra_params: Optional[LogType] = None,
-                                prediction_column: str = "prediction",
-                                weight_column: Optional[str] = None,
-                                encode_extra_cols: bool = True,
-                                valid_sets: Optional[List[pd.DataFrame]] = None,
-                                valid_names: Optional[List[str]] = None,
-                                feval: Optional[Union[
-                                    Callable[[np.ndarray, pd.DataFrame], Tuple[str, float, bool]],
-                                    List[Callable[[np.ndarray, pd.DataFrame], Tuple[str, float, bool]]]]
-                                ] = None,
-                                init_model: Optional[Union[str, Path, 'Booster']] = None,
-                                feature_name: Union[List[str], str] = 'auto',
-                                categorical_feature: Union[List[str], List[int], str] = 'auto',
-                                keep_training_booster: bool = False,
-                                callbacks: Optional[List[Callable]] = None,
-                                dataset_init_score: Optional[Union[
-                                    List, List[List], np.ndarray, pd.Series, pd.DataFrame]
-                                ] = None) -> LearnerReturnType:
+def lgbm_classification_learner(
+    df: pd.DataFrame,
+    features: List[str],
+    target: str,
+    learning_rate: float = 0.1,
+    num_estimators: int = 100,
+    extra_params: Optional[LogType] = None,
+    prediction_column: str = "prediction",
+    weight_column: Optional[str] = None,
+    encode_extra_cols: bool = True,
+    valid_sets: Optional[List[pd.DataFrame]] = None,
+    valid_names: Optional[List[str]] = None,
+    feval: Optional[Union[
+        Union[Callable[[npt.NDArray, Any], Tuple[str, float, bool]],
+              Callable[[npt.NDArray, Any], List[Tuple[str, float, bool]]]],
+        List[Union[Callable[[npt.NDArray, Any],
+                            Tuple[str, float, bool]],
+                   Callable[[npt.NDArray, Any],
+                            List[Tuple[str, float, bool]]]]],
+        None
+    ]] = None,
+    init_model: Optional[Union[str, Path, 'Booster']] = None,
+    feature_name: Union[List[str], Literal['auto']] = 'auto',
+    categorical_feature: Union[List[str], List[int], Literal['auto']] = 'auto',
+    keep_training_booster: bool = False,
+    callbacks: Optional[List[Callable]] = None,
+    dataset_init_score: Optional[Union[List, List[List], npt.NDArray, pd.Series, pd.DataFrame]] = None
+) -> LearnerReturnType:
     """
     Fits an LGBM classifier to the dataset.

@@ -632,20 +655,37 @@ def lgbm_classification_learner(df: pd.DataFrame,

     features = features if not encode_extra_cols else expand_features_encoded(df, features)

-    dtrain = lgbm.Dataset(df[features].values, label=df[target], feature_name=list(map(str, features)), weight=weights,
-                          silent=True, init_score=dataset_init_score)
-
-    bst = lgbm.train(params=params, train_set=dtrain, num_boost_round=num_estimators, valid_sets=valid_sets,
-                     valid_names=valid_names, feval=feval, init_model=init_model, feature_name=feature_name,
-                     categorical_feature=categorical_feature, keep_training_booster=keep_training_booster,
-                     callbacks=callbacks)
+    dtrain = lgbm.Dataset(
+        df[features].values,
+        label=df[target],
+        feature_name=list(map(str, features)),
+        weight=weights,
+        init_score=dataset_init_score
+    )
+
+    bst = lgbm.train(
+        params=params,
+        train_set=dtrain,
+        num_boost_round=num_estimators,
+        valid_sets=valid_sets,
+        valid_names=valid_names,
+        feval=feval,
+        init_model=init_model,
+        feature_name=feature_name,
+        categorical_feature=categorical_feature,
+        keep_training_booster=keep_training_booster,
+        callbacks=callbacks
+    )

     def p(new_df: pd.DataFrame, apply_shap: bool = False) -> pd.DataFrame:
+        predictions = bst.predict(new_df[features].values)
+        if isinstance(predictions, List):
+            predictions = np.ndarray(predictions)
         if is_multiclass_classification:
             col_dict = {prediction_column + "_" + str(key): value
-                        for (key, value) in enumerate(bst.predict(new_df[features].values).T)}
+                        for (key, value) in enumerate(predictions.T)}
         else:
-            col_dict = {prediction_column: bst.predict(new_df[features].values)}
+            col_dict = {prediction_column: predictions}

         if apply_shap:
             import shap
@@ -675,16 +715,18 @@ def p(new_df: pd.DataFrame, apply_shap: bool = False) -> pd.DataFrame:

     p.__doc__ = learner_pred_fn_docstring("lgbm_classification_learner", shap=True)

-    log = {'lgbm_classification_learner': {
-        'features': features,
-        'target': target,
-        'prediction_column': prediction_column,
-        'package': "lightgbm",
-        'package_version': lgbm.__version__,
-        'parameters': assoc(params, "num_estimators", num_estimators),
-        'feature_importance': dict(zip(features, bst.feature_importance().tolist())),
-        'training_samples': len(df)},
-        'object': bst}
+    log: LearnerLogType = {
+        'lgbm_classification_learner': {
+            'features': features,
+            'target': target,
+            'prediction_column': prediction_column,
+            'package': "lightgbm",
+            'package_version': lgbm.__version__,
+            'parameters': assoc(params, "num_estimators", num_estimators),
+            'feature_importance': dict(zip(features, bst.feature_importance().tolist())),
+            'training_samples': len(df)},
+        'object': bst
+    }

     return p, p(df), log

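The widened `feval` annotation above matches LightGBM's custom-metric protocol: each callable receives the raw predictions and the validation `Dataset` and returns a `(name, value, is_higher_better)` tuple, or a list of such tuples; a list of callables is also accepted. A minimal sketch of a conforming metric follows — the `binary_error` name and the 0.5 threshold are illustrative, not part of this commit:

```python
from typing import Tuple

import lightgbm as lgbm
import numpy as np
import numpy.typing as npt


def binary_error(preds: npt.NDArray, eval_data: lgbm.Dataset) -> Tuple[str, float, bool]:
    """Custom metric: fraction of misclassified rows at a 0.5 threshold."""
    labels = eval_data.get_label()
    error = float(np.mean((preds > 0.5).astype(int) != labels))
    # LightGBM expects (name, value, is_higher_better).
    return "binary_error", error, False
```

Such a function would be passed as `feval=binary_error`, or as `feval=[binary_error]` for the list form the annotation also admits.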
10 changes: 8 additions & 2 deletions src/fklearn/training/ensemble.py
@@ -1,5 +1,7 @@
 from typing import Any, Dict, List, TypeVar

+import numpy as np
+import numpy.typing as npt
 import pandas as pd
 from toolz import curry, assoc, compose

@@ -136,10 +138,14 @@ def xgb_octopus_classification_learner(train_set: pd.DataFrame,
     def p(df: pd.DataFrame) -> pd.DataFrame:
         pred_fn = compose(*pred_fns.values())

+        def lookup(df: pd.DataFrame) -> npt.NDArray:
+            idx, cols = pd.factorize(df.pred_bin.values.squeeze())
+            output = df.reindex(cols, axis=1).to_numpy()[np.arange(len(df)), idx]
+            return output
+
         return (pred_fn(df)
                 .assign(pred_bin=prediction_column + "_bin_" + df[train_split_col].astype(str))
-                .assign(prediction=lambda d: d.lookup(d.index.values,
-                                                      d.pred_bin.values.squeeze()))
+                .assign(prediction=lookup)
                 .rename(index=str, columns={"prediction": prediction_column})
                 .drop("pred_bin", axis=1))
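The new `lookup` helper replaces `DataFrame.lookup`, which pandas deprecated in 1.2 and removed in 2.0, using the `factorize`/`reindex` recipe from the pandas deprecation notes. A toy sketch of the equivalence — the column names and values are illustrative, not from this commit:

```python
import numpy as np
import pandas as pd

# One prediction column per bin, plus a per-row label naming the bin to pick.
df = pd.DataFrame({
    "prediction_bin_0": [0.1, 0.2, 0.3],
    "prediction_bin_1": [0.9, 0.8, 0.7],
    "pred_bin": ["prediction_bin_0", "prediction_bin_1", "prediction_bin_0"],
})

# pandas < 2.0 equivalent: df.lookup(df.index, df["pred_bin"])
idx, cols = pd.factorize(df["pred_bin"].to_numpy().squeeze())
picked = df.reindex(cols, axis=1).to_numpy()[np.arange(len(df)), idx]
print(picked)  # [0.1 0.8 0.3]
```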
