ENH: Support imblearn (#77)
sinhrks committed Sep 5, 2016
1 parent 92a0900 commit 86583d0
Showing 18 changed files with 385 additions and 19 deletions.
8 changes: 5 additions & 3 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -2,10 +2,10 @@ language: python

env:
- PYTHON=2.7 PANDAS=0.17.0 SKLEARN=0.16.1
- PYTHON=2.7 PANDAS=0.18.0 SKLEARN=0.17.1
- PYTHON=2.7 PANDAS=0.18.0 SKLEARN=0.17.1 IMBALANCE=true
- PYTHON=3.4 PANDAS=0.16.0 SKLEARN=0.16.1
- PYTHON=3.4 PANDAS=0.17.0 SKLEARN=0.17.1
- PYTHON=3.5 PANDAS=0.18.0 SKLEARN=0.17.1
- PYTHON=3.5 PANDAS=0.18.0 SKLEARN=0.17.1 IMBALANCE=true COVERAGE=true

addons:
apt:
@@ -27,4 +27,6 @@ script:
- flake8 --ignore E501 pandas_ml

after_success:
- coveralls
- if [ "$COVERAGE" ]; then
coveralls;
fi
109 changes: 109 additions & 0 deletions doc/source/imbalance.rst
@@ -0,0 +1,109 @@

Handling imbalanced data
========================

This section describes how to use
`imbalanced-learn <http://contrib.scikit-learn.org/imbalanced-learn/index.html>`_
functionalities via ``pandas-ml`` to handle imbalanced data.

Sampling
--------

Assume we have a ``ModelFrame`` with imbalanced target values: 80 observations
labeled ``0`` and 20 observations labeled ``1``.

.. code-block:: python

   >>> import numpy as np
   >>> import pandas_ml as pdml
   >>> df = pdml.ModelFrame(np.random.randn(100, 5),
   ...                      target=np.array([0, 1]).repeat([80, 20]),
   ...                      columns=list('ABCDE'))
   >>> df
       .target         A         B         C         D         E
   0         0  1.467859  1.637449  0.175770  0.189108  0.775139
   1         0 -1.706293 -0.598930 -0.343427  0.355235 -1.348378
   2         0  0.030542  0.393779 -1.891991  0.041062  0.055530
   3         0  0.320321 -1.062963 -0.416418 -0.629776  1.126027
   ..      ...       ...       ...       ...       ...       ...
   96        1 -1.199039  0.055702  0.675555 -0.416601 -1.676259
   97        1 -1.264182 -0.167390 -0.939794 -0.638733 -0.806794
   98        1 -0.616754  1.667483 -1.858449 -0.259630  1.236777
   99        1 -1.374068 -0.400435 -1.825555  0.824052 -0.335694

   [100 rows x 6 columns]

   >>> df.target.value_counts()
   0    80
   1    20
   Name: .target, dtype: int64

You can access the ``imbalanced-learn`` namespace via the ``.imbalance`` accessor.
Passing an instantiated under-sampling class to ``ModelFrame.fit_sample`` returns
an under-sampled ``ModelFrame`` (note that ``.index`` is reset).

.. code-block:: python

   >>> sampler = df.imbalance.under_sampling.ClusterCentroids()
   >>> sampler
   ClusterCentroids(n_jobs=-1, random_state=None, ratio='auto')

   >>> sampled = df.fit_sample(sampler)
   >>> sampled
       .target         A         B         C         D         E
   0         1  0.232841 -1.364282  1.436854  0.563796 -0.372866
   1         1 -0.159551  0.473617 -2.024209  0.760444 -0.820403
   2         1  1.495356 -2.144495  0.076485  1.219948  0.382995
   3         1 -0.736887  1.399623  0.557098  0.621909 -0.507285
   ..      ...       ...       ...       ...       ...       ...
   36        0  0.429978 -1.421307  0.771368  1.704277  0.645590
   37        0  1.408448  0.132760 -1.082301 -1.195149  0.155057
   38        0  0.362793 -0.682171  1.026482  0.663343 -2.371229
   39        0 -0.796293 -0.196428 -0.747574  2.228031 -0.468669

   [40 rows x 6 columns]

   >>> sampled.target.value_counts()
   1    20
   0    20
   Name: .target, dtype: int64

In the same manner, you can perform over-sampling.

.. code-block:: python

   >>> sampler = df.imbalance.over_sampling.SMOTE()
   >>> sampler
   SMOTE(k=5, kind='regular', m=10, n_jobs=-1, out_step=0.5, random_state=None,
      ratio='auto')

   >>> sampled = df.fit_sample(sampler)
   >>> sampled
        .target         A         B         C         D         E
   0          0  1.467859  1.637449  0.175770  0.189108  0.775139
   1          0 -1.706293 -0.598930 -0.343427  0.355235 -1.348378
   2          0  0.030542  0.393779 -1.891991  0.041062  0.055530
   3          0  0.320321 -1.062963 -0.416418 -0.629776  1.126027
   ..       ...       ...       ...       ...       ...       ...
   156        1 -1.279399  0.218171 -0.487836 -0.573564  0.582580
   157        1 -0.736964  0.239095 -0.422025 -0.841780  0.221591
   158        1 -0.273911 -0.305608 -0.886088  0.062414 -0.001241
   159        1  0.073145 -0.167884 -0.781611 -0.016734 -0.045330

   [160 rows x 6 columns]

   >>> sampled.target.value_counts()
   1    80
   0    80
   Name: .target, dtype: int64

The following table shows each ``imbalanced-learn`` module and the corresponding ``ModelFrame`` accessor.

================================ ==========================================
``imbalanced-learn`` ``ModelFrame`` accessor
================================ ==========================================
``imblearn.under_sampling`` ``ModelFrame.imbalance.under_sampling``
``imblearn.over_sampling`` ``ModelFrame.imbalance.over_sampling``
``imblearn.combine`` ``ModelFrame.imbalance.combine``
``imblearn.ensemble`` ``ModelFrame.imbalance.ensemble``
================================ ==========================================
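For readers without ``imbalanced-learn`` installed, the effect of the simplest
under-sampling strategy can be sketched in plain NumPy. ``random_under_sample``
below is a hypothetical helper written for illustration; it is not part of
``pandas-ml`` or ``imblearn``:

```python
import numpy as np

def random_under_sample(X, y, random_state=None):
    # Hypothetical helper: reduce every class to the minority-class count
    # by randomly selecting rows (without replacement) from each class.
    rng = np.random.RandomState(random_state)
    classes, counts = np.unique(y, return_counts=True)
    n_min = counts.min()
    keep = np.concatenate([
        rng.choice(np.where(y == c)[0], size=n_min, replace=False)
        for c in classes
    ])
    keep.sort()  # preserve the original row order among kept samples
    return X[keep], y[keep]

X = np.random.randn(100, 5)
y = np.repeat([0, 1], [80, 20])  # 80 majority, 20 minority, as in the docs
X_s, y_s = random_under_sample(X, y, random_state=0)
print(X_s.shape, np.bincount(y_s))  # (40, 5) [20 20]
```

Real samplers such as ``ClusterCentroids`` or ``SMOTE`` are more sophisticated,
but they expose the same array-in/array-out contract that ``fit_sample`` wraps.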
1 change: 1 addition & 0 deletions doc/source/index.rst
@@ -15,6 +15,7 @@ Contents:
whatsnew
modelframe
sklearn
imbalance
xgboost
patsy
conf_mat
3 changes: 2 additions & 1 deletion doc/source/whatsnew.rst
@@ -8,7 +8,8 @@ v0.4.0
Enhancement
^^^^^^^^^^^

- Support scikit-learn v0.17.x
- Support scikit-learn v0.17.x.
- Support imbalanced-learn via ``.imbalance`` accessor.

Bug Fix
^^^^^^^
10 changes: 7 additions & 3 deletions pandas_ml/compat.py
@@ -2,7 +2,11 @@

from distutils.version import LooseVersion

import sklearn
_SKLEARN_ge_017 = sklearn.__version__ >= LooseVersion('0.17.0')

def _SKLEARN_ge_017():
    import sklearn
    return sklearn.__version__ >= LooseVersion('0.17.0')

try:
import imblearn # noqa
_IMBLEARN_INSTALLED = True
except ImportError:
_IMBLEARN_INSTALLED = False
48 changes: 47 additions & 1 deletion pandas_ml/core/frame.py
@@ -11,6 +11,7 @@
from pandas_ml.core.generic import ModelPredictor, _shared_docs
from pandas_ml.core.series import ModelSeries
from pandas_ml.core.accessor import _AccessorMethods
import pandas_ml.imbaccessors as imbaccessors
import pandas_ml.skaccessors as skaccessors
import pandas_ml.smaccessors as smaccessors
import pandas_ml.snsaccessors as snsaccessors
@@ -397,14 +398,48 @@ def _wrap_predicted(self, predicted, estimator):
"""
Wrapper for predict methods
"""

if util._is_1d_varray(predicted):
predicted = self._constructor_sliced(predicted, index=self.index)
else:
predicted = self._constructor(predicted, index=self.index)
self._predicted = predicted
return self._predicted

@Appender(_shared_docs['estimator_methods'] %
dict(funcname='fit_sample', returned='returned : sampling result'))
def fit_sample(self, estimator, *args, **kwargs):
# for imblearn
sampled_X, sampled_y = self._call(estimator, 'fit_sample', *args, **kwargs)
return self._wrap_sampled(sampled_X, sampled_y)

@Appender(_shared_docs['estimator_methods'] %
dict(funcname='sample', returned='returned : sampling result'))
def sample(self, estimator, *args, **kwargs):
# for imblearn
sampled_X, sampled_y = self._call(estimator, 'sample', *args, **kwargs)
return self._wrap_sampled(sampled_X, sampled_y)

def _wrap_sampled(self, sampled_X, sampled_y):
# convert sampled results back to a ModelFrame; the index is reset

def _wrap(x, y):
y = self._constructor_sliced(y, name=self.target.name)
result = self._constructor(data=x, target=y,
columns=self.data.columns)
return result

if sampled_X.ndim == 3 or sampled_X.ndim == 1:
# ensemble
# ndim=3 for EasyEnsemble
# ndim=1 for BalanceCascade
results = []
for x, y in zip(sampled_X, sampled_y):
result = _wrap(x, y)
results.append(result)
else:
results = _wrap(sampled_X, sampled_y)
return results
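The ``ndim`` check in ``_wrap_sampled`` distinguishes ensemble samplers, which
return one resampled set per subset, from plain samplers that return a single
pair. The same dispatch can be sketched with zero arrays standing in for
sampler output (``wrap_sampled`` here is illustrative only, not the actual
pandas-ml implementation):

```python
import numpy as np

def wrap_sampled(sampled_X, sampled_y):
    # Ensemble samplers (e.g. EasyEnsemble) return a stack of subsets:
    # X has ndim=3 (or an object array with ndim=1), so each (x, y)
    # pair is wrapped separately; plain samplers return a single pair.
    if sampled_X.ndim in (3, 1):
        return [(x, y) for x, y in zip(sampled_X, sampled_y)]
    return (sampled_X, sampled_y)

plain = wrap_sampled(np.zeros((40, 5)), np.zeros(40))             # one pair
ensemble = wrap_sampled(np.zeros((3, 40, 5)), np.zeros((3, 40)))  # list of 3 pairs
```

In the real method each ``(x, y)`` pair becomes a new ``ModelFrame`` via
``self._constructor``, so an ensemble sampler yields a list of frames.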

@Appender(_shared_docs['estimator_methods'] %
dict(funcname='transform', returned='returned : transformed result'))
def transform(self, estimator, *args, **kwargs):
@@ -490,6 +525,8 @@ def score(self, estimator, *args, **kwargs):
score = self._call(estimator, 'score', *args, **kwargs)
return score

# accessors

@property
@Appender(_shared_docs['skaccessor_nolink'] %
dict(module='calibration'))
@@ -626,6 +663,15 @@ def grid_search(self):
def _grid_search(self):
return skaccessors.GridSearchMethods(self)

@property
def imbalance(self):
"""Property to access ``imblearn``"""
return self._imbalance

@cache_readonly
def _imbalance(self):
return imbaccessors.ImbalanceMethods(self)

@property
@Appender(_shared_docs['skaccessor'] % dict(module='isotonic'))
def isotonic(self):
3 changes: 3 additions & 0 deletions pandas_ml/imbaccessors/__init__.py
@@ -0,0 +1,3 @@
#!/usr/bin/env python

from pandas_ml.imbaccessors.base import ImbalanceMethods # noqa
50 changes: 50 additions & 0 deletions pandas_ml/imbaccessors/base.py
@@ -0,0 +1,50 @@
#!/usr/bin/env python


from pandas.util.decorators import cache_readonly

from pandas_ml.core.accessor import _AccessorMethods


class ImbalanceMethods(_AccessorMethods):
"""
Accessor to ``imblearn``.
"""

_module_name = 'imblearn'

@property
def under_sampling(self):
"""Property to access ``imblearn.under_sampling``"""
return self._under_sampling

@cache_readonly
def _under_sampling(self):
return _AccessorMethods(self._df, module_name='imblearn.under_sampling')

@property
def over_sampling(self):
"""Property to access ``imblearn.over_sampling``"""
return self._over_sampling

@cache_readonly
def _over_sampling(self):
return _AccessorMethods(self._df, module_name='imblearn.over_sampling')

@property
def combine(self):
"""Property to access ``imblearn.combine``"""
return self._combine

@cache_readonly
def _combine(self):
return _AccessorMethods(self._df, module_name='imblearn.combine')

@property
def ensemble(self):
"""Property to access ``imblearn.ensemble``"""
return self._ensemble

@cache_readonly
def _ensemble(self):
return _AccessorMethods(self._df, module_name='imblearn.ensemble')
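Each sub-accessor above is built lazily via ``cache_readonly``, pandas'
memoizing descriptor: the method body runs once per instance, and the result
is reused on every later access. The idea can be sketched with a hand-rolled
non-data descriptor (``cached_readonly`` here is illustrative, not pandas'
actual implementation):

```python
class cached_readonly(object):
    """Sketch of a ``cache_readonly``-style non-data descriptor: the wrapped
    method runs once per instance, and the result is stored in the instance
    ``__dict__`` so later attribute lookups bypass the descriptor."""

    def __init__(self, func):
        self.func = func
        self.name = func.__name__

    def __get__(self, obj, objtype=None):
        if obj is None:
            return self
        value = self.func(obj)
        obj.__dict__[self.name] = value  # shadows the descriptor from now on
        return value

class Accessors(object):
    calls = 0

    @cached_readonly
    def ensemble(self):
        Accessors.calls += 1  # count how many times the body actually runs
        return object()

a = Accessors()
first = a.ensemble
second = a.ensemble
print(first is second, Accessors.calls)  # True 1
```

Because the class defines only ``__get__`` (no ``__set__``), the cached value
written into the instance ``__dict__`` takes precedence on subsequent lookups.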
1 change: 1 addition & 0 deletions pandas_ml/imbaccessors/test/__init__.py
@@ -0,0 +1 @@
#!/usr/bin/env python
