From 8051d61223b10002a7b4a4f66373e7cfeb976095 Mon Sep 17 00:00:00 2001 From: Ashish Singal Date: Fri, 30 Dec 2016 13:56:04 -0500 Subject: [PATCH] ERR: qcut uniqueness checking (try 2) closes #7751 Add option to drop non-unique bins in qcut/cut Author: Ashish Singal Author: Ashish Closes #15000 from ashishsingal1/master and squashes the following commits: 698b4ec [Ashish Singal] Update tile.py b6bf401 [Ashish Singal] Update v0.20.0.txt 42bf482 [Ashish Singal] Update tile.py 221c0b3 [Ashish Singal] Update tile.py 2c5bc35 [Ashish] added duplicates='raise' test. other fixes to qcut for duplicates='raise' 3dbc416 [Ashish Singal] Update v0.20.0.txt 2161518 [Ashish Singal] Update tile.py 1ce77d0 [Ashish Singal] Update test_tile.py 3f98abc [Ashish Singal] Update tile.py 0b8efeb [Ashish Singal] Update tile.py a2dd8ce [Ashish] fixing duplicates check bee981c [Ashish] adding 'duplicates' option to qcut --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/tools/tests/test_tile.py | 12 ++++++++++++ pandas/tools/tile.py | 26 ++++++++++++++++++++------ 3 files changed, 33 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 1ff591c86f6fa..f7cd9154a9436 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -105,6 +105,7 @@ Other enhancements of sorting or an incorrect key. 
See :ref:`here <advanced.unsorted>` - ``pd.cut`` and ``pd.qcut`` now support datetime64 and timedelta64 dtypes (:issue:`14714`, :issue:`14798`) +- ``pd.qcut`` has gained the ``duplicates='raise'|'drop'`` option to control whether to raise on duplicated edges (:issue:`7751`) - ``Series`` provides a ``to_excel`` method to output Excel files (:issue:`8825`) - The ``usecols`` argument in ``pd.read_csv`` now accepts a callable function as a value (:issue:`14154`) - ``pd.DataFrame.plot`` now prints a title above each subplot if ``suplots=True`` and ``title`` is a list of strings (:issue:`14753`) diff --git a/pandas/tools/tests/test_tile.py b/pandas/tools/tests/test_tile.py index c9a96d80f35ba..8b180957801f9 100644 --- a/pandas/tools/tests/test_tile.py +++ b/pandas/tools/tests/test_tile.py @@ -272,6 +272,18 @@ def test_series_retbins(self): np.array([0, 0, 1, 1], dtype=np.int8)) tm.assert_numpy_array_equal(bins, np.array([0, 1.5, 3])) + def test_qcut_duplicates_drop(self): + # GH 7751 + values = [0, 0, 0, 0, 1, 2, 3] + cats = qcut(values, 3, duplicates='drop') + ex_levels = ['[0, 1]', '(1, 3]'] + self.assertTrue((cats.categories == ex_levels).all()) + + def test_qcut_duplicates_raise(self): + # GH 7751 + values = [0, 0, 0, 0, 1, 2, 3] + self.assertRaises(ValueError, qcut, values, 3, duplicates='raise') + def test_single_bin(self): # issue 14652 expected = Series([0, 0]) diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index a372e113f1d7e..2875d9c14dc47 100644 --- a/pandas/tools/tile.py +++ b/pandas/tools/tile.py @@ -129,7 +129,7 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, series_index, name) -def qcut(x, q, labels=None, retbins=False, precision=3): +def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'): """ Quantile-based discretization function. Discretize variable into equal-sized buckets based on rank or based on sample quantiles. 
For example @@ -151,6 +151,10 @@ def qcut(x, q, labels=None, retbins=False, precision=3): as a scalar. precision : int The precision at which to store and display the bins labels + duplicates : {default 'raise', 'drop'}, optional + If bin edges are not unique, raise ValueError or drop non-uniques. + + .. versionadded:: 0.20.0 Returns ------- @@ -187,7 +191,7 @@ def qcut(x, q, labels=None, retbins=False, precision=3): bins = algos.quantile(x, quantiles) fac, bins = _bins_to_cuts(x, bins, labels=labels, precision=precision, include_lowest=True, - dtype=dtype) + dtype=dtype, duplicates=duplicates) return _postprocess_for_cut(fac, bins, retbins, x_is_series, series_index, name) @@ -195,14 +199,24 @@ def qcut(x, q, labels=None, retbins=False, precision=3): def _bins_to_cuts(x, bins, right=True, labels=None, precision=3, include_lowest=False, - dtype=None): + dtype=None, duplicates='raise'): + + if duplicates not in ['raise', 'drop']: + raise ValueError("invalid value for 'duplicates' parameter, " + "valid options are: raise, drop") + + unique_bins = algos.unique(bins) + if len(unique_bins) < len(bins): + if duplicates == 'raise': + raise ValueError("Bin edges must be unique: {}. You " + "can drop duplicate edges by setting " + "'duplicates' param".format(repr(bins))) + else: + bins = unique_bins side = 'left' if right else 'right' ids = bins.searchsorted(x, side=side) - if len(algos.unique(bins)) < len(bins): - raise ValueError('Bin edges must be unique: %s' % repr(bins)) - if include_lowest: ids[x == bins[0]] = 1