ERR: qcut uniqueness checking (try 2)
closes #7751
Add option to drop non-unique bins in qcut/cut

Author: Ashish Singal <ashish.singal1@gmail.com>
Author: Ashish <ashish.singal1@gmail.com>

Closes #15000 from ashishsingal1/master and squashes the following commits:

698b4ec [Ashish Singal] Update tile.py
b6bf401 [Ashish Singal] Update v0.20.0.txt
42bf482 [Ashish Singal] Update tile.py
221c0b3 [Ashish Singal] Update tile.py
2c5bc35 [Ashish] added duplicates='raise' test. other fixes to qcut for duplicates='raise'
3dbc416 [Ashish Singal] Update v0.20.0.txt
2161518 [Ashish Singal] Update tile.py
1ce77d0 [Ashish Singal] Update test_tile.py
3f98abc [Ashish Singal] Update tile.py
0b8efeb [Ashish Singal] Update tile.py
a2dd8ce [Ashish] fixing duplicates check
bee981c [Ashish] adding 'duplicates' option to qcut
ashishsingal1 authored and jreback committed Dec 30, 2016
1 parent 17d7ddb commit 8051d61
Showing 3 changed files with 33 additions and 6 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.20.0.txt
@@ -105,6 +105,7 @@ Other enhancements
of sorting or an incorrect key. See :ref:`here <advanced.unsorted>`

- ``pd.cut`` and ``pd.qcut`` now support datetime64 and timedelta64 dtypes (:issue:`14714`, :issue:`14798`)
- ``pd.qcut`` has gained the ``duplicates='raise'|'drop'`` option to control whether to raise on duplicated edges (:issue:`7751`)
- ``Series`` provides a ``to_excel`` method to output Excel files (:issue:`8825`)
- The ``usecols`` argument in ``pd.read_csv`` now accepts a callable function as a value (:issue:`14154`)
- ``pd.DataFrame.plot`` now prints a title above each subplot if ``subplots=True`` and ``title`` is a list of strings (:issue:`14753`)
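For context, a brief usage sketch of the ``duplicates`` option described in the whatsnew entry above, using the same input exercised by the new tests below. This is illustrative only, not part of the commit, and the printed output is deliberately not reproduced here:

    import pandas as pd

    values = [0, 0, 0, 0, 1, 2, 3]

    # Default behaviour (duplicates='raise'): the repeated zeros collapse two
    # quantile edges onto the same value, so qcut raises a ValueError.
    try:
        pd.qcut(values, 3)
    except ValueError as exc:
        print(exc)

    # duplicates='drop' keeps only the unique edges, yielding fewer bins.
    cats = pd.qcut(values, 3, duplicates='drop')
    print(cats.categories)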
12 changes: 12 additions & 0 deletions pandas/tools/tests/test_tile.py
@@ -272,6 +272,18 @@ def test_series_retbins(self):
                                    np.array([0, 0, 1, 1], dtype=np.int8))
        tm.assert_numpy_array_equal(bins, np.array([0, 1.5, 3]))

    def test_qcut_duplicates_drop(self):
        # GH 7751
        values = [0, 0, 0, 0, 1, 2, 3]
        cats = qcut(values, 3, duplicates='drop')
        ex_levels = ['[0, 1]', '(1, 3]']
        self.assertTrue((cats.categories == ex_levels).all())

    def test_qcut_duplicates_raise(self):
        # GH 7751
        values = [0, 0, 0, 0, 1, 2, 3]
        self.assertRaises(ValueError, qcut, values, 3, duplicates='raise')

    def test_single_bin(self):
        # issue 14652
        expected = Series([0, 0])
26 changes: 20 additions & 6 deletions pandas/tools/tile.py
@@ -129,7 +129,7 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
                                series_index, name)


def qcut(x, q, labels=None, retbins=False, precision=3):
def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'):
    """
    Quantile-based discretization function. Discretize variable into
    equal-sized buckets based on rank or based on sample quantiles. For example
@@ -151,6 +151,10 @@ def qcut(x, q, labels=None, retbins=False, precision=3):
        as a scalar.
    precision : int
        The precision at which to store and display the bins labels
    duplicates : {default 'raise', 'drop'}, optional
        If bin edges are not unique, raise ValueError or drop non-uniques.

        .. versionadded:: 0.20.0

    Returns
    -------
@@ -187,22 +191,32 @@ def qcut(x, q, labels=None, retbins=False, precision=3):
    bins = algos.quantile(x, quantiles)
    fac, bins = _bins_to_cuts(x, bins, labels=labels,
                              precision=precision, include_lowest=True,
                              dtype=dtype)
                              dtype=dtype, duplicates=duplicates)

    return _postprocess_for_cut(fac, bins, retbins, x_is_series,
                                series_index, name)


def _bins_to_cuts(x, bins, right=True, labels=None,
                  precision=3, include_lowest=False,
                  dtype=None):
                  dtype=None, duplicates='raise'):

    if duplicates not in ['raise', 'drop']:
        raise ValueError("invalid value for 'duplicates' parameter, "
                         "valid options are: raise, drop")

    unique_bins = algos.unique(bins)
    if len(unique_bins) < len(bins):
        if duplicates == 'raise':
            raise ValueError("Bin edges must be unique: {}. You "
                             "can drop duplicate edges by setting "
                             "'duplicates' param".format(repr(bins)))
        else:
            bins = unique_bins

    side = 'left' if right else 'right'
    ids = bins.searchsorted(x, side=side)

    if len(algos.unique(bins)) < len(bins):
        raise ValueError('Bin edges must be unique: %s' % repr(bins))

    if include_lowest:
        ids[x == bins[0]] = 1

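The heart of the change is the edge-deduplication step added to ``_bins_to_cuts`` above. Below is a minimal standalone sketch of that logic, using ``numpy.unique`` as an illustrative stand-in for pandas' internal ``algos.unique``; the helper name ``_dedupe_edges`` is hypothetical, not from the commit:

    import numpy as np

    def _dedupe_edges(bins, duplicates='raise'):
        # Validate the option first, mirroring the new check in _bins_to_cuts.
        if duplicates not in ('raise', 'drop'):
            raise ValueError("invalid value for 'duplicates' parameter, "
                             "valid options are: raise, drop")
        unique_bins = np.unique(bins)  # sorted, unique edges
        if len(unique_bins) < len(bins):
            if duplicates == 'raise':
                raise ValueError("Bin edges must be unique: %r" % bins)
            bins = unique_bins
        return bins

    # Tertile edges of [0, 0, 0, 0, 1, 2, 3] collapse onto 0:
    edges = np.array([0.0, 0.0, 1.0, 3.0])
    print(_dedupe_edges(edges, duplicates='drop'))   # [0. 1. 3.]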
