diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 5213120b33f06..6e9248836fe36 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -421,6 +421,7 @@ Other Enhancements - :func:`pandas.DataFrame.to_sql` has gained the ``method`` argument to control SQL insertion clause. See the :ref:`insertion method <io.sql.method>` section in the documentation. (:issue:`8953`) - :meth:`DataFrame.corrwith` now supports Spearman's rank correlation, Kendall's tau as well as callable correlation methods. (:issue:`21925`) - :meth:`DataFrame.to_json`, :meth:`DataFrame.to_csv`, :meth:`DataFrame.to_pickle`, and :meth:`DataFrame.to_XXX` etc. now support tilde(~) in path argument. (:issue:`23473`) +- :func:`qcut` now accepts ``bounded`` as a keyword argument, allowing for unbounded quantiles such that the lower/upper bounds are -inf/inf (:issue:`17282`) .. _whatsnew_0240.api_breaking: diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index c107ed51226b0..43e8fc55be63d 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -10,7 +10,7 @@ from pandas.core.dtypes.common import ( _NS_DTYPE, ensure_int64, is_categorical_dtype, is_datetime64_dtype, is_datetime64tz_dtype, is_datetime_or_timedelta_dtype, is_integer, - is_scalar, is_timedelta64_dtype) + is_integer_dtype, is_scalar, is_timedelta64_dtype) from pandas.core.dtypes.missing import isna from pandas import ( @@ -244,7 +244,8 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, series_index, name, dtype) -def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'): +def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise', + bounded=True): """ Quantile-based discretization function. Discretize variable into equal-sized buckets based on rank or based on sample quantiles. For example @@ -271,6 +272,12 @@ def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'): ..
versionadded:: 0.20.0 + bounded : bool, default True + Use the min/max of the distribution as the lower/upper bounds if True, + otherwise use -inf/inf. Ignored if dtype is datetime/timedelta. + + .. versionadded:: 0.24.0 + Returns ------- out : Categorical or Series or array of integers if labels is False @@ -308,6 +315,11 @@ def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'): else: quantiles = q bins = algos.quantile(x, quantiles) + if not bounded and not dtype: + if is_integer_dtype(bins): + bins = bins.astype(np.float64) + bins[0] = -np.inf + bins[-1] = np.inf fac, bins = _bins_to_cuts(x, bins, labels=labels, precision=precision, include_lowest=True, dtype=dtype, duplicates=duplicates) diff --git a/pandas/tests/reshape/test_qcut.py b/pandas/tests/reshape/test_qcut.py index 997df7fd7aa4c..4bcc1e4129040 100644 --- a/pandas/tests/reshape/test_qcut.py +++ b/pandas/tests/reshape/test_qcut.py @@ -197,3 +197,30 @@ def test_date_like_qcut_bins(arg, expected_bins): ser = Series(arg) result, result_bins = qcut(ser, 2, retbins=True) tm.assert_index_equal(result_bins, expected_bins) + + +def test_qcut_unbounded(): + # GH 17282 + labels = qcut(range(5), 4, bounded=False) + left = labels.categories.left.values + right = labels.categories.right.values + expected = np.array([-np.inf, 1.0, 2.0, 3.0, np.inf]) + tm.assert_numpy_array_equal(left, expected[:-1]) + tm.assert_numpy_array_equal(right, expected[1:]) + + +@pytest.mark.parametrize('bins', [3, np.linspace(0, 1, 4)]) +def test_datetimetz_qcut_unbounded(bins): + # GH 19872 + tz = 'US/Eastern' + s = Series(date_range('20130101', periods=3, tz=tz)) + result = qcut(s, bins, bounded=False) + expected = Series(IntervalIndex([ + Interval(Timestamp("2012-12-31 23:59:59.999999999", tz=tz), + Timestamp("2013-01-01 16:00:00", tz=tz)), + Interval(Timestamp("2013-01-01 16:00:00", tz=tz), + Timestamp("2013-01-02 08:00:00", tz=tz)), + Interval(Timestamp("2013-01-02 08:00:00", tz=tz), + Timestamp("2013-01-03 
00:00:00", tz=tz))])).astype( + CDT(ordered=True)) + tm.assert_series_equal(result, expected)