-
-
Notifications
You must be signed in to change notification settings - Fork 19.1k
qcut: Option to return -inf/inf as lower/upper bound #22185
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
c8e2d63
12279de
66c1172
1d87989
e5316fd
b4e28c4
a17cc9b
c2f194c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,7 +10,7 @@ | |
from pandas.core.dtypes.common import ( | ||
_NS_DTYPE, ensure_int64, is_categorical_dtype, is_datetime64_dtype, | ||
is_datetime64tz_dtype, is_datetime_or_timedelta_dtype, is_integer, | ||
is_scalar, is_timedelta64_dtype) | ||
is_integer_dtype, is_scalar, is_timedelta64_dtype) | ||
from pandas.core.dtypes.missing import isna | ||
|
||
from pandas import ( | ||
|
@@ -244,7 +244,8 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, | |
series_index, name, dtype) | ||
|
||
|
||
def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'): | ||
def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise', | ||
bounded=True): | ||
""" | ||
Quantile-based discretization function. Discretize variable into | ||
equal-sized buckets based on rank or based on sample quantiles. For example | ||
|
@@ -271,6 +272,12 @@ def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'): | |
|
||
.. versionadded:: 0.20.0 | ||
|
||
bounded : bool, default True | ||
Use the min/max of the distribution as the lower/upper bounds if True, | ||
otherwise use -inf/inf. Ignored if dtype is datetime/timedelta. | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
.. versionadded:: 0.24.0 | ||
|
||
Returns | ||
------- | ||
out : Categorical or Series or array of integers if labels is False | ||
|
@@ -308,6 +315,11 @@ def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'): | |
else: | ||
quantiles = q | ||
bins = algos.quantile(x, quantiles) | ||
if not bounded and not dtype: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. what about |
||
if is_integer_dtype(bins): | ||
bins = bins.astype(np.float64) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We probably don't want to do this. It can cause precision issues for large integers, and I suspect it may be surprising for users. Could you instead use the limits of the integer dtype:
info = np.iinfo(bins.dtype)
bins[0] = info.min
bins[-1] = info.max
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thanks for the comments. Not sure either approach is guaranteed to avoid unexpected results for users. I think either would work for my use cases, but any approach will be a compromise since there is no way to represent infinity for int types. Looking into your other comment about dtype, the same issues arise for datetime-like types. I'm leaning towards closing this PR since I think the unbounded concept can only be naturally represented for float types and isn't worth using hacks for all other types. |
||
bins[0] = -np.inf | ||
bins[-1] = np.inf | ||
fac, bins = _bins_to_cuts(x, bins, labels=labels, | ||
precision=precision, include_lowest=True, | ||
dtype=dtype, duplicates=duplicates) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -197,3 +197,30 @@ def test_date_like_qcut_bins(arg, expected_bins): | |
ser = Series(arg) | ||
result, result_bins = qcut(ser, 2, retbins=True) | ||
tm.assert_index_equal(result_bins, expected_bins) | ||
|
||
|
||
def test_qcut_unbounded(): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. can you parametrize over bounded |
||
# GH 17282 | ||
labels = qcut(range(5), 4, bounded=False) | ||
left = labels.categories.left.values | ||
right = labels.categories.right.values | ||
expected = np.array([-np.inf, 1.0, 2.0, 3.0, np.inf]) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. rather than use numpy arrays, can you construct the expected Index and use |
||
tm.assert_numpy_array_equal(left, expected[:-1]) | ||
tm.assert_numpy_array_equal(right, expected[1:]) | ||
|
||
|
||
@pytest.mark.parametrize('bins', [3, np.linspace(0, 1, 4)]) | ||
def test_datetimetz_qcut_unbounded(bins): | ||
# GH 19872 | ||
tz = 'US/Eastern' | ||
s = Series(date_range('20130101', periods=3, tz=tz)) | ||
result = qcut(s, bins, bounded=False) | ||
expected = Series(IntervalIndex([ | ||
Interval(Timestamp("2012-12-31 23:59:59.999999999", tz=tz), | ||
Timestamp("2013-01-01 16:00:00", tz=tz)), | ||
Interval(Timestamp("2013-01-01 16:00:00", tz=tz), | ||
Timestamp("2013-01-02 08:00:00", tz=tz)), | ||
Interval(Timestamp("2013-01-02 08:00:00", tz=tz), | ||
Timestamp("2013-01-03 00:00:00", tz=tz))])).astype( | ||
CDT(ordered=True)) | ||
tm.assert_series_equal(result, expected) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Move to 0.25 at this point