Permalink
Browse files

ENH: implement qcut for quantile cuts, fix 32-bit build close #1378

  • Loading branch information...
1 parent 6e46099 commit 3e904fddda17151cdbe52e7d6cee241da7949154 @wesm wesm committed Jun 5, 2012
Showing with 151 additions and 38 deletions.
  1. +1 −0 RELEASE.rst
  2. +72 −0 pandas/core/algorithms.py
  3. +2 −2 pandas/core/series.py
  4. +14 −4 pandas/src/datetime.pyx
  5. +16 −1 pandas/tools/tests/test_tile.py
  6. +45 −30 pandas/tools/tile.py
  7. +1 −1 scripts/count_code.sh
View
@@ -80,6 +80,7 @@ pandas 0.8.0
- Add Panel.transpose method for rearranging axes (#695)
- Add new ``cut`` function (patterned after R) for discretizing data into
equal range-length bins or arbitrary breaks of your choosing (#415)
+ - Add new ``qcut`` for cutting with quantiles (#1378)
- Added Andrews curves plot type (#1325)
- Add support for tox and Travis CI (#1382)
View
@@ -7,6 +7,7 @@
import pandas.core.common as com
import pandas.lib as lib
+import pandas._algos as _algos
def match(to_match, values, na_sentinel=-1):
"""
@@ -179,6 +180,77 @@ def rank(values, axis=0, method='average', na_option='keep',
ascending=ascending)
return ranks
+def quantile(x, q, interpolation_method='fraction'):
+ """
+ Compute sample quantile or quantiles of the input array. For example, q=0.5
+ computes the median.
+
+ The `interpolation_method` parameter supports three values, namely
+ `fraction` (default), `lower` and `higher`. Interpolation is done only
+ if the desired quantile lies between two data points `i` and `j`. For
+ `fraction`, the result is an interpolated value between `i` and `j`;
+ for `lower`, the result is `i`; for `higher`, the result is `j`.
+
+ Parameters
+ ----------
+ x : ndarray
+ Values from which to extract score.
+ q : scalar or array
+ Quantile(s) at which to extract score, given as fractions between 0 and 1
+ (e.g. 0.5 for the median).
+ interpolation_method : {'fraction', 'lower', 'higher'}, optional
+ This optional parameter specifies the interpolation method to use,
+ when the desired quantile lies between two data points `i` and `j`:
+
+ - fraction: `i + (j - i)*fraction`, where `fraction` is the
+ fractional part of the index surrounded by `i` and `j`.
+ - lower: `i`.
+ - higher: `j`.
+
+ Returns
+ -------
+ score : float or array
+ Score at the given quantile (computed for each element when `q` is an array).
+
+ Examples
+ --------
+ >>> x = np.arange(100)
+ >>> quantile(x, 0.5)
+ 49.5
+
+ """
+ values = np.sort(x)
+
+ def _get_score(at):
+ idx = at * (len(values) - 1)
+ if (idx % 1 == 0):
+ score = values[idx]
+ else:
+ if interpolation_method == 'fraction':
+ score = _interpolate(values[int(idx)], values[int(idx) + 1],
+ idx % 1)
+ elif interpolation_method == 'lower':
+ score = values[np.floor(idx)]
+ elif interpolation_method == 'higher':
+ score = values[np.ceil(idx)]
+ else:
+ raise ValueError("interpolation_method can only be 'fraction', " \
+ "'lower' or 'higher'")
+
+ return score
+
+ if np.isscalar(q):
+ return _get_score(q)
+ else:
+ q = np.asarray(q, np.float64)
+ return _algos.arrmap_float64(q, _get_score)
+
+def _interpolate(a, b, fraction):
+ """Returns the point at the given fraction between a and b, where
+ 'fraction' must be between 0 and 1.
+ """
+ return a + (b - a)*fraction
+
def _get_data_algo(values, func_map):
if com.is_float_dtype(values):
View
@@ -1083,8 +1083,8 @@ def value_counts(self):
-------
counts : Series
"""
- import pandas.core.algorithms as algos
- return algos.value_counts(self.values, sort=True, ascending=False)
+ from pandas.core.algorithms import value_counts
+ return value_counts(self.values, sort=True, ascending=False)
def unique(self):
"""
View
@@ -887,8 +887,8 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz):
result_b.fill(NPY_NAT)
# left side
- idx_shifted = np.maximum(0, trans.searchsorted(vals - DAY_NS,
- side='right') - 1)
+ idx_shifted = _ensure_int64(
+ np.maximum(0, trans.searchsorted(vals - DAY_NS, side='right') - 1))
for i in range(n):
v = vals[i] - deltas[idx_shifted[i]]
@@ -899,8 +899,8 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz):
result_a[i] = v
# right side
- idx_shifted = np.maximum(0, trans.searchsorted(vals + DAY_NS,
- side='right') - 1)
+ idx_shifted = _ensure_int64(
+ np.maximum(0, trans.searchsorted(vals + DAY_NS, side='right') - 1))
for i in range(n):
v = vals[i] - deltas[idx_shifted[i]]
@@ -929,6 +929,16 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz):
return result
+cdef _ensure_int64(object arr):
+ if util.is_array(arr):
+ if (<ndarray> arr).descr.type_num == NPY_INT64:
+ return arr
+ else:
+ return arr.astype(np.int64)
+ else:
+ return np.array(arr, dtype=np.int64)
+
+
cdef inline bisect_right_i8(int64_t *data, int64_t val, Py_ssize_t n):
cdef Py_ssize_t pivot, left = 0, right = n
@@ -7,7 +7,8 @@
import pandas.util.testing as tm
import pandas.core.common as com
-from pandas.tools.tile import cut
+from pandas.core.algorithms import quantile
+from pandas.tools.tile import cut, qcut
from numpy.testing import assert_equal, assert_almost_equal
@@ -84,6 +85,20 @@ def test_na_handling(self):
ex_labels = np.where(com.isnull(arr), np.nan, labels)
tm.assert_almost_equal(labels, ex_labels)
+ def test_qcut(self):
+ arr = np.random.randn(1000)
+
+ labels, bins = qcut(arr, 4, retbins=True)
+
+ ex_bins = quantile(arr, [0, .25, .5, .75, 1.])
+
+ assert_almost_equal(bins, ex_bins)
+
+ ex_labels = cut(arr, ex_bins)
+
+ self.assert_(np.array_equal(labels, ex_labels))
+
+
if __name__ == '__main__':
nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'],
exit=False)
View
@@ -3,6 +3,7 @@
"""
from pandas.core.api import DataFrame, Series
+import pandas.core.algorithms as algos
import pandas.core.common as com
import pandas.core.nanops as nanops
@@ -92,13 +93,56 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3):
if (np.diff(bins) < 0).any():
raise ValueError('bins must increase monotonically.')
+ return _bins_to_cuts(x, bins, right=right, labels=labels,
+ retbins=retbins, precision=precision)
+
+
+
+def qcut(x, q=4, labels=None, retbins=False, precision=3):
+ """
+ Quantile-based discretization function. Discretize variable into
+ equal-sized buckets based on rank or based on sample quantiles. For example
+ 1000 values for 10 quantiles would produce 1000 integers from 0 to 9
+ indicating the bucket to which each value belongs.
+
+ Parameters
+ ----------
+ x : ndarray or Series
+ q : integer or array of quantiles
+ Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately
+ array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles
+ labels : array or boolean, default None
+ Labels to use for bin edges, or False to return integer bin labels
+ retbins : bool, optional
+ Whether to return the bins or not. Can be useful if q is given
+ as an integer, since the bin edges are then computed from the data.
+
+ Returns
+ -------
+
+ Notes
+ -----
+
+ Examples
+ --------
+ """
+ if com.is_integer(q):
+ quantiles = np.linspace(0, 1, q + 1)
+ bins = algos.quantile(x, quantiles)
+ return _bins_to_cuts(x, bins, labels=labels, retbins=retbins,
+ precision=precision)
+ else:
+ raise NotImplementedError
+
+
+def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False,
+ precision=3):
side = 'left' if right else 'right'
ids = bins.searchsorted(x, side=side)
mask = com.isnull(x)
has_nas = mask.any()
-
if labels is not False:
if labels is None:
labels = bins
@@ -132,35 +176,6 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3):
return labels, bins
-def qcut(x, n, ties_method='average'):
- """
- Quantile-based discretization function. Discretize variable into
- equal-sized buckets based on rank. For example 1000 values for 10 quantiles
- would produce 1000 integers from 0 to 9 indicating the
-
- Parameters
- ----------
- x : ndarray or Series
- n : integer
- Number of quantiles. 10 for deciles, 4 for quartiles, etc.
- ties_method : {'average', 'min', 'max', 'first'}, default 'average'
- average: average rank of group
- min: lowest rank in group
- max: highest rank in group
- first: ranks assigned in order they appear in the array
-
- Returns
- -------
-
- Notes
- -----
-
- Examples
- --------
- """
- pass
-
-
def _format_label(x, precision=3):
fmt_str = '%%.%dg' % precision
if com.is_float(x):
View
@@ -1 +1 @@
-cloc pandas --force-lang=Python,pyx --not-match-f="tseries.c|sandbox.c|engines.c|sparse.c|generated.c"
+cloc pandas --force-lang=Python,pyx --not-match-f="tseries.c|sandbox.c|engines.c|sparse.c|generated.c|plib.c"

0 comments on commit 3e904fd

Please sign in to comment.