diff --git a/pandas/algos.pyx b/pandas/algos.pyx index 8569209f2e946..45dc64d81c338 100644 --- a/pandas/algos.pyx +++ b/pandas/algos.pyx @@ -1820,9 +1820,11 @@ def roll_quantile(ndarray[float64_t, cast=True] input, int win, return output + def roll_generic(ndarray[float64_t, cast=True] input, int win, int minp, int offset, - object func, object args, object kwargs): + object func, object args, object kwargs, + object array_to_roll=None): cdef ndarray[double_t] output, counts, bufarr cdef Py_ssize_t i, n cdef float64_t *buf @@ -1837,32 +1839,41 @@ def roll_generic(ndarray[float64_t, cast=True] input, minp = _check_minp(win, minp, n, floor=0) output = np.empty(n, dtype=float) - counts = roll_sum(np.concatenate((np.isfinite(input).astype(float), np.array([0.] * offset))), win, minp)[offset:] + counts = roll_sum(np.concatenate((np.isfinite(input).astype(float), + np.array([0.] * offset))), + win, minp)[offset:] + + # default behavior is to roll over input array + if array_to_roll is None: + array_to_roll = input # truncated windows at the beginning, through first full-length window for i from 0 <= i < (int_min(win, n) - offset): if counts[i] >= minp: - output[i] = func(input[0 : (i + offset + 1)], *args, **kwargs) + output[i] = func(array_to_roll[0:(i + offset + 1)], + *args, + **kwargs) else: output[i] = NaN # remaining full-length windows - buf = input.data - bufarr = np.empty(win, dtype=float) - oldbuf = bufarr.data + # array_to_roll is a numpy array and doing a slice of contiguous data does + # not make a copy for i from (win - offset) <= i < (n - offset): - buf = buf + 1 - bufarr.data = buf if counts[i] >= minp: - output[i] = func(bufarr, *args, **kwargs) + # full length windows will start at index 1 and be of length win + output[i] = \ + func(array_to_roll[i - (win - offset) + 1:i + offset + 1], + *args, **kwargs) else: output[i] = NaN - bufarr.data = oldbuf # truncated windows at the end for i from int_max(n - offset, 0) <= i < n: if counts[i] >= minp: - 
output[i] = func(input[int_max(i + offset - win + 1, 0) : n], *args, **kwargs) + output[i] = func(array_to_roll[int_max(i + offset - win + 1, 0):n], + *args, + **kwargs) else: output[i] = NaN diff --git a/pandas/stats/moments.py b/pandas/stats/moments.py index 3cddae45e7516..c4183d748fc9c 100644 --- a/pandas/stats/moments.py +++ b/pandas/stats/moments.py @@ -355,7 +355,7 @@ def rolling_corr_pairwise(df1, df2=None, window=None, min_periods=None, def _rolling_moment(arg, window, func, minp, axis=0, freq=None, center=False, - how=None, args=(), kwargs={}, **kwds): + how=None, coercion=True, args=(), kwargs={}, **kwds): """ Rolling statistical measure using supplied function. Designed to be used with passed-in Cython array-based functions. @@ -374,6 +374,10 @@ def _rolling_moment(arg, window, func, minp, axis=0, freq=None, center=False, Whether the label should correspond with center of window how : string, default 'mean' Method for down- or re-sampling + coercion: bool flag with default True. It tries to coerce args to a float + to optimize for speed. If rolling_apply() is invoked on objects that + cannot be coerced into a float, it raises a ValueError. Be sure + to set coercion=False in this case. args : tuple Passed on to func kwargs : dict @@ -385,7 +389,7 @@ def _rolling_moment(arg, window, func, minp, axis=0, freq=None, center=False, """ arg = _conv_timerule(arg, freq, how) - return_hook, values = _process_data_structure(arg) + return_hook, values = _process_data_structure(arg, coercion=coercion) if values.size == 0: result = values.copy() @@ -393,9 +397,18 @@ def _rolling_moment(arg, window, func, minp, axis=0, freq=None, center=False, # actually calculate the moment. Faster way to do this? offset = int((window - 1) / 2.) 
if center else 0 additional_nans = np.array([np.NaN] * offset) - calc = lambda x: func(np.concatenate((x, additional_nans)) if center else x, - window, minp=minp, args=args, kwargs=kwargs, - **kwds) + + if coercion: + calc = lambda x: func(np.concatenate((x, additional_nans)) if + center else x, window, minp=minp, args=args, + kwargs=kwargs, **kwds) + else: + p0 = np.arange(0, len(values), dtype=float) + calc = lambda x: func(np.concatenate((p0, additional_nans)) + if center else p0, window, minp=minp, + args=args, kwargs=kwargs, + array_to_roll=x, **kwds) + if values.ndim > 1: result = np.apply_along_axis(calc, axis, values) else: @@ -423,7 +436,7 @@ def _center_window(rs, window, axis): return rs -def _process_data_structure(arg, kill_inf=True): +def _process_data_structure(arg, kill_inf=True, coercion=True): if isinstance(arg, DataFrame): return_hook = lambda v: type(arg)(v, index=arg.index, columns=arg.columns) @@ -435,12 +448,13 @@ def _process_data_structure(arg, kill_inf=True): return_hook = lambda v: v values = arg - if not issubclass(values.dtype.type, float): - values = values.astype(float) + if coercion: + if not issubclass(values.dtype.type, float): + values = values.astype(float) - if kill_inf: - values = values.copy() - values[np.isinf(values)] = np.NaN + if kill_inf: + values = values.copy() + values[np.isinf(values)] = np.NaN return return_hook, values @@ -712,7 +726,7 @@ def call_cython(arg, window, minp, args=(), kwargs={}): def rolling_apply(arg, window, func, min_periods=None, freq=None, - center=False, args=(), kwargs={}): + center=False, coercion=True, args=(), kwargs={}): """Generic moving function application. Parameters @@ -731,6 +745,10 @@ def rolling_apply(arg, window, func, min_periods=None, freq=None, as a frequency string or DateOffset object. center : boolean, default False Whether the label should correspond with center of window + coercion: bool flag with default True. It tries to coerce args to a float + to optimize for speed. 
If rolling_apply() is invoked on objects that + cannot be coerced into a float, it raises a ValueError. Be sure + to set coercion=False in this case. args : tuple Passed on to func kwargs : dict @@ -750,11 +768,15 @@ def rolling_apply(arg, window, func, min_periods=None, freq=None, of :meth:`~pandas.Series.resample` (i.e. using the `mean`). """ offset = int((window - 1) / 2.) if center else 0 - def call_cython(arg, window, minp, args, kwargs): + + def call_cython(arg, window, minp, args, kwargs, array_to_roll=None): minp = _use_window(minp, window) - return algos.roll_generic(arg, window, minp, offset, func, args, kwargs) - return _rolling_moment(arg, window, call_cython, min_periods, freq=freq, - center=False, args=args, kwargs=kwargs) + return algos.roll_generic(arg, window, minp, offset, func, args, + kwargs, array_to_roll) + + return _rolling_moment(arg, window, call_cython, min_periods, + freq=freq, center=False, coercion=coercion, + args=args, kwargs=kwargs) def rolling_window(arg, window=None, win_type=None, min_periods=None, diff --git a/pandas/stats/tests/test_moments.py b/pandas/stats/tests/test_moments.py index e2ed27156d2b5..69a436798e354 100644 --- a/pandas/stats/tests/test_moments.py +++ b/pandas/stats/tests/test_moments.py @@ -363,6 +363,27 @@ def roll_mean(x, window, min_periods=None, freq=None, center=False): expected = Series([1., 2., 2.]) assert_series_equal(result, expected) + def test_rolling_apply_nonfloat(self): + ''' + test rolling_apply now also works for non-float data types if coercion + is set to False. 
The return type is still float, but the 'roll' + is applied to arg, which no longer has to be a float. + ''' + # check rolling_apply with coercion set to False + orig = Series([ord('a'), ord('b'), ord('c')], dtype=float) + s = Series(['a', 'b', 'c']) + + for min_p in (None, 0): + s_res = mom.rolling_apply(s, 2, lambda x: ord(x[-1]), + coercion=False, min_periods=min_p) + o_res = mom.rolling_apply(orig, 2, lambda x: x[-1], + coercion=False, min_periods=min_p) + + # assert that NaN values appear at the same place since min_periods + # defines the NaN values. Also assert that valid answers match + assert all(np.isfinite(s_res) == np.isfinite(o_res)) + assert all(s_res[np.isfinite(s_res)] == o_res[np.isfinite(o_res)]) + def test_rolling_apply_out_of_bounds(self): # #1850 arr = np.arange(4)