Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Roll apply nonfloat dtypes #11620

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 22 additions & 11 deletions pandas/algos.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1820,9 +1820,11 @@ def roll_quantile(ndarray[float64_t, cast=True] input, int win,

return output


def roll_generic(ndarray[float64_t, cast=True] input,
int win, int minp, int offset,
object func, object args, object kwargs):
object func, object args, object kwargs,
object array_to_roll=None):
cdef ndarray[double_t] output, counts, bufarr
cdef Py_ssize_t i, n
cdef float64_t *buf
Expand All @@ -1837,32 +1839,41 @@ def roll_generic(ndarray[float64_t, cast=True] input,

minp = _check_minp(win, minp, n, floor=0)
output = np.empty(n, dtype=float)
counts = roll_sum(np.concatenate((np.isfinite(input).astype(float), np.array([0.] * offset))), win, minp)[offset:]
counts = roll_sum(np.concatenate((np.isfinite(input).astype(float),
np.array([0.] * offset))),
win, minp)[offset:]

# default behavior is to roll over input array
if array_to_roll is None:
array_to_roll = input

# truncated windows at the beginning, through first full-length window
for i from 0 <= i < (int_min(win, n) - offset):
if counts[i] >= minp:
output[i] = func(input[0 : (i + offset + 1)], *args, **kwargs)
output[i] = func(array_to_roll[0:(i + offset + 1)],
*args,
**kwargs)
else:
output[i] = NaN

# remaining full-length windows
buf = <float64_t*> input.data
bufarr = np.empty(win, dtype=float)
oldbuf = <float64_t*> bufarr.data
# array_to_roll is a numpy array and doing a slice of contiguous data does
# not make a copy
for i from (win - offset) <= i < (n - offset):
buf = buf + 1
bufarr.data = <char*> buf
if counts[i] >= minp:
output[i] = func(bufarr, *args, **kwargs)
# full length windows will start at index 1 and be of length win
output[i] = \
func(array_to_roll[i - (win - offset) + 1:i + offset + 1],
*args, **kwargs)
else:
output[i] = NaN
bufarr.data = <char*> oldbuf

# truncated windows at the end
for i from int_max(n - offset, 0) <= i < n:
if counts[i] >= minp:
output[i] = func(input[int_max(i + offset - win + 1, 0) : n], *args, **kwargs)
output[i] = func(array_to_roll[int_max(i + offset - win + 1, 0):n],
*args,
**kwargs)
else:
output[i] = NaN

Expand Down
54 changes: 38 additions & 16 deletions pandas/stats/moments.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,7 +355,7 @@ def rolling_corr_pairwise(df1, df2=None, window=None, min_periods=None,


def _rolling_moment(arg, window, func, minp, axis=0, freq=None, center=False,
how=None, args=(), kwargs={}, **kwds):
how=None, coercion=True, args=(), kwargs={}, **kwds):
"""
Rolling statistical measure using supplied function. Designed to be
used with passed-in Cython array-based functions.
Expand All @@ -374,6 +374,10 @@ def _rolling_moment(arg, window, func, minp, axis=0, freq=None, center=False,
Whether the label should correspond with center of window
how : string, default 'mean'
Method for down- or re-sampling
coercion : boolean, default True
Whether to coerce the values of arg to float before rolling, which is
done to optimize for speed. If the values cannot be coerced to float,
a ValueError is raised; pass coercion=False in that case.
args : tuple
Passed on to func
kwargs : dict
Expand All @@ -385,17 +389,26 @@ def _rolling_moment(arg, window, func, minp, axis=0, freq=None, center=False,
"""
arg = _conv_timerule(arg, freq, how)

return_hook, values = _process_data_structure(arg)
return_hook, values = _process_data_structure(arg, coercion=coercion)

if values.size == 0:
result = values.copy()
else:
# actually calculate the moment. Faster way to do this?
offset = int((window - 1) / 2.) if center else 0
additional_nans = np.array([np.NaN] * offset)
calc = lambda x: func(np.concatenate((x, additional_nans)) if center else x,
window, minp=minp, args=args, kwargs=kwargs,
**kwds)

if coercion:
calc = lambda x: func(np.concatenate((x, additional_nans)) if
center else x, window, minp=minp, args=args,
kwargs=kwargs, **kwds)
else:
p0 = np.arange(0, len(values), dtype=float)
calc = lambda x: func(np.concatenate((p0, additional_nans))
if center else p0, window, minp=minp,
args=args, kwargs=kwargs,
array_to_roll=x, **kwds)

if values.ndim > 1:
result = np.apply_along_axis(calc, axis, values)
else:
Expand Down Expand Up @@ -423,7 +436,7 @@ def _center_window(rs, window, axis):
return rs


def _process_data_structure(arg, kill_inf=True):
def _process_data_structure(arg, kill_inf=True, coercion=True):
if isinstance(arg, DataFrame):
return_hook = lambda v: type(arg)(v, index=arg.index,
columns=arg.columns)
Expand All @@ -435,12 +448,13 @@ def _process_data_structure(arg, kill_inf=True):
return_hook = lambda v: v
values = arg

if not issubclass(values.dtype.type, float):
values = values.astype(float)
if coercion:
if not issubclass(values.dtype.type, float):
values = values.astype(float)

if kill_inf:
values = values.copy()
values[np.isinf(values)] = np.NaN
if kill_inf:
values = values.copy()
values[np.isinf(values)] = np.NaN

return return_hook, values

Expand Down Expand Up @@ -712,7 +726,7 @@ def call_cython(arg, window, minp, args=(), kwargs={}):


def rolling_apply(arg, window, func, min_periods=None, freq=None,
center=False, args=(), kwargs={}):
center=False, coercion=True, args=(), kwargs={}):
"""Generic moving function application.

Parameters
Expand All @@ -731,6 +745,10 @@ def rolling_apply(arg, window, func, min_periods=None, freq=None,
as a frequency string or DateOffset object.
center : boolean, default False
Whether the label should correspond with center of window
coercion : boolean, default True
Whether to coerce the values of arg to float before rolling, which is
done to optimize for speed. If rolling_apply() is invoked on values that
cannot be coerced to float, a ValueError is raised; pass coercion=False
in that case.
args : tuple
Passed on to func
kwargs : dict
Expand All @@ -750,11 +768,15 @@ def rolling_apply(arg, window, func, min_periods=None, freq=None,
of :meth:`~pandas.Series.resample` (i.e. using the `mean`).
"""
offset = int((window - 1) / 2.) if center else 0
def call_cython(arg, window, minp, args, kwargs):

def call_cython(arg, window, minp, args, kwargs, array_to_roll=None):
minp = _use_window(minp, window)
return algos.roll_generic(arg, window, minp, offset, func, args, kwargs)
return _rolling_moment(arg, window, call_cython, min_periods, freq=freq,
center=False, args=args, kwargs=kwargs)
return algos.roll_generic(arg, window, minp, offset, func, args,
kwargs, array_to_roll)

return _rolling_moment(arg, window, call_cython, min_periods,
freq=freq, center=False, coercion=coercion,
args=args, kwargs=kwargs)


def rolling_window(arg, window=None, win_type=None, min_periods=None,
Expand Down
21 changes: 21 additions & 0 deletions pandas/stats/tests/test_moments.py
Original file line number Diff line number Diff line change
Expand Up @@ -363,6 +363,27 @@ def roll_mean(x, window, min_periods=None, freq=None, center=False):
expected = Series([1., 2., 2.])
assert_series_equal(result, expected)

def test_rolling_apply_nonfloat(self):
    """
    Check that rolling_apply accepts non-float data when coercion=False.

    The same computation is run on a Series of single-character strings
    and on a float Series holding the matching character codes; both
    results must have NaN in the same positions and agree everywhere else.
    """
    letters = ['a', 'b', 'c']
    str_series = Series(letters)
    code_series = Series([ord(ch) for ch in letters], dtype=float)

    def last_code(window):
        # Map the last string in the window to its character code.
        return ord(window[-1])

    def last_value(window):
        # Return the last element of the window unchanged.
        return window[-1]

    # Both calls pass coercion=False so the two code paths are comparable.
    for min_p in (None, 0):
        from_strings = mom.rolling_apply(str_series, 2, last_code,
                                         coercion=False, min_periods=min_p)
        from_codes = mom.rolling_apply(code_series, 2, last_value,
                                       coercion=False, min_periods=min_p)

        strings_valid = np.isfinite(from_strings)
        codes_valid = np.isfinite(from_codes)

        # min_periods determines where NaN appears, so the NaN positions
        # must line up; the remaining (finite) answers must match.
        assert all(strings_valid == codes_valid)
        assert all(from_strings[strings_valid] == from_codes[codes_valid])

def test_rolling_apply_out_of_bounds(self):
# #1850
arr = np.arange(4)
Expand Down