diff --git a/doc/source/whatsnew/v2.3.3.rst b/doc/source/whatsnew/v2.3.3.rst index 2e99e0fc8c47c..fe94e0f847562 100644 --- a/doc/source/whatsnew/v2.3.3.rst +++ b/doc/source/whatsnew/v2.3.3.rst @@ -53,6 +53,10 @@ Bug fixes characters as digits for :class:`StringDtype` backed by PyArrow (:issue:`61466`) - Fix comparing a :class:`StringDtype` Series with mixed objects raising an error (:issue:`60228`) - Fix error being raised when using a numpy ufunc with a Python-backed string array (:issue:`40800`) +- Fix bug in :meth:`Series.rolling` and :meth:`DataFrame.rolling` ``skew()`` and ``kurt()`` calculations + that produced incorrect values for windows containing variance of zero. + The implementation has been updated to return ``NaN`` in these cases, matching skipy.stats behavior + and the non-rolling :meth:`Series.skew` and :meth:`Series.kurt` received the same behavior ensuring consistency. (:issue:`62864`) Other changes ~~~~~~~~~~~~~ diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 0c8ea28b60ce8..30fb3946060db 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -498,10 +498,10 @@ cdef float64_t calc_skew(int64_t minp, int64_t nobs, if nobs < 3: result = NaN # GH 42064 46431 - # uniform case, force result to be 0 + # degenerate case, force result to be NaN elif num_consecutive_same_value >= nobs: - result = 0.0 - # #18044: with uniform distribution, floating issue will + result = NaN + # #18044: with degenerate distribution, floating issue will # cause B != 0. and cause the result is a very # large number. # @@ -694,7 +694,7 @@ cdef float64_t calc_kurt(int64_t minp, int64_t nobs, # GH 42064 46431 # uniform case, force result to be -3. elif num_consecutive_same_value >= nobs: - result = -3. + result = NaN else: dnobs = nobs A = x / dnobs diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 772f425beed96..27e0d6ba1760f 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1287,10 +1287,10 @@ def nanskew( result = result.astype(dtype, copy=False) if isinstance(result, np.ndarray): - result = np.where(m2 == 0, 0, result) + result = np.where(m2 == 0, np.nan, result) result[count < 3] = np.nan else: - result = dtype.type(0) if m2 == 0 else result + result = np.nan if m2 == 0 else result if count < 3: return np.nan @@ -1400,7 +1400,7 @@ def nankurt( if count < 4: return np.nan if denominator == 0: - return values.dtype.type(0) + return np.nan with np.errstate(invalid="ignore", divide="ignore"): result = numerator / denominator - adj @@ -1410,7 +1410,7 @@ def nankurt( result = result.astype(dtype, copy=False) if isinstance(result, np.ndarray): - result = np.where(denominator == 0, 0, result) + result = np.where(denominator == 0, np.nan, result) result[count < 4] = np.nan return result diff --git a/pandas/tests/reductions/test_stat_reductions.py b/pandas/tests/reductions/test_stat_reductions.py index 4af1ca1d4800a..5750946e9904d 100644 --- a/pandas/tests/reductions/test_stat_reductions.py +++ b/pandas/tests/reductions/test_stat_reductions.py @@ -9,7 +9,6 @@ import pandas as pd from pandas import ( - DataFrame, Series, date_range, ) @@ -228,28 +227,6 @@ def test_sem(self): result = s.sem(ddof=1) assert pd.isna(result) - def test_skew(self): - sp_stats = pytest.importorskip("scipy.stats") - - string_series = Series(range(20), dtype=np.float64, name="series") - - alt = lambda x: sp_stats.skew(x, bias=False) - self._check_stat_op("skew", alt, string_series) - - # test corner cases, skew() returns NaN unless there's at least 3 - # values - min_N = 3 - for i in range(1, min_N + 1): - s = Series(np.ones(i)) - df = DataFrame(np.ones((i, i))) - if i < min_N: - assert np.isnan(s.skew()) - assert np.isnan(df.skew()).all() - else: - assert 0 == s.skew() - assert isinstance(s.skew(), np.float64) # GH53482 - assert (df.skew() == 0).all() - def test_kurt(self): sp_stats = pytest.importorskip("scipy.stats") @@ -257,18 +234,3 @@ def test_kurt(self): alt = lambda x: sp_stats.kurtosis(x, bias=False) self._check_stat_op("kurt", alt, string_series) - - def test_kurt_corner(self): - # test corner cases, kurt() returns NaN unless there's at least 4 - # values - min_N = 4 - for i in range(1, min_N + 1): - s = Series(np.ones(i)) - df = DataFrame(np.ones((i, i))) - if i < min_N: - assert np.isnan(s.kurt()) - assert np.isnan(df.kurt()).all() - else: - assert 0 == s.kurt() - assert isinstance(s.kurt(), np.float64) # GH53482 - assert (df.kurt() == 0).all() diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index 6788f2056bb9a..07bb93a90fbe0 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -559,10 +559,10 @@ def _skew_kurt_wrap(self, values, axis=None, func=None): result = func(values, axis=axis, bias=False) # fix for handling cases where all elements in an axis are the same if isinstance(result, np.ndarray): - result[np.max(values, axis=axis) == np.min(values, axis=axis)] = 0 + result[np.max(values, axis=axis) == np.min(values, axis=axis)] = np.nan return result elif np.max(values) == np.min(values): - return 0.0 + return np.nan return result def test_nanskew(self, skipna): @@ -1021,7 +1021,7 @@ def test_constant_series(self, val): # xref GH 11974 data = val * np.ones(300) skew = nanops.nanskew(data) - assert skew == 0.0 + assert np.isnan(skew) def test_all_finite(self): alpha, beta = 0.3, 0.1 @@ -1089,7 +1089,7 @@ def test_constant_series(self, val): # xref GH 11974 data = val * np.ones(300) kurt = nanops.nankurt(data) - tm.assert_equal(kurt, 0.0) + tm.assert_equal(kurt, np.nan) def test_all_finite(self): alpha, beta = 0.3, 0.1 diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 18aafa0d7b71e..a65d3607ff533 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1833,10 +1833,12 @@ def test_rolling_skew_kurt_floating_artifacts(): sr = Series([1 / 3, 4, 0, 0, 0, 0, 0]) r = sr.rolling(4) + result = r.skew() - assert (result[-2:] == 0).all() + assert np.isnan(result[-2:]).all() + result = r.kurt() - assert (result[-2:] == -3).all() + assert np.isnan(result[-2:]).all() def test_numeric_only_frame(arithmetic_win_operators, numeric_only): diff --git a/pandas/tests/window/test_rolling_skew_kurt.py b/pandas/tests/window/test_rolling_skew_kurt.py index 79c14f243e7cc..370cd6113a504 100644 --- a/pandas/tests/window/test_rolling_skew_kurt.py +++ b/pandas/tests/window/test_rolling_skew_kurt.py @@ -170,7 +170,7 @@ def test_center_reindex_frame(frame, roll_func): def test_rolling_skew_edge_cases(step): - expected = Series([np.nan] * 4 + [0.0])[::step] + expected = Series([np.nan] * 5)[::step] # yields all NaN (0 variance) d = Series([1] * 5) x = d.rolling(window=5, step=step).skew() @@ -191,7 +191,7 @@ def test_rolling_skew_edge_cases(step): def test_rolling_kurt_edge_cases(step): - expected = Series([np.nan] * 4 + [-3.0])[::step] + expected = Series([np.nan] * 5)[::step] # yields all NaN (0 variance) d = Series([1] * 5) @@ -212,16 +212,12 @@ def test_rolling_kurt_edge_cases(step): def test_rolling_skew_eq_value_fperr(step): - # #18804 all rolling skew for all equal values should return Nan - # #46717 update: all equal values should return 0 instead of NaN + # #18804 all rolling skew for all equal values should return NaN a = Series([1.1] * 15).rolling(window=10, step=step).skew() - assert (a[a.index >= 9] == 0).all() - assert a[a.index < 9].isna().all() + assert a.isna().all() def test_rolling_kurt_eq_value_fperr(step): # #18804 all rolling kurt for all equal values should return Nan - # #46717 update: all equal values should return -3 instead of NaN a = Series([1.1] * 15).rolling(window=10, step=step).kurt() - assert (a[a.index >= 9] == -3).all() - assert a[a.index < 9].isna().all() + assert a.isna().all()