From aff7346f3be41d1b190f311f862653b6088e2ad7 Mon Sep 17 00:00:00 2001 From: TomAugspurger Date: Sun, 8 Sep 2013 22:59:59 -0500 Subject: [PATCH] ENH/REF: Additional methods for interpolate ENH: the interpolate method argument can take more values for various types of interpolation REF: Moves Series.interpolate to core/generic. DataFrame gets interpolate CLN: clean up interpolate to use blocks ENH: Add additonal 1-d scipy interpolaters. DOC: examples for df interpolate and a plot DOC: release notes DOC: Scipy links and more expanation API: Don't use fill_value BUG: Raise on panels. API: Raise on non monotonic indecies if it matters BUG: Raise on only mixed types. ENH/DOC: Add `spline` interpolation. DOC: naming consistency --- doc/source/missing_data.rst | 89 ++++++++- doc/source/release.rst | 4 +- doc/source/v0.13.0.txt | 28 +++ pandas/core/common.py | 147 +++++++++++++++ pandas/core/generic.py | 120 +++++++++---- pandas/core/internals.py | 83 ++++++++- pandas/core/series.py | 51 ------ pandas/tests/test_generic.py | 338 ++++++++++++++++++++++++++++++++++- pandas/tests/test_series.py | 243 ++++++++++--------------- 9 files changed, 863 insertions(+), 240 deletions(-) diff --git a/doc/source/missing_data.rst b/doc/source/missing_data.rst index 34442852cae84..e7966aa71486c 100644 --- a/doc/source/missing_data.rst +++ b/doc/source/missing_data.rst @@ -271,8 +271,13 @@ examined :ref:`in the API `. Interpolation ~~~~~~~~~~~~~ -A linear **interpolate** method has been implemented on Series. The default -interpolation assumes equally spaced points. +.. versionadded:: 0.13.0 + + DataFrame now has the interpolation method. + :meth:`~pandas.Series.interpolate` also gained some additional methods. + +Both Series and Dataframe objects have an ``interpolate`` method that, by default, +performs linear interpolation at missing datapoints. .. 
ipython:: python :suppress: @@ -328,6 +333,86 @@ For a floating-point index, use ``method='values'``: ser.interpolate(method='values') +You can also interpolate with a DataFrame: + +.. ipython:: python + + df = DataFrame({'A': [1, 2.1, np.nan, 4.7, 5.6, 6.8], + 'B': [.25, np.nan, np.nan, 4, 12.2, 14.4]}) + df.interpolate() + +The ``method`` argument gives access to fancier interpolation methods. +If you have scipy_ installed, you can pass the name of a 1-d interpolation routine to ``method``. +You'll want to consult the full scipy interpolation documentation_ and reference guide_ for details. +The appropriate interpolation method will depend on the type of data you are working with. +For example, if you are dealing with a time series that is growing at an increasing rate, +``method='quadratic'`` may be appropriate. If you have values approximating a cumulative +distribution function, then ``method='pchip'`` should work well. + +.. warning:: + + These methods require ``scipy``. + +.. ipython:: python + + df.interpolate(method='barycentric') + + df.interpolate(method='pchip') + +When interpolating via a polynomial or spline approximation, you must also specify +the degree or order of the approximation: + +.. ipython:: python + + df.interpolate(method='spline', order=2) + + df.interpolate(method='polynomial', order=2) + +Compare several methods: + +.. ipython:: python + + np.random.seed(2) + + ser = Series(np.arange(1, 10.1, .25)**2 + np.random.randn(37)) + bad = np.array([4, 13, 14, 15, 16, 17, 18, 20, 29, 34, 35, 36]) + ser[bad] = np.nan + methods = ['linear', 'quadratic', 'cubic'] + + df = DataFrame({m: ser.interpolate(method=m) for m in methods}) + @savefig compare_interpolations.png + df.plot() + +Another use case is interpolation at *new* values. +Suppose you have 100 observations from some distribution. And let's suppose +that you're particularly interested in what's happening around the middle.
+You can mix pandas' ``reindex`` and ``interpolate`` methods to interpolate +at the new values. + +.. ipython:: python + + ser = Series(np.sort(np.random.uniform(size=100))) + + # interpolate at new_index + new_index = ser.index + Index([49.25, 49.5, 49.75, 50.25, 50.5, 50.75]) + + interp_s = ser.reindex(new_index).interpolate(method='pchip') + + interp_s[49:51] + +.. _scipy: http://www.scipy.org +.. _documentation: http://docs.scipy.org/doc/scipy/reference/interpolate.html#univariate-interpolation +.. _guide: http://docs.scipy.org/doc/scipy/reference/tutorial/interpolate.html + + +Like other pandas fill methods, ``interpolate`` accepts a ``limit`` keyword argument. +Use this to limit the number of consecutive interpolations, keeping ``NaN``s for interpolations that are too far from the last valid observation: + +.. ipython:: python + + ser = Series([1, 3, np.nan, np.nan, np.nan, 11]) + ser.interpolate(limit=2) + .. _missing_data.replace: Replacing Generic Values diff --git a/doc/source/release.rst b/doc/source/release.rst index 4a25a98f2cfbe..53db769f30941 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -78,7 +78,7 @@ Experimental Features - Add msgpack support via ``pd.read_msgpack()`` and ``pd.to_msgpack()`` / ``df.to_msgpack()`` for serialization of arbitrary pandas (and python objects) in a lightweight portable binary format (:issue:`686`) - Added PySide support for the qtpandas DataFrameModel and DataFrameWidget. - - Added :mod:`pandas.io.gbq` for reading from (and writing to) Google BigQuery into a DataFrame. (:issue:`4140`) + - Added :mod:`pandas.io.gbq` for reading from (and writing to) Google BigQuery into a DataFrame. (:issue:`4140`) Improvements to existing features ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -174,6 +174,8 @@ Improvements to existing features - :meth:`~pandas.io.json.json_normalize` is a new method to allow you to create a flat table from semi-structured JSON data. 
:ref:`See the docs` (:issue:`1067`) - ``DataFrame.from_records()`` will now accept generators (:issue:`4910`) + - ``DataFrame.interpolate()`` and ``Series.interpolate()`` have been expanded to include + interpolation methods from scipy. (:issue:`4434`, :issue:`1892`) API Changes ~~~~~~~~~~~ diff --git a/doc/source/v0.13.0.txt b/doc/source/v0.13.0.txt index c6a4c280ca4bb..66ca0f22587c6 100644 --- a/doc/source/v0.13.0.txt +++ b/doc/source/v0.13.0.txt @@ -614,6 +614,34 @@ Experimental - Added PySide support for the qtpandas DataFrameModel and DataFrameWidget. +- DataFrame has a new ``interpolate`` method, similar to Series (:issue:`4434`, :issue:`1892`) + + .. ipython:: python + + df = DataFrame({'A': [1, 2.1, np.nan, 4.7, 5.6, 6.8], + 'B': [.25, np.nan, np.nan, 4, 12.2, 14.4]}) + df.interpolate() + + Additionally, the ``method`` argument to ``interpolate`` has been expanded + to include 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', + 'barycentric', 'krogh', 'piecewise_polynomial', 'pchip', or 'polynomial' or 'spline' + together with an integer representing the degree or order of the approximation. The new methods + require scipy_. Consult the Scipy reference guide_ and documentation_ for more information + about when the various methods are appropriate. See also the :ref:`pandas interpolation docs`. + + Interpolate now also accepts a ``limit`` keyword argument. + This works similarly to ``fillna``'s limit: + + .. ipython:: python + + ser = Series([1, 3, np.nan, np.nan, np.nan, 11]) + ser.interpolate(limit=2) + +.. _scipy: http://www.scipy.org +.. _documentation: http://docs.scipy.org/doc/scipy/reference/interpolate.html#univariate-interpolation +.. _guide: http://docs.scipy.org/doc/scipy/reference/tutorial/interpolate.html + + ..
_whatsnew_0130.refactoring: Internal Refactoring diff --git a/pandas/core/common.py b/pandas/core/common.py index a417e00af5d3e..64599327f72ec 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -1244,6 +1244,153 @@ def backfill_2d(values, limit=None, mask=None): return values +def _clean_interp_method(method, order=None, **kwargs): + valid = ['linear', 'time', 'values', 'nearest', 'zero', 'slinear', + 'quadratic', 'cubic', 'barycentric', 'polynomial', + 'krogh', 'piecewise_polynomial', + 'pchip', 'spline'] + if method in ('spline', 'polynomial') and order is None: + raise ValueError("You must specify the order of the spline or " + "polynomial.") + if method not in valid: + raise ValueError("method must be one of {0}." + "Got '{1}' instead.".format(valid, method)) + return method + + +def interpolate_1d(xvalues, yvalues, method='linear', limit=None, + fill_value=None, bounds_error=False, **kwargs): + """ + Logic for the 1-d interpolation. The result should be 1-d, inputs + xvalues and yvalues will each be 1-d arrays of the same length. + + Bounds_error is currently hardcoded to False since non-scipy ones don't + take it as an argumnet. + """ + # Treat the original, non-scipy methods first. 
+ + invalid = isnull(yvalues) + valid = ~invalid + + valid_y = yvalues[valid] + valid_x = xvalues[valid] + new_x = xvalues[invalid] + + if method == 'time': + if not getattr(xvalues, 'is_all_dates', None): + # if not issubclass(xvalues.dtype.type, np.datetime64): + raise ValueError('time-weighted interpolation only works ' + 'on Series or DataFrames with a ' + 'DatetimeIndex') + method = 'values' + + def _interp_limit(invalid, limit): + """mask off values that won't be filled since they exceed the limit""" + all_nans = np.where(invalid)[0] + violate = [invalid[x:x + limit + 1] for x in all_nans] + violate = np.array([x.all() & (x.size > limit) for x in violate]) + return all_nans[violate] + limit + + xvalues = getattr(xvalues, 'values', xvalues) + yvalues = getattr(yvalues, 'values', yvalues) + + if limit: + violate_limit = _interp_limit(invalid, limit) + if valid.any(): + firstIndex = valid.argmax() + valid = valid[firstIndex:] + invalid = invalid[firstIndex:] + result = yvalues.copy() + if valid.all(): + return yvalues + else: + # have to call np.array(xvalues) since xvalues could be an Index + # which cant be mutated + result = np.empty_like(np.array(xvalues), dtype=np.float64) + result.fill(np.nan) + return result + + if method in ['linear', 'time', 'values']: + if method in ('values', 'index'): + inds = np.asarray(xvalues) + # hack for DatetimeIndex, #1646 + if issubclass(inds.dtype.type, np.datetime64): + inds = inds.view(pa.int64) + + if inds.dtype == np.object_: + inds = lib.maybe_convert_objects(inds) + else: + inds = xvalues + + inds = inds[firstIndex:] + + result[firstIndex:][invalid] = np.interp(inds[invalid], inds[valid], + yvalues[firstIndex:][valid]) + + if limit: + result[violate_limit] = np.nan + return result + + sp_methods = ['nearest', 'zero', 'slinear', 'quadratic', 'cubic', + 'barycentric', 'krogh', 'spline', 'polynomial', + 'piecewise_polynomial', 'pchip'] + if method in sp_methods: + new_x = new_x[firstIndex:] + xvalues = 
xvalues[firstIndex:] + + result[firstIndex:][invalid] = _interpolate_scipy_wrapper(valid_x, + valid_y, new_x, method=method, fill_value=fill_value, + bounds_error=bounds_error, **kwargs) + if limit: + result[violate_limit] = np.nan + return result + + +def _interpolate_scipy_wrapper(x, y, new_x, method, fill_value=None, + bounds_error=False, order=None, **kwargs): + """ + passed off to scipy.interpolate.interp1d. method is scipy's kind. + Returns an array interpolated at new_x. Add any new methods to + the list in _clean_interp_method + """ + try: + from scipy import interpolate + except ImportError: + raise ImportError('{0} interpolation requires Scipy'.format(method)) + + new_x = np.asarray(new_x) + + # ignores some kwargs that could be passed along. + alt_methods = { + 'barycentric': interpolate.barycentric_interpolate, + 'krogh': interpolate.krogh_interpolate, + 'piecewise_polynomial': interpolate.piecewise_polynomial_interpolate, + } + + try: + alt_methods['pchip'] = interpolate.pchip_interpolate + except AttributeError: + if method == 'pchip': + raise ImportError("Your version of scipy does not support " + "PCHIP interpolation.") + + interp1d_methods = ['nearest', 'zero', 'slinear', 'quadratic', 'cubic', + 'polynomial'] + if method in interp1d_methods: + if method == 'polynomial': + method = order + terp = interpolate.interp1d(x, y, kind=method, fill_value=fill_value, + bounds_error=bounds_error) + new_y = terp(new_x) + elif method == 'spline': + terp = interpolate.UnivariateSpline(x, y, k=order) + new_y = terp(new_x) + else: + method = alt_methods[method] + new_y = method(x, y, new_x) + return new_y + + def interpolate_2d(values, method='pad', axis=0, limit=None, fill_value=None): """ perform an actual interpolation of values, values will be make 2-d if needed fills inplace, returns the result """ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 9dadeb4ef6e97..cdac4939ed841 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ 
-13,6 +13,7 @@ from pandas.tseries.index import DatetimeIndex from pandas.tseries.period import PeriodIndex from pandas.core.internals import BlockManager +import pandas.core.array as pa import pandas.core.common as com import pandas.core.datetools as datetools from pandas import compat, _np_version_under1p7 @@ -1963,58 +1964,109 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, else: return self._constructor(new_data) - def interpolate(self, to_replace, method='pad', axis=0, inplace=False, - limit=None): + def interpolate(self, method='linear', axis=0, limit=None, inplace=False, + downcast='infer', **kwargs): """Interpolate values according to different methods. Parameters ---------- - to_replace : dict, Series - method : str - axis : int - inplace : bool - limit : int, default None + method : {'linear', 'time', 'values', 'index' 'nearest', + 'zero', 'slinear', 'quadratic', 'cubic', + 'barycentric', 'krogh', 'polynomial', 'spline' + 'piecewise_polynomial', 'pchip'} + 'linear': ignore the index and treat the values as equally spaced. default + 'time': interpolation works on daily and higher resolution + data to interpolate given length of interval + 'index': use the actual numerical values of the index + 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', 'barycentric', + 'polynomial' is passed to `scipy.interpolate.interp1d` with the order given + both 'polynomial' and 'spline' requre that you also specify and order (int) + e.g. df.interpolate(method='polynomial', order=4) + 'krogh', 'piecewise_polynomial', 'spline', and 'pchip' are all wrappers + around the scipy interpolation methods of similar names. See the + scipy documentation for more on their behavior: + http://docs.scipy.org/doc/scipy/reference/interpolate.html#univariate-interpolation + http://docs.scipy.org/doc/scipy/reference/tutorial/interpolate.html + axis : {0, 1}, default 0 + 0: fill column-by-column + 1: fill row-by-row + limit : int, default None. 
Maximum number of consecutive NaNs to fill. + inplace : bool, default False + downcast : optional, 'infer' or None, defaults to 'infer' Returns ------- - frame : interpolated + Series or DataFrame of same shape interpolated at the NaNs See Also -------- reindex, replace, fillna + + Examples + -------- + + # Filling in NaNs: + >>> s = pd.Series([0, 1, np.nan, 3]) + >>> s.interpolate() + 0 0 + 1 1 + 2 2 + 3 3 + dtype: float64 """ - from warnings import warn - warn('{klass}.interpolate will be removed in v0.14, please use ' - 'either {klass}.fillna or {klass}.replace ' - 'instead'.format(klass=self.__class__), FutureWarning) - if self._is_mixed_type and axis == 1: - return self.T.replace(to_replace, method=method, limit=limit).T + if self.ndim > 2: + raise NotImplementedError("Interpolate has not been implemented " + "on Panel and Panel 4D objects.") - method = com._clean_fill_method(method) + if axis == 0: + ax = self._info_axis_name + elif axis == 1: + self = self.T + ax = 1 + ax = self._get_axis_number(ax) - if isinstance(to_replace, (dict, com.ABCSeries)): - if axis == 0: - return self.replace(to_replace, method=method, inplace=inplace, - limit=limit, axis=axis) - elif axis == 1: - obj = self.T - if inplace: - obj.replace(to_replace, method=method, limit=limit, - inplace=inplace, axis=0) - return obj.T - return obj.replace(to_replace, method=method, limit=limit, - inplace=inplace, axis=0).T - else: - raise ValueError('Invalid value for axis') + if self.ndim == 2: + alt_ax = 1 - ax + else: + alt_ax = ax + + if isinstance(self.index, MultiIndex) and method != 'linear': + raise ValueError("Only `method=linear` interpolation is supported " + "on MultiIndexes.") + + if self._data.get_dtype_counts().get('object') == len(self.T): + raise TypeError("Cannot interpolate with all NaNs.") + + # create/use the index + if method == 'linear': + index = np.arange(len(self._get_axis(alt_ax))) # prior default else: - new_data = self._data.interpolate(method=method, axis=axis, - 
limit=limit, inplace=inplace, - missing=to_replace, coerce=False) + index = self._get_axis(alt_ax) + + if pd.isnull(index).any(): + raise NotImplementedError("Interpolation with NaNs in the index " + "has not been implemented. Try filling " + "those NaNs before interpolating.") + new_data = self._data.interpolate(method=method, + axis=ax, + index=index, + values=self, + limit=limit, + inplace=inplace, + downcast=downcast, + **kwargs) - if inplace: + if inplace: + if axis == 1: self._data = new_data + self = self.T else: - return self._constructor(new_data) + self._data = new_data + else: + res = self._constructor(new_data, index=self.index) + if axis == 1: + res = res.T + return res #---------------------------------------------------------------------- # Action Methods diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 070745d73b307..0d3a1cfe9dfe1 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -9,7 +9,7 @@ from pandas.core.common import (_possibly_downcast_to_dtype, isnull, notnull, _NS_DTYPE, _TD_DTYPE, ABCSeries, ABCSparseSeries, - is_list_like, _infer_dtype_from_scalar) + is_list_like, _infer_dtype_from_scalar, _values_from_object) from pandas.core.index import (Index, MultiIndex, _ensure_index, _handle_legacy_indexes) from pandas.core.indexing import (_check_slice_bounds, _maybe_convert_indices, @@ -723,9 +723,46 @@ def create_block(v, m, n, item, reshape=True): return [make_block(new_values, self.items, self.ref_items, placement=self._ref_locs, fastpath=True)] - def interpolate(self, method='pad', axis=0, inplace=False, - limit=None, fill_value=None, coerce=False, - downcast=None): + def interpolate(self, method='pad', axis=0, index=None, + values=None, inplace=False, limit=None, + fill_value=None, coerce=False, downcast=None, **kwargs): + + # a fill na type method + try: + m = com._clean_fill_method(method) + except: + m = None + + if m is not None: + return self._interpolate_with_fill(method=m, + axis=axis, + 
inplace=inplace, + limit=limit, + fill_value=fill_value, + coerce=coerce, + downcast=downcast) + # try an interp method + try: + m = com._clean_interp_method(method, **kwargs) + except: + m = None + + if m is not None: + return self._interpolate(method=m, + index=index, + values=values, + axis=axis, + limit=limit, + fill_value=fill_value, + inplace=inplace, + downcast=downcast, + **kwargs) + + raise ValueError("invalid method '{0}' to interpolate.".format(method)) + + def _interpolate_with_fill(self, method='pad', axis=0, inplace=False, + limit=None, fill_value=None, coerce=False, downcast=None): + """ fillna but using the interpolate machinery """ # if we are coercing, then don't force the conversion # if the block can't hold the type @@ -745,6 +782,44 @@ def interpolate(self, method='pad', axis=0, inplace=False, blocks = [ make_block(values, self.items, self.ref_items, ndim=self.ndim, klass=self.__class__, fastpath=True) ] return self._maybe_downcast(blocks, downcast) + def _interpolate(self, method=None, index=None, values=None, + fill_value=None, axis=0, limit=None, + inplace=False, downcast=None, **kwargs): + """ interpolate using scipy wrappers """ + + data = self.values if inplace else self.values.copy() + + # only deal with floats + if not self.is_float: + if not self.is_integer: + return self + data = data.astype(np.float64) + + if fill_value is None: + fill_value = np.nan + + if method in ('krogh', 'piecewise_polynomial', 'pchip'): + if not index.is_monotonic: + raise ValueError("{0} interpolation requires that the " + "index be monotonic.".format(method)) + # process 1-d slices in the axis direction + + def func(x): + + # process a 1-d slice, returning it + # should the axis argument be handled below in apply_along_axis? + # i.e. 
not an arg to com.interpolate_1d + return com.interpolate_1d(index, x, method=method, limit=limit, + fill_value=fill_value, bounds_error=False, + **kwargs) + + # interp each column independently + interp_values = np.apply_along_axis(func, axis, data) + + blocks = [make_block(interp_values, self.items, self.ref_items, + ndim=self.ndim, klass=self.__class__, fastpath=True)] + return self._maybe_downcast(blocks, downcast) + def take(self, indexer, ref_items, axis=1): if axis < 1: raise AssertionError('axis must be at least 1, got %d' % axis) diff --git a/pandas/core/series.py b/pandas/core/series.py index d185939d6abc9..3b4dcfba086ee 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2251,57 +2251,6 @@ def asof(self, where): new_values = com.take_1d(values, locs) return self._constructor(new_values, index=where, name=self.name) - def interpolate(self, method='linear'): - """ - Interpolate missing values (after the first valid value) - - Parameters - ---------- - method : {'linear', 'time', 'values'} - Interpolation method. 
- 'time' interpolation works on daily and higher resolution - data to interpolate given length of interval - 'values' using the actual index numeric values - - Returns - ------- - interpolated : Series - """ - if method == 'time': - if not self.is_time_series: - raise Exception('time-weighted interpolation only works' - 'on TimeSeries') - method = 'values' - # inds = pa.array([d.toordinal() for d in self.index]) - - if method == 'values': - inds = self.index.values - # hack for DatetimeIndex, #1646 - if issubclass(inds.dtype.type, np.datetime64): - inds = inds.view(pa.int64) - - if inds.dtype == np.object_: - inds = lib.maybe_convert_objects(inds) - else: - inds = pa.arange(len(self)) - - values = self.values - - invalid = isnull(values) - valid = -invalid - - result = values.copy() - if valid.any(): - firstIndex = valid.argmax() - valid = valid[firstIndex:] - invalid = invalid[firstIndex:] - inds = inds[firstIndex:] - - result[firstIndex:][invalid] = np.interp( - inds[invalid], inds[valid], values[firstIndex:][valid]) - - return self._constructor(result, index=self.index, name=self.name) - @property def weekday(self): return self._constructor([d.weekday() for d in self.index], index=self.index) diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index b8c143e10111d..e9902bf4d1195 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -6,6 +6,7 @@ import nose import numpy as np +from numpy import nan import pandas as pd from pandas import (Index, Series, DataFrame, Panel, @@ -24,6 +25,20 @@ ensure_clean) import pandas.util.testing as tm + +def _skip_if_no_scipy(): + try: + import scipy.interpolate + except ImportError: + raise nose.SkipTest('scipy.interpolate missing') + + +def _skip_if_no_pchip(): + try: + from scipy.interpolate import pchip_interpolate + except ImportError: + raise nose.SkipTest('scipy.interpolate.pchip missing') + #------------------------------------------------------------------------------ # 
Generic types test cases @@ -173,6 +188,13 @@ class TestSeries(unittest.TestCase, Generic): _typ = Series _comparator = lambda self, x, y: assert_series_equal(x,y) + def setUp(self): + self.ts = tm.makeTimeSeries() # Was at top level in test_series + self.ts.name = 'ts' + + self.series = tm.makeStringSeries() + self.series.name = 'series' + def test_rename_mi(self): s = Series([11,21,31], index=MultiIndex.from_tuples([("A",x) for x in ["a","B","c"]])) @@ -230,6 +252,152 @@ def test_nonzero_single_element(self): self.assertRaises(ValueError, lambda : bool(s)) self.assertRaises(ValueError, lambda : s.bool()) + def test_interpolate(self): + ts = Series(np.arange(len(self.ts), dtype=float), self.ts.index) + + ts_copy = ts.copy() + ts_copy[5:10] = np.NaN + + linear_interp = ts_copy.interpolate(method='linear') + self.assert_(np.array_equal(linear_interp, ts)) + + ord_ts = Series([d.toordinal() for d in self.ts.index], + index=self.ts.index).astype(float) + + ord_ts_copy = ord_ts.copy() + ord_ts_copy[5:10] = np.NaN + + time_interp = ord_ts_copy.interpolate(method='time') + self.assert_(np.array_equal(time_interp, ord_ts)) + + # try time interpolation on a non-TimeSeries + self.assertRaises(ValueError, self.series.interpolate, method='time') + + def test_interpolate_corners(self): + s = Series([np.nan, np.nan]) + assert_series_equal(s.interpolate(), s) + + s = Series([]).interpolate() + assert_series_equal(s.interpolate(), s) + + _skip_if_no_scipy() + s = Series([np.nan, np.nan]) + assert_series_equal(s.interpolate(method='polynomial', order=1), s) + + s = Series([]).interpolate() + assert_series_equal(s.interpolate(method='polynomial', order=1), s) + + def test_interpolate_index_values(self): + s = Series(np.nan, index=np.sort(np.random.rand(30))) + s[::3] = np.random.randn(10) + + vals = s.index.values.astype(float) + + result = s.interpolate(method='values') + + expected = s.copy() + bad = isnull(expected.values) + good = -bad + expected = Series( + 
np.interp(vals[bad], vals[good], s.values[good]), index=s.index[bad]) + + assert_series_equal(result[bad], expected) + + def test_interpolate_non_ts(self): + s = Series([1, 3, np.nan, np.nan, np.nan, 11]) + with tm.assertRaises(ValueError): + s.interpolate(method='time') + + # New interpolation tests + def test_nan_interpolate(self): + s = Series([0, 1, np.nan, 3]) + result = s.interpolate() + expected = Series([0, 1, 2, 3]) + assert_series_equal(result, expected) + + _skip_if_no_scipy() + result = s.interpolate(method='polynomial', order=1) + assert_series_equal(result, expected) + + def test_nan_irregular_index(self): + s = Series([1, 2, np.nan, 4], index=[1, 3, 5, 9]) + result = s.interpolate() + expected = Series([1, 2, 3, 4], index=[1, 3, 5, 9]) + assert_series_equal(result, expected) + + def test_nan_str_index(self): + s = Series([0, 1, 2, np.nan], index=list('abcd')) + result = s.interpolate() + expected = Series([0, 1, 2, 2], index=list('abcd')) + assert_series_equal(result, expected) + + def test_interp_quad(self): + _skip_if_no_scipy() + sq = Series([1, 4, np.nan, 16], index=[1, 2, 3, 4]) + result = sq.interpolate(method='quadratic') + expected = Series([1., 4., 9., 16.], index=[1, 2, 3, 4]) + assert_series_equal(result, expected) + + def test_interp_scipy_basic(self): + _skip_if_no_scipy() + s = Series([1, 3, np.nan, 12, np.nan, 25]) + # slinear + expected = Series([1., 3., 7.5, 12., 18.5, 25.]) + result = s.interpolate(method='slinear') + assert_series_equal(result, expected) + # nearest + expected = Series([1, 3, 3, 12, 12, 25]) + result = s.interpolate(method='nearest') + assert_series_equal(result, expected) + # zero + expected = Series([1, 3, 3, 12, 12, 25]) + result = s.interpolate(method='zero') + assert_series_equal(result, expected) + # quadratic + expected = Series([1, 3., 6.769231, 12., 18.230769, 25.]) + result = s.interpolate(method='quadratic') + assert_series_equal(result, expected) + # cubic + expected = Series([1., 3., 6.8, 12., 18.2, 
25.]) + result = s.interpolate(method='cubic') + assert_series_equal(result, expected) + + def test_interp_limit(self): + s = Series([1, 3, np.nan, np.nan, np.nan, 11]) + expected = Series([1., 3., 5., 7., np.nan, 11.]) + result = s.interpolate(method='linear', limit=2) + assert_series_equal(result, expected) + + def test_interp_all_good(self): + # scipy + _skip_if_no_scipy() + s = Series([1, 2, 3]) + result = s.interpolate(method='polynomial', order=1) + assert_series_equal(result, s) + + # non-scipy + result = s.interpolate() + assert_series_equal(result, s) + + def test_interp_multiIndex(self): + idx = MultiIndex.from_tuples([(0, 'a'), (1, 'b'), (2, 'c')]) + s = Series([1, 2, np.nan], index=idx) + + expected = s.copy() + expected.loc[2] = 2 + expected = expected.astype(np.int64) + result = s.interpolate() + assert_series_equal(result, expected) + + _skip_if_no_scipy() + with tm.assertRaises(ValueError): + s.interpolate(method='polynomial', order=1) + + def test_interp_nonmono_raise(self): + _skip_if_no_scipy() + s = pd.Series([1, 2, 3], index=[0, 2, 1]) + with tm.assertRaises(ValueError): + s.interpolate(method='krogh') class TestDataFrame(unittest.TestCase, Generic): _typ = DataFrame @@ -256,14 +424,178 @@ def test_nonzero_single_element(self): def test_get_numeric_data_preserve_dtype(self): # get the numeric data - o = DataFrame({'A' : [1,'2',3.] 
}) + o = DataFrame({'A': [1, '2', 3.]}) result = o._get_numeric_data() - expected = DataFrame(index=[0,1,2],dtype=object) + expected = DataFrame(index=[0, 1, 2], dtype=object) self._compare(result, expected) + def test_interp_basic(self): + df = DataFrame({'A': [1, 2, np.nan, 4], 'B': [1, 4, 9, np.nan], + 'C': [1, 2, 3, 5], 'D': list('abcd')}) + expected = DataFrame({'A': [1, 2, 3, 4], 'B': [1, 4, 9, 9], + 'C': [1, 2, 3, 5], 'D': list('abcd')}) + result = df.interpolate() + assert_frame_equal(result, expected) + + result = df.set_index('C').interpolate() + expected = df.set_index('C') + expected.A.loc[3] = 3 + expected.B.loc[5] = 9 + expected[['A', 'B']] = expected[['A', 'B']].astype(np.int64) + + assert_frame_equal(result, expected) + + def test_interp_bad_method(self): + df = DataFrame({'A': [1, 2, np.nan, 4], 'B': [1, 4, 9, np.nan], + 'C': [1, 2, 3, 5], 'D': list('abcd')}) + with tm.assertRaises(ValueError): + df.interpolate(method='not_a_method') + + def test_interp_combo(self): + df = DataFrame({'A': [1., 2., np.nan, 4.], 'B': [1, 4, 9, np.nan], + 'C': [1, 2, 3, 5], 'D': list('abcd')}) + + result = df['A'].interpolate() + expected = Series([1, 2, 3, 4]) + assert_series_equal(result, expected) + + def test_interp_nan_idx(self): + df = DataFrame({'A': [1, 2, np.nan, 4], 'B': [np.nan, 2, 3, 4]}) + df = df.set_index('A') + with tm.assertRaises(NotImplementedError): + df.interpolate(method='values') + + def test_interp_various(self): + _skip_if_no_scipy() + df = DataFrame({'A': [1, 2, np.nan, 4, 5, np.nan, 7], + 'C': [1, 2, 3, 5, 8, 13, 21]}) + df = df.set_index('C') + expected = df.copy() + result = df.interpolate(method='polynomial', order=1) + + expected.A.loc[3] = 2.66666667 + expected.A.loc[13] = 5.76923076 + assert_frame_equal(result, expected) + + result = df.interpolate(method='cubic') + expected.A.loc[3] = 2.81621174 + expected.A.loc[13] = 5.64146581 + assert_frame_equal(result, expected) + + result = df.interpolate(method='nearest') + expected.A.loc[3] = 
2 + expected.A.loc[13] = 5 + assert_frame_equal(result, expected, check_dtype=False) + + result = df.interpolate(method='quadratic') + expected.A.loc[3] = 2.82533638 + expected.A.loc[13] = 6.02817974 + assert_frame_equal(result, expected) + + result = df.interpolate(method='slinear') + expected.A.loc[3] = 2.66666667 + expected.A.loc[13] = 5.76923077 + assert_frame_equal(result, expected) + + result = df.interpolate(method='zero') + expected.A.loc[3] = 2. + expected.A.loc[13] = 5 + assert_frame_equal(result, expected, check_dtype=False) + + result = df.interpolate(method='quadratic') + expected.A.loc[3] = 2.82533638 + expected.A.loc[13] = 6.02817974 + assert_frame_equal(result, expected) + + def test_interp_alt_scipy(self): + _skip_if_no_scipy() + df = DataFrame({'A': [1, 2, np.nan, 4, 5, np.nan, 7], + 'C': [1, 2, 3, 5, 8, 13, 21]}) + result = df.interpolate(method='barycentric') + expected = df.copy() + expected['A'].iloc[2] = 3 + expected['A'].iloc[5] = 6 + assert_frame_equal(result, expected) + + result = df.interpolate(method='krogh') + expectedk = df.copy() + expectedk['A'].iloc[2] = 3 + expectedk['A'].iloc[5] = 6 + expectedk['A'] = expected['A'].astype(np.int64) + assert_frame_equal(result, expectedk) + + _skip_if_no_pchip() + result = df.interpolate(method='pchip') + expected['A'].iloc[2] = 3 + expected['A'].iloc[5] = 6.125 + assert_frame_equal(result, expected) + + def test_interp_rowwise(self): + df = DataFrame({0: [1, 2, np.nan, 4], + 1: [2, 3, 4, np.nan], + 2: [np.nan, 4, 5, 6], + 3: [4, np.nan, 6, 7], + 4: [1, 2, 3, 4]}) + result = df.interpolate(axis=1) + expected = df.copy() + expected[1].loc[3] = 5 + expected[2].loc[0] = 3 + expected[3].loc[1] = 3 + expected[4] = expected[4].astype(np.float64) + assert_frame_equal(result, expected) + + # scipy route + _skip_if_no_scipy() + result = df.interpolate(axis=1, method='values') + assert_frame_equal(result, expected) + + result = df.interpolate(axis=0) + expected = df.interpolate() + 
assert_frame_equal(result, expected) + + def test_rowwise_alt(self): + df = DataFrame({0: [0, .5, 1., np.nan, 4, 8, np.nan, np.nan, 64], + 1: [1, 2, 3, 4, 3, 2, 1, 0, -1]}) + df.interpolate(axis=0) + + def test_interp_leading_nans(self): + df = DataFrame({"A": [np.nan, np.nan, .5, .25, 0], + "B": [np.nan, -3, -3.5, np.nan, -4]}) + result = df.interpolate() + expected = df.copy() + expected['B'].loc[3] = -3.75 + assert_frame_equal(result, expected) + + _skip_if_no_scipy() + result = df.interpolate(method='polynomial', order=1) + assert_frame_equal(result, expected) + + def test_interp_raise_on_only_mixed(self): + df = DataFrame({'A': [1, 2, np.nan, 4], 'B': ['a', 'b', 'c', 'd'], + 'C': [np.nan, 2, 5, 7], 'D': [np.nan, np.nan, 9, 9], + 'E': [1, 2, 3, 4]}) + with tm.assertRaises(TypeError): + df.interpolate(axis=1) + + def test_no_order(self): + _skip_if_no_scipy() + s = Series([0, 1, np.nan, 3]) + with tm.assertRaises(ValueError): + s.interpolate(method='polynomial') + with tm.assertRaises(ValueError): + s.interpolate(method='spline') + + def test_spline(self): + _skip_if_no_scipy() + s = Series([1, 2, np.nan, 4, 5, np.nan, 7]) + result = s.interpolate(method='spline', order=1) + expected = Series([1., 2, 3, 4, 5, 6, 7]) # dtype? 
+ assert_series_equal(result, expected) + + class TestPanel(unittest.TestCase, Generic): _typ = Panel - _comparator = lambda self, x, y: assert_panel_equal(x,y) + _comparator = lambda self, x, y: assert_panel_equal(x, y) if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 8abc068fd6d24..6f4725f7db725 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -2484,72 +2484,145 @@ def test_timedelta_fillna(self): raise nose.SkipTest("timedelta broken in np 1.6.1") #GH 3371 - from datetime import timedelta - - s = Series([Timestamp('20130101'),Timestamp('20130101'),Timestamp('20130102'),Timestamp('20130103 9:01:01')]) + s = Series([Timestamp('20130101'), Timestamp('20130101'), + Timestamp('20130102'), Timestamp('20130103 9:01:01')]) td = s.diff() # reg fillna result = td.fillna(0) - expected = Series([timedelta(0),timedelta(0),timedelta(1),timedelta(days=1,seconds=9*3600+60+1)]) - assert_series_equal(result,expected) + expected = Series([timedelta(0), timedelta(0), timedelta(1), + timedelta(days=1, seconds=9*3600+60+1)]) + assert_series_equal(result, expected) # interprested as seconds result = td.fillna(1) - expected = Series([timedelta(seconds=1),timedelta(0),timedelta(1),timedelta(days=1,seconds=9*3600+60+1)]) - assert_series_equal(result,expected) + expected = Series([timedelta(seconds=1), timedelta(0), + timedelta(1), timedelta(days=1, seconds=9*3600+60+1)]) + assert_series_equal(result, expected) - result = td.fillna(timedelta(days=1,seconds=1)) - expected = Series([timedelta(days=1,seconds=1),timedelta(0),timedelta(1),timedelta(days=1,seconds=9*3600+60+1)]) - assert_series_equal(result,expected) + result = td.fillna(timedelta(days=1, seconds=1)) + expected = Series([timedelta(days=1, seconds=1), timedelta(0), + timedelta(1), timedelta(days=1, seconds=9*3600+60+1)]) + assert_series_equal(result, expected) result = 
td.fillna(np.timedelta64(int(1e9))) - expected = Series([timedelta(seconds=1),timedelta(0),timedelta(1),timedelta(days=1,seconds=9*3600+60+1)]) - assert_series_equal(result,expected) + expected = Series([timedelta(seconds=1), timedelta(0), timedelta(1), + timedelta(days=1, seconds=9*3600+60+1)]) + assert_series_equal(result, expected) from pandas import tslib result = td.fillna(tslib.NaT) - expected = Series([tslib.NaT,timedelta(0),timedelta(1),timedelta(days=1,seconds=9*3600+60+1)],dtype='m8[ns]') - assert_series_equal(result,expected) + expected = Series([tslib.NaT, timedelta(0), timedelta(1), + timedelta(days=1, seconds=9*3600+60+1)], dtype='m8[ns]') + assert_series_equal(result, expected) # ffill td[2] = np.nan result = td.ffill() expected = td.fillna(0) expected[0] = np.nan - assert_series_equal(result,expected) + assert_series_equal(result, expected) # bfill td[2] = np.nan result = td.bfill() expected = td.fillna(0) - expected[2] = timedelta(days=1,seconds=9*3600+60+1) - assert_series_equal(result,expected) + expected[2] = timedelta(days=1, seconds=9*3600+60+1) + assert_series_equal(result, expected) def test_datetime64_fillna(self): - s = Series([Timestamp('20130101'),Timestamp('20130101'),Timestamp('20130102'),Timestamp('20130103 9:01:01')]) + s = Series([Timestamp('20130101'), Timestamp('20130101'), + Timestamp('20130102'), Timestamp('20130103 9:01:01')]) s[2] = np.nan # reg fillna result = s.fillna(Timestamp('20130104')) - expected = Series([Timestamp('20130101'),Timestamp('20130101'),Timestamp('20130104'),Timestamp('20130103 9:01:01')]) - assert_series_equal(result,expected) + expected = Series([Timestamp('20130101'), Timestamp('20130101'), + Timestamp('20130104'), Timestamp('20130103 9:01:01')]) + assert_series_equal(result, expected) from pandas import tslib result = s.fillna(tslib.NaT) expected = s - assert_series_equal(result,expected) + assert_series_equal(result, expected) # ffill result = s.ffill() - expected = 
Series([Timestamp('20130101'),Timestamp('20130101'),Timestamp('20130101'),Timestamp('20130103 9:01:01')]) - assert_series_equal(result,expected) + expected = Series([Timestamp('20130101'), Timestamp('20130101'), + Timestamp('20130101'), Timestamp('20130103 9:01:01')]) + assert_series_equal(result, expected) # bfill result = s.bfill() - expected = Series([Timestamp('20130101'),Timestamp('20130101'),Timestamp('20130103 9:01:01'),Timestamp('20130103 9:01:01')]) - assert_series_equal(result,expected) + expected = Series([Timestamp('20130101'), Timestamp('20130101'), + Timestamp('20130103 9:01:01'), + Timestamp('20130103 9:01:01')]) + assert_series_equal(result, expected) + + def test_fillna_int(self): + s = Series(np.random.randint(-100, 100, 50)) + s.fillna(method='ffill', inplace=True) + assert_series_equal(s.fillna(method='ffill', inplace=False), s) + + def test_fillna_raise(self): + s = Series(np.random.randint(-100, 100, 50)) + self.assertRaises(TypeError, s.fillna, [1, 2]) + self.assertRaises(TypeError, s.fillna, (1, 2)) + +# TimeSeries-specific + + def test_fillna(self): + ts = Series([0., 1., 2., 3., 4.], index=tm.makeDateIndex(5)) + + self.assert_(np.array_equal(ts, ts.fillna(method='ffill'))) + + ts[2] = np.NaN + + self.assert_( + np.array_equal(ts.fillna(method='ffill'), [0., 1., 1., 3., 4.])) + self.assert_(np.array_equal(ts.fillna(method='backfill'), + [0., 1., 3., 3., 4.])) + + self.assert_(np.array_equal(ts.fillna(value=5), [0., 1., 5., 3., 4.])) + + self.assertRaises(ValueError, ts.fillna) + self.assertRaises(ValueError, self.ts.fillna, value=0, method='ffill') + + def test_fillna_bug(self): + x = Series([nan, 1., nan, 3., nan], ['z', 'a', 'b', 'c', 'd']) + filled = x.fillna(method='ffill') + expected = Series([nan, 1., 1., 3., 3.], x.index) + assert_series_equal(filled, expected) + + filled = x.fillna(method='bfill') + expected = Series([1., 1., 3., 3., nan], x.index) + assert_series_equal(filled, expected) + + def test_fillna_inplace(self): + x = 
Series([nan, 1., nan, 3., nan], ['z', 'a', 'b', 'c', 'd']) + y = x.copy() + + y.fillna(value=0, inplace=True) + + expected = x.fillna(value=0) + assert_series_equal(y, expected) + + def test_fillna_invalid_method(self): + try: + self.ts.fillna(method='ffil') + except ValueError as inst: + self.assert_('ffil' in str(inst)) + + def test_ffill(self): + ts = Series([0., 1., 2., 3., 4.], index=tm.makeDateIndex(5)) + ts[2] = np.NaN + assert_series_equal(ts.ffill(), ts.fillna(method='ffill')) + + def test_bfill(self): + ts = Series([0., 1., 2., 3., 4.], index=tm.makeDateIndex(5)) + ts[2] = np.NaN + assert_series_equal(ts.bfill(), ts.fillna(method='bfill')) def test_sub_of_datetime_from_TimeSeries(self): from pandas.tseries.timedeltas import _possibly_cast_to_timedelta @@ -4718,84 +4791,8 @@ def test_isin_with_string_scalar(self): s = Series(['aaa', 'b', 'c']) s.isin('aaa') - def test_fillna_int(self): - s = Series(np.random.randint(-100, 100, 50)) - s.fillna(method='ffill', inplace=True) - assert_series_equal(s.fillna(method='ffill', inplace=False), s) - - def test_fillna_raise(self): - s = Series(np.random.randint(-100, 100, 50)) - self.assertRaises(TypeError, s.fillna, [1, 2]) - self.assertRaises(TypeError, s.fillna, (1, 2)) - #------------------------------------------------------------------------------ # TimeSeries-specific - - def test_fillna(self): - ts = Series([0., 1., 2., 3., 4.], index=tm.makeDateIndex(5)) - - self.assert_(np.array_equal(ts, ts.fillna(method='ffill'))) - - ts[2] = np.NaN - - self.assert_( - np.array_equal(ts.fillna(method='ffill'), [0., 1., 1., 3., 4.])) - self.assert_(np.array_equal(ts.fillna(method='backfill'), - [0., 1., 3., 3., 4.])) - - self.assert_(np.array_equal(ts.fillna(value=5), [0., 1., 5., 3., 4.])) - - self.assertRaises(ValueError, ts.fillna) - self.assertRaises(ValueError, self.ts.fillna, value=0, method='ffill') - - def test_fillna_bug(self): - x = Series([nan, 1., nan, 3., nan], ['z', 'a', 'b', 'c', 'd']) - filled = 
x.fillna(method='ffill') - expected = Series([nan, 1., 1., 3., 3.], x.index) - assert_series_equal(filled, expected) - - filled = x.fillna(method='bfill') - expected = Series([1., 1., 3., 3., nan], x.index) - assert_series_equal(filled, expected) - - def test_fillna_inplace(self): - x = Series([nan, 1., nan, 3., nan], ['z', 'a', 'b', 'c', 'd']) - y = x.copy() - - y.fillna(value=0, inplace=True) - - expected = x.fillna(value=0) - assert_series_equal(y, expected) - - def test_fillna_empty(self): - # GH 4346 - - empty = Series() - - result = empty.reindex([1, 2, 3]) - expected = Series([np.nan, np.nan, np.nan], index=[1, 2, 3]) - assert_series_equal(result, expected) - - result = empty.reindex([1, 2, 3], fill_value=0.0) - expected = Series([0.0, 0.0, 0.0], index=[1, 2, 3]) - assert_series_equal(result, expected) - - def test_fillna_invalid_method(self): - try: - self.ts.fillna(method='ffil') - except ValueError as inst: - self.assert_('ffil' in str(inst)) - - def test_ffill(self): - ts = Series([0., 1., 2., 3., 4.], index=tm.makeDateIndex(5)) - ts[2] = np.NaN - assert_series_equal(ts.ffill(), ts.fillna(method='ffill')) - - def test_bfill(self): - ts = Series([0., 1., 2., 3., 4.], index=tm.makeDateIndex(5)) - ts[2] = np.NaN - assert_series_equal(ts.bfill(), ts.fillna(method='bfill')) - def test_cummethods_bool(self): def cummin(x): return np.minimum.accumulate(x) @@ -4974,50 +4971,6 @@ def test_asfreq(self): self.assert_(len(result) == 0) self.assert_(result is not ts) - def test_interpolate(self): - ts = Series(np.arange(len(self.ts), dtype=float), self.ts.index) - - ts_copy = ts.copy() - ts_copy[5:10] = np.NaN - - linear_interp = ts_copy.interpolate(method='linear') - self.assert_(np.array_equal(linear_interp, ts)) - - ord_ts = Series([d.toordinal() for d in self.ts.index], - index=self.ts.index).astype(float) - - ord_ts_copy = ord_ts.copy() - ord_ts_copy[5:10] = np.NaN - - time_interp = ord_ts_copy.interpolate(method='time') - 
self.assert_(np.array_equal(time_interp, ord_ts)) - - # try time interpolation on a non-TimeSeries - self.assertRaises(Exception, self.series.interpolate, method='time') - - def test_interpolate_corners(self): - s = Series([np.nan, np.nan]) - assert_series_equal(s.interpolate(), s) - - s = Series([]).interpolate() - assert_series_equal(s.interpolate(), s) - - def test_interpolate_index_values(self): - s = Series(np.nan, index=np.sort(np.random.rand(30))) - s[::3] = np.random.randn(10) - - vals = s.index.values.astype(float) - - result = s.interpolate(method='values') - - expected = s.copy() - bad = isnull(expected.values) - good = -bad - expected = Series( - np.interp(vals[bad], vals[good], s.values[good]), index=s.index[bad]) - - assert_series_equal(result[bad], expected) - def test_weekday(self): # Just run the function weekdays = self.ts.weekday