ENH limit_area added to interpolate1d

closes #16284
pandas-dev · Feb 1, 2018 · 35812ea · 35812ea
1 parent 09307dd
commit 35812ea
Show file tree

Hide file tree

Showing 7 changed files with 208 additions and 73 deletions.
diff --git a/doc/source/missing_data.rst b/doc/source/missing_data.rst
@@ -190,7 +190,7 @@ Sum/Prod of Empties/Nans
 .. warning::
 
    This behavior is now standard as of v0.21.0; previously sum/prod would give different
-   results if the ``bottleneck`` package was installed. 
+   results if the ``bottleneck`` package was installed.
    See the :ref:`v0.21.0 whatsnew <whatsnew_0210.api_breaking.bottleneck>`.
 
 With ``sum`` or ``prod`` on an empty or all-``NaN`` ``Series``, or columns of a ``DataFrame``, the result will be all-``NaN``.
@@ -353,7 +353,11 @@ examined :ref:`in the API <api.dataframe.missing>`.
 Interpolation
 ~~~~~~~~~~~~~
 
-Both Series and DataFrame objects have an :meth:`~DataFrame.interpolate` method 
+.. versionadded:: 0.21.0
+
+  The ``limit_area`` keyword argument was added.
+
+Both Series and DataFrame objects have an :meth:`~DataFrame.interpolate` method
 that, by default, performs linear interpolation at missing datapoints.
 
 .. ipython:: python
@@ -477,33 +481,54 @@ at the new values.
 .. _documentation: http://docs.scipy.org/doc/scipy/reference/interpolate.html#univariate-interpolation
 .. _guide: http://docs.scipy.org/doc/scipy/reference/tutorial/interpolate.html
 
+.. _missing_data.interp_limits:
+
 Interpolation Limits
 ^^^^^^^^^^^^^^^^^^^^
 
 Like other pandas fill methods, ``interpolate`` accepts a ``limit`` keyword
-argument. Use this argument to limit the number of consecutive interpolations,
-keeping ``NaN`` values for interpolations that are too far from the last valid
-observation:
+argument. Use this argument to limit the number of consecutive ``NaN`` values
+filled since the last valid observation:
 
 .. ipython:: python
 
-   ser = pd.Series([np.nan, np.nan, 5, np.nan, np.nan, np.nan, 13])
-   ser.interpolate(limit=2)
+   ser = pd.Series([np.nan, np.nan, 5, np.nan, np.nan, np.nan, 13, np.nan, np.nan])
 
-By default, ``limit`` applies in a forward direction, so that only ``NaN``
-values after a non-``NaN`` value can be filled. If you provide ``'backward'`` or
-``'both'`` for the ``limit_direction`` keyword argument, you can fill ``NaN``
-values before non-``NaN`` values, or both before and after non-``NaN`` values,
-respectively:
+   # fill all consecutive values in a forward direction
+   ser.interpolate()
 
-.. ipython:: python
+   # fill one consecutive value in a forward direction
+   ser.interpolate(limit=1)
+
+By default, ``NaN`` values are filled in a ``forward`` direction. Use
+``limit_direction`` parameter to fill ``backward`` or from ``both`` directions.
 
-   ser.interpolate(limit=1)  # limit_direction == 'forward'
+.. ipython:: python
 
+   # fill one consecutive value backwards
    ser.interpolate(limit=1, limit_direction='backward')
 
+   # fill one consecutive value in both directions
    ser.interpolate(limit=1, limit_direction='both')
 
+   # fill all consecutive values in both directions
+   ser.interpolate(limit_direction='both')
+
+By default, ``NaN`` values are filled whether they are inside (surrounded by)
+existing valid values, or outside existing valid values. Introduced in v0.23
+the ``limit_area`` parameter restricts filling to either inside or outside values.
+
+.. ipython:: python
+
+   # fill one consecutive inside value in both directions
+   ser.interpolate(limit_direction='both', limit_area='inside', limit=1)
+
+   # fill all consecutive outside values backward
+   ser.interpolate(limit_direction='backward', limit_area='outside')
+
+   # fill all consecutive outside values in both directions
+   ser.interpolate(limit_direction='both', limit_area='outside')
+
 .. _missing_data.replace:
 
 Replacing Generic Values

diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
@@ -13,10 +13,38 @@ version.
 New features
 ~~~~~~~~~~~~
 
--
--
--
+.. _whatsnew_0210.enhancements.limit_area:
+
+``DataFrame.interpolate`` has gained the ``limit_area`` kwarg
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+:meth:`DataFrame.interpolate` has gained a ``limit_area`` parameter to allow further control of which ``NaN`` s are replaced.
+Use `limit_area='inside'` to fill only NaNs surrounded by valid values or use `limit_area='outside'` to fill only ``NaN`` s
+outside the existing valid values while preserving those inside.  (:issue:`16284`) See the :ref:`full documentation here <missing_data.interp_limits>`.
+
 
+.. ipython:: python
+
+   ser = pd.Series([np.nan, np.nan, 5, np.nan, np.nan, np.nan, 13, np.nan, np.nan])
+   ser
+
+Fill one consecutive inside value in both directions
+
+.. ipython:: python
+
+   ser.interpolate(limit_direction='both', limit_area='inside', limit=1)
+
+Fill all consecutive outside values backward
+
+.. ipython:: python
+
+   ser.interpolate(limit_direction='backward', limit_area='outside')
+
+Fill all consecutive outside values in both directions
+
+.. ipython:: python
+
+   ser.interpolate(limit_direction='both', limit_area='outside')
 
 .. _whatsnew_0210.enhancements.get_dummies_dtype:
 
@@ -207,6 +235,7 @@ Other Enhancements
   :func:`pandas.api.extensions.register_index_accessor`, accessor for libraries downstream of pandas
   to register custom accessors like ``.cat`` on pandas objects. See
   :ref:`Registering Custom Accessors <developer.register-accessors>` for more (:issue:`14781`).
+
 - ``IntervalIndex.astype`` now supports conversions between subtypes when passed an ``IntervalDtype`` (:issue:`19197`)
 - :class:`IntervalIndex` and its associated constructor methods (``from_arrays``, ``from_breaks``, ``from_tuples``) have gained a ``dtype`` parameter (:issue:`19262`)
 

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -5085,6 +5085,12 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None,
         limit : int, default None.
             Maximum number of consecutive NaNs to fill. Must be greater than 0.
         limit_direction : {'forward', 'backward', 'both'}, default 'forward'
+        limit_area : {'inside', 'outside'}, default None
+            * None: (default) no fill restriction
+            * 'inside' Only fill NaNs surrounded by valid values (interpolate).
+            * 'outside' Only fill NaNs outside valid values (extrapolate).
+            .. versionadded:: 0.21.0
+
             If limit is specified, consecutive NaNs will be filled in this
             direction.
         inplace : bool, default False
@@ -5118,7 +5124,8 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None,
 
     @Appender(_shared_docs['interpolate'] % _shared_doc_kwargs)
     def interpolate(self, method='linear', axis=0, limit=None, inplace=False,
-                    limit_direction='forward', downcast=None, **kwargs):
+                    limit_direction='forward', limit_area=None,
+                    downcast=None, **kwargs):
         """
         Interpolate values according to different methods.
         """
@@ -5167,6 +5174,7 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False,
         new_data = data.interpolate(method=method, axis=ax, index=index,
                                     values=_maybe_transposed_self, limit=limit,
                                     limit_direction=limit_direction,
+                                    limit_area=limit_area,
                                     inplace=inplace, downcast=downcast,
                                     **kwargs)
 

diff --git a/pandas/core/internals.py b/pandas/core/internals.py
@@ -1073,8 +1073,8 @@ def coerce_to_target_dtype(self, other):
 
     def interpolate(self, method='pad', axis=0, index=None, values=None,
                     inplace=False, limit=None, limit_direction='forward',
-                    fill_value=None, coerce=False, downcast=None, mgr=None,
-                    **kwargs):
+                    limit_area=None, fill_value=None, coerce=False,
+                    downcast=None, mgr=None, **kwargs):
 
         inplace = validate_bool_kwarg(inplace, 'inplace')
 
@@ -1115,6 +1115,7 @@ def check_int_bool(self, inplace):
             return self._interpolate(method=m, index=index, values=values,
                                      axis=axis, limit=limit,
                                      limit_direction=limit_direction,
+                                     limit_area=limit_area,
                                      fill_value=fill_value, inplace=inplace,
                                      downcast=downcast, mgr=mgr, **kwargs)
 
@@ -1148,8 +1149,8 @@ def _interpolate_with_fill(self, method='pad', axis=0, inplace=False,
 
     def _interpolate(self, method=None, index=None, values=None,
                      fill_value=None, axis=0, limit=None,
-                     limit_direction='forward', inplace=False, downcast=None,
-                     mgr=None, **kwargs):
+                     limit_direction='forward', limit_area=None,
+                     inplace=False, downcast=None, mgr=None, **kwargs):
         """ interpolate using scipy wrappers """
 
         inplace = validate_bool_kwarg(inplace, 'inplace')
@@ -1177,6 +1178,7 @@ def func(x):
             # i.e. not an arg to missing.interpolate_1d
             return missing.interpolate_1d(index, x, method=method, limit=limit,
                                           limit_direction=limit_direction,
+                                          limit_area=limit_area,
                                           fill_value=fill_value,
                                           bounds_error=False, **kwargs)
 

diff --git a/pandas/core/missing.py b/pandas/core/missing.py
@@ -111,7 +111,7 @@ def clean_interp_method(method, **kwargs):
 
 
 def interpolate_1d(xvalues, yvalues, method='linear', limit=None,
-                   limit_direction='forward', fill_value=None,
+                   limit_direction='forward', limit_area=None, fill_value=None,
                    bounds_error=False, order=None, **kwargs):
     """
     Logic for the 1-d interpolation.  The result should be 1-d, inputs
@@ -151,28 +151,12 @@ def interpolate_1d(xvalues, yvalues, method='linear', limit=None,
         raise ValueError(msg.format(valid=valid_limit_directions,
                                     invalid=limit_direction))
 
-    from pandas import Series
-    ys = Series(yvalues)
-    start_nans = set(range(ys.first_valid_index()))
-    end_nans = set(range(1 + ys.last_valid_index(), len(valid)))
-
-    # violate_limit is a list of the indexes in the series whose yvalue is
-    # currently NaN, and should still be NaN after the interpolation.
-    # Specifically:
-    #
-    # If limit_direction='forward' or None then the list will contain NaNs at
-    # the beginning of the series, and NaNs that are more than 'limit' away
-    # from the prior non-NaN.
-    #
-    # If limit_direction='backward' then the list will contain NaNs at
-    # the end of the series, and NaNs that are more than 'limit' away
-    # from the subsequent non-NaN.
-    #
-    # If limit_direction='both' then the list will contain NaNs that
-    # are more than 'limit' away from any non-NaN.
-    #
-    # If limit=None, then use default behavior of filling an unlimited number
-    # of NaNs in the direction specified by limit_direction
+    if limit_area is not None:
+        valid_limit_areas = ['inside', 'outside']
+        limit_area = limit_area.lower()
+        if limit_area not in valid_limit_areas:
+            raise ValueError('Invalid limit_area: expecting one of {}, got '
+                             '{}.'.format(valid_limit_areas, limit_area))
 
     # default limit is unlimited GH #16282
     if limit is None:
@@ -183,22 +167,43 @@ def interpolate_1d(xvalues, yvalues, method='linear', limit=None,
     elif limit < 1:
         raise ValueError('Limit must be greater than 0')
 
-    # each possible limit_direction
-    # TODO: do we need sorted?
-    if limit_direction == 'forward' and limit is not None:
-        violate_limit = sorted(start_nans |
-                               set(_interp_limit(invalid, limit, 0)))
-    elif limit_direction == 'forward':
-        violate_limit = sorted(start_nans)
-    elif limit_direction == 'backward' and limit is not None:
-        violate_limit = sorted(end_nans |
-                               set(_interp_limit(invalid, 0, limit)))
+    from pandas import Series
+    ys = Series(yvalues)
+
+    # These are sets of index pointers to invalid values... i.e. {0, 1, etc...
+    all_nans = set(np.flatnonzero(invalid))
+    start_nans = set(range(ys.first_valid_index()))
+    end_nans = set(range(1 + ys.last_valid_index(), len(valid)))
+    mid_nans = all_nans - start_nans - end_nans
+
+    # Like the sets above, preserve_nans contains indices of invalid values,
+    # but in this case, it is the final set of indices that need to be
+    # preserved as NaN after the interpolation.
+
+    # For example if limit_direction='forward' then preserve_nans will
+    # contain indices of NaNs at the beginning of the series, and NaNs that
+    # are more than'limit' away from the prior non-NaN.
+
+    # set preserve_nans based on direction using _interp_limit
+    if limit_direction == 'forward':
+        preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0))
     elif limit_direction == 'backward':
-        violate_limit = sorted(end_nans)
-    elif limit_direction == 'both' and limit is not None:
-        violate_limit = sorted(_interp_limit(invalid, limit, limit))
+        preserve_nans = end_nans | set(_interp_limit(invalid, 0, limit))
     else:
-        violate_limit = []
+        # both directions... just use _interp_limit
+        preserve_nans = set(_interp_limit(invalid, limit, limit))
+
+    # if limit_area is set, add either mid or outside indices
+    # to preserve_nans GH #16284
+    if limit_area == 'inside':
+        # preserve NaNs on the outside
+        preserve_nans |= start_nans | end_nans
+    elif limit_area == 'outside':
+        # preserve NaNs on the inside
+        preserve_nans |= mid_nans
+
+    # sort preserve_nans and covert to list
+    preserve_nans = sorted(preserve_nans)
 
     xvalues = getattr(xvalues, 'values', xvalues)
     yvalues = getattr(yvalues, 'values', yvalues)
@@ -215,7 +220,7 @@ def interpolate_1d(xvalues, yvalues, method='linear', limit=None,
         else:
             inds = xvalues
         result[invalid] = np.interp(inds[invalid], inds[valid], yvalues[valid])
-        result[violate_limit] = np.nan
+        result[preserve_nans] = np.nan
         return result
 
     sp_methods = ['nearest', 'zero', 'slinear', 'quadratic', 'cubic',
@@ -234,7 +239,7 @@ def interpolate_1d(xvalues, yvalues, method='linear', limit=None,
                                                      fill_value=fill_value,
                                                      bounds_error=bounds_error,
                                                      order=order, **kwargs)
-        result[violate_limit] = np.nan
+        result[preserve_nans] = np.nan
         return result
 
 
@@ -646,8 +651,24 @@ def fill_zeros(result, x, y, name, fill):
 
 
 def _interp_limit(invalid, fw_limit, bw_limit):
-    """Get idx of values that won't be filled b/c they exceed the limits.
+    """
+    Get indexers of values that won't be filled
+    because they exceed the limits.
+
+    Parameters
+    ----------
+    invalid : boolean ndarray
+    fw_limit : int or None
+        forward limit to index
+    bw_limit : int or None
+        backward limit to index
+
+    Returns
+    -------
+    set of indexers
 
+    Notes
+    -----
     This is equivalent to the more readable, but slower
 
     .. code-block:: python
@@ -660,6 +681,8 @@ def _interp_limit(invalid, fw_limit, bw_limit):
     # 1. operate on the reversed array
     # 2. subtract the returned indicies from N - 1
     N = len(invalid)
+    f_idx = set()
+    b_idx = set()
 
     def inner(invalid, limit):
         limit = min(limit, N)
@@ -668,18 +691,25 @@ def inner(invalid, limit):
                set(np.where((~invalid[:limit + 1]).cumsum() == 0)[0]))
         return idx
 
-    if fw_limit == 0:
-        f_idx = set(np.where(invalid)[0])
-    else:
-        f_idx = inner(invalid, fw_limit)
+    if fw_limit is not None:
 
-    if bw_limit == 0:
-        # then we don't even need to care about backwards, just use forwards
-        return f_idx
-    else:
-        b_idx = set(N - 1 - np.asarray(list(inner(invalid[::-1], bw_limit))))
         if fw_limit == 0:
-            return b_idx
+            f_idx = set(np.where(invalid)[0])
+        else:
+            f_idx = inner(invalid, fw_limit)
+
+    if bw_limit is not None:
+
+        if bw_limit == 0:
+            # then we don't even need to care about backwards
+            # just use forwards
+            return f_idx
+        else:
+            b_idx = list(inner(invalid[::-1], bw_limit))
+            b_idx = set(N - 1 - np.asarray(b_idx))
+            if fw_limit == 0:
+                return b_idx
+
     return f_idx & b_idx