Skip to content

Commit

Permalink
Merge acf4f07 into 046f0ae
Browse files Browse the repository at this point in the history
  • Loading branch information
mansenfranzen committed Mar 15, 2019
2 parents 046f0ae + acf4f07 commit f344414
Show file tree
Hide file tree
Showing 17 changed files with 1,203 additions and 35 deletions.
1 change: 1 addition & 0 deletions .flake8
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[flake8]
72 changes: 49 additions & 23 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,31 +9,57 @@ python:

env:
- ENV_STRING=pandas0.24.1
# - ENV_STRING=pandas0.24.0
#
# - ENV_STRING=pandas0.23.4
# - ENV_STRING=pandas0.23.3
# - ENV_STRING=pandas0.23.2
# - ENV_STRING=pandas0.23.1
# - ENV_STRING=pandas0.23.0
#
# - ENV_STRING=pandas0.22.0
#
# - ENV_STRING=pandas0.21.1
# - ENV_STRING=pandas0.21.0
#
# - ENV_STRING=pandas0.20.3
# - ENV_STRING=pandas0.20.2
# - ENV_STRING=pandas0.20.1
# - ENV_STRING=pandas0.20.0
#
# - ENV_STRING=pandas0.19.2
# - ENV_STRING=pandas0.19.1
# - ENV_STRING=pandas0.19.0
#
- ENV_STRING=pandas0.24.0

- ENV_STRING=pandas0.23.4
- ENV_STRING=pandas0.23.3
- ENV_STRING=pandas0.23.2
- ENV_STRING=pandas0.23.1
- ENV_STRING=pandas0.23.0

- ENV_STRING=pandas0.22.0

- ENV_STRING=pandas0.21.1
- ENV_STRING=pandas0.21.0

- ENV_STRING=pandas0.20.3
- ENV_STRING=pandas0.20.2
- ENV_STRING=pandas0.20.1
- ENV_STRING=pandas0.20.0

- ENV_STRING=pandas0.19.2
- ENV_STRING=pandas0.19.1
- ENV_STRING=pandas0.19.0

- ENV_STRING=pyspark2.4.0
# - ENV_STRING=pyspark2.3.1
- ENV_STRING=pyspark2.3.1


# Remove python/pandas version interactions which do not have wheels on pypi
matrix:
exclude:
- python: '3.7'
env: ENV_STRING=pandas0.22.0
- python: '3.7'
env: ENV_STRING=pandas0.21.1
- python: '3.7'
env: ENV_STRING=pandas0.21.0
- python: '3.7'
env: ENV_STRING=pandas0.20.3
- python: '3.7'
env: ENV_STRING=pandas0.20.2
- python: '3.7'
env: ENV_STRING=pandas0.20.1
- python: '3.7'
env: ENV_STRING=pandas0.20.0
- python: '3.7'
env: ENV_STRING=pandas0.19.2
- python: '3.7'
env: ENV_STRING=pandas0.19.1
- python: '3.7'
env: ENV_STRING=pandas0.19.0
- python: '3.6'
env: ENV_STRING=pandas0.19.0

dist: xenial

Expand Down
7 changes: 5 additions & 2 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,8 @@ Version 0.1.0

This is the initial release of pywrangler.

- Add `BaseWrangler` class defining wrangler interface (`#1 <https://github.com/mansenfranzen/pywrangler/pull/1>`_).
- Enable `pandas` and `pyspark` testing on TravisCI (`#1 <https://github.com/mansenfranzen/pywrangler/pull/1>`_).
- Add sequential ``NaiveIterator`` and vectorized ``VectorizedCumSum`` pandas implementations for ``IntervalIdentifier`` wrangler (`#2 <https://github.com/mansenfranzen/pywrangler/pull/2>`_).
- Add ``PandasWrangler`` (`#2 <https://github.com/mansenfranzen/pywrangler/pull/2>`_).
- Add ``IntervalIdentifier`` wrangler interface (`#2 <https://github.com/mansenfranzen/pywrangler/pull/2>`_).
- Add ``BaseWrangler`` class defining wrangler interface (`#1 <https://github.com/mansenfranzen/pywrangler/pull/1>`_).
- Enable ``pandas`` and ``pyspark`` testing on TravisCI (`#1 <https://github.com/mansenfranzen/pywrangler/pull/1>`_).
40 changes: 40 additions & 0 deletions src/pywrangler/util/sanitizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
"""This module contains common helper functions for sanity checks and
conversions.
"""

import collections
from typing import Any, Tuple


def ensure_tuple(values: Any) -> Tuple[Any, ...]:
    """Normalize input to a tuple of values.

    For convenience, some parameters may accept a single value (e.g. a
    string for a column name) or multiple values (e.g. a list of strings
    for column names). This function ensures that the output is always a
    tuple of values.

    Parameters
    ----------
    values: Any
        Input value(s) to be converted to a tuple.

    Returns
    -------
    tupled: Tuple[Any, ...] or None
        ``None`` if the input is ``None``, otherwise a tuple containing
        the input value(s).
    """

    # None remains None to signal "parameter not provided"
    if values is None:
        return None

    # a string is iterable but represents a single value
    if isinstance(values, str):
        return (values, )

    # EAFP: any iterable (including sequence-protocol objects) converts
    # cleanly; non-iterables raise TypeError and become a 1-tuple.
    # This also avoids `collections.Iterable`, which is deprecated since
    # Python 3.3 and removed in 3.10 (use `collections.abc` instead).
    try:
        return tuple(values)
    except TypeError:
        return (values, )
19 changes: 11 additions & 8 deletions src/pywrangler/wranglers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,12 @@ class BaseWrangler:
method. Furthermore, `get_params` and `set_params` methods are required for
grid search and pipeline compatibility.
The `fit` method should contain any logic behind parameter validation (e.g.
type, shape and other sanity checks) and optional fitting (e.g. compute
mean and variance for scaling). The `transform` method includes the actual
computational transformation. The `fit_transform` simply applies the former
methods in sequence.
The `fit` method contains optional fitting (e.g. compute mean and variance
for scaling) which sets training data dependent transformation behaviour.
The `transform` method includes the actual computational transformation.
The `fit_transform` either applies the former methods in sequence or adds a
new implementation of both with better performance. The `__init__` method
should contain any logic behind parameter parsing and conversion.
In contrast to sklearn, wranglers do only accept dataframes like objects
(like pandas, spark or dask dataframes) as inputs to `fit` and `transform`.
Expand All @@ -42,14 +43,14 @@ class BaseWrangler:
"""

@property
def preserves_sample_size(self):
def preserves_sample_size(self) -> bool:
raise NotImplementedError

@property
def computation_engine(self):
def computation_engine(self) -> str:
raise NotImplementedError

def get_params(self):
def get_params(self) -> dict:
"""Retrieve all wrangler parameters set within the __init__ method.
Returns
Expand Down Expand Up @@ -93,6 +94,8 @@ def set_params(self, **params):

setattr(self, key, value)

return self

def fit(self):
raise NotImplementedError

Expand Down
98 changes: 98 additions & 0 deletions src/pywrangler/wranglers/interfaces.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
"""This module contains computation engine independent wrangler interfaces
and corresponding descriptions.
"""

from typing import Any, Iterable, Union

from pywrangler.util import sanitizer
from pywrangler.wranglers.base import BaseWrangler

TYPE_COLUMNS = Union[str, Iterable[str]]


class IntervalIdentifier(BaseWrangler):
    """Defines the reference interface for the interval identification
    wrangler.

    An interval is defined as a range of values beginning with an opening
    marker and ending with a closing marker (e.g. the interval daylight may
    be defined as all events/values occurring between sunrise and sunset).

    The interval identification wrangler assigns ids to values such that
    values belonging to the same interval share the same interval id. For
    example, all values of the first daylight interval are assigned with id
    1. All values of the second daylight interval will be assigned with id 2
    and so on. Values which do not belong to any valid interval are assigned
    the value 0 by definition.

    Only the shortest valid interval is identified. Given multiple opening
    markers in sequence without an intermittent closing marker, only the last
    opening marker is relevant and the rest is ignored. Given multiple
    closing markers in sequence without an intermittent opening marker, only
    the first closing marker is relevant and the rest is ignored.

    Opening and closing markers are included in their corresponding interval.

    Parameters
    ----------
    marker_column: str
        Name of column which contains the opening and closing markers.
    marker_start: Any
        A value defining the start of an interval.
    marker_end: Any
        A value defining the end of an interval.
    order_columns: str, Iterable[str], optional
        Column names which define the order of the data (e.g. a timestamp
        column). Sort order can be defined with the parameter `ascending`.
    groupby_columns: str, Iterable[str], optional
        Column names which define how the data should be grouped/split into
        separate entities. For distributed computation engines, groupby
        columns should ideally reference partition keys to avoid data
        shuffling.
    ascending: bool, Iterable[bool], optional
        Sort ascending vs. descending. Specify list for multiple sort orders.
        If a list is specified, length of the list must equal length of
        `order_columns`. Default is True.
    target_column_name: str, optional
        Name of the resulting target column.

    Raises
    ------
    ValueError
        If `ascending` is given without `order_columns`, if the item counts
        of `ascending` and `order_columns` differ, or if `ascending`
        contains non-boolean values.
    """

    def __init__(self,
                 marker_column: str,
                 marker_start: Any,
                 marker_end: Any,
                 order_columns: TYPE_COLUMNS = None,
                 groupby_columns: TYPE_COLUMNS = None,
                 ascending: Union[bool, Iterable[bool]] = None,
                 target_column_name: str = "iids"):

        self.marker_column = marker_column
        self.marker_start = marker_start
        self.marker_end = marker_end
        self.order_columns = sanitizer.ensure_tuple(order_columns)
        self.groupby_columns = sanitizer.ensure_tuple(groupby_columns)
        self.ascending = sanitizer.ensure_tuple(ascending)
        self.target_column_name = target_column_name

        # sanity checks for sort order
        if self.ascending:

            # `ascending` without `order_columns` is meaningless; fail
            # early with a clear error instead of a TypeError on len(None)
            if self.order_columns is None:
                raise ValueError('`ascending` requires `order_columns` '
                                 'to be defined.')

            # check for equal number of items of order and sort columns
            if len(self.order_columns) != len(self.ascending):
                raise ValueError('`order_columns` and `ascending` must have '
                                 'equal number of items.')

            # check for correct sorting keywords
            if not all(isinstance(x, bool) for x in self.ascending):
                raise ValueError('Only `True` and `False` are allowed '
                                 'as arguments for `ascending`.')

        # set default sort order if None is given; use a tuple for
        # consistency with the sanitized `ensure_tuple` path above
        elif self.order_columns:
            self.ascending = (True,) * len(self.order_columns)

    @property
    def preserves_sample_size(self) -> bool:
        return True
Empty file.
80 changes: 80 additions & 0 deletions src/pywrangler/wranglers/pandas/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
"""This module contains the pandas base wrangler.
"""

from typing import Tuple

import pandas as pd

from pywrangler.wranglers.base import BaseWrangler


class PandasWrangler(BaseWrangler):
    """Contains methods common to all pandas based wranglers.
    """

    @property
    def computation_engine(self) -> str:
        # annotated for consistency with BaseWrangler.computation_engine
        return "pandas"

    def validate_output_shape(self, df_in: pd.DataFrame, df_out: pd.DataFrame):
        """If wrangler implementation preserves sample size, assert equal
        sample sizes between input and output dataframe.

        Using pandas, all data is in memory. Hence, getting shape information
        is cheap and this check can be done regularly (in contrast to pyspark
        where `df.count()` can be very expensive).

        Parameters
        ----------
        df_in: pd.DataFrame
            Input dataframe.
        df_out: pd.DataFrame
            Output dataframe.

        Raises
        ------
        ValueError
            If the wrangler preserves sample size and row counts differ.
        """

        if self.preserves_sample_size:
            shape_in = df_in.shape[0]
            shape_out = df_out.shape[0]

            if shape_in != shape_out:
                raise ValueError('Number of input samples ({}) does not match '
                                 'number of output samples ({}) which should '
                                 'be the case because wrangler is supposed to '
                                 'preserve the number of samples.'
                                 .format(shape_in, shape_out))

    @staticmethod
    def validate_empty_df(df: pd.DataFrame):
        """Check for empty dataframe. By definition, wranglers operate on non
        empty dataframe. Therefore, raise error if dataframe is empty.

        Parameters
        ----------
        df: pd.DataFrame
            Dataframe to check against.

        Raises
        ------
        ValueError
            If the dataframe contains no rows.
        """

        if df.empty:
            raise ValueError('Dataframe is empty.')

    @staticmethod
    def validate_columns(df: pd.DataFrame, columns: Tuple[str]):
        """Check that columns exist in dataframe and raise error if otherwise.

        Parameters
        ----------
        df: pd.DataFrame
            Dataframe to check against.
        columns: Tuple[str], optional
            Columns to be validated. ``None`` (e.g. from `ensure_tuple`)
            means no columns to check.

        Raises
        ------
        ValueError
            If any given column is not present in the dataframe.
        """

        # `ensure_tuple` legitimately yields None for unset parameters
        if columns is None:
            return

        for column in columns:
            if column not in df.columns:
                # BUGFIX: the placeholder was previously never filled in
                # because `.format(column)` was missing
                raise ValueError('Column with name `{}` does not exist. '
                                 'Please check parameter settings.'
                                 .format(column))
Loading

0 comments on commit f344414

Please sign in to comment.