Skip to content

Commit

Permalink
Merge pull request #2 from mansenfranzen/feature_interval_identfier
Browse files Browse the repository at this point in the history
Feature interval identfier
  • Loading branch information
mansenfranzen committed Mar 16, 2019
2 parents 046f0ae + fc94238 commit e4aad01
Show file tree
Hide file tree
Showing 18 changed files with 1,303 additions and 35 deletions.
1 change: 1 addition & 0 deletions .flake8
@@ -0,0 +1 @@
[flake8]
72 changes: 49 additions & 23 deletions .travis.yml
Expand Up @@ -9,31 +9,57 @@ python:

env:
- ENV_STRING=pandas0.24.1
# - ENV_STRING=pandas0.24.0
#
# - ENV_STRING=pandas0.23.4
# - ENV_STRING=pandas0.23.3
# - ENV_STRING=pandas0.23.2
# - ENV_STRING=pandas0.23.1
# - ENV_STRING=pandas0.23.0
#
# - ENV_STRING=pandas0.22.0
#
# - ENV_STRING=pandas0.21.1
# - ENV_STRING=pandas0.21.0
#
# - ENV_STRING=pandas0.20.3
# - ENV_STRING=pandas0.20.2
# - ENV_STRING=pandas0.20.1
# - ENV_STRING=pandas0.20.0
#
# - ENV_STRING=pandas0.19.2
# - ENV_STRING=pandas0.19.1
# - ENV_STRING=pandas0.19.0
#
- ENV_STRING=pandas0.24.0

- ENV_STRING=pandas0.23.4
- ENV_STRING=pandas0.23.3
- ENV_STRING=pandas0.23.2
- ENV_STRING=pandas0.23.1
- ENV_STRING=pandas0.23.0

- ENV_STRING=pandas0.22.0

- ENV_STRING=pandas0.21.1
- ENV_STRING=pandas0.21.0

- ENV_STRING=pandas0.20.3
- ENV_STRING=pandas0.20.2
- ENV_STRING=pandas0.20.1
- ENV_STRING=pandas0.20.0

- ENV_STRING=pandas0.19.2
- ENV_STRING=pandas0.19.1
- ENV_STRING=pandas0.19.0

- ENV_STRING=pyspark2.4.0
# - ENV_STRING=pyspark2.3.1
- ENV_STRING=pyspark2.3.1


# Remove python/pandas version interactions which do not have wheels on pypi
matrix:
exclude:
- python: '3.7'
env: ENV_STRING=pandas0.22.0
- python: '3.7'
env: ENV_STRING=pandas0.21.1
- python: '3.7'
env: ENV_STRING=pandas0.21.0
- python: '3.7'
env: ENV_STRING=pandas0.20.3
- python: '3.7'
env: ENV_STRING=pandas0.20.2
- python: '3.7'
env: ENV_STRING=pandas0.20.1
- python: '3.7'
env: ENV_STRING=pandas0.20.0
- python: '3.7'
env: ENV_STRING=pandas0.19.2
- python: '3.7'
env: ENV_STRING=pandas0.19.1
- python: '3.7'
env: ENV_STRING=pandas0.19.0
- python: '3.6'
env: ENV_STRING=pandas0.19.0

dist: xenial

Expand Down
7 changes: 5 additions & 2 deletions CHANGELOG.rst
Expand Up @@ -7,5 +7,8 @@ Version 0.1.0

This is the initial release of pywrangler.

- Add `BaseWrangler` class defining wrangler interface (`#1 <https://github.com/mansenfranzen/pywrangler/pull/1>`_).
- Enable `pandas` and `pyspark` testing on TravisCI (`#1 <https://github.com/mansenfranzen/pywrangler/pull/1>`_).
- Add sequential ``NaiveIterator`` and vectorized ``VectorizedCumSum`` pandas implementations for ``IntervalIdentifier`` wrangler (`#2 <https://github.com/mansenfranzen/pywrangler/pull/2>`_).
- Add ``PandasWrangler`` (`#2 <https://github.com/mansenfranzen/pywrangler/pull/2>`_).
- Add ``IntervalIdentifier`` wrangler interface (`#2 <https://github.com/mansenfranzen/pywrangler/pull/2>`_).
- Add ``BaseWrangler`` class defining wrangler interface (`#1 <https://github.com/mansenfranzen/pywrangler/pull/1>`_).
- Enable ``pandas`` and ``pyspark`` testing on TravisCI (`#1 <https://github.com/mansenfranzen/pywrangler/pull/1>`_).
40 changes: 40 additions & 0 deletions src/pywrangler/util/sanitizer.py
@@ -0,0 +1,40 @@
"""This module contains common helper functions for sanity checks and
conversions.
"""

import collections
from typing import Any, Optional, Tuple


def ensure_tuple(values: Any) -> Optional[Tuple[Any, ...]]:
    """Ensure that the output is always a tuple of values.

    For convenience, some parameters may accept a single value (e.g. a
    string for a column name) or multiple values (e.g. a list of strings
    for column names). This function normalizes both forms.

    Parameters
    ----------
    values: Any
        Input value(s) to be converted to a tuple.

    Returns
    -------
    tupled: Optional[Tuple[Any, ...]]
        ``None`` if `values` is ``None``, otherwise a tuple containing
        the given value(s).
    """

    # None remains None to signal "no values given"
    if values is None:
        return None

    # a single string is iterable but represents exactly one value
    if isinstance(values, str):
        return (values, )

    # EAFP: convert any iterable to a tuple; non-iterable values raise
    # TypeError and are wrapped in a single-element tuple instead. This
    # avoids `collections.Iterable`, which was removed in Python 3.10
    # (the correct modern spelling is `collections.abc.Iterable`).
    try:
        return tuple(values)
    except TypeError:
        return (values, )
8 changes: 8 additions & 0 deletions src/pywrangler/util/types.py
@@ -0,0 +1,8 @@
"""This module contains type definitions.
"""

from typing import Iterable, Union

# Column specification accepted by wranglers: a single column name, an
# iterable of column names, or None (meaning "no columns given").
TYPE_COLUMNS = Union[str, Iterable[str], None]
# Sort-order specification: a single bool, an iterable of bools (one per
# order column), or None (meaning "use the default sort order").
TYPE_ASCENDING = Union[bool, Iterable[bool], None]
19 changes: 11 additions & 8 deletions src/pywrangler/wranglers/base.py
Expand Up @@ -17,11 +17,12 @@ class BaseWrangler:
method. Furthermore, `get_params` and `set_params` methods are required for
grid search and pipeline compatibility.
The `fit` method should contain any logic behind parameter validation (e.g.
type, shape and other sanity checks) and optional fitting (e.g. compute
mean and variance for scaling). The `transform` method includes the actual
computational transformation. The `fit_transform` simply applies the former
methods in sequence.
The `fit` method contains optional fitting (e.g. compute mean and variance
for scaling) which sets training data dependent transformation behaviour.
The `transform` method includes the actual computational transformation.
The `fit_transform` either applies the former methods in sequence or adds a
new implementation of both with better performance. The `__init__` method
should contain any logic behind parameter parsing and conversion.
In contrast to sklearn, wranglers do only accept dataframes like objects
(like pandas, spark or dask dataframes) as inputs to `fit` and `transform`.
Expand All @@ -42,14 +43,14 @@ class BaseWrangler:
"""

@property
def preserves_sample_size(self):
def preserves_sample_size(self) -> bool:
raise NotImplementedError

@property
def computation_engine(self):
def computation_engine(self) -> str:
raise NotImplementedError

def get_params(self):
def get_params(self) -> dict:
"""Retrieve all wrangler parameters set within the __init__ method.
Returns
Expand Down Expand Up @@ -93,6 +94,8 @@ def set_params(self, **params):

setattr(self, key, value)

return self

def fit(self):
raise NotImplementedError

Expand Down
96 changes: 96 additions & 0 deletions src/pywrangler/wranglers/interfaces.py
@@ -0,0 +1,96 @@
"""This module contains computation engine independent wrangler interfaces
and corresponding descriptions.
"""


from pywrangler.util import sanitizer
from pywrangler.util.types import TYPE_ASCENDING, TYPE_COLUMNS
from pywrangler.wranglers.base import BaseWrangler


class IntervalIdentifier(BaseWrangler):
    """Defines the reference interface for the interval identification
    wrangler.

    An interval is defined as a range of values beginning with an opening
    marker and ending with a closing marker (e.g. the interval daylight may
    be defined as all events/values occurring between sunrise and sunset).

    The interval identification wrangler assigns ids to values such that
    values belonging to the same interval share the same interval id. For
    example, all values of the first daylight interval are assigned with id
    1. All values of the second daylight interval will be assigned with id 2
    and so on. Values which do not belong to any valid interval are assigned
    the value 0 by definition.

    Only the shortest valid interval is identified. Given multiple opening
    markers in sequence without an intermittent closing marker, only the last
    opening marker is relevant and the rest is ignored. Given multiple
    closing markers in sequence without an intermittent opening marker, only
    the first closing marker is relevant and the rest is ignored.

    Opening and closing markers are included in their corresponding interval.

    Parameters
    ----------
    marker_column: str
        Name of column which contains the opening and closing markers.
    marker_start: Any
        A value defining the start of an interval.
    marker_end: Any
        A value defining the end of an interval.
    order_columns: str, Iterable[str], optional
        Column names which define the order of the data (e.g. a timestamp
        column). Sort order can be defined with the parameter `ascending`.
    groupby_columns: str, Iterable[str], optional
        Column names which define how the data should be grouped/split into
        separate entities. For distributed computation engines, groupby
        columns should ideally reference partition keys to avoid data
        shuffling.
    ascending: bool, Iterable[bool], optional
        Sort ascending vs. descending. Specify list for multiple sort
        orders. If a list is specified, length of the list must equal length
        of `order_columns`. Default is True.
    target_column_name: str, optional
        Name of the resulting target column.

    Raises
    ------
    ValueError
        If `ascending` is given without `order_columns`, if the number of
        items of `ascending` and `order_columns` differ, or if `ascending`
        contains non-boolean values.
    """

    def __init__(self,
                 marker_column: str,
                 marker_start,
                 marker_end,
                 order_columns: TYPE_COLUMNS = None,
                 groupby_columns: TYPE_COLUMNS = None,
                 ascending: TYPE_ASCENDING = None,
                 target_column_name: str = "iids"):

        self.marker_column = marker_column
        self.marker_start = marker_start
        self.marker_end = marker_end
        # normalize single values vs. collections to tuples (None stays None)
        self.order_columns = sanitizer.ensure_tuple(order_columns)
        self.groupby_columns = sanitizer.ensure_tuple(groupby_columns)
        self.ascending = sanitizer.ensure_tuple(ascending)
        self.target_column_name = target_column_name

        # sanity checks for sort order
        if self.ascending:

            # `ascending` is meaningless without columns to sort by; fail
            # with a clear message instead of `len(None)` raising TypeError
            if self.order_columns is None:
                raise ValueError('`ascending` requires `order_columns` to '
                                 'be provided.')

            # check for equal number of items of order and sort columns
            if len(self.order_columns) != len(self.ascending):
                raise ValueError('`order_columns` and `ascending` must have '
                                 'equal number of items.')

            # check for correct sorting keywords
            if not all(isinstance(x, bool) for x in self.ascending):
                raise ValueError('Only `True` and `False` are allowed as '
                                 'arguments for `ascending`.')

        # set default sort order if None is given
        elif self.order_columns:
            self.ascending = (True, ) * len(self.order_columns)

    @property
    def preserves_sample_size(self) -> bool:
        # interval ids are assigned per input row; no rows are added/removed
        return True
Empty file.

0 comments on commit e4aad01

Please sign in to comment.