Skip to content

Commit

Permalink
Merge acf4f07 into 046f0ae
Browse files Browse the repository at this point in the history
  • Loading branch information
mansenfranzen committed Mar 15, 2019
2 parents 046f0ae + acf4f07 commit f344414
Show file tree
Hide file tree
Showing 17 changed files with 1,203 additions and 35 deletions.
1 change: 1 addition & 0 deletions .flake8
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[flake8]
72 changes: 49 additions & 23 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,31 +9,57 @@ python:

env:
- ENV_STRING=pandas0.24.1
# - ENV_STRING=pandas0.24.0
#
# - ENV_STRING=pandas0.23.4
# - ENV_STRING=pandas0.23.3
# - ENV_STRING=pandas0.23.2
# - ENV_STRING=pandas0.23.1
# - ENV_STRING=pandas0.23.0
#
# - ENV_STRING=pandas0.22.0
#
# - ENV_STRING=pandas0.21.1
# - ENV_STRING=pandas0.21.0
#
# - ENV_STRING=pandas0.20.3
# - ENV_STRING=pandas0.20.2
# - ENV_STRING=pandas0.20.1
# - ENV_STRING=pandas0.20.0
#
# - ENV_STRING=pandas0.19.2
# - ENV_STRING=pandas0.19.1
# - ENV_STRING=pandas0.19.0
#
- ENV_STRING=pandas0.24.0

- ENV_STRING=pandas0.23.4
- ENV_STRING=pandas0.23.3
- ENV_STRING=pandas0.23.2
- ENV_STRING=pandas0.23.1
- ENV_STRING=pandas0.23.0

- ENV_STRING=pandas0.22.0

- ENV_STRING=pandas0.21.1
- ENV_STRING=pandas0.21.0

- ENV_STRING=pandas0.20.3
- ENV_STRING=pandas0.20.2
- ENV_STRING=pandas0.20.1
- ENV_STRING=pandas0.20.0

- ENV_STRING=pandas0.19.2
- ENV_STRING=pandas0.19.1
- ENV_STRING=pandas0.19.0

- ENV_STRING=pyspark2.4.0
# - ENV_STRING=pyspark2.3.1
- ENV_STRING=pyspark2.3.1


# Remove python/pandas version interactions which do not have wheels on pypi
matrix:
exclude:
- python: '3.7'
env: ENV_STRING=pandas0.22.0
- python: '3.7'
env: ENV_STRING=pandas0.21.1
- python: '3.7'
env: ENV_STRING=pandas0.21.0
- python: '3.7'
env: ENV_STRING=pandas0.20.3
- python: '3.7'
env: ENV_STRING=pandas0.20.2
- python: '3.7'
env: ENV_STRING=pandas0.20.1
- python: '3.7'
env: ENV_STRING=pandas0.20.0
- python: '3.7'
env: ENV_STRING=pandas0.19.2
- python: '3.7'
env: ENV_STRING=pandas0.19.1
- python: '3.7'
env: ENV_STRING=pandas0.19.0
- python: '3.6'
env: ENV_STRING=pandas0.19.0

dist: xenial

Expand Down
7 changes: 5 additions & 2 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,8 @@ Version 0.1.0

This is the initial release of pywrangler.

- Add `BaseWrangler` class defining wrangler interface (`#1 <https://github.com/mansenfranzen/pywrangler/pull/1>`_).
- Enable `pandas` and `pyspark` testing on TravisCI (`#1 <https://github.com/mansenfranzen/pywrangler/pull/1>`_).
- Add sequential ``NaiveIterator`` and vectorized ``VectorizedCumSum`` pandas implementations for ``IntervalIdentifier`` wrangler (`#2 <https://github.com/mansenfranzen/pywrangler/pull/2>`_).
- Add ``PandasWrangler`` (`#2 <https://github.com/mansenfranzen/pywrangler/pull/2>`_).
- Add ``IntervalIdentifier`` wrangler interface (`#2 <https://github.com/mansenfranzen/pywrangler/pull/2>`_).
- Add ``BaseWrangler`` class defining wrangler interface (`#1 <https://github.com/mansenfranzen/pywrangler/pull/1>`_).
- Enable ``pandas`` and ``pyspark`` testing on TravisCI (`#1 <https://github.com/mansenfranzen/pywrangler/pull/1>`_).
40 changes: 40 additions & 0 deletions src/pywrangler/util/sanitizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
"""This module contains common helper functions for sanity checks and
conversions.
"""

import collections
from typing import Any, Tuple


def ensure_tuple(values: Any) -> Tuple[Any, ...]:
    """Normalize input to a tuple of values.

    For convenience, some parameters may accept a single value (e.g. a
    string for a column name) or multiple values (e.g. a list of strings
    for column names). This function ensures that the output is always a
    tuple of values.

    Parameters
    ----------
    values: Any
        Input value(s) to be converted to a tuple.

    Returns
    -------
    tupled: Tuple[Any, ...] or None
        ``None`` if the input is ``None``, otherwise a tuple containing
        the input value(s).
    """

    # None remains None to signal "parameter not provided"
    if values is None:
        return None

    # a string is iterable but represents a single value
    if isinstance(values, str):
        return (values, )

    # EAFP: any iterable (including sequence-protocol objects) converts
    # cleanly; non-iterables raise TypeError and become a 1-tuple.
    # This also avoids `collections.Iterable`, which is deprecated since
    # Python 3.3 and removed in 3.10 (use `collections.abc` instead).
    try:
        return tuple(values)
    except TypeError:
        return (values, )
19 changes: 11 additions & 8 deletions src/pywrangler/wranglers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,12 @@ class BaseWrangler:
method. Furthermore, `get_params` and `set_params` methods are required for
grid search and pipeline compatibility.
The `fit` method should contain any logic behind parameter validation (e.g.
type, shape and other sanity checks) and optional fitting (e.g. compute
mean and variance for scaling). The `transform` method includes the actual
computational transformation. The `fit_transform` simply applies the former
methods in sequence.
The `fit` method contains optional fitting (e.g. compute mean and variance
for scaling) which sets training data dependent transformation behaviour.
The `transform` method includes the actual computational transformation.
The `fit_transform` either applies the former methods in sequence or adds a
new implementation of both with better performance. The `__init__` method
should contain any logic behind parameter parsing and conversion.
In contrast to sklearn, wranglers do only accept dataframes like objects
(like pandas, spark or dask dataframes) as inputs to `fit` and `transform`.
Expand All @@ -42,14 +43,14 @@ class BaseWrangler:
"""

@property
def preserves_sample_size(self):
def preserves_sample_size(self) -> bool:
raise NotImplementedError

@property
def computation_engine(self):
def computation_engine(self) -> str:
raise NotImplementedError

def get_params(self):
def get_params(self) -> dict:
"""Retrieve all wrangler parameters set within the __init__ method.
Returns
Expand Down Expand Up @@ -93,6 +94,8 @@ def set_params(self, **params):

setattr(self, key, value)

return self

def fit(self):
raise NotImplementedError

Expand Down
98 changes: 98 additions & 0 deletions src/pywrangler/wranglers/interfaces.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
"""This module contains computation engine independent wrangler interfaces
and corresponding descriptions.
"""

from typing import Any, Iterable, Union

from pywrangler.util import sanitizer
from pywrangler.wranglers.base import BaseWrangler

TYPE_COLUMNS = Union[str, Iterable[str]]


class IntervalIdentifier(BaseWrangler):
    """Defines the reference interface for the interval identification
    wrangler.

    An interval is defined as a range of values beginning with an opening
    marker and ending with a closing marker (e.g. the interval daylight may
    be defined as all events/values occurring between sunrise and sunset).

    The interval identification wrangler assigns ids to values such that
    values belonging to the same interval share the same interval id. For
    example, all values of the first daylight interval are assigned with id
    1. All values of the second daylight interval will be assigned with id 2
    and so on. Values which do not belong to any valid interval are assigned
    the value 0 by definition.

    Only the shortest valid interval is identified. Given multiple opening
    markers in sequence without an intermittent closing marker, only the last
    opening marker is relevant and the rest is ignored. Given multiple
    closing markers in sequence without an intermittent opening marker, only
    the first closing marker is relevant and the rest is ignored.

    Opening and closing markers are included in their corresponding interval.

    Parameters
    ----------
    marker_column: str
        Name of column which contains the opening and closing markers.
    marker_start: Any
        A value defining the start of an interval.
    marker_end: Any
        A value defining the end of an interval.
    order_columns: str, Iterable[str], optional
        Column names which define the order of the data (e.g. a timestamp
        column). Sort order can be defined with the parameter `ascending`.
    groupby_columns: str, Iterable[str], optional
        Column names which define how the data should be grouped/split into
        separate entities. For distributed computation engines, groupby
        columns should ideally reference partition keys to avoid data
        shuffling.
    ascending: bool, Iterable[bool], optional
        Sort ascending vs. descending. Specify list for multiple sort orders.
        If a list is specified, length of the list must equal length of
        `order_columns`. Default is True.
    target_column_name: str, optional
        Name of the resulting target column.

    Raises
    ------
    ValueError
        If `ascending` is given without `order_columns`, if the item counts
        of `ascending` and `order_columns` differ, or if `ascending`
        contains non-boolean values.
    """

    def __init__(self,
                 marker_column: str,
                 marker_start: Any,
                 marker_end: Any,
                 order_columns: TYPE_COLUMNS = None,
                 groupby_columns: TYPE_COLUMNS = None,
                 ascending: Union[bool, Iterable[bool]] = None,
                 target_column_name: str = "iids"):

        self.marker_column = marker_column
        self.marker_start = marker_start
        self.marker_end = marker_end
        self.order_columns = sanitizer.ensure_tuple(order_columns)
        self.groupby_columns = sanitizer.ensure_tuple(groupby_columns)
        self.ascending = sanitizer.ensure_tuple(ascending)
        self.target_column_name = target_column_name

        # sanity checks for sort order
        if self.ascending:

            # `ascending` without `order_columns` is meaningless; fail
            # early with a clear error instead of a TypeError on len(None)
            if self.order_columns is None:
                raise ValueError('`ascending` requires `order_columns` '
                                 'to be defined.')

            # check for equal number of items of order and sort columns
            if len(self.order_columns) != len(self.ascending):
                raise ValueError('`order_columns` and `ascending` must have '
                                 'equal number of items.')

            # check for correct sorting keywords
            if not all(isinstance(x, bool) for x in self.ascending):
                raise ValueError('Only `True` and `False` are allowed '
                                 'as arguments for `ascending`.')

        # set default sort order if None is given; use a tuple for
        # consistency with the sanitized `ensure_tuple` path above
        elif self.order_columns:
            self.ascending = (True,) * len(self.order_columns)

    @property
    def preserves_sample_size(self) -> bool:
        return True
Empty file.
80 changes: 80 additions & 0 deletions src/pywrangler/wranglers/pandas/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
"""This module contains the pandas base wrangler.
"""

from typing import Tuple

import pandas as pd

from pywrangler.wranglers.base import BaseWrangler


class PandasWrangler(BaseWrangler):
    """Contains methods common to all pandas based wranglers.
    """

    @property
    def computation_engine(self) -> str:
        # annotated for consistency with BaseWrangler.computation_engine
        return "pandas"

    def validate_output_shape(self, df_in: pd.DataFrame, df_out: pd.DataFrame):
        """If wrangler implementation preserves sample size, assert equal
        sample sizes between input and output dataframe.

        Using pandas, all data is in memory. Hence, getting shape information
        is cheap and this check can be done regularly (in contrast to pyspark
        where `df.count()` can be very expensive).

        Parameters
        ----------
        df_in: pd.DataFrame
            Input dataframe.
        df_out: pd.DataFrame
            Output dataframe.

        Raises
        ------
        ValueError
            If the wrangler preserves sample size and row counts differ.
        """

        if self.preserves_sample_size:
            shape_in = df_in.shape[0]
            shape_out = df_out.shape[0]

            if shape_in != shape_out:
                raise ValueError('Number of input samples ({}) does not match '
                                 'number of output samples ({}) which should '
                                 'be the case because wrangler is supposed to '
                                 'preserve the number of samples.'
                                 .format(shape_in, shape_out))

    @staticmethod
    def validate_empty_df(df: pd.DataFrame):
        """Check for empty dataframe. By definition, wranglers operate on non
        empty dataframe. Therefore, raise error if dataframe is empty.

        Parameters
        ----------
        df: pd.DataFrame
            Dataframe to check against.

        Raises
        ------
        ValueError
            If the dataframe contains no rows.
        """

        if df.empty:
            raise ValueError('Dataframe is empty.')

    @staticmethod
    def validate_columns(df: pd.DataFrame, columns: Tuple[str]):
        """Check that columns exist in dataframe and raise error if otherwise.

        Parameters
        ----------
        df: pd.DataFrame
            Dataframe to check against.
        columns: Tuple[str], optional
            Columns to be validated. ``None`` (e.g. from `ensure_tuple`)
            means no columns to check.

        Raises
        ------
        ValueError
            If any given column is not present in the dataframe.
        """

        # `ensure_tuple` legitimately yields None for unset parameters
        if columns is None:
            return

        for column in columns:
            if column not in df.columns:
                # BUGFIX: the placeholder was previously never filled in
                # because `.format(column)` was missing
                raise ValueError('Column with name `{}` does not exist. '
                                 'Please check parameter settings.'
                                 .format(column))
Loading

0 comments on commit f344414

Please sign in to comment.