Skip to content

Commit

Permalink
Merge 4bc5889 into 35b26d2
Browse files Browse the repository at this point in the history
  • Loading branch information
mansenfranzen committed Mar 1, 2019
2 parents 35b26d2 + 4bc5889 commit 2c52815
Show file tree
Hide file tree
Showing 12 changed files with 404 additions and 5 deletions.
13 changes: 10 additions & 3 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
sudo: false
# sudo: false

language: python

python:
- '3.5'
# - '3.6'
# - '3.7'
- '3.6'
- '3.7'

env:
- ENV_STRING=pandas0.24.1
Expand Down Expand Up @@ -34,8 +34,15 @@ env:
- ENV_STRING=pyspark2.4.0
# - ENV_STRING=pyspark2.3.1


dist: xenial

before_install:
- source tests/travis_java_install.sh

install:
- travis_retry pip install --upgrade pip
- travis_retry pip install --upgrade setuptools
- travis_retry pip install coveralls flake8 tox

script:
Expand Down
Empty file added src/pywrangler/util/__init__.py
Empty file.
97 changes: 97 additions & 0 deletions src/pywrangler/util/_pprint.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
"""This module contains helper functions for printing.
"""

import typing

ITERABLE = typing.Union[typing.List[str], typing.Tuple[str]]
ENUM = typing.Union[ITERABLE, dict]


def _join(lines: ITERABLE) -> str:
    """Concatenate the given lines into a single newline separated string.

    Parameters
    ----------
    lines: list, tuple
        Strings to be joined.

    Returns
    -------
    joined: str
        All lines separated by a newline character. No trailing newline is
        appended.

    """

    separator = "\n"
    return separator.join(lines)


def _indent(lines: ITERABLE, indent: int = 3) -> list:
    """Prefix every given line with a fixed number of spaces.

    Parameters
    ----------
    lines: list, tuple
        Strings to be indented.
    indent: int, optional
        Number of spaces prepended to each line.

    Returns
    -------
    indented: list
        New list containing the indented lines in their original order.

    """

    prefix = " " * indent

    indented = []
    for line in lines:
        indented.append(prefix + line)

    return indented


def header(name: str, indent: int = 0, underline: str = "-") -> str:
    """Create header with underline.

    The result consists of two lines, the title and an underline of the same
    width, and ends with a trailing newline so that further content can be
    appended directly.

    Parameters
    ----------
    name: str
        Name of title.
    indent: int, optional
        Number of spaces prepended to both the title and the underline.
    underline: str, optional
        Underline character. If a pattern of several characters is given, it
        is repeated and trimmed to the width of the title.

    Returns
    -------
    header: str

    """

    # Use a distinct local name: `_indent` is also a module level helper and
    # must not be shadowed here.
    padding = " " * indent

    # Repeat and trim so that the underline always spans exactly the title
    # width, even for multi character underline patterns.
    width = len(name)
    rule = (underline * width)[:width]

    return "{pad}{title}\n{pad}{rule}\n".format(pad=padding,
                                                title=name,
                                                rule=rule)


def enumeration(values: ENUM, indent: int = 0, bullet_char: str = "-") -> str:
    """Render the given values as a bullet point list.

    Parameters
    ----------
    values: list, tuple, dict
        Values to enumerate. A dict is rendered as `key: value` entries which
        are sorted by its items.
    indent: int, optional
        Number of spaces prepended to each bullet point.
    bullet_char: str, optional
        Character used as bullet.

    Returns
    -------
    enumeration: str
        One bullet point per line, separated by newlines.

    """

    if isinstance(values, dict):
        entries = []
        for key, value in sorted(values.items()):
            entries.append("{key}: {value}".format(key=key, value=value))
    else:
        entries = values

    bulleted = []
    for entry in entries:
        bulleted.append("{} {}".format(bullet_char, entry))

    return _join(_indent(bulleted, indent))
Empty file.
119 changes: 119 additions & 0 deletions src/pywrangler/wranglers/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
"""This module contains the BaseWrangler definition and the wrangler base
classes including wrangler descriptions and parameters.
"""

import inspect

from pywrangler.util import _pprint


class BaseWrangler:
    """Defines the basic interface common to all data wranglers.

    In analogy to sklearn transformers (see link below), all wranglers have to
    implement `fit`, `transform` and `fit_transform` methods. In addition,
    parameters (e.g. column names) need to be provided via the `__init__`
    method. Furthermore, `get_params` and `set_params` methods are required for
    grid search and pipeline compatibility.

    The `fit` method should contain any logic behind parameter validation (e.g.
    type, shape and other sanity checks) and optional fitting (e.g. compute
    mean and variance for scaling). The `transform` method includes the actual
    computational transformation. The `fit_transform` simply applies the former
    methods in sequence.

    In contrast to sklearn, wranglers only accept dataframe like objects
    (like pandas, spark or dask dataframes) as inputs to `fit` and `transform`.
    The relevant columns and their respective meaning are provided via the
    `__init__` method. In addition, wranglers may accept multiple input
    dataframes with different shapes. Also, the number of samples may
    change between input and output (which is not allowed in sklearn). The
    `preserves_sample_size` indicates whether sample size (number of rows) may
    change during transformation.

    The wrangler's employed computation engine is given via
    `computation_engine`.

    See also
    --------
    https://scikit-learn.org/stable/developers/contributing.html

    """

    @property
    def preserves_sample_size(self):
        """Indicate whether the number of rows is kept during transformation.

        Has to be provided by the concrete wrangler.
        """

        raise NotImplementedError

    @property
    def computation_engine(self):
        """Name of the computation engine employed by the wrangler.

        Has to be provided by the concrete wrangler.
        """

        raise NotImplementedError

    def get_params(self):
        """Retrieve all wrangler parameters set within the __init__ method.

        Variadic parameters (`*args`, `**kwargs`) are ignored because they do
        not correspond to a named attribute. This also covers wranglers which
        do not define an `__init__` method at all.

        Returns
        -------
        param_dict: dictionary
            Parameter names as keys and corresponding values as values

        """

        init = self.__class__.__init__
        signature = inspect.signature(init)

        # `*args`/`**kwargs` have no attribute counterpart on the instance;
        # looking them up would raise an AttributeError.
        variadic = (inspect.Parameter.VAR_POSITIONAL,
                    inspect.Parameter.VAR_KEYWORD)

        param_names = [param.name
                       for param in signature.parameters.values()
                       if param.name != "self" and param.kind not in variadic]

        return {name: getattr(self, name) for name in param_names}

    def set_params(self, **params):
        """Set wrangler parameters

        Parameters
        ----------
        params: dict
            Dictionary containing new values to be updated on wrangler. Keys
            have to match parameter names of wrangler.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If a given key is not a parameter of the wrangler.

        """

        valid_params = self.get_params()
        for key, value in params.items():
            if key not in valid_params:
                # Report the class name instead of the full repr: building the
                # repr may itself fail (e.g. abstract properties) and would
                # hide the actual error.
                raise ValueError('Invalid parameter {} for wrangler {}. '
                                 'Check the list of available parameters '
                                 'with `wrangler.get_params().keys()`.'
                                 .format(key, self.__class__.__name__))

            setattr(self, key, value)

        return self

    def fit(self, *args, **kwargs):
        """Validate parameters and optionally fit on the given dataframes.

        Has to be implemented by the concrete wrangler.
        """

        raise NotImplementedError

    def transform(self, *args, **kwargs):
        """Apply the actual transformation to the given dataframes.

        Has to be implemented by the concrete wrangler.
        """

        raise NotImplementedError

    def fit_transform(self, *args, **kwargs):
        """Apply `fit` and `transform` in sequence.

        Has to be implemented by the concrete wrangler.
        """

        raise NotImplementedError

    def __repr__(self):
        """Return wrangler name, computation engine and parameter listing."""

        template = '{wrangler_name} ({computation_engine})\n\n{parameters}'

        # Pass the dict itself (not its items view) so that parameters are
        # rendered as sorted `name: value` entries.
        parameters = (_pprint.header("Parameters", 3) +
                      _pprint.enumeration(self.get_params(), 3))

        _repr = template.format(wrangler_name=self.__class__.__name__,
                                computation_engine=self.computation_engine,
                                parameters=parameters)

        if not self.preserves_sample_size:
            _repr += "\n\n Note: Does not preserve sample size."

        return _repr
4 changes: 2 additions & 2 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,13 @@ def pytest_addoption(parser):

def pytest_collection_modifyitems(config, items):
"""By default, pyspark and dask tests are skipped if not otherwise declared
via command line or the TSWRANGLER_TEST_ENV environment variable.
via command line or the PYWRANGLER_TEST_ENV environment variable.
"""

for skip_item in ("pyspark", "dask"):

tox_env = os.environ.get("TSWRANGLER_TEST_ENV", "").lower()
tox_env = os.environ.get("PYWRANGLER_TEST_ENV", "").lower()
run_env = skip_item in tox_env
run_cmd = config.getoption("--{}".format(skip_item))

Expand Down
22 changes: 22 additions & 0 deletions tests/travis_java_install.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/bin/bash

# Spark only works properly with Java 8. TravisCI's Ubuntu 16.04 image ships
# with Java 11 and the Java version can't be selected via `jdk` when python is
# the chosen language. Ubuntu 14.04 has Java 8 as default but is not an option
# because TravisCI lacks python 3.7 support there.

# Only environments whose ENV_STRING contains "spark" need the Java downgrade.
case "$ENV_STRING" in
    *spark*)
        # report the Java setup found on the machine
        echo "Current JAVA_HOME: $JAVA_HOME"
        echo "Current java -version:"
        java -version

        # install Java 8 and make it the system default
        sudo add-apt-repository -y ppa:openjdk-r/ppa
        sudo apt-get -qq update
        sudo apt-get install -y openjdk-8-jdk --no-install-recommends
        sudo update-java-alternatives -s java-1.8.0-openjdk-amd64

        # point JAVA_HOME to Java 8
        export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
        ;;
esac
Empty file added tests/util/__init__.py
Empty file.
79 changes: 79 additions & 0 deletions tests/util/test_pprint.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
"""Test printing helpers.
"""

import pytest

from pywrangler.util import _pprint


def test_join():
    """Lines are concatenated with newline separators."""

    lines = ["a", "b", "c"]
    expected = "a\nb\nc"

    joined = _pprint._join(lines)

    assert joined == expected


def test_indent():
    """Each line is prefixed with exactly `indent` spaces."""

    test_input = ["a", "b", "c"]
    # An indentation count of 3 has to yield three leading spaces per line.
    test_output = ["   a", "   b", "   c"]

    assert _pprint._indent(test_input, 3) == test_output


def test_header():
    """Default header uses no indentation and a dash underline."""

    title = "Header"
    expected = 'Header\n------\n'

    rendered = _pprint.header(title)

    assert rendered == expected


def test_header_with_indent():
    """Title and underline are both indented by `indent` spaces."""

    test_input = "Header"
    # An indentation count of 3 has to yield three leading spaces per line.
    test_output = '   Header\n   ------\n'

    assert _pprint.header(test_input, indent=3) == test_output


def test_header_with_underline():
    """A custom underline character replaces the default dash."""

    title = "Header"
    expected = 'Header\n======\n'

    rendered = _pprint.header(title, underline="=")

    assert rendered == expected


def test_enumeration_dict():
    """Dict input is rendered as `key: value` bullet points."""

    mapping = {"a": 1, "b": 2}
    expected = '- a: 1\n- b: 2'

    rendered = _pprint.enumeration(mapping)

    assert rendered == expected


def test_enumeration_list():
    """List input is rendered as one bullet point per element."""

    notes = ["note 1", "note 2"]
    expected = '- note 1\n- note 2'

    rendered = _pprint.enumeration(notes)

    assert rendered == expected


def test_enumeration_list_with_indent():
    """Every bullet point is indented by `indent` spaces."""

    test_input = ["note 1", "note 2"]
    # An indentation count of 4 has to yield four leading spaces per line.
    test_output = '    - note 1\n    - note 2'

    assert _pprint.enumeration(test_input, indent=4) == test_output


def test_enumeration_list_with_bullet():
    """A custom bullet character replaces the default dash."""

    notes = ["note 1", "note 2"]
    expected = 'o note 1\no note 2'

    rendered = _pprint.enumeration(notes, bullet_char="o")

    assert rendered == expected
Empty file added tests/wranglers/__init__.py
Empty file.
Loading

0 comments on commit 2c52815

Please sign in to comment.