-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
12 changed files
with
404 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
"""This module contains helper functions for printing. | ||
""" | ||
|
||
import typing | ||
|
||
# Sequence of text lines accepted by the printing helpers. The tuple variant
# is variadic (`Tuple[str, ...]`); a bare `Tuple[str]` would only describe a
# tuple holding exactly one string.
ITERABLE = typing.Union[typing.List[str], typing.Tuple[str, ...]]
# Values accepted by `enumeration`: a sequence of lines or a dict.
ENUM = typing.Union[ITERABLE, dict]
|
||
|
||
def _join(lines: ITERABLE) -> str:
    """Concatenate lines into a single newline-separated string.

    Parameters
    ----------
    lines: list, tuple
        Lines to concatenate.

    Returns
    -------
    joined: str
        The lines separated by newline characters. No trailing newline is
        appended.

    """

    separator = "\n"
    return separator.join(lines)
|
||
|
||
def _indent(lines: ITERABLE, indent: int = 3) -> list:
    """Prefix every line with a fixed number of spaces.

    Parameters
    ----------
    lines: list, tuple
        Lines to indent.
    indent: int, optional
        Number of spaces to put in front of each line.

    Returns
    -------
    indented: list
        New list containing the indented lines; the lines are not joined.

    """

    prefix = " " * indent

    indented = []
    for line in lines:
        indented.append(prefix + line)

    return indented
|
||
|
||
def header(name: str, indent: int = 0, underline: str = "-") -> str:
    """Build a title line followed by an underline of equal length.

    Parameters
    ----------
    name: str
        Title text.
    indent: int, optional
        Number of spaces to put in front of title and underline.
    underline: str, optional
        Character repeated once per title character to form the underline.

    Returns
    -------
    header: str
        Title and underline separated by a newline. The result ends with a
        newline so further content can be appended directly.

    """

    # the trailing newline belongs to the underline row
    rule = underline * len(name) + "\n"

    return _join(_indent([name, rule], indent))
|
||
|
||
def enumeration(values: ENUM, indent: int = 0, bullet_char: str = "-") -> str:
    """Render values as a bullet list with one item per line.

    Parameters
    ----------
    values: list, tuple, dict
        Items to list. A dict is rendered as `key: value` pairs sorted by
        key.
    indent: int, optional
        Number of spaces to put in front of each bullet.
    bullet_char: str, optional
        Bullet character placed before each item.

    Returns
    -------
    enumeration: str
        Bullet lines separated by newlines, without a trailing newline.

    """

    if isinstance(values, dict):
        items = []
        for key, value in sorted(values.items()):
            items.append("{key}: {value}".format(key=key, value=value))
    else:
        items = values

    bulleted = ["{} {}".format(bullet_char, item) for item in items]

    return _join(_indent(bulleted, indent))
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
"""This module contains the BaseWrangler definition and the wrangler base | ||
classes including wrangler descriptions and parameters. | ||
""" | ||
|
||
import inspect | ||
|
||
from pywrangler.util import _pprint | ||
|
||
|
||
class BaseWrangler:
    """Defines the basic interface common to all data wranglers.

    In analogy to sklearn transformers (see link below), all wranglers have to
    implement `fit`, `transform` and `fit_transform` methods. In addition,
    parameters (e.g. column names) need to be provided via the `__init__`
    method. Furthermore, `get_params` and `set_params` methods are required for
    grid search and pipeline compatibility.

    The `fit` method should contain any logic behind parameter validation (e.g.
    type, shape and other sanity checks) and optional fitting (e.g. compute
    mean and variance for scaling). The `transform` method includes the actual
    computational transformation. The `fit_transform` simply applies the former
    methods in sequence.

    In contrast to sklearn, wranglers only accept dataframe-like objects
    (like pandas, spark or dask dataframes) as inputs to `fit` and `transform`.
    The relevant columns and their respective meaning are provided via the
    `__init__` method. In addition, wranglers may accept multiple input
    dataframes with different shapes. Also, the number of samples may
    change between input and output (which is not allowed in sklearn). The
    `preserves_sample_size` indicates whether sample size (number of rows) may
    change during transformation.

    The wrangler's employed computation engine is given via
    `computation_engine`.

    See also
    --------
    https://scikit-learn.org/stable/developers/contributing.html

    """

    @property
    def preserves_sample_size(self):
        """Whether the sample size (number of rows) is kept by `transform`.

        Has to be provided by subclasses; the base class raises
        `NotImplementedError`.
        """
        raise NotImplementedError

    @property
    def computation_engine(self):
        """The computation engine employed by the wrangler.

        Has to be provided by subclasses; the base class raises
        `NotImplementedError`.
        """
        raise NotImplementedError

    def get_params(self):
        """Retrieve all wrangler parameters set within the __init__ method.

        Parameter names are taken from the signature of the class'
        `__init__`; the values are read from equally named instance
        attributes.

        Returns
        -------
        param_dict: dictionary
            Parameter names as keys and corresponding values as values.

        """

        init = self.__class__.__init__

        # A class without its own `__init__` inherits `object.__init__`,
        # whose signature only consists of `*args, **kwargs`. Those are not
        # wrangler parameters and have no matching instance attributes.
        if init is object.__init__:
            return {}

        signature = inspect.signature(init)
        parameters = signature.parameters.values()

        param_names = [x.name for x in parameters if x.name != "self"]
        param_dict = {x: getattr(self, x) for x in param_names}

        return param_dict

    def set_params(self, **params):
        """Set wrangler parameters.

        All given names are validated before any value is assigned, so an
        invalid name leaves the wrangler unchanged.

        Parameters
        ----------
        params: dict
            Dictionary containing new values to be updated on wrangler. Keys
            have to match parameter names of wrangler.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If a key is not a parameter of the wrangler.

        """

        valid_params = self.get_params()

        for key in params:
            if key not in valid_params:
                # Use the class name instead of `self`: formatting `self`
                # invokes `__repr__`, which itself may raise (e.g. when
                # `computation_engine` is not implemented) and would mask
                # this error.
                raise ValueError('Invalid parameter {} for wrangler {}. '
                                 'Check the list of available parameters '
                                 'with `wrangler.get_params().keys()`.'
                                 .format(key, self.__class__.__name__))

        for key, value in params.items():
            setattr(self, key, value)

        return self

    def fit(self):
        """Validate parameters and optionally fit; implemented by subclasses.
        """
        raise NotImplementedError

    def transform(self):
        """Apply the actual transformation; implemented by subclasses."""
        raise NotImplementedError

    def fit_transform(self):
        """Apply `fit` and `transform` in sequence; implemented by subclasses.
        """
        raise NotImplementedError

    def __repr__(self):
        """Return wrangler name, computation engine and parameter listing."""

        template = '{wrangler_name} ({computation_engine})\n\n{parameters}'

        # Pass the dict itself (not its items view) so that `enumeration`
        # renders sorted `key: value` pairs instead of raw tuples.
        parameters = (_pprint.header("Parameters", 3) +
                      _pprint.enumeration(self.get_params(), 3))

        _repr = template.format(wrangler_name=self.__class__.__name__,
                                computation_engine=self.computation_engine,
                                parameters=parameters)

        if not self.preserves_sample_size:
            # indented by 3 to align with the parameter section above
            _repr += "\n\n   Note: Does not preserve sample size."

        return _repr
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
#!/bin/bash

# Spark needs Java 8 to work properly. TravisCI's Ubuntu 16.04 image ships
# with Java 11, and the `jdk` setting is not available when python is the
# selected language. Ubuntu 14.04 has Java 8 as default but lacks python 3.7
# support on TravisCI, so Java 8 is installed manually for spark environments.

case "$ENV_STRING" in
    *spark*)
        # report the Java setup found on the image
        echo "Current JAVA_HOME: $JAVA_HOME"
        echo "Current java -version:"
        java -version

        # install Java 8 from the OpenJDK PPA and select it as default
        sudo add-apt-repository -y ppa:openjdk-r/ppa
        sudo apt-get -qq update
        sudo apt-get install -y openjdk-8-jdk --no-install-recommends
        sudo update-java-alternatives -s java-1.8.0-openjdk-amd64

        # point JAVA_HOME to the Java 8 installation
        export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
        ;;
esac
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
"""Test printing helpers. | ||
""" | ||
|
||
import pytest | ||
|
||
from pywrangler.util import _pprint | ||
|
||
|
||
def test_join():
    """Lines are joined with newlines and without a trailing newline."""

    lines = ["a", "b", "c"]
    expected = "a\nb\nc"

    assert _pprint._join(lines) == expected
|
||
|
||
def test_indent():
    """Each line is prefixed with exactly `indent` spaces."""

    test_input = ["a", "b", "c"]
    # an indent of 3 yields three leading spaces per line
    test_output = ["   a", "   b", "   c"]

    assert _pprint._indent(test_input, 3) == test_output
|
||
|
||
def test_header():
    """Default header is underlined with dashes and ends with a newline."""

    title = "Header"
    expected = 'Header\n------\n'

    assert _pprint.header(title) == expected
|
||
|
||
def test_header_with_indent():
    """Title and underline are both shifted by `indent` spaces."""

    test_input = "Header"
    # an indent of 3 yields three leading spaces on both rows
    test_output = '   Header\n   ------\n'

    assert _pprint.header(test_input, indent=3) == test_output
|
||
|
||
def test_header_with_underline():
    """A custom underline character replaces the default dash."""

    title = "Header"
    expected = 'Header\n======\n'

    assert _pprint.header(title, underline="=") == expected
|
||
|
||
def test_enumeration_dict():
    """A dict is rendered as one `key: value` bullet per entry."""

    mapping = {"a": 1, "b": 2}
    expected = '- a: 1\n- b: 2'

    assert _pprint.enumeration(mapping) == expected
|
||
|
||
def test_enumeration_list():
    """A list is rendered as one bullet per element."""

    notes = ["note 1", "note 2"]
    expected = '- note 1\n- note 2'

    assert _pprint.enumeration(notes) == expected
|
||
|
||
def test_enumeration_list_with_indent():
    """Every bullet line is shifted by `indent` spaces."""

    test_input = ["note 1", "note 2"]
    # an indent of 4 yields four leading spaces per bullet line
    test_output = '    - note 1\n    - note 2'

    assert _pprint.enumeration(test_input, indent=4) == test_output
|
||
|
||
def test_enumeration_list_with_bullet():
    """A custom bullet character replaces the default dash."""

    notes = ["note 1", "note 2"]
    expected = 'o note 1\no note 2'

    assert _pprint.enumeration(notes, bullet_char="o") == expected
Empty file.
Oops, something went wrong.