Merge 4bc5889 into 35b26d2

mansenfranzen · Mar 1, 2019 · 2c52815 · 2c52815
2 parents 35b26d2 + 4bc5889
commit 2c52815
Show file tree

Hide file tree

Showing 12 changed files with 404 additions and 5 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -1,11 +1,11 @@
-sudo: false
+# sudo: false
 
 language: python
 
 python:
   - '3.5'
-#  - '3.6'
-#  - '3.7'
+  - '3.6'
+  - '3.7'
 
 env:
   - ENV_STRING=pandas0.24.1
@@ -34,8 +34,15 @@ env:
   - ENV_STRING=pyspark2.4.0
 #  - ENV_STRING=pyspark2.3.1
 
+
+dist: xenial
+
+before_install:
+  - source tests/travis_java_install.sh
+
 install:
   - travis_retry pip install --upgrade pip
+  - travis_retry pip install --upgrade setuptools
   - travis_retry pip install coveralls flake8 tox
 
 script:

diff --git a/src/pywrangler/util/__init__.py b/src/pywrangler/util/__init__.py
diff --git a/src/pywrangler/util/_pprint.py b/src/pywrangler/util/_pprint.py
@@ -0,0 +1,97 @@
+"""This module contains helper functions for printing.
+
+"""
+
+import typing
+
+ITERABLE = typing.Union[typing.List[str], typing.Tuple[str]]
+ENUM = typing.Union[ITERABLE, dict]
+
+
+def _join(lines: ITERABLE) -> str:
+    """Join given lines.
+
+    Parameters
+    ----------
+    lines: list, tuple
+        Iterable to join.
+
+    Returns
+    -------
+    joined: str
+
+    """
+
+    return "\n".join(lines)
+
+
+def _indent(lines: ITERABLE, indent: int = 3) -> list:
+    """Indent given lines.
+
+    Parameters
+    ----------
+    lines: list, tuple
+        Iterable to indent.
+    indent: int, optional
+        Indentation count.
+
+    """
+
+    spacing = " " * indent
+    return [spacing + x for x in lines]
+
+
+def header(name: str, indent: int = 0, underline: str = "-") -> str:
+    """Create header with underline.
+
+    Parameters
+    ----------
+    name: str
+        Name of title.
+    indent: int, optional
+        Indentation count.
+    underline: str, optional
+        Underline character.
+
+    Returns
+    -------
+    header: str
+
+    """
+
+    _indent = " " * indent
+
+    _header = _indent + name
+    _underline = _indent + underline*len(name) + "\n"
+
+    return _join([_header, _underline])
+
+
+def enumeration(values: ENUM, indent: int = 0, bullet_char: str = "-") -> str:
+    """Create enumeration with bullet points.
+
+    Parameters
+    ----------
+    values: list, tuple, dict
+        Iterable values. If dict, creates key/value pairs.
+    indent: int, optional
+        Indentation count.
+    bullet_char: str, optional
+        Bullet character.
+
+    Returns
+    -------
+    enumeration: str
+
+    """
+
+    if isinstance(values, dict):
+        _values = ["{key}: {value}".format(key=key, value=value)
+                   for key, value in sorted(values.items())]
+    else:
+        _values = values
+
+    with_bullets = ["{} {}".format(bullet_char, x) for x in _values]
+    indented = _indent(with_bullets, indent)
+
+    return _join(indented)
diff --git a/src/pywrangler/wranglers/__init__.py b/src/pywrangler/wranglers/__init__.py
diff --git a/src/pywrangler/wranglers/base.py b/src/pywrangler/wranglers/base.py
@@ -0,0 +1,119 @@
+"""This module contains the BaseWrangler definition and the wrangler base
+classes including wrangler descriptions and parameters.
+
+"""
+
+import inspect
+
+from pywrangler.util import _pprint
+
+
+class BaseWrangler:
+    """Defines the basic interface common to all data wranglers.
+
+    In analogy to sklearn transformers (see link below), all wranglers have to
+    implement `fit`, `transform` and `fit_transform` methods. In addition,
+    parameters (e.g. column names) need to be provided via the `__init__`
+    method. Furthermore, `get_params` and `set_params` methods are required for
+    grid search and pipeline compatibility.
+
+    The `fit` method should contain any logic behind parameter validation (e.g.
+    type, shape and other sanity checks) and optional fitting (e.g. compute
+    mean and variance for scaling). The `transform` method includes the actual
+    computational transformation. The `fit_transform` simply applies the former
+    methods in sequence.
+
+    In contrast to sklearn, wranglers only accept dataframe-like objects
+    (like pandas, spark or dask dataframes) as inputs to `fit` and `transform`.
+    The relevant columns and their respective meanings are provided via the
+    `__init__` method. In addition, wranglers may accept multiple input
+    dataframes with different shapes. Also, the number of samples may
+    change between input and output (which is not allowed in sklearn). The
+    `preserves_sample_size` indicates whether sample size (number of rows) may
+    change during transformation.
+
+    The wrangler's employed computation engine is given via
+    `computation_engine`.
+
+    See also
+    --------
+    https://scikit-learn.org/stable/developers/contributing.html
+
+    """
+
+    @property
+    def preserves_sample_size(self):
+        raise NotImplementedError
+
+    @property
+    def computation_engine(self):
+        raise NotImplementedError
+
+    def get_params(self):
+        """Retrieve all wrangler parameters set within the __init__ method.
+
+        Returns
+        -------
+        param_dict: dictionary
+            Parameter names as keys and corresponding values as values
+
+        """
+
+        init = self.__class__.__init__
+        signature = inspect.signature(init)
+        parameters = signature.parameters.values()
+
+        param_names = [x.name for x in parameters if x.name != "self"]
+        param_dict = {x: getattr(self, x) for x in param_names}
+
+        return param_dict
+
+    def set_params(self, **params):
+        """Set wrangler parameters
+
+        Parameters
+        ----------
+        params: dict
+            Dictionary containing new values to be updated on wrangler. Keys
+            have to match parameter names of wrangler.
+
+        Returns
+        -------
+        self
+
+        """
+
+        valid_params = self.get_params()
+        for key, value in params.items():
+            if key not in valid_params:
+                raise ValueError('Invalid parameter {} for wrangler {}. '
+                                 'Check the list of available parameters '
+                                 'with `wrangler.get_params().keys()`.'
+                                 .format(key, self))
+
+            setattr(self, key, value)
+
+    def fit(self):
+        raise NotImplementedError
+
+    def transform(self):
+        raise NotImplementedError
+
+    def fit_transform(self):
+        raise NotImplementedError
+
+    def __repr__(self):
+
+        template = '{wrangler_name} ({computation_engine})\n\n{parameters}'\
+
+        parameters = (_pprint.header("Parameters", 3) +
+                      _pprint.enumeration(self.get_params().items(), 3))
+
+        _repr = template.format(wrangler_name=self.__class__.__name__,
+                                computation_engine=self.computation_engine,
+                                parameters=parameters)
+
+        if not self.preserves_sample_size:
+            _repr += "\n\n   Note: Does not preserve sample size."
+
+        return _repr
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -33,13 +33,13 @@ def pytest_addoption(parser):
 
 def pytest_collection_modifyitems(config, items):
     """By default, pyspark and dask tests are skipped if not otherwise declared
-    via command line or the TSWRANGLER_TEST_ENV environment variable.
+    via command line or the PYWRANGLER_TEST_ENV environment variable.
 
     """
 
     for skip_item in ("pyspark", "dask"):
 
-        tox_env = os.environ.get("TSWRANGLER_TEST_ENV", "").lower()
+        tox_env = os.environ.get("PYWRANGLER_TEST_ENV", "").lower()
         run_env = skip_item in tox_env
         run_cmd = config.getoption("--{}".format(skip_item))
 

diff --git a/tests/travis_java_install.sh b/tests/travis_java_install.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+# Spark requires Java 8 in order to work properly. However, TravisCI's Ubuntu
+# 16.04 ships with Java 11 and Java can't be set with `jdk` when python is
+# selected as language. Ubuntu 14.04, which does have Java 8 as default, does
+# not work either because TravisCI lacks python 3.7 support on it.
+
+if [[ $ENV_STRING == *"spark"* ]]; then
+  # show current JAVA_HOME and java version
+  echo "Current JAVA_HOME: $JAVA_HOME"
+  echo "Current java -version:"
+  java -version
+
+  # install Java 8
+  sudo add-apt-repository -y ppa:openjdk-r/ppa
+  sudo apt-get -qq update
+  sudo apt-get install -y openjdk-8-jdk --no-install-recommends
+  sudo update-java-alternatives -s java-1.8.0-openjdk-amd64
+
+  # change JAVA_HOME to Java 8
+  export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
+fi
diff --git a/tests/util/__init__.py b/tests/util/__init__.py
diff --git a/tests/util/test_pprint.py b/tests/util/test_pprint.py
@@ -0,0 +1,79 @@
+"""Test printing helpers.
+
+"""
+
+import pytest
+
+from pywrangler.util import _pprint
+
+
+def test_join():
+
+    test_input = ["a", "b", "c"]
+    test_output = "a\nb\nc"
+
+    assert _pprint._join(test_input) == test_output
+
+
+def test_indent():
+
+    test_input = ["a", "b", "c"]
+    test_output = ["   a", "   b", "   c"]
+
+    assert _pprint._indent(test_input, 3) == test_output
+
+
+def test_header():
+
+    test_input = "Header"
+    test_output = 'Header\n------\n'
+
+    assert _pprint.header(test_input) == test_output
+
+
+def test_header_with_indent():
+
+    test_input = "Header"
+    test_output = '   Header\n   ------\n'
+
+    assert _pprint.header(test_input, indent=3) == test_output
+
+
+def test_header_with_underline():
+
+    test_input = "Header"
+    test_output = 'Header\n======\n'
+
+    assert _pprint.header(test_input, underline="=") == test_output
+
+
+def test_enumeration_dict():
+
+    test_input = {"a": 1, "b": 2}
+    test_output = '- a: 1\n- b: 2'
+
+    assert _pprint.enumeration(test_input) == test_output
+
+
+def test_enumeration_list():
+
+    test_input = ["note 1", "note 2"]
+    test_output = '- note 1\n- note 2'
+
+    assert _pprint.enumeration(test_input) == test_output
+
+
+def test_enumeration_list_with_indent():
+
+    test_input = ["note 1", "note 2"]
+    test_output = '    - note 1\n    - note 2'
+
+    assert _pprint.enumeration(test_input, indent=4) == test_output
+
+
+def test_enumeration_list_with_bullet():
+
+    test_input = ["note 1", "note 2"]
+    test_output = 'o note 1\no note 2'
+
+    assert _pprint.enumeration(test_input, bullet_char="o") == test_output
diff --git a/tests/wranglers/__init__.py b/tests/wranglers/__init__.py