From 075070ca3f49644edde3761ae66caaf4f26ad930 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Thu, 21 Mar 2019 21:24:05 +0100 Subject: [PATCH 01/48] Add `sizeof` pretty printer. Add `align_values` and `align_width` to `enumeration`. --- src/pywrangler/util/_pprint.py | 61 ++++++++++++++++++++++++++++++++-- 1 file changed, 59 insertions(+), 2 deletions(-) diff --git a/src/pywrangler/util/_pprint.py b/src/pywrangler/util/_pprint.py index 580f4d5..d602168 100644 --- a/src/pywrangler/util/_pprint.py +++ b/src/pywrangler/util/_pprint.py @@ -67,7 +67,8 @@ def header(name: str, indent: int = 0, underline: str = "-") -> str: return _join([_header, _underline]) -def enumeration(values: ENUM, indent: int = 0, bullet_char: str = "-") -> str: +def enumeration(values: ENUM, indent: int = 0, bullet_char: str = "-", + align_values: bool = True, align_width: int = 0) -> str: """Create enumeration with bullet points. Parameters @@ -78,6 +79,12 @@ def enumeration(values: ENUM, indent: int = 0, bullet_char: str = "-") -> str: Indentation count. bullet_char: str, optional Bullet character. + align_values: bool, optional + If dict is provided, align all values to the same column. The longest + key defines the exact position. + align_width: int, optional + If dict is provided and `align_values` is True, manually set the align + width. 
Returns ------- @@ -86,7 +93,14 @@ def enumeration(values: ENUM, indent: int = 0, bullet_char: str = "-") -> str: """ if isinstance(values, dict): - _values = ["{key}: {value}".format(key=key, value=value) + fstring = "{key:>{align_width}}: {value}" + if align_values and not align_width: + align_width = max([len(x) for x in values.keys()]) + + _values = [fstring.format(key=key, + value=value, + align_width=align_width) + for key, value in sorted(values.items())] else: _values = values @@ -95,3 +109,46 @@ def enumeration(values: ENUM, indent: int = 0, bullet_char: str = "-") -> str: indented = _indent(with_bullets, indent) return _join(indented) + + +def sizeof(size: float, precision: int = 2, align: str = ">", + width=None) -> str: + """Helper function to format size in human readable format. + + Parameters + ---------- + size: float + The size in bytes to be converted into human readable format. + precision: int, optional + Define shown precision. + align: {'<', '^', '>'}, optional + Format align specifier. + width: int + Define maximum width for number. + + Returns + ------- + human_fmt: str + Human readable representation of given `size`. + + Notes + ----- + Credit to https://stackoverflow.com/questions/1094841/reusable-library-to-get-human-readable-version-of-file-size + + """ # noqa: E501 + + template = "{size:{align}{width}.{precision}f} {unit}B" + + if width is None: + width = precision + 5 + + kwargs = dict(width=width, precision=precision, align=align) + + # iterate units (multiples of 1024 bytes) + for unit in ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi']: + if abs(size) < 1024.0: + return template.format(size=size, unit=unit, **kwargs) + size /= 1024.0 + + else: + return template.format(size=size, unit='Yi', **kwargs) From 447102052dd714bed58e549a1dcf3a60dfe3dc99 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Thu, 21 Mar 2019 21:24:43 +0100 Subject: [PATCH 02/48] Add `helper` module. 
--- src/pywrangler/util/helper.py | 38 +++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 src/pywrangler/util/helper.py diff --git a/src/pywrangler/util/helper.py b/src/pywrangler/util/helper.py new file mode 100644 index 0000000..b845033 --- /dev/null +++ b/src/pywrangler/util/helper.py @@ -0,0 +1,38 @@ +"""This module contains commonly used helper functions or classes. + +""" + +from typing import Callable + + +def cached_property(method: Callable) -> property: + """Decorated method will be called only on first access to calculate a + cached property value. After that, the cached value is returned. + + Parameters + --------- + method: Callable + Getter method to be lazily evaluated. + + Returns + ------- + property + + Notes + ----- + Credit goes to python-pptx: https://github.com/scanny/python-pptx/blob/master/pptx/util.py + + """ # noqa: E501 + + cache_attr_name = '__{}'.format(method.__name__) + docstring = method.__doc__ + + def get_prop_value(obj): + try: + return getattr(obj, cache_attr_name) + except AttributeError: + value = method(obj) + setattr(obj, cache_attr_name, value) + return value + + return property(get_prop_value, doc=docstring) From c8ec9b869b6689b4e1789f8bf84219d609b1fb3b Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Thu, 21 Mar 2019 21:25:33 +0100 Subject: [PATCH 03/48] Add single pandas dataframe to exception of iterables. 
--- src/pywrangler/util/sanitizer.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/pywrangler/util/sanitizer.py b/src/pywrangler/util/sanitizer.py index ab4e5ab..31a133d 100644 --- a/src/pywrangler/util/sanitizer.py +++ b/src/pywrangler/util/sanitizer.py @@ -6,6 +6,8 @@ import collections from typing import Any, Tuple +import pandas as pd + def ensure_tuple(values: Any) -> Tuple[Any]: """For convenience, some parameters may accept a single value (string @@ -31,8 +33,8 @@ def ensure_tuple(values: Any) -> Tuple[Any]: elif not isinstance(values, collections.Iterable): return (values, ) - # handle single string which is iterable but still is only one value - elif isinstance(values, str): + # handle exception which are iterable but still count as one value + elif isinstance(values, (str, pd.DataFrame)): return (values, ) # anything else should ok to be converted to tuple From 8997f91c560f10942a5a52fcd8dea1dc53ab79de Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Thu, 21 Mar 2019 21:26:20 +0100 Subject: [PATCH 04/48] Replace `+` operator with `add` method following pandas/numpy warning. 
--- src/pywrangler/wranglers/pandas/interval_identifier.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pywrangler/wranglers/pandas/interval_identifier.py b/src/pywrangler/wranglers/pandas/interval_identifier.py index 74b1dfa..844fa08 100644 --- a/src/pywrangler/wranglers/pandas/interval_identifier.py +++ b/src/pywrangler/wranglers/pandas/interval_identifier.py @@ -195,10 +195,10 @@ def _transform(self, series: pd.Series) -> List[int]: bool_end_shift = bool_end.shift().fillna(False) # get increasing ids for intervals (in/valid) with cumsum - ser_ids = (bool_start + bool_end_shift).cumsum() + ser_ids = bool_start.add(bool_end_shift).cumsum() # separate valid vs invalid: ids with start AND end marker are valid - bool_valid_ids = (bool_start + bool_end).groupby(ser_ids).sum().eq(2) + bool_valid_ids = bool_start.add(bool_end).groupby(ser_ids).sum().eq(2) valid_ids = bool_valid_ids.index[bool_valid_ids].values bool_valid = ser_ids.isin(valid_ids) From dc4c599a77a03c442763c09fa83f2c1c9970f0e0 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Thu, 21 Mar 2019 21:27:50 +0100 Subject: [PATCH 05/48] Pass dict instead of items to print parameters correctly. 
--- src/pywrangler/wranglers/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pywrangler/wranglers/base.py b/src/pywrangler/wranglers/base.py index c535650..94b02a4 100644 --- a/src/pywrangler/wranglers/base.py +++ b/src/pywrangler/wranglers/base.py @@ -110,7 +110,7 @@ def __repr__(self): template = '{wrangler_name} ({computation_engine})\n\n{parameters}'\ parameters = (_pprint.header("Parameters", 3) + - _pprint.enumeration(self.get_params().items(), 3)) + _pprint.enumeration(self.get_params(), 3)) _repr = template.format(wrangler_name=self.__class__.__name__, computation_engine=self.computation_engine, From f064839c5472c106ee757117827e02bc00506dbb Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Thu, 21 Mar 2019 21:28:06 +0100 Subject: [PATCH 06/48] Add `exceptions` module. --- src/pywrangler/exceptions.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 src/pywrangler/exceptions.py diff --git a/src/pywrangler/exceptions.py b/src/pywrangler/exceptions.py new file mode 100644 index 0000000..09bccf3 --- /dev/null +++ b/src/pywrangler/exceptions.py @@ -0,0 +1,13 @@ +"""The module contains package wide custom exceptions and warnings. + +""" + + +class NotProfiledError(ValueError, AttributeError): + """Exception class to raise if profiling results are acquired before + calling `profile`. + + This class inherits from both ValueError and AttributeError to help with + exception handling + + """ From 82929d1133b397ae0cc7f520850e9158493ebebe Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Thu, 21 Mar 2019 21:28:48 +0100 Subject: [PATCH 07/48] Add `benchmark` module. 
--- src/pywrangler/benchmark.py | 380 ++++++++++++++++++++++++++++++++++++ 1 file changed, 380 insertions(+) create mode 100644 src/pywrangler/benchmark.py diff --git a/src/pywrangler/benchmark.py b/src/pywrangler/benchmark.py new file mode 100644 index 0000000..babc88f --- /dev/null +++ b/src/pywrangler/benchmark.py @@ -0,0 +1,380 @@ +"""This module contains benchmarking utility. + +""" + +import gc +import inspect +import sys +from typing import Iterable, List + +import numpy as np + +from pywrangler.exceptions import NotProfiledError +from pywrangler.util import sanitizer +from pywrangler.util._pprint import enumeration, header, sizeof +from pywrangler.util.helper import cached_property + + +def allocate_memory(size: float) -> np.ndarray: + """Occupies memory by creating numpy array with given size (MB). + + Numpy is used deliberately to specifically define the used memory via + dtype. + + Parameters + ---------- + size: float + Size in MB to be occupied. + + Returns + ------- + memory_holder: np.ndarray + + """ + + if size <= 0: + return None + + empty_size = sys.getsizeof(np.ones(0)) + + size_in_bytes = np.ceil(size * (2 ** 20)).astype(np.int64) - empty_size + memory_holder = np.ones(size_in_bytes, dtype=np.int8) + + return memory_holder + + +class BaseProfile: + """Base class defining interface and providing common helper methods. + + By convention, the profiled object should always the be the first argument + (ignoring self) passed to `__init__`. All public, relevant profiling + metrics have to be defined as properties. All private attributes (methods + and variables) need to start with an underscore. + + """ + + def profile(self, *args, **kwargs): + raise NotImplementedError + + def report(self): + """Creates basic report consisting the name of the profiler class, the + name of the profiled object, and all defined metrics/properties. 
+ + """ + + # get name of profiler + profiler_name = self.__class__.__name__ + + # get name of profiled object + parameters = inspect.signature(self.__init__).parameters.keys() + profiled_object = getattr(self, '_{}'.format(list(parameters)[0])) + + try: + profiled_obj_name = profiled_object.__name__ + except AttributeError: + profiled_obj_name = profiled_object.__class__.__name__ + + # get relevant metrics + ignore = ('profile', 'report', 'profile_report') + metric_names = [x for x in dir(self) + if not x.startswith('_') + and x not in ignore] + metric_values = {x: getattr(self, x) for x in metric_names} + + print(header('{}: {}'.format(profiler_name, profiled_obj_name)), '\n', + enumeration(metric_values), sep='') + + def profile_report(self, *args, **kwargs): + self.profile(*args, **kwargs).report() + + def _check_is_profiled(self, attributes: Iterable[str]) -> None: + """Check if `profile` was already called by ensuring passed attributes + are not `None`. + + Parameters + ---------- + attributes: + Attribute name(s) given as string or a list/tuple of strings + + Returns + ------- + None + + Raises + ------ + NotProfiledError + + Notes + ----- + Inspired by sklearns `check_is_fitted`. + + """ + + if any([getattr(self, x) is None for x in attributes]): + msg = ("This {}'s instance is not profiled yet. Call 'profile' " + "with appropriate arguments before using this method." + .format(self.__class__.__name__)) + + raise NotProfiledError(msg) + + @staticmethod + def _mb_to_bytes(size_mib: float) -> int: + """Helper method to convert MiB to Bytes. + + Parameters + ---------- + size_mib: float + Size in MiB + + Returns + ------- + size_bytes: int + Size in bytes. + + """ + + return int(size_mib * (2 ** 20)) + + +class MemoryProfile(BaseProfile): + """Approximate the maximum increase in memory usage when calling a given + function. 
The maximum increase is defined as the difference between the + maximum memory usage during function execution and the baseline memory + usage before function execution. + + In addition, compute the mean increase in baseline memory usage between + repetitions which might indicate memory leakage. + + The current solution is based on `memory_profiler` and is inspired by the + IPython `%memit` magic which additionally calls `gc.collect()` before + executing the function to get more stable results. + + Parameters + ---------- + func: callable + Callable object to be memory profiled. + repetitions: int, optional + Number of repetitions. + + """ + + def __init__(self, func, repetitions=5): + self._func = func + self._repetitions = repetitions + + self._max_usages = None + self._baselines = None + + def profile(self, *args, **kwargs): + """Executes the actual memory profiling. + + Parameters + ---------- + args: iterable, optional + Optional positional arguments passed to `func`. + kwargs: mapping, optional + Optional keyword arguments passed to `func`. + + """ + + from memory_profiler import memory_usage + + counter = 0 + baselines = [] + max_usages = [] + mem_args = (self._func, args, kwargs) + + while counter < self._repetitions: + gc.collect() + baseline = memory_usage()[0] + max_usage = memory_usage(mem_args, max_usage=True)[0] + + baselines.append(self._mb_to_bytes(baseline)) + max_usages.append(self._mb_to_bytes(max_usage)) + counter += 1 + + self._max_usages = max_usages + self._baselines = baselines + + return self + + @property + def max_usages(self) -> List[int]: + """Returns the absolute, maximum memory usages for each iteration in + bytes. + + """ + + self._check_is_profiled(['_max_usages', '_baselines']) + + return self._max_usages + + @property + def baselines(self) -> List[int]: + """Returns the absolute, baseline memory usages for each iteration in + bytes. The baseline memory usage is defined as the memory usage before + function execution. 
+ + """ + + self._check_is_profiled(['_max_usages', '_baselines']) + + return self._baselines + + @property + def increases(self) -> List[int]: + """Returns the absolute memory increase for each iteration in bytes. + The memory increase is defined as the difference between the maximum + memory usage during function execution and the baseline memory usage + before function execution. + + """ + + return np.subtract(self.max_usages, self.baselines).tolist() + + @property + def increases_mean(self) -> float: + """Returns the mean of the absolute memory increases across all + iterations. + + """ + + return float(np.mean(self.increases)) + + @property + def increases_std(self) -> float: + """Returns the standard variation of the absolute memory increases + across all iterations. + + """ + + return float(np.std(self.increases)) + + @property + def baseline_change(self) -> float: + """Returns the mean change in baseline memory usage across all + all iterations. The baseline memory usage is defined as the memory + usage before function execution. + """ + + changes = np.diff(self.baselines) + return float(np.mean(changes)) + + +class PandasMemoryProfiler(BaseProfile): + """Approximate memory usage for wrangler execution via `fit_transform` + for given input dataframes. + + Computes the ratio of maximum memory usage and input memory usage as an + estimate of how many times more memory is required for wrangler execution + in regard to the input memory usage. 
+ + """ + + def __init__(self, wrangler, repetitions=5, precision=2): + self._wrangler = wrangler + self._repetitions = repetitions + self._precision = precision + + self._memory_profile = None + self._usage_input = None + self._usage_output = None + + def profile(self, *dfs, **kwargs): + + memory_profile = MemoryProfile(self._wrangler.fit_transform, + self._repetitions) + self._memory_profile = memory_profile.profile(*dfs, **kwargs) + + self._usage_input = self._memory_usage_dfs(*dfs) + + dfs_output = self._wrangler.fit_transform(*dfs) + dfs_output = sanitizer.ensure_tuple(dfs_output) + self._usage_output = self._memory_usage_dfs(*dfs_output) + + return self + + @property + def usage_increases_mean(self): + """Returns the mean of the absolute memory increases across all + iterations. + + """ + + self._check_is_profiled(['_memory_profile']) + return self._memory_profile.increases_mean + + @property + def usage_input(self) -> float: + """Returns the memory usage of the input dataframes in bytes. + + """ + + self._check_is_profiled(['_usage_input']) + return self._usage_input + + @property + def usage_output(self) -> float: + """Returns the memory usage of the output dataframes in bytes. + + """ + + self._check_is_profiled(['_usage_output']) + return self._usage_output + + @cached_property + def usage_ratio(self) -> float: + """Returns the ratio of maximum memory usage and input memory usage. + A value of 0 means no memory consumption during execution. A value of 1 + means that the wrangler additionally requires the same amount of the + input memory usage during the `transform` step. A value of 2 means that + the wrangler requires twice the amount of the input dataframes memory + usage. + + """ + + return self.usage_increases_mean / self.usage_input + + def report(self): + """Profile memory usage via `profile` and provide human readable + report. 
+ + """ + + # string part for header + wrangler_name = self._wrangler.__class__.__name__ + str_header = header("{} - memory usage".format(wrangler_name)) + + # string part for input and output dfs + dict_dfs = {"Input dfs": sizeof(self.usage_input, self._precision), + "Ouput dfs": sizeof(self.usage_output, self._precision)} + + str_dfs = enumeration(dict_dfs, align_width=15, bullet_char="") + + # string part for transform/fit and ratio + str_inc = sizeof(self.usage_increases_mean) + str_std = sizeof(self._memory_profile.increases_std, + self._precision, width=0) + str_inc += " (Std: {})".format(str_std) + str_ratio = "{:>7.2f}".format(self.usage_ratio) + dict_inc = {"Fit/Transform": str_inc, + "Ratio": str_ratio} + + str_inc = enumeration(dict_inc, align_width=15, bullet_char="") + + # build complete string and print + template = "{}\n{}\n\n{}" + report_string = template.format(str_header, str_dfs, str_inc) + + print(report_string) + + @staticmethod + def _memory_usage_dfs(*dfs) -> int: + """Return the memory usage in Bytes for all dataframes `dfs`. + + """ + + mem_usages = [df.memory_usage(deep=True, index=True).sum() + for df in dfs] + + return int(np.sum(mem_usages)) From 24ae1c1dafde7f62ff330c7206a3460c007533d8 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Thu, 21 Mar 2019 21:29:21 +0100 Subject: [PATCH 08/48] Comment `src` to avoid duplicated coverage. --- .coveragerc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.coveragerc b/.coveragerc index d1652c5..776fdf7 100644 --- a/.coveragerc +++ b/.coveragerc @@ -6,7 +6,7 @@ source = pywrangler [paths] source = - src/ +# src/ */site-packages/ [report] From 132ad48ab47b991151e1cd733c672174dbe43103 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Thu, 21 Mar 2019 21:29:53 +0100 Subject: [PATCH 09/48] Add `memory_profiler` to testing. 
--- setup.cfg | 2 ++ 1 file changed, 2 insertions(+) diff --git a/setup.cfg b/setup.cfg index 20d4f29..5685a63 100644 --- a/setup.cfg +++ b/setup.cfg @@ -32,6 +32,7 @@ testing = pytest pytest-cov tox + memory_profiler dev = sphinx @@ -52,6 +53,7 @@ norecursedirs = dist build .tox + testpaths = tests [aliases] From f800ec609b90e330433a2496894ed0387fe87f6b Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Fri, 22 Mar 2019 14:18:02 +0100 Subject: [PATCH 10/48] Add more expressive doc strings. Add tests. --- src/pywrangler/benchmark.py | 143 ++++++++++++++++++++-------- tests/test_benchmark.py | 183 ++++++++++++++++++++++++++++++++++++ 2 files changed, 289 insertions(+), 37 deletions(-) create mode 100644 tests/test_benchmark.py diff --git a/src/pywrangler/benchmark.py b/src/pywrangler/benchmark.py index babc88f..ad553fb 100644 --- a/src/pywrangler/benchmark.py +++ b/src/pywrangler/benchmark.py @@ -8,11 +8,13 @@ from typing import Iterable, List import numpy as np +import pandas as pd from pywrangler.exceptions import NotProfiledError from pywrangler.util import sanitizer from pywrangler.util._pprint import enumeration, header, sizeof from pywrangler.util.helper import cached_property +from pywrangler.wranglers.pandas.base import PandasWrangler def allocate_memory(size: float) -> np.ndarray: @@ -43,17 +45,23 @@ def allocate_memory(size: float) -> np.ndarray: return memory_holder -class BaseProfile: - """Base class defining interface and providing common helper methods. +class BaseProfiler: + """Base class defining interface and providing common helper methods for + memory and time profiler. By convention, the profiled object should always the be the first argument - (ignoring self) passed to `__init__`. All public, relevant profiling - metrics have to be defined as properties. All private attributes (methods - and variables) need to start with an underscore. + (ignoring self) passed to `__init__`. All public profiling metrics have to + be defined as properties. 
All private attributes need to start with an + underscore. """ def profile(self, *args, **kwargs): + """Contains the actual profiling implementation and should always + return self. + + """ + raise NotImplementedError def report(self): @@ -85,6 +93,10 @@ def report(self): enumeration(metric_values), sep='') def profile_report(self, *args, **kwargs): + """Calls profile and report in sequence. + + """ + self.profile(*args, **kwargs).report() def _check_is_profiled(self, attributes: Iterable[str]) -> None: @@ -136,7 +148,7 @@ def _mb_to_bytes(size_mib: float) -> int: return int(size_mib * (2 ** 20)) -class MemoryProfile(BaseProfile): +class MemoryProfiler(BaseProfiler): """Approximate the maximum increase in memory usage when calling a given function. The maximum increase is defined as the difference between the maximum memory usage during function execution and the baseline memory @@ -205,7 +217,7 @@ def max_usages(self) -> List[int]: """ - self._check_is_profiled(['_max_usages', '_baselines']) + self._check_is_profiled(['_max_usages']) return self._max_usages @@ -217,7 +229,7 @@ def baselines(self) -> List[int]: """ - self._check_is_profiled(['_max_usages', '_baselines']) + self._check_is_profiled(['_baselines']) return self._baselines @@ -261,43 +273,78 @@ def baseline_change(self) -> float: return float(np.mean(changes)) -class PandasMemoryProfiler(BaseProfile): - """Approximate memory usage for wrangler execution via `fit_transform` - for given input dataframes. +class PandasMemoryProfiler(BaseProfiler): + """Approximate memory usage for pandas wrangler instances. - Computes the ratio of maximum memory usage and input memory usage as an - estimate of how many times more memory is required for wrangler execution - in regard to the input memory usage. + Memory consumption is profiled while calling `fit_transform` for given + input dataframes. + + As a key metric, `usage_ratio` is computed. 
It refers to the amount of + memory which is required to execute the `fit_transform` step. More + concretely, it estimates how much more memory is used standardized by the + input memory usage (memory usage increase during function execution divided + by memory usage of input dataframes). In other words, if you have a 1GB + input dataframe, and the `usage_ratio` is 5, `fit_transform` needs 5GB free + memory available to succeed. A `usage_ratio` of 0.5 given a 2GB input + dataframe would require 1GB free memory available for computation. + + Parameters + ---------- + wrangler: pywrangler.wranglers.pandas.base.PandasWrangler + The wrangler instance to be profiled. + repetitions: int + The number of measurements for memory profiling. + + Attributes + ---------- + usage_increases_mean: float + The mean of the absolute memory increases across all iterations in + bytes. + usage_input: int + Memory usage of input dataframes in bytes. + usage_output: int + Memory usage of output dataframes in bytes. + usage_ratio: float + The amount of memory required for computation in units of input + memory usage. """ - def __init__(self, wrangler, repetitions=5, precision=2): + def __init__(self, wrangler: PandasWrangler, repetitions: int = 5): self._wrangler = wrangler self._repetitions = repetitions - self._precision = precision self._memory_profile = None self._usage_input = None self._usage_output = None - def profile(self, *dfs, **kwargs): + def profile(self, *dfs: pd.DataFrame, **kwargs): + """Profiles the actual memory usage given input dataframes `dfs` + which are passed to `fit_transform`. 
+ - memory_profile = MemoryProfile(self._wrangler.fit_transform, - self._repetitions) - self._memory_profile = memory_profile.profile(*dfs, **kwargs) + """ + + # usage input self._usage_input = self._memory_usage_dfs(*dfs) + # usage output dfs_output = self._wrangler.fit_transform(*dfs) dfs_output = sanitizer.ensure_tuple(dfs_output) self._usage_output = self._memory_usage_dfs(*dfs_output) + # usage during fit_transform + memory_profile = MemoryProfiler(self._wrangler.fit_transform, + self._repetitions) + self._memory_profile = memory_profile.profile(*dfs, **kwargs) + return self @property - def usage_increases_mean(self): + def usage_increases_mean(self) -> float: """Returns the mean of the absolute memory increases across all - iterations. + iterations in bytes. """ @@ -324,12 +371,14 @@ def usage_output(self) -> float: @cached_property def usage_ratio(self) -> float: - """Returns the ratio of maximum memory usage and input memory usage. - A value of 0 means no memory consumption during execution. A value of 1 - means that the wrangler additionally requires the same amount of the - input memory usage during the `transform` step. A value of 2 means that - the wrangler requires twice the amount of the input dataframes memory - usage. + """Refers to the amount of memory which is required to execute the + `fit_transform` step. More concretely, it estimates how much more + memory is used standardized by the input memory usage (memory usage + increase during function execution divided by memory usage of input + dataframes). In other words, if you have a 1GB input dataframe, and the + `usage_ratio` is 5, `fit_transform` needs 5GB free memory available to + succeed. A `usage_ratio` of 0.5 given a 2GB input dataframe would + require 1GB free memory available for computation. """ @@ -337,30 +386,40 @@ def usage_ratio(self) -> float: def report(self): """Profile memory usage via `profile` and provide human readable - report. 
+ report including memory usage of input and output dataframes, memory + usage during `fit_transform`, the usage ratio and shows if + the wrangler may have side effects in regard to memory consumption via + the change in baseline memory usage. + + Returns + ------- + None. Prints report to stdout. """ + enum_kwargs = dict(align_width=15, bullet_char="") + # string part for header wrangler_name = self._wrangler.__class__.__name__ str_header = header("{} - memory usage".format(wrangler_name)) # string part for input and output dfs - dict_dfs = {"Input dfs": sizeof(self.usage_input, self._precision), - "Ouput dfs": sizeof(self.usage_output, self._precision)} + dict_dfs = {"Input dfs": sizeof(self.usage_input), + "Ouput dfs": sizeof(self.usage_output)} - str_dfs = enumeration(dict_dfs, align_width=15, bullet_char="") + str_dfs = enumeration(dict_dfs, **enum_kwargs) # string part for transform/fit and ratio str_inc = sizeof(self.usage_increases_mean) - str_std = sizeof(self._memory_profile.increases_std, - self._precision, width=0) + str_std = sizeof(self._memory_profile.increases_std, width=0) str_inc += " (Std: {})".format(str_std) str_ratio = "{:>7.2f}".format(self.usage_ratio) + str_baseline_change = sizeof(self._memory_profile.baseline_change) dict_inc = {"Fit/Transform": str_inc, - "Ratio": str_ratio} + "Ratio": str_ratio, + "Baseline change": str_baseline_change} - str_inc = enumeration(dict_inc, align_width=15, bullet_char="") + str_inc = enumeration(dict_inc, **enum_kwargs) # build complete string and print template = "{}\n{}\n\n{}" @@ -369,9 +428,19 @@ def report(self): print(report_string) @staticmethod - def _memory_usage_dfs(*dfs) -> int: + def _memory_usage_dfs(*dfs: pd.DataFrame) -> int: """Return the memory usage in Bytes for all dataframes `dfs`. + Parameters + ---------- + dfs: pd.DataFrame + The pandas dataframes for which memory usage should be computed. + + Returns + ------- + memory_usage: int + The computed memory usage in bytes. 
+ """ mem_usages = [df.memory_usage(deep=True, index=True).sum() diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py new file mode 100644 index 0000000..9fc67e4 --- /dev/null +++ b/tests/test_benchmark.py @@ -0,0 +1,183 @@ +"""This module contains tests for the benchmark utilities. + +""" + +import sys + +import pytest + +import numpy as np +import pandas as pd + +from pywrangler.benchmark import ( + BaseProfiler, + MemoryProfiler, + PandasMemoryProfiler, + allocate_memory +) +from pywrangler.exceptions import NotProfiledError +from pywrangler.wranglers.pandas.base import PandasSingleNoFit + +MIB = 2 ** 20 + + +def test_allocate_memory_empty(): + memory_holder = allocate_memory(0) + + assert memory_holder is None + + +def test_allocate_memory_5mb(): + memory_holder = allocate_memory(5) + + assert sys.getsizeof(memory_holder) == 5 * (2 ** 20) + + +def test_base_profiler_not_implemented(): + base_profiler = BaseProfiler() + + for will_raise in ('profile', 'profile_report'): + with pytest.raises(NotImplementedError): + getattr(base_profiler, will_raise)() + + +def test_base_profiler_check_is_profiled(): + base_profiler = BaseProfiler() + base_profiler._not_set = None + base_profiler._is_set = "value" + + with pytest.raises(NotProfiledError): + base_profiler._check_is_profiled(['_not_set']) + + base_profiler._check_is_profiled(['_is_set']) + + +def test_base_profiler_mb_to_bytes(): + assert BaseProfiler._mb_to_bytes(1) == 1048576 + assert BaseProfiler._mb_to_bytes(1.5) == 1572864 + assert BaseProfiler._mb_to_bytes(0.33) == 346030 + + +def test_memory_profiler_return_self(): + def dummy(): + pass + + memory_profiler = MemoryProfiler(dummy) + assert memory_profiler.profile() is memory_profiler + + +def test_memory_profiler_properties(): + def dummy(): + pass + + memory_profiler = MemoryProfiler(dummy) + memory_profiler._baselines = [0, 1, 2, 3] + memory_profiler._max_usages = [4, 5, 7, 8] + + assert memory_profiler.max_usages == memory_profiler._max_usages + 
assert memory_profiler.baselines == memory_profiler._baselines + assert memory_profiler.increases == [4, 4, 5, 5] + assert memory_profiler.increases_mean == 4.5 + assert memory_profiler.increases_std == 0.5 + assert memory_profiler.baseline_change == 1 + + +def test_memory_profiler_no_side_effect(): + def no_side_effect(): + dummy = 5 + return dummy + + assert MemoryProfiler(no_side_effect).profile().baseline_change < 0.5 * MIB + + +def test_memory_profiler_side_effect(): + side_effect_container = [] + + def side_effect(): + memory_holder = allocate_memory(5) + side_effect_container.append(memory_holder) + + return memory_holder + + assert MemoryProfiler(side_effect).profile().baseline_change > 4.9 * MIB + + +def test_memory_profiler_no_increase(): + def no_increase(): + pass + + assert MemoryProfiler(no_increase).profile().increases_mean < 0.1 * MIB + assert MemoryProfiler(no_increase).profile().increases_std < 0.1 * MIB + + +def test_memory_profiler_increase(): + def increase(): + memory_holder = allocate_memory(30) + return memory_holder + + assert MemoryProfiler(increase).profile().increases_mean > 29 * MIB + + +def test_pandas_memory_profiler_memory_usage_dfs(): + df1 = pd.DataFrame(np.random.rand(10)) + df2 = pd.DataFrame(np.random.rand(10)) + + test_input = [df1, df2] + test_output = int(df1.memory_usage(index=True, deep=True).sum() + + df2.memory_usage(index=True, deep=True).sum()) + + assert PandasMemoryProfiler._memory_usage_dfs(*test_input) == test_output + + +def test_pandas_memory_profiler_return_self(): + class DummyWrangler(PandasSingleNoFit): + def transform(self, df): + return pd.DataFrame() + + memory_profiler = PandasMemoryProfiler(DummyWrangler()) + + assert memory_profiler is memory_profiler.profile(pd.DataFrame()) + + +def test_pandas_memory_profiler_usage_increases_mean(): + empty_df = pd.DataFrame() + + class DummyWrangler(PandasSingleNoFit): + def transform(self, df): + return pd.DataFrame(allocate_memory(30)) + + memory_profiler = 
PandasMemoryProfiler(DummyWrangler()) + + assert memory_profiler.profile(empty_df).usage_increases_mean > 29 * MIB + + +def test_pandas_memory_profiler_usage_input_output(): + df_input = pd.DataFrame(np.random.rand(1000)) + df_output = pd.DataFrame(np.random.rand(10000)) + + test_df_input = df_input.memory_usage(index=True, deep=True).sum() + test_df_output = df_output.memory_usage(index=True, deep=True).sum() + + class DummyWrangler(PandasSingleNoFit): + def transform(self, df): + return df_output + + memory_profiler = PandasMemoryProfiler(DummyWrangler()).profile(df_input) + + assert memory_profiler.usage_input == test_df_input + assert memory_profiler.usage_output == test_df_output + + +def test_pandas_memory_profiler_usage_ratio(): + usage_mib = 30 + df_input = pd.DataFrame(np.random.rand(1000000)) + usage_input = df_input.memory_usage(index=True, deep=True).sum() + test_output = ((usage_mib - 1) * MIB) / usage_input + + class DummyWrangler(PandasSingleNoFit): + def transform(self, df): + return pd.DataFrame(allocate_memory(usage_mib)) + + memory_profiler = PandasMemoryProfiler(DummyWrangler()) + + assert memory_profiler.profile(df_input).usage_ratio > test_output From ebfb04ea083d0cf50be5dc8dd03edffcea12cd4f Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Fri, 22 Mar 2019 14:21:06 +0100 Subject: [PATCH 11/48] Add and adjust tests for `enumeration` and `sizeof`. 
--- tests/util/test_pprint.py | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/tests/util/test_pprint.py b/tests/util/test_pprint.py index eb3967e..35a71b0 100644 --- a/tests/util/test_pprint.py +++ b/tests/util/test_pprint.py @@ -2,13 +2,10 @@ """ -import pytest - from pywrangler.util import _pprint def test_join(): - test_input = ["a", "b", "c"] test_output = "a\nb\nc" @@ -16,7 +13,6 @@ def test_join(): def test_indent(): - test_input = ["a", "b", "c"] test_output = [" a", " b", " c"] @@ -24,7 +20,6 @@ def test_indent(): def test_header(): - test_input = "Header" test_output = 'Header\n------\n' @@ -32,7 +27,6 @@ def test_header(): def test_header_with_indent(): - test_input = "Header" test_output = ' Header\n ------\n' @@ -40,23 +34,34 @@ def test_header_with_indent(): def test_header_with_underline(): - test_input = "Header" test_output = 'Header\n======\n' assert _pprint.header(test_input, underline="=") == test_output -def test_enumeration_dict(): +def test_enumeration_dict_align_values_false(): + test_input = {"a": 1, "bb": 2} + test_output = '- a: 1\n- bb: 2' + + assert _pprint.enumeration(test_input, align_values=False) == test_output + - test_input = {"a": 1, "b": 2} - test_output = '- a: 1\n- b: 2' +def test_enumeration_dict_align_values(): + test_input = {"a": 1, "bb": 2} + test_output = '- a: 1\n- bb: 2' assert _pprint.enumeration(test_input) == test_output -def test_enumeration_list(): +def test_enumeration_dict_align_values_with_align_width(): + test_input = {"a": 1, "bb": 2} + test_output = '- a: 1\n- bb: 2' + + assert _pprint.enumeration(test_input, align_width=3) == test_output + +def test_enumeration_list(): test_input = ["note 1", "note 2"] test_output = '- note 1\n- note 2' @@ -64,7 +69,6 @@ def test_enumeration_list(): def test_enumeration_list_with_indent(): - test_input = ["note 1", "note 2"] test_output = ' - note 1\n - note 2' @@ -72,8 +76,15 @@ def 
test_enumeration_list_with_indent(): def test_enumeration_list_with_bullet(): - test_input = ["note 1", "note 2"] test_output = 'o note 1\no note 2' assert _pprint.enumeration(test_input, bullet_char="o") == test_output + + +def test_sizeof(): + assert _pprint.sizeof(1024, precision=1, width=0) == '1.0 KiB' + assert _pprint.sizeof(1024, precision=1) == ' 1.0 KiB' + assert _pprint.sizeof(1024, precision=1, align="<") == '1.0 KiB' + assert _pprint.sizeof(1024 ** 2, precision=1, width=0) == '1.0 MiB' + assert _pprint.sizeof(1024 ** 8, precision=2, width=0) == '1.00 YiB' From e8a8a296397d2351e5b4c5f25c4f04ad2c18d7d2 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Sat, 23 Mar 2019 15:37:17 +0100 Subject: [PATCH 12/48] Add `TimeProfiler` and `PandasTimeProfiler` with tests to benchmark module. --- src/pywrangler/benchmark.py | 160 ++++++++++++++++++++++++++++++++---- tests/test_benchmark.py | 60 ++++++++++++++ 2 files changed, 206 insertions(+), 14 deletions(-) diff --git a/src/pywrangler/benchmark.py b/src/pywrangler/benchmark.py index ad553fb..6db33f5 100644 --- a/src/pywrangler/benchmark.py +++ b/src/pywrangler/benchmark.py @@ -5,7 +5,8 @@ import gc import inspect import sys -from typing import Iterable, List +import timeit +from typing import Callable, Iterable, List, Union import numpy as np import pandas as pd @@ -157,7 +158,7 @@ class MemoryProfiler(BaseProfiler): In addition, compute the mean increase in baseline memory usage between repetitions which might indicate memory leakage. - The current solution is based on `memory_profiler` and is inspired by the + The implementation is based on `memory_profiler` and is inspired by the IPython `%memit` magic which additionally calls `gc.collect()` before executing the function to get more stable results. 
@@ -273,7 +274,143 @@ def baseline_change(self) -> float: return float(np.mean(changes)) -class PandasMemoryProfiler(BaseProfiler): +class TimeProfiler(BaseProfiler): + """Approximate the time required to call a given function. + + The implementation is based on standard library's `timeit` module. By + default, the number of repetitions is estimated if not set explicitly. + + Parameters + ---------- + func: callable + Callable object to be memory profiled. + repetitions: None, int, optional + Number of repetitions. If `None`, `timeit.Timer.autorange` will + determine a sensible default. + + Attributes + ---------- + median: float + The median of the timing measurements in seconds. + standard_deviation: float + The standard deviation of the timing measurements in seconds. + fastast: float + The fastest value of the timing measurements in seconds. + repetitions: int + The number of measurements. + + """ + + def __init__(self, func: Callable, repetitions: Union[None, int] = None): + self._func = func + self._repetitions = repetitions + + self._timings = None + self._timings_mean = None + self._timings_std = None + self._fastest = None + + def profile(self, *args, **kwargs): + """Executes the actual time profiling. + + Parameters + ---------- + args: iterable, optional + Optional positional arguments passed to `func`. + kwargs: mapping, optional + Optional keyword arguments passed to `func`. + + """ + + def wrapper(): + """Helper function without arguments which is passed to `repeat` + which only calls given function with provided args and kwargs. + + """ + + self._func(*args, **kwargs) + + timer = timeit.Timer(stmt=wrapper) + + if self._repetitions is None: + repeat, _ = timer.autorange(None) + else: + repeat = self._repetitions + + self._timings = timer.repeat(number=1, repeat=repeat) + + return self + + @property + def median(self) -> float: + """Returns the median of all timeit measurements in seconds. 
+ + """ + self._check_is_profiled(['_timings']) + + return float(np.median(self._timings)) + + @property + def standard_deviation(self) -> float: + """Returns the standard deviation of all timeit measurements in + seconds. + + """ + self._check_is_profiled(['_timings']) + + return float(np.std(self._timings)) + + @property + def fastest(self) -> float: + """Returns the fastest timing measurement in seconds. + + """ + + self._check_is_profiled(['_timings']) + + return min(self._timings) + + @property + def repetitions(self) -> int: + """Returns the number of measurements. + + """ + + return len(self._timings) + + +class PandasTimeProfiler(TimeProfiler): + """Approximate time which pandas wrangler instances require during their + `fit_transform` step. + + Parameters + ---------- + wrangler: pywrangler.wranglers.pandas.base.PandasWrangler + The wrangler instance to be profiled. + repetitions: None, int, optional + Number of repetitions. If `None`, `timeit.Timer.autorange` will + determine a sensible default. + + Attributes + ---------- + median: float + The median of the timing measurements in seconds. + standard_deviation: float + The standard deviation of the timing measurements in seconds. + fastast: float + The fastest value of the timing measurements in seconds. + repetitions: int + The number of measurements. + + """ + + def __init__(self, wrangler: PandasWrangler, + repetitions: Union[None, int] = None): + self._wrangler = wrangler + super().__init__(wrangler.fit_transform, repetitions) + + +class PandasMemoryProfiler(MemoryProfiler): """Approximate memory usage for pandas wrangler instances. 
Memory consumption is profiled while calling `fit_transform` for given @@ -312,18 +449,16 @@ class PandasMemoryProfiler(BaseProfiler): def __init__(self, wrangler: PandasWrangler, repetitions: int = 5): self._wrangler = wrangler - self._repetitions = repetitions - self._memory_profile = None self._usage_input = None self._usage_output = None + super().__init__(wrangler.fit_transform, repetitions) + def profile(self, *dfs: pd.DataFrame, **kwargs): """Profiles the actual memory usage given input dataframes `dfs` which are passed to `fit_transform`. - - """ # usage input @@ -335,9 +470,7 @@ def profile(self, *dfs: pd.DataFrame, **kwargs): self._usage_output = self._memory_usage_dfs(*dfs_output) # usage during fit_transform - memory_profile = MemoryProfiler(self._wrangler.fit_transform, - self._repetitions) - self._memory_profile = memory_profile.profile(*dfs, **kwargs) + super().profile(*dfs, **kwargs) return self @@ -348,8 +481,7 @@ def usage_increases_mean(self) -> float: """ - self._check_is_profiled(['_memory_profile']) - return self._memory_profile.increases_mean + return self.increases_mean @property def usage_input(self) -> float: @@ -411,10 +543,10 @@ def report(self): # string part for transform/fit and ratio str_inc = sizeof(self.usage_increases_mean) - str_std = sizeof(self._memory_profile.increases_std, width=0) + str_std = sizeof(self.increases_std, width=0) str_inc += " (Std: {})".format(str_std) str_ratio = "{:>7.2f}".format(self.usage_ratio) - str_baseline_change = sizeof(self._memory_profile.baseline_change) + str_baseline_change = sizeof(self.baseline_change) dict_inc = {"Fit/Transform": str_inc, "Ratio": str_ratio, "Baseline change": str_baseline_change} diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index 9fc67e4..15685fc 100644 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -3,6 +3,7 @@ """ import sys +import time import pytest @@ -13,6 +14,8 @@ BaseProfiler, MemoryProfiler, PandasMemoryProfiler, + 
PandasTimeProfiler, + TimeProfiler, allocate_memory ) from pywrangler.exceptions import NotProfiledError @@ -181,3 +184,60 @@ def transform(self, df): memory_profiler = PandasMemoryProfiler(DummyWrangler()) assert memory_profiler.profile(df_input).usage_ratio > test_output + + +def test_time_profiler_return_self(): + def dummy(): + pass + + time_profiler = TimeProfiler(dummy, 1) + assert time_profiler.profile() is time_profiler + + +def test_time_profiler_properties(): + def dummy(): + pass + + time_profiler = TimeProfiler(dummy) + time_profiler._timings = [1, 1, 3, 3] + + assert time_profiler.median == 2 + assert time_profiler.standard_deviation == 1 + assert time_profiler.fastest == 1 + assert time_profiler.repetitions == 4 + + +def test_time_profiler_repetitions(): + def dummy(): + pass + + time_profiler = TimeProfiler(dummy, repetitions=10).profile() + + assert time_profiler.repetitions == 10 + + +def test_time_profiler_fastest(): + sleep = 0.0001 + + def dummy(): + time.sleep(sleep) + pass + + time_profiler = TimeProfiler(dummy, repetitions=1).profile() + + assert time_profiler.fastest >= sleep + + +def test_pandas_time_profiler_fastest(): + + sleep = 0.0001 + df_dummy = pd.DataFrame() + + class DummyWrangler(PandasSingleNoFit): + def transform(self, df): + time.sleep(sleep) + pass + + time_profiler = PandasTimeProfiler(DummyWrangler(), 1).profile(df_dummy) + + time_profiler.fastest >= sleep From 8d6c495e393d6336068a227a98e73a99a122e17f Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Wed, 3 Apr 2019 19:31:44 +0200 Subject: [PATCH 13/48] Improve doc strings. Simplify attributes of `PandasMemoryProfiler`. 
Add `timings` property to `TimeProfiler.` --- src/pywrangler/benchmark.py | 110 +++++++++++++++++++----------------- tests/test_benchmark.py | 10 ++-- 2 files changed, 65 insertions(+), 55 deletions(-) diff --git a/src/pywrangler/benchmark.py b/src/pywrangler/benchmark.py index 6db33f5..8b6156f 100644 --- a/src/pywrangler/benchmark.py +++ b/src/pywrangler/benchmark.py @@ -19,15 +19,15 @@ def allocate_memory(size: float) -> np.ndarray: - """Occupies memory by creating numpy array with given size (MB). + """Helper function for testing to allocate memory by creating numpy array + with given size in MiB. - Numpy is used deliberately to specifically define the used memory via - dtype. + Numpy is used deliberately to define the used memory via dtype. Parameters ---------- size: float - Size in MB to be occupied. + Size in MiB to be occupied. Returns ------- @@ -47,13 +47,13 @@ def allocate_memory(size: float) -> np.ndarray: class BaseProfiler: - """Base class defining interface and providing common helper methods for - memory and time profiler. + """Base class defining interface and common helper methods for memory and + time profiler. - By convention, the profiled object should always the be the first argument - (ignoring self) passed to `__init__`. All public profiling metrics have to - be defined as properties. All private attributes need to start with an - underscore. + By convention, the profiled object should always be the first argument + (ignoring self) passed to `__init__`. + All public profiling metrics have to should be defined as properties. All + private attributes need to start with an underscore. """ @@ -66,8 +66,8 @@ def profile(self, *args, **kwargs): raise NotImplementedError def report(self): - """Creates basic report consisting the name of the profiler class, the - name of the profiled object, and all defined metrics/properties. 
+ """Print simple report consisting of the name of the profiler class, + the name of the profiled object, and all defined metrics/properties. """ @@ -150,18 +150,14 @@ def _mb_to_bytes(size_mib: float) -> int: class MemoryProfiler(BaseProfiler): - """Approximate the maximum increase in memory usage when calling a given - function. The maximum increase is defined as the difference between the - maximum memory usage during function execution and the baseline memory - usage before function execution. + """Approximate the increase in memory usage when calling a given function. + Memory increase is defined as the difference between the maximum memory + usage during function execution and the baseline memory usage before + function execution. In addition, compute the mean increase in baseline memory usage between repetitions which might indicate memory leakage. - The implementation is based on `memory_profiler` and is inspired by the - IPython `%memit` magic which additionally calls `gc.collect()` before - executing the function to get more stable results. - Parameters ---------- func: callable @@ -169,6 +165,12 @@ class MemoryProfiler(BaseProfiler): repetitions: int, optional Number of repetitions. + Notes + ----- + The implementation is based on `memory_profiler` and is inspired by the + IPython `%memit` magic which additionally calls `gc.collect()` before + executing the function to get more stable results. + """ def __init__(self, func, repetitions=5): @@ -275,10 +277,9 @@ def baseline_change(self) -> float: class TimeProfiler(BaseProfiler): - """Approximate the time required to call a given function. + """Approximate the time required to execute a function call. - The implementation is based on standard library's `timeit` module. By - default, the number of repetitions is estimated if not set explicitly. + By default, the number of repetitions is estimated if not set explicitly. 
Parameters ---------- @@ -290,6 +291,8 @@ class TimeProfiler(BaseProfiler): Attributes ---------- + timings: list + The timing measurements in seconds. median: float The median of the timing measurements in seconds. standard_deviation: float @@ -299,6 +302,10 @@ class TimeProfiler(BaseProfiler): repetitions: int The number of measurements. + Notes + ----- + The implementation is based on standard library's `timeit` module. + """ def __init__(self, func: Callable, repetitions: Union[None, int] = None): @@ -341,11 +348,20 @@ def wrapper(): return self + @property + def timings(self) -> List[float]: + """Returns the timeit measurements in seconds. + + """ + + return self._timings + @property def median(self) -> float: """Returns the median of all timeit measurements in seconds. """ + self._check_is_profiled(['_timings']) return float(np.median(self._timings)) @@ -356,6 +372,7 @@ def standard_deviation(self) -> float: seconds. """ + self._check_is_profiled(['_timings']) return float(np.std(self._timings)) @@ -380,7 +397,7 @@ def repetitions(self) -> int: class PandasTimeProfiler(TimeProfiler): - """Approximate time which pandas wrangler instances require during their + """Approximate time that a pandas wrangler instance requires to execute the `fit_transform` step. Parameters @@ -393,6 +410,8 @@ class PandasTimeProfiler(TimeProfiler): Attributes ---------- + timings: list + The timing measurements in seconds. median: float The median of the timing measurements in seconds. standard_deviation: float @@ -411,12 +430,10 @@ def __init__(self, wrangler: PandasWrangler, class PandasMemoryProfiler(MemoryProfiler): - """Approximate memory usage for pandas wrangler instances. - - Memory consumption is profiled while calling `fit_transform` for given - input dataframes. + """Approximate memory usage that a pandas wrangler instance requires to + execute the `fit_transform` step. - As a key metric, `usage_ratio` is computed. 
It refers to the amount of + As a key metric, `ratio` is computed. It refers to the amount of memory which is required to execute the `fit_transform` step. More concretely, it estimates how much more memory is used standardized by the input memory usage (memory usage increase during function execution divided @@ -434,14 +451,14 @@ class PandasMemoryProfiler(MemoryProfiler): Attributes ---------- - usage_increases_mean: float + increases_mean: float The mean of the absolute memory increases across all iterations in bytes. - usage_input: int + input: int Memory usage of input dataframes in bytes. - usage_output: int + output: int Memory usage of output dataframes in bytes. - usage_ratio: float + ratio: float The amount of memory required for computation in units of input memory usage. @@ -475,16 +492,7 @@ def profile(self, *dfs: pd.DataFrame, **kwargs): return self @property - def usage_increases_mean(self) -> float: - """Returns the mean of the absolute memory increases across all - iterations in bytes. - - """ - - return self.increases_mean - - @property - def usage_input(self) -> float: + def input(self) -> float: """Returns the memory usage of the input dataframes in bytes. """ @@ -493,7 +501,7 @@ def usage_input(self) -> float: return self._usage_input @property - def usage_output(self) -> float: + def output(self) -> float: """Returns the memory usage of the output dataframes in bytes. """ @@ -502,7 +510,7 @@ def usage_output(self) -> float: return self._usage_output @cached_property - def usage_ratio(self) -> float: + def ratio(self) -> float: """Refers to the amount of memory which is required to execute the `fit_transform` step. 
More concretely, it estimates how much more memory is used standardized by the input memory usage (memory usage @@ -514,7 +522,7 @@ def usage_ratio(self) -> float: """ - return self.usage_increases_mean / self.usage_input + return self.increases_mean / self.input def report(self): """Profile memory usage via `profile` and provide human readable @@ -536,16 +544,16 @@ def report(self): str_header = header("{} - memory usage".format(wrangler_name)) # string part for input and output dfs - dict_dfs = {"Input dfs": sizeof(self.usage_input), - "Ouput dfs": sizeof(self.usage_output)} + dict_dfs = {"Input dfs": sizeof(self.input), + "Ouput dfs": sizeof(self.output)} str_dfs = enumeration(dict_dfs, **enum_kwargs) # string part for transform/fit and ratio - str_inc = sizeof(self.usage_increases_mean) + str_inc = sizeof(self.increases_mean) str_std = sizeof(self.increases_std, width=0) str_inc += " (Std: {})".format(str_std) - str_ratio = "{:>7.2f}".format(self.usage_ratio) + str_ratio = "{:>7.2f}".format(self.ratio) str_baseline_change = sizeof(self.baseline_change) dict_inc = {"Fit/Transform": str_inc, "Ratio": str_ratio, @@ -561,7 +569,7 @@ def report(self): @staticmethod def _memory_usage_dfs(*dfs: pd.DataFrame) -> int: - """Return the memory usage in Bytes for all dataframes `dfs`. + """Return memory usage in bytes for all given dataframes. 
Parameters ---------- diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index 15685fc..3fd493b 100644 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -151,7 +151,7 @@ def transform(self, df): memory_profiler = PandasMemoryProfiler(DummyWrangler()) - assert memory_profiler.profile(empty_df).usage_increases_mean > 29 * MIB + assert memory_profiler.profile(empty_df).increases_mean > 29 * MIB def test_pandas_memory_profiler_usage_input_output(): @@ -167,8 +167,8 @@ def transform(self, df): memory_profiler = PandasMemoryProfiler(DummyWrangler()).profile(df_input) - assert memory_profiler.usage_input == test_df_input - assert memory_profiler.usage_output == test_df_output + assert memory_profiler.input == test_df_input + assert memory_profiler.output == test_df_output def test_pandas_memory_profiler_usage_ratio(): @@ -183,7 +183,7 @@ def transform(self, df): memory_profiler = PandasMemoryProfiler(DummyWrangler()) - assert memory_profiler.profile(df_input).usage_ratio > test_output + assert memory_profiler.profile(df_input).ratio > test_output def test_time_profiler_return_self(): @@ -195,6 +195,7 @@ def dummy(): def test_time_profiler_properties(): + def dummy(): pass @@ -205,6 +206,7 @@ def dummy(): assert time_profiler.standard_deviation == 1 assert time_profiler.fastest == 1 assert time_profiler.repetitions == 4 + assert time_profiler.timings == time_profiler._timings def test_time_profiler_repetitions(): From 1ba767fd03af16cff02aa5ac46cccd97d74b3332 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Wed, 3 Apr 2019 21:30:41 +0200 Subject: [PATCH 14/48] Add dask base wrangler. 
--- src/pywrangler/wranglers/dask/__init__.py | 0 src/pywrangler/wranglers/dask/base.py | 53 +++++++++++++++++++++++ tests/wranglers/dask/__init__.py | 0 tests/wranglers/dask/test_base.py | 17 ++++++++ 4 files changed, 70 insertions(+) create mode 100644 src/pywrangler/wranglers/dask/__init__.py create mode 100644 src/pywrangler/wranglers/dask/base.py create mode 100644 tests/wranglers/dask/__init__.py create mode 100644 tests/wranglers/dask/test_base.py diff --git a/src/pywrangler/wranglers/dask/__init__.py b/src/pywrangler/wranglers/dask/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/pywrangler/wranglers/dask/base.py b/src/pywrangler/wranglers/dask/base.py new file mode 100644 index 0000000..6953c16 --- /dev/null +++ b/src/pywrangler/wranglers/dask/base.py @@ -0,0 +1,53 @@ +"""This module contains the dask base wrangler. + +""" + +from dask.dataframe import DataFrame + +from pywrangler.wranglers.base import BaseWrangler + + +class DaskWrangler(BaseWrangler): + """Contains methods common to all dask based wranglers. + + """ + + @property + def computation_engine(self): + return "dask" + + +class DaskSingleNoFit(DaskWrangler): + """Mixin class defining `fit` and `fit_transform` for all wranglers with + a single data frame input and output with no fitting necessary. + + """ + + def fit(self, df: DataFrame): + """Do nothing and return the wrangler unchanged. + + This method is just there to implement the usual API and hence work in + pipelines. + + Parameters + ---------- + df: pd.DataFrame + + """ + + return self + + def fit_transform(self, df: DataFrame) -> DataFrame: + """Apply fit and transform in sequence at once. 
+ + Parameters + ---------- + df: pd.DataFrame + + Returns + ------- + result: pd.DataFrame + + """ + + return self.fit(df).transform(df) diff --git a/tests/wranglers/dask/__init__.py b/tests/wranglers/dask/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/wranglers/dask/test_base.py b/tests/wranglers/dask/test_base.py new file mode 100644 index 0000000..da4ae61 --- /dev/null +++ b/tests/wranglers/dask/test_base.py @@ -0,0 +1,17 @@ +"""Test dask base wrangler. + +""" + +import pytest + +try: + from pywrangler.wranglers.dask.base import DaskWrangler +except ImportError: + DaskWrangler = None + + +@pytest.mark.dask +def test_dask_base_wrangler_engine(): + wrangler = DaskWrangler() + + assert wrangler.computation_engine == "dask" From dea81bd283b9636853cb6fc1d6a0fcca9daf5578 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Wed, 3 Apr 2019 21:31:10 +0200 Subject: [PATCH 15/48] Add spark base wrangler. --- src/pywrangler/wranglers/spark/__init__.py | 0 src/pywrangler/wranglers/spark/base.py | 53 ++++++++++++++++++++++ tests/wranglers/spark/__init__.py | 0 tests/wranglers/spark/test_base.py | 17 +++++++ 4 files changed, 70 insertions(+) create mode 100644 src/pywrangler/wranglers/spark/__init__.py create mode 100644 src/pywrangler/wranglers/spark/base.py create mode 100644 tests/wranglers/spark/__init__.py create mode 100644 tests/wranglers/spark/test_base.py diff --git a/src/pywrangler/wranglers/spark/__init__.py b/src/pywrangler/wranglers/spark/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/pywrangler/wranglers/spark/base.py b/src/pywrangler/wranglers/spark/base.py new file mode 100644 index 0000000..8f7ba14 --- /dev/null +++ b/src/pywrangler/wranglers/spark/base.py @@ -0,0 +1,53 @@ +"""This module contains the dask base wrangler. 
+ +""" + +from pyspark.sql import DataFrame + +from pywrangler.wranglers.base import BaseWrangler + + +class SparkWrangler(BaseWrangler): + """Contains methods common to all spark based wranglers. + + """ + + @property + def computation_engine(self): + return "spark" + + +class SparkSingleNoFit(SparkWrangler): + """Mixin class defining `fit` and `fit_transform` for all wranglers with + a single data frame input and output with no fitting necessary. + + """ + + def fit(self, df: DataFrame): + """Do nothing and return the wrangler unchanged. + + This method is just there to implement the usual API and hence work in + pipelines. + + Parameters + ---------- + df: pd.DataFrame + + """ + + return self + + def fit_transform(self, df: DataFrame) -> DataFrame: + """Apply fit and transform in sequence at once. + + Parameters + ---------- + df: pd.DataFrame + + Returns + ------- + result: pd.DataFrame + + """ + + return self.fit(df).transform(df) diff --git a/tests/wranglers/spark/__init__.py b/tests/wranglers/spark/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/wranglers/spark/test_base.py b/tests/wranglers/spark/test_base.py new file mode 100644 index 0000000..6c20d21 --- /dev/null +++ b/tests/wranglers/spark/test_base.py @@ -0,0 +1,17 @@ +"""Test spark base wrangler. + +""" + +import pytest + +try: + from pywrangler.wranglers.spark.base import SparkWrangler +except ImportError: + SparkWrangler = None + + +@pytest.mark.pyspark +def test_spark_base_wrangler_engine(): + wrangler = SparkWrangler() + + assert wrangler.computation_engine == "spark" From 533685e6dbe00e161455f762284db24e25cbafd0 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Wed, 3 Apr 2019 21:31:26 +0200 Subject: [PATCH 16/48] Fix typo. 
--- tests/test_environment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_environment.py b/tests/test_environment.py index d8cee0d..5018288 100644 --- a/tests/test_environment.py +++ b/tests/test_environment.py @@ -45,7 +45,7 @@ def test_pyspark_import(): @pytest.mark.pyspark def test_pyspark_pandas_interaction(spark): - """Check simple interaction between pyspark and pandes. + """Check simple interaction between pyspark and pandas. """ From 15d0952ae4f1493cf7f5396b4aeb257bd5095d1a Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Wed, 3 Apr 2019 21:33:04 +0200 Subject: [PATCH 17/48] Add custom `report` method for `TimeProfiler`. Add `SparkTimeProfiler`. --- src/pywrangler/benchmark.py | 120 ++++++++++++++++++++++++++++++++++-- tests/test_benchmark.py | 59 ++++++++++++++++-- 2 files changed, 169 insertions(+), 10 deletions(-) diff --git a/src/pywrangler/benchmark.py b/src/pywrangler/benchmark.py index 8b6156f..b823234 100644 --- a/src/pywrangler/benchmark.py +++ b/src/pywrangler/benchmark.py @@ -6,7 +6,7 @@ import inspect import sys import timeit -from typing import Callable, Iterable, List, Union +from typing import Any, Callable, Iterable, List, Union import numpy as np import pandas as pd @@ -15,7 +15,17 @@ from pywrangler.util import sanitizer from pywrangler.util._pprint import enumeration, header, sizeof from pywrangler.util.helper import cached_property -from pywrangler.wranglers.pandas.base import PandasWrangler +from pywrangler.wranglers.base import BaseWrangler + +try: + from pyspark.sql import DataFrame as SparkDataFrame +except ImportError: + SparkDataFrame = Any + +try: + from dask.dataframe import DataFrame as DaskDataFrame +except ImportError: + DaskDataFrame = Any def allocate_memory(size: float) -> np.ndarray: @@ -395,6 +405,35 @@ def repetitions(self) -> int: return len(self._timings) + def report(self): + """Profile time via `profile` and provide human readable report. + + Returns + ------- + None. 
Prints report to stdout. + + """ + + enum_kwargs = dict(align_width=15, bullet_char="") + + # string part for header + wrangler_name = self._wrangler.__class__.__name__ + str_header = header("{} - time profiling".format(wrangler_name)) + + # string part for values + dict_values = {"Fastest": "{:.2f}s".format(self.fastest), + "Median": "{:.2f}s".format(self.median), + "Std": "{:.2f}s".format(self.standard_deviation), + "Repetitions": self.repetitions} + + str_values = enumeration(dict_values, **enum_kwargs) + + # build complete string and print + template = "{}\n{}\n" + report_string = template.format(str_header, str_values) + + print(report_string) + class PandasTimeProfiler(TimeProfiler): """Approximate time that a pandas wrangler instance requires to execute the @@ -402,7 +441,7 @@ class PandasTimeProfiler(TimeProfiler): Parameters ---------- - wrangler: pywrangler.wranglers.pandas.base.PandasWrangler + wrangler: pywrangler.wranglers.base.BaseWrangler The wrangler instance to be profiled. repetitions: None, int, optional Number of repetitions. If `None`, `timeit.Timer.autorange` will @@ -423,7 +462,7 @@ class PandasTimeProfiler(TimeProfiler): """ - def __init__(self, wrangler: PandasWrangler, + def __init__(self, wrangler: BaseWrangler, repetitions: Union[None, int] = None): self._wrangler = wrangler super().__init__(wrangler.fit_transform, repetitions) @@ -464,7 +503,7 @@ class PandasMemoryProfiler(MemoryProfiler): """ - def __init__(self, wrangler: PandasWrangler, repetitions: int = 5): + def __init__(self, wrangler: BaseWrangler, repetitions: int = 5): self._wrangler = wrangler self._usage_input = None @@ -587,3 +626,74 @@ def _memory_usage_dfs(*dfs: pd.DataFrame) -> int: for df in dfs] return int(np.sum(mem_usages)) + + +class SparkTimeProfiler(TimeProfiler): + """Approximate time that a spark wrangler instance requires to execute the + `fit_transform` step. 
+ + Please note, input dataframes are cached before timing execution to ensure + timing measurements only capture wrangler's `fit_transform`. This may cause + problems if the size of input dataframes exceeds available memory. + + Parameters + ---------- + wrangler: pywrangler.wranglers.base.BaseWrangler + The wrangler instance to be profiled. + repetitions: None, int, optional + Number of repetitions. If `None`, `timeit.Timer.autorange` will + determine a sensible default. + + Attributes + ---------- + timings: list + The timing measurements in seconds. + median: float + The median of the timing measurements in seconds. + standard_deviation: float + The standard deviation of the timing measurements in seconds. + fastast: float + The fastest value of the timing measurements in seconds. + repetitions: int + The number of measurements. + + """ + + def __init__(self, wrangler: BaseWrangler, + repetitions: Union[None, int] = None): + self._wrangler = wrangler + + def wrapper(*args, **kwargs): + """Wrapper function to call `count()` to enforce computation. + + """ + + wrangler.fit_transform(*args, **kwargs).count() + + super().__init__(wrapper, repetitions) + + def profile(self, *dfs: SparkDataFrame, **kwargs): + """Profiles timing given input dataframes `dfs` which are passed to + `fit_transform`. + + Please note, input dataframes are cached before timing execution to + ensure timing measurements only capture wrangler's `fit_transform`. + This may cause problems if the size of input dataframes exceeds + available memory. 
+ + """ + + # cache input dataframes + dfs_cached = [df.cache() for df in dfs] + + # enforce caching calling count() action + for df in dfs_cached: + df.count() + + super().profile(*dfs_cached, **kwargs) + + # clear caches + for df in dfs_cached: + df.unpersist() + + return self diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index 3fd493b..891dc80 100644 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -15,12 +15,18 @@ MemoryProfiler, PandasMemoryProfiler, PandasTimeProfiler, + SparkTimeProfiler, TimeProfiler, allocate_memory ) from pywrangler.exceptions import NotProfiledError from pywrangler.wranglers.pandas.base import PandasSingleNoFit +try: + from pywrangler.wranglers.spark.base import SparkSingleNoFit +except ImportError: + SparkSingleNoFit = None + MIB = 2 ** 20 @@ -195,7 +201,6 @@ def dummy(): def test_time_profiler_properties(): - def dummy(): pass @@ -231,15 +236,59 @@ def dummy(): def test_pandas_time_profiler_fastest(): + """Basic test for pandas time profiler ensuring fastest timing is slower + than forced sleep. + + """ sleep = 0.0001 - df_dummy = pd.DataFrame() + df_input = pd.DataFrame() class DummyWrangler(PandasSingleNoFit): def transform(self, df): time.sleep(sleep) - pass + return df + + time_profiler = PandasTimeProfiler(DummyWrangler(), 1).profile(df_input) + + assert time_profiler.fastest >= sleep + + +@pytest.mark.pyspark +def test_spark_time_profiler_fastest(spark): + """Basic test for spark time profiler ensuring fastest timing is slower + than forced sleep. + + """ + + sleep = 0.0001 + df_input = spark.range(10).toDF("col") + + class DummyWrangler(SparkSingleNoFit): + def transform(self, df): + time.sleep(sleep) + return df + + time_profiler = SparkTimeProfiler(DummyWrangler(), 1).profile(df_input) + + assert time_profiler.fastest >= sleep + + +@pytest.mark.pyspark +def test_spark_time_profiler_no_caching(spark): + """Pyspark input dataframes are cached during time profiling. 
Ensure input + dataframes are released from caching after profiling. + + """ + + sleep = 0.0001 + df_input = spark.range(10).toDF("col") + + class DummyWrangler(SparkSingleNoFit): + def transform(self, df): + time.sleep(sleep) + return df - time_profiler = PandasTimeProfiler(DummyWrangler(), 1).profile(df_dummy) + SparkTimeProfiler(DummyWrangler(), 1).profile(df_input) - time_profiler.fastest >= sleep + assert df_input.is_cached is False From 9576941b0f78d9645d48fe2b31e779d02c26b2c1 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Wed, 3 Apr 2019 22:54:58 +0200 Subject: [PATCH 18/48] Add `DaskTimeProfiler`. --- src/pywrangler/benchmark.py | 92 +++++++++++++++++++++++++++++++++++-- tests/test_benchmark.py | 28 +++++++++++ 2 files changed, 115 insertions(+), 5 deletions(-) diff --git a/src/pywrangler/benchmark.py b/src/pywrangler/benchmark.py index b823234..3d596bc 100644 --- a/src/pywrangler/benchmark.py +++ b/src/pywrangler/benchmark.py @@ -174,6 +174,9 @@ class MemoryProfiler(BaseProfiler): Callable object to be memory profiled. repetitions: int, optional Number of repetitions. + interval: float, optional + Defines interval duration between consecutive memory usage + measurements in seconds. 
Notes ----- @@ -183,9 +186,10 @@ class MemoryProfiler(BaseProfiler): """ - def __init__(self, func, repetitions=5): + def __init__(self, func, repetitions: int = 5, interval: float = 0.01): self._func = func self._repetitions = repetitions + self._interval = interval self._max_usages = None self._baselines = None @@ -212,10 +216,12 @@ def profile(self, *args, **kwargs): while counter < self._repetitions: gc.collect() baseline = memory_usage()[0] - max_usage = memory_usage(mem_args, max_usage=True)[0] + max_usage = memory_usage(mem_args, + interval=self._interval, + max_usage=True) baselines.append(self._mb_to_bytes(baseline)) - max_usages.append(self._mb_to_bytes(max_usage)) + max_usages.append(self._mb_to_bytes(max_usage[0])) counter += 1 self._max_usages = max_usages @@ -487,6 +493,9 @@ class PandasMemoryProfiler(MemoryProfiler): The wrangler instance to be profiled. repetitions: int The number of measurements for memory profiling. + interval: float, optional + Defines interval duration between consecutive memory usage + measurements in seconds. Attributes ---------- @@ -503,13 +512,14 @@ class PandasMemoryProfiler(MemoryProfiler): """ - def __init__(self, wrangler: BaseWrangler, repetitions: int = 5): + def __init__(self, wrangler: BaseWrangler, repetitions: int = 5, + interval: float = 0.01): self._wrangler = wrangler self._usage_input = None self._usage_output = None - super().__init__(wrangler.fit_transform, repetitions) + super().__init__(wrangler.fit_transform, repetitions, interval) def profile(self, *dfs: pd.DataFrame, **kwargs): """Profiles the actual memory usage given input dataframes `dfs` @@ -695,5 +705,77 @@ def profile(self, *dfs: SparkDataFrame, **kwargs): # clear caches for df in dfs_cached: df.unpersist() + del df + + del dfs_cached + + return self + + +class DaskTimeProfiler(TimeProfiler): + """Approximate time that a dask wrangler instance requires to execute the + `fit_transform` step. 
+ + Please note, input dataframes are cached before timing execution to ensure + timing measurements only capture wrangler's `fit_transform`. This may cause + problems if the size of input dataframes exceeds available memory. + + Parameters + ---------- + wrangler: pywrangler.wranglers.base.BaseWrangler + The wrangler instance to be profiled. + repetitions: None, int, optional + Number of repetitions. If `None`, `timeit.Timer.autorange` will + determine a sensible default. + + Attributes + ---------- + timings: list + The timing measurements in seconds. + median: float + The median of the timing measurements in seconds. + standard_deviation: float + The standard deviation of the timing measurements in seconds. + fastast: float + The fastest value of the timing measurements in seconds. + repetitions: int + The number of measurements. + + """ + + def __init__(self, wrangler: BaseWrangler, + repetitions: Union[None, int] = None): + self._wrangler = wrangler + + def wrapper(*args, **kwargs): + """Wrapper function to call `compute()` to enforce computation. + + """ + + wrangler.fit_transform(*args, **kwargs).compute() + + super().__init__(wrapper, repetitions) + + def profile(self, *dfs: DaskDataFrame, **kwargs): + """Profiles timing given input dataframes `dfs` which are passed to + `fit_transform`. + + Please note, input dataframes are cached before timing execution to + ensure timing measurements only capture wrangler's `fit_transform`. + This may cause problems if the size of input dataframes exceeds + available memory. 
+ + """ + + # cache input dataframes + dfs_cached = [df.persist() for df in dfs] + + super().profile(*dfs_cached, **kwargs) + + # clear caches + for df in dfs_cached: + del df + + del dfs_cached return self diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index 891dc80..e73434c 100644 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -12,6 +12,7 @@ from pywrangler.benchmark import ( BaseProfiler, + DaskTimeProfiler, MemoryProfiler, PandasMemoryProfiler, PandasTimeProfiler, @@ -27,6 +28,11 @@ except ImportError: SparkSingleNoFit = None +try: + from pywrangler.wranglers.dask.base import DaskSingleNoFit +except ImportError: + DaskSingleNoFit = None + MIB = 2 ** 20 @@ -292,3 +298,25 @@ def transform(self, df): SparkTimeProfiler(DummyWrangler(), 1).profile(df_input) assert df_input.is_cached is False + + +@pytest.mark.dask +def test_dask_time_profiler_fastest(spark): + """Basic test for dask time profiler ensuring fastest timing is slower + than forced sleep. + + """ + + from dask import dataframe as dd + + sleep = 0.0001 + df_input = dd.from_pandas(pd.DataFrame(np.random.rand(10, 10)), 2) + + class DummyWrangler(DaskSingleNoFit): + def transform(self, df): + time.sleep(sleep) + return df + + time_profiler = DaskTimeProfiler(DummyWrangler(), 1).profile(df_input) + + assert time_profiler.fastest >= sleep From 4c1bb91a018d050628c8dd442ca9b22c79cb3ebc Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Wed, 3 Apr 2019 22:55:21 +0200 Subject: [PATCH 19/48] Add dask setup to tox. 
--- tox.ini | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tox.ini b/tox.ini index 0c8d8a6..8779c07 100644 --- a/tox.ini +++ b/tox.ini @@ -2,6 +2,7 @@ envlist = {py35,py36,py37}-pandas{0190,0191,0192,0200,0201,0202,0203,0210,0211,0220,0230,0231,0232,0233,0234,0240,0241} {py35,py36,py37}-pyspark{231,240} + {py35,py36,py37}-dask{115} flake8 skip_missing_interpreters = True @@ -34,6 +35,8 @@ deps = pyspark240: pyspark==2.4.0 pyspark231: pyspark==2.3.1 + dask115: dask[dataframe]==1.1.5 + setenv = PYWRANGLER_TEST_ENV = {envname} From ff9666357bb766afd25646f813b11f3918584365 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Wed, 3 Apr 2019 22:56:10 +0200 Subject: [PATCH 20/48] Add dask for TravisCI. --- .travis.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.travis.yml b/.travis.yml index 8ff1d1b..79ad3cd 100644 --- a/.travis.yml +++ b/.travis.yml @@ -34,6 +34,8 @@ env: - ENV_STRING=pyspark2.4.0 - ENV_STRING=pyspark2.3.1 + - ENV_STRING=dask1.1.5 + # Remove python/pandas version interactions which do not have wheels on pypi matrix: From a725f584f992d4e95b680e09a9fd34fb27795ddf Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Sat, 20 Apr 2019 15:08:20 +0200 Subject: [PATCH 21/48] Add `pretty_time_duration`. Rename `sizeof` to `pretty_file_size`. 
--- src/pywrangler/util/_pprint.py | 54 +++++++++++++++++++++++++++++----- tests/util/test_pprint.py | 28 ++++++++++++++---- 2 files changed, 69 insertions(+), 13 deletions(-) diff --git a/src/pywrangler/util/_pprint.py b/src/pywrangler/util/_pprint.py index d602168..70f282d 100644 --- a/src/pywrangler/util/_pprint.py +++ b/src/pywrangler/util/_pprint.py @@ -62,7 +62,7 @@ def header(name: str, indent: int = 0, underline: str = "-") -> str: _indent = " " * indent _header = _indent + name - _underline = _indent + underline*len(name) + "\n" + _underline = _indent + underline * len(name) + "\n" return _join([_header, _underline]) @@ -111,8 +111,8 @@ def enumeration(values: ENUM, indent: int = 0, bullet_char: str = "-", return _join(indented) -def sizeof(size: float, precision: int = 2, align: str = ">", - width=None) -> str: +def pretty_file_size(size: float, precision: int = 2, align: str = ">", + width: int = 0) -> str: """Helper function to format size in human readable format. Parameters @@ -138,10 +138,6 @@ def sizeof(size: float, precision: int = 2, align: str = ">", """ # noqa: E501 template = "{size:{align}{width}.{precision}f} {unit}B" - - if width is None: - width = precision + 5 - kwargs = dict(width=width, precision=precision, align=align) # iterate units (multiples of 1024 bytes) @@ -152,3 +148,47 @@ def sizeof(size: float, precision: int = 2, align: str = ">", else: return template.format(size=size, unit='Yi', **kwargs) + + +def pretty_time_duration(seconds: float, precision: int = 1, align: str = ">", + width: int = 0) -> str: + """Helper function to format time duration in human readable format. + + Parameters + ---------- + seconds: float + The size in seconds to be converted into human readable format. + precision: int, optional + Define shown precision. + align: {'<', '^', '>'}, optional + Format align specifier. + width: int + Define maximum width for number. + + Returns + ------- + human_fmt: str + Human readable representation of given `seconds`. 
+ + """ + + template = "{time_delta:{align}{width}.{precision}f} {unit}" + + units = [('year', 60 * 60 * 24 * 365), + ('month', 60 * 60 * 24 * 30), + ('d', 60 * 60 * 24), + ('h', 60 * 60), + ('min', 60), + ('s', 1), + ('ms', 1e-3), + ('µs', 1e-6), + ('ns', 1e-9)] + + for unit_name, unit_seconds in units: + if seconds > unit_seconds: + time_delta = seconds / unit_seconds + return template.format(time_delta=time_delta, + align=align, + width=width, + precision=precision, + unit=unit_name) diff --git a/tests/util/test_pprint.py b/tests/util/test_pprint.py index 35a71b0..82f5ef6 100644 --- a/tests/util/test_pprint.py +++ b/tests/util/test_pprint.py @@ -82,9 +82,25 @@ def test_enumeration_list_with_bullet(): assert _pprint.enumeration(test_input, bullet_char="o") == test_output -def test_sizeof(): - assert _pprint.sizeof(1024, precision=1, width=0) == '1.0 KiB' - assert _pprint.sizeof(1024, precision=1) == ' 1.0 KiB' - assert _pprint.sizeof(1024, precision=1, align="<") == '1.0 KiB' - assert _pprint.sizeof(1024 ** 2, precision=1, width=0) == '1.0 MiB' - assert _pprint.sizeof(1024 ** 8, precision=2, width=0) == '1.00 YiB' +def test_pretty_file_size(): + pfs = _pprint.pretty_file_size + + assert pfs(1024, precision=1, width=4) == ' 1.0 KiB' + assert pfs(1024, precision=1, width=4, align="<") == '1.0 KiB' + assert pfs(1024, precision=1) == '1.0 KiB' + assert pfs(1024 ** 2, precision=1, width=0) == '1.0 MiB' + assert pfs(1024 ** 8, precision=2, width=0) == '1.00 YiB' + + +def test_pretty_time_duration(): + ptd = _pprint.pretty_time_duration + + assert ptd(1.1) == "1.1 s" + assert ptd(1.59, width=5) == " 1.6 s" + assert ptd(1.55, width=7, precision=2) == " 1.55 s" + assert ptd(1.55, width=7, precision=2, align="<") == "1.55 s" + assert ptd(120, precision=2) == "2.00 min" + assert ptd(5400, precision=1) == "1.5 h" + assert ptd(0.5, precision=1) == "500.0 ms" + assert ptd(0.0005, precision=1) == "500.0 µs" + assert ptd(0.0000005, precision=1) == "500.0 ns" From 
a8381611d7a34ef6334b31413fd3c209c9ca14fe Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Fri, 26 Apr 2019 22:22:47 +0200 Subject: [PATCH 22/48] Move spark benchmark utility into spark subpackage. --- src/pywrangler/wranglers/spark/benchmark.py | 96 +++++++++++++++++++++ tests/wranglers/spark/test_benchmark.py | 52 +++++++++++ 2 files changed, 148 insertions(+) create mode 100644 src/pywrangler/wranglers/spark/benchmark.py create mode 100644 tests/wranglers/spark/test_benchmark.py diff --git a/src/pywrangler/wranglers/spark/benchmark.py b/src/pywrangler/wranglers/spark/benchmark.py new file mode 100644 index 0000000..ef5cbf2 --- /dev/null +++ b/src/pywrangler/wranglers/spark/benchmark.py @@ -0,0 +1,96 @@ +"""This module contains benchmarking utility for pandas wranglers. + +""" + +from typing import Union + +from pyspark.sql import DataFrame + +from pywrangler.benchmark import TimeProfiler +from pywrangler.wranglers.spark.base import SparkWrangler + + +class SparkTimeProfiler(TimeProfiler): + """Approximate time that a spark wrangler instance requires to execute the + `fit_transform` step. + + Please note, input dataframes are cached before timing execution to ensure + timing measurements only capture wrangler's `fit_transform`. This may cause + problems if the size of input dataframes exceeds available memory. + + Parameters + ---------- + wrangler: pywrangler.wranglers.base.BaseWrangler + The wrangler instance to be profiled. + repetitions: None, int, optional + Number of repetitions. If `None`, `timeit.Timer.autorange` will + determine a sensible default. + + Attributes + ---------- + measurements: list + The actual profiling measurements in seconds. + best: float + The best measurement in seconds. + median: float + The median of measurements in seconds. + worst: float + The worst measurement in seconds. + std: float + The standard deviation of measurements in seconds. + runs: int + The number of measurements. 
+ + Methods + ------- + profile + Contains the actual profiling implementation. + report + Print simple report consisting of best, median, worst, standard + deviation and the number of measurements. + profile_report + Calls profile and report in sequence. + + """ + + def __init__(self, wrangler: SparkWrangler, + repetitions: Union[None, int] = None): + self._wrangler = wrangler + + def wrapper(*args, **kwargs): + """Wrapper function to call `count()` to enforce computation. + + """ + + wrangler.fit_transform(*args, **kwargs).count() + + super().__init__(wrapper, repetitions) + + def profile(self, *dfs: DataFrame, **kwargs): + """Profiles timing given input dataframes `dfs` which are passed to + `fit_transform`. + + Please note, input dataframes are cached before timing execution to + ensure timing measurements only capture wrangler's `fit_transform`. + This may cause problems if the size of input dataframes exceeds + available memory. + + """ + + # cache input dataframes + dfs_cached = [df.cache() for df in dfs] + + # enforce caching calling count() action + for df in dfs_cached: + df.count() + + super().profile(*dfs_cached, **kwargs) + + # clear caches + for df in dfs_cached: + df.unpersist() + del df + + del dfs_cached + + return self diff --git a/tests/wranglers/spark/test_benchmark.py b/tests/wranglers/spark/test_benchmark.py new file mode 100644 index 0000000..c353448 --- /dev/null +++ b/tests/wranglers/spark/test_benchmark.py @@ -0,0 +1,52 @@ +"""This module contains tests for spark benchmarks. 
+ +isort:skip_file +""" + +import time + +import pytest + +pytestmark = pytest.mark.pyspark # noqa: E402 +pyspark = pytest.importorskip("pyspark") # noqa: E402 + +from pywrangler.wranglers.spark.base import SparkSingleNoFit +from pywrangler.wranglers.spark.benchmark import SparkTimeProfiler + +SLEEP = 0.0001 + + +@pytest.fixture +def wrangler_sleeps(): + class DummyWrangler(SparkSingleNoFit): + def transform(self, df): + time.sleep(SLEEP) + return df + + return DummyWrangler + + +def test_spark_time_profiler_fastest(spark, wrangler_sleeps): + """Basic test for spark time profiler ensuring fastest timing is slower + than forced sleep. + + """ + + df_input = spark.range(10).toDF("col") + + time_profiler = SparkTimeProfiler(wrangler_sleeps(), 1).profile(df_input) + + assert time_profiler.best >= SLEEP + + +def test_spark_time_profiler_no_caching(spark, wrangler_sleeps): + """Pyspark input dataframes are cached during time profiling. Ensure input + dataframes are released from caching after profiling. + + """ + + df_input = spark.range(10).toDF("col") + + SparkTimeProfiler(wrangler_sleeps(), 1).profile(df_input) + + assert df_input.is_cached is False From 3e6b6eede1cb6e2481114b24b2966872cffdfa92 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Fri, 26 Apr 2019 22:24:30 +0200 Subject: [PATCH 23/48] Reuse `get_param_names` helper function. 
--- src/pywrangler/wranglers/base.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/pywrangler/wranglers/base.py b/src/pywrangler/wranglers/base.py index 94b02a4..0ee47df 100644 --- a/src/pywrangler/wranglers/base.py +++ b/src/pywrangler/wranglers/base.py @@ -3,9 +3,8 @@ """ -import inspect - from pywrangler.util import _pprint +from pywrangler.util.helper import get_param_names class BaseWrangler: @@ -60,11 +59,7 @@ def get_params(self) -> dict: """ - init = self.__class__.__init__ - signature = inspect.signature(init) - parameters = signature.parameters.values() - - param_names = [x.name for x in parameters if x.name != "self"] + param_names = get_param_names(self.__class__.__init__, ["self"]) param_dict = {x: getattr(self, x) for x in param_names} return param_dict From 63723e1e02b953e283de693a90689bb1131e9acf Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Fri, 26 Apr 2019 22:28:10 +0200 Subject: [PATCH 24/48] Refactor `BaseProfiler`, `MemoryProfiler` and `TimeProfiler` to use common interface for measurements. Remove wrangler specific benchmark functions. 
--- src/pywrangler/benchmark.py | 781 ++++++++++++------------------------ tests/test_benchmark.py | 325 +++++---------- 2 files changed, 372 insertions(+), 734 deletions(-) diff --git a/src/pywrangler/benchmark.py b/src/pywrangler/benchmark.py index 3d596bc..7111571 100644 --- a/src/pywrangler/benchmark.py +++ b/src/pywrangler/benchmark.py @@ -3,29 +3,20 @@ """ import gc -import inspect import sys import timeit -from typing import Any, Callable, Iterable, List, Union +from typing import Callable, Iterable, List, Union import numpy as np -import pandas as pd from pywrangler.exceptions import NotProfiledError -from pywrangler.util import sanitizer -from pywrangler.util._pprint import enumeration, header, sizeof -from pywrangler.util.helper import cached_property -from pywrangler.wranglers.base import BaseWrangler - -try: - from pyspark.sql import DataFrame as SparkDataFrame -except ImportError: - SparkDataFrame = Any - -try: - from dask.dataframe import DataFrame as DaskDataFrame -except ImportError: - DaskDataFrame = Any +from pywrangler.util._pprint import ( + enumeration, + header, + pretty_file_size, + pretty_time_duration +) +from pywrangler.util.helper import get_param_names def allocate_memory(size: float) -> np.ndarray: @@ -57,51 +48,128 @@ def allocate_memory(size: float) -> np.ndarray: class BaseProfiler: - """Base class defining interface and common helper methods for memory and - time profiler. + """Base class defining the interface for all profilers. + + Subclasses have to implement `profile` (the actual profiling + implementation) and `less_is_better` (defining the ranking of profiling + measurements). + + Attributes + ---------- + measurements: list + The actual profiling measurements. + best: float + The best measurement. + median: float + The median of measurements. + worst: float + The worst measurement. + std: float + The standard deviation of measurements. + runs: int + The number of measurements. 
- By convention, the profiled object should always be the first argument - (ignoring self) passed to `__init__`. - All public profiling metrics have to should be defined as properties. All - private attributes need to start with an underscore. + Methods + ------- + profile + Contains the actual profiling implementation. + report + Print simple report consisting of best, median, worst, standard + deviation and the number of measurements. + profile_report + Calls profile and report in sequence. """ - def profile(self, *args, **kwargs): - """Contains the actual profiling implementation and should always - return self. + @property + def measurements(self) -> List[float]: + """Return measurements of profiling. + + """ + + self._check_is_profiled(["_measurements"]) + + return self._measurements + + @property + def best(self) -> float: + """Returns the best measurement. + + """ + + if self.less_is_better: + return np.min(self.measurements) + else: + return np.max(self.measurements) + + @property + def median(self) -> float: + """Returns the median of measurements. + + """ + + return np.median(self.measurements) + + @property + def worst(self) -> float: + """Returns the worst measurement. + + """ + + if self.less_is_better: + return np.max(self.measurements) + else: + return np.min(self.measurements) + + @property + def std(self) -> float: + """Returns the standard deviation of measurements. + + """ + + return np.std(self.measurements) + + @property + def runs(self) -> int: + """Return number of measurements. + + """ + + return len(self.measurements) + + @property + def less_is_better(self) -> bool: + """Defines ranking of measurements. """ raise NotImplementedError - def report(self): - """Print simple report consisting of the name of the profiler class, - the name of the profiled object, and all defined metrics/properties. + def profile(self, *args, **kwargs): + """Contains the actual profiling implementation and has to set + `self._measurements`. Always returns self. 
""" - # get name of profiler - profiler_name = self.__class__.__name__ + raise NotImplementedError - # get name of profiled object - parameters = inspect.signature(self.__init__).parameters.keys() - profiled_object = getattr(self, '_{}'.format(list(parameters)[0])) + def report(self): + """Print simple report consisting of best, median, worst, standard + deviation and the number of measurements. + + """ - try: - profiled_obj_name = profiled_object.__name__ - except AttributeError: - profiled_obj_name = profiled_object.__class__.__name__ + tpl = "{best} {sign} {median} {sign} {worst} ± {std} ({runs} runs)" - # get relevant metrics - ignore = ('profile', 'report', 'profile_report') - metric_names = [x for x in dir(self) - if not x.startswith('_') - and x not in ignore] - metric_values = {x: getattr(self, x) for x in metric_names} + fmt = self._pretty_formatter + values = {"best": fmt(self.best), + "median": fmt(self.median), + "worst": fmt(self.worst), + "std": fmt(self.std), + "runs": self.runs, + "sign": "<" if self.less_is_better else ">"} - print(header('{}: {}'.format(profiler_name, profiled_obj_name)), '\n', - enumeration(metric_values), sep='') + print(tpl.format(**values)) def profile_report(self, *args, **kwargs): """Calls profile and report in sequence. @@ -110,6 +178,25 @@ def profile_report(self, *args, **kwargs): self.profile(*args, **kwargs).report() + def _pretty_formatter(self, value: float) -> str: + """String formatter for human readable output of given input `value`. + Should be replaced with sensible formatters for file size or time + duration. + + Parameters + ---------- + value: float + Numeric value to be formatted. + + Returns + ------- + pretty_string: str + Human readable representation of `value`. + + """ + + return str(value) + def _check_is_profiled(self, attributes: Iterable[str]) -> None: """Check if `profile` was already called by ensuring passed attributes are not `None`. 
@@ -133,30 +220,26 @@ def _check_is_profiled(self, attributes: Iterable[str]) -> None: """ - if any([getattr(self, x) is None for x in attributes]): + if any([getattr(self, x, None) is None for x in attributes]): msg = ("This {}'s instance is not profiled yet. Call 'profile' " "with appropriate arguments before using this method." .format(self.__class__.__name__)) raise NotProfiledError(msg) - @staticmethod - def _mb_to_bytes(size_mib: float) -> int: - """Helper method to convert MiB to Bytes. + def __repr__(self): + """Print representation of profiler instance. - Parameters - ---------- - size_mib: float - Size in MiB + """ - Returns - ------- - size_bytes: int - Size in bytes. + # get name of profiler + profiler_name = self.__class__.__name__ - """ + # get parameter names + param_names = get_param_names(self.__class__.__init__, ["self"]) + param_dict = {x: getattr(self, x) for x in param_names} - return int(size_mib * (2 ** 20)) + return header(profiler_name) + enumeration(param_dict) class MemoryProfiler(BaseProfiler): @@ -165,6 +248,8 @@ class MemoryProfiler(BaseProfiler): usage during function execution and the baseline memory usage before function execution. + Note, memory consumption of child processes are included. + In addition, compute the mean increase in baseline memory usage between repetitions which might indicate memory leakage. @@ -178,6 +263,33 @@ class MemoryProfiler(BaseProfiler): Defines interval duration between consecutive memory usage measurements in seconds. + Attributes + ---------- + measurements: list + The actual profiling measurements in bytes. + best: float + The best measurement in bytes. + median: float + The median of measurements in bytes. + worst: float + The worst measurement in bytes. + std: float + The standard deviation of measurements in bytes. + runs: int + The number of measurements. + baseline_change: float + The median change in baseline memory usage across all runs in bytes. 
+ + Methods + ------- + profile + Contains the actual profiling implementation. + report + Print simple report consisting of best, median, worst, standard + deviation and the number of measurements. + profile_report + Calls profile and report in sequence. + Notes ----- The implementation is based on `memory_profiler` and is inspired by the @@ -186,13 +298,11 @@ class MemoryProfiler(BaseProfiler): """ - def __init__(self, func, repetitions: int = 5, interval: float = 0.01): - self._func = func - self._repetitions = repetitions - self._interval = interval - - self._max_usages = None - self._baselines = None + def __init__(self, func: Callable, repetitions: int = 5, + interval: float = 0.01): + self.func = func + self.repetitions = repetitions + self.interval = interval def profile(self, *args, **kwargs): """Executes the actual memory profiling. @@ -211,14 +321,16 @@ def profile(self, *args, **kwargs): counter = 0 baselines = [] max_usages = [] - mem_args = (self._func, args, kwargs) - while counter < self._repetitions: + func_args = (self.func, args, kwargs) + mem_args = dict(interval=self.interval, + multiprocess=True, + max_usage=True) + + while counter < self.repetitions: gc.collect() - baseline = memory_usage()[0] - max_usage = memory_usage(mem_args, - interval=self._interval, - max_usage=True) + baseline = memory_usage(**mem_args) + max_usage = memory_usage(func_args, **mem_args) baselines.append(self._mb_to_bytes(baseline)) max_usages.append(self._mb_to_bytes(max_usage[0])) @@ -226,12 +338,21 @@ def profile(self, *args, **kwargs): self._max_usages = max_usages self._baselines = baselines + self._measurements = np.subtract(max_usages, baselines).tolist() return self + @property + def less_is_better(self) -> bool: + """Less memory consumption is better. + + """ + + return True + @property def max_usages(self) -> List[int]: - """Returns the absolute, maximum memory usages for each iteration in + """Returns the absolute, maximum memory usages for each run in bytes. 
""" @@ -242,7 +363,7 @@ def max_usages(self) -> List[int]: @property def baselines(self) -> List[int]: - """Returns the absolute, baseline memory usages for each iteration in + """Returns the absolute, baseline memory usages for each run in bytes. The baseline memory usage is defined as the memory usage before function execution. @@ -253,43 +374,49 @@ def baselines(self) -> List[int]: return self._baselines @property - def increases(self) -> List[int]: - """Returns the absolute memory increase for each iteration in bytes. - The memory increase is defined as the difference between the maximum - memory usage during function execution and the baseline memory usage + def baseline_change(self) -> float: + """Returns the median change in baseline memory usage across all + run. The baseline memory usage is defined as the memory usage before function execution. - """ - return np.subtract(self.max_usages, self.baselines).tolist() + changes = np.diff(self.baselines) + return float(np.median(changes)) - @property - def increases_mean(self) -> float: - """Returns the mean of the absolute memory increases across all - iterations. + def _pretty_formatter(self, value: float) -> str: + """String formatter for human readable output of given input `value`. + + Parameters + ---------- + value: float + Numeric value to be formatted. + + Returns + ------- + pretty_string: str + Human readable representation of `value`. """ - return float(np.mean(self.increases)) + return pretty_file_size(value) - @property - def increases_std(self) -> float: - """Returns the standard variation of the absolute memory increases - across all iterations. + @staticmethod + def _mb_to_bytes(size_mib: float) -> int: + """Helper method to convert MiB to Bytes. - """ + Parameters + ---------- + size_mib: float + Size in MiB - return float(np.std(self.increases)) + Returns + ------- + size_bytes: int + Size in bytes. 
- @property - def baseline_change(self) -> float: - """Returns the mean change in baseline memory usage across all - all iterations. The baseline memory usage is defined as the memory - usage before function execution. """ - changes = np.diff(self.baselines) - return float(np.mean(changes)) + return int(size_mib * (2 ** 20)) class TimeProfiler(BaseProfiler): @@ -307,17 +434,29 @@ class TimeProfiler(BaseProfiler): Attributes ---------- - timings: list - The timing measurements in seconds. + measurements: list + The actual profiling measurements in seconds. + best: float + The best measurement in seconds. median: float - The median of the timing measurements in seconds. - standard_deviation: float - The standard deviation of the timing measurements in seconds. - fastast: float - The fastest value of the timing measurements in seconds. - repetitions: int + The median of measurements in seconds. + worst: float + The worst measurement in seconds. + std: float + The standard deviation of measurements in seconds. + runs: int The number of measurements. + Methods + ------- + profile + Contains the actual profiling implementation. + report + Print simple report consisting of best, median, worst, standard + deviation and the number of measurements. + profile_report + Calls profile and report in sequence. + Notes ----- The implementation is based on standard library's `timeit` module. @@ -325,13 +464,8 @@ class TimeProfiler(BaseProfiler): """ def __init__(self, func: Callable, repetitions: Union[None, int] = None): - self._func = func - self._repetitions = repetitions - - self._timings = None - self._timings_mean = None - self._timings_std = None - self._fastest = None + self.func = func + self.repetitions = repetitions def profile(self, *args, **kwargs): """Executes the actual time profiling. 
@@ -351,431 +485,40 @@ def wrapper(): """ - self._func(*args, **kwargs) + self.func(*args, **kwargs) timer = timeit.Timer(stmt=wrapper) - if self._repetitions is None: + if self.repetitions is None: repeat, _ = timer.autorange(None) else: - repeat = self._repetitions + repeat = self.repetitions - self._timings = timer.repeat(number=1, repeat=repeat) + self._measurements = timer.repeat(number=1, repeat=repeat) return self @property - def timings(self) -> List[float]: - """Returns the timeit measurements in seconds. + def less_is_better(self) -> bool: + """Less time required is better. """ - return self._timings + return True - @property - def median(self) -> float: - """Returns the median of all timeit measurements in seconds. - - """ - - self._check_is_profiled(['_timings']) - - return float(np.median(self._timings)) - - @property - def standard_deviation(self) -> float: - """Returns the standard deviation of all timeit measurements in - seconds. - - """ - - self._check_is_profiled(['_timings']) - - return float(np.std(self._timings)) - - @property - def fastest(self) -> float: - """Returns the fastest timing measurement in seconds. - - """ - - self._check_is_profiled(['_timings']) - - return min(self._timings) - - @property - def repetitions(self) -> int: - """Returns the number of measurements. - - """ - - return len(self._timings) - - def report(self): - """Profile time via `profile` and provide human readable report. - - Returns - ------- - None. Prints report to stdout. 
- - """ - - enum_kwargs = dict(align_width=15, bullet_char="") - - # string part for header - wrangler_name = self._wrangler.__class__.__name__ - str_header = header("{} - time profiling".format(wrangler_name)) - - # string part for values - dict_values = {"Fastest": "{:.2f}s".format(self.fastest), - "Median": "{:.2f}s".format(self.median), - "Std": "{:.2f}s".format(self.standard_deviation), - "Repetitions": self.repetitions} - - str_values = enumeration(dict_values, **enum_kwargs) - - # build complete string and print - template = "{}\n{}\n" - report_string = template.format(str_header, str_values) - - print(report_string) - - -class PandasTimeProfiler(TimeProfiler): - """Approximate time that a pandas wrangler instance requires to execute the - `fit_transform` step. - - Parameters - ---------- - wrangler: pywrangler.wranglers.base.BaseWrangler - The wrangler instance to be profiled. - repetitions: None, int, optional - Number of repetitions. If `None`, `timeit.Timer.autorange` will - determine a sensible default. - - Attributes - ---------- - timings: list - The timing measurements in seconds. - median: float - The median of the timing measurements in seconds. - standard_deviation: float - The standard deviation of the timing measurements in seconds. - fastast: float - The fastest value of the timing measurements in seconds. - repetitions: int - The number of measurements. - - """ - - def __init__(self, wrangler: BaseWrangler, - repetitions: Union[None, int] = None): - self._wrangler = wrangler - super().__init__(wrangler.fit_transform, repetitions) - - -class PandasMemoryProfiler(MemoryProfiler): - """Approximate memory usage that a pandas wrangler instance requires to - execute the `fit_transform` step. - - As a key metric, `ratio` is computed. It refers to the amount of - memory which is required to execute the `fit_transform` step. 
More - concretely, it estimates how much more memory is used standardized by the - input memory usage (memory usage increase during function execution divided - by memory usage of input dataframes). In other words, if you have a 1GB - input dataframe, and the `usage_ratio` is 5, `fit_transform` needs 5GB free - memory available to succeed. A `usage_ratio` of 0.5 given a 2GB input - dataframe would require 1GB free memory available for computation. - - Parameters - ---------- - wrangler: pywrangler.wranglers.pandas.base.PandasWrangler - The wrangler instance to be profiled. - repetitions: int - The number of measurements for memory profiling. - interval: float, optional - Defines interval duration between consecutive memory usage - measurements in seconds. - - Attributes - ---------- - increases_mean: float - The mean of the absolute memory increases across all iterations in - bytes. - input: int - Memory usage of input dataframes in bytes. - output: int - Memory usage of output dataframes in bytes. - ratio: float - The amount of memory required for computation in units of input - memory usage. - - """ - - def __init__(self, wrangler: BaseWrangler, repetitions: int = 5, - interval: float = 0.01): - self._wrangler = wrangler - - self._usage_input = None - self._usage_output = None - - super().__init__(wrangler.fit_transform, repetitions, interval) - - def profile(self, *dfs: pd.DataFrame, **kwargs): - """Profiles the actual memory usage given input dataframes `dfs` - which are passed to `fit_transform`. - - """ - - # usage input - self._usage_input = self._memory_usage_dfs(*dfs) - - # usage output - dfs_output = self._wrangler.fit_transform(*dfs) - dfs_output = sanitizer.ensure_tuple(dfs_output) - self._usage_output = self._memory_usage_dfs(*dfs_output) - - # usage during fit_transform - super().profile(*dfs, **kwargs) - - return self - - @property - def input(self) -> float: - """Returns the memory usage of the input dataframes in bytes. 
- - """ - - self._check_is_profiled(['_usage_input']) - return self._usage_input - - @property - def output(self) -> float: - """Returns the memory usage of the output dataframes in bytes. - - """ - - self._check_is_profiled(['_usage_output']) - return self._usage_output - - @cached_property - def ratio(self) -> float: - """Refers to the amount of memory which is required to execute the - `fit_transform` step. More concretely, it estimates how much more - memory is used standardized by the input memory usage (memory usage - increase during function execution divided by memory usage of input - dataframes). In other words, if you have a 1GB input dataframe, and the - `usage_ratio` is 5, `fit_transform` needs 5GB free memory available to - succeed. A `usage_ratio` of 0.5 given a 2GB input dataframe would - require 1GB free memory available for computation. - - """ - - return self.increases_mean / self.input - - def report(self): - """Profile memory usage via `profile` and provide human readable - report including memory usage of input and output dataframes, memory - usage during `fit_transform`, the usage ratio and shows if - the wrangler may have side effects in regard to memory consumption via - the change in baseline memory usage. - - Returns - ------- - None. Prints report to stdout. 
- - """ - - enum_kwargs = dict(align_width=15, bullet_char="") - - # string part for header - wrangler_name = self._wrangler.__class__.__name__ - str_header = header("{} - memory usage".format(wrangler_name)) - - # string part for input and output dfs - dict_dfs = {"Input dfs": sizeof(self.input), - "Ouput dfs": sizeof(self.output)} - - str_dfs = enumeration(dict_dfs, **enum_kwargs) - - # string part for transform/fit and ratio - str_inc = sizeof(self.increases_mean) - str_std = sizeof(self.increases_std, width=0) - str_inc += " (Std: {})".format(str_std) - str_ratio = "{:>7.2f}".format(self.ratio) - str_baseline_change = sizeof(self.baseline_change) - dict_inc = {"Fit/Transform": str_inc, - "Ratio": str_ratio, - "Baseline change": str_baseline_change} - - str_inc = enumeration(dict_inc, **enum_kwargs) - - # build complete string and print - template = "{}\n{}\n\n{}" - report_string = template.format(str_header, str_dfs, str_inc) - - print(report_string) - - @staticmethod - def _memory_usage_dfs(*dfs: pd.DataFrame) -> int: - """Return memory usage in bytes for all given dataframes. + def _pretty_formatter(self, value: float) -> str: + """String formatter for human readable output of given input `value`. Parameters ---------- - dfs: pd.DataFrame - The pandas dataframes for which memory usage should be computed. + value: float + Numeric value to be formatted. Returns ------- - memory_usage: int - The computed memory usage in bytes. - - """ - - mem_usages = [df.memory_usage(deep=True, index=True).sum() - for df in dfs] - - return int(np.sum(mem_usages)) - - -class SparkTimeProfiler(TimeProfiler): - """Approximate time that a spark wrangler instance requires to execute the - `fit_transform` step. - - Please note, input dataframes are cached before timing execution to ensure - timing measurements only capture wrangler's `fit_transform`. This may cause - problems if the size of input dataframes exceeds available memory. 
- - Parameters - ---------- - wrangler: pywrangler.wranglers.base.BaseWrangler - The wrangler instance to be profiled. - repetitions: None, int, optional - Number of repetitions. If `None`, `timeit.Timer.autorange` will - determine a sensible default. - - Attributes - ---------- - timings: list - The timing measurements in seconds. - median: float - The median of the timing measurements in seconds. - standard_deviation: float - The standard deviation of the timing measurements in seconds. - fastast: float - The fastest value of the timing measurements in seconds. - repetitions: int - The number of measurements. - - """ - - def __init__(self, wrangler: BaseWrangler, - repetitions: Union[None, int] = None): - self._wrangler = wrangler - - def wrapper(*args, **kwargs): - """Wrapper function to call `count()` to enforce computation. - - """ - - wrangler.fit_transform(*args, **kwargs).count() - - super().__init__(wrapper, repetitions) - - def profile(self, *dfs: SparkDataFrame, **kwargs): - """Profiles timing given input dataframes `dfs` which are passed to - `fit_transform`. - - Please note, input dataframes are cached before timing execution to - ensure timing measurements only capture wrangler's `fit_transform`. - This may cause problems if the size of input dataframes exceeds - available memory. + pretty_string: str + Human readable representation of `value`. """ - # cache input dataframes - dfs_cached = [df.cache() for df in dfs] - - # enforce caching calling count() action - for df in dfs_cached: - df.count() - - super().profile(*dfs_cached, **kwargs) - - # clear caches - for df in dfs_cached: - df.unpersist() - del df - - del dfs_cached - - return self - - -class DaskTimeProfiler(TimeProfiler): - """Approximate time that a dask wrangler instance requires to execute the - `fit_transform` step. - - Please note, input dataframes are cached before timing execution to ensure - timing measurements only capture wrangler's `fit_transform`. 
This may cause - problems if the size of input dataframes exceeds available memory. - - Parameters - ---------- - wrangler: pywrangler.wranglers.base.BaseWrangler - The wrangler instance to be profiled. - repetitions: None, int, optional - Number of repetitions. If `None`, `timeit.Timer.autorange` will - determine a sensible default. - - Attributes - ---------- - timings: list - The timing measurements in seconds. - median: float - The median of the timing measurements in seconds. - standard_deviation: float - The standard deviation of the timing measurements in seconds. - fastast: float - The fastest value of the timing measurements in seconds. - repetitions: int - The number of measurements. - - """ - - def __init__(self, wrangler: BaseWrangler, - repetitions: Union[None, int] = None): - self._wrangler = wrangler - - def wrapper(*args, **kwargs): - """Wrapper function to call `compute()` to enforce computation. - - """ - - wrangler.fit_transform(*args, **kwargs).compute() - - super().__init__(wrapper, repetitions) - - def profile(self, *dfs: DaskDataFrame, **kwargs): - """Profiles timing given input dataframes `dfs` which are passed to - `fit_transform`. - - Please note, input dataframes are cached before timing execution to - ensure timing measurements only capture wrangler's `fit_transform`. - This may cause problems if the size of input dataframes exceeds - available memory. 
- - """ - - # cache input dataframes - dfs_cached = [df.persist() for df in dfs] - - super().profile(*dfs_cached, **kwargs) - - # clear caches - for df in dfs_cached: - del df - - del dfs_cached - - return self + return pretty_time_duration(value) diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index e73434c..e126b24 100644 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -7,33 +7,23 @@ import pytest -import numpy as np -import pandas as pd - from pywrangler.benchmark import ( BaseProfiler, - DaskTimeProfiler, MemoryProfiler, - PandasMemoryProfiler, - PandasTimeProfiler, - SparkTimeProfiler, TimeProfiler, allocate_memory ) from pywrangler.exceptions import NotProfiledError -from pywrangler.wranglers.pandas.base import PandasSingleNoFit -try: - from pywrangler.wranglers.spark.base import SparkSingleNoFit -except ImportError: - SparkSingleNoFit = None +MIB = 2 ** 20 -try: - from pywrangler.wranglers.dask.base import DaskSingleNoFit -except ImportError: - DaskSingleNoFit = None -MIB = 2 ** 20 +@pytest.fixture() +def func_no_effect(): + def func(): + pass + + return func def test_allocate_memory_empty(): @@ -51,7 +41,7 @@ def test_allocate_memory_5mb(): def test_base_profiler_not_implemented(): base_profiler = BaseProfiler() - for will_raise in ('profile', 'profile_report'): + for will_raise in ('profile', 'profile_report', 'less_is_better'): with pytest.raises(NotImplementedError): getattr(base_profiler, will_raise)() @@ -67,169 +57,155 @@ def test_base_profiler_check_is_profiled(): base_profiler._check_is_profiled(['_is_set']) -def test_base_profiler_mb_to_bytes(): - assert BaseProfiler._mb_to_bytes(1) == 1048576 - assert BaseProfiler._mb_to_bytes(1.5) == 1572864 - assert BaseProfiler._mb_to_bytes(0.33) == 346030 - - -def test_memory_profiler_return_self(): - def dummy(): - pass - - memory_profiler = MemoryProfiler(dummy) - assert memory_profiler.profile() is memory_profiler - - -def test_memory_profiler_properties(): - def dummy(): - 
pass - - memory_profiler = MemoryProfiler(dummy) - memory_profiler._baselines = [0, 1, 2, 3] - memory_profiler._max_usages = [4, 5, 7, 8] - - assert memory_profiler.max_usages == memory_profiler._max_usages - assert memory_profiler.baselines == memory_profiler._baselines - assert memory_profiler.increases == [4, 4, 5, 5] - assert memory_profiler.increases_mean == 4.5 - assert memory_profiler.increases_std == 0.5 - assert memory_profiler.baseline_change == 1 - - -def test_memory_profiler_no_side_effect(): - def no_side_effect(): - dummy = 5 - return dummy - - assert MemoryProfiler(no_side_effect).profile().baseline_change < 0.5 * MIB - +def test_base_profiler_measurements_less_is_better(capfd): + measurements = range(7) -def test_memory_profiler_side_effect(): - side_effect_container = [] + class Profiler(BaseProfiler): - def side_effect(): - memory_holder = allocate_memory(5) - side_effect_container.append(memory_holder) + @property + def less_is_better(self): + return True - return memory_holder + def profile(self, *args, **kwargs): + self._measurements = measurements + return self - assert MemoryProfiler(side_effect).profile().baseline_change > 4.9 * MIB + def _pretty_formatter(self, value): + return "{:.0f}".format(value) + base_profiler = Profiler() + base_profiler.profile_report() -def test_memory_profiler_no_increase(): - def no_increase(): - pass + assert base_profiler.median == 3 + assert base_profiler.best == 0 + assert base_profiler.worst == 6 + assert base_profiler.std == 2 + assert base_profiler.runs == 7 + assert base_profiler.measurements == measurements - assert MemoryProfiler(no_increase).profile().increases_mean < 0.1 * MIB - assert MemoryProfiler(no_increase).profile().increases_std < 0.1 * MIB + out, _ = capfd.readouterr() + assert out == "0 < 3 < 6 ± 2 (7 runs)\n" -def test_memory_profiler_increase(): - def increase(): - memory_holder = allocate_memory(30) - return memory_holder +def test_base_profiler_measurements_more_is_better(capfd): + 
measurements = range(7) - assert MemoryProfiler(increase).profile().increases_mean > 29 * MIB + class Profiler(BaseProfiler): + @property + def less_is_better(self): + return False + def profile(self, *args, **kwargs): + self._measurements = measurements + return self -def test_pandas_memory_profiler_memory_usage_dfs(): - df1 = pd.DataFrame(np.random.rand(10)) - df2 = pd.DataFrame(np.random.rand(10)) + def _pretty_formatter(self, value): + return "{:.0f}".format(value) - test_input = [df1, df2] - test_output = int(df1.memory_usage(index=True, deep=True).sum() + - df2.memory_usage(index=True, deep=True).sum()) + base_profiler = Profiler() + base_profiler.profile_report() - assert PandasMemoryProfiler._memory_usage_dfs(*test_input) == test_output + assert base_profiler.median == 3 + assert base_profiler.best == 6 + assert base_profiler.worst == 0 + assert base_profiler.std == 2 + assert base_profiler.runs == 7 + assert base_profiler.measurements == measurements + out, _ = capfd.readouterr() + assert out == "6 > 3 > 0 ± 2 (7 runs)\n" -def test_pandas_memory_profiler_return_self(): - class DummyWrangler(PandasSingleNoFit): - def transform(self, df): - return pd.DataFrame() - memory_profiler = PandasMemoryProfiler(DummyWrangler()) +def test_memory_profiler_mb_to_bytes(): + assert MemoryProfiler._mb_to_bytes(1) == 1048576 + assert MemoryProfiler._mb_to_bytes(1.5) == 1572864 + assert MemoryProfiler._mb_to_bytes(0.33) == 346030 - assert memory_profiler is memory_profiler.profile(pd.DataFrame()) +def test_memory_profiler_return_self(func_no_effect): + memory_profiler = MemoryProfiler(func_no_effect) + assert memory_profiler.profile() is memory_profiler -def test_pandas_memory_profiler_usage_increases_mean(): - empty_df = pd.DataFrame() - class DummyWrangler(PandasSingleNoFit): - def transform(self, df): - return pd.DataFrame(allocate_memory(30)) +def test_memory_profiler_measurements(func_no_effect): + baselines = [0, 1, 2, 3] + max_usages = [4, 5, 7, 8] + measurements = 
[4, 4, 5, 5] + + memory_profiler = MemoryProfiler(func_no_effect) + memory_profiler._baselines = baselines + memory_profiler._max_usages = max_usages + memory_profiler._measurements = measurements + + assert memory_profiler.less_is_better is True + assert memory_profiler.max_usages == max_usages + assert memory_profiler.baselines == baselines + assert memory_profiler.measurements == measurements + assert memory_profiler.median == 4.5 + assert memory_profiler.std == 0.5 + assert memory_profiler.best == 4 + assert memory_profiler.worst == 5 + assert memory_profiler.baseline_change == 1 - memory_profiler = PandasMemoryProfiler(DummyWrangler()) - assert memory_profiler.profile(empty_df).increases_mean > 29 * MIB +def test_memory_profiler_no_side_effect(func_no_effect): + baseline_change = MemoryProfiler(func_no_effect).profile().baseline_change + assert baseline_change < 0.5 * MIB -def test_pandas_memory_profiler_usage_input_output(): - df_input = pd.DataFrame(np.random.rand(1000)) - df_output = pd.DataFrame(np.random.rand(10000)) - test_df_input = df_input.memory_usage(index=True, deep=True).sum() - test_df_output = df_output.memory_usage(index=True, deep=True).sum() +def test_memory_profiler_side_effect(): + side_effect_container = [] - class DummyWrangler(PandasSingleNoFit): - def transform(self, df): - return df_output + def side_effect(): + memory_holder = allocate_memory(5) + side_effect_container.append(memory_holder) - memory_profiler = PandasMemoryProfiler(DummyWrangler()).profile(df_input) + return memory_holder - assert memory_profiler.input == test_df_input - assert memory_profiler.output == test_df_output + assert MemoryProfiler(side_effect).profile().baseline_change > 4.9 * MIB -def test_pandas_memory_profiler_usage_ratio(): - usage_mib = 30 - df_input = pd.DataFrame(np.random.rand(1000000)) - usage_input = df_input.memory_usage(index=True, deep=True).sum() - test_output = ((usage_mib - 1) * MIB) / usage_input +def 
test_memory_profiler_no_increase(func_no_effect): + memory_profiler = MemoryProfiler(func_no_effect).profile() + print(memory_profiler.measurements) - class DummyWrangler(PandasSingleNoFit): - def transform(self, df): - return pd.DataFrame(allocate_memory(usage_mib)) + assert memory_profiler.median < MIB - memory_profiler = PandasMemoryProfiler(DummyWrangler()) - assert memory_profiler.profile(df_input).ratio > test_output +def test_memory_profiler_increase(): + def increase(): + memory_holder = allocate_memory(30) + return memory_holder + assert MemoryProfiler(increase).profile().median > 29 * MIB -def test_time_profiler_return_self(): - def dummy(): - pass - time_profiler = TimeProfiler(dummy, 1) +def test_time_profiler_return_self(func_no_effect): + time_profiler = TimeProfiler(func_no_effect, 1) assert time_profiler.profile() is time_profiler -def test_time_profiler_properties(): - def dummy(): - pass +def test_time_profiler_measurements(func_no_effect): + measurements = [1, 1, 3, 3] - time_profiler = TimeProfiler(dummy) - time_profiler._timings = [1, 1, 3, 3] + time_profiler = TimeProfiler(func_no_effect) + time_profiler._measurements = [1, 1, 3, 3] + assert time_profiler.less_is_better is True assert time_profiler.median == 2 - assert time_profiler.standard_deviation == 1 - assert time_profiler.fastest == 1 - assert time_profiler.repetitions == 4 - assert time_profiler.timings == time_profiler._timings - - -def test_time_profiler_repetitions(): - def dummy(): - pass + assert time_profiler.std == 1 + assert time_profiler.best == 1 + assert time_profiler.runs == 4 + assert time_profiler.measurements == measurements - time_profiler = TimeProfiler(dummy, repetitions=10).profile() +def test_time_profiler_repetitions(func_no_effect): + time_profiler = TimeProfiler(func_no_effect, repetitions=10) assert time_profiler.repetitions == 10 -def test_time_profiler_fastest(): +def test_time_profiler_best(): sleep = 0.0001 def dummy(): @@ -238,85 +214,4 @@ def dummy(): 
time_profiler = TimeProfiler(dummy, repetitions=1).profile() - assert time_profiler.fastest >= sleep - - -def test_pandas_time_profiler_fastest(): - """Basic test for pandas time profiler ensuring fastest timing is slower - than forced sleep. - - """ - - sleep = 0.0001 - df_input = pd.DataFrame() - - class DummyWrangler(PandasSingleNoFit): - def transform(self, df): - time.sleep(sleep) - return df - - time_profiler = PandasTimeProfiler(DummyWrangler(), 1).profile(df_input) - - assert time_profiler.fastest >= sleep - - -@pytest.mark.pyspark -def test_spark_time_profiler_fastest(spark): - """Basic test for spark time profiler ensuring fastest timing is slower - than forced sleep. - - """ - - sleep = 0.0001 - df_input = spark.range(10).toDF("col") - - class DummyWrangler(SparkSingleNoFit): - def transform(self, df): - time.sleep(sleep) - return df - - time_profiler = SparkTimeProfiler(DummyWrangler(), 1).profile(df_input) - - assert time_profiler.fastest >= sleep - - -@pytest.mark.pyspark -def test_spark_time_profiler_no_caching(spark): - """Pyspark input dataframes are cached during time profiling. Ensure input - dataframes are released from caching after profiling. - - """ - - sleep = 0.0001 - df_input = spark.range(10).toDF("col") - - class DummyWrangler(SparkSingleNoFit): - def transform(self, df): - time.sleep(sleep) - return df - - SparkTimeProfiler(DummyWrangler(), 1).profile(df_input) - - assert df_input.is_cached is False - - -@pytest.mark.dask -def test_dask_time_profiler_fastest(spark): - """Basic test for dask time profiler ensuring fastest timing is slower - than forced sleep. 
- - """ - - from dask import dataframe as dd - - sleep = 0.0001 - df_input = dd.from_pandas(pd.DataFrame(np.random.rand(10, 10)), 2) - - class DummyWrangler(DaskSingleNoFit): - def transform(self, df): - time.sleep(sleep) - return df - - time_profiler = DaskTimeProfiler(DummyWrangler(), 1).profile(df_input) - - assert time_profiler.fastest >= sleep + assert time_profiler.best >= sleep From 07b4d10bddbcd7ccb85029352163567bf232112e Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Fri, 26 Apr 2019 22:28:56 +0200 Subject: [PATCH 25/48] Move spark fixture into spark subpackage. --- tests/conftest.py | 20 -------------------- tests/wranglers/spark/conftest.py | 25 +++++++++++++++++++++++++ 2 files changed, 25 insertions(+), 20 deletions(-) create mode 100644 tests/wranglers/spark/conftest.py diff --git a/tests/conftest.py b/tests/conftest.py index 12bba43..5bb8255 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -53,23 +53,3 @@ def pytest_collection_modifyitems(config, items): for item in items: if skip_item in item.keywords: item.add_marker(skip) - - -@pytest.fixture(scope="session") -def spark(request): - """Provide session wide Spark Session to avoid expensive recreation for - each test. - - If pyspark is not available, skip tests. - - """ - - try: - from pyspark.sql import SparkSession - spark = SparkSession.builder.getOrCreate() - - request.addfinalizer(lambda: spark.stop()) - return spark - - except ImportError: - pytest.skip("Pyspark not available.") diff --git a/tests/wranglers/spark/conftest.py b/tests/wranglers/spark/conftest.py new file mode 100644 index 0000000..eba57af --- /dev/null +++ b/tests/wranglers/spark/conftest.py @@ -0,0 +1,25 @@ +"""pytest configuration + +""" + +import pytest + + +@pytest.fixture(scope="session") +def spark(request): + """Provide session wide Spark Session to avoid expensive recreation for + each test. + + If pyspark is not available, skip tests. 
+ + """ + + try: + from pyspark.sql import SparkSession + spark = SparkSession.builder.getOrCreate() + + request.addfinalizer(lambda: spark.stop()) + return spark + + except ImportError: + pytest.skip("Pyspark not available.") From c8da336adb0ec70cd9831a18a181d5d8dab9bbb3 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Fri, 26 Apr 2019 22:29:27 +0200 Subject: [PATCH 26/48] Move spark environment tests into spark subpackage. --- tests/{ => wranglers/spark}/test_environment.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) rename tests/{ => wranglers/spark}/test_environment.py (96%) diff --git a/tests/test_environment.py b/tests/wranglers/spark/test_environment.py similarity index 96% rename from tests/test_environment.py rename to tests/wranglers/spark/test_environment.py index 5018288..155c6d6 100644 --- a/tests/test_environment.py +++ b/tests/wranglers/spark/test_environment.py @@ -7,8 +7,9 @@ import pytest +pytestmark = pytest.mark.pyspark + -@pytest.mark.pyspark def test_java_environment(): """Pyspark requires Java to be available. It uses Py4J to start and communicate with the JVM. Py4J looks for JAVA_HOME or falls back calling @@ -29,7 +30,6 @@ def test_java_environment(): raise EnvironmentError("Java setup broken.") -@pytest.mark.pyspark def test_pyspark_import(): """Fail if pyspark can't be imported. This test is mandatory because other spark tests will be skipped if the spark session fixture fails. @@ -43,7 +43,6 @@ def test_pyspark_import(): pytest.fail("pyspark can't be imported") -@pytest.mark.pyspark def test_pyspark_pandas_interaction(spark): """Check simple interaction between pyspark and pandas. From cdfb47e7628d7341d97331b43c1d542014f77472 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Fri, 26 Apr 2019 22:30:28 +0200 Subject: [PATCH 27/48] Move `SparkWrangler` tests into spark subpackage. 
--- tests/wranglers/spark/test_base.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/wranglers/spark/test_base.py b/tests/wranglers/spark/test_base.py index 6c20d21..6b061f0 100644 --- a/tests/wranglers/spark/test_base.py +++ b/tests/wranglers/spark/test_base.py @@ -1,16 +1,16 @@ """Test spark base wrangler. +isort:skip_file """ import pytest -try: - from pywrangler.wranglers.spark.base import SparkWrangler -except ImportError: - SparkWrangler = None +pytestmark = pytest.mark.pyspark # noqa: E402 +pyspark = pytest.importorskip("pyspark") # noqa: E402 + +from pywrangler.wranglers.spark.base import SparkWrangler -@pytest.mark.pyspark def test_spark_base_wrangler_engine(): wrangler = SparkWrangler() From 170392758937b9d76fc8b1ca530b7a41c1fdcb1e Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Fri, 26 Apr 2019 22:31:44 +0200 Subject: [PATCH 28/48] Move pandas wrangler benchmark functions into pandas subpackage. --- src/pywrangler/wranglers/pandas/benchmark.py | 196 +++++++++++++++++++ tests/wranglers/pandas/test_benchmark.py | 101 ++++++++++ 2 files changed, 297 insertions(+) create mode 100644 src/pywrangler/wranglers/pandas/benchmark.py create mode 100644 tests/wranglers/pandas/test_benchmark.py diff --git a/src/pywrangler/wranglers/pandas/benchmark.py b/src/pywrangler/wranglers/pandas/benchmark.py new file mode 100644 index 0000000..c6211a5 --- /dev/null +++ b/src/pywrangler/wranglers/pandas/benchmark.py @@ -0,0 +1,196 @@ +"""This module contains benchmarking utility for pandas wranglers. + +""" + +from typing import Union + +import numpy as np +import pandas as pd + +from pywrangler.benchmark import MemoryProfiler, TimeProfiler +from pywrangler.util import sanitizer +from pywrangler.wranglers.pandas.base import PandasWrangler + + +class PandasTimeProfiler(TimeProfiler): + """Approximate time that a pandas wrangler instance requires to execute the + `fit_transform` step. 
+ + Parameters + ---------- + wrangler: pywrangler.wranglers.base.BaseWrangler + The wrangler instance to be profiled. + repetitions: None, int, optional + Number of repetitions. If `None`, `timeit.Timer.autorange` will + determine a sensible default. + + Attributes + ---------- + measurements: list + The actual profiling measurements in seconds. + best: float + The best measurement in seconds. + median: float + The median of measurements in seconds. + worst: float + The worst measurement in seconds. + std: float + The standard deviation of measurements in seconds. + runs: int + The number of measurements. + + Methods + ------- + profile + Contains the actual profiling implementation. + report + Print simple report consisting of best, median, worst, standard + deviation and the number of measurements. + profile_report + Calls profile and report in sequence. + + """ + + def __init__(self, wrangler: PandasWrangler, + repetitions: Union[None, int] = None): + self._wrangler = wrangler + super().__init__(wrangler.fit_transform, repetitions) + + +class PandasMemoryProfiler(MemoryProfiler): + """Approximate memory usage that a pandas wrangler instance requires to + execute the `fit_transform` step. + + As a key metric, `ratio` is computed. It refers to the amount of + memory which is required to execute the `fit_transform` step. More + concretely, it estimates how much more memory is used standardized by the + input memory usage (memory usage increase during function execution divided + by memory usage of input dataframes). In other words, if you have a 1GB + input dataframe, and the `usage_ratio` is 5, `fit_transform` needs 5GB free + memory available to succeed. A `usage_ratio` of 0.5 given a 2GB input + dataframe would require 1GB free memory available for computation. + + Parameters + ---------- + wrangler: pywrangler.wranglers.pandas.base.PandasWrangler + The wrangler instance to be profiled. + repetitions: int + The number of measurements for memory profiling. 
+ interval: float, optional + Defines interval duration between consecutive memory usage + measurements in seconds. + + Attributes + ---------- + measurements: list + The actual profiling measurements in bytes. + best: float + The best measurement in bytes. + median: float + The median of measurements in bytes. + worst: float + The worst measurement in bytes. + std: float + The standard deviation of measurements in bytes. + runs: int + The number of measurements. + baseline_change: float + The median change in baseline memory usage across all runs in bytes. + input: int + Memory usage of input dataframes in bytes. + output: int + Memory usage of output dataframes in bytes. + ratio: float + The amount of memory required for computation in units of input + memory usage. + + Methods + ------- + profile + Contains the actual profiling implementation. + report + Print simple report consisting of best, median, worst, standard + deviation and the number of measurements. + profile_report + Calls profile and report in sequence. + + """ + + def __init__(self, wrangler: PandasWrangler, repetitions: int = 5, + interval: float = 0.01): + self._wrangler = wrangler + + super().__init__(wrangler.fit_transform, repetitions, interval) + + def profile(self, *dfs: pd.DataFrame, **kwargs): + """Profiles the actual memory usage given input dataframes `dfs` + which are passed to `fit_transform`. + + """ + + # usage input + self._usage_input = self._memory_usage_dfs(*dfs) + + # usage output + dfs_output = self._wrangler.fit_transform(*dfs) + dfs_output = sanitizer.ensure_tuple(dfs_output) + self._usage_output = self._memory_usage_dfs(*dfs_output) + + # usage during fit_transform + super().profile(*dfs, **kwargs) + + return self + + @property + def input(self) -> float: + """Returns the memory usage of the input dataframes in bytes. 
+ + """ + + self._check_is_profiled(['_usage_input']) + return self._usage_input + + @property + def output(self) -> float: + """Returns the memory usage of the output dataframes in bytes. + + """ + + self._check_is_profiled(['_usage_output']) + return self._usage_output + + @property + def ratio(self) -> float: + """Refers to the amount of memory which is required to execute the + `fit_transform` step. More concretely, it estimates how much more + memory is used standardized by the input memory usage (memory usage + increase during function execution divided by memory usage of input + dataframes). In other words, if you have a 1GB input dataframe, and the + `usage_ratio` is 5, `fit_transform` needs 5GB free memory available to + succeed. A `usage_ratio` of 0.5 given a 2GB input dataframe would + require 1GB free memory available for computation. + + """ + + return self.median / self.input + + @staticmethod + def _memory_usage_dfs(*dfs: pd.DataFrame) -> int: + """Return memory usage in bytes for all given dataframes. + + Parameters + ---------- + dfs: pd.DataFrame + The pandas dataframes for which memory usage should be computed. + + Returns + ------- + memory_usage: int + The computed memory usage in bytes. + + """ + + mem_usages = [df.memory_usage(deep=True, index=True).sum() + for df in dfs] + + return int(np.sum(mem_usages)) diff --git a/tests/wranglers/pandas/test_benchmark.py b/tests/wranglers/pandas/test_benchmark.py new file mode 100644 index 0000000..6964a54 --- /dev/null +++ b/tests/wranglers/pandas/test_benchmark.py @@ -0,0 +1,101 @@ +"""This module contains tests for pandas benchmarks. 
+ +""" + +import time + +import numpy as np +import pandas as pd + +from pywrangler.benchmark import allocate_memory +from pywrangler.wranglers.pandas.base import PandasSingleNoFit +from pywrangler.wranglers.pandas.benchmark import ( + PandasMemoryProfiler, + PandasTimeProfiler +) + +MIB = 2 ** 20 + + +def test_pandas_memory_profiler_memory_usage_dfs(): + df1 = pd.DataFrame(np.random.rand(10)) + df2 = pd.DataFrame(np.random.rand(10)) + + test_input = [df1, df2] + test_output = int(df1.memory_usage(index=True, deep=True).sum() + + df2.memory_usage(index=True, deep=True).sum()) + + assert PandasMemoryProfiler._memory_usage_dfs(*test_input) == test_output + + +def test_pandas_memory_profiler_return_self(): + class DummyWrangler(PandasSingleNoFit): + def transform(self, df): + return pd.DataFrame() + + memory_profiler = PandasMemoryProfiler(DummyWrangler()) + + assert memory_profiler is memory_profiler.profile(pd.DataFrame()) + + +def test_pandas_memory_profiler_usage_increases_mean(): + empty_df = pd.DataFrame() + + class DummyWrangler(PandasSingleNoFit): + def transform(self, df): + return pd.DataFrame(allocate_memory(30)) + + memory_profiler = PandasMemoryProfiler(DummyWrangler()) + + assert memory_profiler.profile(empty_df).median > 29 * MIB + + +def test_pandas_memory_profiler_usage_input_output(): + df_input = pd.DataFrame(np.random.rand(1000)) + df_output = pd.DataFrame(np.random.rand(10000)) + + test_df_input = df_input.memory_usage(index=True, deep=True).sum() + test_df_output = df_output.memory_usage(index=True, deep=True).sum() + + class DummyWrangler(PandasSingleNoFit): + def transform(self, df): + return df_output + + memory_profiler = PandasMemoryProfiler(DummyWrangler()).profile(df_input) + + assert memory_profiler.input == test_df_input + assert memory_profiler.output == test_df_output + + +def test_pandas_memory_profiler_usage_ratio(): + usage_mib = 30 + df_input = pd.DataFrame(np.random.rand(1000000)) + usage_input = df_input.memory_usage(index=True, 
deep=True).sum()
+    test_output = ((usage_mib - 1) * MIB) / usage_input
+
+    class DummyWrangler(PandasSingleNoFit):
+        def transform(self, df):
+            return pd.DataFrame(allocate_memory(usage_mib))
+
+    memory_profiler = PandasMemoryProfiler(DummyWrangler())
+
+    assert memory_profiler.profile(df_input).ratio > test_output
+
+
+def test_pandas_time_profiler_fastest():
+    """Basic test for pandas time profiler ensuring fastest timing is slower
+    than forced sleep.
+
+    """
+
+    sleep = 0.0001
+    df_input = pd.DataFrame()
+
+    class DummyWrangler(PandasSingleNoFit):
+        def transform(self, df):
+            time.sleep(sleep)
+            return df
+
+    time_profiler = PandasTimeProfiler(DummyWrangler(), 1).profile(df_input)
+
+    assert time_profiler.best >= sleep

From fbcd573c3480e15bee1905dc8e573ee32798f67a Mon Sep 17 00:00:00 2001
From: mansenfranzen
Date: Fri, 26 Apr 2019 22:33:27 +0200
Subject: [PATCH 29/48] Move dask benchmarks into dask subpackage.

---
 src/pywrangler/wranglers/dask/benchmark.py | 91 ++++++++++++++++++++++
 tests/wranglers/dask/test_benchmark.py     | 43 ++++++++++
 2 files changed, 134 insertions(+)
 create mode 100644 src/pywrangler/wranglers/dask/benchmark.py
 create mode 100644 tests/wranglers/dask/test_benchmark.py

diff --git a/src/pywrangler/wranglers/dask/benchmark.py b/src/pywrangler/wranglers/dask/benchmark.py
new file mode 100644
index 0000000..44f2802
--- /dev/null
+++ b/src/pywrangler/wranglers/dask/benchmark.py
@@ -0,0 +1,91 @@
+"""This module contains benchmarking utility for dask wranglers.
+
+"""
+
+from typing import Union
+
+from dask.dataframe import DataFrame
+
+from pywrangler.benchmark import TimeProfiler
+from pywrangler.wranglers.dask.base import DaskWrangler
+
+
+class DaskTimeProfiler(TimeProfiler):
+    """Approximate time that a dask wrangler instance requires to execute the
+    `fit_transform` step.
+
+    Please note, input dataframes are cached before timing execution to ensure
+    timing measurements only capture wrangler's `fit_transform`.
This may cause + problems if the size of input dataframes exceeds available memory. + + Parameters + ---------- + wrangler: pywrangler.wranglers.base.BaseWrangler + The wrangler instance to be profiled. + repetitions: None, int, optional + Number of repetitions. If `None`, `timeit.Timer.autorange` will + determine a sensible default. + + Attributes + ---------- + measurements: list + The actual profiling measurements in seconds. + best: float + The best measurement in seconds. + median: float + The median of measurements in seconds. + worst: float + The worst measurement in seconds. + std: float + The standard deviation of measurements in seconds. + runs: int + The number of measurements. + + Methods + ------- + profile + Contains the actual profiling implementation. + report + Print simple report consisting of best, median, worst, standard + deviation and the number of measurements. + profile_report + Calls profile and report in sequence. + + """ + + def __init__(self, wrangler: DaskWrangler, + repetitions: Union[None, int] = None): + self._wrangler = wrangler + + def wrapper(*args, **kwargs): + """Wrapper function to call `compute()` to enforce computation. + + """ + + wrangler.fit_transform(*args, **kwargs).compute() + + super().__init__(wrapper, repetitions) + + def profile(self, *dfs: DataFrame, **kwargs): + """Profiles timing given input dataframes `dfs` which are passed to + `fit_transform`. + + Please note, input dataframes are cached before timing execution to + ensure timing measurements only capture wrangler's `fit_transform`. + This may cause problems if the size of input dataframes exceeds + available memory. 
+ + """ + + # cache input dataframes + dfs_cached = [df.persist() for df in dfs] + + super().profile(*dfs_cached, **kwargs) + + # clear caches + for df in dfs_cached: + del df + + del dfs_cached + + return self diff --git a/tests/wranglers/dask/test_benchmark.py b/tests/wranglers/dask/test_benchmark.py new file mode 100644 index 0000000..421d850 --- /dev/null +++ b/tests/wranglers/dask/test_benchmark.py @@ -0,0 +1,43 @@ +"""This module contains tests for dask benchmarks. + +isort:skip_file +""" + +import time + +import pytest +import pandas as pd +import numpy as np + +pytestmark = pytest.mark.dask # noqa: E402 +dask = pytest.importorskip("dask") # noqa: E402 + +from dask import dataframe as dd + +from pywrangler.wranglers.dask.benchmark import DaskTimeProfiler +from pywrangler.wranglers.dask.base import DaskSingleNoFit + +SLEEP = 0.0001 + + +@pytest.fixture +def wrangler_sleeps(): + class DummyWrangler(DaskSingleNoFit): + def transform(self, df): + time.sleep(SLEEP) + return df + + return DummyWrangler + + +def test_dask_time_profiler_fastest(spark, wrangler_sleeps): + """Basic test for dask time profiler ensuring fastest timing is slower + than forced sleep. + + """ + + df_input = dd.from_pandas(pd.DataFrame(np.random.rand(10, 10)), 2) + + time_profiler = DaskTimeProfiler(wrangler_sleeps(), 1).profile(df_input) + + assert time_profiler.best >= SLEEP From a05738f48902ed7de551b84bde6689d75b061a34 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Fri, 26 Apr 2019 22:40:38 +0200 Subject: [PATCH 30/48] Add `get_param_names` helper function. 
--- src/pywrangler/util/helper.py | 34 +++++++++++++++++++++++++++++++++- src/pywrangler/util/types.py | 5 ++++- tests/__init__.py | 0 tests/util/test_helper.py | 19 +++++++++++++++++++ 4 files changed, 56 insertions(+), 2 deletions(-) create mode 100644 tests/__init__.py create mode 100644 tests/util/test_helper.py diff --git a/src/pywrangler/util/helper.py b/src/pywrangler/util/helper.py index b845033..e05a3b8 100644 --- a/src/pywrangler/util/helper.py +++ b/src/pywrangler/util/helper.py @@ -2,7 +2,10 @@ """ -from typing import Callable +import inspect +from typing import Callable, List + +from pywrangler.util.types import T_STR_OPT_MUL def cached_property(method: Callable) -> property: @@ -36,3 +39,32 @@ def get_prop_value(obj): return value return property(get_prop_value, doc=docstring) + + +def get_param_names(func: Callable, + ignore: T_STR_OPT_MUL = None) -> List[str]: + """Retrieve all parameter names for given function. + + Parameters + ---------- + func: Callable + Function for which parameter names should be retrieved. + ignore: iterable, None, optional + Parameter names to be ignored. For example, `self` for `__init__` + functions. + + Returns + ------- + param_names: list + List of parameter names. 
+ + """ + + ignore = ignore or [] + + signature = inspect.signature(func) + parameters = signature.parameters.values() + + param_names = [x.name for x in parameters if x.name not in ignore] + + return param_names diff --git a/src/pywrangler/util/types.py b/src/pywrangler/util/types.py index 928106c..52c8213 100644 --- a/src/pywrangler/util/types.py +++ b/src/pywrangler/util/types.py @@ -4,5 +4,8 @@ from typing import Iterable, Union -TYPE_COLUMNS = Union[str, Iterable[str], None] +T_STR_OPT_MUL = Union[Iterable[str], None] +T_STR_OPT_SING_MUL = Union[str, Iterable[str], None] + +TYPE_COLUMNS = T_STR_OPT_SING_MUL TYPE_ASCENDING = Union[bool, Iterable[bool], None] diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/util/test_helper.py b/tests/util/test_helper.py new file mode 100644 index 0000000..1cc4d42 --- /dev/null +++ b/tests/util/test_helper.py @@ -0,0 +1,19 @@ +"""This module contains tests for the helper module. + +""" + +from pywrangler.util.helper import get_param_names + + +def test_get_param_names(): + + def func(): + pass + + assert get_param_names(func) == [] + + def func1(a, b=4, c=6): + pass + + assert get_param_names(func1) == ["a", "b", "c"] + assert get_param_names(func1, ["a"]) == ["b", "c"] From bb0a7a1c3641a330cceb21ce3dd520f8f7b4a155 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Sat, 27 Apr 2019 14:20:13 +0200 Subject: [PATCH 31/48] Add pytestmarks and remove wrong spark fixture. --- tests/wranglers/dask/test_base.py | 10 +++++----- tests/wranglers/dask/test_benchmark.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/wranglers/dask/test_base.py b/tests/wranglers/dask/test_base.py index da4ae61..3b5711c 100644 --- a/tests/wranglers/dask/test_base.py +++ b/tests/wranglers/dask/test_base.py @@ -1,16 +1,16 @@ """Test dask base wrangler. 
+isort:skip_file """ import pytest -try: - from pywrangler.wranglers.dask.base import DaskWrangler -except ImportError: - DaskWrangler = None +pytestmark = pytest.mark.dask # noqa: E402 +dask = pytest.importorskip("dask") # noqa: E402 + +from pywrangler.wranglers.dask.base import DaskWrangler -@pytest.mark.dask def test_dask_base_wrangler_engine(): wrangler = DaskWrangler() diff --git a/tests/wranglers/dask/test_benchmark.py b/tests/wranglers/dask/test_benchmark.py index 421d850..037b834 100644 --- a/tests/wranglers/dask/test_benchmark.py +++ b/tests/wranglers/dask/test_benchmark.py @@ -30,7 +30,7 @@ def transform(self, df): return DummyWrangler -def test_dask_time_profiler_fastest(spark, wrangler_sleeps): +def test_dask_time_profiler_fastest(wrangler_sleeps): """Basic test for dask time profiler ensuring fastest timing is slower than forced sleep. From 6e8b31157620ac12df5ec27513eda0ad346f42a7 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Sat, 27 Apr 2019 14:21:22 +0200 Subject: [PATCH 32/48] Refactor tests to use `create_wrangler` fixture. Add sleeps for memory profiling. --- tests/wranglers/pandas/test_benchmark.py | 88 ++++++++++++++---------- 1 file changed, 53 insertions(+), 35 deletions(-) diff --git a/tests/wranglers/pandas/test_benchmark.py b/tests/wranglers/pandas/test_benchmark.py index 6964a54..3d915b3 100644 --- a/tests/wranglers/pandas/test_benchmark.py +++ b/tests/wranglers/pandas/test_benchmark.py @@ -4,6 +4,8 @@ import time +import pytest + import numpy as np import pandas as pd @@ -17,6 +19,42 @@ MIB = 2 ** 20 +@pytest.fixture +def test_wrangler(): + """Helper fixture to generate PandasWrangler instances with parametrization + of transform output and sleep. + + """ + + def create_wrangler(size=None, result=None, sleep=0): + """Return instance of PandasWrangler. + + Parameters + ---------- + size: float + Memory size in MiB to allocate during transform step. + result: pd.DataFrame + Define extact return value of transform step. 
+ sleep: float + Define sleep interval. + + """ + + class DummyWrangler(PandasSingleNoFit): + def transform(self, df): + if size is not None: + df_out = pd.DataFrame(allocate_memory(size)) + else: + df_out = pd.DataFrame(result) + + time.sleep(sleep) + return df_out + + return DummyWrangler() + + return create_wrangler + + def test_pandas_memory_profiler_memory_usage_dfs(): df1 = pd.DataFrame(np.random.rand(10)) df2 = pd.DataFrame(np.random.rand(10)) @@ -28,74 +66,54 @@ def test_pandas_memory_profiler_memory_usage_dfs(): assert PandasMemoryProfiler._memory_usage_dfs(*test_input) == test_output -def test_pandas_memory_profiler_return_self(): - class DummyWrangler(PandasSingleNoFit): - def transform(self, df): - return pd.DataFrame() - - memory_profiler = PandasMemoryProfiler(DummyWrangler()) +def test_pandas_memory_profiler_return_self(test_wrangler): + memory_profiler = PandasMemoryProfiler(test_wrangler()) assert memory_profiler is memory_profiler.profile(pd.DataFrame()) -def test_pandas_memory_profiler_usage_increases_mean(): - empty_df = pd.DataFrame() - - class DummyWrangler(PandasSingleNoFit): - def transform(self, df): - return pd.DataFrame(allocate_memory(30)) +def test_pandas_memory_profiler_usage_median(test_wrangler): + wrangler = test_wrangler(size=30, sleep=0.01) + memory_profiler = PandasMemoryProfiler(wrangler) - memory_profiler = PandasMemoryProfiler(DummyWrangler()) + assert memory_profiler.profile(pd.DataFrame()).median > 29 * MIB - assert memory_profiler.profile(empty_df).median > 29 * MIB - -def test_pandas_memory_profiler_usage_input_output(): +def test_pandas_memory_profiler_usage_input_output(test_wrangler): df_input = pd.DataFrame(np.random.rand(1000)) df_output = pd.DataFrame(np.random.rand(10000)) test_df_input = df_input.memory_usage(index=True, deep=True).sum() test_df_output = df_output.memory_usage(index=True, deep=True).sum() - class DummyWrangler(PandasSingleNoFit): - def transform(self, df): - return df_output - - memory_profiler = 
PandasMemoryProfiler(DummyWrangler()).profile(df_input) + wrangler = test_wrangler(result=df_output) + memory_profiler = PandasMemoryProfiler(wrangler).profile(df_input) assert memory_profiler.input == test_df_input assert memory_profiler.output == test_df_output -def test_pandas_memory_profiler_usage_ratio(): +def test_pandas_memory_profiler_ratio(test_wrangler): usage_mib = 30 df_input = pd.DataFrame(np.random.rand(1000000)) usage_input = df_input.memory_usage(index=True, deep=True).sum() test_output = ((usage_mib - 1) * MIB) / usage_input - class DummyWrangler(PandasSingleNoFit): - def transform(self, df): - return pd.DataFrame(allocate_memory(usage_mib)) + wrangler = test_wrangler(size=usage_mib, sleep=0.01) - memory_profiler = PandasMemoryProfiler(DummyWrangler()) + memory_profiler = PandasMemoryProfiler(wrangler) assert memory_profiler.profile(df_input).ratio > test_output -def test_pandas_time_profiler_fastest(): +def test_pandas_time_profiler_best(test_wrangler): """Basic test for pandas time profiler ensuring fastest timing is slower than forced sleep. """ sleep = 0.0001 - df_input = pd.DataFrame() - - class DummyWrangler(PandasSingleNoFit): - def transform(self, df): - time.sleep(sleep) - return df - - time_profiler = PandasTimeProfiler(DummyWrangler(), 1).profile(df_input) + wrangler = test_wrangler(sleep=sleep) + time_profiler = PandasTimeProfiler(wrangler, 1).profile(pd.DataFrame()) assert time_profiler.best >= sleep From a2b4dfc9bae6dd9c4419c62a3a7b95db4fd8047a Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Sat, 27 Apr 2019 14:21:46 +0200 Subject: [PATCH 33/48] Add sleeps for memory profiling. 
--- tests/test_benchmark.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index e126b24..a656858 100644 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -176,6 +176,7 @@ def test_memory_profiler_no_increase(func_no_effect): def test_memory_profiler_increase(): def increase(): memory_holder = allocate_memory(30) + time.sleep(0.01) return memory_holder assert MemoryProfiler(increase).profile().median > 29 * MIB From 4bf9faa06646af715007bf1a3f26a618385d5b7e Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Sat, 27 Apr 2019 14:39:42 +0200 Subject: [PATCH 34/48] Allow memory usage tests to fail due to non deterministic memory management. --- tests/test_benchmark.py | 3 ++- tests/wranglers/pandas/test_benchmark.py | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index a656858..8787216 100644 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -173,10 +173,11 @@ def test_memory_profiler_no_increase(func_no_effect): assert memory_profiler.median < MIB +@pytest.mark.xfail(reason="Succeeds locally but sometimes fails remotely due " + "to non deterministic memory management.") def test_memory_profiler_increase(): def increase(): memory_holder = allocate_memory(30) - time.sleep(0.01) return memory_holder assert MemoryProfiler(increase).profile().median > 29 * MIB diff --git a/tests/wranglers/pandas/test_benchmark.py b/tests/wranglers/pandas/test_benchmark.py index 3d915b3..f7916d7 100644 --- a/tests/wranglers/pandas/test_benchmark.py +++ b/tests/wranglers/pandas/test_benchmark.py @@ -72,6 +72,8 @@ def test_pandas_memory_profiler_return_self(test_wrangler): assert memory_profiler is memory_profiler.profile(pd.DataFrame()) +@pytest.mark.xfail(reason="Succeeds locally but sometimes fails remotely due " + "to non deterministic memory management.") def test_pandas_memory_profiler_usage_median(test_wrangler): wrangler = 
test_wrangler(size=30, sleep=0.01) memory_profiler = PandasMemoryProfiler(wrangler) @@ -93,6 +95,8 @@ def test_pandas_memory_profiler_usage_input_output(test_wrangler): assert memory_profiler.output == test_df_output +@pytest.mark.xfail(reason="Succeeds locally but sometimes fails remotely due " + "to non deterministic memory management.") def test_pandas_memory_profiler_ratio(test_wrangler): usage_mib = 30 df_input = pd.DataFrame(np.random.rand(1000000)) From d14a5ff9b2325fa2a680422f584c061c27866f90 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Wed, 1 May 2019 15:03:30 +0200 Subject: [PATCH 35/48] Add assertion for number of runs. --- tests/test_benchmark.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index 8787216..16b49d8 100644 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -146,6 +146,7 @@ def test_memory_profiler_measurements(func_no_effect): assert memory_profiler.best == 4 assert memory_profiler.worst == 5 assert memory_profiler.baseline_change == 1 + assert memory_profiler.runs == 4 def test_memory_profiler_no_side_effect(func_no_effect): @@ -192,7 +193,7 @@ def test_time_profiler_measurements(func_no_effect): measurements = [1, 1, 3, 3] time_profiler = TimeProfiler(func_no_effect) - time_profiler._measurements = [1, 1, 3, 3] + time_profiler._measurements = measurements assert time_profiler.less_is_better is True assert time_profiler.median == 2 From 0927a2a891355b798651fcb0c915de5b105f310a Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Wed, 1 May 2019 15:04:20 +0200 Subject: [PATCH 36/48] Remove note about memory consumption of child processes. 
--- src/pywrangler/benchmark.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/pywrangler/benchmark.py b/src/pywrangler/benchmark.py index 7111571..26a7e63 100644 --- a/src/pywrangler/benchmark.py +++ b/src/pywrangler/benchmark.py @@ -248,8 +248,6 @@ class MemoryProfiler(BaseProfiler): usage during function execution and the baseline memory usage before function execution. - Note, memory consumption of child processes are included. - In addition, compute the mean increase in baseline memory usage between repetitions which might indicate memory leakage. From b28f6dbf9f14c90f1503419160d878f39c94fc09 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Wed, 1 May 2019 15:07:18 +0200 Subject: [PATCH 37/48] Add `DaskBaseProfiler` and `DaskMemoryProfiler`. --- src/pywrangler/wranglers/dask/benchmark.py | 211 ++++++++++++++++++--- tests/wranglers/dask/test_benchmark.py | 162 +++++++++++++++- 2 files changed, 335 insertions(+), 38 deletions(-) diff --git a/src/pywrangler/wranglers/dask/benchmark.py b/src/pywrangler/wranglers/dask/benchmark.py index 44f2802..936a242 100644 --- a/src/pywrangler/wranglers/dask/benchmark.py +++ b/src/pywrangler/wranglers/dask/benchmark.py @@ -2,22 +2,89 @@ """ -from typing import Union +import gc +import sys +import warnings +from typing import Callable, List, Union -from dask.dataframe import DataFrame +import numpy as np +from dask.diagnostics import ResourceProfiler -from pywrangler.benchmark import TimeProfiler +from pywrangler.benchmark import MemoryProfiler, TimeProfiler from pywrangler.wranglers.dask.base import DaskWrangler -class DaskTimeProfiler(TimeProfiler): +class DaskBaseProfiler: + """Define common methods for dask profiler. + + """ + + def _wrap_fit_transform(self) -> Callable: + """Wrapper function to call `compute()` on dask wrangler instances to + enforce computation on lazily evaluated dask graphs. + + Returns + ------- + wrapped: callable + Wrapped `fit_transform` method as a function. 
+ + """ + + def wrapped(*args, **kwargs): + return self.wrangler.fit_transform(*args, **kwargs).compute() + + return wrapped + + @staticmethod + def _cache_input(dfs) -> List: + """Persist lazily evaluated dask input collections before profiling to + capture only relevant `fit_transform`. + + Parameters + ---------- + dfs: iterable + Dask collections which can be persisted. + + Returns + ------- + persisted: iterable + List of computed dask collections. + + """ + + return [df.persist() for df in dfs] + + @staticmethod + def _clear_cached_input(dfs): + """Remove original reference to previously persisted dask collections + to enable garbage collection to free memory. Explicitly check reference + count and give warning if persisted dask collections are referenced + elsewhere which would prevent memory deallocation. + + Parameters + ---------- + dfs: iterable + Persisted dask collections which should be removed. + + """ + + # ensure reference counts are updated + gc.collect() + + # check ref counts + for df in dfs: + if sys.getrefcount(df) > 3: + warnings.warn("Persisted dask collection is referenced " + "elsewhere and prevents garbage collection", + ResourceWarning) + + dfs.clear() + + +class DaskTimeProfiler(TimeProfiler, DaskBaseProfiler): """Approximate time that a dask wrangler instance requires to execute the `fit_transform` step. - Please note, input dataframes are cached before timing execution to ensure - timing measurements only capture wrangler's `fit_transform`. This may cause - problems if the size of input dataframes exceeds available memory. - Parameters ---------- wrangler: pywrangler.wranglers.base.BaseWrangler @@ -25,6 +92,10 @@ class DaskTimeProfiler(TimeProfiler): repetitions: None, int, optional Number of repetitions. If `None`, `timeit.Timer.autorange` will determine a sensible default. + cache_input: bool, optional + Dask collections may be cached before timing execution to ensure + timing measurements only capture wrangler's `fit_transform`. 
By + default, it is disabled. Attributes ---------- @@ -54,38 +125,122 @@ class DaskTimeProfiler(TimeProfiler): """ def __init__(self, wrangler: DaskWrangler, - repetitions: Union[None, int] = None): - self._wrangler = wrangler + repetitions: Union[None, int] = None, + cache_input: bool = False): + self.wrangler = wrangler + self.cache_input = cache_input - def wrapper(*args, **kwargs): - """Wrapper function to call `compute()` to enforce computation. + func = self._wrap_fit_transform() + super().__init__(func, repetitions) - """ + def profile(self, *dfs, **kwargs): + """Profiles timing given input dataframes `dfs` which are passed to + `fit_transform`. - wrangler.fit_transform(*args, **kwargs).compute() + """ + + if self.cache_input: + dfs = self._cache_input(dfs) + + super().profile(*dfs, **kwargs) + + if self.cache_input: + self._clear_cached_input(dfs) + + return self - super().__init__(wrapper, repetitions) - def profile(self, *dfs: DataFrame, **kwargs): +class DaskMemoryProfiler(MemoryProfiler, DaskBaseProfiler): + """Approximate memory usage that a dask wrangler instance requires to + execute the `fit_transform` step. + + Parameters + ---------- + func: callable + Callable object to be memory profiled. + repetitions: int, optional + Number of repetitions. + interval: float, optional + Defines interval duration between consecutive memory usage + measurements in seconds. + cache_input: bool, optional + Dask collections may be cached before timing execution to ensure + timing measurements only capture wrangler's `fit_transform`. By + default, it is disabled. + + Attributes + ---------- + measurements: list + The actual profiling measurements in bytes. + best: float + The best measurement in bytes. + median: float + The median of measurements in bytes. + worst: float + The worst measurement in bytes. + std: float + The standard deviation of measurements in bytes. + runs: int + The number of measurements. 
+    baseline_change: float
+        The median change in baseline memory usage across all runs in bytes.
+
+    Methods
+    -------
+    profile
+        Contains the actual profiling implementation.
+    report
+        Print simple report consisting of best, median, worst, standard
+        deviation and the number of measurements.
+    profile_report
+        Calls profile and report in sequence.
+
+    Notes
+    -----
+    The implementation uses dask's own `ResourceProfiler`.
+
+    """
+
+    def __init__(self, wrangler: DaskWrangler,
+                 repetitions: Union[None, int] = 5,
+                 interval: float = 0.01,
+                 cache_input: bool = False):
+        self.wrangler = wrangler
+        self.cache_input = cache_input
+
+        func = self._wrap_fit_transform()
+        super().__init__(func, repetitions, interval)
+
+    def profile(self, *dfs, **kwargs):
+        """Profiles memory usage given input dataframes `dfs` which are
+        passed to `fit_transform`.
- """ - # cache input dataframes - dfs_cached = [df.persist() for df in dfs] + if self.cache_input: + dfs = self._cache_input(dfs) + + counter = 0 + baselines = [] + max_usages = [] + + while counter < self.repetitions: + gc.collect() + + with ResourceProfiler(dt=self.interval) as rprof: + self.func(*dfs, **kwargs) + + mem_usages = [x.mem for x in rprof.results] + baselines.append(np.min(mem_usages)) + max_usages.append(np.max(mem_usages)) - super().profile(*dfs_cached, **kwargs) + counter += 1 - # clear caches - for df in dfs_cached: - del df + self._max_usages = max_usages + self._baselines = baselines + self._measurements = np.subtract(max_usages, baselines).tolist() - del dfs_cached + if self.cache_input: + self._clear_cached_input(dfs) return self diff --git a/tests/wranglers/dask/test_benchmark.py b/tests/wranglers/dask/test_benchmark.py index 037b834..df4ad61 100644 --- a/tests/wranglers/dask/test_benchmark.py +++ b/tests/wranglers/dask/test_benchmark.py @@ -14,30 +14,172 @@ from dask import dataframe as dd -from pywrangler.wranglers.dask.benchmark import DaskTimeProfiler +from pywrangler.benchmark import allocate_memory +from pywrangler.wranglers.dask.benchmark import ( + DaskTimeProfiler, + DaskMemoryProfiler, + DaskBaseProfiler +) from pywrangler.wranglers.dask.base import DaskSingleNoFit -SLEEP = 0.0001 - @pytest.fixture -def wrangler_sleeps(): +def mean_wranger(): class DummyWrangler(DaskSingleNoFit): def transform(self, df): - time.sleep(SLEEP) - return df + return df.mean() + + return DummyWrangler() + + +@pytest.fixture +def test_wrangler(): + """Helper fixture to generate DaskWrangler instances with parametrization + of transform output and sleep. + + """ + + def create_wrangler(size=None, result=None, sleep=0): + """Return instance of DaskWrangler. + + Parameters + ---------- + size: float + Memory size in MiB to allocate during transform step. + result: Dask DataFrame + Define extact return value of transform step. 
+ sleep: float + Define sleep interval. + + """ + + class DummyWrangler(DaskSingleNoFit): + def transform(self, df): + if size is not None: + pdf = pd.DataFrame(allocate_memory(size)) + df_out = dd.from_pandas(pdf) + elif result is not None: + df_out = result + else: + df_out = dd.from_pandas(pd.DataFrame([0]), 1) + + time.sleep(sleep) + return df_out + + return DummyWrangler() + + return create_wrangler + + +def test_dask_base_profiler_wrap_fit_transform(test_wrangler): + pdf = pd.DataFrame(np.random.rand(50, 5)) + df = dd.from_pandas(pdf, 5).max().max() + + profiler = DaskTimeProfiler(wrangler=test_wrangler(result=df), + repetitions=1) + + wrapped = profiler._wrap_fit_transform() + + assert callable(wrapped) + assert wrapped(df) == pdf.max().max() + + +def test_dask_base_profiler_cache_input(): + class MockPersist: + def persist(self): + self.persist_called = True + return self + + dask_mocks = [MockPersist(), MockPersist()] + + persisted = DaskBaseProfiler._cache_input(dask_mocks) + + assert all([x.persist_called for x in persisted]) + + +def test_dask_base_profiler_clear_cache_input(): + pdf = pd.DataFrame(np.random.rand(50, 5)) - return DummyWrangler + with pytest.warns(None) as record: + DaskBaseProfiler._clear_cached_input([dd.from_pandas(pdf, 5)]) + assert len(record) == 0 + df = dd.from_pandas(pdf, 5) + ref = df # noqa: F841 -def test_dask_time_profiler_fastest(wrangler_sleeps): + with pytest.warns(ResourceWarning): + DaskBaseProfiler._clear_cached_input([df]) + + +def test_dask_time_profiler_fastest(test_wrangler): """Basic test for dask time profiler ensuring fastest timing is slower than forced sleep. 
""" + sleep = 0.001 + df_input = dd.from_pandas(pd.DataFrame(np.random.rand(10, 10)), 2) - time_profiler = DaskTimeProfiler(wrangler_sleeps(), 1).profile(df_input) + time_profiler = DaskTimeProfiler(wrangler=test_wrangler(sleep=sleep), + repetitions=1, + cache_input=True) + + assert time_profiler.profile(df_input).best >= sleep + + +def test_dask_time_profiler_profile_return_self(test_wrangler): + df_input = dd.from_pandas(pd.DataFrame(np.random.rand(10, 10)), 2) + + time_profiler = DaskTimeProfiler(wrangler=test_wrangler(), + repetitions=1) + + assert time_profiler.profile(df_input) is time_profiler + + +def test_dask_time_profiler_cached_faster(mean_wranger): + pdf = pd.DataFrame(np.random.rand(1000000, 10)) + df_input = dd.from_pandas(pdf, 2).mean() + + time_profiler_no_cache = DaskTimeProfiler(wrangler=mean_wranger, + repetitions=5, + cache_input=False) + + time_profiler_cache = DaskTimeProfiler(wrangler=mean_wranger, + repetitions=5, + cache_input=True) + + no_cache_time = time_profiler_no_cache.profile(df_input).median + cache_time = time_profiler_cache.profile(df_input).median + + assert no_cache_time > cache_time + + +def test_dask_memory_profiler_profile_return_self(test_wrangler): + df_input = dd.from_pandas(pd.DataFrame(np.random.rand(10, 10)), 2) + + mem_profiler = DaskMemoryProfiler(wrangler=test_wrangler(), + repetitions=1) + + assert mem_profiler.profile(df_input) is mem_profiler + assert mem_profiler.runs == 1 + + +def test_dask_memory_profiler_cached_lower_usage(mean_wranger): + pdf = pd.DataFrame(np.random.rand(1000000, 10)) + df_input = dd.from_pandas(pdf, 5).mean() + + mem_profiler_no_cache = DaskMemoryProfiler(wrangler=mean_wranger, + repetitions=5, + cache_input=False, + interval=0.00001) + + mem_profiler_cache = DaskMemoryProfiler(wrangler=mean_wranger, + repetitions=5, + cache_input=True, + interval=0.00001) + + no_cache_usage = mem_profiler_no_cache.profile(df_input).median + cache_usage = mem_profiler_cache.profile(df_input).median - 
assert time_profiler.best >= SLEEP + assert no_cache_usage > cache_usage From 01aa643ec10134e2d5f2ceebd3e7a4e4c428a758 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Wed, 5 Jun 2019 20:57:16 +0200 Subject: [PATCH 38/48] Doc string improvements. --- src/pywrangler/benchmark.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/pywrangler/benchmark.py b/src/pywrangler/benchmark.py index 26a7e63..82cd472 100644 --- a/src/pywrangler/benchmark.py +++ b/src/pywrangler/benchmark.py @@ -20,7 +20,7 @@ def allocate_memory(size: float) -> np.ndarray: - """Helper function for testing to allocate memory by creating numpy array + """Helper function to approximately allocate memory by creating numpy array with given size in MiB. Numpy is used deliberately to define the used memory via dtype. @@ -50,9 +50,10 @@ def allocate_memory(size: float) -> np.ndarray: class BaseProfiler: """Base class defining the interface for all profilers. - Subclasses have to implement `profile` (the actual profiling - implementation) and `less_is_better` (defining the ranking of profiling - measurements). + Subclasses have to implement `profile` (the actual profiling method) and + `less_is_better` (defining the ranking of profiling measurements). + + The private attribute `_measurements` is assumed to be set by `profile`. Attributes ---------- From 81a1aeac6329499d8343893e1f5ef0e7eec51a32 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Wed, 5 Jun 2019 20:59:40 +0200 Subject: [PATCH 39/48] Doc string improvement. 
--- src/pywrangler/wranglers/dask/benchmark.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pywrangler/wranglers/dask/benchmark.py b/src/pywrangler/wranglers/dask/benchmark.py index 936a242..b52e4f3 100644 --- a/src/pywrangler/wranglers/dask/benchmark.py +++ b/src/pywrangler/wranglers/dask/benchmark.py @@ -20,8 +20,8 @@ class DaskBaseProfiler: """ def _wrap_fit_transform(self) -> Callable: - """Wrapper function to call `compute()` on dask wrangler instances to - enforce computation on lazily evaluated dask graphs. + """Wrapper function to call `compute()` on wrangler's `fit_transform` + to enforce computation on lazily evaluated dask graphs. Returns ------- From 347cb6a93f1eab03db0ee513d6c55450c790dc3a Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Wed, 5 Jun 2019 21:04:30 +0200 Subject: [PATCH 40/48] Add SparkBaseProfiler. Add `cache_input` parameter. Add SparkMemoryProfiler template without current implementation. --- src/pywrangler/wranglers/spark/benchmark.py | 178 ++++++++++++++++---- 1 file changed, 149 insertions(+), 29 deletions(-) diff --git a/src/pywrangler/wranglers/spark/benchmark.py b/src/pywrangler/wranglers/spark/benchmark.py index ef5cbf2..c37ea2a 100644 --- a/src/pywrangler/wranglers/spark/benchmark.py +++ b/src/pywrangler/wranglers/spark/benchmark.py @@ -2,22 +2,81 @@ """ -from typing import Union +import warnings +from typing import Callable, Iterable, Union from pyspark.sql import DataFrame -from pywrangler.benchmark import TimeProfiler +from pywrangler.benchmark import MemoryProfiler, TimeProfiler from pywrangler.wranglers.spark.base import SparkWrangler -class SparkTimeProfiler(TimeProfiler): +class SparkBaseProfiler: + """Define common methods for spark profiler. + + """ + + def _wrap_fit_transform(self) -> Callable: + """Wrapper function to call `count()` on wrangler's `fit_transform` + to enforce computation on lazily evaluated spark dataframes. 
+ + Returns + ------- + wrapped: callable + Wrapped `fit_transform` method as a function. + + """ + + def wrapped(*args, **kwargs): + return self.wrangler.fit_transform(*args, **kwargs).count() + + return wrapped + + @staticmethod + def _cache_input(dfs: Iterable[DataFrame]): + """Persist lazily evaluated spark dataframes before profiling to + capture only relevant `fit_transform`. Apply `count` to enforce + computation to create cached representation. + + Parameters + ---------- + dfs: iterable + Spark dataframes to be persisted. + + Returns + ------- + persisted: iterable + List of computed dask collections. + + """ + + for df in dfs: + df.persist() + df.count() + + @staticmethod + def _clear_cached_input(dfs: Iterable[DataFrame]): + """Unpersist previously persisted spark dataframes after profiling. + + Parameters + ---------- + dfs: iterable + Persisted spark dataframes. + + """ + + for df in dfs: + df.unpersist() + + if df.is_cached: + warnings.warn("Spark dataframe could not be unpersisted.", + ResourceWarning) + + +class SparkTimeProfiler(TimeProfiler, SparkBaseProfiler): """Approximate time that a spark wrangler instance requires to execute the `fit_transform` step. - Please note, input dataframes are cached before timing execution to ensure - timing measurements only capture wrangler's `fit_transform`. This may cause - problems if the size of input dataframes exceeds available memory. - Parameters ---------- wrangler: pywrangler.wranglers.base.BaseWrangler @@ -25,6 +84,10 @@ class SparkTimeProfiler(TimeProfiler): repetitions: None, int, optional Number of repetitions. If `None`, `timeit.Timer.autorange` will determine a sensible default. + cache_input: bool, optional + Spark dataframes may be cached before timing execution to ensure + timing measurements only capture wrangler's `fit_transform`. By + default, it is disabled. 
Attributes ---------- @@ -54,17 +117,13 @@ class SparkTimeProfiler(TimeProfiler): """ def __init__(self, wrangler: SparkWrangler, - repetitions: Union[None, int] = None): - self._wrangler = wrangler - - def wrapper(*args, **kwargs): - """Wrapper function to call `count()` to enforce computation. - - """ + repetitions: Union[None, int] = None, + cache_input: bool = False): + self.wrangler = wrangler + self.cache_input = cache_input - wrangler.fit_transform(*args, **kwargs).count() - - super().__init__(wrapper, repetitions) + func = self._wrap_fit_transform() + super().__init__(func, repetitions) def profile(self, *dfs: DataFrame, **kwargs): """Profiles timing given input dataframes `dfs` which are passed to @@ -77,20 +136,81 @@ def profile(self, *dfs: DataFrame, **kwargs): """ - # cache input dataframes - dfs_cached = [df.cache() for df in dfs] + if self.cache_input: + self._cache_input(dfs) - # enforce caching calling count() action - for df in dfs_cached: - df.count() + super().profile(*dfs, **kwargs) - super().profile(*dfs_cached, **kwargs) + if self.cache_input: + self._clear_cached_input(dfs) - # clear caches - for df in dfs_cached: - df.unpersist() - del df + return self - del dfs_cached - return self +class SparkMemoryProfiler(MemoryProfiler, SparkBaseProfiler): + """Approximate memory usage that a spark wrangler instance requires to + execute the `fit_transform` step. + + #TODO: provide implementation for profile + + Parameters + ---------- + func: callable + Callable object to be memory profiled. + repetitions: int, optional + Number of repetitions. + interval: float, optional + Defines interval duration between consecutive memory usage + measurements in seconds. + cache_input: bool, optional + Spark dataframes may be cached before timing execution to ensure + timing measurements only capture wrangler's `fit_transform`. By + default, it is disabled. + + Attributes + ---------- + measurements: list + The actual profiling measurements in bytes. 
+ best: float + The best measurement in bytes. + median: float + The median of measurements in bytes. + worst: float + The worst measurement in bytes. + std: float + The standard deviation of measurements in bytes. + runs: int + The number of measurements. + baseline_change: float + The median change in baseline memory usage across all runs in bytes. + + Methods + ------- + profile + Contains the actual profiling implementation. + report + Print simple report consisting of best, median, worst, standard + deviation and the number of measurements. + profile_report + Calls profile and report in sequence. + + + """ + + def __init__(self, wrangler: SparkWrangler, + repetitions: Union[None, int] = 5, + interval: float = 0.01, + cache_input: bool = False): + self.wrangler = wrangler + self.cache_input = cache_input + + func = self._wrap_fit_transform() + super().__init__(func, repetitions, interval) + + def profile(self, *dfs: DataFrame, **kwargs): + """Profiles timing given input dataframes `dfs` which are passed to + `fit_transform`. + + """ + + raise NotImplementedError From 358163d429539bc401e1428845520f9227922c83 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Wed, 5 Jun 2019 21:11:00 +0200 Subject: [PATCH 41/48] Remove unused `cached_property`. --- src/pywrangler/util/helper.py | 33 --------------------------------- 1 file changed, 33 deletions(-) diff --git a/src/pywrangler/util/helper.py b/src/pywrangler/util/helper.py index e05a3b8..1b6acf4 100644 --- a/src/pywrangler/util/helper.py +++ b/src/pywrangler/util/helper.py @@ -8,39 +8,6 @@ from pywrangler.util.types import T_STR_OPT_MUL -def cached_property(method: Callable) -> property: - """Decorated method will be called only on first access to calculate a - cached property value. After that, the cached value is returned. - - Parameters - --------- - method: Callable - Getter method to be lazily evaluated. 
- - Returns - ------- - property - - Notes - ----- - Credit goes to python-pptx: https://github.com/scanny/python-pptx/blob/master/pptx/util.py - - """ # noqa: E501 - - cache_attr_name = '__{}'.format(method.__name__) - docstring = method.__doc__ - - def get_prop_value(obj): - try: - return getattr(obj, cache_attr_name) - except AttributeError: - value = method(obj) - setattr(obj, cache_attr_name, value) - return value - - return property(get_prop_value, doc=docstring) - - def get_param_names(func: Callable, ignore: T_STR_OPT_MUL = None) -> List[str]: """Retrieve all parameter names for given function. From 1c910fb30934ac0c48f4d769dcf768294e488612 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Wed, 5 Jun 2019 21:11:23 +0200 Subject: [PATCH 42/48] Remove obsolete commentaries. --- src/pywrangler/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/pywrangler/__init__.py b/src/pywrangler/__init__.py index c99a72a..49ab580 100644 --- a/src/pywrangler/__init__.py +++ b/src/pywrangler/__init__.py @@ -1,8 +1,6 @@ -# -*- coding: utf-8 -*- from pkg_resources import get_distribution, DistributionNotFound try: - # Change here if project is renamed and does not equal the package name dist_name = __name__ __version__ = get_distribution(dist_name).version except DistributionNotFound: From cd1ac619148897cd3fd4a036a23ac6ce6d46f5c8 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Wed, 5 Jun 2019 21:30:36 +0200 Subject: [PATCH 43/48] Allow failing of memory test due to non deterministic behaviour. 
--- tests/wranglers/dask/test_benchmark.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/wranglers/dask/test_benchmark.py b/tests/wranglers/dask/test_benchmark.py index df4ad61..1be0db1 100644 --- a/tests/wranglers/dask/test_benchmark.py +++ b/tests/wranglers/dask/test_benchmark.py @@ -165,6 +165,8 @@ def test_dask_memory_profiler_profile_return_self(test_wrangler): assert mem_profiler.runs == 1 +@pytest.mark.xfail(reason="Succeeds locally but sometimes fails remotely due " + "to non deterministic memory management.") def test_dask_memory_profiler_cached_lower_usage(mean_wranger): pdf = pd.DataFrame(np.random.rand(1000000, 10)) df_input = dd.from_pandas(pdf, 5).mean() From e9b803d4b52f8e89d9bd3a2486f27ffc4549ac1f Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Wed, 5 Jun 2019 21:32:29 +0200 Subject: [PATCH 44/48] Remove SparkMemoryProfiler. --- src/pywrangler/wranglers/spark/benchmark.py | 73 +-------------------- 1 file changed, 3 insertions(+), 70 deletions(-) diff --git a/src/pywrangler/wranglers/spark/benchmark.py b/src/pywrangler/wranglers/spark/benchmark.py index c37ea2a..492fc49 100644 --- a/src/pywrangler/wranglers/spark/benchmark.py +++ b/src/pywrangler/wranglers/spark/benchmark.py @@ -1,5 +1,7 @@ """This module contains benchmarking utility for pandas wranglers. +TODO: implement SparkMemoryProfiler + """ import warnings @@ -7,7 +9,7 @@ from pyspark.sql import DataFrame -from pywrangler.benchmark import MemoryProfiler, TimeProfiler +from pywrangler.benchmark import TimeProfiler from pywrangler.wranglers.spark.base import SparkWrangler @@ -145,72 +147,3 @@ def profile(self, *dfs: DataFrame, **kwargs): self._clear_cached_input(dfs) return self - - -class SparkMemoryProfiler(MemoryProfiler, SparkBaseProfiler): - """Approximate memory usage that a spark wrangler instance requires to - execute the `fit_transform` step. 
- - #TODO: provide implementation for profile - - Parameters - ---------- - func: callable - Callable object to be memory profiled. - repetitions: int, optional - Number of repetitions. - interval: float, optional - Defines interval duration between consecutive memory usage - measurements in seconds. - cache_input: bool, optional - Spark dataframes may be cached before timing execution to ensure - timing measurements only capture wrangler's `fit_transform`. By - default, it is disabled. - - Attributes - ---------- - measurements: list - The actual profiling measurements in bytes. - best: float - The best measurement in bytes. - median: float - The median of measurements in bytes. - worst: float - The worst measurement in bytes. - std: float - The standard deviation of measurements in bytes. - runs: int - The number of measurements. - baseline_change: float - The median change in baseline memory usage across all runs in bytes. - - Methods - ------- - profile - Contains the actual profiling implementation. - report - Print simple report consisting of best, median, worst, standard - deviation and the number of measurements. - profile_report - Calls profile and report in sequence. - - - """ - - def __init__(self, wrangler: SparkWrangler, - repetitions: Union[None, int] = 5, - interval: float = 0.01, - cache_input: bool = False): - self.wrangler = wrangler - self.cache_input = cache_input - - func = self._wrap_fit_transform() - super().__init__(func, repetitions, interval) - - def profile(self, *dfs: DataFrame, **kwargs): - """Profiles timing given input dataframes `dfs` which are passed to - `fit_transform`. - - """ - - raise NotImplementedError From a83811410ced03958c38af49a5dc4572f48768ab Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Wed, 5 Jun 2019 21:33:07 +0200 Subject: [PATCH 45/48] Add test for caching/uncaching. 
--- tests/wranglers/spark/test_benchmark.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/tests/wranglers/spark/test_benchmark.py b/tests/wranglers/spark/test_benchmark.py index c353448..6610960 100644 --- a/tests/wranglers/spark/test_benchmark.py +++ b/tests/wranglers/spark/test_benchmark.py @@ -11,7 +11,8 @@ pyspark = pytest.importorskip("pyspark") # noqa: E402 from pywrangler.wranglers.spark.base import SparkSingleNoFit -from pywrangler.wranglers.spark.benchmark import SparkTimeProfiler +from pywrangler.wranglers.spark.benchmark import SparkTimeProfiler, \ + SparkBaseProfiler SLEEP = 0.0001 @@ -40,13 +41,27 @@ def test_spark_time_profiler_fastest(spark, wrangler_sleeps): def test_spark_time_profiler_no_caching(spark, wrangler_sleeps): - """Pyspark input dataframes are cached during time profiling. Ensure input - dataframes are released from caching after profiling. + df_input = spark.range(10).toDF("col") + + SparkTimeProfiler(wrangler_sleeps(), 1).profile(df_input) + + assert df_input.is_cached is False - """ +def test_spark_time_profiler_caching(spark, wrangler_sleeps): + """Cache is released after profiling.""" df_input = spark.range(10).toDF("col") - SparkTimeProfiler(wrangler_sleeps(), 1).profile(df_input) + SparkTimeProfiler(wrangler_sleeps(), 1, cache_input=True).profile(df_input) assert df_input.is_cached is False + + +def test_spark_base_profiler_cache_input(spark): + df = spark.range(10).toDF("col") + + SparkBaseProfiler._cache_input([df]) + assert df.is_cached is True + + SparkBaseProfiler._clear_cached_input([df]) + assert df.is_cached is False From e94fab28cc0d7c3e51dfaedda134a94f1f002332 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Wed, 5 Jun 2019 21:39:16 +0200 Subject: [PATCH 46/48] Update changelog. 
--- CHANGELOG.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index e975cfd..74025be 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,6 +7,7 @@ Version 0.1.0 This is the initial release of pywrangler. +- Add benchmark utilities for pandas, spark and dask wranglers (`#5 `_). - Add sequential ``NaiveIterator`` and vectorized ``VectorizedCumSum`` pandas implementations for ``IntervalIdentifier`` wrangler (`#2 `_). - Add ``PandasWrangler`` (`#2 `_). - Add ``IntervalIdentifier`` wrangler interface (`#2 `_). From 9d0d36382eca2646bdae43afa5396bd2e374cf85 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Wed, 5 Jun 2019 22:00:07 +0200 Subject: [PATCH 47/48] Add args and kwargs to `fit`, `transform` and `fit_transform` to allow subclasses to implement varying positional and keyword arguments (linters). --- src/pywrangler/wranglers/base.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/pywrangler/wranglers/base.py b/src/pywrangler/wranglers/base.py index 0ee47df..6dd3c54 100644 --- a/src/pywrangler/wranglers/base.py +++ b/src/pywrangler/wranglers/base.py @@ -91,13 +91,13 @@ def set_params(self, **params): return self - def fit(self): + def fit(self, *args, **kwargs): raise NotImplementedError - def transform(self): + def transform(self, *args, **kwargs): raise NotImplementedError - def fit_transform(self): + def fit_transform(self, *args, **kwargs): raise NotImplementedError def __repr__(self): From 559fb46f0aa7fd7f0b2eba08cc0686dc1d37c940 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Wed, 5 Jun 2019 22:00:54 +0200 Subject: [PATCH 48/48] Remove unnecessary else clause. 
--- src/pywrangler/wranglers/pandas/interval_identifier.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/pywrangler/wranglers/pandas/interval_identifier.py b/src/pywrangler/wranglers/pandas/interval_identifier.py index 844fa08..b6c05a0 100644 --- a/src/pywrangler/wranglers/pandas/interval_identifier.py +++ b/src/pywrangler/wranglers/pandas/interval_identifier.py @@ -157,9 +157,8 @@ def is_valid_end(value, active): else: intermediate.append(active) - else: - # finally, add rest to result which must be invalid - result.extend([0] * len(intermediate)) + # finally, add rest to result which must be invalid + result.extend([0] * len(intermediate)) return result