From 075070ca3f49644edde3761ae66caaf4f26ad930 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Thu, 21 Mar 2019 21:24:05 +0100 Subject: [PATCH 01/48] Add `sizeof` pretty printer. Add `align_values` and `align_width` to `enumeration`. --- src/pywrangler/util/_pprint.py | 61 ++++++++++++++++++++++++++++++++-- 1 file changed, 59 insertions(+), 2 deletions(-) diff --git a/src/pywrangler/util/_pprint.py b/src/pywrangler/util/_pprint.py index 580f4d5..d602168 100644 --- a/src/pywrangler/util/_pprint.py +++ b/src/pywrangler/util/_pprint.py @@ -67,7 +67,8 @@ def header(name: str, indent: int = 0, underline: str = "-") -> str: return _join([_header, _underline]) -def enumeration(values: ENUM, indent: int = 0, bullet_char: str = "-") -> str: +def enumeration(values: ENUM, indent: int = 0, bullet_char: str = "-", + align_values: bool = True, align_width: int = 0) -> str: """Create enumeration with bullet points. Parameters @@ -78,6 +79,12 @@ def enumeration(values: ENUM, indent: int = 0, bullet_char: str = "-") -> str: Indentation count. bullet_char: str, optional Bullet character. + align_values: bool, optional + If dict is provided, align all values to the same column. The longest + key defines the exact position. + align_width: int, optional + If dict is provided and `align_values` is True, manually set the align + width. 
Returns ------- @@ -86,7 +93,14 @@ def enumeration(values: ENUM, indent: int = 0, bullet_char: str = "-") -> str: """ if isinstance(values, dict): - _values = ["{key}: {value}".format(key=key, value=value) + fstring = "{key:>{align_width}}: {value}" + if align_values and not align_width: + align_width = max([len(x) for x in values.keys()]) + + _values = [fstring.format(key=key, + value=value, + align_width=align_width) + for key, value in sorted(values.items())] else: _values = values @@ -95,3 +109,46 @@ def enumeration(values: ENUM, indent: int = 0, bullet_char: str = "-") -> str: indented = _indent(with_bullets, indent) return _join(indented) + + +def sizeof(size: float, precision: int = 2, align: str = ">", + width=None) -> str: + """Helper function to format size in human readable format. + + Parameters + ---------- + size: float + The size in bytes to be converted into human readable format. + precision: int, optional + Define shown precision. + align: {'<', '^', '>'}, optional + Format align specifier. + width: int + Define maximum width for number. + + Returns + ------- + human_fmt: str + Human readable representation of given `size`. + + Notes + ----- + Credit to https://stackoverflow.com/questions/1094841/reusable-library-to-get-human-readable-version-of-file-size + + """ # noqa: E501 + + template = "{size:{align}{width}.{precision}f} {unit}B" + + if width is None: + width = precision + 5 + + kwargs = dict(width=width, precision=precision, align=align) + + # iterate units (multiples of 1024 bytes) + for unit in ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi']: + if abs(size) < 1024.0: + return template.format(size=size, unit=unit, **kwargs) + size /= 1024.0 + + else: + return template.format(size=size, unit='Yi', **kwargs) From 447102052dd714bed58e549a1dcf3a60dfe3dc99 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Thu, 21 Mar 2019 21:24:43 +0100 Subject: [PATCH 02/48] Add `helper` module. 
--- src/pywrangler/util/helper.py | 38 +++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 src/pywrangler/util/helper.py diff --git a/src/pywrangler/util/helper.py b/src/pywrangler/util/helper.py new file mode 100644 index 0000000..b845033 --- /dev/null +++ b/src/pywrangler/util/helper.py @@ -0,0 +1,38 @@ +"""This module contains commonly used helper functions or classes. + +""" + +from typing import Callable + + +def cached_property(method: Callable) -> property: + """Decorated method will be called only on first access to calculate a + cached property value. After that, the cached value is returned. + + Parameters + --------- + method: Callable + Getter method to be lazily evaluated. + + Returns + ------- + property + + Notes + ----- + Credit goes to python-pptx: https://github.com/scanny/python-pptx/blob/master/pptx/util.py + + """ # noqa: E501 + + cache_attr_name = '__{}'.format(method.__name__) + docstring = method.__doc__ + + def get_prop_value(obj): + try: + return getattr(obj, cache_attr_name) + except AttributeError: + value = method(obj) + setattr(obj, cache_attr_name, value) + return value + + return property(get_prop_value, doc=docstring) From c8ec9b869b6689b4e1789f8bf84219d609b1fb3b Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Thu, 21 Mar 2019 21:25:33 +0100 Subject: [PATCH 03/48] Add single pandas dataframe to exception of iterables. 
--- src/pywrangler/util/sanitizer.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/pywrangler/util/sanitizer.py b/src/pywrangler/util/sanitizer.py index ab4e5ab..31a133d 100644 --- a/src/pywrangler/util/sanitizer.py +++ b/src/pywrangler/util/sanitizer.py @@ -6,6 +6,8 @@ import collections from typing import Any, Tuple +import pandas as pd + def ensure_tuple(values: Any) -> Tuple[Any]: """For convenience, some parameters may accept a single value (string @@ -31,8 +33,8 @@ def ensure_tuple(values: Any) -> Tuple[Any]: elif not isinstance(values, collections.Iterable): return (values, ) - # handle single string which is iterable but still is only one value - elif isinstance(values, str): + # handle exception which are iterable but still count as one value + elif isinstance(values, (str, pd.DataFrame)): return (values, ) # anything else should ok to be converted to tuple From 8997f91c560f10942a5a52fcd8dea1dc53ab79de Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Thu, 21 Mar 2019 21:26:20 +0100 Subject: [PATCH 04/48] Replace `+` operator with `add` method following pandas/numpy warning. 
--- src/pywrangler/wranglers/pandas/interval_identifier.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pywrangler/wranglers/pandas/interval_identifier.py b/src/pywrangler/wranglers/pandas/interval_identifier.py index 74b1dfa..844fa08 100644 --- a/src/pywrangler/wranglers/pandas/interval_identifier.py +++ b/src/pywrangler/wranglers/pandas/interval_identifier.py @@ -195,10 +195,10 @@ def _transform(self, series: pd.Series) -> List[int]: bool_end_shift = bool_end.shift().fillna(False) # get increasing ids for intervals (in/valid) with cumsum - ser_ids = (bool_start + bool_end_shift).cumsum() + ser_ids = bool_start.add(bool_end_shift).cumsum() # separate valid vs invalid: ids with start AND end marker are valid - bool_valid_ids = (bool_start + bool_end).groupby(ser_ids).sum().eq(2) + bool_valid_ids = bool_start.add(bool_end).groupby(ser_ids).sum().eq(2) valid_ids = bool_valid_ids.index[bool_valid_ids].values bool_valid = ser_ids.isin(valid_ids) From dc4c599a77a03c442763c09fa83f2c1c9970f0e0 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Thu, 21 Mar 2019 21:27:50 +0100 Subject: [PATCH 05/48] Pass dict instead of items to print parameters correctly. 
--- src/pywrangler/wranglers/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pywrangler/wranglers/base.py b/src/pywrangler/wranglers/base.py index c535650..94b02a4 100644 --- a/src/pywrangler/wranglers/base.py +++ b/src/pywrangler/wranglers/base.py @@ -110,7 +110,7 @@ def __repr__(self): template = '{wrangler_name} ({computation_engine})\n\n{parameters}'\ parameters = (_pprint.header("Parameters", 3) + - _pprint.enumeration(self.get_params().items(), 3)) + _pprint.enumeration(self.get_params(), 3)) _repr = template.format(wrangler_name=self.__class__.__name__, computation_engine=self.computation_engine, From f064839c5472c106ee757117827e02bc00506dbb Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Thu, 21 Mar 2019 21:28:06 +0100 Subject: [PATCH 06/48] Add `exceptions` module. --- src/pywrangler/exceptions.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 src/pywrangler/exceptions.py diff --git a/src/pywrangler/exceptions.py b/src/pywrangler/exceptions.py new file mode 100644 index 0000000..09bccf3 --- /dev/null +++ b/src/pywrangler/exceptions.py @@ -0,0 +1,13 @@ +"""The module contains package wide custom exceptions and warnings. + +""" + + +class NotProfiledError(ValueError, AttributeError): + """Exception class to raise if profiling results are acquired before + calling `profile`. + + This class inherits from both ValueError and AttributeError to help with + exception handling + + """ From 82929d1133b397ae0cc7f520850e9158493ebebe Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Thu, 21 Mar 2019 21:28:48 +0100 Subject: [PATCH 07/48] Add `benchmark` module. 
--- src/pywrangler/benchmark.py | 380 ++++++++++++++++++++++++++++++++++++ 1 file changed, 380 insertions(+) create mode 100644 src/pywrangler/benchmark.py diff --git a/src/pywrangler/benchmark.py b/src/pywrangler/benchmark.py new file mode 100644 index 0000000..babc88f --- /dev/null +++ b/src/pywrangler/benchmark.py @@ -0,0 +1,380 @@ +"""This module contains benchmarking utility. + +""" + +import gc +import inspect +import sys +from typing import Iterable, List + +import numpy as np + +from pywrangler.exceptions import NotProfiledError +from pywrangler.util import sanitizer +from pywrangler.util._pprint import enumeration, header, sizeof +from pywrangler.util.helper import cached_property + + +def allocate_memory(size: float) -> np.ndarray: + """Occupies memory by creating numpy array with given size (MB). + + Numpy is used deliberately to specifically define the used memory via + dtype. + + Parameters + ---------- + size: float + Size in MB to be occupied. + + Returns + ------- + memory_holder: np.ndarray + + """ + + if size <= 0: + return None + + empty_size = sys.getsizeof(np.ones(0)) + + size_in_bytes = np.ceil(size * (2 ** 20)).astype(np.int64) - empty_size + memory_holder = np.ones(size_in_bytes, dtype=np.int8) + + return memory_holder + + +class BaseProfile: + """Base class defining interface and providing common helper methods. + + By convention, the profiled object should always the be the first argument + (ignoring self) passed to `__init__`. All public, relevant profiling + metrics have to be defined as properties. All private attributes (methods + and variables) need to start with an underscore. + + """ + + def profile(self, *args, **kwargs): + raise NotImplementedError + + def report(self): + """Creates basic report consisting the name of the profiler class, the + name of the profiled object, and all defined metrics/properties. 
+ + """ + + # get name of profiler + profiler_name = self.__class__.__name__ + + # get name of profiled object + parameters = inspect.signature(self.__init__).parameters.keys() + profiled_object = getattr(self, '_{}'.format(list(parameters)[0])) + + try: + profiled_obj_name = profiled_object.__name__ + except AttributeError: + profiled_obj_name = profiled_object.__class__.__name__ + + # get relevant metrics + ignore = ('profile', 'report', 'profile_report') + metric_names = [x for x in dir(self) + if not x.startswith('_') + and x not in ignore] + metric_values = {x: getattr(self, x) for x in metric_names} + + print(header('{}: {}'.format(profiler_name, profiled_obj_name)), '\n', + enumeration(metric_values), sep='') + + def profile_report(self, *args, **kwargs): + self.profile(*args, **kwargs).report() + + def _check_is_profiled(self, attributes: Iterable[str]) -> None: + """Check if `profile` was already called by ensuring passed attributes + are not `None`. + + Parameters + ---------- + attributes: + Attribute name(s) given as string or a list/tuple of strings + + Returns + ------- + None + + Raises + ------ + NotProfiledError + + Notes + ----- + Inspired by sklearns `check_is_fitted`. + + """ + + if any([getattr(self, x) is None for x in attributes]): + msg = ("This {}'s instance is not profiled yet. Call 'profile' " + "with appropriate arguments before using this method." + .format(self.__class__.__name__)) + + raise NotProfiledError(msg) + + @staticmethod + def _mb_to_bytes(size_mib: float) -> int: + """Helper method to convert MiB to Bytes. + + Parameters + ---------- + size_mib: float + Size in MiB + + Returns + ------- + size_bytes: int + Size in bytes. + + """ + + return int(size_mib * (2 ** 20)) + + +class MemoryProfile(BaseProfile): + """Approximate the maximum increase in memory usage when calling a given + function. 
The maximum increase is defined as the difference between the + maximum memory usage during function execution and the baseline memory + usage before function execution. + + In addition, compute the mean increase in baseline memory usage between + repetitions which might indicate memory leakage. + + The current solution is based on `memory_profiler` and is inspired by the + IPython `%memit` magic which additionally calls `gc.collect()` before + executing the function to get more stable results. + + Parameters + ---------- + func: callable + Callable object to be memory profiled. + repetitions: int, optional + Number of repetitions. + + """ + + def __init__(self, func, repetitions=5): + self._func = func + self._repetitions = repetitions + + self._max_usages = None + self._baselines = None + + def profile(self, *args, **kwargs): + """Executes the actual memory profiling. + + Parameters + ---------- + args: iterable, optional + Optional positional arguments passed to `func`. + kwargs: mapping, optional + Optional keyword arguments passed to `func`. + + """ + + from memory_profiler import memory_usage + + counter = 0 + baselines = [] + max_usages = [] + mem_args = (self._func, args, kwargs) + + while counter < self._repetitions: + gc.collect() + baseline = memory_usage()[0] + max_usage = memory_usage(mem_args, max_usage=True)[0] + + baselines.append(self._mb_to_bytes(baseline)) + max_usages.append(self._mb_to_bytes(max_usage)) + counter += 1 + + self._max_usages = max_usages + self._baselines = baselines + + return self + + @property + def max_usages(self) -> List[int]: + """Returns the absolute, maximum memory usages for each iteration in + bytes. + + """ + + self._check_is_profiled(['_max_usages', '_baselines']) + + return self._max_usages + + @property + def baselines(self) -> List[int]: + """Returns the absolute, baseline memory usages for each iteration in + bytes. The baseline memory usage is defined as the memory usage before + function execution. 
+ + """ + + self._check_is_profiled(['_max_usages', '_baselines']) + + return self._baselines + + @property + def increases(self) -> List[int]: + """Returns the absolute memory increase for each iteration in bytes. + The memory increase is defined as the difference between the maximum + memory usage during function execution and the baseline memory usage + before function execution. + + """ + + return np.subtract(self.max_usages, self.baselines).tolist() + + @property + def increases_mean(self) -> float: + """Returns the mean of the absolute memory increases across all + iterations. + + """ + + return float(np.mean(self.increases)) + + @property + def increases_std(self) -> float: + """Returns the standard variation of the absolute memory increases + across all iterations. + + """ + + return float(np.std(self.increases)) + + @property + def baseline_change(self) -> float: + """Returns the mean change in baseline memory usage across all + all iterations. The baseline memory usage is defined as the memory + usage before function execution. + """ + + changes = np.diff(self.baselines) + return float(np.mean(changes)) + + +class PandasMemoryProfiler(BaseProfile): + """Approximate memory usage for wrangler execution via `fit_transform` + for given input dataframes. + + Computes the ratio of maximum memory usage and input memory usage as an + estimate of how many times more memory is required for wrangler execution + in regard to the input memory usage. 
+ + """ + + def __init__(self, wrangler, repetitions=5, precision=2): + self._wrangler = wrangler + self._repetitions = repetitions + self._precision = precision + + self._memory_profile = None + self._usage_input = None + self._usage_output = None + + def profile(self, *dfs, **kwargs): + + memory_profile = MemoryProfile(self._wrangler.fit_transform, + self._repetitions) + self._memory_profile = memory_profile.profile(*dfs, **kwargs) + + self._usage_input = self._memory_usage_dfs(*dfs) + + dfs_output = self._wrangler.fit_transform(*dfs) + dfs_output = sanitizer.ensure_tuple(dfs_output) + self._usage_output = self._memory_usage_dfs(*dfs_output) + + return self + + @property + def usage_increases_mean(self): + """Returns the mean of the absolute memory increases across all + iterations. + + """ + + self._check_is_profiled(['_memory_profile']) + return self._memory_profile.increases_mean + + @property + def usage_input(self) -> float: + """Returns the memory usage of the input dataframes in bytes. + + """ + + self._check_is_profiled(['_usage_input']) + return self._usage_input + + @property + def usage_output(self) -> float: + """Returns the memory usage of the output dataframes in bytes. + + """ + + self._check_is_profiled(['_usage_output']) + return self._usage_output + + @cached_property + def usage_ratio(self) -> float: + """Returns the ratio of maximum memory usage and input memory usage. + A value of 0 means no memory consumption during execution. A value of 1 + means that the wrangler additionally requires the same amount of the + input memory usage during the `transform` step. A value of 2 means that + the wrangler requires twice the amount of the input dataframes memory + usage. + + """ + + return self.usage_increases_mean / self.usage_input + + def report(self): + """Profile memory usage via `profile` and provide human readable + report. 
+ + """ + + # string part for header + wrangler_name = self._wrangler.__class__.__name__ + str_header = header("{} - memory usage".format(wrangler_name)) + + # string part for input and output dfs + dict_dfs = {"Input dfs": sizeof(self.usage_input, self._precision), + "Ouput dfs": sizeof(self.usage_output, self._precision)} + + str_dfs = enumeration(dict_dfs, align_width=15, bullet_char="") + + # string part for transform/fit and ratio + str_inc = sizeof(self.usage_increases_mean) + str_std = sizeof(self._memory_profile.increases_std, + self._precision, width=0) + str_inc += " (Std: {})".format(str_std) + str_ratio = "{:>7.2f}".format(self.usage_ratio) + dict_inc = {"Fit/Transform": str_inc, + "Ratio": str_ratio} + + str_inc = enumeration(dict_inc, align_width=15, bullet_char="") + + # build complete string and print + template = "{}\n{}\n\n{}" + report_string = template.format(str_header, str_dfs, str_inc) + + print(report_string) + + @staticmethod + def _memory_usage_dfs(*dfs) -> int: + """Return the memory usage in Bytes for all dataframes `dfs`. + + """ + + mem_usages = [df.memory_usage(deep=True, index=True).sum() + for df in dfs] + + return int(np.sum(mem_usages)) From 24ae1c1dafde7f62ff330c7206a3460c007533d8 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Thu, 21 Mar 2019 21:29:21 +0100 Subject: [PATCH 08/48] Comment `src` to avoid duplicated coverage. --- .coveragerc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.coveragerc b/.coveragerc index d1652c5..776fdf7 100644 --- a/.coveragerc +++ b/.coveragerc @@ -6,7 +6,7 @@ source = pywrangler [paths] source = - src/ +# src/ */site-packages/ [report] From 132ad48ab47b991151e1cd733c672174dbe43103 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Thu, 21 Mar 2019 21:29:53 +0100 Subject: [PATCH 09/48] Add `memory_profiler` to testing. 
--- setup.cfg | 2 ++ 1 file changed, 2 insertions(+) diff --git a/setup.cfg b/setup.cfg index 20d4f29..5685a63 100644 --- a/setup.cfg +++ b/setup.cfg @@ -32,6 +32,7 @@ testing = pytest pytest-cov tox + memory_profiler dev = sphinx @@ -52,6 +53,7 @@ norecursedirs = dist build .tox + testpaths = tests [aliases] From f800ec609b90e330433a2496894ed0387fe87f6b Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Fri, 22 Mar 2019 14:18:02 +0100 Subject: [PATCH 10/48] Add more expressive doc strings. Add tests. --- src/pywrangler/benchmark.py | 143 ++++++++++++++++++++-------- tests/test_benchmark.py | 183 ++++++++++++++++++++++++++++++++++++ 2 files changed, 289 insertions(+), 37 deletions(-) create mode 100644 tests/test_benchmark.py diff --git a/src/pywrangler/benchmark.py b/src/pywrangler/benchmark.py index babc88f..ad553fb 100644 --- a/src/pywrangler/benchmark.py +++ b/src/pywrangler/benchmark.py @@ -8,11 +8,13 @@ from typing import Iterable, List import numpy as np +import pandas as pd from pywrangler.exceptions import NotProfiledError from pywrangler.util import sanitizer from pywrangler.util._pprint import enumeration, header, sizeof from pywrangler.util.helper import cached_property +from pywrangler.wranglers.pandas.base import PandasWrangler def allocate_memory(size: float) -> np.ndarray: @@ -43,17 +45,23 @@ def allocate_memory(size: float) -> np.ndarray: return memory_holder -class BaseProfile: - """Base class defining interface and providing common helper methods. +class BaseProfiler: + """Base class defining interface and providing common helper methods for + memory and time profiler. By convention, the profiled object should always the be the first argument - (ignoring self) passed to `__init__`. All public, relevant profiling - metrics have to be defined as properties. All private attributes (methods - and variables) need to start with an underscore. + (ignoring self) passed to `__init__`. All public profiling metrics have to + be defined as properties. 
All private attributes need to start with an + underscore. """ def profile(self, *args, **kwargs): + """Contains the actual profiling implementation and should always + return self. + + """ + raise NotImplementedError def report(self): @@ -85,6 +93,10 @@ def report(self): enumeration(metric_values), sep='') def profile_report(self, *args, **kwargs): + """Calls profile and report in sequence. + + """ + self.profile(*args, **kwargs).report() def _check_is_profiled(self, attributes: Iterable[str]) -> None: @@ -136,7 +148,7 @@ def _mb_to_bytes(size_mib: float) -> int: return int(size_mib * (2 ** 20)) -class MemoryProfile(BaseProfile): +class MemoryProfiler(BaseProfiler): """Approximate the maximum increase in memory usage when calling a given function. The maximum increase is defined as the difference between the maximum memory usage during function execution and the baseline memory @@ -205,7 +217,7 @@ def max_usages(self) -> List[int]: """ - self._check_is_profiled(['_max_usages', '_baselines']) + self._check_is_profiled(['_max_usages']) return self._max_usages @@ -217,7 +229,7 @@ def baselines(self) -> List[int]: """ - self._check_is_profiled(['_max_usages', '_baselines']) + self._check_is_profiled(['_baselines']) return self._baselines @@ -261,43 +273,78 @@ def baseline_change(self) -> float: return float(np.mean(changes)) -class PandasMemoryProfiler(BaseProfile): - """Approximate memory usage for wrangler execution via `fit_transform` - for given input dataframes. +class PandasMemoryProfiler(BaseProfiler): + """Approximate memory usage for pandas wrangler instances. - Computes the ratio of maximum memory usage and input memory usage as an - estimate of how many times more memory is required for wrangler execution - in regard to the input memory usage. + Memory consumption is profiled while calling `fit_transform` for given + input dataframes. + + As a key metric, `usage_ratio` is computed. 
It refers to the amount of + memory which is required to execute the `fit_transform` step. More + concretely, it estimates how much more memory is used standardized by the + input memory usage (memory usage increase during function execution divided + by memory usage of input dataframes). In other words, if you have a 1GB + input dataframe, and the `usage_ratio` is 5, `fit_transform` needs 5GB free + memory available to succeed. A `usage_ratio` of 0.5 given a 2GB input + dataframe would require 1GB free memory available for computation. + + Parameters + ---------- + wrangler: pywrangler.wranglers.pandas.base.PandasWrangler + The wrangler instance to be profiled. + repetitions: int + The number of measurements for memory profiling. + + Attributes + ---------- + usage_increases_mean: float + The mean of the absolute memory increases across all iterations in + bytes. + usage_input: int + Memory usage of input dataframes in bytes. + usage_output: int + Memory usage of output dataframes in bytes. + usage_ratio: float + The amount of memory required for computation in units of input + memory usage. """ - def __init__(self, wrangler, repetitions=5, precision=2): + def __init__(self, wrangler: PandasWrangler, repetitions: int = 5): self._wrangler = wrangler self._repetitions = repetitions - self._precision = precision self._memory_profile = None self._usage_input = None self._usage_output = None - def profile(self, *dfs, **kwargs): + def profile(self, *dfs: pd.DataFrame, **kwargs): + """Profiles the actual memory usage given input dataframes `dfs` + which are passed to `fit_transform`. 
+ - memory_profile = MemoryProfile(self._wrangler.fit_transform, - self._repetitions) - self._memory_profile = memory_profile.profile(*dfs, **kwargs) + """ + + # usage input self._usage_input = self._memory_usage_dfs(*dfs) + # usage output dfs_output = self._wrangler.fit_transform(*dfs) dfs_output = sanitizer.ensure_tuple(dfs_output) self._usage_output = self._memory_usage_dfs(*dfs_output) + # usage during fit_transform + memory_profile = MemoryProfiler(self._wrangler.fit_transform, + self._repetitions) + self._memory_profile = memory_profile.profile(*dfs, **kwargs) + return self @property - def usage_increases_mean(self): + def usage_increases_mean(self) -> float: """Returns the mean of the absolute memory increases across all - iterations. + iterations in bytes. """ @@ -324,12 +371,14 @@ def usage_output(self) -> float: @cached_property def usage_ratio(self) -> float: - """Returns the ratio of maximum memory usage and input memory usage. - A value of 0 means no memory consumption during execution. A value of 1 - means that the wrangler additionally requires the same amount of the - input memory usage during the `transform` step. A value of 2 means that - the wrangler requires twice the amount of the input dataframes memory - usage. + """Refers to the amount of memory which is required to execute the + `fit_transform` step. More concretely, it estimates how much more + memory is used standardized by the input memory usage (memory usage + increase during function execution divided by memory usage of input + dataframes). In other words, if you have a 1GB input dataframe, and the + `usage_ratio` is 5, `fit_transform` needs 5GB free memory available to + succeed. A `usage_ratio` of 0.5 given a 2GB input dataframe would + require 1GB free memory available for computation. """ @@ -337,30 +386,40 @@ def usage_ratio(self) -> float: def report(self): """Profile memory usage via `profile` and provide human readable - report. 
+ report including memory usage of input and output dataframes, memory + usage during `fit_transform`, the usage ratio and shows if + the wrangler may have side effects in regard to memory consumption via + the change in baseline memory usage. + + Returns + ------- + None. Prints report to stdout. """ + enum_kwargs = dict(align_width=15, bullet_char="") + # string part for header wrangler_name = self._wrangler.__class__.__name__ str_header = header("{} - memory usage".format(wrangler_name)) # string part for input and output dfs - dict_dfs = {"Input dfs": sizeof(self.usage_input, self._precision), - "Ouput dfs": sizeof(self.usage_output, self._precision)} + dict_dfs = {"Input dfs": sizeof(self.usage_input), + "Ouput dfs": sizeof(self.usage_output)} - str_dfs = enumeration(dict_dfs, align_width=15, bullet_char="") + str_dfs = enumeration(dict_dfs, **enum_kwargs) # string part for transform/fit and ratio str_inc = sizeof(self.usage_increases_mean) - str_std = sizeof(self._memory_profile.increases_std, - self._precision, width=0) + str_std = sizeof(self._memory_profile.increases_std, width=0) str_inc += " (Std: {})".format(str_std) str_ratio = "{:>7.2f}".format(self.usage_ratio) + str_baseline_change = sizeof(self._memory_profile.baseline_change) dict_inc = {"Fit/Transform": str_inc, - "Ratio": str_ratio} + "Ratio": str_ratio, + "Baseline change": str_baseline_change} - str_inc = enumeration(dict_inc, align_width=15, bullet_char="") + str_inc = enumeration(dict_inc, **enum_kwargs) # build complete string and print template = "{}\n{}\n\n{}" @@ -369,9 +428,19 @@ def report(self): print(report_string) @staticmethod - def _memory_usage_dfs(*dfs) -> int: + def _memory_usage_dfs(*dfs: pd.DataFrame) -> int: """Return the memory usage in Bytes for all dataframes `dfs`. + Parameters + ---------- + dfs: pd.DataFrame + The pandas dataframes for which memory usage should be computed. + + Returns + ------- + memory_usage: int + The computed memory usage in bytes. 
+ """ mem_usages = [df.memory_usage(deep=True, index=True).sum() diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py new file mode 100644 index 0000000..9fc67e4 --- /dev/null +++ b/tests/test_benchmark.py @@ -0,0 +1,183 @@ +"""This module contains tests for the benchmark utilities. + +""" + +import sys + +import pytest + +import numpy as np +import pandas as pd + +from pywrangler.benchmark import ( + BaseProfiler, + MemoryProfiler, + PandasMemoryProfiler, + allocate_memory +) +from pywrangler.exceptions import NotProfiledError +from pywrangler.wranglers.pandas.base import PandasSingleNoFit + +MIB = 2 ** 20 + + +def test_allocate_memory_empty(): + memory_holder = allocate_memory(0) + + assert memory_holder is None + + +def test_allocate_memory_5mb(): + memory_holder = allocate_memory(5) + + assert sys.getsizeof(memory_holder) == 5 * (2 ** 20) + + +def test_base_profiler_not_implemented(): + base_profiler = BaseProfiler() + + for will_raise in ('profile', 'profile_report'): + with pytest.raises(NotImplementedError): + getattr(base_profiler, will_raise)() + + +def test_base_profiler_check_is_profiled(): + base_profiler = BaseProfiler() + base_profiler._not_set = None + base_profiler._is_set = "value" + + with pytest.raises(NotProfiledError): + base_profiler._check_is_profiled(['_not_set']) + + base_profiler._check_is_profiled(['_is_set']) + + +def test_base_profiler_mb_to_bytes(): + assert BaseProfiler._mb_to_bytes(1) == 1048576 + assert BaseProfiler._mb_to_bytes(1.5) == 1572864 + assert BaseProfiler._mb_to_bytes(0.33) == 346030 + + +def test_memory_profiler_return_self(): + def dummy(): + pass + + memory_profiler = MemoryProfiler(dummy) + assert memory_profiler.profile() is memory_profiler + + +def test_memory_profiler_properties(): + def dummy(): + pass + + memory_profiler = MemoryProfiler(dummy) + memory_profiler._baselines = [0, 1, 2, 3] + memory_profiler._max_usages = [4, 5, 7, 8] + + assert memory_profiler.max_usages == memory_profiler._max_usages + 
assert memory_profiler.baselines == memory_profiler._baselines + assert memory_profiler.increases == [4, 4, 5, 5] + assert memory_profiler.increases_mean == 4.5 + assert memory_profiler.increases_std == 0.5 + assert memory_profiler.baseline_change == 1 + + +def test_memory_profiler_no_side_effect(): + def no_side_effect(): + dummy = 5 + return dummy + + assert MemoryProfiler(no_side_effect).profile().baseline_change < 0.5 * MIB + + +def test_memory_profiler_side_effect(): + side_effect_container = [] + + def side_effect(): + memory_holder = allocate_memory(5) + side_effect_container.append(memory_holder) + + return memory_holder + + assert MemoryProfiler(side_effect).profile().baseline_change > 4.9 * MIB + + +def test_memory_profiler_no_increase(): + def no_increase(): + pass + + assert MemoryProfiler(no_increase).profile().increases_mean < 0.1 * MIB + assert MemoryProfiler(no_increase).profile().increases_std < 0.1 * MIB + + +def test_memory_profiler_increase(): + def increase(): + memory_holder = allocate_memory(30) + return memory_holder + + assert MemoryProfiler(increase).profile().increases_mean > 29 * MIB + + +def test_pandas_memory_profiler_memory_usage_dfs(): + df1 = pd.DataFrame(np.random.rand(10)) + df2 = pd.DataFrame(np.random.rand(10)) + + test_input = [df1, df2] + test_output = int(df1.memory_usage(index=True, deep=True).sum() + + df2.memory_usage(index=True, deep=True).sum()) + + assert PandasMemoryProfiler._memory_usage_dfs(*test_input) == test_output + + +def test_pandas_memory_profiler_return_self(): + class DummyWrangler(PandasSingleNoFit): + def transform(self, df): + return pd.DataFrame() + + memory_profiler = PandasMemoryProfiler(DummyWrangler()) + + assert memory_profiler is memory_profiler.profile(pd.DataFrame()) + + +def test_pandas_memory_profiler_usage_increases_mean(): + empty_df = pd.DataFrame() + + class DummyWrangler(PandasSingleNoFit): + def transform(self, df): + return pd.DataFrame(allocate_memory(30)) + + memory_profiler = 
PandasMemoryProfiler(DummyWrangler()) + + assert memory_profiler.profile(empty_df).usage_increases_mean > 29 * MIB + + +def test_pandas_memory_profiler_usage_input_output(): + df_input = pd.DataFrame(np.random.rand(1000)) + df_output = pd.DataFrame(np.random.rand(10000)) + + test_df_input = df_input.memory_usage(index=True, deep=True).sum() + test_df_output = df_output.memory_usage(index=True, deep=True).sum() + + class DummyWrangler(PandasSingleNoFit): + def transform(self, df): + return df_output + + memory_profiler = PandasMemoryProfiler(DummyWrangler()).profile(df_input) + + assert memory_profiler.usage_input == test_df_input + assert memory_profiler.usage_output == test_df_output + + +def test_pandas_memory_profiler_usage_ratio(): + usage_mib = 30 + df_input = pd.DataFrame(np.random.rand(1000000)) + usage_input = df_input.memory_usage(index=True, deep=True).sum() + test_output = ((usage_mib - 1) * MIB) / usage_input + + class DummyWrangler(PandasSingleNoFit): + def transform(self, df): + return pd.DataFrame(allocate_memory(usage_mib)) + + memory_profiler = PandasMemoryProfiler(DummyWrangler()) + + assert memory_profiler.profile(df_input).usage_ratio > test_output From ebfb04ea083d0cf50be5dc8dd03edffcea12cd4f Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Fri, 22 Mar 2019 14:21:06 +0100 Subject: [PATCH 11/48] Add and adjust tests for `enumeration` and `sizeof`. 
--- tests/util/test_pprint.py | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/tests/util/test_pprint.py b/tests/util/test_pprint.py index eb3967e..35a71b0 100644 --- a/tests/util/test_pprint.py +++ b/tests/util/test_pprint.py @@ -2,13 +2,10 @@ """ -import pytest - from pywrangler.util import _pprint def test_join(): - test_input = ["a", "b", "c"] test_output = "a\nb\nc" @@ -16,7 +13,6 @@ def test_join(): def test_indent(): - test_input = ["a", "b", "c"] test_output = [" a", " b", " c"] @@ -24,7 +20,6 @@ def test_indent(): def test_header(): - test_input = "Header" test_output = 'Header\n------\n' @@ -32,7 +27,6 @@ def test_header(): def test_header_with_indent(): - test_input = "Header" test_output = ' Header\n ------\n' @@ -40,23 +34,34 @@ def test_header_with_indent(): def test_header_with_underline(): - test_input = "Header" test_output = 'Header\n======\n' assert _pprint.header(test_input, underline="=") == test_output -def test_enumeration_dict(): +def test_enumeration_dict_align_values_false(): + test_input = {"a": 1, "bb": 2} + test_output = '- a: 1\n- bb: 2' + + assert _pprint.enumeration(test_input, align_values=False) == test_output + - test_input = {"a": 1, "b": 2} - test_output = '- a: 1\n- b: 2' +def test_enumeration_dict_align_values(): + test_input = {"a": 1, "bb": 2} + test_output = '- a: 1\n- bb: 2' assert _pprint.enumeration(test_input) == test_output -def test_enumeration_list(): +def test_enumeration_dict_align_values_with_align_width(): + test_input = {"a": 1, "bb": 2} + test_output = '- a: 1\n- bb: 2' + + assert _pprint.enumeration(test_input, align_width=3) == test_output + +def test_enumeration_list(): test_input = ["note 1", "note 2"] test_output = '- note 1\n- note 2' @@ -64,7 +69,6 @@ def test_enumeration_list(): def test_enumeration_list_with_indent(): - test_input = ["note 1", "note 2"] test_output = ' - note 1\n - note 2' @@ -72,8 +76,15 @@ def 
test_enumeration_list_with_indent(): def test_enumeration_list_with_bullet(): - test_input = ["note 1", "note 2"] test_output = 'o note 1\no note 2' assert _pprint.enumeration(test_input, bullet_char="o") == test_output + + +def test_sizeof(): + assert _pprint.sizeof(1024, precision=1, width=0) == '1.0 KiB' + assert _pprint.sizeof(1024, precision=1) == ' 1.0 KiB' + assert _pprint.sizeof(1024, precision=1, align="<") == '1.0 KiB' + assert _pprint.sizeof(1024 ** 2, precision=1, width=0) == '1.0 MiB' + assert _pprint.sizeof(1024 ** 8, precision=2, width=0) == '1.00 YiB' From e8a8a296397d2351e5b4c5f25c4f04ad2c18d7d2 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Sat, 23 Mar 2019 15:37:17 +0100 Subject: [PATCH 12/48] Add `TimeProfiler` and `PandasTimeProfiler` with tests to benchmark module. --- src/pywrangler/benchmark.py | 160 ++++++++++++++++++++++++++++++++---- tests/test_benchmark.py | 60 ++++++++++++++ 2 files changed, 206 insertions(+), 14 deletions(-) diff --git a/src/pywrangler/benchmark.py b/src/pywrangler/benchmark.py index ad553fb..6db33f5 100644 --- a/src/pywrangler/benchmark.py +++ b/src/pywrangler/benchmark.py @@ -5,7 +5,8 @@ import gc import inspect import sys -from typing import Iterable, List +import timeit +from typing import Callable, Iterable, List, Union import numpy as np import pandas as pd @@ -157,7 +158,7 @@ class MemoryProfiler(BaseProfiler): In addition, compute the mean increase in baseline memory usage between repetitions which might indicate memory leakage. - The current solution is based on `memory_profiler` and is inspired by the + The implementation is based on `memory_profiler` and is inspired by the IPython `%memit` magic which additionally calls `gc.collect()` before executing the function to get more stable results. 
@@ -273,7 +274,143 @@ def baseline_change(self) -> float: return float(np.mean(changes)) -class PandasMemoryProfiler(BaseProfiler): +class TimeProfiler(BaseProfiler): + """Approximate the time required to call a given function. + + The implementation is based on standard library's `timeit` module. By + default, the number of repetitions is estimated if not set explicitly. + + Parameters + ---------- + func: callable + Callable object to be memory profiled. + repetitions: None, int, optional + Number of repetitions. If `None`, `timeit.Timer.autorange` will + determine a sensible default. + + Attributes + ---------- + median: float + The median of the timing measurements in seconds. + standard_deviation: float + The standard deviation of the timing measurements in seconds. + fastast: float + The fastest value of the timing measurements in seconds. + repetitions: int + The number of measurements. + + """ + + def __init__(self, func: Callable, repetitions: Union[None, int] = None): + self._func = func + self._repetitions = repetitions + + self._timings = None + self._timings_mean = None + self._timings_std = None + self._fastest = None + + def profile(self, *args, **kwargs): + """Executes the actual time profiling. + + Parameters + ---------- + args: iterable, optional + Optional positional arguments passed to `func`. + kwargs: mapping, optional + Optional keyword arguments passed to `func`. + + """ + + def wrapper(): + """Helper function without arguments which is passed to `repeat` + which only calls given function with provided args and kwargs. + + """ + + self._func(*args, **kwargs) + + timer = timeit.Timer(stmt=wrapper) + + if self._repetitions is None: + repeat, _ = timer.autorange(None) + else: + repeat = self._repetitions + + self._timings = timer.repeat(number=1, repeat=repeat) + + return self + + @property + def median(self) -> float: + """Returns the median of all timeit measurements in seconds. 
+ + """ + self._check_is_profiled(['_timings']) + + return float(np.median(self._timings)) + + @property + def standard_deviation(self) -> float: + """Returns the standard deviation of all timeit measurements in + seconds. + + """ + self._check_is_profiled(['_timings']) + + return float(np.std(self._timings)) + + @property + def fastest(self) -> float: + """Returns the fastest timing measurement in seconds. + + """ + + self._check_is_profiled(['_timings']) + + return min(self._timings) + + @property + def repetitions(self) -> int: + """Returns the number of measurements. + + """ + + return len(self._timings) + + +class PandasTimeProfiler(TimeProfiler): + """Approximate time which pandas wrangler instances require during their + `fit_transform` step. + + Parameters + ---------- + wrangler: pywrangler.wranglers.pandas.base.PandasWrangler + The wrangler instance to be profiled. + repetitions: None, int, optional + Number of repetitions. If `None`, `timeit.Timer.autorange` will + determine a sensible default. + + Attributes + ---------- + median: float + The median of the timing measurements in seconds. + standard_deviation: float + The standard deviation of the timing measurements in seconds. + fastast: float + The fastest value of the timing measurements in seconds. + repetitions: int + The number of measurements. + + """ + + def __init__(self, wrangler: PandasWrangler, + repetitions: Union[None, int] = None): + self._wrangler = wrangler + super().__init__(wrangler.fit_transform, repetitions) + + +class PandasMemoryProfiler(MemoryProfiler): """Approximate memory usage for pandas wrangler instances. 
Memory consumption is profiled while calling `fit_transform` for given @@ -312,18 +449,16 @@ class PandasMemoryProfiler(BaseProfiler): def __init__(self, wrangler: PandasWrangler, repetitions: int = 5): self._wrangler = wrangler - self._repetitions = repetitions - self._memory_profile = None self._usage_input = None self._usage_output = None + super().__init__(wrangler.fit_transform, repetitions) + def profile(self, *dfs: pd.DataFrame, **kwargs): """Profiles the actual memory usage given input dataframes `dfs` which are passed to `fit_transform`. - - """ # usage input @@ -335,9 +470,7 @@ def profile(self, *dfs: pd.DataFrame, **kwargs): self._usage_output = self._memory_usage_dfs(*dfs_output) # usage during fit_transform - memory_profile = MemoryProfiler(self._wrangler.fit_transform, - self._repetitions) - self._memory_profile = memory_profile.profile(*dfs, **kwargs) + super().profile(*dfs, **kwargs) return self @@ -348,8 +481,7 @@ def usage_increases_mean(self) -> float: """ - self._check_is_profiled(['_memory_profile']) - return self._memory_profile.increases_mean + return self.increases_mean @property def usage_input(self) -> float: @@ -411,10 +543,10 @@ def report(self): # string part for transform/fit and ratio str_inc = sizeof(self.usage_increases_mean) - str_std = sizeof(self._memory_profile.increases_std, width=0) + str_std = sizeof(self.increases_std, width=0) str_inc += " (Std: {})".format(str_std) str_ratio = "{:>7.2f}".format(self.usage_ratio) - str_baseline_change = sizeof(self._memory_profile.baseline_change) + str_baseline_change = sizeof(self.baseline_change) dict_inc = {"Fit/Transform": str_inc, "Ratio": str_ratio, "Baseline change": str_baseline_change} diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index 9fc67e4..15685fc 100644 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -3,6 +3,7 @@ """ import sys +import time import pytest @@ -13,6 +14,8 @@ BaseProfiler, MemoryProfiler, PandasMemoryProfiler, + 
PandasTimeProfiler, + TimeProfiler, allocate_memory ) from pywrangler.exceptions import NotProfiledError @@ -181,3 +184,60 @@ def transform(self, df): memory_profiler = PandasMemoryProfiler(DummyWrangler()) assert memory_profiler.profile(df_input).usage_ratio > test_output + + +def test_time_profiler_return_self(): + def dummy(): + pass + + time_profiler = TimeProfiler(dummy, 1) + assert time_profiler.profile() is time_profiler + + +def test_time_profiler_properties(): + def dummy(): + pass + + time_profiler = TimeProfiler(dummy) + time_profiler._timings = [1, 1, 3, 3] + + assert time_profiler.median == 2 + assert time_profiler.standard_deviation == 1 + assert time_profiler.fastest == 1 + assert time_profiler.repetitions == 4 + + +def test_time_profiler_repetitions(): + def dummy(): + pass + + time_profiler = TimeProfiler(dummy, repetitions=10).profile() + + assert time_profiler.repetitions == 10 + + +def test_time_profiler_fastest(): + sleep = 0.0001 + + def dummy(): + time.sleep(sleep) + pass + + time_profiler = TimeProfiler(dummy, repetitions=1).profile() + + assert time_profiler.fastest >= sleep + + +def test_pandas_time_profiler_fastest(): + + sleep = 0.0001 + df_dummy = pd.DataFrame() + + class DummyWrangler(PandasSingleNoFit): + def transform(self, df): + time.sleep(sleep) + pass + + time_profiler = PandasTimeProfiler(DummyWrangler(), 1).profile(df_dummy) + + time_profiler.fastest >= sleep From 8d6c495e393d6336068a227a98e73a99a122e17f Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Wed, 3 Apr 2019 19:31:44 +0200 Subject: [PATCH 13/48] Improve doc strings. Simplify attributes of `PandasMemoryProfiler`. 
Add `timings` property to `TimeProfiler.` --- src/pywrangler/benchmark.py | 110 +++++++++++++++++++----------------- tests/test_benchmark.py | 10 ++-- 2 files changed, 65 insertions(+), 55 deletions(-) diff --git a/src/pywrangler/benchmark.py b/src/pywrangler/benchmark.py index 6db33f5..8b6156f 100644 --- a/src/pywrangler/benchmark.py +++ b/src/pywrangler/benchmark.py @@ -19,15 +19,15 @@ def allocate_memory(size: float) -> np.ndarray: - """Occupies memory by creating numpy array with given size (MB). + """Helper function for testing to allocate memory by creating numpy array + with given size in MiB. - Numpy is used deliberately to specifically define the used memory via - dtype. + Numpy is used deliberately to define the used memory via dtype. Parameters ---------- size: float - Size in MB to be occupied. + Size in MiB to be occupied. Returns ------- @@ -47,13 +47,13 @@ def allocate_memory(size: float) -> np.ndarray: class BaseProfiler: - """Base class defining interface and providing common helper methods for - memory and time profiler. + """Base class defining interface and common helper methods for memory and + time profiler. - By convention, the profiled object should always the be the first argument - (ignoring self) passed to `__init__`. All public profiling metrics have to - be defined as properties. All private attributes need to start with an - underscore. + By convention, the profiled object should always be the first argument + (ignoring self) passed to `__init__`. + All public profiling metrics have to should be defined as properties. All + private attributes need to start with an underscore. """ @@ -66,8 +66,8 @@ def profile(self, *args, **kwargs): raise NotImplementedError def report(self): - """Creates basic report consisting the name of the profiler class, the - name of the profiled object, and all defined metrics/properties. 
+ """Print simple report consisting of the name of the profiler class, + the name of the profiled object, and all defined metrics/properties. """ @@ -150,18 +150,14 @@ def _mb_to_bytes(size_mib: float) -> int: class MemoryProfiler(BaseProfiler): - """Approximate the maximum increase in memory usage when calling a given - function. The maximum increase is defined as the difference between the - maximum memory usage during function execution and the baseline memory - usage before function execution. + """Approximate the increase in memory usage when calling a given function. + Memory increase is defined as the difference between the maximum memory + usage during function execution and the baseline memory usage before + function execution. In addition, compute the mean increase in baseline memory usage between repetitions which might indicate memory leakage. - The implementation is based on `memory_profiler` and is inspired by the - IPython `%memit` magic which additionally calls `gc.collect()` before - executing the function to get more stable results. - Parameters ---------- func: callable @@ -169,6 +165,12 @@ class MemoryProfiler(BaseProfiler): repetitions: int, optional Number of repetitions. + Notes + ----- + The implementation is based on `memory_profiler` and is inspired by the + IPython `%memit` magic which additionally calls `gc.collect()` before + executing the function to get more stable results. + """ def __init__(self, func, repetitions=5): @@ -275,10 +277,9 @@ def baseline_change(self) -> float: class TimeProfiler(BaseProfiler): - """Approximate the time required to call a given function. + """Approximate the time required to execute a function call. - The implementation is based on standard library's `timeit` module. By - default, the number of repetitions is estimated if not set explicitly. + By default, the number of repetitions is estimated if not set explicitly. 
Parameters ---------- @@ -290,6 +291,8 @@ class TimeProfiler(BaseProfiler): Attributes ---------- + timings: list + The timing measurements in seconds. median: float The median of the timing measurements in seconds. standard_deviation: float @@ -299,6 +302,10 @@ class TimeProfiler(BaseProfiler): repetitions: int The number of measurements. + Notes + ----- + The implementation is based on standard library's `timeit` module. + """ def __init__(self, func: Callable, repetitions: Union[None, int] = None): @@ -341,11 +348,20 @@ def wrapper(): return self + @property + def timings(self) -> List[float]: + """Returns the timeit measurements in seconds. + + """ + + return self._timings + @property def median(self) -> float: """Returns the median of all timeit measurements in seconds. """ + self._check_is_profiled(['_timings']) return float(np.median(self._timings)) @@ -356,6 +372,7 @@ def standard_deviation(self) -> float: seconds. """ + self._check_is_profiled(['_timings']) return float(np.std(self._timings)) @@ -380,7 +397,7 @@ def repetitions(self) -> int: class PandasTimeProfiler(TimeProfiler): - """Approximate time which pandas wrangler instances require during their + """Approximate time that a pandas wrangler instance requires to execute the `fit_transform` step. Parameters @@ -393,6 +410,8 @@ class PandasTimeProfiler(TimeProfiler): Attributes ---------- + timings: list + The timing measurements in seconds. median: float The median of the timing measurements in seconds. standard_deviation: float @@ -411,12 +430,10 @@ def __init__(self, wrangler: PandasWrangler, class PandasMemoryProfiler(MemoryProfiler): - """Approximate memory usage for pandas wrangler instances. - - Memory consumption is profiled while calling `fit_transform` for given - input dataframes. + """Approximate memory usage that a pandas wrangler instance requires to + execute the `fit_transform` step. - As a key metric, `usage_ratio` is computed. 
It refers to the amount of + As a key metric, `ratio` is computed. It refers to the amount of memory which is required to execute the `fit_transform` step. More concretely, it estimates how much more memory is used standardized by the input memory usage (memory usage increase during function execution divided @@ -434,14 +451,14 @@ class PandasMemoryProfiler(MemoryProfiler): Attributes ---------- - usage_increases_mean: float + increases_mean: float The mean of the absolute memory increases across all iterations in bytes. - usage_input: int + input: int Memory usage of input dataframes in bytes. - usage_output: int + output: int Memory usage of output dataframes in bytes. - usage_ratio: float + ratio: float The amount of memory required for computation in units of input memory usage. @@ -475,16 +492,7 @@ def profile(self, *dfs: pd.DataFrame, **kwargs): return self @property - def usage_increases_mean(self) -> float: - """Returns the mean of the absolute memory increases across all - iterations in bytes. - - """ - - return self.increases_mean - - @property - def usage_input(self) -> float: + def input(self) -> float: """Returns the memory usage of the input dataframes in bytes. """ @@ -493,7 +501,7 @@ def usage_input(self) -> float: return self._usage_input @property - def usage_output(self) -> float: + def output(self) -> float: """Returns the memory usage of the output dataframes in bytes. """ @@ -502,7 +510,7 @@ def usage_output(self) -> float: return self._usage_output @cached_property - def usage_ratio(self) -> float: + def ratio(self) -> float: """Refers to the amount of memory which is required to execute the `fit_transform` step. 
More concretely, it estimates how much more memory is used standardized by the input memory usage (memory usage @@ -514,7 +522,7 @@ def usage_ratio(self) -> float: """ - return self.usage_increases_mean / self.usage_input + return self.increases_mean / self.input def report(self): """Profile memory usage via `profile` and provide human readable @@ -536,16 +544,16 @@ def report(self): str_header = header("{} - memory usage".format(wrangler_name)) # string part for input and output dfs - dict_dfs = {"Input dfs": sizeof(self.usage_input), - "Ouput dfs": sizeof(self.usage_output)} + dict_dfs = {"Input dfs": sizeof(self.input), + "Ouput dfs": sizeof(self.output)} str_dfs = enumeration(dict_dfs, **enum_kwargs) # string part for transform/fit and ratio - str_inc = sizeof(self.usage_increases_mean) + str_inc = sizeof(self.increases_mean) str_std = sizeof(self.increases_std, width=0) str_inc += " (Std: {})".format(str_std) - str_ratio = "{:>7.2f}".format(self.usage_ratio) + str_ratio = "{:>7.2f}".format(self.ratio) str_baseline_change = sizeof(self.baseline_change) dict_inc = {"Fit/Transform": str_inc, "Ratio": str_ratio, @@ -561,7 +569,7 @@ def report(self): @staticmethod def _memory_usage_dfs(*dfs: pd.DataFrame) -> int: - """Return the memory usage in Bytes for all dataframes `dfs`. + """Return memory usage in bytes for all given dataframes. 
Parameters ---------- diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index 15685fc..3fd493b 100644 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -151,7 +151,7 @@ def transform(self, df): memory_profiler = PandasMemoryProfiler(DummyWrangler()) - assert memory_profiler.profile(empty_df).usage_increases_mean > 29 * MIB + assert memory_profiler.profile(empty_df).increases_mean > 29 * MIB def test_pandas_memory_profiler_usage_input_output(): @@ -167,8 +167,8 @@ def transform(self, df): memory_profiler = PandasMemoryProfiler(DummyWrangler()).profile(df_input) - assert memory_profiler.usage_input == test_df_input - assert memory_profiler.usage_output == test_df_output + assert memory_profiler.input == test_df_input + assert memory_profiler.output == test_df_output def test_pandas_memory_profiler_usage_ratio(): @@ -183,7 +183,7 @@ def transform(self, df): memory_profiler = PandasMemoryProfiler(DummyWrangler()) - assert memory_profiler.profile(df_input).usage_ratio > test_output + assert memory_profiler.profile(df_input).ratio > test_output def test_time_profiler_return_self(): @@ -195,6 +195,7 @@ def dummy(): def test_time_profiler_properties(): + def dummy(): pass @@ -205,6 +206,7 @@ def dummy(): assert time_profiler.standard_deviation == 1 assert time_profiler.fastest == 1 assert time_profiler.repetitions == 4 + assert time_profiler.timings == time_profiler._timings def test_time_profiler_repetitions(): From 1ba767fd03af16cff02aa5ac46cccd97d74b3332 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Wed, 3 Apr 2019 21:30:41 +0200 Subject: [PATCH 14/48] Add dask base wrangler. 
--- src/pywrangler/wranglers/dask/__init__.py | 0 src/pywrangler/wranglers/dask/base.py | 53 +++++++++++++++++++++++ tests/wranglers/dask/__init__.py | 0 tests/wranglers/dask/test_base.py | 17 ++++++++ 4 files changed, 70 insertions(+) create mode 100644 src/pywrangler/wranglers/dask/__init__.py create mode 100644 src/pywrangler/wranglers/dask/base.py create mode 100644 tests/wranglers/dask/__init__.py create mode 100644 tests/wranglers/dask/test_base.py diff --git a/src/pywrangler/wranglers/dask/__init__.py b/src/pywrangler/wranglers/dask/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/pywrangler/wranglers/dask/base.py b/src/pywrangler/wranglers/dask/base.py new file mode 100644 index 0000000..6953c16 --- /dev/null +++ b/src/pywrangler/wranglers/dask/base.py @@ -0,0 +1,53 @@ +"""This module contains the dask base wrangler. + +""" + +from dask.dataframe import DataFrame + +from pywrangler.wranglers.base import BaseWrangler + + +class DaskWrangler(BaseWrangler): + """Contains methods common to all dask based wranglers. + + """ + + @property + def computation_engine(self): + return "dask" + + +class DaskSingleNoFit(DaskWrangler): + """Mixin class defining `fit` and `fit_transform` for all wranglers with + a single data frame input and output with no fitting necessary. + + """ + + def fit(self, df: DataFrame): + """Do nothing and return the wrangler unchanged. + + This method is just there to implement the usual API and hence work in + pipelines. + + Parameters + ---------- + df: pd.DataFrame + + """ + + return self + + def fit_transform(self, df: DataFrame) -> DataFrame: + """Apply fit and transform in sequence at once. 
+ + Parameters + ---------- + df: pd.DataFrame + + Returns + ------- + result: pd.DataFrame + + """ + + return self.fit(df).transform(df) diff --git a/tests/wranglers/dask/__init__.py b/tests/wranglers/dask/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/wranglers/dask/test_base.py b/tests/wranglers/dask/test_base.py new file mode 100644 index 0000000..da4ae61 --- /dev/null +++ b/tests/wranglers/dask/test_base.py @@ -0,0 +1,17 @@ +"""Test dask base wrangler. + +""" + +import pytest + +try: + from pywrangler.wranglers.dask.base import DaskWrangler +except ImportError: + DaskWrangler = None + + +@pytest.mark.dask +def test_dask_base_wrangler_engine(): + wrangler = DaskWrangler() + + assert wrangler.computation_engine == "dask" From dea81bd283b9636853cb6fc1d6a0fcca9daf5578 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Wed, 3 Apr 2019 21:31:10 +0200 Subject: [PATCH 15/48] Add spark base wrangler. --- src/pywrangler/wranglers/spark/__init__.py | 0 src/pywrangler/wranglers/spark/base.py | 53 ++++++++++++++++++++++ tests/wranglers/spark/__init__.py | 0 tests/wranglers/spark/test_base.py | 17 +++++++ 4 files changed, 70 insertions(+) create mode 100644 src/pywrangler/wranglers/spark/__init__.py create mode 100644 src/pywrangler/wranglers/spark/base.py create mode 100644 tests/wranglers/spark/__init__.py create mode 100644 tests/wranglers/spark/test_base.py diff --git a/src/pywrangler/wranglers/spark/__init__.py b/src/pywrangler/wranglers/spark/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/pywrangler/wranglers/spark/base.py b/src/pywrangler/wranglers/spark/base.py new file mode 100644 index 0000000..8f7ba14 --- /dev/null +++ b/src/pywrangler/wranglers/spark/base.py @@ -0,0 +1,53 @@ +"""This module contains the dask base wrangler. 
+ +""" + +from pyspark.sql import DataFrame + +from pywrangler.wranglers.base import BaseWrangler + + +class SparkWrangler(BaseWrangler): + """Contains methods common to all spark based wranglers. + + """ + + @property + def computation_engine(self): + return "spark" + + +class SparkSingleNoFit(SparkWrangler): + """Mixin class defining `fit` and `fit_transform` for all wranglers with + a single data frame input and output with no fitting necessary. + + """ + + def fit(self, df: DataFrame): + """Do nothing and return the wrangler unchanged. + + This method is just there to implement the usual API and hence work in + pipelines. + + Parameters + ---------- + df: pd.DataFrame + + """ + + return self + + def fit_transform(self, df: DataFrame) -> DataFrame: + """Apply fit and transform in sequence at once. + + Parameters + ---------- + df: pd.DataFrame + + Returns + ------- + result: pd.DataFrame + + """ + + return self.fit(df).transform(df) diff --git a/tests/wranglers/spark/__init__.py b/tests/wranglers/spark/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/wranglers/spark/test_base.py b/tests/wranglers/spark/test_base.py new file mode 100644 index 0000000..6c20d21 --- /dev/null +++ b/tests/wranglers/spark/test_base.py @@ -0,0 +1,17 @@ +"""Test spark base wrangler. + +""" + +import pytest + +try: + from pywrangler.wranglers.spark.base import SparkWrangler +except ImportError: + SparkWrangler = None + + +@pytest.mark.pyspark +def test_spark_base_wrangler_engine(): + wrangler = SparkWrangler() + + assert wrangler.computation_engine == "spark" From 533685e6dbe00e161455f762284db24e25cbafd0 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Wed, 3 Apr 2019 21:31:26 +0200 Subject: [PATCH 16/48] Fix typo. 
--- tests/test_environment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_environment.py b/tests/test_environment.py index d8cee0d..5018288 100644 --- a/tests/test_environment.py +++ b/tests/test_environment.py @@ -45,7 +45,7 @@ def test_pyspark_import(): @pytest.mark.pyspark def test_pyspark_pandas_interaction(spark): - """Check simple interaction between pyspark and pandes. + """Check simple interaction between pyspark and pandas. """ From 15d0952ae4f1493cf7f5396b4aeb257bd5095d1a Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Wed, 3 Apr 2019 21:33:04 +0200 Subject: [PATCH 17/48] Add custom `report` method for `TimeProfiler`. Add `SparkTimeProfiler`. --- src/pywrangler/benchmark.py | 120 ++++++++++++++++++++++++++++++++++-- tests/test_benchmark.py | 59 ++++++++++++++++-- 2 files changed, 169 insertions(+), 10 deletions(-) diff --git a/src/pywrangler/benchmark.py b/src/pywrangler/benchmark.py index 8b6156f..b823234 100644 --- a/src/pywrangler/benchmark.py +++ b/src/pywrangler/benchmark.py @@ -6,7 +6,7 @@ import inspect import sys import timeit -from typing import Callable, Iterable, List, Union +from typing import Any, Callable, Iterable, List, Union import numpy as np import pandas as pd @@ -15,7 +15,17 @@ from pywrangler.util import sanitizer from pywrangler.util._pprint import enumeration, header, sizeof from pywrangler.util.helper import cached_property -from pywrangler.wranglers.pandas.base import PandasWrangler +from pywrangler.wranglers.base import BaseWrangler + +try: + from pyspark.sql import DataFrame as SparkDataFrame +except ImportError: + SparkDataFrame = Any + +try: + from dask.dataframe import DataFrame as DaskDataFrame +except ImportError: + DaskDataFrame = Any def allocate_memory(size: float) -> np.ndarray: @@ -395,6 +405,35 @@ def repetitions(self) -> int: return len(self._timings) + def report(self): + """Profile time via `profile` and provide human readable report. + + Returns + ------- + None. 
Prints report to stdout. + + """ + + enum_kwargs = dict(align_width=15, bullet_char="") + + # string part for header + wrangler_name = self._wrangler.__class__.__name__ + str_header = header("{} - time profiling".format(wrangler_name)) + + # string part for values + dict_values = {"Fastest": "{:.2f}s".format(self.fastest), + "Median": "{:.2f}s".format(self.median), + "Std": "{:.2f}s".format(self.standard_deviation), + "Repetitions": self.repetitions} + + str_values = enumeration(dict_values, **enum_kwargs) + + # build complete string and print + template = "{}\n{}\n" + report_string = template.format(str_header, str_values) + + print(report_string) + class PandasTimeProfiler(TimeProfiler): """Approximate time that a pandas wrangler instance requires to execute the @@ -402,7 +441,7 @@ class PandasTimeProfiler(TimeProfiler): Parameters ---------- - wrangler: pywrangler.wranglers.pandas.base.PandasWrangler + wrangler: pywrangler.wranglers.base.BaseWrangler The wrangler instance to be profiled. repetitions: None, int, optional Number of repetitions. If `None`, `timeit.Timer.autorange` will @@ -423,7 +462,7 @@ class PandasTimeProfiler(TimeProfiler): """ - def __init__(self, wrangler: PandasWrangler, + def __init__(self, wrangler: BaseWrangler, repetitions: Union[None, int] = None): self._wrangler = wrangler super().__init__(wrangler.fit_transform, repetitions) @@ -464,7 +503,7 @@ class PandasMemoryProfiler(MemoryProfiler): """ - def __init__(self, wrangler: PandasWrangler, repetitions: int = 5): + def __init__(self, wrangler: BaseWrangler, repetitions: int = 5): self._wrangler = wrangler self._usage_input = None @@ -587,3 +626,74 @@ def _memory_usage_dfs(*dfs: pd.DataFrame) -> int: for df in dfs] return int(np.sum(mem_usages)) + + +class SparkTimeProfiler(TimeProfiler): + """Approximate time that a spark wrangler instance requires to execute the + `fit_transform` step. 
+ + Please note, input dataframes are cached before timing execution to ensure + timing measurements only capture wrangler's `fit_transform`. This may cause + problems if the size of input dataframes exceeds available memory. + + Parameters + ---------- + wrangler: pywrangler.wranglers.base.BaseWrangler + The wrangler instance to be profiled. + repetitions: None, int, optional + Number of repetitions. If `None`, `timeit.Timer.autorange` will + determine a sensible default. + + Attributes + ---------- + timings: list + The timing measurements in seconds. + median: float + The median of the timing measurements in seconds. + standard_deviation: float + The standard deviation of the timing measurements in seconds. + fastast: float + The fastest value of the timing measurements in seconds. + repetitions: int + The number of measurements. + + """ + + def __init__(self, wrangler: BaseWrangler, + repetitions: Union[None, int] = None): + self._wrangler = wrangler + + def wrapper(*args, **kwargs): + """Wrapper function to call `count()` to enforce computation. + + """ + + wrangler.fit_transform(*args, **kwargs).count() + + super().__init__(wrapper, repetitions) + + def profile(self, *dfs: SparkDataFrame, **kwargs): + """Profiles timing given input dataframes `dfs` which are passed to + `fit_transform`. + + Please note, input dataframes are cached before timing execution to + ensure timing measurements only capture wrangler's `fit_transform`. + This may cause problems if the size of input dataframes exceeds + available memory. 
+ + """ + + # cache input dataframes + dfs_cached = [df.cache() for df in dfs] + + # enforce caching calling count() action + for df in dfs_cached: + df.count() + + super().profile(*dfs_cached, **kwargs) + + # clear caches + for df in dfs_cached: + df.unpersist() + + return self diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index 3fd493b..891dc80 100644 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -15,12 +15,18 @@ MemoryProfiler, PandasMemoryProfiler, PandasTimeProfiler, + SparkTimeProfiler, TimeProfiler, allocate_memory ) from pywrangler.exceptions import NotProfiledError from pywrangler.wranglers.pandas.base import PandasSingleNoFit +try: + from pywrangler.wranglers.spark.base import SparkSingleNoFit +except ImportError: + SparkSingleNoFit = None + MIB = 2 ** 20 @@ -195,7 +201,6 @@ def dummy(): def test_time_profiler_properties(): - def dummy(): pass @@ -231,15 +236,59 @@ def dummy(): def test_pandas_time_profiler_fastest(): + """Basic test for pandas time profiler ensuring fastest timing is slower + than forced sleep. + + """ sleep = 0.0001 - df_dummy = pd.DataFrame() + df_input = pd.DataFrame() class DummyWrangler(PandasSingleNoFit): def transform(self, df): time.sleep(sleep) - pass + return df + + time_profiler = PandasTimeProfiler(DummyWrangler(), 1).profile(df_input) + + assert time_profiler.fastest >= sleep + + +@pytest.mark.pyspark +def test_spark_time_profiler_fastest(spark): + """Basic test for spark time profiler ensuring fastest timing is slower + than forced sleep. + + """ + + sleep = 0.0001 + df_input = spark.range(10).toDF("col") + + class DummyWrangler(SparkSingleNoFit): + def transform(self, df): + time.sleep(sleep) + return df + + time_profiler = SparkTimeProfiler(DummyWrangler(), 1).profile(df_input) + + assert time_profiler.fastest >= sleep + + +@pytest.mark.pyspark +def test_spark_time_profiler_no_caching(spark): + """Pyspark input dataframes are cached during time profiling. 
Ensure input + dataframes are released from caching after profiling. + + """ + + sleep = 0.0001 + df_input = spark.range(10).toDF("col") + + class DummyWrangler(SparkSingleNoFit): + def transform(self, df): + time.sleep(sleep) + return df - time_profiler = PandasTimeProfiler(DummyWrangler(), 1).profile(df_dummy) + SparkTimeProfiler(DummyWrangler(), 1).profile(df_input) - time_profiler.fastest >= sleep + assert df_input.is_cached is False From 9576941b0f78d9645d48fe2b31e779d02c26b2c1 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Wed, 3 Apr 2019 22:54:58 +0200 Subject: [PATCH 18/48] Add `DaskTimeProfiler`. --- src/pywrangler/benchmark.py | 92 +++++++++++++++++++++++++++++++++++-- tests/test_benchmark.py | 28 +++++++++++ 2 files changed, 115 insertions(+), 5 deletions(-) diff --git a/src/pywrangler/benchmark.py b/src/pywrangler/benchmark.py index b823234..3d596bc 100644 --- a/src/pywrangler/benchmark.py +++ b/src/pywrangler/benchmark.py @@ -174,6 +174,9 @@ class MemoryProfiler(BaseProfiler): Callable object to be memory profiled. repetitions: int, optional Number of repetitions. + interval: float, optional + Defines interval duration between consecutive memory usage + measurements in seconds. 
Notes ----- @@ -183,9 +186,10 @@ class MemoryProfiler(BaseProfiler): """ - def __init__(self, func, repetitions=5): + def __init__(self, func, repetitions: int = 5, interval: float = 0.01): self._func = func self._repetitions = repetitions + self._interval = interval self._max_usages = None self._baselines = None @@ -212,10 +216,12 @@ def profile(self, *args, **kwargs): while counter < self._repetitions: gc.collect() baseline = memory_usage()[0] - max_usage = memory_usage(mem_args, max_usage=True)[0] + max_usage = memory_usage(mem_args, + interval=self._interval, + max_usage=True) baselines.append(self._mb_to_bytes(baseline)) - max_usages.append(self._mb_to_bytes(max_usage)) + max_usages.append(self._mb_to_bytes(max_usage[0])) counter += 1 self._max_usages = max_usages @@ -487,6 +493,9 @@ class PandasMemoryProfiler(MemoryProfiler): The wrangler instance to be profiled. repetitions: int The number of measurements for memory profiling. + interval: float, optional + Defines interval duration between consecutive memory usage + measurements in seconds. Attributes ---------- @@ -503,13 +512,14 @@ class PandasMemoryProfiler(MemoryProfiler): """ - def __init__(self, wrangler: BaseWrangler, repetitions: int = 5): + def __init__(self, wrangler: BaseWrangler, repetitions: int = 5, + interval: float = 0.01): self._wrangler = wrangler self._usage_input = None self._usage_output = None - super().__init__(wrangler.fit_transform, repetitions) + super().__init__(wrangler.fit_transform, repetitions, interval) def profile(self, *dfs: pd.DataFrame, **kwargs): """Profiles the actual memory usage given input dataframes `dfs` @@ -695,5 +705,77 @@ def profile(self, *dfs: SparkDataFrame, **kwargs): # clear caches for df in dfs_cached: df.unpersist() + del df + + del dfs_cached + + return self + + +class DaskTimeProfiler(TimeProfiler): + """Approximate time that a dask wrangler instance requires to execute the + `fit_transform` step. 
+ + Please note, input dataframes are cached before timing execution to ensure + timing measurements only capture wrangler's `fit_transform`. This may cause + problems if the size of input dataframes exceeds available memory. + + Parameters + ---------- + wrangler: pywrangler.wranglers.base.BaseWrangler + The wrangler instance to be profiled. + repetitions: None, int, optional + Number of repetitions. If `None`, `timeit.Timer.autorange` will + determine a sensible default. + + Attributes + ---------- + timings: list + The timing measurements in seconds. + median: float + The median of the timing measurements in seconds. + standard_deviation: float + The standard deviation of the timing measurements in seconds. + fastast: float + The fastest value of the timing measurements in seconds. + repetitions: int + The number of measurements. + + """ + + def __init__(self, wrangler: BaseWrangler, + repetitions: Union[None, int] = None): + self._wrangler = wrangler + + def wrapper(*args, **kwargs): + """Wrapper function to call `compute()` to enforce computation. + + """ + + wrangler.fit_transform(*args, **kwargs).compute() + + super().__init__(wrapper, repetitions) + + def profile(self, *dfs: DaskDataFrame, **kwargs): + """Profiles timing given input dataframes `dfs` which are passed to + `fit_transform`. + + Please note, input dataframes are cached before timing execution to + ensure timing measurements only capture wrangler's `fit_transform`. + This may cause problems if the size of input dataframes exceeds + available memory. 
+ + """ + + # cache input dataframes + dfs_cached = [df.persist() for df in dfs] + + super().profile(*dfs_cached, **kwargs) + + # clear caches + for df in dfs_cached: + del df + + del dfs_cached return self diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index 891dc80..e73434c 100644 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -12,6 +12,7 @@ from pywrangler.benchmark import ( BaseProfiler, + DaskTimeProfiler, MemoryProfiler, PandasMemoryProfiler, PandasTimeProfiler, @@ -27,6 +28,11 @@ except ImportError: SparkSingleNoFit = None +try: + from pywrangler.wranglers.dask.base import DaskSingleNoFit +except ImportError: + DaskSingleNoFit = None + MIB = 2 ** 20 @@ -292,3 +298,25 @@ def transform(self, df): SparkTimeProfiler(DummyWrangler(), 1).profile(df_input) assert df_input.is_cached is False + + +@pytest.mark.dask +def test_dask_time_profiler_fastest(spark): + """Basic test for dask time profiler ensuring fastest timing is slower + than forced sleep. + + """ + + from dask import dataframe as dd + + sleep = 0.0001 + df_input = dd.from_pandas(pd.DataFrame(np.random.rand(10, 10)), 2) + + class DummyWrangler(DaskSingleNoFit): + def transform(self, df): + time.sleep(sleep) + return df + + time_profiler = DaskTimeProfiler(DummyWrangler(), 1).profile(df_input) + + assert time_profiler.fastest >= sleep From 4c1bb91a018d050628c8dd442ca9b22c79cb3ebc Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Wed, 3 Apr 2019 22:55:21 +0200 Subject: [PATCH 19/48] Add dask setup to tox. 
--- tox.ini | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tox.ini b/tox.ini index 0c8d8a6..8779c07 100644 --- a/tox.ini +++ b/tox.ini @@ -2,6 +2,7 @@ envlist = {py35,py36,py37}-pandas{0190,0191,0192,0200,0201,0202,0203,0210,0211,0220,0230,0231,0232,0233,0234,0240,0241} {py35,py36,py37}-pyspark{231,240} + {py35,py36,py37}-dask{115} flake8 skip_missing_interpreters = True @@ -34,6 +35,8 @@ deps = pyspark240: pyspark==2.4.0 pyspark231: pyspark==2.3.1 + dask115: dask[dataframe]==1.1.5 + setenv = PYWRANGLER_TEST_ENV = {envname} From ff9666357bb766afd25646f813b11f3918584365 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Wed, 3 Apr 2019 22:56:10 +0200 Subject: [PATCH 20/48] Add dask for TravisCI. --- .travis.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.travis.yml b/.travis.yml index 8ff1d1b..79ad3cd 100644 --- a/.travis.yml +++ b/.travis.yml @@ -34,6 +34,8 @@ env: - ENV_STRING=pyspark2.4.0 - ENV_STRING=pyspark2.3.1 + - ENV_STRING=dask1.1.5 + # Remove python/pandas version interactions which do not have wheels on pypi matrix: From a725f584f992d4e95b680e09a9fd34fb27795ddf Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Sat, 20 Apr 2019 15:08:20 +0200 Subject: [PATCH 21/48] Add `pretty_time_duration`. Rename `sizeof` to `pretty_file_size`. 
--- src/pywrangler/util/_pprint.py | 54 +++++++++++++++++++++++++++++----- tests/util/test_pprint.py | 28 ++++++++++++++---- 2 files changed, 69 insertions(+), 13 deletions(-) diff --git a/src/pywrangler/util/_pprint.py b/src/pywrangler/util/_pprint.py index d602168..70f282d 100644 --- a/src/pywrangler/util/_pprint.py +++ b/src/pywrangler/util/_pprint.py @@ -62,7 +62,7 @@ def header(name: str, indent: int = 0, underline: str = "-") -> str: _indent = " " * indent _header = _indent + name - _underline = _indent + underline*len(name) + "\n" + _underline = _indent + underline * len(name) + "\n" return _join([_header, _underline]) @@ -111,8 +111,8 @@ def enumeration(values: ENUM, indent: int = 0, bullet_char: str = "-", return _join(indented) -def sizeof(size: float, precision: int = 2, align: str = ">", - width=None) -> str: +def pretty_file_size(size: float, precision: int = 2, align: str = ">", + width: int = 0) -> str: """Helper function to format size in human readable format. Parameters @@ -138,10 +138,6 @@ def sizeof(size: float, precision: int = 2, align: str = ">", """ # noqa: E501 template = "{size:{align}{width}.{precision}f} {unit}B" - - if width is None: - width = precision + 5 - kwargs = dict(width=width, precision=precision, align=align) # iterate units (multiples of 1024 bytes) @@ -152,3 +148,47 @@ def sizeof(size: float, precision: int = 2, align: str = ">", else: return template.format(size=size, unit='Yi', **kwargs) + + +def pretty_time_duration(seconds: float, precision: int = 1, align: str = ">", + width: int = 0) -> str: + """Helper function to format time duration in human readable format. + + Parameters + ---------- + seconds: float + The size in seconds to be converted into human readable format. + precision: int, optional + Define shown precision. + align: {'<', '^', '>'}, optional + Format align specifier. + width: int + Define maximum width for number. + + Returns + ------- + human_fmt: str + Human readable representation of given `seconds`. 
+ + """ + + template = "{time_delta:{align}{width}.{precision}f} {unit}" + + units = [('year', 60 * 60 * 24 * 365), + ('month', 60 * 60 * 24 * 30), + ('d', 60 * 60 * 24), + ('h', 60 * 60), + ('min', 60), + ('s', 1), + ('ms', 1e-3), + ('µs', 1e-6), + ('ns', 1e-9)] + + for unit_name, unit_seconds in units: + if seconds > unit_seconds: + time_delta = seconds / unit_seconds + return template.format(time_delta=time_delta, + align=align, + width=width, + precision=precision, + unit=unit_name) diff --git a/tests/util/test_pprint.py b/tests/util/test_pprint.py index 35a71b0..82f5ef6 100644 --- a/tests/util/test_pprint.py +++ b/tests/util/test_pprint.py @@ -82,9 +82,25 @@ def test_enumeration_list_with_bullet(): assert _pprint.enumeration(test_input, bullet_char="o") == test_output -def test_sizeof(): - assert _pprint.sizeof(1024, precision=1, width=0) == '1.0 KiB' - assert _pprint.sizeof(1024, precision=1) == ' 1.0 KiB' - assert _pprint.sizeof(1024, precision=1, align="<") == '1.0 KiB' - assert _pprint.sizeof(1024 ** 2, precision=1, width=0) == '1.0 MiB' - assert _pprint.sizeof(1024 ** 8, precision=2, width=0) == '1.00 YiB' +def test_pretty_file_size(): + pfs = _pprint.pretty_file_size + + assert pfs(1024, precision=1, width=4) == ' 1.0 KiB' + assert pfs(1024, precision=1, width=4, align="<") == '1.0 KiB' + assert pfs(1024, precision=1) == '1.0 KiB' + assert pfs(1024 ** 2, precision=1, width=0) == '1.0 MiB' + assert pfs(1024 ** 8, precision=2, width=0) == '1.00 YiB' + + +def test_pretty_time_duration(): + ptd = _pprint.pretty_time_duration + + assert ptd(1.1) == "1.1 s" + assert ptd(1.59, width=5) == " 1.6 s" + assert ptd(1.55, width=7, precision=2) == " 1.55 s" + assert ptd(1.55, width=7, precision=2, align="<") == "1.55 s" + assert ptd(120, precision=2) == "2.00 min" + assert ptd(5400, precision=1) == "1.5 h" + assert ptd(0.5, precision=1) == "500.0 ms" + assert ptd(0.0005, precision=1) == "500.0 µs" + assert ptd(0.0000005, precision=1) == "500.0 ns" From 
a8381611d7a34ef6334b31413fd3c209c9ca14fe Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Fri, 26 Apr 2019 22:22:47 +0200 Subject: [PATCH 22/48] Move spark benchmark utility into spark subpackage. --- src/pywrangler/wranglers/spark/benchmark.py | 96 +++++++++++++++++++++ tests/wranglers/spark/test_benchmark.py | 52 +++++++++++ 2 files changed, 148 insertions(+) create mode 100644 src/pywrangler/wranglers/spark/benchmark.py create mode 100644 tests/wranglers/spark/test_benchmark.py diff --git a/src/pywrangler/wranglers/spark/benchmark.py b/src/pywrangler/wranglers/spark/benchmark.py new file mode 100644 index 0000000..ef5cbf2 --- /dev/null +++ b/src/pywrangler/wranglers/spark/benchmark.py @@ -0,0 +1,96 @@ +"""This module contains benchmarking utility for pandas wranglers. + +""" + +from typing import Union + +from pyspark.sql import DataFrame + +from pywrangler.benchmark import TimeProfiler +from pywrangler.wranglers.spark.base import SparkWrangler + + +class SparkTimeProfiler(TimeProfiler): + """Approximate time that a spark wrangler instance requires to execute the + `fit_transform` step. + + Please note, input dataframes are cached before timing execution to ensure + timing measurements only capture wrangler's `fit_transform`. This may cause + problems if the size of input dataframes exceeds available memory. + + Parameters + ---------- + wrangler: pywrangler.wranglers.base.BaseWrangler + The wrangler instance to be profiled. + repetitions: None, int, optional + Number of repetitions. If `None`, `timeit.Timer.autorange` will + determine a sensible default. + + Attributes + ---------- + measurements: list + The actual profiling measurements in seconds. + best: float + The best measurement in seconds. + median: float + The median of measurements in seconds. + worst: float + The worst measurement in seconds. + std: float + The standard deviation of measurements in seconds. + runs: int + The number of measurements. 
+ + Methods + ------- + profile + Contains the actual profiling implementation. + report + Print simple report consisting of best, median, worst, standard + deviation and the number of measurements. + profile_report + Calls profile and report in sequence. + + """ + + def __init__(self, wrangler: SparkWrangler, + repetitions: Union[None, int] = None): + self._wrangler = wrangler + + def wrapper(*args, **kwargs): + """Wrapper function to call `count()` to enforce computation. + + """ + + wrangler.fit_transform(*args, **kwargs).count() + + super().__init__(wrapper, repetitions) + + def profile(self, *dfs: DataFrame, **kwargs): + """Profiles timing given input dataframes `dfs` which are passed to + `fit_transform`. + + Please note, input dataframes are cached before timing execution to + ensure timing measurements only capture wrangler's `fit_transform`. + This may cause problems if the size of input dataframes exceeds + available memory. + + """ + + # cache input dataframes + dfs_cached = [df.cache() for df in dfs] + + # enforce caching calling count() action + for df in dfs_cached: + df.count() + + super().profile(*dfs_cached, **kwargs) + + # clear caches + for df in dfs_cached: + df.unpersist() + del df + + del dfs_cached + + return self diff --git a/tests/wranglers/spark/test_benchmark.py b/tests/wranglers/spark/test_benchmark.py new file mode 100644 index 0000000..c353448 --- /dev/null +++ b/tests/wranglers/spark/test_benchmark.py @@ -0,0 +1,52 @@ +"""This module contains tests for spark benchmarks. 
+ +isort:skip_file +""" + +import time + +import pytest + +pytestmark = pytest.mark.pyspark # noqa: E402 +pyspark = pytest.importorskip("pyspark") # noqa: E402 + +from pywrangler.wranglers.spark.base import SparkSingleNoFit +from pywrangler.wranglers.spark.benchmark import SparkTimeProfiler + +SLEEP = 0.0001 + + +@pytest.fixture +def wrangler_sleeps(): + class DummyWrangler(SparkSingleNoFit): + def transform(self, df): + time.sleep(SLEEP) + return df + + return DummyWrangler + + +def test_spark_time_profiler_fastest(spark, wrangler_sleeps): + """Basic test for spark time profiler ensuring fastest timing is slower + than forced sleep. + + """ + + df_input = spark.range(10).toDF("col") + + time_profiler = SparkTimeProfiler(wrangler_sleeps(), 1).profile(df_input) + + assert time_profiler.best >= SLEEP + + +def test_spark_time_profiler_no_caching(spark, wrangler_sleeps): + """Pyspark input dataframes are cached during time profiling. Ensure input + dataframes are released from caching after profiling. + + """ + + df_input = spark.range(10).toDF("col") + + SparkTimeProfiler(wrangler_sleeps(), 1).profile(df_input) + + assert df_input.is_cached is False From 3e6b6eede1cb6e2481114b24b2966872cffdfa92 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Fri, 26 Apr 2019 22:24:30 +0200 Subject: [PATCH 23/48] Reuse `get_param_names` helper function. 
--- src/pywrangler/wranglers/base.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/pywrangler/wranglers/base.py b/src/pywrangler/wranglers/base.py index 94b02a4..0ee47df 100644 --- a/src/pywrangler/wranglers/base.py +++ b/src/pywrangler/wranglers/base.py @@ -3,9 +3,8 @@ """ -import inspect - from pywrangler.util import _pprint +from pywrangler.util.helper import get_param_names class BaseWrangler: @@ -60,11 +59,7 @@ def get_params(self) -> dict: """ - init = self.__class__.__init__ - signature = inspect.signature(init) - parameters = signature.parameters.values() - - param_names = [x.name for x in parameters if x.name != "self"] + param_names = get_param_names(self.__class__.__init__, ["self"]) param_dict = {x: getattr(self, x) for x in param_names} return param_dict From 63723e1e02b953e283de693a90689bb1131e9acf Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Fri, 26 Apr 2019 22:28:10 +0200 Subject: [PATCH 24/48] Refactor `BaseProfiler`, `MemoryProfiler` and `TimeProfiler` to use common interface for measurements. Remove wrangler specific benchmark functions. 
--- src/pywrangler/benchmark.py | 781 ++++++++++++------------------------ tests/test_benchmark.py | 325 +++++---------- 2 files changed, 372 insertions(+), 734 deletions(-) diff --git a/src/pywrangler/benchmark.py b/src/pywrangler/benchmark.py index 3d596bc..7111571 100644 --- a/src/pywrangler/benchmark.py +++ b/src/pywrangler/benchmark.py @@ -3,29 +3,20 @@ """ import gc -import inspect import sys import timeit -from typing import Any, Callable, Iterable, List, Union +from typing import Callable, Iterable, List, Union import numpy as np -import pandas as pd from pywrangler.exceptions import NotProfiledError -from pywrangler.util import sanitizer -from pywrangler.util._pprint import enumeration, header, sizeof -from pywrangler.util.helper import cached_property -from pywrangler.wranglers.base import BaseWrangler - -try: - from pyspark.sql import DataFrame as SparkDataFrame -except ImportError: - SparkDataFrame = Any - -try: - from dask.dataframe import DataFrame as DaskDataFrame -except ImportError: - DaskDataFrame = Any +from pywrangler.util._pprint import ( + enumeration, + header, + pretty_file_size, + pretty_time_duration +) +from pywrangler.util.helper import get_param_names def allocate_memory(size: float) -> np.ndarray: @@ -57,51 +48,128 @@ def allocate_memory(size: float) -> np.ndarray: class BaseProfiler: - """Base class defining interface and common helper methods for memory and - time profiler. + """Base class defining the interface for all profilers. + + Subclasses have to implement `profile` (the actual profiling + implementation) and `less_is_better` (defining the ranking of profiling + measurements). + + Attributes + ---------- + measurements: list + The actual profiling measurements. + best: float + The best measurement. + median: float + The median of measurements. + worst: float + The worst measurement. + std: float + The standard deviation of measurements. + runs: int + The number of measurements. 
- By convention, the profiled object should always be the first argument - (ignoring self) passed to `__init__`. - All public profiling metrics have to should be defined as properties. All - private attributes need to start with an underscore. + Methods + ------- + profile + Contains the actual profiling implementation. + report + Print simple report consisting of best, median, worst, standard + deviation and the number of measurements. + profile_report + Calls profile and report in sequence. """ - def profile(self, *args, **kwargs): - """Contains the actual profiling implementation and should always - return self. + @property + def measurements(self) -> List[float]: + """Return measurements of profiling. + + """ + + self._check_is_profiled(["_measurements"]) + + return self._measurements + + @property + def best(self) -> float: + """Returns the best measurement. + + """ + + if self.less_is_better: + return np.min(self.measurements) + else: + return np.max(self.measurements) + + @property + def median(self) -> float: + """Returns the median of measurements. + + """ + + return np.median(self.measurements) + + @property + def worst(self) -> float: + """Returns the worst measurement. + + """ + + if self.less_is_better: + return np.max(self.measurements) + else: + return np.min(self.measurements) + + @property + def std(self) -> float: + """Returns the standard deviation of measurements. + + """ + + return np.std(self.measurements) + + @property + def runs(self) -> int: + """Return number of measurements. + + """ + + return len(self.measurements) + + @property + def less_is_better(self) -> bool: + """Defines ranking of measurements. """ raise NotImplementedError - def report(self): - """Print simple report consisting of the name of the profiler class, - the name of the profiled object, and all defined metrics/properties. + def profile(self, *args, **kwargs): + """Contains the actual profiling implementation and has to set + `self._measurements`. Always returns self. 
""" - # get name of profiler - profiler_name = self.__class__.__name__ + raise NotImplementedError - # get name of profiled object - parameters = inspect.signature(self.__init__).parameters.keys() - profiled_object = getattr(self, '_{}'.format(list(parameters)[0])) + def report(self): + """Print simple report consisting of best, median, worst, standard + deviation and the number of measurements. + + """ - try: - profiled_obj_name = profiled_object.__name__ - except AttributeError: - profiled_obj_name = profiled_object.__class__.__name__ + tpl = "{best} {sign} {median} {sign} {worst} ± {std} ({runs} runs)" - # get relevant metrics - ignore = ('profile', 'report', 'profile_report') - metric_names = [x for x in dir(self) - if not x.startswith('_') - and x not in ignore] - metric_values = {x: getattr(self, x) for x in metric_names} + fmt = self._pretty_formatter + values = {"best": fmt(self.best), + "median": fmt(self.median), + "worst": fmt(self.worst), + "std": fmt(self.std), + "runs": self.runs, + "sign": "<" if self.less_is_better else ">"} - print(header('{}: {}'.format(profiler_name, profiled_obj_name)), '\n', - enumeration(metric_values), sep='') + print(tpl.format(**values)) def profile_report(self, *args, **kwargs): """Calls profile and report in sequence. @@ -110,6 +178,25 @@ def profile_report(self, *args, **kwargs): self.profile(*args, **kwargs).report() + def _pretty_formatter(self, value: float) -> str: + """String formatter for human readable output of given input `value`. + Should be replaced with sensible formatters for file size or time + duration. + + Parameters + ---------- + value: float + Numeric value to be formatted. + + Returns + ------- + pretty_string: str + Human readable representation of `value`. + + """ + + return str(value) + def _check_is_profiled(self, attributes: Iterable[str]) -> None: """Check if `profile` was already called by ensuring passed attributes are not `None`. 
@@ -133,30 +220,26 @@ def _check_is_profiled(self, attributes: Iterable[str]) -> None: """ - if any([getattr(self, x) is None for x in attributes]): + if any([getattr(self, x, None) is None for x in attributes]): msg = ("This {}'s instance is not profiled yet. Call 'profile' " "with appropriate arguments before using this method." .format(self.__class__.__name__)) raise NotProfiledError(msg) - @staticmethod - def _mb_to_bytes(size_mib: float) -> int: - """Helper method to convert MiB to Bytes. + def __repr__(self): + """Print representation of profiler instance. - Parameters - ---------- - size_mib: float - Size in MiB + """ - Returns - ------- - size_bytes: int - Size in bytes. + # get name of profiler + profiler_name = self.__class__.__name__ - """ + # get parameter names + param_names = get_param_names(self.__class__.__init__, ["self"]) + param_dict = {x: getattr(self, x) for x in param_names} - return int(size_mib * (2 ** 20)) + return header(profiler_name) + enumeration(param_dict) class MemoryProfiler(BaseProfiler): @@ -165,6 +248,8 @@ class MemoryProfiler(BaseProfiler): usage during function execution and the baseline memory usage before function execution. + Note, memory consumption of child processes are included. + In addition, compute the mean increase in baseline memory usage between repetitions which might indicate memory leakage. @@ -178,6 +263,33 @@ class MemoryProfiler(BaseProfiler): Defines interval duration between consecutive memory usage measurements in seconds. + Attributes + ---------- + measurements: list + The actual profiling measurements in bytes. + best: float + The best measurement in bytes. + median: float + The median of measurements in bytes. + worst: float + The worst measurement in bytes. + std: float + The standard deviation of measurements in bytes. + runs: int + The number of measurements. + baseline_change: float + The median change in baseline memory usage across all runs in bytes. 
+ + Methods + ------- + profile + Contains the actual profiling implementation. + report + Print simple report consisting of best, median, worst, standard + deviation and the number of measurements. + profile_report + Calls profile and report in sequence. + Notes ----- The implementation is based on `memory_profiler` and is inspired by the @@ -186,13 +298,11 @@ class MemoryProfiler(BaseProfiler): """ - def __init__(self, func, repetitions: int = 5, interval: float = 0.01): - self._func = func - self._repetitions = repetitions - self._interval = interval - - self._max_usages = None - self._baselines = None + def __init__(self, func: Callable, repetitions: int = 5, + interval: float = 0.01): + self.func = func + self.repetitions = repetitions + self.interval = interval def profile(self, *args, **kwargs): """Executes the actual memory profiling. @@ -211,14 +321,16 @@ def profile(self, *args, **kwargs): counter = 0 baselines = [] max_usages = [] - mem_args = (self._func, args, kwargs) - while counter < self._repetitions: + func_args = (self.func, args, kwargs) + mem_args = dict(interval=self.interval, + multiprocess=True, + max_usage=True) + + while counter < self.repetitions: gc.collect() - baseline = memory_usage()[0] - max_usage = memory_usage(mem_args, - interval=self._interval, - max_usage=True) + baseline = memory_usage(**mem_args) + max_usage = memory_usage(func_args, **mem_args) baselines.append(self._mb_to_bytes(baseline)) max_usages.append(self._mb_to_bytes(max_usage[0])) @@ -226,12 +338,21 @@ def profile(self, *args, **kwargs): self._max_usages = max_usages self._baselines = baselines + self._measurements = np.subtract(max_usages, baselines).tolist() return self + @property + def less_is_better(self) -> bool: + """Less memory consumption is better. + + """ + + return True + @property def max_usages(self) -> List[int]: - """Returns the absolute, maximum memory usages for each iteration in + """Returns the absolute, maximum memory usages for each run in bytes. 
""" @@ -242,7 +363,7 @@ def max_usages(self) -> List[int]: @property def baselines(self) -> List[int]: - """Returns the absolute, baseline memory usages for each iteration in + """Returns the absolute, baseline memory usages for each run in bytes. The baseline memory usage is defined as the memory usage before function execution. @@ -253,43 +374,49 @@ def baselines(self) -> List[int]: return self._baselines @property - def increases(self) -> List[int]: - """Returns the absolute memory increase for each iteration in bytes. - The memory increase is defined as the difference between the maximum - memory usage during function execution and the baseline memory usage + def baseline_change(self) -> float: + """Returns the median change in baseline memory usage across all + run. The baseline memory usage is defined as the memory usage before function execution. - """ - return np.subtract(self.max_usages, self.baselines).tolist() + changes = np.diff(self.baselines) + return float(np.median(changes)) - @property - def increases_mean(self) -> float: - """Returns the mean of the absolute memory increases across all - iterations. + def _pretty_formatter(self, value: float) -> str: + """String formatter for human readable output of given input `value`. + + Parameters + ---------- + value: float + Numeric value to be formatted. + + Returns + ------- + pretty_string: str + Human readable representation of `value`. """ - return float(np.mean(self.increases)) + return pretty_file_size(value) - @property - def increases_std(self) -> float: - """Returns the standard variation of the absolute memory increases - across all iterations. + @staticmethod + def _mb_to_bytes(size_mib: float) -> int: + """Helper method to convert MiB to Bytes. - """ + Parameters + ---------- + size_mib: float + Size in MiB - return float(np.std(self.increases)) + Returns + ------- + size_bytes: int + Size in bytes. 
- @property - def baseline_change(self) -> float: - """Returns the mean change in baseline memory usage across all - all iterations. The baseline memory usage is defined as the memory - usage before function execution. """ - changes = np.diff(self.baselines) - return float(np.mean(changes)) + return int(size_mib * (2 ** 20)) class TimeProfiler(BaseProfiler): @@ -307,17 +434,29 @@ class TimeProfiler(BaseProfiler): Attributes ---------- - timings: list - The timing measurements in seconds. + measurements: list + The actual profiling measurements in seconds. + best: float + The best measurement in seconds. median: float - The median of the timing measurements in seconds. - standard_deviation: float - The standard deviation of the timing measurements in seconds. - fastast: float - The fastest value of the timing measurements in seconds. - repetitions: int + The median of measurements in seconds. + worst: float + The worst measurement in seconds. + std: float + The standard deviation of measurements in seconds. + runs: int The number of measurements. + Methods + ------- + profile + Contains the actual profiling implementation. + report + Print simple report consisting of best, median, worst, standard + deviation and the number of measurements. + profile_report + Calls profile and report in sequence. + Notes ----- The implementation is based on standard library's `timeit` module. @@ -325,13 +464,8 @@ class TimeProfiler(BaseProfiler): """ def __init__(self, func: Callable, repetitions: Union[None, int] = None): - self._func = func - self._repetitions = repetitions - - self._timings = None - self._timings_mean = None - self._timings_std = None - self._fastest = None + self.func = func + self.repetitions = repetitions def profile(self, *args, **kwargs): """Executes the actual time profiling. 
@@ -351,431 +485,40 @@ def wrapper(): """ - self._func(*args, **kwargs) + self.func(*args, **kwargs) timer = timeit.Timer(stmt=wrapper) - if self._repetitions is None: + if self.repetitions is None: repeat, _ = timer.autorange(None) else: - repeat = self._repetitions + repeat = self.repetitions - self._timings = timer.repeat(number=1, repeat=repeat) + self._measurements = timer.repeat(number=1, repeat=repeat) return self @property - def timings(self) -> List[float]: - """Returns the timeit measurements in seconds. + def less_is_better(self) -> bool: + """Less time required is better. """ - return self._timings + return True - @property - def median(self) -> float: - """Returns the median of all timeit measurements in seconds. - - """ - - self._check_is_profiled(['_timings']) - - return float(np.median(self._timings)) - - @property - def standard_deviation(self) -> float: - """Returns the standard deviation of all timeit measurements in - seconds. - - """ - - self._check_is_profiled(['_timings']) - - return float(np.std(self._timings)) - - @property - def fastest(self) -> float: - """Returns the fastest timing measurement in seconds. - - """ - - self._check_is_profiled(['_timings']) - - return min(self._timings) - - @property - def repetitions(self) -> int: - """Returns the number of measurements. - - """ - - return len(self._timings) - - def report(self): - """Profile time via `profile` and provide human readable report. - - Returns - ------- - None. Prints report to stdout. 
- - """ - - enum_kwargs = dict(align_width=15, bullet_char="") - - # string part for header - wrangler_name = self._wrangler.__class__.__name__ - str_header = header("{} - time profiling".format(wrangler_name)) - - # string part for values - dict_values = {"Fastest": "{:.2f}s".format(self.fastest), - "Median": "{:.2f}s".format(self.median), - "Std": "{:.2f}s".format(self.standard_deviation), - "Repetitions": self.repetitions} - - str_values = enumeration(dict_values, **enum_kwargs) - - # build complete string and print - template = "{}\n{}\n" - report_string = template.format(str_header, str_values) - - print(report_string) - - -class PandasTimeProfiler(TimeProfiler): - """Approximate time that a pandas wrangler instance requires to execute the - `fit_transform` step. - - Parameters - ---------- - wrangler: pywrangler.wranglers.base.BaseWrangler - The wrangler instance to be profiled. - repetitions: None, int, optional - Number of repetitions. If `None`, `timeit.Timer.autorange` will - determine a sensible default. - - Attributes - ---------- - timings: list - The timing measurements in seconds. - median: float - The median of the timing measurements in seconds. - standard_deviation: float - The standard deviation of the timing measurements in seconds. - fastast: float - The fastest value of the timing measurements in seconds. - repetitions: int - The number of measurements. - - """ - - def __init__(self, wrangler: BaseWrangler, - repetitions: Union[None, int] = None): - self._wrangler = wrangler - super().__init__(wrangler.fit_transform, repetitions) - - -class PandasMemoryProfiler(MemoryProfiler): - """Approximate memory usage that a pandas wrangler instance requires to - execute the `fit_transform` step. - - As a key metric, `ratio` is computed. It refers to the amount of - memory which is required to execute the `fit_transform` step. 
More - concretely, it estimates how much more memory is used standardized by the - input memory usage (memory usage increase during function execution divided - by memory usage of input dataframes). In other words, if you have a 1GB - input dataframe, and the `usage_ratio` is 5, `fit_transform` needs 5GB free - memory available to succeed. A `usage_ratio` of 0.5 given a 2GB input - dataframe would require 1GB free memory available for computation. - - Parameters - ---------- - wrangler: pywrangler.wranglers.pandas.base.PandasWrangler - The wrangler instance to be profiled. - repetitions: int - The number of measurements for memory profiling. - interval: float, optional - Defines interval duration between consecutive memory usage - measurements in seconds. - - Attributes - ---------- - increases_mean: float - The mean of the absolute memory increases across all iterations in - bytes. - input: int - Memory usage of input dataframes in bytes. - output: int - Memory usage of output dataframes in bytes. - ratio: float - The amount of memory required for computation in units of input - memory usage. - - """ - - def __init__(self, wrangler: BaseWrangler, repetitions: int = 5, - interval: float = 0.01): - self._wrangler = wrangler - - self._usage_input = None - self._usage_output = None - - super().__init__(wrangler.fit_transform, repetitions, interval) - - def profile(self, *dfs: pd.DataFrame, **kwargs): - """Profiles the actual memory usage given input dataframes `dfs` - which are passed to `fit_transform`. - - """ - - # usage input - self._usage_input = self._memory_usage_dfs(*dfs) - - # usage output - dfs_output = self._wrangler.fit_transform(*dfs) - dfs_output = sanitizer.ensure_tuple(dfs_output) - self._usage_output = self._memory_usage_dfs(*dfs_output) - - # usage during fit_transform - super().profile(*dfs, **kwargs) - - return self - - @property - def input(self) -> float: - """Returns the memory usage of the input dataframes in bytes. 
- - """ - - self._check_is_profiled(['_usage_input']) - return self._usage_input - - @property - def output(self) -> float: - """Returns the memory usage of the output dataframes in bytes. - - """ - - self._check_is_profiled(['_usage_output']) - return self._usage_output - - @cached_property - def ratio(self) -> float: - """Refers to the amount of memory which is required to execute the - `fit_transform` step. More concretely, it estimates how much more - memory is used standardized by the input memory usage (memory usage - increase during function execution divided by memory usage of input - dataframes). In other words, if you have a 1GB input dataframe, and the - `usage_ratio` is 5, `fit_transform` needs 5GB free memory available to - succeed. A `usage_ratio` of 0.5 given a 2GB input dataframe would - require 1GB free memory available for computation. - - """ - - return self.increases_mean / self.input - - def report(self): - """Profile memory usage via `profile` and provide human readable - report including memory usage of input and output dataframes, memory - usage during `fit_transform`, the usage ratio and shows if - the wrangler may have side effects in regard to memory consumption via - the change in baseline memory usage. - - Returns - ------- - None. Prints report to stdout. 
- - """ - - enum_kwargs = dict(align_width=15, bullet_char="") - - # string part for header - wrangler_name = self._wrangler.__class__.__name__ - str_header = header("{} - memory usage".format(wrangler_name)) - - # string part for input and output dfs - dict_dfs = {"Input dfs": sizeof(self.input), - "Ouput dfs": sizeof(self.output)} - - str_dfs = enumeration(dict_dfs, **enum_kwargs) - - # string part for transform/fit and ratio - str_inc = sizeof(self.increases_mean) - str_std = sizeof(self.increases_std, width=0) - str_inc += " (Std: {})".format(str_std) - str_ratio = "{:>7.2f}".format(self.ratio) - str_baseline_change = sizeof(self.baseline_change) - dict_inc = {"Fit/Transform": str_inc, - "Ratio": str_ratio, - "Baseline change": str_baseline_change} - - str_inc = enumeration(dict_inc, **enum_kwargs) - - # build complete string and print - template = "{}\n{}\n\n{}" - report_string = template.format(str_header, str_dfs, str_inc) - - print(report_string) - - @staticmethod - def _memory_usage_dfs(*dfs: pd.DataFrame) -> int: - """Return memory usage in bytes for all given dataframes. + def _pretty_formatter(self, value: float) -> str: + """String formatter for human readable output of given input `value`. Parameters ---------- - dfs: pd.DataFrame - The pandas dataframes for which memory usage should be computed. + value: float + Numeric value to be formatted. Returns ------- - memory_usage: int - The computed memory usage in bytes. - - """ - - mem_usages = [df.memory_usage(deep=True, index=True).sum() - for df in dfs] - - return int(np.sum(mem_usages)) - - -class SparkTimeProfiler(TimeProfiler): - """Approximate time that a spark wrangler instance requires to execute the - `fit_transform` step. - - Please note, input dataframes are cached before timing execution to ensure - timing measurements only capture wrangler's `fit_transform`. This may cause - problems if the size of input dataframes exceeds available memory. 
- - Parameters - ---------- - wrangler: pywrangler.wranglers.base.BaseWrangler - The wrangler instance to be profiled. - repetitions: None, int, optional - Number of repetitions. If `None`, `timeit.Timer.autorange` will - determine a sensible default. - - Attributes - ---------- - timings: list - The timing measurements in seconds. - median: float - The median of the timing measurements in seconds. - standard_deviation: float - The standard deviation of the timing measurements in seconds. - fastast: float - The fastest value of the timing measurements in seconds. - repetitions: int - The number of measurements. - - """ - - def __init__(self, wrangler: BaseWrangler, - repetitions: Union[None, int] = None): - self._wrangler = wrangler - - def wrapper(*args, **kwargs): - """Wrapper function to call `count()` to enforce computation. - - """ - - wrangler.fit_transform(*args, **kwargs).count() - - super().__init__(wrapper, repetitions) - - def profile(self, *dfs: SparkDataFrame, **kwargs): - """Profiles timing given input dataframes `dfs` which are passed to - `fit_transform`. - - Please note, input dataframes are cached before timing execution to - ensure timing measurements only capture wrangler's `fit_transform`. - This may cause problems if the size of input dataframes exceeds - available memory. + pretty_string: str + Human readable representation of `value`. """ - # cache input dataframes - dfs_cached = [df.cache() for df in dfs] - - # enforce caching calling count() action - for df in dfs_cached: - df.count() - - super().profile(*dfs_cached, **kwargs) - - # clear caches - for df in dfs_cached: - df.unpersist() - del df - - del dfs_cached - - return self - - -class DaskTimeProfiler(TimeProfiler): - """Approximate time that a dask wrangler instance requires to execute the - `fit_transform` step. - - Please note, input dataframes are cached before timing execution to ensure - timing measurements only capture wrangler's `fit_transform`. 
This may cause - problems if the size of input dataframes exceeds available memory. - - Parameters - ---------- - wrangler: pywrangler.wranglers.base.BaseWrangler - The wrangler instance to be profiled. - repetitions: None, int, optional - Number of repetitions. If `None`, `timeit.Timer.autorange` will - determine a sensible default. - - Attributes - ---------- - timings: list - The timing measurements in seconds. - median: float - The median of the timing measurements in seconds. - standard_deviation: float - The standard deviation of the timing measurements in seconds. - fastast: float - The fastest value of the timing measurements in seconds. - repetitions: int - The number of measurements. - - """ - - def __init__(self, wrangler: BaseWrangler, - repetitions: Union[None, int] = None): - self._wrangler = wrangler - - def wrapper(*args, **kwargs): - """Wrapper function to call `compute()` to enforce computation. - - """ - - wrangler.fit_transform(*args, **kwargs).compute() - - super().__init__(wrapper, repetitions) - - def profile(self, *dfs: DaskDataFrame, **kwargs): - """Profiles timing given input dataframes `dfs` which are passed to - `fit_transform`. - - Please note, input dataframes are cached before timing execution to - ensure timing measurements only capture wrangler's `fit_transform`. - This may cause problems if the size of input dataframes exceeds - available memory. 
- - """ - - # cache input dataframes - dfs_cached = [df.persist() for df in dfs] - - super().profile(*dfs_cached, **kwargs) - - # clear caches - for df in dfs_cached: - del df - - del dfs_cached - - return self + return pretty_time_duration(value) diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index e73434c..e126b24 100644 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -7,33 +7,23 @@ import pytest -import numpy as np -import pandas as pd - from pywrangler.benchmark import ( BaseProfiler, - DaskTimeProfiler, MemoryProfiler, - PandasMemoryProfiler, - PandasTimeProfiler, - SparkTimeProfiler, TimeProfiler, allocate_memory ) from pywrangler.exceptions import NotProfiledError -from pywrangler.wranglers.pandas.base import PandasSingleNoFit -try: - from pywrangler.wranglers.spark.base import SparkSingleNoFit -except ImportError: - SparkSingleNoFit = None +MIB = 2 ** 20 -try: - from pywrangler.wranglers.dask.base import DaskSingleNoFit -except ImportError: - DaskSingleNoFit = None -MIB = 2 ** 20 +@pytest.fixture() +def func_no_effect(): + def func(): + pass + + return func def test_allocate_memory_empty(): @@ -51,7 +41,7 @@ def test_allocate_memory_5mb(): def test_base_profiler_not_implemented(): base_profiler = BaseProfiler() - for will_raise in ('profile', 'profile_report'): + for will_raise in ('profile', 'profile_report', 'less_is_better'): with pytest.raises(NotImplementedError): getattr(base_profiler, will_raise)() @@ -67,169 +57,155 @@ def test_base_profiler_check_is_profiled(): base_profiler._check_is_profiled(['_is_set']) -def test_base_profiler_mb_to_bytes(): - assert BaseProfiler._mb_to_bytes(1) == 1048576 - assert BaseProfiler._mb_to_bytes(1.5) == 1572864 - assert BaseProfiler._mb_to_bytes(0.33) == 346030 - - -def test_memory_profiler_return_self(): - def dummy(): - pass - - memory_profiler = MemoryProfiler(dummy) - assert memory_profiler.profile() is memory_profiler - - -def test_memory_profiler_properties(): - def dummy(): - 
pass - - memory_profiler = MemoryProfiler(dummy) - memory_profiler._baselines = [0, 1, 2, 3] - memory_profiler._max_usages = [4, 5, 7, 8] - - assert memory_profiler.max_usages == memory_profiler._max_usages - assert memory_profiler.baselines == memory_profiler._baselines - assert memory_profiler.increases == [4, 4, 5, 5] - assert memory_profiler.increases_mean == 4.5 - assert memory_profiler.increases_std == 0.5 - assert memory_profiler.baseline_change == 1 - - -def test_memory_profiler_no_side_effect(): - def no_side_effect(): - dummy = 5 - return dummy - - assert MemoryProfiler(no_side_effect).profile().baseline_change < 0.5 * MIB - +def test_base_profiler_measurements_less_is_better(capfd): + measurements = range(7) -def test_memory_profiler_side_effect(): - side_effect_container = [] + class Profiler(BaseProfiler): - def side_effect(): - memory_holder = allocate_memory(5) - side_effect_container.append(memory_holder) + @property + def less_is_better(self): + return True - return memory_holder + def profile(self, *args, **kwargs): + self._measurements = measurements + return self - assert MemoryProfiler(side_effect).profile().baseline_change > 4.9 * MIB + def _pretty_formatter(self, value): + return "{:.0f}".format(value) + base_profiler = Profiler() + base_profiler.profile_report() -def test_memory_profiler_no_increase(): - def no_increase(): - pass + assert base_profiler.median == 3 + assert base_profiler.best == 0 + assert base_profiler.worst == 6 + assert base_profiler.std == 2 + assert base_profiler.runs == 7 + assert base_profiler.measurements == measurements - assert MemoryProfiler(no_increase).profile().increases_mean < 0.1 * MIB - assert MemoryProfiler(no_increase).profile().increases_std < 0.1 * MIB + out, _ = capfd.readouterr() + assert out == "0 < 3 < 6 ± 2 (7 runs)\n" -def test_memory_profiler_increase(): - def increase(): - memory_holder = allocate_memory(30) - return memory_holder +def test_base_profiler_measurements_more_is_better(capfd): + 
measurements = range(7) - assert MemoryProfiler(increase).profile().increases_mean > 29 * MIB + class Profiler(BaseProfiler): + @property + def less_is_better(self): + return False + def profile(self, *args, **kwargs): + self._measurements = measurements + return self -def test_pandas_memory_profiler_memory_usage_dfs(): - df1 = pd.DataFrame(np.random.rand(10)) - df2 = pd.DataFrame(np.random.rand(10)) + def _pretty_formatter(self, value): + return "{:.0f}".format(value) - test_input = [df1, df2] - test_output = int(df1.memory_usage(index=True, deep=True).sum() + - df2.memory_usage(index=True, deep=True).sum()) + base_profiler = Profiler() + base_profiler.profile_report() - assert PandasMemoryProfiler._memory_usage_dfs(*test_input) == test_output + assert base_profiler.median == 3 + assert base_profiler.best == 6 + assert base_profiler.worst == 0 + assert base_profiler.std == 2 + assert base_profiler.runs == 7 + assert base_profiler.measurements == measurements + out, _ = capfd.readouterr() + assert out == "6 > 3 > 0 ± 2 (7 runs)\n" -def test_pandas_memory_profiler_return_self(): - class DummyWrangler(PandasSingleNoFit): - def transform(self, df): - return pd.DataFrame() - memory_profiler = PandasMemoryProfiler(DummyWrangler()) +def test_memory_profiler_mb_to_bytes(): + assert MemoryProfiler._mb_to_bytes(1) == 1048576 + assert MemoryProfiler._mb_to_bytes(1.5) == 1572864 + assert MemoryProfiler._mb_to_bytes(0.33) == 346030 - assert memory_profiler is memory_profiler.profile(pd.DataFrame()) +def test_memory_profiler_return_self(func_no_effect): + memory_profiler = MemoryProfiler(func_no_effect) + assert memory_profiler.profile() is memory_profiler -def test_pandas_memory_profiler_usage_increases_mean(): - empty_df = pd.DataFrame() - class DummyWrangler(PandasSingleNoFit): - def transform(self, df): - return pd.DataFrame(allocate_memory(30)) +def test_memory_profiler_measurements(func_no_effect): + baselines = [0, 1, 2, 3] + max_usages = [4, 5, 7, 8] + measurements = 
[4, 4, 5, 5] + + memory_profiler = MemoryProfiler(func_no_effect) + memory_profiler._baselines = baselines + memory_profiler._max_usages = max_usages + memory_profiler._measurements = measurements + + assert memory_profiler.less_is_better is True + assert memory_profiler.max_usages == max_usages + assert memory_profiler.baselines == baselines + assert memory_profiler.measurements == measurements + assert memory_profiler.median == 4.5 + assert memory_profiler.std == 0.5 + assert memory_profiler.best == 4 + assert memory_profiler.worst == 5 + assert memory_profiler.baseline_change == 1 - memory_profiler = PandasMemoryProfiler(DummyWrangler()) - assert memory_profiler.profile(empty_df).increases_mean > 29 * MIB +def test_memory_profiler_no_side_effect(func_no_effect): + baseline_change = MemoryProfiler(func_no_effect).profile().baseline_change + assert baseline_change < 0.5 * MIB -def test_pandas_memory_profiler_usage_input_output(): - df_input = pd.DataFrame(np.random.rand(1000)) - df_output = pd.DataFrame(np.random.rand(10000)) - test_df_input = df_input.memory_usage(index=True, deep=True).sum() - test_df_output = df_output.memory_usage(index=True, deep=True).sum() +def test_memory_profiler_side_effect(): + side_effect_container = [] - class DummyWrangler(PandasSingleNoFit): - def transform(self, df): - return df_output + def side_effect(): + memory_holder = allocate_memory(5) + side_effect_container.append(memory_holder) - memory_profiler = PandasMemoryProfiler(DummyWrangler()).profile(df_input) + return memory_holder - assert memory_profiler.input == test_df_input - assert memory_profiler.output == test_df_output + assert MemoryProfiler(side_effect).profile().baseline_change > 4.9 * MIB -def test_pandas_memory_profiler_usage_ratio(): - usage_mib = 30 - df_input = pd.DataFrame(np.random.rand(1000000)) - usage_input = df_input.memory_usage(index=True, deep=True).sum() - test_output = ((usage_mib - 1) * MIB) / usage_input +def 
test_memory_profiler_no_increase(func_no_effect): + memory_profiler = MemoryProfiler(func_no_effect).profile() + print(memory_profiler.measurements) - class DummyWrangler(PandasSingleNoFit): - def transform(self, df): - return pd.DataFrame(allocate_memory(usage_mib)) + assert memory_profiler.median < MIB - memory_profiler = PandasMemoryProfiler(DummyWrangler()) - assert memory_profiler.profile(df_input).ratio > test_output +def test_memory_profiler_increase(): + def increase(): + memory_holder = allocate_memory(30) + return memory_holder + assert MemoryProfiler(increase).profile().median > 29 * MIB -def test_time_profiler_return_self(): - def dummy(): - pass - time_profiler = TimeProfiler(dummy, 1) +def test_time_profiler_return_self(func_no_effect): + time_profiler = TimeProfiler(func_no_effect, 1) assert time_profiler.profile() is time_profiler -def test_time_profiler_properties(): - def dummy(): - pass +def test_time_profiler_measurements(func_no_effect): + measurements = [1, 1, 3, 3] - time_profiler = TimeProfiler(dummy) - time_profiler._timings = [1, 1, 3, 3] + time_profiler = TimeProfiler(func_no_effect) + time_profiler._measurements = [1, 1, 3, 3] + assert time_profiler.less_is_better is True assert time_profiler.median == 2 - assert time_profiler.standard_deviation == 1 - assert time_profiler.fastest == 1 - assert time_profiler.repetitions == 4 - assert time_profiler.timings == time_profiler._timings - - -def test_time_profiler_repetitions(): - def dummy(): - pass + assert time_profiler.std == 1 + assert time_profiler.best == 1 + assert time_profiler.runs == 4 + assert time_profiler.measurements == measurements - time_profiler = TimeProfiler(dummy, repetitions=10).profile() +def test_time_profiler_repetitions(func_no_effect): + time_profiler = TimeProfiler(func_no_effect, repetitions=10) assert time_profiler.repetitions == 10 -def test_time_profiler_fastest(): +def test_time_profiler_best(): sleep = 0.0001 def dummy(): @@ -238,85 +214,4 @@ def dummy(): 
time_profiler = TimeProfiler(dummy, repetitions=1).profile() - assert time_profiler.fastest >= sleep - - -def test_pandas_time_profiler_fastest(): - """Basic test for pandas time profiler ensuring fastest timing is slower - than forced sleep. - - """ - - sleep = 0.0001 - df_input = pd.DataFrame() - - class DummyWrangler(PandasSingleNoFit): - def transform(self, df): - time.sleep(sleep) - return df - - time_profiler = PandasTimeProfiler(DummyWrangler(), 1).profile(df_input) - - assert time_profiler.fastest >= sleep - - -@pytest.mark.pyspark -def test_spark_time_profiler_fastest(spark): - """Basic test for spark time profiler ensuring fastest timing is slower - than forced sleep. - - """ - - sleep = 0.0001 - df_input = spark.range(10).toDF("col") - - class DummyWrangler(SparkSingleNoFit): - def transform(self, df): - time.sleep(sleep) - return df - - time_profiler = SparkTimeProfiler(DummyWrangler(), 1).profile(df_input) - - assert time_profiler.fastest >= sleep - - -@pytest.mark.pyspark -def test_spark_time_profiler_no_caching(spark): - """Pyspark input dataframes are cached during time profiling. Ensure input - dataframes are released from caching after profiling. - - """ - - sleep = 0.0001 - df_input = spark.range(10).toDF("col") - - class DummyWrangler(SparkSingleNoFit): - def transform(self, df): - time.sleep(sleep) - return df - - SparkTimeProfiler(DummyWrangler(), 1).profile(df_input) - - assert df_input.is_cached is False - - -@pytest.mark.dask -def test_dask_time_profiler_fastest(spark): - """Basic test for dask time profiler ensuring fastest timing is slower - than forced sleep. 
- - """ - - from dask import dataframe as dd - - sleep = 0.0001 - df_input = dd.from_pandas(pd.DataFrame(np.random.rand(10, 10)), 2) - - class DummyWrangler(DaskSingleNoFit): - def transform(self, df): - time.sleep(sleep) - return df - - time_profiler = DaskTimeProfiler(DummyWrangler(), 1).profile(df_input) - - assert time_profiler.fastest >= sleep + assert time_profiler.best >= sleep From 07b4d10bddbcd7ccb85029352163567bf232112e Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Fri, 26 Apr 2019 22:28:56 +0200 Subject: [PATCH 25/48] Move spark fixture into spark subpackage. --- tests/conftest.py | 20 -------------------- tests/wranglers/spark/conftest.py | 25 +++++++++++++++++++++++++ 2 files changed, 25 insertions(+), 20 deletions(-) create mode 100644 tests/wranglers/spark/conftest.py diff --git a/tests/conftest.py b/tests/conftest.py index 12bba43..5bb8255 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -53,23 +53,3 @@ def pytest_collection_modifyitems(config, items): for item in items: if skip_item in item.keywords: item.add_marker(skip) - - -@pytest.fixture(scope="session") -def spark(request): - """Provide session wide Spark Session to avoid expensive recreation for - each test. - - If pyspark is not available, skip tests. - - """ - - try: - from pyspark.sql import SparkSession - spark = SparkSession.builder.getOrCreate() - - request.addfinalizer(lambda: spark.stop()) - return spark - - except ImportError: - pytest.skip("Pyspark not available.") diff --git a/tests/wranglers/spark/conftest.py b/tests/wranglers/spark/conftest.py new file mode 100644 index 0000000..eba57af --- /dev/null +++ b/tests/wranglers/spark/conftest.py @@ -0,0 +1,25 @@ +"""pytest configuration + +""" + +import pytest + + +@pytest.fixture(scope="session") +def spark(request): + """Provide session wide Spark Session to avoid expensive recreation for + each test. + + If pyspark is not available, skip tests. 
+ + """ + + try: + from pyspark.sql import SparkSession + spark = SparkSession.builder.getOrCreate() + + request.addfinalizer(lambda: spark.stop()) + return spark + + except ImportError: + pytest.skip("Pyspark not available.") From c8da336adb0ec70cd9831a18a181d5d8dab9bbb3 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Fri, 26 Apr 2019 22:29:27 +0200 Subject: [PATCH 26/48] Move spark environment tests into spark subpackage. --- tests/{ => wranglers/spark}/test_environment.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) rename tests/{ => wranglers/spark}/test_environment.py (96%) diff --git a/tests/test_environment.py b/tests/wranglers/spark/test_environment.py similarity index 96% rename from tests/test_environment.py rename to tests/wranglers/spark/test_environment.py index 5018288..155c6d6 100644 --- a/tests/test_environment.py +++ b/tests/wranglers/spark/test_environment.py @@ -7,8 +7,9 @@ import pytest +pytestmark = pytest.mark.pyspark + -@pytest.mark.pyspark def test_java_environment(): """Pyspark requires Java to be available. It uses Py4J to start and communicate with the JVM. Py4J looks for JAVA_HOME or falls back calling @@ -29,7 +30,6 @@ def test_java_environment(): raise EnvironmentError("Java setup broken.") -@pytest.mark.pyspark def test_pyspark_import(): """Fail if pyspark can't be imported. This test is mandatory because other spark tests will be skipped if the spark session fixture fails. @@ -43,7 +43,6 @@ def test_pyspark_import(): pytest.fail("pyspark can't be imported") -@pytest.mark.pyspark def test_pyspark_pandas_interaction(spark): """Check simple interaction between pyspark and pandas. From cdfb47e7628d7341d97331b43c1d542014f77472 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Fri, 26 Apr 2019 22:30:28 +0200 Subject: [PATCH 27/48] Move `SparkWrangler` tests into spark subpackage. 
--- tests/wranglers/spark/test_base.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/wranglers/spark/test_base.py b/tests/wranglers/spark/test_base.py index 6c20d21..6b061f0 100644 --- a/tests/wranglers/spark/test_base.py +++ b/tests/wranglers/spark/test_base.py @@ -1,16 +1,16 @@ """Test spark base wrangler. +isort:skip_file """ import pytest -try: - from pywrangler.wranglers.spark.base import SparkWrangler -except ImportError: - SparkWrangler = None +pytestmark = pytest.mark.pyspark # noqa: E402 +pyspark = pytest.importorskip("pyspark") # noqa: E402 + +from pywrangler.wranglers.spark.base import SparkWrangler -@pytest.mark.pyspark def test_spark_base_wrangler_engine(): wrangler = SparkWrangler() From 170392758937b9d76fc8b1ca530b7a41c1fdcb1e Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Fri, 26 Apr 2019 22:31:44 +0200 Subject: [PATCH 28/48] Move pandas wrangler benchmark functions into pandas subpackage. --- src/pywrangler/wranglers/pandas/benchmark.py | 196 +++++++++++++++++++ tests/wranglers/pandas/test_benchmark.py | 101 ++++++++++ 2 files changed, 297 insertions(+) create mode 100644 src/pywrangler/wranglers/pandas/benchmark.py create mode 100644 tests/wranglers/pandas/test_benchmark.py diff --git a/src/pywrangler/wranglers/pandas/benchmark.py b/src/pywrangler/wranglers/pandas/benchmark.py new file mode 100644 index 0000000..c6211a5 --- /dev/null +++ b/src/pywrangler/wranglers/pandas/benchmark.py @@ -0,0 +1,196 @@ +"""This module contains benchmarking utility for pandas wranglers. + +""" + +from typing import Union + +import numpy as np +import pandas as pd + +from pywrangler.benchmark import MemoryProfiler, TimeProfiler +from pywrangler.util import sanitizer +from pywrangler.wranglers.pandas.base import PandasWrangler + + +class PandasTimeProfiler(TimeProfiler): + """Approximate time that a pandas wrangler instance requires to execute the + `fit_transform` step. 
+ + Parameters + ---------- + wrangler: pywrangler.wranglers.base.BaseWrangler + The wrangler instance to be profiled. + repetitions: None, int, optional + Number of repetitions. If `None`, `timeit.Timer.autorange` will + determine a sensible default. + + Attributes + ---------- + measurements: list + The actual profiling measurements in seconds. + best: float + The best measurement in seconds. + median: float + The median of measurements in seconds. + worst: float + The worst measurement in seconds. + std: float + The standard deviation of measurements in seconds. + runs: int + The number of measurements. + + Methods + ------- + profile + Contains the actual profiling implementation. + report + Print simple report consisting of best, median, worst, standard + deviation and the number of measurements. + profile_report + Calls profile and report in sequence. + + """ + + def __init__(self, wrangler: PandasWrangler, + repetitions: Union[None, int] = None): + self._wrangler = wrangler + super().__init__(wrangler.fit_transform, repetitions) + + +class PandasMemoryProfiler(MemoryProfiler): + """Approximate memory usage that a pandas wrangler instance requires to + execute the `fit_transform` step. + + As a key metric, `ratio` is computed. It refers to the amount of + memory which is required to execute the `fit_transform` step. More + concretely, it estimates how much more memory is used standardized by the + input memory usage (memory usage increase during function execution divided + by memory usage of input dataframes). In other words, if you have a 1GB + input dataframe, and the `usage_ratio` is 5, `fit_transform` needs 5GB free + memory available to succeed. A `usage_ratio` of 0.5 given a 2GB input + dataframe would require 1GB free memory available for computation. + + Parameters + ---------- + wrangler: pywrangler.wranglers.pandas.base.PandasWrangler + The wrangler instance to be profiled. + repetitions: int + The number of measurements for memory profiling. 
+ interval: float, optional + Defines interval duration between consecutive memory usage + measurements in seconds. + + Attributes + ---------- + measurements: list + The actual profiling measurements in bytes. + best: float + The best measurement in bytes. + median: float + The median of measurements in bytes. + worst: float + The worst measurement in bytes. + std: float + The standard deviation of measurements in bytes. + runs: int + The number of measurements. + baseline_change: float + The median change in baseline memory usage across all runs in bytes. + input: int + Memory usage of input dataframes in bytes. + output: int + Memory usage of output dataframes in bytes. + ratio: float + The amount of memory required for computation in units of input + memory usage. + + Methods + ------- + profile + Contains the actual profiling implementation. + report + Print simple report consisting of best, median, worst, standard + deviation and the number of measurements. + profile_report + Calls profile and report in sequence. + + """ + + def __init__(self, wrangler: PandasWrangler, repetitions: int = 5, + interval: float = 0.01): + self._wrangler = wrangler + + super().__init__(wrangler.fit_transform, repetitions, interval) + + def profile(self, *dfs: pd.DataFrame, **kwargs): + """Profiles the actual memory usage given input dataframes `dfs` + which are passed to `fit_transform`. + + """ + + # usage input + self._usage_input = self._memory_usage_dfs(*dfs) + + # usage output + dfs_output = self._wrangler.fit_transform(*dfs) + dfs_output = sanitizer.ensure_tuple(dfs_output) + self._usage_output = self._memory_usage_dfs(*dfs_output) + + # usage during fit_transform + super().profile(*dfs, **kwargs) + + return self + + @property + def input(self) -> float: + """Returns the memory usage of the input dataframes in bytes. 
+ + """ + + self._check_is_profiled(['_usage_input']) + return self._usage_input + + @property + def output(self) -> float: + """Returns the memory usage of the output dataframes in bytes. + + """ + + self._check_is_profiled(['_usage_output']) + return self._usage_output + + @property + def ratio(self) -> float: + """Refers to the amount of memory which is required to execute the + `fit_transform` step. More concretely, it estimates how much more + memory is used standardized by the input memory usage (memory usage + increase during function execution divided by memory usage of input + dataframes). In other words, if you have a 1GB input dataframe, and the + `usage_ratio` is 5, `fit_transform` needs 5GB free memory available to + succeed. A `usage_ratio` of 0.5 given a 2GB input dataframe would + require 1GB free memory available for computation. + + """ + + return self.median / self.input + + @staticmethod + def _memory_usage_dfs(*dfs: pd.DataFrame) -> int: + """Return memory usage in bytes for all given dataframes. + + Parameters + ---------- + dfs: pd.DataFrame + The pandas dataframes for which memory usage should be computed. + + Returns + ------- + memory_usage: int + The computed memory usage in bytes. + + """ + + mem_usages = [df.memory_usage(deep=True, index=True).sum() + for df in dfs] + + return int(np.sum(mem_usages)) diff --git a/tests/wranglers/pandas/test_benchmark.py b/tests/wranglers/pandas/test_benchmark.py new file mode 100644 index 0000000..6964a54 --- /dev/null +++ b/tests/wranglers/pandas/test_benchmark.py @@ -0,0 +1,101 @@ +"""This module contains tests for pandas benchmarks. 
+ +""" + +import time + +import numpy as np +import pandas as pd + +from pywrangler.benchmark import allocate_memory +from pywrangler.wranglers.pandas.base import PandasSingleNoFit +from pywrangler.wranglers.pandas.benchmark import ( + PandasMemoryProfiler, + PandasTimeProfiler +) + +MIB = 2 ** 20 + + +def test_pandas_memory_profiler_memory_usage_dfs(): + df1 = pd.DataFrame(np.random.rand(10)) + df2 = pd.DataFrame(np.random.rand(10)) + + test_input = [df1, df2] + test_output = int(df1.memory_usage(index=True, deep=True).sum() + + df2.memory_usage(index=True, deep=True).sum()) + + assert PandasMemoryProfiler._memory_usage_dfs(*test_input) == test_output + + +def test_pandas_memory_profiler_return_self(): + class DummyWrangler(PandasSingleNoFit): + def transform(self, df): + return pd.DataFrame() + + memory_profiler = PandasMemoryProfiler(DummyWrangler()) + + assert memory_profiler is memory_profiler.profile(pd.DataFrame()) + + +def test_pandas_memory_profiler_usage_increases_mean(): + empty_df = pd.DataFrame() + + class DummyWrangler(PandasSingleNoFit): + def transform(self, df): + return pd.DataFrame(allocate_memory(30)) + + memory_profiler = PandasMemoryProfiler(DummyWrangler()) + + assert memory_profiler.profile(empty_df).median > 29 * MIB + + +def test_pandas_memory_profiler_usage_input_output(): + df_input = pd.DataFrame(np.random.rand(1000)) + df_output = pd.DataFrame(np.random.rand(10000)) + + test_df_input = df_input.memory_usage(index=True, deep=True).sum() + test_df_output = df_output.memory_usage(index=True, deep=True).sum() + + class DummyWrangler(PandasSingleNoFit): + def transform(self, df): + return df_output + + memory_profiler = PandasMemoryProfiler(DummyWrangler()).profile(df_input) + + assert memory_profiler.input == test_df_input + assert memory_profiler.output == test_df_output + + +def test_pandas_memory_profiler_usage_ratio(): + usage_mib = 30 + df_input = pd.DataFrame(np.random.rand(1000000)) + usage_input = df_input.memory_usage(index=True, 
deep=True).sum()
+    test_output = ((usage_mib - 1) * MIB) / usage_input
+
+    class DummyWrangler(PandasSingleNoFit):
+        def transform(self, df):
+            return pd.DataFrame(allocate_memory(usage_mib))
+
+    memory_profiler = PandasMemoryProfiler(DummyWrangler())
+
+    assert memory_profiler.profile(df_input).ratio > test_output
+
+
+def test_pandas_time_profiler_fastest():
+    """Basic test for pandas time profiler ensuring fastest timing is slower
+    than forced sleep.
+
+    """
+
+    sleep = 0.0001
+    df_input = pd.DataFrame()
+
+    class DummyWrangler(PandasSingleNoFit):
+        def transform(self, df):
+            time.sleep(sleep)
+            return df
+
+    time_profiler = PandasTimeProfiler(DummyWrangler(), 1).profile(df_input)
+
+    assert time_profiler.best >= sleep

From fbcd573c3480e15bee1905dc8e573ee32798f67a Mon Sep 17 00:00:00 2001
From: mansenfranzen
Date: Fri, 26 Apr 2019 22:33:27 +0200
Subject: [PATCH 29/48] Move dask benchmarks into dask subpackage.

---
 src/pywrangler/wranglers/dask/benchmark.py | 91 ++++++++++++++++++++++
 tests/wranglers/dask/test_benchmark.py     | 43 ++++++++++
 2 files changed, 134 insertions(+)
 create mode 100644 src/pywrangler/wranglers/dask/benchmark.py
 create mode 100644 tests/wranglers/dask/test_benchmark.py

diff --git a/src/pywrangler/wranglers/dask/benchmark.py b/src/pywrangler/wranglers/dask/benchmark.py
new file mode 100644
index 0000000..44f2802
--- /dev/null
+++ b/src/pywrangler/wranglers/dask/benchmark.py
@@ -0,0 +1,91 @@
+"""This module contains benchmarking utility for dask wranglers.
+
+"""
+
+from typing import Union
+
+from dask.dataframe import DataFrame
+
+from pywrangler.benchmark import TimeProfiler
+from pywrangler.wranglers.dask.base import DaskWrangler
+
+
+class DaskTimeProfiler(TimeProfiler):
+    """Approximate time that a dask wrangler instance requires to execute the
+    `fit_transform` step.
+
+    Please note, input dataframes are cached before timing execution to ensure
+    timing measurements only capture wrangler's `fit_transform`.
This may cause + problems if the size of input dataframes exceeds available memory. + + Parameters + ---------- + wrangler: pywrangler.wranglers.base.BaseWrangler + The wrangler instance to be profiled. + repetitions: None, int, optional + Number of repetitions. If `None`, `timeit.Timer.autorange` will + determine a sensible default. + + Attributes + ---------- + measurements: list + The actual profiling measurements in seconds. + best: float + The best measurement in seconds. + median: float + The median of measurements in seconds. + worst: float + The worst measurement in seconds. + std: float + The standard deviation of measurements in seconds. + runs: int + The number of measurements. + + Methods + ------- + profile + Contains the actual profiling implementation. + report + Print simple report consisting of best, median, worst, standard + deviation and the number of measurements. + profile_report + Calls profile and report in sequence. + + """ + + def __init__(self, wrangler: DaskWrangler, + repetitions: Union[None, int] = None): + self._wrangler = wrangler + + def wrapper(*args, **kwargs): + """Wrapper function to call `compute()` to enforce computation. + + """ + + wrangler.fit_transform(*args, **kwargs).compute() + + super().__init__(wrapper, repetitions) + + def profile(self, *dfs: DataFrame, **kwargs): + """Profiles timing given input dataframes `dfs` which are passed to + `fit_transform`. + + Please note, input dataframes are cached before timing execution to + ensure timing measurements only capture wrangler's `fit_transform`. + This may cause problems if the size of input dataframes exceeds + available memory. 
+ + """ + + # cache input dataframes + dfs_cached = [df.persist() for df in dfs] + + super().profile(*dfs_cached, **kwargs) + + # clear caches + for df in dfs_cached: + del df + + del dfs_cached + + return self diff --git a/tests/wranglers/dask/test_benchmark.py b/tests/wranglers/dask/test_benchmark.py new file mode 100644 index 0000000..421d850 --- /dev/null +++ b/tests/wranglers/dask/test_benchmark.py @@ -0,0 +1,43 @@ +"""This module contains tests for dask benchmarks. + +isort:skip_file +""" + +import time + +import pytest +import pandas as pd +import numpy as np + +pytestmark = pytest.mark.dask # noqa: E402 +dask = pytest.importorskip("dask") # noqa: E402 + +from dask import dataframe as dd + +from pywrangler.wranglers.dask.benchmark import DaskTimeProfiler +from pywrangler.wranglers.dask.base import DaskSingleNoFit + +SLEEP = 0.0001 + + +@pytest.fixture +def wrangler_sleeps(): + class DummyWrangler(DaskSingleNoFit): + def transform(self, df): + time.sleep(SLEEP) + return df + + return DummyWrangler + + +def test_dask_time_profiler_fastest(spark, wrangler_sleeps): + """Basic test for dask time profiler ensuring fastest timing is slower + than forced sleep. + + """ + + df_input = dd.from_pandas(pd.DataFrame(np.random.rand(10, 10)), 2) + + time_profiler = DaskTimeProfiler(wrangler_sleeps(), 1).profile(df_input) + + assert time_profiler.best >= SLEEP From a05738f48902ed7de551b84bde6689d75b061a34 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Fri, 26 Apr 2019 22:40:38 +0200 Subject: [PATCH 30/48] Add `get_param_names` helper function. 
--- src/pywrangler/util/helper.py | 34 +++++++++++++++++++++++++++++++++- src/pywrangler/util/types.py | 5 ++++- tests/__init__.py | 0 tests/util/test_helper.py | 19 +++++++++++++++++++ 4 files changed, 56 insertions(+), 2 deletions(-) create mode 100644 tests/__init__.py create mode 100644 tests/util/test_helper.py diff --git a/src/pywrangler/util/helper.py b/src/pywrangler/util/helper.py index b845033..e05a3b8 100644 --- a/src/pywrangler/util/helper.py +++ b/src/pywrangler/util/helper.py @@ -2,7 +2,10 @@ """ -from typing import Callable +import inspect +from typing import Callable, List + +from pywrangler.util.types import T_STR_OPT_MUL def cached_property(method: Callable) -> property: @@ -36,3 +39,32 @@ def get_prop_value(obj): return value return property(get_prop_value, doc=docstring) + + +def get_param_names(func: Callable, + ignore: T_STR_OPT_MUL = None) -> List[str]: + """Retrieve all parameter names for given function. + + Parameters + ---------- + func: Callable + Function for which parameter names should be retrieved. + ignore: iterable, None, optional + Parameter names to be ignored. For example, `self` for `__init__` + functions. + + Returns + ------- + param_names: list + List of parameter names. 
+ + """ + + ignore = ignore or [] + + signature = inspect.signature(func) + parameters = signature.parameters.values() + + param_names = [x.name for x in parameters if x.name not in ignore] + + return param_names diff --git a/src/pywrangler/util/types.py b/src/pywrangler/util/types.py index 928106c..52c8213 100644 --- a/src/pywrangler/util/types.py +++ b/src/pywrangler/util/types.py @@ -4,5 +4,8 @@ from typing import Iterable, Union -TYPE_COLUMNS = Union[str, Iterable[str], None] +T_STR_OPT_MUL = Union[Iterable[str], None] +T_STR_OPT_SING_MUL = Union[str, Iterable[str], None] + +TYPE_COLUMNS = T_STR_OPT_SING_MUL TYPE_ASCENDING = Union[bool, Iterable[bool], None] diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/util/test_helper.py b/tests/util/test_helper.py new file mode 100644 index 0000000..1cc4d42 --- /dev/null +++ b/tests/util/test_helper.py @@ -0,0 +1,19 @@ +"""This module contains tests for the helper module. + +""" + +from pywrangler.util.helper import get_param_names + + +def test_get_param_names(): + + def func(): + pass + + assert get_param_names(func) == [] + + def func1(a, b=4, c=6): + pass + + assert get_param_names(func1) == ["a", "b", "c"] + assert get_param_names(func1, ["a"]) == ["b", "c"] From bb0a7a1c3641a330cceb21ce3dd520f8f7b4a155 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Sat, 27 Apr 2019 14:20:13 +0200 Subject: [PATCH 31/48] Add pytestmarks and remove wrong spark fixture. --- tests/wranglers/dask/test_base.py | 10 +++++----- tests/wranglers/dask/test_benchmark.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/wranglers/dask/test_base.py b/tests/wranglers/dask/test_base.py index da4ae61..3b5711c 100644 --- a/tests/wranglers/dask/test_base.py +++ b/tests/wranglers/dask/test_base.py @@ -1,16 +1,16 @@ """Test dask base wrangler. 
+isort:skip_file """ import pytest -try: - from pywrangler.wranglers.dask.base import DaskWrangler -except ImportError: - DaskWrangler = None +pytestmark = pytest.mark.dask # noqa: E402 +dask = pytest.importorskip("dask") # noqa: E402 + +from pywrangler.wranglers.dask.base import DaskWrangler -@pytest.mark.dask def test_dask_base_wrangler_engine(): wrangler = DaskWrangler() diff --git a/tests/wranglers/dask/test_benchmark.py b/tests/wranglers/dask/test_benchmark.py index 421d850..037b834 100644 --- a/tests/wranglers/dask/test_benchmark.py +++ b/tests/wranglers/dask/test_benchmark.py @@ -30,7 +30,7 @@ def transform(self, df): return DummyWrangler -def test_dask_time_profiler_fastest(spark, wrangler_sleeps): +def test_dask_time_profiler_fastest(wrangler_sleeps): """Basic test for dask time profiler ensuring fastest timing is slower than forced sleep. From 6e8b31157620ac12df5ec27513eda0ad346f42a7 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Sat, 27 Apr 2019 14:21:22 +0200 Subject: [PATCH 32/48] Refactor tests to use `create_wrangler` fixture. Add sleeps for memory profiling. --- tests/wranglers/pandas/test_benchmark.py | 88 ++++++++++++++---------- 1 file changed, 53 insertions(+), 35 deletions(-) diff --git a/tests/wranglers/pandas/test_benchmark.py b/tests/wranglers/pandas/test_benchmark.py index 6964a54..3d915b3 100644 --- a/tests/wranglers/pandas/test_benchmark.py +++ b/tests/wranglers/pandas/test_benchmark.py @@ -4,6 +4,8 @@ import time +import pytest + import numpy as np import pandas as pd @@ -17,6 +19,42 @@ MIB = 2 ** 20 +@pytest.fixture +def test_wrangler(): + """Helper fixture to generate PandasWrangler instances with parametrization + of transform output and sleep. + + """ + + def create_wrangler(size=None, result=None, sleep=0): + """Return instance of PandasWrangler. + + Parameters + ---------- + size: float + Memory size in MiB to allocate during transform step. + result: pd.DataFrame + Define extact return value of transform step. 
+ sleep: float + Define sleep interval. + + """ + + class DummyWrangler(PandasSingleNoFit): + def transform(self, df): + if size is not None: + df_out = pd.DataFrame(allocate_memory(size)) + else: + df_out = pd.DataFrame(result) + + time.sleep(sleep) + return df_out + + return DummyWrangler() + + return create_wrangler + + def test_pandas_memory_profiler_memory_usage_dfs(): df1 = pd.DataFrame(np.random.rand(10)) df2 = pd.DataFrame(np.random.rand(10)) @@ -28,74 +66,54 @@ def test_pandas_memory_profiler_memory_usage_dfs(): assert PandasMemoryProfiler._memory_usage_dfs(*test_input) == test_output -def test_pandas_memory_profiler_return_self(): - class DummyWrangler(PandasSingleNoFit): - def transform(self, df): - return pd.DataFrame() - - memory_profiler = PandasMemoryProfiler(DummyWrangler()) +def test_pandas_memory_profiler_return_self(test_wrangler): + memory_profiler = PandasMemoryProfiler(test_wrangler()) assert memory_profiler is memory_profiler.profile(pd.DataFrame()) -def test_pandas_memory_profiler_usage_increases_mean(): - empty_df = pd.DataFrame() - - class DummyWrangler(PandasSingleNoFit): - def transform(self, df): - return pd.DataFrame(allocate_memory(30)) +def test_pandas_memory_profiler_usage_median(test_wrangler): + wrangler = test_wrangler(size=30, sleep=0.01) + memory_profiler = PandasMemoryProfiler(wrangler) - memory_profiler = PandasMemoryProfiler(DummyWrangler()) + assert memory_profiler.profile(pd.DataFrame()).median > 29 * MIB - assert memory_profiler.profile(empty_df).median > 29 * MIB - -def test_pandas_memory_profiler_usage_input_output(): +def test_pandas_memory_profiler_usage_input_output(test_wrangler): df_input = pd.DataFrame(np.random.rand(1000)) df_output = pd.DataFrame(np.random.rand(10000)) test_df_input = df_input.memory_usage(index=True, deep=True).sum() test_df_output = df_output.memory_usage(index=True, deep=True).sum() - class DummyWrangler(PandasSingleNoFit): - def transform(self, df): - return df_output - - memory_profiler = 
PandasMemoryProfiler(DummyWrangler()).profile(df_input) + wrangler = test_wrangler(result=df_output) + memory_profiler = PandasMemoryProfiler(wrangler).profile(df_input) assert memory_profiler.input == test_df_input assert memory_profiler.output == test_df_output -def test_pandas_memory_profiler_usage_ratio(): +def test_pandas_memory_profiler_ratio(test_wrangler): usage_mib = 30 df_input = pd.DataFrame(np.random.rand(1000000)) usage_input = df_input.memory_usage(index=True, deep=True).sum() test_output = ((usage_mib - 1) * MIB) / usage_input - class DummyWrangler(PandasSingleNoFit): - def transform(self, df): - return pd.DataFrame(allocate_memory(usage_mib)) + wrangler = test_wrangler(size=usage_mib, sleep=0.01) - memory_profiler = PandasMemoryProfiler(DummyWrangler()) + memory_profiler = PandasMemoryProfiler(wrangler) assert memory_profiler.profile(df_input).ratio > test_output -def test_pandas_time_profiler_fastest(): +def test_pandas_time_profiler_best(test_wrangler): """Basic test for pandas time profiler ensuring fastest timing is slower than forced sleep. """ sleep = 0.0001 - df_input = pd.DataFrame() - - class DummyWrangler(PandasSingleNoFit): - def transform(self, df): - time.sleep(sleep) - return df - - time_profiler = PandasTimeProfiler(DummyWrangler(), 1).profile(df_input) + wrangler = test_wrangler(sleep=sleep) + time_profiler = PandasTimeProfiler(wrangler, 1).profile(pd.DataFrame()) assert time_profiler.best >= sleep From a2b4dfc9bae6dd9c4419c62a3a7b95db4fd8047a Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Sat, 27 Apr 2019 14:21:46 +0200 Subject: [PATCH 33/48] Add sleeps for memory profiling. 
--- tests/test_benchmark.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index e126b24..a656858 100644 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -176,6 +176,7 @@ def test_memory_profiler_no_increase(func_no_effect): def test_memory_profiler_increase(): def increase(): memory_holder = allocate_memory(30) + time.sleep(0.01) return memory_holder assert MemoryProfiler(increase).profile().median > 29 * MIB From 4bf9faa06646af715007bf1a3f26a618385d5b7e Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Sat, 27 Apr 2019 14:39:42 +0200 Subject: [PATCH 34/48] Allow memory usage tests to fail due to non deterministic memory management. --- tests/test_benchmark.py | 3 ++- tests/wranglers/pandas/test_benchmark.py | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index a656858..8787216 100644 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -173,10 +173,11 @@ def test_memory_profiler_no_increase(func_no_effect): assert memory_profiler.median < MIB +@pytest.mark.xfail(reason="Succeeds locally but sometimes fails remotely due " + "to non deterministic memory management.") def test_memory_profiler_increase(): def increase(): memory_holder = allocate_memory(30) - time.sleep(0.01) return memory_holder assert MemoryProfiler(increase).profile().median > 29 * MIB diff --git a/tests/wranglers/pandas/test_benchmark.py b/tests/wranglers/pandas/test_benchmark.py index 3d915b3..f7916d7 100644 --- a/tests/wranglers/pandas/test_benchmark.py +++ b/tests/wranglers/pandas/test_benchmark.py @@ -72,6 +72,8 @@ def test_pandas_memory_profiler_return_self(test_wrangler): assert memory_profiler is memory_profiler.profile(pd.DataFrame()) +@pytest.mark.xfail(reason="Succeeds locally but sometimes fails remotely due " + "to non deterministic memory management.") def test_pandas_memory_profiler_usage_median(test_wrangler): wrangler = 
test_wrangler(size=30, sleep=0.01) memory_profiler = PandasMemoryProfiler(wrangler) @@ -93,6 +95,8 @@ def test_pandas_memory_profiler_usage_input_output(test_wrangler): assert memory_profiler.output == test_df_output +@pytest.mark.xfail(reason="Succeeds locally but sometimes fails remotely due " + "to non deterministic memory management.") def test_pandas_memory_profiler_ratio(test_wrangler): usage_mib = 30 df_input = pd.DataFrame(np.random.rand(1000000)) From d14a5ff9b2325fa2a680422f584c061c27866f90 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Wed, 1 May 2019 15:03:30 +0200 Subject: [PATCH 35/48] Add assertion for number of runs. --- tests/test_benchmark.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index 8787216..16b49d8 100644 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -146,6 +146,7 @@ def test_memory_profiler_measurements(func_no_effect): assert memory_profiler.best == 4 assert memory_profiler.worst == 5 assert memory_profiler.baseline_change == 1 + assert memory_profiler.runs == 4 def test_memory_profiler_no_side_effect(func_no_effect): @@ -192,7 +193,7 @@ def test_time_profiler_measurements(func_no_effect): measurements = [1, 1, 3, 3] time_profiler = TimeProfiler(func_no_effect) - time_profiler._measurements = [1, 1, 3, 3] + time_profiler._measurements = measurements assert time_profiler.less_is_better is True assert time_profiler.median == 2 From 0927a2a891355b798651fcb0c915de5b105f310a Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Wed, 1 May 2019 15:04:20 +0200 Subject: [PATCH 36/48] Remove note about memory consumption of child processes. 
--- src/pywrangler/benchmark.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/pywrangler/benchmark.py b/src/pywrangler/benchmark.py index 7111571..26a7e63 100644 --- a/src/pywrangler/benchmark.py +++ b/src/pywrangler/benchmark.py @@ -248,8 +248,6 @@ class MemoryProfiler(BaseProfiler): usage during function execution and the baseline memory usage before function execution. - Note, memory consumption of child processes are included. - In addition, compute the mean increase in baseline memory usage between repetitions which might indicate memory leakage. From b28f6dbf9f14c90f1503419160d878f39c94fc09 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Wed, 1 May 2019 15:07:18 +0200 Subject: [PATCH 37/48] Add `DaskBaseProfiler` and `DaskMemoryProfiler`. --- src/pywrangler/wranglers/dask/benchmark.py | 211 ++++++++++++++++++--- tests/wranglers/dask/test_benchmark.py | 162 +++++++++++++++- 2 files changed, 335 insertions(+), 38 deletions(-) diff --git a/src/pywrangler/wranglers/dask/benchmark.py b/src/pywrangler/wranglers/dask/benchmark.py index 44f2802..936a242 100644 --- a/src/pywrangler/wranglers/dask/benchmark.py +++ b/src/pywrangler/wranglers/dask/benchmark.py @@ -2,22 +2,89 @@ """ -from typing import Union +import gc +import sys +import warnings +from typing import Callable, List, Union -from dask.dataframe import DataFrame +import numpy as np +from dask.diagnostics import ResourceProfiler -from pywrangler.benchmark import TimeProfiler +from pywrangler.benchmark import MemoryProfiler, TimeProfiler from pywrangler.wranglers.dask.base import DaskWrangler -class DaskTimeProfiler(TimeProfiler): +class DaskBaseProfiler: + """Define common methods for dask profiler. + + """ + + def _wrap_fit_transform(self) -> Callable: + """Wrapper function to call `compute()` on dask wrangler instances to + enforce computation on lazily evaluated dask graphs. + + Returns + ------- + wrapped: callable + Wrapped `fit_transform` method as a function. 
+ + """ + + def wrapped(*args, **kwargs): + return self.wrangler.fit_transform(*args, **kwargs).compute() + + return wrapped + + @staticmethod + def _cache_input(dfs) -> List: + """Persist lazily evaluated dask input collections before profiling to + capture only relevant `fit_transform`. + + Parameters + ---------- + dfs: iterable + Dask collections which can be persisted. + + Returns + ------- + persisted: iterable + List of computed dask collections. + + """ + + return [df.persist() for df in dfs] + + @staticmethod + def _clear_cached_input(dfs): + """Remove original reference to previously persisted dask collections + to enable garbage collection to free memory. Explicitly check reference + count and give warning if persisted dask collections are referenced + elsewhere which would prevent memory deallocation. + + Parameters + ---------- + dfs: iterable + Persisted dask collections which should be removed. + + """ + + # ensure reference counts are updated + gc.collect() + + # check ref counts + for df in dfs: + if sys.getrefcount(df) > 3: + warnings.warn("Persisted dask collection is referenced " + "elsewhere and prevents garbage collection", + ResourceWarning) + + dfs.clear() + + +class DaskTimeProfiler(TimeProfiler, DaskBaseProfiler): """Approximate time that a dask wrangler instance requires to execute the `fit_transform` step. - Please note, input dataframes are cached before timing execution to ensure - timing measurements only capture wrangler's `fit_transform`. This may cause - problems if the size of input dataframes exceeds available memory. - Parameters ---------- wrangler: pywrangler.wranglers.base.BaseWrangler @@ -25,6 +92,10 @@ class DaskTimeProfiler(TimeProfiler): repetitions: None, int, optional Number of repetitions. If `None`, `timeit.Timer.autorange` will determine a sensible default. + cache_input: bool, optional + Dask collections may be cached before timing execution to ensure + timing measurements only capture wrangler's `fit_transform`. 
By + default, it is disabled. Attributes ---------- @@ -54,38 +125,122 @@ class DaskTimeProfiler(TimeProfiler): """ def __init__(self, wrangler: DaskWrangler, - repetitions: Union[None, int] = None): - self._wrangler = wrangler + repetitions: Union[None, int] = None, + cache_input: bool = False): + self.wrangler = wrangler + self.cache_input = cache_input - def wrapper(*args, **kwargs): - """Wrapper function to call `compute()` to enforce computation. + func = self._wrap_fit_transform() + super().__init__(func, repetitions) - """ + def profile(self, *dfs, **kwargs): + """Profiles timing given input dataframes `dfs` which are passed to + `fit_transform`. - wrangler.fit_transform(*args, **kwargs).compute() + """ + + if self.cache_input: + dfs = self._cache_input(dfs) + + super().profile(*dfs, **kwargs) + + if self.cache_input: + self._clear_cached_input(dfs) + + return self - super().__init__(wrapper, repetitions) - def profile(self, *dfs: DataFrame, **kwargs): +class DaskMemoryProfiler(MemoryProfiler, DaskBaseProfiler): + """Approximate memory usage that a dask wrangler instance requires to + execute the `fit_transform` step. + + Parameters + ---------- + func: callable + Callable object to be memory profiled. + repetitions: int, optional + Number of repetitions. + interval: float, optional + Defines interval duration between consecutive memory usage + measurements in seconds. + cache_input: bool, optional + Dask collections may be cached before timing execution to ensure + timing measurements only capture wrangler's `fit_transform`. By + default, it is disabled. + + Attributes + ---------- + measurements: list + The actual profiling measurements in bytes. + best: float + The best measurement in bytes. + median: float + The median of measurements in bytes. + worst: float + The worst measurement in bytes. + std: float + The standard deviation of measurements in bytes. + runs: int + The number of measurements. 
+    baseline_change: float
+        The median change in baseline memory usage across all runs in bytes.
+
+    Methods
+    -------
+    profile
+        Contains the actual profiling implementation.
+    report
+        Print simple report consisting of best, median, worst, standard
+        deviation and the number of measurements.
+    profile_report
+        Calls profile and report in sequence.
+
+    Notes
+    -----
+    The implementation uses dask's own `ResourceProfiler`.
+
+    """
+
+    def __init__(self, wrangler: DaskWrangler,
+                 repetitions: Union[None, int] = 5,
+                 interval: float = 0.01,
+                 cache_input: bool = False):
+        self.wrangler = wrangler
+        self.cache_input = cache_input
+
+        func = self._wrap_fit_transform()
+        super().__init__(func, repetitions, interval)
+
+    def profile(self, *dfs, **kwargs):
+        """Profiles memory usage given input dataframes `dfs` which are
+        passed to `fit_transform`.
- """ - # cache input dataframes - dfs_cached = [df.persist() for df in dfs] + if self.cache_input: + dfs = self._cache_input(dfs) + + counter = 0 + baselines = [] + max_usages = [] + + while counter < self.repetitions: + gc.collect() + + with ResourceProfiler(dt=self.interval) as rprof: + self.func(*dfs, **kwargs) + + mem_usages = [x.mem for x in rprof.results] + baselines.append(np.min(mem_usages)) + max_usages.append(np.max(mem_usages)) - super().profile(*dfs_cached, **kwargs) + counter += 1 - # clear caches - for df in dfs_cached: - del df + self._max_usages = max_usages + self._baselines = baselines + self._measurements = np.subtract(max_usages, baselines).tolist() - del dfs_cached + if self.cache_input: + self._clear_cached_input(dfs) return self diff --git a/tests/wranglers/dask/test_benchmark.py b/tests/wranglers/dask/test_benchmark.py index 037b834..df4ad61 100644 --- a/tests/wranglers/dask/test_benchmark.py +++ b/tests/wranglers/dask/test_benchmark.py @@ -14,30 +14,172 @@ from dask import dataframe as dd -from pywrangler.wranglers.dask.benchmark import DaskTimeProfiler +from pywrangler.benchmark import allocate_memory +from pywrangler.wranglers.dask.benchmark import ( + DaskTimeProfiler, + DaskMemoryProfiler, + DaskBaseProfiler +) from pywrangler.wranglers.dask.base import DaskSingleNoFit -SLEEP = 0.0001 - @pytest.fixture -def wrangler_sleeps(): +def mean_wranger(): class DummyWrangler(DaskSingleNoFit): def transform(self, df): - time.sleep(SLEEP) - return df + return df.mean() + + return DummyWrangler() + + +@pytest.fixture +def test_wrangler(): + """Helper fixture to generate DaskWrangler instances with parametrization + of transform output and sleep. + + """ + + def create_wrangler(size=None, result=None, sleep=0): + """Return instance of DaskWrangler. + + Parameters + ---------- + size: float + Memory size in MiB to allocate during transform step. + result: Dask DataFrame + Define extact return value of transform step. 
+ sleep: float + Define sleep interval. + + """ + + class DummyWrangler(DaskSingleNoFit): + def transform(self, df): + if size is not None: + pdf = pd.DataFrame(allocate_memory(size)) + df_out = dd.from_pandas(pdf) + elif result is not None: + df_out = result + else: + df_out = dd.from_pandas(pd.DataFrame([0]), 1) + + time.sleep(sleep) + return df_out + + return DummyWrangler() + + return create_wrangler + + +def test_dask_base_profiler_wrap_fit_transform(test_wrangler): + pdf = pd.DataFrame(np.random.rand(50, 5)) + df = dd.from_pandas(pdf, 5).max().max() + + profiler = DaskTimeProfiler(wrangler=test_wrangler(result=df), + repetitions=1) + + wrapped = profiler._wrap_fit_transform() + + assert callable(wrapped) + assert wrapped(df) == pdf.max().max() + + +def test_dask_base_profiler_cache_input(): + class MockPersist: + def persist(self): + self.persist_called = True + return self + + dask_mocks = [MockPersist(), MockPersist()] + + persisted = DaskBaseProfiler._cache_input(dask_mocks) + + assert all([x.persist_called for x in persisted]) + + +def test_dask_base_profiler_clear_cache_input(): + pdf = pd.DataFrame(np.random.rand(50, 5)) - return DummyWrangler + with pytest.warns(None) as record: + DaskBaseProfiler._clear_cached_input([dd.from_pandas(pdf, 5)]) + assert len(record) == 0 + df = dd.from_pandas(pdf, 5) + ref = df # noqa: F841 -def test_dask_time_profiler_fastest(wrangler_sleeps): + with pytest.warns(ResourceWarning): + DaskBaseProfiler._clear_cached_input([df]) + + +def test_dask_time_profiler_fastest(test_wrangler): """Basic test for dask time profiler ensuring fastest timing is slower than forced sleep. 
""" + sleep = 0.001 + df_input = dd.from_pandas(pd.DataFrame(np.random.rand(10, 10)), 2) - time_profiler = DaskTimeProfiler(wrangler_sleeps(), 1).profile(df_input) + time_profiler = DaskTimeProfiler(wrangler=test_wrangler(sleep=sleep), + repetitions=1, + cache_input=True) + + assert time_profiler.profile(df_input).best >= sleep + + +def test_dask_time_profiler_profile_return_self(test_wrangler): + df_input = dd.from_pandas(pd.DataFrame(np.random.rand(10, 10)), 2) + + time_profiler = DaskTimeProfiler(wrangler=test_wrangler(), + repetitions=1) + + assert time_profiler.profile(df_input) is time_profiler + + +def test_dask_time_profiler_cached_faster(mean_wranger): + pdf = pd.DataFrame(np.random.rand(1000000, 10)) + df_input = dd.from_pandas(pdf, 2).mean() + + time_profiler_no_cache = DaskTimeProfiler(wrangler=mean_wranger, + repetitions=5, + cache_input=False) + + time_profiler_cache = DaskTimeProfiler(wrangler=mean_wranger, + repetitions=5, + cache_input=True) + + no_cache_time = time_profiler_no_cache.profile(df_input).median + cache_time = time_profiler_cache.profile(df_input).median + + assert no_cache_time > cache_time + + +def test_dask_memory_profiler_profile_return_self(test_wrangler): + df_input = dd.from_pandas(pd.DataFrame(np.random.rand(10, 10)), 2) + + mem_profiler = DaskMemoryProfiler(wrangler=test_wrangler(), + repetitions=1) + + assert mem_profiler.profile(df_input) is mem_profiler + assert mem_profiler.runs == 1 + + +def test_dask_memory_profiler_cached_lower_usage(mean_wranger): + pdf = pd.DataFrame(np.random.rand(1000000, 10)) + df_input = dd.from_pandas(pdf, 5).mean() + + mem_profiler_no_cache = DaskMemoryProfiler(wrangler=mean_wranger, + repetitions=5, + cache_input=False, + interval=0.00001) + + mem_profiler_cache = DaskMemoryProfiler(wrangler=mean_wranger, + repetitions=5, + cache_input=True, + interval=0.00001) + + no_cache_usage = mem_profiler_no_cache.profile(df_input).median + cache_usage = mem_profiler_cache.profile(df_input).median - 
assert time_profiler.best >= SLEEP + assert no_cache_usage > cache_usage From 01aa643ec10134e2d5f2ceebd3e7a4e4c428a758 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Wed, 5 Jun 2019 20:57:16 +0200 Subject: [PATCH 38/48] Doc string improvements. --- src/pywrangler/benchmark.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/pywrangler/benchmark.py b/src/pywrangler/benchmark.py index 26a7e63..82cd472 100644 --- a/src/pywrangler/benchmark.py +++ b/src/pywrangler/benchmark.py @@ -20,7 +20,7 @@ def allocate_memory(size: float) -> np.ndarray: - """Helper function for testing to allocate memory by creating numpy array + """Helper function to approximately allocate memory by creating numpy array with given size in MiB. Numpy is used deliberately to define the used memory via dtype. @@ -50,9 +50,10 @@ def allocate_memory(size: float) -> np.ndarray: class BaseProfiler: """Base class defining the interface for all profilers. - Subclasses have to implement `profile` (the actual profiling - implementation) and `less_is_better` (defining the ranking of profiling - measurements). + Subclasses have to implement `profile` (the actual profiling method) and + `less_is_better` (defining the ranking of profiling measurements). + + The private attribute `_measurements` is assumed to be set by `profile`. Attributes ---------- From 81a1aeac6329499d8343893e1f5ef0e7eec51a32 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Wed, 5 Jun 2019 20:59:40 +0200 Subject: [PATCH 39/48] Doc string improvement. 
--- src/pywrangler/wranglers/dask/benchmark.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pywrangler/wranglers/dask/benchmark.py b/src/pywrangler/wranglers/dask/benchmark.py index 936a242..b52e4f3 100644 --- a/src/pywrangler/wranglers/dask/benchmark.py +++ b/src/pywrangler/wranglers/dask/benchmark.py @@ -20,8 +20,8 @@ class DaskBaseProfiler: """ def _wrap_fit_transform(self) -> Callable: - """Wrapper function to call `compute()` on dask wrangler instances to - enforce computation on lazily evaluated dask graphs. + """Wrapper function to call `compute()` on wrangler's `fit_transform` + to enforce computation on lazily evaluated dask graphs. Returns ------- From 347cb6a93f1eab03db0ee513d6c55450c790dc3a Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Wed, 5 Jun 2019 21:04:30 +0200 Subject: [PATCH 40/48] Add SparkBaseProfiler. Add `cache_input` parameter. Add SparkMemoryProfiler template without current implementation. --- src/pywrangler/wranglers/spark/benchmark.py | 178 ++++++++++++++++---- 1 file changed, 149 insertions(+), 29 deletions(-) diff --git a/src/pywrangler/wranglers/spark/benchmark.py b/src/pywrangler/wranglers/spark/benchmark.py index ef5cbf2..c37ea2a 100644 --- a/src/pywrangler/wranglers/spark/benchmark.py +++ b/src/pywrangler/wranglers/spark/benchmark.py @@ -2,22 +2,81 @@ """ -from typing import Union +import warnings +from typing import Callable, Iterable, Union from pyspark.sql import DataFrame -from pywrangler.benchmark import TimeProfiler +from pywrangler.benchmark import MemoryProfiler, TimeProfiler from pywrangler.wranglers.spark.base import SparkWrangler -class SparkTimeProfiler(TimeProfiler): +class SparkBaseProfiler: + """Define common methods for spark profiler. + + """ + + def _wrap_fit_transform(self) -> Callable: + """Wrapper function to call `count()` on wrangler's `fit_transform` + to enforce computation on lazily evaluated spark dataframes. 
+ + Returns + ------- + wrapped: callable + Wrapped `fit_transform` method as a function. + + """ + + def wrapped(*args, **kwargs): + return self.wrangler.fit_transform(*args, **kwargs).count() + + return wrapped + + @staticmethod + def _cache_input(dfs: Iterable[DataFrame]): + """Persist lazily evaluated spark dataframes before profiling to + capture only relevant `fit_transform`. Apply `count` to enforce + computation to create cached representation. + + Parameters + ---------- + dfs: iterable + Spark dataframes to be persisted. + + Returns + ------- + persisted: iterable + List of computed dask collections. + + """ + + for df in dfs: + df.persist() + df.count() + + @staticmethod + def _clear_cached_input(dfs: Iterable[DataFrame]): + """Unpersist previously persisted spark dataframes after profiling. + + Parameters + ---------- + dfs: iterable + Persisted spark dataframes. + + """ + + for df in dfs: + df.unpersist() + + if df.is_cached: + warnings.warn("Spark dataframe could not be unpersisted.", + ResourceWarning) + + +class SparkTimeProfiler(TimeProfiler, SparkBaseProfiler): """Approximate time that a spark wrangler instance requires to execute the `fit_transform` step. - Please note, input dataframes are cached before timing execution to ensure - timing measurements only capture wrangler's `fit_transform`. This may cause - problems if the size of input dataframes exceeds available memory. - Parameters ---------- wrangler: pywrangler.wranglers.base.BaseWrangler @@ -25,6 +84,10 @@ class SparkTimeProfiler(TimeProfiler): repetitions: None, int, optional Number of repetitions. If `None`, `timeit.Timer.autorange` will determine a sensible default. + cache_input: bool, optional + Spark dataframes may be cached before timing execution to ensure + timing measurements only capture wrangler's `fit_transform`. By + default, it is disabled. 
Attributes ---------- @@ -54,17 +117,13 @@ class SparkTimeProfiler(TimeProfiler): """ def __init__(self, wrangler: SparkWrangler, - repetitions: Union[None, int] = None): - self._wrangler = wrangler - - def wrapper(*args, **kwargs): - """Wrapper function to call `count()` to enforce computation. - - """ + repetitions: Union[None, int] = None, + cache_input: bool = False): + self.wrangler = wrangler + self.cache_input = cache_input - wrangler.fit_transform(*args, **kwargs).count() - - super().__init__(wrapper, repetitions) + func = self._wrap_fit_transform() + super().__init__(func, repetitions) def profile(self, *dfs: DataFrame, **kwargs): """Profiles timing given input dataframes `dfs` which are passed to @@ -77,20 +136,81 @@ def profile(self, *dfs: DataFrame, **kwargs): """ - # cache input dataframes - dfs_cached = [df.cache() for df in dfs] + if self.cache_input: + self._cache_input(dfs) - # enforce caching calling count() action - for df in dfs_cached: - df.count() + super().profile(*dfs, **kwargs) - super().profile(*dfs_cached, **kwargs) + if self.cache_input: + self._clear_cached_input(dfs) - # clear caches - for df in dfs_cached: - df.unpersist() - del df + return self - del dfs_cached - return self +class SparkMemoryProfiler(MemoryProfiler, SparkBaseProfiler): + """Approximate memory usage that a spark wrangler instance requires to + execute the `fit_transform` step. + + #TODO: provide implementation for profile + + Parameters + ---------- + func: callable + Callable object to be memory profiled. + repetitions: int, optional + Number of repetitions. + interval: float, optional + Defines interval duration between consecutive memory usage + measurements in seconds. + cache_input: bool, optional + Spark dataframes may be cached before timing execution to ensure + timing measurements only capture wrangler's `fit_transform`. By + default, it is disabled. + + Attributes + ---------- + measurements: list + The actual profiling measurements in bytes. 
+ best: float + The best measurement in bytes. + median: float + The median of measurements in bytes. + worst: float + The worst measurement in bytes. + std: float + The standard deviation of measurements in bytes. + runs: int + The number of measurements. + baseline_change: float + The median change in baseline memory usage across all runs in bytes. + + Methods + ------- + profile + Contains the actual profiling implementation. + report + Print simple report consisting of best, median, worst, standard + deviation and the number of measurements. + profile_report + Calls profile and report in sequence. + + + """ + + def __init__(self, wrangler: SparkWrangler, + repetitions: Union[None, int] = 5, + interval: float = 0.01, + cache_input: bool = False): + self.wrangler = wrangler + self.cache_input = cache_input + + func = self._wrap_fit_transform() + super().__init__(func, repetitions, interval) + + def profile(self, *dfs: DataFrame, **kwargs): + """Profiles timing given input dataframes `dfs` which are passed to + `fit_transform`. + + """ + + raise NotImplementedError From 358163d429539bc401e1428845520f9227922c83 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Wed, 5 Jun 2019 21:11:00 +0200 Subject: [PATCH 41/48] Remove unused `cached_property`. --- src/pywrangler/util/helper.py | 33 --------------------------------- 1 file changed, 33 deletions(-) diff --git a/src/pywrangler/util/helper.py b/src/pywrangler/util/helper.py index e05a3b8..1b6acf4 100644 --- a/src/pywrangler/util/helper.py +++ b/src/pywrangler/util/helper.py @@ -8,39 +8,6 @@ from pywrangler.util.types import T_STR_OPT_MUL -def cached_property(method: Callable) -> property: - """Decorated method will be called only on first access to calculate a - cached property value. After that, the cached value is returned. - - Parameters - --------- - method: Callable - Getter method to be lazily evaluated. 
- - Returns - ------- - property - - Notes - ----- - Credit goes to python-pptx: https://github.com/scanny/python-pptx/blob/master/pptx/util.py - - """ # noqa: E501 - - cache_attr_name = '__{}'.format(method.__name__) - docstring = method.__doc__ - - def get_prop_value(obj): - try: - return getattr(obj, cache_attr_name) - except AttributeError: - value = method(obj) - setattr(obj, cache_attr_name, value) - return value - - return property(get_prop_value, doc=docstring) - - def get_param_names(func: Callable, ignore: T_STR_OPT_MUL = None) -> List[str]: """Retrieve all parameter names for given function. From 1c910fb30934ac0c48f4d769dcf768294e488612 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Wed, 5 Jun 2019 21:11:23 +0200 Subject: [PATCH 42/48] Remove obsolete commentaries. --- src/pywrangler/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/pywrangler/__init__.py b/src/pywrangler/__init__.py index c99a72a..49ab580 100644 --- a/src/pywrangler/__init__.py +++ b/src/pywrangler/__init__.py @@ -1,8 +1,6 @@ -# -*- coding: utf-8 -*- from pkg_resources import get_distribution, DistributionNotFound try: - # Change here if project is renamed and does not equal the package name dist_name = __name__ __version__ = get_distribution(dist_name).version except DistributionNotFound: From cd1ac619148897cd3fd4a036a23ac6ce6d46f5c8 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Wed, 5 Jun 2019 21:30:36 +0200 Subject: [PATCH 43/48] Allow failing of memory test due to non deterministic behaviour. 
--- tests/wranglers/dask/test_benchmark.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/wranglers/dask/test_benchmark.py b/tests/wranglers/dask/test_benchmark.py index df4ad61..1be0db1 100644 --- a/tests/wranglers/dask/test_benchmark.py +++ b/tests/wranglers/dask/test_benchmark.py @@ -165,6 +165,8 @@ def test_dask_memory_profiler_profile_return_self(test_wrangler): assert mem_profiler.runs == 1 +@pytest.mark.xfail(reason="Succeeds locally but sometimes fails remotely due " + "to non deterministic memory management.") def test_dask_memory_profiler_cached_lower_usage(mean_wranger): pdf = pd.DataFrame(np.random.rand(1000000, 10)) df_input = dd.from_pandas(pdf, 5).mean() From e9b803d4b52f8e89d9bd3a2486f27ffc4549ac1f Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Wed, 5 Jun 2019 21:32:29 +0200 Subject: [PATCH 44/48] Remove SparkMemoryProfiler. --- src/pywrangler/wranglers/spark/benchmark.py | 73 +-------------------- 1 file changed, 3 insertions(+), 70 deletions(-) diff --git a/src/pywrangler/wranglers/spark/benchmark.py b/src/pywrangler/wranglers/spark/benchmark.py index c37ea2a..492fc49 100644 --- a/src/pywrangler/wranglers/spark/benchmark.py +++ b/src/pywrangler/wranglers/spark/benchmark.py @@ -1,5 +1,7 @@ """This module contains benchmarking utility for pandas wranglers. +TODO: implement SparkMemoryProfiler + """ import warnings @@ -7,7 +9,7 @@ from pyspark.sql import DataFrame -from pywrangler.benchmark import MemoryProfiler, TimeProfiler +from pywrangler.benchmark import TimeProfiler from pywrangler.wranglers.spark.base import SparkWrangler @@ -145,72 +147,3 @@ def profile(self, *dfs: DataFrame, **kwargs): self._clear_cached_input(dfs) return self - - -class SparkMemoryProfiler(MemoryProfiler, SparkBaseProfiler): - """Approximate memory usage that a spark wrangler instance requires to - execute the `fit_transform` step. 
- - #TODO: provide implementation for profile - - Parameters - ---------- - func: callable - Callable object to be memory profiled. - repetitions: int, optional - Number of repetitions. - interval: float, optional - Defines interval duration between consecutive memory usage - measurements in seconds. - cache_input: bool, optional - Spark dataframes may be cached before timing execution to ensure - timing measurements only capture wrangler's `fit_transform`. By - default, it is disabled. - - Attributes - ---------- - measurements: list - The actual profiling measurements in bytes. - best: float - The best measurement in bytes. - median: float - The median of measurements in bytes. - worst: float - The worst measurement in bytes. - std: float - The standard deviation of measurements in bytes. - runs: int - The number of measurements. - baseline_change: float - The median change in baseline memory usage across all runs in bytes. - - Methods - ------- - profile - Contains the actual profiling implementation. - report - Print simple report consisting of best, median, worst, standard - deviation and the number of measurements. - profile_report - Calls profile and report in sequence. - - - """ - - def __init__(self, wrangler: SparkWrangler, - repetitions: Union[None, int] = 5, - interval: float = 0.01, - cache_input: bool = False): - self.wrangler = wrangler - self.cache_input = cache_input - - func = self._wrap_fit_transform() - super().__init__(func, repetitions, interval) - - def profile(self, *dfs: DataFrame, **kwargs): - """Profiles timing given input dataframes `dfs` which are passed to - `fit_transform`. - - """ - - raise NotImplementedError From a83811410ced03958c38af49a5dc4572f48768ab Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Wed, 5 Jun 2019 21:33:07 +0200 Subject: [PATCH 45/48] Add test for caching/uncaching. 
--- tests/wranglers/spark/test_benchmark.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/tests/wranglers/spark/test_benchmark.py b/tests/wranglers/spark/test_benchmark.py index c353448..6610960 100644 --- a/tests/wranglers/spark/test_benchmark.py +++ b/tests/wranglers/spark/test_benchmark.py @@ -11,7 +11,8 @@ pyspark = pytest.importorskip("pyspark") # noqa: E402 from pywrangler.wranglers.spark.base import SparkSingleNoFit -from pywrangler.wranglers.spark.benchmark import SparkTimeProfiler +from pywrangler.wranglers.spark.benchmark import SparkTimeProfiler, \ + SparkBaseProfiler SLEEP = 0.0001 @@ -40,13 +41,27 @@ def test_spark_time_profiler_fastest(spark, wrangler_sleeps): def test_spark_time_profiler_no_caching(spark, wrangler_sleeps): - """Pyspark input dataframes are cached during time profiling. Ensure input - dataframes are released from caching after profiling. + df_input = spark.range(10).toDF("col") + + SparkTimeProfiler(wrangler_sleeps(), 1).profile(df_input) + + assert df_input.is_cached is False - """ +def test_spark_time_profiler_caching(spark, wrangler_sleeps): + """Cache is released after profiling.""" df_input = spark.range(10).toDF("col") - SparkTimeProfiler(wrangler_sleeps(), 1).profile(df_input) + SparkTimeProfiler(wrangler_sleeps(), 1, cache_input=True).profile(df_input) assert df_input.is_cached is False + + +def test_spark_base_profiler_cache_input(spark): + df = spark.range(10).toDF("col") + + SparkBaseProfiler._cache_input([df]) + assert df.is_cached is True + + SparkBaseProfiler._clear_cached_input([df]) + assert df.is_cached is False From e94fab28cc0d7c3e51dfaedda134a94f1f002332 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Wed, 5 Jun 2019 21:39:16 +0200 Subject: [PATCH 46/48] Update changelog. 
--- CHANGELOG.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index e975cfd..74025be 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,6 +7,7 @@ Version 0.1.0 This is the initial release of pywrangler. +- Add benchmark utilities for pandas, spark and dask wranglers (`#5 `_). - Add sequential ``NaiveIterator`` and vectorized ``VectorizedCumSum`` pandas implementations for ``IntervalIdentifier`` wrangler (`#2 `_). - Add ``PandasWrangler`` (`#2 `_). - Add ``IntervalIdentifier`` wrangler interface (`#2 `_). From 9d0d36382eca2646bdae43afa5396bd2e374cf85 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Wed, 5 Jun 2019 22:00:07 +0200 Subject: [PATCH 47/48] Add args and kwargs to `fit`, `transform` and `fit_transform` to allow subclasses to implement varying positional and keyword arguments (linters). --- src/pywrangler/wranglers/base.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/pywrangler/wranglers/base.py b/src/pywrangler/wranglers/base.py index 0ee47df..6dd3c54 100644 --- a/src/pywrangler/wranglers/base.py +++ b/src/pywrangler/wranglers/base.py @@ -91,13 +91,13 @@ def set_params(self, **params): return self - def fit(self): + def fit(self, *args, **kwargs): raise NotImplementedError - def transform(self): + def transform(self, *args, **kwargs): raise NotImplementedError - def fit_transform(self): + def fit_transform(self, *args, **kwargs): raise NotImplementedError def __repr__(self): From 559fb46f0aa7fd7f0b2eba08cc0686dc1d37c940 Mon Sep 17 00:00:00 2001 From: mansenfranzen Date: Wed, 5 Jun 2019 22:00:54 +0200 Subject: [PATCH 48/48] Remove unnecessary else clause. 
--- src/pywrangler/wranglers/pandas/interval_identifier.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/pywrangler/wranglers/pandas/interval_identifier.py b/src/pywrangler/wranglers/pandas/interval_identifier.py index 844fa08..b6c05a0 100644 --- a/src/pywrangler/wranglers/pandas/interval_identifier.py +++ b/src/pywrangler/wranglers/pandas/interval_identifier.py @@ -157,9 +157,8 @@ def is_valid_end(value, active): else: intermediate.append(active) - else: - # finally, add rest to result which must be invalid - result.extend([0] * len(intermediate)) + # finally, add rest to result which must be invalid + result.extend([0] * len(intermediate)) return result