diff --git a/docs/Release Notes/Notices.md b/docs/Release Notes/Notices.md index a7f86939a..dca3b11f6 100644 --- a/docs/Release Notes/Notices.md +++ b/docs/Release Notes/Notices.md @@ -4,6 +4,7 @@ Opteryx uses the following libraries and components: Component | Licence ---------------------------------------------------------- | ------- +[distogram](https://github.com/maki-nage/distogram) | [MIT](https://github.com/maki-nage/distogram/blob/master/LICENSE.txt) [pyarrow](https://github.com/apache/arrow/) | [Apache 2.0](https://github.com/apache/arrow/blob/master/LICENSE.txt) [sqloxide](https://github.com/wseaton/sqloxide) | [MIT](https://github.com/wseaton/sqloxide/blob/master/LICENSE) [cython](https://github.com/cython/cython) | [Apache 2.0](https://github.com/cython/cython/blob/master/LICENSE.txt) diff --git a/opteryx/engine/planner/operations/show_columns.py b/opteryx/engine/planner/operations/show_columns.py index 200d70367..235b95564 100644 --- a/opteryx/engine/planner/operations/show_columns.py +++ b/opteryx/engine/planner/operations/show_columns.py @@ -21,12 +21,14 @@ from typing import Iterable import numpy +import orjson import pyarrow from opteryx.engine.query_statistics import QueryStatistics from opteryx.engine.attribute_types import OPTERYX_TYPES, determine_type from opteryx.engine.planner.operations.base_plan_node import BasePlanNode from opteryx.exceptions import SqlError +from opteryx.third_party import distogram from opteryx.utils.columns import Columns @@ -43,9 +45,213 @@ def myhash(any): return CityHash64(str(any)) +def _simple_collector(page): + """ + Collect the very summary type information only, we read only a single page to do + this so it's pretty quick - helpful if you want to know what fields are available + programatically. + """ + columns = Columns(page) + + buffer = [] + for column in page.column_names: + column_data = page.column(column) + _type = determine_type(str(column_data.type)) + new_row = {"column_name": columns.get_preferred_name(column), "type": _type} + buffer.append(new_row) + + table = pyarrow.Table.from_pylist(buffer) + table = Columns.create_table_metadata( + table=table, + expected_rows=len(buffer), + name="show_columns", + table_aliases=[], + ) + return table + + +def _full_collector(pages): + """ + Collect basic count information about columns, to do this we read the entire + dataset. + """ + + EMPTY_PROFILE = orjson.dumps( + { + "column_name": None, + "type": [], + "count": 0, + "min": None, + "max": None, + "missing": 0, + } + ) + + columns = None + profile_collector = {} + + for page in pages: + if columns is None: + columns = Columns(page) + + for column in page.column_names: + column_data = page.column(column) + profile = profile_collector.get(column, orjson.loads(EMPTY_PROFILE)) + _type = determine_type(str(column_data.type)) + if _type not in profile["type"]: + profile["type"].append(_type) + + profile["count"] += len(column_data) + profile["missing"] += column_data.null_count + + if _type == OPTERYX_TYPES.NUMERIC: + if profile["min"]: + profile["min"] = min(profile["min"], numpy.nanmin(column_data)) + profile["max"] = max(profile["max"], numpy.nanmax(column_data)) + else: + profile["min"] = numpy.nanmin(column_data) + profile["max"] = numpy.nanmax(column_data) + + profile_collector[column] = profile + + buffer = [] + + for column, profile in profile_collector.items(): + profile["column_name"] = columns.get_preferred_name(column) + profile["type"] = ", ".join(profile["type"]) + buffer.append(profile) + + table = pyarrow.Table.from_pylist(buffer) + table = Columns.create_table_metadata( + table=table, + expected_rows=len(buffer), + name="show_columns", + table_aliases=[], + ) + return table + + +def _extended_collector(pages): + """ + Collect summary statistics about each column + """ + + EMPTY_PROFILE = orjson.dumps( + { + "column_name": None, + "type": [], + "count": 0, + "min": None, + "max": None, + "missing": 0, + "mean": None, + "quantiles": None, + "histogram": None, + "unique": 0, + "most_frequent_values": None, + "most_frequent_counts": None, + "distogram": None, + } + ) + + columns = None + profile_collector = {} + + for page in pages: + if columns is None: + columns = Columns(page) + + for column in page.column_names: + column_data = page.column(column) + profile = profile_collector.get(column, orjson.loads(EMPTY_PROFILE)) + _type = determine_type(str(column_data.type)) + if _type not in profile["type"]: + profile["type"].append(_type) + + profile["count"] += len(column_data) + + # calculate the missing count more robustly + missing = reduce( + lambda x, y: x + 1, + (i for i in column_data if i in (None, numpy.nan)), + 0, + ) + profile["missing"] += missing + + if _type in (OPTERYX_TYPES.LIST, OPTERYX_TYPES.STRUCT, OPTERYX_TYPES.OTHER): + profile_collector[column] = profile + continue + + # convert TIMESTAMP into a NUMERIC (seconds after Linux Epoch) + if _type == OPTERYX_TYPES.TIMESTAMP: + import datetime + + to_linux_epoch = ( + lambda x: numpy.nan + if x.as_py() is None + else datetime.datetime.fromisoformat( + x.as_py().isoformat() + ).timestamp() + ) + column_data = (to_linux_epoch(i) for i in column_data) + else: + column_data = (i.as_py() for i in column_data) + + # remove empty values + column_data = numpy.array( + [i for i in column_data if i not in (None, numpy.nan)] + ) + + if _type in (OPTERYX_TYPES.BOOLEAN, OPTERYX_TYPES.VARCHAR): + if profile[""] + + if _type in (OPTERYX_TYPES.NUMERIC, OPTERYX_TYPES.TIMESTAMP): + # populate the distogram + if profile["distogram"] is None: + dgram = distogram.Distogram(10) + else: + dgram = profile["distogram"] + values, counts = numpy.unique(column_data, return_counts=True) + for index, value in enumerate(values): + dgram = distogram.update(dgram, value=value, count=counts[index]) + profile["distogram"] = dgram + + profile_collector[column] = profile + + buffer = [] + + for column, profile in profile_collector.items(): + profile["column_name"] = columns.get_preferred_name(column) + profile["type"] = ", ".join(profile["type"]) + dgram = profile.pop("distogram") + if dgram: + profile["min"], profile["max"] = distogram.bounds(dgram) + profile["mean"] = distogram.mean(dgram) + + histogram = distogram.histogram(dgram, bin_count=10) + if histogram: + profile["histogram"] = histogram[0] + + profile["quantiles"] = ( + distogram.quantile(dgram, value=0.25), + distogram.quantile(dgram, value=0.5), + distogram.quantile(dgram, value=0.75), + ) + buffer.append(profile) + + table = pyarrow.Table.from_pylist(buffer) + table = Columns.create_table_metadata( + table=table, + expected_rows=len(buffer), + name="show_columns", + table_aliases=[], + ) + return table + + class ShowColumnsNode(BasePlanNode): def __init__(self, statistics: QueryStatistics, **config): - self._full = (config.get("full"),) + self._full = config.get("full") self._extended = config.get("extended") pass @@ -67,136 +273,18 @@ def execute(self) -> Iterable: if data_pages is None: return None - if self._full: - dataset = pyarrow.concat_tables(data_pages.execute()) - else: - dataset = next(data_pages.execute()) - - source_metadata = Columns(dataset) - - buffer = [] - for column in dataset.column_names: - column_data = dataset.column(column) - _type = determine_type(str(column_data.type)) - new_row = { - "column_name": source_metadata.get_preferred_name(column), - "type": _type, - "nulls": (column_data.null_count) > 0, - } - - if self._extended: - - new_row = { - **new_row, - "count": -1, - "min": None, - "max": None, - "mean": None, - "quantiles": None, - "histogram": None, - "unique": -1, - "missing": -1, - "most_frequent_values": None, - "most_frequent_counts": None, - } - - # Basic counting statistics - new_row["count"] = len(column_data) - new_row["missing"] = reduce( - lambda x, y: x + 1, - (i for i in column_data if i in (None, numpy.nan)), - 0, - ) - # Number of unique items in the column) - # We use hashes because some types don't play nicely - values = numpy.unique( - [hash(i) for i in column_data if i not in (None, numpy.nan)] - ) - unique_values = len(values) - new_row["unique"] = unique_values - del values - - # LISTS and STRUCTS are complex, don't profile them - if _type in (OPTERYX_TYPES.LIST, OPTERYX_TYPES.STRUCT): - continue - - # convert TIMESTAMP into a NUMERIC (seconds after Linux Epoch) - if _type == OPTERYX_TYPES.TIMESTAMP: - import datetime - - to_linux_epoch = ( - lambda x: numpy.nan - if x.as_py() is None - else datetime.datetime.fromisoformat( - x.as_py().isoformat() - ).timestamp() - ) - column_data = (to_linux_epoch(i) for i in column_data) - else: - column_data = (i.as_py() for i in column_data) - - # remove empty values - column_data = numpy.array( - [i for i in column_data if i not in (None, numpy.nan)] - ) - - # don't work with long strings - if _type == OPTERYX_TYPES.VARCHAR: - if max(len(i) for i in column_data) > 32: - continue - - # For NUMERIC and TIMESTAMPS (now NUMERIC), get min, max, mean, - # quantiles and distribution - if _type in ( - OPTERYX_TYPES.NUMERIC, - OPTERYX_TYPES.TIMESTAMP, - ): - new_row["min"] = numpy.min(column_data) - new_row["max"] = numpy.max(column_data) - - # Python has no practical limits on numbers, but Arrow does - if ( - new_row["min"] < -9007199254740992 - or new_row["max"] > 9007199254740992 - ): - new_row["min"] = None - new_row["max"] = None - else: - new_row["mean"] = numpy.mean(column_data) - new_row["quantiles"] = numpy.percentile( - column_data, [25, 50, 75] - ) - new_row["histogram"], boundaries = numpy.histogram( - column_data, min(unique_values, 10) - ) - del boundaries - - # Don't work out frequencies for TIMESTAMPS - if _type not in (OPTERYX_TYPES.TIMESTAMP) and unique_values < 10: - column_data, counts = numpy.unique(column_data, return_counts=True) - # skip if everything occurs the same number of times - if max(counts) != min(counts): - top_counts = sorted(counts, reverse=True)[0:5] - most_frequent = { - str(v): c - for v, c in zip(column_data, counts) - if c in top_counts - } - new_row["most_frequent_values"] = list(most_frequent.keys()) - new_row["most_frequent_counts"] = most_frequent.values() - del most_frequent - del counts - - del column_data - - buffer.append(new_row) - - table = pyarrow.Table.from_pylist(buffer) - table = Columns.create_table_metadata( - table=table, - expected_rows=len(buffer), - name="show_columns", - table_aliases=[], - ) - yield table - return + if not (self._full or self._extended): + # if it's not full or extended, do just get the list of columns and their + # types + yield _simple_collector(next(data_pages.execute())) + return + + if self._full and not self._extended: + # we're going to read the full table, so we can count stuff + yield _full_collector(data_pages.execute()) + return + + if self._extended: + # get everything we can reasonable get + yield _extended_collector(data_pages.execute()) + return diff --git a/opteryx/third_party/distogram/LICENSE.txt b/opteryx/third_party/distogram/LICENSE.txt new file mode 100644 index 000000000..96b7dfe5d --- /dev/null +++ b/opteryx/third_party/distogram/LICENSE.txt @@ -0,0 +1,19 @@ +Copyright (c) 2020 by Romain Picard + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. \ No newline at end of file diff --git a/opteryx/third_party/distogram/README.rst b/opteryx/third_party/distogram/README.rst new file mode 100644 index 000000000..71f96f375 --- /dev/null +++ b/opteryx/third_party/distogram/README.rst @@ -0,0 +1,152 @@ +========== +DistoGram +========== + + +.. image:: https://badge.fury.io/py/distogram.svg + :target: https://badge.fury.io/py/distogram + +.. image:: https://github.com/maki-nage/distogram/workflows/Python%20package/badge.svg + :target: https://github.com/maki-nage/distogram/actions?query=workflow%3A%22Python+package%22 + :alt: Github WorkFlows + +.. image:: https://img.shields.io/codecov/c/github/maki-nage/distogram?style=plastic&color=brightgreen&logo=codecov&style=for-the-badge + :target: https://codecov.io/gh/maki-nage/distogram + :alt: Coverage + +.. image:: https://readthedocs.org/projects/distogram/badge/?version=latest + :target: https://distogram.readthedocs.io/en/latest/?badge=latest + :alt: Documentation Status + +.. image:: https://mybinder.org/badge_logo.svg + :target: https://mybinder.org/v2/gh/maki-nage/distogram/master?urlpath=notebooks%2Fexamples%2Fdistogram.ipynb + + +DistoGram is a library that allows to compute histogram on streaming data, in +distributed environments. The implementation follows the algorithms described in +Ben-Haim's `Streaming Parallel Decision Trees +`__ + +Get Started +============ + +First create a compressed representation of a distribution: + +.. code:: python + + import numpy as np + import distogram + + distribution = np.random.normal(size=10000) + + # Create and feed distogram from distribution + # on a real usage, data comes from an event stream + h = distogram.Distogram() + for i in distribution: + h = distogram.update(h, i) + + +Compute statistics on the distribution: + +.. code:: python + + nmin, nmax = distogram.bounds(h) + print("count: {}".format(distogram.count(h))) + print("mean: {}".format(distogram.mean(h))) + print("stddev: {}".format(distogram.stddev(h))) + print("min: {}".format(nmin)) + print("5%: {}".format(distogram.quantile(h, 0.05))) + print("25%: {}".format(distogram.quantile(h, 0.25))) + print("50%: {}".format(distogram.quantile(h, 0.50))) + print("75%: {}".format(distogram.quantile(h, 0.75))) + print("95%: {}".format(distogram.quantile(h, 0.95))) + print("max: {}".format(nmax)) + + +.. code:: console + + count: 10000 + mean: -0.005082954640481095 + stddev: 1.0028524290149186 + min: -3.5691130319855047 + 5%: -1.6597242392338374 + 25%: -0.6785107421744653 + 50%: -0.008672960012168916 + 75%: 0.6720718926935414 + 95%: 1.6476822301131866 + max: 3.8800560034877427 + +Compute and display the histogram of the distribution: + +.. code:: python + + hist = distogram.histogram(h) + df_hist = pd.DataFrame(np.array(hist), columns=["bin", "count"]) + fig = px.bar(df_hist, x="bin", y="count", title="distogram") + fig.update_layout(height=300) + fig.show() + +.. image:: docs/normal_histogram.png + :scale: 60% + :align: center + +Install +======== + +DistoGram is available on PyPi and can be installed with pip: + +.. code:: console + + pip install distogram + + +Play With Me +============ + +You can test this library directly on this +`live notebook `__. + + +Performances +============= + +Distogram is design for fast updates when using python types. The following +numbers show the results of the benchmark program located in the examples. + +On a i7-9800X Intel CPU, performances are: + +============ ========== ======= ========== +Interpreter Operation Numpy Req/s +============ ========== ======= ========== +pypy 7.3 update no 6563311 +pypy 7.3 update yes 111318 +CPython 3.7 update no 436709 +CPython 3.7 update yes 251603 +============ ========== ======= ========== + +On a modest 2014 13" macbook pro, performances are: + +============ ========== ======= ========== +Interpreter Operation Numpy Req/s +============ ========== ======= ========== +pypy 7.3 update no 3572436 +pypy 7.3 update yes 37630 +CPython 3.7 update no 112749 +CPython 3.7 update yes 81005 +============ ========== ======= ========== + +As you can see, your are encouraged to use pypy with python native types. Pypy's +jit is penalised by numpy native types, causing a huge performance hit. Moreover +the streaming phylosophy of Distogram is more adapted to python native types +while numpy is optimized for batch computations, even with CPython. + + +Credits +======== + +Although this code has been written by following the aforementioned research +paper, some parts are also inspired by the implementation from +`Carson Farmer `__. + +Thanks to `John Belmonte `_ for his help on +performances and accuracy improvements. \ No newline at end of file diff --git a/opteryx/third_party/distogram/__init__.py b/opteryx/third_party/distogram/__init__.py new file mode 100644 index 000000000..8b5fd8b84 --- /dev/null +++ b/opteryx/third_party/distogram/__init__.py @@ -0,0 +1,425 @@ +# type:ignore +__author__ = """Romain Picard""" +__email__ = "romain.picard@oakbits.com" +__version__ = "3.0.0" + +import math +from bisect import bisect_left +from functools import reduce +from itertools import accumulate +from operator import itemgetter +from typing import List +from typing import Optional +from typing import Tuple + +EPSILON = 1e-5 +Bin = Tuple[float, int] + + +# bins is a tuple of (cut point, count) +class Distogram(object): + """Compressed representation of a distribution.""" + + __slots__ = "bin_count", "bins", "min", "max", "diffs", "min_diff", "weighted_diff" + + def __init__(self, bin_count: int = 100, weighted_diff: bool = False): + """Creates a new Distogram object + + Args: + bin_count: [Optional] the number of bins to use. + weighted_diff: [Optional] Whether to use weighted bin sizes. + + Returns: + A Distogram object. + """ + self.bin_count: int = bin_count + self.bins: List[Bin] = list() + self.min: Optional[float] = None + self.max: Optional[float] = None + self.diffs: Optional[List[float]] = None + self.min_diff: Optional[float] = None + self.weighted_diff: bool = weighted_diff + + +def _linspace(start: float, stop: float, num: int) -> List[float]: + if num == 1: + return [start, stop] + step = (stop - start) / float(num) + values = [start + step * i for i in range(num)] + values.append(stop) + return values + + +def _moment(x: List[float], counts: List[float], c: float, n: int) -> float: + m = (ci * (v - c) ** n for i, (ci, v) in enumerate(zip(counts, x))) + return sum(m) / sum(counts) + + +def _weighted_diff(h: Distogram, left: Bin, right: Bin): + diff = left[0] - right[0] + if h.weighted_diff is True: + diff *= math.log(EPSILON + min(left[1], right[1])) + return diff + + +def _update_diffs(h: Distogram, i: int) -> None: + if h.diffs is not None: + update_min = False + + if i > 0: + if h.diffs[i - 1] == h.min_diff: + update_min = True + + h.diffs[i - 1] = _weighted_diff(h, h.bins[i], h.bins[i - 1]) + if h.diffs[i - 1] < h.min_diff: + h.min_diff = h.diffs[i - 1] + + if i < len(h.bins) - 1: + if h.diffs[i] == h.min_diff: + update_min = True + + h.diffs[i] = _weighted_diff(h, h.bins[i + 1], h.bins[i]) + if h.diffs[i] < h.min_diff: + h.min_diff = h.diffs[i] + + if update_min is True: + h.min_diff = min(h.diffs) + + return + + +def _trim(h: Distogram) -> Distogram: + while len(h.bins) > h.bin_count: + if h.diffs is not None: + i = h.diffs.index(h.min_diff) + else: + diffs = [ + (i - 1, _weighted_diff(h, b, h.bins[i - 1])) + for i, b in enumerate(h.bins[1:], start=1) + ] + i, _ = min(diffs, key=itemgetter(1)) + + v1, f1 = h.bins[i] + v2, f2 = h.bins[i + 1] + h.bins[i] = (v1 * f1 + v2 * f2) / (f1 + f2), f1 + f2 + del h.bins[i + 1] + + if h.diffs is not None: + del h.diffs[i] + _update_diffs(h, i) + h.min_diff = min(h.diffs) + + return h + + +def _trim_in_place(h: Distogram, value: float, c: int, i: int): + v, f = h.bins[i] + h.bins[i] = (v * f + value * c) / (f + c), f + c + _update_diffs(h, i) + return h + + +def _compute_diffs(h: Distogram) -> List[float]: + if h.weighted_diff is True: + diffs = [ + (v2 - v1) * math.log(EPSILON + min(f1, f2)) + for (v1, f1), (v2, f2) in zip(h.bins[:-1], h.bins[1:]) + ] + else: + diffs = [v2 - v1 for (v1, _), (v2, _) in zip(h.bins[:-1], h.bins[1:])] + h.min_diff = min(diffs) + + return diffs + + +def _search_in_place_index(h: Distogram, new_value: float, index: int) -> int: + if h.diffs is None: + h.diffs = _compute_diffs(h) + + if index > 0: + diff1 = _weighted_diff(h, (new_value, 1), h.bins[index - 1]) + diff2 = _weighted_diff(h, h.bins[index], (new_value, 1)) + + i_bin, diff = (index - 1, diff1) if diff1 < diff2 else (index, diff2) + + return i_bin if diff < h.min_diff else -1 + + return -1 + + +def update(h: Distogram, value: float, count: int = 1) -> Distogram: + """Adds a new element to the distribution. + + Args: + h: A Distogram object. + value: The value to add on the histogram. + count: [Optional] The number of times that value must be added. + + Returns: + A Distogram object where value as been processed. + + Raises: + ValueError if count is not strictly positive. + """ + if count <= 0: + raise ValueError("count must be strictly positive") + + index = 0 + if len(h.bins) > 0: + if value <= h.bins[0][0]: + index = 0 + elif value >= h.bins[-1][0]: + index = -1 + else: + index = bisect_left(h.bins, (value, 1)) + + vi, fi = h.bins[index] + if vi == value: + h.bins[index] = (vi, fi + count) + return h + + if index > 0 and len(h.bins) >= h.bin_count: + in_place_index = _search_in_place_index(h, value, index) + if in_place_index > 0: + h = _trim_in_place(h, value, count, in_place_index) + return h + + if index == -1: + h.bins.append((value, count)) + if h.diffs is not None: + diff = _weighted_diff(h, h.bins[-1], h.bins[-2]) + h.diffs.append(diff) + h.min_diff = min(h.min_diff, diff) + else: + h.bins.insert(index, (value, count)) + if h.diffs is not None: + h.diffs.insert(index, 0) + _update_diffs(h, index) + + if (h.min is None) or (h.min > value): + h.min = value + if (h.max is None) or (h.max < value): + h.max = value + + return _trim(h) + + +def merge(h1: Distogram, h2: Distogram) -> Distogram: + """Merges two Distogram objects + + Args: + h1: First Distogram. + h2: Second Distogram. + + Returns: + A Distogram object being the composition of h1 and h2. The number of + bins in this Distogram is equal to the number of bins in h1. + """ + h = reduce( + lambda residual, b: update(residual, *b), + h2.bins, + h1, + ) + return h + + +def count_at(h: Distogram, value: float): + """Counts the number of elements present in the distribution up to value. + + Args: + h: A Distogram object. + value: The value up to what elements must be counted. + + Returns: + An estimation of the real count, computed from the compressed + representation of the distribution. Returns None if the Distogram + object contains no element or value is outside of the distribution + bounds. + """ + if len(h.bins) == 0: + return None + + if value < h.min or value > h.max: + return None + + if value == h.min: + return 0 + + if value == h.max: + return count(h) + + v0, f0 = h.bins[0] + vl, fl = h.bins[-1] + if value <= v0: # left + ratio = (value - h.min) / (v0 - h.min) + result = ratio * v0 / 2 + elif value >= vl: # right + ratio = (value - vl) / (h.max - vl) + result = (1 + ratio) * fl / 2 + result += sum((f for _, f in h.bins[:-1])) + else: + i = sum(((value > v) for v, _ in h.bins)) - 1 + vi, fi = h.bins[i] + vj, fj = h.bins[i + 1] + + mb = fi + (fj - fi) / (vj - vi) * (value - vi) + result = (fi + mb) / 2 * (value - vi) / (vj - vi) + result += sum((f for _, f in h.bins[:i])) + + result = result + fi / 2 + + return result + + +def count(h: Distogram) -> float: + """Counts the number of elements in the distribution. + + Args: + h: A Distogram object. + + Returns: + The number of elements in the distribution. + """ + return sum((f for _, f in h.bins)) + + +def bounds(h: Distogram) -> Tuple[float, float]: + """Returns the min and max values of the distribution. + + Args: + h: A Distogram object. + + Returns: + A tuple containing the minimum and maximum values of the distribution. + """ + return h.min, h.max + + +def mean(h: Distogram) -> float: + """Returns the mean of the distribution. + + Args: + h: A Distogram object. + + Returns: + An estimation of the mean of the values in the distribution. + """ + p, m = zip(*h.bins) + return _moment(p, m, 0, 1) + + +def variance(h: Distogram) -> float: + """Returns the variance of the distribution. + + Args: + h: A Distogram object. + + Returns: + An estimation of the variance of the values in the distribution. + """ + p, m = zip(*h.bins) + return _moment(p, m, mean(h), 2) + + +def stddev(h: Distogram) -> float: + """Returns the standard deviation of the distribution. + + Args: + h: A Distogram object. + + Returns: + An estimation of the standard deviation of the values in the + distribution. + """ + return math.sqrt(variance(h)) + + +def histogram(h: Distogram, bin_count: int = 100) -> Tuple[List[float], List[float]]: + """Returns a histogram of the distribution in numpy format. + + Args: + h: A Distogram object. + bin_count: [Optional] The number of bins in the histogram. + + Returns: + An estimation of the histogram of the distribution, or None + if there is not enough items in the distribution. + """ + if len(h.bins) < bin_count: + return None + + bin_bounds = _linspace(h.min, h.max, num=bin_count) + counts = [count_at(h, e) for e in bin_bounds] + counts = [new - last for new, last in zip(counts[1:], counts[:-1])] + u = tuple([counts, bin_bounds]) + return u + + +def frequency_density_distribution(h: Distogram) -> Tuple[List[float], List[float]]: + """Returns a histogram of the distribution + + Args: + h: A Distogram object. + + Returns: + An estimation of the frequency density distribution, or None if + there are not enough values in the distribution. + """ + + if count(h) < 2: + return None + + bin_bounds = [float(i[0]) for i in h.bins] + bin_widths = [ + (bin_bounds[i] - bin_bounds[i - 1]) for i in range(1, len(bin_bounds)) + ] + counts = [0] + counts.extend([count_at(h, e) for e in bin_bounds[1:]]) + densities = [ + (new - last) / delta + for new, last, delta in zip(counts[1:], counts[:-1], bin_widths) + ] + return (densities, bin_bounds) + + +def quantile(h: Distogram, value: float) -> Optional[float]: + """Returns a quantile of the distribution + + Args: + h: A Distogram object. + value: The quantile to compute. Must be between 0 and 1 + + Returns: + An estimation of the quantile. Returns None if the Distogram + object contains no element or value is outside of [0, 1]. + """ + if len(h.bins) == 0: + return None + + if not (0 <= value <= 1): + return None + + total_count = count(h) + q_count = int(total_count * value) + v0, f0 = h.bins[0] + vl, fl = h.bins[-1] + + if q_count <= (f0 / 2): # left values + fraction = q_count / (f0 / 2) + result = h.min + (fraction * (v0 - h.min)) + + elif q_count >= (total_count - (fl / 2)): # right values + base = q_count - (total_count - (fl / 2)) + fraction = base / (fl / 2) + result = vl + (fraction * (h.max - vl)) + + else: + mb = q_count - f0 / 2 + mids = [(fi + fj) / 2 for (_, fi), (_, fj) in zip(h.bins[:-1], h.bins[1:])] + i, _ = next(filter(lambda i_f: mb < i_f[1], enumerate(accumulate(mids)))) + + (vi, _), (vj, _) = h.bins[i], h.bins[i + 1] + fraction = (mb - sum(mids[:i])) / mids[i] + result = vi + (fraction * (vj - vi)) + + return result diff --git a/opteryx/third_party/distogram/tests/test_bounds.py b/opteryx/third_party/distogram/tests/test_bounds.py new file mode 100644 index 000000000..e4efa9c63 --- /dev/null +++ b/opteryx/third_party/distogram/tests/test_bounds.py @@ -0,0 +1,18 @@ +import sys +import os + +sys.path.insert(1, os.path.join(sys.path[0], "../..")) +import distogram +import random + + +def test_bounds(): + normal = [random.normalvariate(0.0, 1.0) for _ in range(10000)] + h = distogram.Distogram() + + for i in normal: + h = distogram.update(h, i) + + dmin, dmax = distogram.bounds(h) + assert dmin == min(normal) + assert dmax == max(normal) diff --git a/opteryx/third_party/distogram/tests/test_count.py b/opteryx/third_party/distogram/tests/test_count.py new file mode 100644 index 000000000..1fd420b4a --- /dev/null +++ b/opteryx/third_party/distogram/tests/test_count.py @@ -0,0 +1,17 @@ +import sys +import os + +sys.path.insert(1, os.path.join(sys.path[0], "../..")) +import distogram + + +def test_count(): + h = distogram.Distogram(bin_count=3) + assert distogram.count(h) == 0 + + h = distogram.update(h, 16, count=4) + assert distogram.count(h) == 4 + h = distogram.update(h, 23, count=3) + assert distogram.count(h) == 7 + h = distogram.update(h, 28, count=5) + assert distogram.count(h) == 12 diff --git a/opteryx/third_party/distogram/tests/test_count_at.py b/opteryx/third_party/distogram/tests/test_count_at.py new file mode 100644 index 000000000..ccdd1d11e --- /dev/null +++ b/opteryx/third_party/distogram/tests/test_count_at.py @@ -0,0 +1,77 @@ +# type:ignore +import sys +import os + +sys.path.insert(1, os.path.join(sys.path[0], "../..")) +from pytest import approx +import random +import distogram + + +def test_count_at(): + h = distogram.Distogram(bin_count=3) + print(h) + + # fill histogram + h = distogram.update(h, 16, count=4) + h = distogram.update(h, 23, count=3) + h = distogram.update(h, 28, count=5) + print(h) + + actual_result = distogram.count_at(h, 25) + assert actual_result == approx(6.859999999) + + +def test_count_at_normal(): + points = 10000 + normal = [random.normalvariate(0.0, 1.0) for _ in range(points)] + h = distogram.Distogram() + + for i in normal: + h = distogram.update(h, i) + + assert distogram.count_at(h, 0) == approx(points / 2, rel=0.05) + + +def test_count_at_not_enough_elements(): + h = distogram.Distogram() + + h = distogram.update(h, 1) + h = distogram.update(h, 2) + h = distogram.update(h, 3) + + assert distogram.count_at(h, 2.5) == 2 + + +def test_count_at_left(): + h = distogram.Distogram(bin_count=6) + + for i in [1, 2, 3, 4, 5, 6, 0.7, 1.1]: + h = distogram.update(h, i) + + assert distogram.count_at(h, 0.77) == approx(0.14) + + +def test_count_at_right(): + h = distogram.Distogram(bin_count=6) + + for i in [1, 2, 3, 4, 5, 6, 6.7, 6.1]: + h = distogram.update(h, i) + + assert distogram.count_at(h, 6.5) == approx(7.307692307692308) + + +def test_count_at_empty(): + h = distogram.Distogram() + + assert distogram.count_at(h, 6.5) is None + + +def test_count_at_out_of_bouns(): + h = distogram.Distogram() + + for i in [1, 2, 3, 4, 5, 6, 6.7, 6.1]: + h = distogram.update(h, i) + + assert distogram.count_at(h, 0.2) is None + assert distogram.count_at(h, 10) is None diff --git a/opteryx/third_party/distogram/tests/test_histogram.py b/opteryx/third_party/distogram/tests/test_histogram.py new file mode 100644 index 000000000..83144f5fe --- /dev/null +++ b/opteryx/third_party/distogram/tests/test_histogram.py @@ -0,0 +1,50 @@ +# type:ignore +import sys +import os + +sys.path.insert(1, os.path.join(sys.path[0], "../..")) +import random +import numpy as np +from pytest import approx +import distogram + + +def test_histogram(): + normal = [random.normalvariate(0.0, 1.0) for _ in range(10000)] + h = distogram.Distogram(bin_count=64) + + for i in normal: + h = distogram.update(h, i) + + np_values, np_edges = np.histogram(normal, 10) + d_values, d_edges = distogram.histogram(h, 10) + + h = distogram.Distogram(bin_count=3) + h = distogram.update(h, 23) + h = distogram.update(h, 28) + h = distogram.update(h, 16) + assert distogram.histogram(h, bin_count=3) == ( + approx([1.0714285714285714, 0.6285714285714286, 1.3]), + [16.0, 20.0, 24.0, 28], + ) + assert sum(distogram.histogram(h, bin_count=3)[0]) == approx(3.0) + + +def test_histogram_on_too_small_distribution(): + h = distogram.Distogram(bin_count=64) + + for i in range(5): + h = distogram.update(h, i) + + assert distogram.histogram(h, 10) == None + + +def test_format_histogram(): + bin_count = 4 + h = distogram.Distogram(bin_count=bin_count) + + for i in range(4): + h = distogram.update(h, i) + + hist = distogram.histogram(h, bin_count=bin_count) + assert len(hist[1]) == len(hist[0]) + 1 diff --git a/opteryx/third_party/distogram/tests/test_quantile.py b/opteryx/third_party/distogram/tests/test_quantile.py new file mode 100644 index 000000000..8e576c282 --- /dev/null +++ b/opteryx/third_party/distogram/tests/test_quantile.py @@ -0,0 +1,79 @@ +# type:ignore +import sys +import os + +sys.path.insert(1, os.path.join(sys.path[0], "../..")) +from pytest import approx +import distogram + +import numpy as np +import random + + +def test_quantile(): + h = distogram.Distogram(bin_count=3) + h = distogram.update(h, 16, count=4) + h = distogram.update(h, 23, count=3) + h = distogram.update(h, 28, count=5) + + assert distogram.quantile(h, 0.5) == approx(23.625) + + +def test_quantile_not_enough_elemnts(): + h = distogram.Distogram(bin_count=10) + + for i in [12.3, 5.4, 8.2, 100.53, 23.5, 13.98]: + h = distogram.update(h, i) + + assert distogram.quantile(h, 0.5) == approx(13.14) + + +def test_quantile_on_left(): + h = distogram.Distogram(bin_count=6) + + data = [12.3, 5.2, 5.4, 4.9, 5.5, 5.6, 8.2, 30.53, 23.5, 13.98] + for i in data: + h = distogram.update(h, i) + + assert distogram.quantile(h, 0.01) == approx(np.quantile(data, 0.01), rel=0.01) + assert distogram.quantile(h, 0.05) == approx(np.quantile(data, 0.05), rel=0.05) + assert distogram.quantile(h, 0.25) == approx(np.quantile(data, 0.25), rel=0.05) + + +def test_quantile_on_right(): + h = distogram.Distogram(bin_count=6) + + data = [12.3, 8.2, 100.53, 23.5, 13.98, 200, 200.2, 200.8, 200.4, 200.1] + for i in data: + h = distogram.update(h, i) + + assert distogram.quantile(h, 0.99) == approx(np.quantile(data, 0.99), rel=0.01) + assert distogram.quantile(h, 0.85) == approx(np.quantile(data, 0.85), rel=0.01) + + +def test_normal(): + # normal = np.random.normal(0,1, 1000) + normal = [random.normalvariate(0.0, 1.0) for _ in range(10000)] + h = distogram.Distogram(bin_count=64) + + for i in normal: + h = distogram.update(h, i) + + assert distogram.quantile(h, 0.5) == approx(np.quantile(normal, 0.5), abs=0.2) + assert distogram.quantile(h, 0.95) == approx(np.quantile(normal, 0.95), abs=0.2) + + +def test_quantile_empty(): + h = distogram.Distogram() + + assert distogram.quantile(h, 0.3) is None + + +def test_quantile_out_of_bouns(): + h = distogram.Distogram() + + for i in [1, 2, 3, 4, 5, 6, 6.7, 6.1]: + h = distogram.update(h, i) + + assert distogram.quantile(h, -0.2) is None + assert distogram.quantile(h, 10) is None diff --git a/opteryx/third_party/distogram/tests/test_stats.py b/opteryx/third_party/distogram/tests/test_stats.py new file mode 100644 index 000000000..3a4434eab --- /dev/null +++ b/opteryx/third_party/distogram/tests/test_stats.py @@ -0,0 +1,22 @@ +# type:ignore +import sys +import os + +sys.path.insert(1, os.path.join(sys.path[0], "../..")) +from pytest import approx +import distogram + +import numpy as np +import random + + +def test_stats(): + normal = [random.normalvariate(0.0, 1.0) for _ in range(10000)] + h = distogram.Distogram() + + for i in normal: + h = distogram.update(h, i) + + assert distogram.mean(h) == approx(np.mean(normal), abs=0.1) + assert distogram.variance(h) == approx(np.var(normal), abs=0.1) + assert distogram.stddev(h) == approx(np.std(normal), abs=0.1) diff --git a/opteryx/third_party/distogram/tests/test_update.py b/opteryx/third_party/distogram/tests/test_update.py new file mode 100644 index 000000000..bec641e8b --- /dev/null +++ b/opteryx/third_party/distogram/tests/test_update.py @@ -0,0 +1,42 @@ +# type:ignore +import sys +import os + +sys.path.insert(1, os.path.join(sys.path[0], "../..")) +import pytest +from pytest import approx +import distogram + + +def test_update(): + h = distogram.Distogram(bin_count=3) + + # fill histogram + h = distogram.update(h, 23) + assert h.bins == [(23, 1)] + h = distogram.update(h, 28) + assert h.bins == [(23, 1), (28, 1)] + h = distogram.update(h, 16) + assert h.bins == [(16, 1), (23, 1), (28, 1)] + + # update count on existing value + h = distogram.update(h, 23) + assert h.bins == [(16, 1), (23, 2), (28, 1)] + h = distogram.update(h, 28) + assert h.bins == [(16, 1), (23, 2), (28, 2)] + h = distogram.update(h, 16) + assert h.bins == [(16, 2), (23, 2), (28, 2)] + + # merge values + h = distogram.update(h, 26) + assert h.bins[0] == (16, 2) + assert h.bins[1] == (23, 2) + assert h.bins[2][0] == approx(27.33333) + assert h.bins[2][1] == 3 + + +def test_update_with_invalid_count(): + h = distogram.Distogram(bin_count=3) + + with pytest.raises(ValueError): + distogram.update(h, 23, count=0)