From 6d573e577706f3be822fc18c8f96192152651aaf Mon Sep 17 00:00:00 2001 From: Justin Joyce Date: Sat, 18 Jun 2022 22:15:05 +0100 Subject: [PATCH] FIX/#199 --- .../engine/planner/operations/show_columns.py | 22 +++++++++++++++---- opteryx/sketches/__init__.py | 2 +- opteryx/third_party/abctree/__init__.py | 2 +- opteryx/third_party/hyperloglog/__init__.py | 2 +- .../third_party/hyperloglog/hyperloglog.py | 10 ++++----- .../third_party/hyperloglog/tests/test_hll.py | 4 +--- tests/sketches/test_counting_tree.py | 5 +++-- 7 files changed, 30 insertions(+), 17 deletions(-) diff --git a/opteryx/engine/planner/operations/show_columns.py b/opteryx/engine/planner/operations/show_columns.py index 757013da0..fb1d03317 100644 --- a/opteryx/engine/planner/operations/show_columns.py +++ b/opteryx/engine/planner/operations/show_columns.py @@ -32,6 +32,7 @@ MAX_COLLECTOR: int = 17 + def myhash(any): from cityhash import CityHash64 @@ -205,21 +206,34 @@ def _extended_collector(pages): [i for i in column_data if i not in (None, numpy.nan)] ) - if _type in (OPTERYX_TYPES.VARCHAR, OPTERYX_TYPES.NUMERIC, OPTERYX_TYPES.TIMESTAMP): + if _type in ( + OPTERYX_TYPES.VARCHAR, + OPTERYX_TYPES.NUMERIC, + OPTERYX_TYPES.TIMESTAMP, + ): # hyperloglog estimates cardinality/uniqueness hll = profile.get("hyperloglog") if hll is None: hll = hyperloglog.HyperLogLogPlusPlus(p=16) [hll.update(value) for value in column_data] profile["hyperloglog"] = hll - - if _type in (OPTERYX_TYPES.BOOLEAN, OPTERYX_TYPES.VARCHAR, OPTERYX_TYPES.NUMERIC, OPTERYX_TYPES.TIMESTAMP): + + if _type in ( + OPTERYX_TYPES.BOOLEAN, + OPTERYX_TYPES.VARCHAR, + OPTERYX_TYPES.NUMERIC, + OPTERYX_TYPES.TIMESTAMP, + ): # counter is used to collect and count unique values counter = profile.get("counter") if counter is None: counter = CountingTree() if len(counter) < MAX_COLLECTOR: - [counter.insert(value) for value in column_data if len(counter) < MAX_COLLECTOR] + [ + counter.insert(value) + for value in column_data + if len(counter) < MAX_COLLECTOR + ] profile["counter"] = counter if _type in (OPTERYX_TYPES.NUMERIC, OPTERYX_TYPES.TIMESTAMP): diff --git a/opteryx/sketches/__init__.py b/opteryx/sketches/__init__.py index 217a58678..3a8caba6f 100644 --- a/opteryx/sketches/__init__.py +++ b/opteryx/sketches/__init__.py @@ -1,3 +1,3 @@ import pyximport -pyximport.install() \ No newline at end of file +pyximport.install() diff --git a/opteryx/third_party/abctree/__init__.py b/opteryx/third_party/abctree/__init__.py index f5c5c8244..8c9c73c40 100644 --- a/opteryx/third_party/abctree/__init__.py +++ b/opteryx/third_party/abctree/__init__.py @@ -2,4 +2,4 @@ pyximport.install() -from .abctree import ABCTree \ No newline at end of file +from .abctree import ABCTree diff --git a/opteryx/third_party/hyperloglog/__init__.py b/opteryx/third_party/hyperloglog/__init__.py index cb85881a7..062ad28b8 100644 --- a/opteryx/third_party/hyperloglog/__init__.py +++ b/opteryx/third_party/hyperloglog/__init__.py @@ -1,2 +1,2 @@ from .hyperloglog import HyperLogLog -from .hyperloglog import HyperLogLogPlusPlus \ No newline at end of file +from .hyperloglog import HyperLogLogPlusPlus diff --git a/opteryx/third_party/hyperloglog/hyperloglog.py b/opteryx/third_party/hyperloglog/hyperloglog.py index bfc35943e..9881b522e 100644 --- a/opteryx/third_party/hyperloglog/hyperloglog.py +++ b/opteryx/third_party/hyperloglog/hyperloglog.py @@ -283,7 +283,8 @@ def deserialize(cls, buf): ) except TypeError: h.reg = numpy.array( - struct.unpack_from("%dB" % h.m, bytearray(buf), offset), dtype=numpy.int8 + struct.unpack_from("%dB" % h.m, bytearray(buf), offset), + dtype=numpy.int8, ) return h @@ -306,7 +307,8 @@ def __setstate__(self, buf): ) except TypeError: self.reg = numpy.array( - struct.unpack_from("%dB" % self.m, bytearray(buf), offset), dtype=numpy.int8 + struct.unpack_from("%dB" % self.m, bytearray(buf), offset), + dtype=numpy.int8, ) @@ -338,9 +340,7 @@ class HyperLogLogPlusPlus(HyperLogLog): _hash_range_byte = 8 def __init__(self, p=8, reg=None, hashfunc=CityHash64): - super(HyperLogLogPlusPlus, self).__init__( - p=p, reg=reg, hashfunc=hashfunc - ) + super(HyperLogLogPlusPlus, self).__init__(p=p, reg=reg, hashfunc=hashfunc) def _get_threshold(self, p): return _thresholds[p - 4] diff --git a/opteryx/third_party/hyperloglog/tests/test_hll.py b/opteryx/third_party/hyperloglog/tests/test_hll.py index ba823f093..fd9bfa134 100644 --- a/opteryx/third_party/hyperloglog/tests/test_hll.py +++ b/opteryx/third_party/hyperloglog/tests/test_hll.py @@ -6,6 +6,7 @@ import sys import os + sys.path.insert(1, os.path.join(sys.path[0], "..")) from hyperloglog import HyperLogLog, HyperLogLogPlusPlus @@ -126,9 +127,6 @@ def test_copy(self): self.assertEqual(h1.hashfunc, h2.hashfunc) - - - class TestHyperLogLogPlusPlus(TestHyperLogLog): _class = HyperLogLogPlusPlus diff --git a/tests/sketches/test_counting_tree.py b/tests/sketches/test_counting_tree.py index c18ad9d8d..5cd0b9318 100644 --- a/tests/sketches/test_counting_tree.py +++ b/tests/sketches/test_counting_tree.py @@ -5,6 +5,7 @@ from opteryx.sketches.counting_tree import CountingTree + def test_counter(): pass @@ -13,7 +14,7 @@ def test_counter(): ct = CountingTree() for i in range(20): for u in range(i): - #ct.insert(i) + # ct.insert(i) ct.insert(u) - print(ct) \ No newline at end of file + print(ct)