Skip to content

Commit

Permalink
FIX/#199
Browse files Browse the repository at this point in the history
  • Loading branch information
joocer committed Jun 18, 2022
1 parent df98380 commit 3418db4
Show file tree
Hide file tree
Showing 14 changed files with 7,627 additions and 15 deletions.
41 changes: 35 additions & 6 deletions opteryx/engine/planner/operations/show_columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,9 @@
from opteryx.engine.attribute_types import OPTERYX_TYPES, determine_type
from opteryx.engine.planner.operations.base_plan_node import BasePlanNode
from opteryx.exceptions import SqlError
from opteryx.third_party import distogram
from opteryx.utils.columns import Columns

MAX_COLLECTOR: int = 17

def myhash(any):
from cityhash import CityHash64
Expand Down Expand Up @@ -135,6 +135,9 @@ def _extended_collector(pages):
"""
Collect summary statistics about each column
"""
from opteryx.sketches.counting_tree import CountingTree
from opteryx.third_party import distogram
from opteryx.third_party import hyperloglog

EMPTY_PROFILE = orjson.dumps(
{
Expand All @@ -147,7 +150,7 @@ def _extended_collector(pages):
"mean": None,
"quantiles": None,
"histogram": None,
"unique": 0,
"unique": None,
"most_frequent_values": None,
"most_frequent_counts": None,
"distogram": None,
Expand Down Expand Up @@ -202,11 +205,25 @@ def _extended_collector(pages):
[i for i in column_data if i not in (None, numpy.nan)]
)

if _type in (OPTERYX_TYPES.BOOLEAN, OPTERYX_TYPES.VARCHAR):
if profile[""]
if _type in (OPTERYX_TYPES.VARCHAR, OPTERYX_TYPES.NUMERIC, OPTERYX_TYPES.TIMESTAMP):
# hyperloglog estimates cardinality/uniqueness
hll = profile.get("hyperloglog")
if hll is None:
hll = hyperloglog.HyperLogLogPlusPlus(p=16)
[hll.update(value) for value in column_data]
profile["hyperloglog"] = hll

if _type in (OPTERYX_TYPES.BOOLEAN, OPTERYX_TYPES.VARCHAR, OPTERYX_TYPES.NUMERIC, OPTERYX_TYPES.TIMESTAMP):
# counter is used to collect and count unique values
counter = profile.get("counter")
if counter is None:
counter = CountingTree()
if len(counter) < MAX_COLLECTOR:
[counter.insert(value) for value in column_data if len(counter) < MAX_COLLECTOR]
profile["counter"] = counter

if _type in (OPTERYX_TYPES.NUMERIC, OPTERYX_TYPES.TIMESTAMP):
# populate the distogram
# populate the distogram, this is used for distribution statistics
if profile["distogram"] is None:
dgram = distogram.Distogram(10)
else:
Expand All @@ -223,7 +240,7 @@ def _extended_collector(pages):
for column, profile in profile_collector.items():
profile["column_name"] = columns.get_preferred_name(column)
profile["type"] = ", ".join(profile["type"])
dgram = profile.pop("distogram")
dgram = profile.pop("distogram", None)
if dgram:
profile["min"], profile["max"] = distogram.bounds(dgram)
profile["mean"] = distogram.mean(dgram)
Expand All @@ -237,8 +254,20 @@ def _extended_collector(pages):
distogram.quantile(dgram, value=0.5),
distogram.quantile(dgram, value=0.75),
)
hll = profile.pop("hyperloglog", None)
if hll:
profile["unique"] = hll.count()
buffer.append(profile)

counter = profile.pop("counter", None)
if counter:
if len(counter) < MAX_COLLECTOR:
profile["unique"] = len(counter)
counts = list(counter.values())
if min(counts) != max(counts):
profile["most_frequent_values"] = [str(k) for k in counter.keys()]
profile["most_frequent_counts"] = counts

table = pyarrow.Table.from_pylist(buffer)
table = Columns.create_table_metadata(
table=table,
Expand Down
3 changes: 3 additions & 0 deletions opteryx/sketches/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# pyximport ships with Cython; install() registers an import hook so that
# `.pyx` modules can be imported directly and are compiled on first import.
import pyximport

pyximport.install()
99 changes: 99 additions & 0 deletions opteryx/sketches/counting_tree.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
# cython: language_level=3
#!/usr/bin/env python
#coding:utf-8
# Author: mozman
# Purpose: binary tree module
# Created: 28.04.2010
# Copyright (c) 2010-2013 by Manfred Moitzi
# License: MIT License

"""
The module has been updated from its original form to improve performance.
This outperforms python dictionaries for data with a lot of duplication.
"""
from typing import List
from opteryx.third_party.abctree import ABCTree

__all__ = ['CountingTree']


cdef class Node:
    """
    Internal object, one node of the tree.

    Holds a key, an integer value (used by the tree as the occurrence count
    for that key) and links to a left and a right child. Children can be
    addressed by index: 0 is the left child, any other integer the right.
    """
    __slots__ = ('key', 'value', 'left', 'right')

    cdef readonly object key
    cdef public int value
    cdef readonly Node left
    cdef readonly Node right

    def __init__(self, key, value):
        self.key, self.value = key, value
        # a new node starts as a leaf
        self.left = self.right = None

    cdef Node get(self, int key):
        # C-level child lookup: 0 -> left, anything else -> right
        if key == 0:
            return self.left
        return self.right

    cdef void set(self, int key, Node value):
        # C-level child assignment: 0 -> left, anything else -> right
        if key != 0:
            self.right = value
            return
        self.left = value

    def __getitem__(self, key):
        # Python-visible wrapper around get()
        return self.get(key)

    def __setitem__(self, key, Node value):
        # Python-visible wrapper around set()
        self.set(key, value)

cdef class _CountingTree(object):
    """
    Unbalanced binary search tree that counts how often each key is inserted.
    see: http://en.wikipedia.org/wiki/Binary_tree

    Every distinct key is stored in one Node whose value is the number of
    times that key has been passed to insert(). The tree is additive only:
    keys can be inserted but never removed.

    NOTE(review): the dict-like interface (len, keys, values, ...) is
    presumably provided by ABCTree through the CountingTree subclass, reading
    _root and _count - confirm against opteryx.third_party.abctree.
    """
    __slots__ = ("_root", "_count")

    # root node of the tree; None while the tree is empty
    cdef public Node _root
    # number of distinct keys (nodes) held in the tree
    cdef public int _count

    def insert(self, key):
        """
        Record one occurrence of key.

        A key that is not yet in the tree gets a new node with a count of 1
        and _count is incremented; a key that is already present only has its
        node's count incremented.

        Parameters:
            key: the value to count; it must support `==` and `<=` comparison
                with the keys already in the tree.
        """
        # typing the locals lets Cython use direct attribute access and the
        # cdef get/set methods instead of Python-level item access
        cdef Node parent = None
        cdef Node node = self._root
        cdef int direction = 0

        if node is None:
            # empty tree: the first key becomes the root
            self._count += 1
            self._root = Node(key, 1)
            return

        while True:
            if node is None:
                # fell off the tree: attach a new leaf where the walk ended
                self._count += 1
                parent.set(direction, Node(key, 1))
                break
            if key == node.key:
                # key already present: bump its counter only
                node.value = node.value + 1
                break
            # descend: 0 (left) for smaller keys, 1 (right) for larger ones;
            # equality was handled above so `<=` only matches smaller keys
            parent = node
            direction = 0 if key <= node.key else 1
            node = node.get(direction)

    def remove(self, key):
        """
        Not supported.

        Raises:
            NotImplementedError: always, the tree is additive only.
        """
        raise NotImplementedError("CountingTree is additive only, you cannot remove items.")


class CountingTree(_CountingTree, ABCTree):
    """Public tree type: _CountingTree's insert/remove combined with the ABCTree base class."""
16 changes: 7 additions & 9 deletions opteryx/third_party/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,27 +2,25 @@

These are third-party modules which we include into the Opteryx codebase.

- [**accumulation_tree**](https://github.com/tkluck/accumulation_tree)
- [**distogram**](https://github.com/maki-nage/distogram)
- [**hyperloglog**](https://github.com/svpcom/hyperloglog)
- [**hyperloglog**](https://github.com/ekzhu/datasketch)
- [**pyarrow_ops**](https://github.com/TomScheffers/pyarrow_ops)
- [**bintrees**](https://github.com/mozman/bintrees)

- [**accumulation_tree**](https://github.com/tkluck/accumulation_tree)
- [**pyudorandom**](https://github.com/mewwts/pyudorandom)
- [**sketch**](https://github.com/dnbaker/sketch)
- [**tdigest**](https://github.com/CamDavidsonPilon/tdigest)
- [**uintset**](https://github.com/standupdev/uintset/)

Being in the Opteryx codebase means they are likely to have some non-annotated
deviations from the original source due to the following reasons:
Being in the Opteryx codebase means they are likely to have some non-annotated deviations from the original source due to the following reasons:

- Formatting with Black
- Resolving errors from Security Testing
- Resolving errors from Quality Testing

These modules are excluded from maintainability checks.

Other changes may have been made to improve performance, readability or to reuse
existing imports (for example, using CityHash as per other parts of Opteryx instead of
a new hash algorithm for the included library).
Other changes may have been made to improve performance, readability or to reuse existing imports (for example, using CityHash as per other parts of Opteryx instead of a new hash algorithm for the included library).

Where changes have been made to extend or alter functionality, these have been noted
inline in the code.
Where changes have been made to extend or alter functionality, these have been noted inline in the code.
5 changes: 5 additions & 0 deletions opteryx/third_party/abctree/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# pyximport ships with Cython; install() registers an import hook so that
# `.pyx` modules can be imported directly and are compiled on first import.
import pyximport

pyximport.install()

# NOTE(review): presumably `.abctree` is a Cython module, which is why the hook
# is installed before this import - confirm against the package contents.
from .abctree import ABCTree
Loading

0 comments on commit 3418db4

Please sign in to comment.