Merge pull request #1070 from mabel-dev/#1069

#1069
mabel-dev · Jun 11, 2023 · 69331f2 · 69331f2
2 parents 679b271 + 48d03e8
commit 69331f2
Show file tree

Hide file tree

Showing 4 changed files with 100 additions and 25 deletions.
diff --git a/opteryx/components/v2/temporary_physical_planner.py b/opteryx/components/v2/temporary_physical_planner.py
@@ -52,6 +52,7 @@ def create_physical_plan(logical_plan):
         #           we need a gen 2 order by that doesn't rely on the columns object
         #            node = operators.SortNode(query_properties, order=node_config["order_by"])
         elif node_type == LogicalPlanStepType.Project:
+            print(node_config)
             node = operators.NoOpNode(query_properties, **node_config)
         elif node_type == LogicalPlanStepType.Scan:
             node = operators.V2ScannerNode(query_properties, **node_config)

diff --git a/opteryx/models/relation_schema.py b/opteryx/models/relation_schema.py
@@ -15,12 +15,17 @@
 from dataclasses import dataclass
 from dataclasses import field
 
+import numpy
+
 from opteryx.constants.attribute_types import OPTERYX_TYPES
 
 
 @dataclass
 class FlatColumn:
     # This is a standard column, backed by PyArrow
+    # Unlike the other column types, we don't store the values for this column
+    # here; we go and read them from the PyArrow Table when we want them.
+
     name: str
     data_type: OPTERYX_TYPES
     description: str = None
@@ -31,9 +36,37 @@ class FlatColumn:
 class ConstantColumn(FlatColumn):
     # Rather than pass around columns of constant values, where we can we should
     # replace them with this column type.
+
+    # Note: we don't implement anything here which deals with doing operations on
+    # two constant columns. Whilst that would be a good optimization, the better
+    # place to do it is in the query optimizer, operating on the two constants
+    # while we're still working with a query plan.
+
     length: int = 0
     value: typing.Any = None
 
+    def materialize(self):
+        # when we need to expand this column out
+        return numpy.array([self.value] * self.length)
+
+
+@dataclass
+class DictionaryColumn(FlatColumn):
+    # If we know a column has a small number of unique values AND is a large column
+    # AND we're going to perform an operation on the values, we should dictionary
+    # encode the column. This allows us to operate once on each unique value in the
+    # column, rather than on each value in the column individually, at the cost of
+    # constructing and materializing the dictionary encoding.
+
+    values: typing.List[typing.Any] = field(default_factory=list)
+
+    def __post_init__(self):
+        values = numpy.asarray(self.values)
+        self.dictionary, self.encoding = numpy.unique(values, return_inverse=True)
+
+    def materialize(self):
+        return self.dictionary[self.encoding]
+
 
 @dataclass
 class RelationSchema:

diff --git a/opteryx/utils/intervals.py b/opteryx/utils/intervals.py
@@ -13,38 +13,40 @@
 import numpy
 
 
-# based on: https://stackoverflow.com/a/57321916
-# license:  https://creativecommons.org/licenses/by-sa/4.0/
 def generate_range(*args):
     """
     Combines numpy.arange and numpy.isclose to mimic
-    open, half-open and closed intervals.
-    Avoids also floating point rounding errors as with
-    >>> numpy.arange(1, 1.3, 0.1)
-    array([1. , 1.1, 1.2, 1.3])
-
-    args: [start, ]stop, [step, ]
-        as in numpy.arange
-    rtol, atol: floats
-        floating point tolerance as in numpy.isclose
-    include: boolean list-like, length 2
-        if start and end point are included
+    open, half-open, and closed intervals.
+    Avoids floating-point rounding errors like in
+    numpy.arange(1, 1.3, 0.1) returning
+    array([1. , 1.1, 1.2, 1.3]).
+
+    Args:
+        start, stop[, step]: Arguments as in numpy.arange, except start is required.
+
+    Returns:
+        numpy.ndarray: Array of evenly spaced values.
+
+    Raises:
+        ValueError: If the number of arguments is not 2 or 3.
+
+    Examples:
+        generate_range(1, 1.3, 0.1)
+        generate_range(1, 5)
+        generate_range(1, 5, 0.5)
     """
-    # process arguments
-    if len(args) == 1:
-        start = 0
-        stop = args[0]
-        step = 1
-    elif len(args) == 2:
+
+    # Process arguments
+    if len(args) == 2:
         start, stop = args
         step = 1
         stop += step
-    else:
-        assert len(args) == 3
-        start, stop, step = tuple(args)
-
-        # ensure the the last item is in the series
-        if ((stop - start) / step) % 1 == 0:
+    elif len(args) == 3:
+        start, stop, step = args
+        # Ensure the last item is in the series
+        if numpy.isclose((stop - start) / step % 1, 0):
             stop += step
+    else:
+        raise ValueError("Invalid number of arguments. Expected 2, or 3: start, stop [, step].")
 
     return numpy.arange(start, stop, step, dtype=numpy.float64)
diff --git a/tests/misc/test_relation_schema.py b/tests/misc/test_relation_schema.py
@@ -0,0 +1,39 @@
+"""
+Test the column types defined in the relation schema module
+"""
+import os
+import sys
+
+sys.path.insert(1, os.path.join(sys.path[0], "../.."))
+
+
+def test_constant_column():
+    from opteryx.models.relation_schema import ConstantColumn
+    from opteryx.constants.attribute_types import OPTERYX_TYPES
+
+    col = ConstantColumn(name="numbers", data_type=OPTERYX_TYPES.INTEGER, length=100, value=0.75)
+    val = col.materialize()
+
+    print(col)
+    print(val)
+
+    assert sum(val) == 75
+
+
+def test_dictionary_column():
+    from opteryx.models.relation_schema import DictionaryColumn
+    from opteryx.constants.attribute_types import OPTERYX_TYPES
+
+    values = [1, 2, 3, 4, 5, 4, 3, 2, 1, 2, 3, 4, 3, 2, 1, 2, 3]
+    col = DictionaryColumn(name="numbers", data_type=OPTERYX_TYPES.INTEGER, values=values)
+    val = col.materialize()
+
+    assert len(col.dictionary) == len(set(col.dictionary))
+    assert all(val[i] == v for i, v in enumerate(values))
+
+
+if __name__ == "__main__":  # pragma: no cover
+    test_constant_column()
+    test_dictionary_column()
+
+    print("✅ okay")