diff --git a/opteryx/components/v2/temporary_physical_planner.py b/opteryx/components/v2/temporary_physical_planner.py
index 781bdcf6..ce605bfa 100644
--- a/opteryx/components/v2/temporary_physical_planner.py
+++ b/opteryx/components/v2/temporary_physical_planner.py
@@ -52,6 +52,7 @@ def create_physical_plan(logical_plan):
             # we need a gen 2 order by that doesn't rely on the columns object
             # node = operators.SortNode(query_properties, order=node_config["order_by"])
         elif node_type == LogicalPlanStepType.Project:
+            print(node_config)
             node = operators.NoOpNode(query_properties, **node_config)
         elif node_type == LogicalPlanStepType.Scan:
             node = operators.V2ScannerNode(query_properties, **node_config)
diff --git a/opteryx/models/relation_schema.py b/opteryx/models/relation_schema.py
index ced61329..ac3d5614 100644
--- a/opteryx/models/relation_schema.py
+++ b/opteryx/models/relation_schema.py
@@ -15,12 +15,17 @@
 from dataclasses import dataclass
 from dataclasses import field
 
+import numpy
+
 from opteryx.constants.attribute_types import OPTERYX_TYPES
 
 
 @dataclass
 class FlatColumn:
     # This is a standard column, backed by PyArrow
+    # Unlike the other column types, we don't store the values for this Column
+    # here, we go and read them from the PyArrow Table when we want them.
+
     name: str
     data_type: OPTERYX_TYPES
     description: str = None
@@ -31,9 +36,37 @@ class FlatColumn:
 class ConstantColumn(FlatColumn):
     # Rather than pass around columns of constant values, where we can we should
     # replace them with this column type.
+
+    # note we don't implement anything here which deals with doing operations on
+    # two constant columns; whilst that would be a good optimization, the better
+    # way to do this is in the query optimizer, do operations on two constants
+    # while we're still working with a query plan.
+
     length: int = 0
     value: typing.Any = None
 
+    def materialize(self):
+        # when we need to expand this column out
+        return numpy.array([self.value] * self.length)
+
+
+@dataclass
+class DictionaryColumn(FlatColumn):
+    # If we know a column has a small amount of unique values AND is a large column
+    # AND we're going to perform an operation on the values, we should dictionary
+    # encode the column. This allows us to operate once on each unique value in the
+    # column, rather than each value in the column individually. At the cost of
+    # constructing and materializing the dictionary encoding.
+
+    values: typing.List[typing.Any] = field(default_factory=list)
+
+    def __post_init__(self):
+        values = numpy.asarray(self.values)
+        self.dictionary, self.encoding = numpy.unique(values, return_inverse=True)
+
+    def materialize(self):
+        return self.dictionary[self.encoding]
+
 
 @dataclass
 class RelationSchema:
diff --git a/opteryx/utils/intervals.py b/opteryx/utils/intervals.py
index ccb0d731..3fcb77eb 100644
--- a/opteryx/utils/intervals.py
+++ b/opteryx/utils/intervals.py
@@ -13,38 +13,47 @@
 import numpy
 
 
 # based on: https://stackoverflow.com/a/57321916
 # license: https://creativecommons.org/licenses/by-sa/4.0/
 def generate_range(*args):
     """
     Combines numpy.arange and numpy.isclose to mimic
-    open, half-open and closed intervals.
-    Avoids also floating point rounding errors as with
-    >>> numpy.arange(1, 1.3, 0.1)
-    array([1. , 1.1, 1.2, 1.3])
-
-    args: [start, ]stop, [step, ]
-        as in numpy.arange
-    rtol, atol: floats
-        floating point tolerance as in numpy.isclose
-    include: boolean list-like, length 2
-        if start and end point are included
+    open, half-open, and closed intervals.
+    Avoids floating-point rounding errors like in
+    numpy.arange(1, 1.3, 0.1) returning
+    array([1. , 1.1, 1.2, 1.3]).
+
+    Args:
+        start, stop[, step]: As in numpy.arange, except that stop is
+            included when it lands on the step grid; step defaults to 1.
+
+    Returns:
+        numpy.ndarray: Array of evenly spaced values.
+
+    Raises:
+        ValueError: If the number of arguments is not 2 or 3.
+
+    Examples:
+        generate_range(1, 5)
+        generate_range(1, 5, 0.5)
     """
-    # process arguments
-    if len(args) == 1:
-        start = 0
-        stop = args[0]
-        step = 1
-    elif len(args) == 2:
+
+    # Process arguments
+    if len(args) == 2:
         start, stop = args
         step = 1
         stop += step
-    else:
-        assert len(args) == 3
-        start, stop, step = tuple(args)
-
-        # ensure the the last item is in the series
-        if ((stop - start) / step) % 1 == 0:
-            stop += step
+    elif len(args) == 3:
+        start, stop, step = args
+        # Ensure the last item is in the series. Compare the step count with its
+        # nearest whole number: testing `% 1` against 0 misses quotients that
+        # rounding leaves just below an integer (0.3 / 0.1 -> 2.9999999999999996).
+        steps = (stop - start) / step
+        if numpy.isclose(steps, round(steps), rtol=0, atol=1e-8):
+            # Half a step is enough to pull stop inside numpy.arange's half-open
+            # interval and can never add an element beyond stop.
+            stop += step / 2
+    else:
+        raise ValueError("Invalid number of arguments. Expected 2, or 3: start, stop [, step].")
 
     return numpy.arange(start, stop, step, dtype=numpy.float64)
diff --git a/tests/misc/test_relation_schema.py b/tests/misc/test_relation_schema.py
new file mode 100644
index 00000000..f8f861a1
--- /dev/null
+++ b/tests/misc/test_relation_schema.py
@@ -0,0 +1,39 @@
+"""
+Test the ConstantColumn and DictionaryColumn relation schema column types
+"""
+import os
+import sys
+
+sys.path.insert(1, os.path.join(sys.path[0], "../.."))
+
+
+def test_constant_column():
+    from opteryx.models.relation_schema import ConstantColumn
+    from opteryx.constants.attribute_types import OPTERYX_TYPES
+
+    col = ConstantColumn(name="numbers", data_type=OPTERYX_TYPES.INTEGER, length=100, value=0.75)
+    val = col.materialize()
+
+    print(col)
+    print(val)
+
+    assert sum(val) == 75
+
+
+def test_dictionary_column():
+    from opteryx.models.relation_schema import DictionaryColumn
+    from opteryx.constants.attribute_types import OPTERYX_TYPES
+
+    values = [1, 2, 3, 4, 5, 4, 3, 2, 1, 2, 3, 4, 3, 2, 1, 2, 3]
+    col = DictionaryColumn(name="numbers", data_type=OPTERYX_TYPES.INTEGER, values=values)
+    val = col.materialize()
+
+    assert len(col.dictionary) == len(set(col.dictionary))
+    assert all(val[i] == v for i, v in enumerate(values))
+
+
+if __name__ == "__main__":  # pragma: no cover
+    test_constant_column()
+    test_dictionary_column()
+
+    print("✅ okay")