Skip to content

Commit

Permalink
Merge pull request #1070 from mabel-dev/#1069
Browse files Browse the repository at this point in the history
  • Loading branch information
joocer committed Jun 11, 2023
2 parents 679b271 + 48d03e8 commit 69331f2
Show file tree
Hide file tree
Showing 4 changed files with 100 additions and 25 deletions.
1 change: 1 addition & 0 deletions opteryx/components/v2/temporary_physical_planner.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ def create_physical_plan(logical_plan):
# we need a gen 2 order by that doesn't rely on the columns object
# node = operators.SortNode(query_properties, order=node_config["order_by"])
elif node_type == LogicalPlanStepType.Project:
print(node_config)
node = operators.NoOpNode(query_properties, **node_config)
elif node_type == LogicalPlanStepType.Scan:
node = operators.V2ScannerNode(query_properties, **node_config)
Expand Down
33 changes: 33 additions & 0 deletions opteryx/models/relation_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,17 @@
from dataclasses import dataclass
from dataclasses import field

import numpy

from opteryx.constants.attribute_types import OPTERYX_TYPES


@dataclass
class FlatColumn:
# This is a standard column, backed by PyArrow
# Unlike the other column types, we don't store the values for this Column
# here, we go and read them from the PyArrow Table when we want them.

name: str
data_type: OPTERYX_TYPES
description: str = None
Expand All @@ -31,9 +36,37 @@ class FlatColumn:
class ConstantColumn(FlatColumn):
    # A column in which every row holds the same value. Instead of carrying a
    # full array of repeated values around, only the value and the number of
    # rows are kept, and the array is built on demand.

    # Operations between two constant columns are intentionally not handled
    # here: although it would be a worthwhile optimization, the query optimizer
    # is the better place for it, folding the constants together while the
    # query is still a plan.

    length: int = 0
    value: typing.Any = None

    def materialize(self):
        # expand the constant into a numpy array with `length` copies of `value`
        repeated = [self.value] * self.length
        return numpy.array(repeated)


@dataclass
class DictionaryColumn(FlatColumn):
    # Dictionary encoding is worthwhile when a column is large, has only a small
    # number of distinct values, and is about to have an operation applied to
    # its values: the operation can then run once per distinct value instead of
    # once per row. The price is building the encoding and materializing it
    # again afterwards.

    values: typing.List[typing.Any] = field(default_factory=list)

    def __post_init__(self):
        # numpy.unique returns the sorted distinct values and, with
        # return_inverse=True, the index into them for every original position
        as_array = numpy.asarray(self.values)
        distinct, positions = numpy.unique(as_array, return_inverse=True)
        self.dictionary = distinct
        self.encoding = positions

    def materialize(self):
        # rebuild the full column by looking each encoded position up in the dictionary
        lookup = self.dictionary
        return lookup[self.encoding]


@dataclass
class RelationSchema:
Expand Down
52 changes: 27 additions & 25 deletions opteryx/utils/intervals.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,38 +13,40 @@
import numpy


# based on: https://stackoverflow.com/a/57321916
# license: https://creativecommons.org/licenses/by-sa/4.0/
def generate_range(*args):
    """
    Build an evenly spaced float64 series over the closed interval [start, stop].

    Unlike numpy.arange, the end point is part of the series whenever it lies
    on the step grid, and floating-point rounding can neither drop it nor add
    a value beyond it. For example, generate_range(1, 1.3, 0.1) has exactly
    four elements (approximately 1.0, 1.1, 1.2 and 1.3).

    Args:
        start, stop[, step]: The first value, the inclusive upper (or lower,
            for a negative step) bound, and the spacing. `step` defaults to 1.

    Returns:
        numpy.ndarray: float64 array of evenly spaced values. It is empty when
        `stop` cannot be reached from `start` in the direction of `step`.

    Raises:
        ValueError: If the number of arguments is not 2 or 3.

    Examples:
        generate_range(1, 5)        # 1, 2, 3, 4, 5
        generate_range(1, 5, 0.5)   # 1, 1.5, 2, ..., 5
        generate_range(5, 1, -1)    # 5, 4, 3, 2, 1
    """
    if len(args) == 2:
        start, stop = args
        step = 1
    elif len(args) == 3:
        start, stop, step = args
    else:
        raise ValueError("Invalid number of arguments. Expected 2, or 3: start, stop [, step].")

    # Number of whole steps between start and stop. Rounding errors can leave
    # this a hair above OR below an integer (e.g. 0.3 / 0.1 is just under 3),
    # so compare against the nearest integer rather than testing `ratio % 1`,
    # which only catches the "just above" case.
    ratio = (stop - start) / step
    nearest = numpy.rint(ratio)
    if numpy.isclose(ratio, nearest, rtol=0.0, atol=1e-8):
        # stop is on the grid: include it
        count = nearest + 1
    else:
        # stop is between grid points: keep everything before it
        count = numpy.ceil(ratio)

    # Hand numpy.arange a stop that sits half a step past the last wanted
    # element so rounding cannot change how many elements it produces.
    return numpy.arange(start, start + (count - 0.5) * step, step, dtype=numpy.float64)
39 changes: 39 additions & 0 deletions tests/misc/test_relation_schema.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
"""
Test the column types defined in opteryx.models.relation_schema
"""
import os
import sys

# Add the directory two levels above the script's own directory to the import
# path, so `opteryx` can be imported when this file is run directly.
sys.path.insert(1, os.path.join(sys.path[0], "../.."))


def test_constant_column():
    # A constant column of 100 rows of 0.75 should expand to values summing to 75.
    from opteryx.models.relation_schema import ConstantColumn
    from opteryx.constants.attribute_types import OPTERYX_TYPES

    column = ConstantColumn(
        name="numbers",
        data_type=OPTERYX_TYPES.INTEGER,
        length=100,
        value=0.75,
    )
    materialized = column.materialize()

    print(column)
    print(materialized)

    total = sum(materialized)
    assert total == 75


def test_dictionary_column():
    # The dictionary must hold no duplicates, and materializing the column
    # must give back the original values in their original order.
    from opteryx.models.relation_schema import DictionaryColumn
    from opteryx.constants.attribute_types import OPTERYX_TYPES

    values = [1, 2, 3, 4, 5, 4, 3, 2, 1, 2, 3, 4, 3, 2, 1, 2, 3]
    column = DictionaryColumn(
        name="numbers",
        data_type=OPTERYX_TYPES.INTEGER,
        values=values,
    )
    materialized = column.materialize()

    distinct_entries = set(column.dictionary)
    assert len(column.dictionary) == len(distinct_entries)

    for position, expected in enumerate(values):
        assert materialized[position] == expected


if __name__ == "__main__":  # pragma: no cover
    # run every test in this module when the file is executed as a script
    for check in (test_constant_column, test_dictionary_column):
        check()

    print("✅ okay")

0 comments on commit 69331f2

Please sign in to comment.