Skip to content

Commit

Permalink
Merge pull request #1369 from mabel-dev/#1354
Browse files Browse the repository at this point in the history
  • Loading branch information
joocer committed Jan 10, 2024
2 parents c88ea93 + a11d05c commit 22462ba
Show file tree
Hide file tree
Showing 7 changed files with 102 additions and 13 deletions.
7 changes: 1 addition & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,12 +76,7 @@ Designed for data analytics in environments where decisions need to be replayabl

### __Fast__

Benchmarks on M1 Pro Mac running an ad hoc `GROUP BY` over a 1Gb parquet file via the CLI in ~1/5th of a second, from a cold start. _(different systems will have different performance characteristics)_

Rows | Columns | File Size | Query Time
------- | ------- | --------- | ----------
561225 | 81 | 1Gb | 0.22sec
1064539 | 81 | 2Gb | 0.27sec
Benchmarks on an M2 Pro Mac show an ad hoc `GROUP BY` over a 6 million row parquet file, run via the CLI, completing in about a quarter of a second from a cold start (no caching and predefined schema). _(different systems will have different performance characteristics)_

### __Instant Elasticity__

Expand Down
2 changes: 1 addition & 1 deletion opteryx/__version__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__build__ = 180
__build__ = 184

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down
2 changes: 2 additions & 0 deletions opteryx/components/heuristic_optimizer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,13 +65,15 @@

class HeuristicOptimizerVisitor:
def __init__(self):
from .strategies import ConstantFoldingStrategy
from .strategies import DefragmentMorselsStrategy
from .strategies import PredicatePushdownStrategy
from .strategies import ProjectionPushdownStrategy
from .strategies import SplitConjunctivePredicatesStrategy

self.strategies = [
# DefragmentMorselsStrategy(),
ConstantFoldingStrategy(),
SplitConjunctivePredicatesStrategy(),
PredicatePushdownStrategy(),
ProjectionPushdownStrategy(),
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .constant_folding import ConstantFoldingStrategy
from .defragment_morsels import DefragmentMorselsStrategy
from .predicate_pushdown import PredicatePushdownStrategy
from .projection_pushdown import ProjectionPushdownStrategy
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import datetime
from typing import Any

import numpy
from orso.types import OrsoTypes

from opteryx.components.logical_planner import LogicalPlan
from opteryx.components.logical_planner import LogicalPlanNode
from opteryx.components.logical_planner import LogicalPlanStepType
from opteryx.managers.expression import NodeType
from opteryx.managers.expression import evaluate
from opteryx.managers.expression import get_all_nodes_of_type
from opteryx.models import Node
from opteryx.virtual_datasets import no_table_data

from .optimization_strategy import HeuristicOptimizerContext
from .optimization_strategy import OptimizationStrategy


def build_literal_node(value: Any, root: Node):
    """
    Rewrite an expression node, in place, as a typed LITERAL holding `value`.

    Parameters:
        value: Any
            The already-evaluated result. Objects exposing an `as_py()` method
            (presumably Arrow scalars - confirm with callers) are converted via
            that method first.
        root: Node
            The expression node to overwrite.

    Returns:
        The same `root` object, with `schema_column`, `value`, `node_type`
        and `type` updated.

    Raises:
        Exception: when the value's Python type has no mapping below. In that
            case `root` is left completely unmodified.
    """
    if hasattr(value, "as_py"):
        value = value.as_py()

    # Work out the literal's type BEFORE touching the node, so an unsupported
    # value cannot leave `root` half-converted (marked LITERAL but untyped).
    if value is None:
        literal_type = OrsoTypes.NULL
    elif isinstance(value, (bool, numpy.bool_)):
        # bool is a subclass of int, so booleans must be tested before integers
        literal_type = OrsoTypes.BOOLEAN
    elif isinstance(value, str):
        literal_type = OrsoTypes.VARCHAR
    elif isinstance(value, (int, numpy.int64)):
        literal_type = OrsoTypes.INTEGER
    elif isinstance(value, (numpy.datetime64, datetime.datetime)):
        # datetime is a subclass of date, so timestamps must be tested before dates
        literal_type = OrsoTypes.TIMESTAMP
    elif isinstance(value, datetime.date):
        literal_type = OrsoTypes.DATE
    else:
        raise Exception("Unable to fold expression")

    # This conversion may also fail, so do it ahead of any assignment to root.
    flat_column = root.schema_column.to_flatcolumn()

    # Nothing below can fail part-way: apply all changes together.
    root.schema_column = flat_column
    root.value = value
    root.node_type = NodeType.LITERAL
    root.type = literal_type
    return root


def fold_constants(root: Node) -> Node:
    """
    Replace an expression that references no columns with its literal result.

    The expression is only evaluated when it contains no IDENTIFIER or
    WILDCARD nodes. It is evaluated against the `no_table_data` dataset and
    the first element of the result becomes the literal value.

    Parameters:
        root: Node
            The root of the expression tree to try to fold.

    Returns:
        The node returned by `build_literal_node` when folding succeeds,
        otherwise `root` as passed in.
    """
    identifiers = get_all_nodes_of_type(root, (NodeType.IDENTIFIER, NodeType.WILDCARD))
    if identifiers:
        # the expression depends on column data, so it isn't a constant
        return root

    table = no_table_data.read()
    try:
        result = evaluate(root, table, None)[0]
        return build_literal_node(result, root)
    except Exception:  # pylint: disable=broad-except
        # Folding is best-effort: whatever the reason, skip and keep the
        # expression as it was. Catch Exception rather than using a bare
        # except so KeyboardInterrupt and SystemExit still propagate.
        return root


class ConstantFoldingStrategy(OptimizationStrategy):
    """
    Heuristic optimizer strategy which pre-computes constant filter conditions.
    """

    def visit(
        self, node: LogicalPlanNode, context: HeuristicOptimizerContext
    ) -> HeuristicOptimizerContext:
        """
        Constant Folding is when we precalculate expressions (or sub expressions)
        which contain only constant or literal values. These don't tend to
        happen IRL, but it's a simple enough strategy so should be included.

        Only Filter steps are examined: the condition is folded and, if it
        collapses to a truthy literal, the step is removed from the optimized
        plan (with healing).

        Parameters:
            node: LogicalPlanNode
                The plan step being visited.
            context: HeuristicOptimizerContext
                The optimizer state carrying the plan being built.

        Returns:
            The (possibly updated) context.
        """
        if not context.optimized_plan:
            # first strategy to run starts from a copy of the unoptimized plan
            context.optimized_plan = context.pre_optimized_tree.copy()  # type: ignore

        if node.node_type != LogicalPlanStepType.Filter:
            return context

        folded = fold_constants(node.condition)
        node.condition = folded

        # a filter whose condition is a truthy literal filters nothing out
        if folded.node_type == NodeType.LITERAL and folded.value:
            context.optimized_plan.remove_node(context.node_id, heal=True)

        return context

    def complete(self, plan: LogicalPlan, context: HeuristicOptimizerContext) -> LogicalPlan:
        """
        Finalization hook; this strategy has nothing to finalize so the plan
        is handed back unchanged.
        """
        return plan
3 changes: 0 additions & 3 deletions opteryx/managers/expression/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,6 @@ def _inner_evaluate(root: Node, table: Table, context: ExecutionContext):
root.value = format_expression(root)
root.node_type = NodeType.EVALUATED
if node_type == NodeType.EVALUATED:
column = root.schema_column
if not root.schema_column.identity in table.column_names:
raise ColumnReferencedBeforeEvaluationError(column=root.schema_column.name)
return table[root.schema_column.identity].to_numpy()
Expand Down Expand Up @@ -308,8 +307,6 @@ def get_all_nodes_of_type(root, select_nodes):
"""
Walk a expression tree collecting all the nodes of a specified type.
"""
from opteryx.third_party.travers import Graph

if root is None:
return []
if not isinstance(root, (set, tuple, list)):
Expand Down
6 changes: 3 additions & 3 deletions tests/misc/test_expressions.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,11 +100,11 @@ def test_logical_expressions():
result = evaluate(T_AND_T, table=planets)
assert all(result)
result = evaluate(T_AND_F, table=planets)
assert not any(c.as_py() for c in result)
assert not any(c for c in result)
result = evaluate(F_AND_T, table=planets)
assert not any(c.as_py() for c in result)
assert not any(c for c in result)
result = evaluate(F_AND_F, table=planets)
assert not any(c.as_py() for c in result)
assert not any(c for c in result)

T_OR_T = Node(
NodeType.OR, left=true, right=true, schema_column=FunctionColumn(name="func", type=0)
Expand Down

0 comments on commit 22462ba

Please sign in to comment.