Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

#1354 #1369

Merged
merged 2 commits into from
Jan 10, 2024
Merged

#1354 #1369

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 1 addition & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,12 +76,7 @@ Designed for data analytics in environments where decisions need to be replayabl

### __Fast__

Benchmarks on M1 Pro Mac running an ad hoc `GROUP BY` over a 1Gb parquet file via the CLI in ~1/5th of a second, from a cold start. _(different systems will have different performance characteristics)_

Rows | Columns | File Size | Query Time
------- | ------- | --------- | ----------
561225 | 81 | 1Gb | 0.22sec
1064539 | 81 | 2Gb | 0.27sec
Benchmarks on an M2 Pro Mac run an ad hoc `GROUP BY` over a 6-million-row Parquet file via the CLI in about a quarter of a second from a cold start (no caching and predefined schema). _(Different systems will have different performance characteristics.)_

### __Instant Elasticity__

Expand Down
2 changes: 1 addition & 1 deletion opteryx/__version__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__build__ = 180
__build__ = 184

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down
2 changes: 2 additions & 0 deletions opteryx/components/heuristic_optimizer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,13 +65,15 @@

class HeuristicOptimizerVisitor:
def __init__(self):
from .strategies import ConstantFoldingStrategy
from .strategies import DefragmentMorselsStrategy
from .strategies import PredicatePushdownStrategy
from .strategies import ProjectionPushdownStrategy
from .strategies import SplitConjunctivePredicatesStrategy

self.strategies = [
# DefragmentMorselsStrategy(),
ConstantFoldingStrategy(),
SplitConjunctivePredicatesStrategy(),
PredicatePushdownStrategy(),
ProjectionPushdownStrategy(),
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .constant_folding import ConstantFoldingStrategy
from .defragment_morsels import DefragmentMorselsStrategy
from .predicate_pushdown import PredicatePushdownStrategy
from .projection_pushdown import ProjectionPushdownStrategy
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import datetime
from typing import Any

import numpy
from orso.types import OrsoTypes

from opteryx.components.logical_planner import LogicalPlan
from opteryx.components.logical_planner import LogicalPlanNode
from opteryx.components.logical_planner import LogicalPlanStepType
from opteryx.managers.expression import NodeType
from opteryx.managers.expression import evaluate
from opteryx.managers.expression import get_all_nodes_of_type
from opteryx.models import Node
from opteryx.virtual_datasets import no_table_data

from .optimization_strategy import HeuristicOptimizerContext
from .optimization_strategy import OptimizationStrategy


def build_literal_node(value: Any, root: Node):
    """
    Rewrite `root` in place so it becomes a LITERAL node holding `value`.

    Parameters:
        value: Any
            The pre-computed constant. If it exposes an `as_py()` method it
            is converted with that method before being inspected.
        root: Node
            The expression node to convert; it is mutated and returned.

    Returns:
        The same `root` object, with `schema_column` replaced by the result
        of its `to_flatcolumn()`, `value` set to the constant, `node_type`
        set to `NodeType.LITERAL` and `type` set to the matching `OrsoTypes`
        member.

    Raises:
        Exception: when `value` has a type with no mapping below. The type is
            resolved before `root` is touched, so a failed fold leaves the
            node exactly as it was passed in.
    """
    # fmt:off
    if hasattr(value, "as_py"):
        value = value.as_py()

    # Resolve the literal type first: callers treat a failure here as "skip
    # folding" and keep using `root`, so it must not be left half-converted.
    if value is None:
        literal_type = OrsoTypes.NULL
    elif isinstance(value, (bool, numpy.bool_)):
        # boolean must be before numeric (bool is a subclass of int)
        literal_type = OrsoTypes.BOOLEAN
    elif isinstance(value, str):
        literal_type = OrsoTypes.VARCHAR
    elif isinstance(value, (int, numpy.int64)):
        literal_type = OrsoTypes.INTEGER
    elif isinstance(value, (numpy.datetime64, datetime.datetime)):
        # datetime must be before date (datetime is a subclass of date)
        literal_type = OrsoTypes.TIMESTAMP
    elif isinstance(value, datetime.date):
        literal_type = OrsoTypes.DATE
    else:
        raise Exception("Unable to fold expression")

    # Compute the flattened column before any assignment so that a failure
    # in the conversion also leaves `root` unmodified.
    flat_column = root.schema_column.to_flatcolumn()

    root.schema_column = flat_column
    root.value = value
    root.node_type = NodeType.LITERAL
    root.type = literal_type
    return root
    # fmt:on


def fold_constants(root: Node) -> Node:
    """
    Try to replace an expression that references no columns with a literal.

    The expression is searched for IDENTIFIER and WILDCARD nodes; if any are
    present it is returned untouched. Otherwise it is evaluated against the
    `no_table_data` dataset and the first element of the result is handed to
    `build_literal_node` to turn `root` into a literal.

    Parameters:
        root: Node
            The root of the expression tree to fold.

    Returns:
        The folded literal node when evaluation and conversion succeed,
        otherwise `root`.
    """
    identifiers = get_all_nodes_of_type(root, (NodeType.IDENTIFIER, NodeType.WILDCARD))
    if identifiers:
        # the expression depends on column data, so it can't be precalculated
        return root

    table = no_table_data.read()
    try:
        result = evaluate(root, table, None)[0]
        return build_literal_node(result, root)
    except Exception:  # pylint: disable=broad-except
        # Folding is a best-effort optimization: whatever the reason it
        # failed, fall back to the unfolded expression. Only `Exception` is
        # caught so KeyboardInterrupt and SystemExit still propagate.
        return root


class ConstantFoldingStrategy(OptimizationStrategy):
    """
    Optimizer strategy which precalculates constant filter conditions.
    """

    def visit(
        self, node: LogicalPlanNode, context: HeuristicOptimizerContext
    ) -> HeuristicOptimizerContext:
        """
        Fold the condition of Filter steps and drop filters that are always true.

        Constant Folding precalculates expressions (or sub expressions) which
        contain only constant or literal values. When a Filter's condition
        folds down to a truthy literal the step is removed from the optimized
        plan, with the plan healed around the removed node.

        Parameters:
            node: LogicalPlanNode
                The plan step currently being visited.
            context: HeuristicOptimizerContext
                The optimizer state; its `optimized_plan` is created from a
                copy of `pre_optimized_tree` if it isn't already populated.

        Returns:
            The (possibly updated) context.
        """
        if not context.optimized_plan:
            context.optimized_plan = context.pre_optimized_tree.copy()  # type: ignore

        # only Filter steps carry a condition this strategy can fold
        if node.node_type != LogicalPlanStepType.Filter:
            return context

        node.condition = fold_constants(node.condition)
        folded = node.condition
        always_true = folded.node_type == NodeType.LITERAL and folded.value
        if always_true:
            context.optimized_plan.remove_node(context.node_id, heal=True)

        return context

    def complete(self, plan: LogicalPlan, context: HeuristicOptimizerContext) -> LogicalPlan:
        """
        Return the plan unchanged; this strategy has nothing to finalize.
        """
        return plan
3 changes: 0 additions & 3 deletions opteryx/managers/expression/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,6 @@ def _inner_evaluate(root: Node, table: Table, context: ExecutionContext):
root.value = format_expression(root)
root.node_type = NodeType.EVALUATED
if node_type == NodeType.EVALUATED:
column = root.schema_column
if not root.schema_column.identity in table.column_names:
raise ColumnReferencedBeforeEvaluationError(column=root.schema_column.name)
return table[root.schema_column.identity].to_numpy()
Expand Down Expand Up @@ -308,8 +307,6 @@ def get_all_nodes_of_type(root, select_nodes):
"""
Walk an expression tree collecting all the nodes of a specified type.
"""
from opteryx.third_party.travers import Graph

if root is None:
return []
if not isinstance(root, (set, tuple, list)):
Expand Down
6 changes: 3 additions & 3 deletions tests/misc/test_expressions.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,11 +100,11 @@ def test_logical_expressions():
result = evaluate(T_AND_T, table=planets)
assert all(result)
result = evaluate(T_AND_F, table=planets)
assert not any(c.as_py() for c in result)
assert not any(c for c in result)
result = evaluate(F_AND_T, table=planets)
assert not any(c.as_py() for c in result)
assert not any(c for c in result)
result = evaluate(F_AND_F, table=planets)
assert not any(c.as_py() for c in result)
assert not any(c for c in result)

T_OR_T = Node(
NodeType.OR, left=true, right=true, schema_column=FunctionColumn(name="func", type=0)
Expand Down