From d0fe829ec72303749a3d81adc9de9869f37d47d2 Mon Sep 17 00:00:00 2001 From: Justin Joyce Date: Sun, 14 May 2023 01:11:35 +0100 Subject: [PATCH] #1034 --- opteryx/components/v2/binder.py | 90 +++++++++++++++++----- opteryx/operators/function_dataset_node.py | 2 +- opteryx/operators/internal_dataset_node.py | 8 +- opteryx/samples/__init__.py | 45 +---------- opteryx/samples/astronaut_data.py | 2 +- opteryx/samples/no_table_data.py | 2 +- opteryx/samples/planet_data.py | 19 ++++- opteryx/samples/satellite_data.py | 2 +- tests/misc/test_expressions.py | 12 +-- 9 files changed, 104 insertions(+), 78 deletions(-) diff --git a/opteryx/components/v2/binder.py b/opteryx/components/v2/binder.py index 0eb08938..388e7181 100644 --- a/opteryx/components/v2/binder.py +++ b/opteryx/components/v2/binder.py @@ -16,79 +16,127 @@ import copy +from orso.logging import get_logger + +from opteryx.exceptions import DatabaseError + +logger = get_logger() + + +class Schema: + relation: str + class BinderVisitor: def visit_node(self, node, context=None): node_type = node.node_type visit_method_name = f"visit_{node_type.split('.')[1].lower()}" visit_method = getattr(self, visit_method_name, self.visit_unsupported) - return visit_method(node, context) + result = visit_method(node, context) + if not isinstance(result, dict): + raise DatabaseError(f"function {visit_method_name} didn't return a dict") def visit_unsupported(self, node, context): raise NotImplementedError(f"No visit method implemented for node type {node.node_type}") def visit_project(self, node, context): - raise NotImplementedError("visit project") + logger.warning("visit_project not implemented") + return context def visit_filter(self, node, context): - raise NotImplementedError("visit filter") + logger.warning("visit_filter not implemented") + return context def visit_union(self, node, context): - raise NotImplementedError("visit union") + logger.warning("visit_union not implemented") + return context def visit_explain(self, node, context): - raise NotImplementedError("visit explain") + logger.warning("visit_explain not implemented") + return context def visit_difference(self, node, context): - raise NotImplementedError("visit difference") + logger.warning("visit_difference not implemented") + return context def visit_join(self, node, context): - raise NotImplementedError("visit join") + logger.warning("visit_join not implemented") + return context def visit_group(self, node, context): - raise NotImplementedError("visit group") + logger.warning("visit_group not implemented") + return context def visit_aggregate(self, node, context): - raise NotImplementedError("visit aggregate") + logger.warning("visit_aggregate not implemented") + return context def visit_scan(self, node, context): + if node.relation[0] == "$": + from opteryx import samples + + node.connector = "Internal" + _schema = samples.planets.schema + raise NotImplementedError("visit scan") + """ + - determine the source of the relation: + - sample + - in-memory + - on-disk + - storage + - collection + - sql + - if we can get the schema, do that and add it to the context + """ def visit_show(self, node, context): - raise NotImplementedError("visit show") + logger.warning("visit_show not implemented") + return context def visit_show_columns(self, node, context): - raise NotImplementedError("visit show columns") + logger.warning("visit_show_columns not implemented") + return context def visit_set(self, node, context): - raise NotImplementedError("visit set") + logger.warning("visit_set not implemented") + return context def visit_limit(self, node, context): - raise NotImplementedError("visit limit") + logger.warning("visit_limit not implemented") + return context def visit_order(self, node, context): - raise NotImplementedError("visit order") + logger.warning("visit_order not implemented") + return context def visit_distinct(self, node, context): - raise NotImplementedError("visit distinct") + logger.warning("visit_distinct not implemented") + return context def visit_cte(self, node, context): - raise NotImplementedError("visit cte") + logger.warning("visit_cte not implemented") + return context def visit_subquery(self, node, context): - raise NotImplementedError("visit subquery") + logger.warning("visit_subquery not implemented") + return context def visit_values(self, node, context): - raise NotImplementedError("visit values") + logger.warning("visit_values not implemented") + return context def visit_unnest(self, node, context): - raise NotImplementedError("visit unnest") + logger.warning("visit_unnest not implemented") + return context def visit_generate_series(self, node, context): - raise NotImplementedError("visit generate series") + logger.warning("visit_generate_series not implemented") + return context def visit_fake(self, node, context): - raise NotImplementedError("visit fake") + logger.warning("visit_fake not implemented") + return context def traverse(self, graph, node, context=None): """ diff --git a/opteryx/operators/function_dataset_node.py b/opteryx/operators/function_dataset_node.py index fab5f840..74a3e942 100644 --- a/opteryx/operators/function_dataset_node.py +++ b/opteryx/operators/function_dataset_node.py @@ -44,7 +44,7 @@ def _unnest(alias, values): # single item lists are reported as nested from opteryx.samples import no_table - list_items = evaluate(values, no_table(), True) + list_items = evaluate(values, no_table.read(), True) return [{alias: row} for row in list_items] diff --git a/opteryx/operators/internal_dataset_node.py b/opteryx/operators/internal_dataset_node.py index 9e521925..8a699da2 100644 --- a/opteryx/operators/internal_dataset_node.py +++ b/opteryx/operators/internal_dataset_node.py @@ -47,10 +47,10 @@ def _get_sample_dataset(dataset, alias, end_date): # we do this like this so the datasets are not loaded into memory unless # they are going to be used sample_datasets = { - "$satellites": samples.satellites, - "$planets": samples.planets, - "$astronauts": samples.astronauts, - "$no_table": samples.no_table, + "$satellites": samples.satellites.read, + "$planets": samples.planets.read, + "$astronauts": samples.astronauts.read, + "$no_table": samples.no_table.read, } dataset = dataset.lower() if dataset in sample_datasets: diff --git a/opteryx/samples/__init__.py b/opteryx/samples/__init__.py index 9c16986d..45a5ac97 100644 --- a/opteryx/samples/__init__.py +++ b/opteryx/samples/__init__.py @@ -12,44 +12,7 @@ import datetime - -def satellites(*args): - """load the satellite sample data""" - from .satellite_data import load - - return load() - - -def planets(end_date=datetime.datetime.utcnow().date()): - """load the planets sample data""" - from .planet_data import load - - full_set = load() - - # make planet data act like it support temporality - mask = [True, True, True, True, True, True, True, True, True] - if end_date < datetime.date(1930, 3, 13): - # March 13, 1930 - Pluto discovered by Clyde William Tombaugh - mask = [True, True, True, True, True, True, True, True, False] - if end_date < datetime.date(1846, 11, 13): - # November 13, 1846 - Neptune - mask = [True, True, True, True, True, True, True, False, False] - if end_date < datetime.date(1781, 4, 26): - # April 26, 1781 - Uranus discovered by Sir William Herschel - mask = [True, True, True, True, True, True, False, False, False] - - return full_set.filter(mask) - - -def astronauts(*args): - """load the astronaut sample data""" - from .astronaut_data import load - - return load() - - -def no_table(*args): - """load the null data table""" - from .no_table_data import load - - return load() +import opteryx.samples.astronaut_data as astronauts +import opteryx.samples.no_table_data as no_table +import opteryx.samples.planet_data as planets +import opteryx.samples.satellite_data as satellites diff --git a/opteryx/samples/astronaut_data.py b/opteryx/samples/astronaut_data.py index d76a205c..a77b2c32 100644 --- a/opteryx/samples/astronaut_data.py +++ b/opteryx/samples/astronaut_data.py @@ -37,7 +37,7 @@ import pyarrow.parquet as pq -def load(): +def read(*args): """The table is saved parquet table, base85 encoded.""" return pq.read_table( io.BytesIO( diff --git a/opteryx/samples/no_table_data.py b/opteryx/samples/no_table_data.py index 1c5125ef..53bc9bee 100644 --- a/opteryx/samples/no_table_data.py +++ b/opteryx/samples/no_table_data.py @@ -20,7 +20,7 @@ """ -def load(): +def read(*args): import pyarrow # Create a PyArrow schema with one column called 'column' of integer type diff --git a/opteryx/samples/planet_data.py b/opteryx/samples/planet_data.py index f92a8c63..6aa849a8 100644 --- a/opteryx/samples/planet_data.py +++ b/opteryx/samples/planet_data.py @@ -30,9 +30,10 @@ This has a companion dataset, $satellites, to help test joins. """ +import datetime -def load(): +def read(end_date=datetime.datetime.utcnow().date()): import pyarrow # fmt:off @@ -62,7 +63,21 @@ def load(): column_names = ["id", "name", "mass", "diameter", "density", "gravity", "escapeVelocity", "rotationPeriod", "lengthOfDay", "distanceFromSun", "perihelion", "aphelion", "orbitalPeriod", "orbitalVelocity", "orbitalInclination", "orbitalEccentricity", "obliquityToOrbit", "meanTemperature", "surfacePressure", "numberOfMoons"] # fmt: on - return pyarrow.Table.from_arrays(data, column_names) + full_set = pyarrow.Table.from_arrays(data, column_names) + + # make planet data act like it support temporality + mask = [True, True, True, True, True, True, True, True, True] + if end_date < datetime.date(1930, 3, 13): + # March 13, 1930 - Pluto discovered by Clyde William Tombaugh + mask = [True, True, True, True, True, True, True, True, False] + if end_date < datetime.date(1846, 11, 13): + # November 13, 1846 - Neptune + mask = [True, True, True, True, True, True, True, False, False] + if end_date < datetime.date(1781, 4, 26): + # April 26, 1781 - Uranus discovered by Sir William Herschel + mask = [True, True, True, True, True, True, False, False, False] + + return full_set.filter(mask) schema = { diff --git a/opteryx/samples/satellite_data.py b/opteryx/samples/satellite_data.py index a6a317da..9ce9c704 100644 --- a/opteryx/samples/satellite_data.py +++ b/opteryx/samples/satellite_data.py @@ -37,7 +37,7 @@ import pyarrow.parquet as pq -def load(): +def read(*args): """The table is saved parquet table, base85 encoded.""" return pq.read_table( diff --git a/tests/misc/test_expressions.py b/tests/misc/test_expressions.py index da516617..41456162 100644 --- a/tests/misc/test_expressions.py +++ b/tests/misc/test_expressions.py @@ -48,7 +48,7 @@ @pytest.mark.parametrize("node_type, value", LITERALS) def test_literals(node_type, value): - planets = opteryx.samples.planets() + planets = opteryx.samples.planets.read() node = ExpressionTreeNode(node_type, value=value) values = evaluate(node, table=planets) @@ -71,7 +71,7 @@ def test_logical_expressions(): illogical from a user perspective but technically correct. """ - planets = opteryx.samples.planets() + planets = opteryx.samples.planets.read() true = ExpressionTreeNode(NodeType.LITERAL_BOOLEAN, value=True) false = ExpressionTreeNode(NodeType.LITERAL_BOOLEAN, value=False) @@ -128,7 +128,7 @@ def test_logical_expressions(): def test_reading_identifiers(): - planets = opteryx.samples.planets() + planets = opteryx.samples.planets.read() names_node = ExpressionTreeNode(NodeType.IDENTIFIER, value="name") names = evaluate(names_node, planets) @@ -151,7 +151,7 @@ def test_reading_identifiers(): def test_function_operations(): - planets = opteryx.samples.planets() + planets = opteryx.samples.planets.read() name = ExpressionTreeNode(NodeType.IDENTIFIER, value="name") concat = ExpressionTreeNode( @@ -201,7 +201,7 @@ def test_function_operations(): def test_compound_expressions(): - planets = opteryx.samples.planets() + planets = opteryx.samples.planets.read() # this builds and tests the following `3.7 * gravity > mass` @@ -232,7 +232,7 @@ def test_compound_expressions(): def test_functions(): - planets = opteryx.samples.planets() + planets = opteryx.samples.planets.read() gravity = ExpressionTreeNode(NodeType.IDENTIFIER, value="gravity") _round = ExpressionTreeNode(NodeType.FUNCTION, value="ROUND", parameters=[gravity])