From c2b5e8e7407d26837b19887f3451b7e0c7f7e372 Mon Sep 17 00:00:00 2001 From: Justin Joyce Date: Fri, 28 Apr 2023 21:31:28 +0100 Subject: [PATCH] #1015 --- README.md | 4 +- .../rules/rule_predicate_pushdown.py | 1 + opteryx/functions/v2.py | 23 ++++++++--- opteryx/operators/blob_reader_node.py | 3 ++ opteryx/operators/file_reader_node.py | 3 ++ opteryx/operators/sql_reader_node.py | 4 ++ opteryx/third_party/query_builder/builder.py | 4 +- testdata/flat/planets/parquet/planets.parquet | Bin 0 -> 9200 bytes .../test_predicate_pushdown_parquet.py | 36 ++++++++++++++++++ .../test_predicate_pushdown_postgres.py | 6 +++ 10 files changed, 74 insertions(+), 10 deletions(-) create mode 100644 testdata/flat/planets/parquet/planets.parquet diff --git a/README.md b/README.md index b89391063..756b5bc68 100644 --- a/README.md +++ b/README.md @@ -282,6 +282,6 @@ Opteryx is in beta. Beta means different things to different people, to us, bein ## Related Projects - **[HadroDB](https://github.com/mabel-dev/hadrodb)** Data storage engine -- **[mabel](https://github.com/mabel-dev/mabel)** platform for authoring +- **[mabel](https://github.com/mabel-dev/mabel)** Streaming data APIs - **[mesos](https://github.com/mabel-dev/mesos)** MySQL connector for Opteryx -- **[orso](https://github.com/mabel-dev/orso)** DataFrame librarydata processing systems +- **[orso](https://github.com/mabel-dev/orso)** DataFrame library diff --git a/opteryx/components/tree_rewriter/rules/rule_predicate_pushdown.py b/opteryx/components/tree_rewriter/rules/rule_predicate_pushdown.py index c1fd60010..9285b0f00 100644 --- a/opteryx/components/tree_rewriter/rules/rule_predicate_pushdown.py +++ b/opteryx/components/tree_rewriter/rules/rule_predicate_pushdown.py @@ -46,6 +46,7 @@ def predicate_pushdown(plan, properties): ( operators.BlobReaderNode, operators.CollectionReaderNode, + operators.FileReaderNode, operators.SqlReaderNode, ) ) diff --git a/opteryx/functions/v2.py b/opteryx/functions/v2.py index 8c5808ed3..54d5862d4 100644 --- a/opteryx/functions/v2.py +++ b/opteryx/functions/v2.py @@ -43,7 +43,9 @@ def get_functions(): if function_name[0] != "_": # python CamelCase to SQL SNAKE_CASE function_name = CAMEL_TO_SNAKE.sub("_", function_name).upper() - functions[function_name] = function_implementation + if function_name.startswith("FUNCTION_"): + function_name = function_name[9:] + functions[function_name] = function_implementation return functions @@ -80,7 +82,7 @@ def __str__(self): return f"{self.__class__.__name__.upper()} () → \n{self.__doc__.strip()}" -class CurrentTime(_BaseFunction): +class FunctionCurrentTime(_BaseFunction): """Return the current system time.""" style = _FunctionStyle.CONSTANT @@ -90,7 +92,7 @@ def _func(self) -> datetime.time: return datetime.datetime.utcnow().time() -class E(_BaseFunction): +class FunctionE(_BaseFunction): """Return Euler's number.""" style = _FunctionStyle.CONSTANT @@ -100,7 +102,16 @@ def _func(self) -> float: return 2.71828182845904523536028747135266249775724709369995 -class Phi(_BaseFunction): +class FunctionLen(_BaseFunction): + """return the length of a VARCHAR or ARRAY""" + + style = _FunctionStyle.ELEMENTWISE + + def _func(self, item: typing.Union[list, str]) -> int: + return len(item) + + +class FunctionPhi(_BaseFunction): """Return the golden ratio.""" style = _FunctionStyle.CONSTANT @@ -110,7 +121,7 @@ def _func(self) -> float: return 1.61803398874989484820458683436563811772030917980576 -class Pi(_BaseFunction): +class FunctionPi(_BaseFunction): """Return Pi.""" style = _FunctionStyle.CONSTANT @@ -120,7 +131,7 @@ def _func(self) -> float: return 3.14159265358979323846264338327950288419716939937510 -class Version(_BaseFunction): +class FunctionVersion(_BaseFunction): """Return the version of the query engine.""" style = _FunctionStyle.CONSTANT diff --git a/opteryx/operators/blob_reader_node.py b/opteryx/operators/blob_reader_node.py index a36f4a775..26d3b7a7a 100644 --- a/opteryx/operators/blob_reader_node.py +++ b/opteryx/operators/blob_reader_node.py @@ -146,6 +146,9 @@ def push_predicate(self, predicate): # these to DNF at read time, for everything else, we run the selection nodes from opteryx.connectors.capabilities import predicate_pushable + # Appears to be a bug in how pyarrow does indentifier filters so don't push these + if predicate.left.token_type == predicate.right.token_type == NodeType.IDENTIFIER: + return False dnf = predicate_pushable.to_dnf(predicate) if dnf is None: # we can't push all predicates everywhere diff --git a/opteryx/operators/file_reader_node.py b/opteryx/operators/file_reader_node.py index 6292dd961..c13afaf10 100644 --- a/opteryx/operators/file_reader_node.py +++ b/opteryx/operators/file_reader_node.py @@ -89,6 +89,9 @@ def push_predicate(self, predicate): # these to DNF at read time, for everything else, we run the selection nodes from opteryx.connectors.capabilities import predicate_pushable + # Appears to be a bug in how pyarrow does indentifier filters so don't push these + if predicate.left.token_type == predicate.right.token_type == NodeType.IDENTIFIER: + return False if predicate_pushable.to_dnf(predicate) is None: # we can't push all predicates everywhere return False diff --git a/opteryx/operators/sql_reader_node.py b/opteryx/operators/sql_reader_node.py index e95f84ff9..6fef5855b 100644 --- a/opteryx/operators/sql_reader_node.py +++ b/opteryx/operators/sql_reader_node.py @@ -22,6 +22,7 @@ import pyarrow +from opteryx.managers.expression import NodeType from opteryx.models import QueryProperties from opteryx.models.columns import Columns from opteryx.operators import BasePlanNode @@ -59,6 +60,9 @@ def can_push_selection(self): return not self._disable_selections def push_predicate(self, predicate): + # SQL reader currently assumes the right parameter is a literal + if predicate.left.token_type == predicate.right.token_type == NodeType.IDENTIFIER: + return False if self.can_push_selection: return self._reader.push_predicate(predicate) return False diff --git a/opteryx/third_party/query_builder/builder.py b/opteryx/third_party/query_builder/builder.py index 3a4d72e82..0219a2363 100644 --- a/opteryx/third_party/query_builder/builder.py +++ b/opteryx/third_party/query_builder/builder.py @@ -105,11 +105,11 @@ def _lines_keyword(self, keyword, things): if thing.keyword: yield thing.keyword + "\n" - format = self.formats[bool(thing.alias)][keyword] + format_ = self.formats[bool(thing.alias)][keyword] value = thing.value if thing.is_subquery: value = f"(\n{self._indent(value)}\n)" - yield self._indent(format.format(value=value, alias=thing.alias)) + yield self._indent(format_.format(value=value, alias=thing.alias)) if not last and not thing.keyword: try: diff --git a/testdata/flat/planets/parquet/planets.parquet b/testdata/flat/planets/parquet/planets.parquet new file mode 100644 index 0000000000000000000000000000000000000000..4658b7bde8e709ea970e6314ba0a139411bd1948 GIT binary patch literal 9200 zcmcgyeQaCR6~E81ljq_P6Jpl^1A!)YVFaAS2}vNu@7YeAr1@~t)Je>QupP&C>yOkQ zNfTfSsFbc7TF0QH6&qbwK8CugETyZ^gtDp980v%sjQz6>*s7|GZGw#<{wQeY-uIrL zKi(7DH2hTC@1A?_x##yg=brn~c=@Rglh$;QG2J_AsyCr}god;X(lbb-uhF8aS{7;H zqk~U1d}?aheijLjUi>v7Q1t@PJ&?_%lguXL zj&vZI)7{WN9$w7ml3~)@8_#7^rF`6uT6I*dMMt$lI|bAStF0!3h+3NrB*GN;8IAd; z8vLvd|FOzirbiJTcyeDJ-jMr-hd-D+d;M=6_cfgKytjHa_ueX_Z{5tbR6qEH>zfxG zh+)|0zt1r>MzrxDXJ;7g`VPIZb#0D&y1(tMBNrcal%lZ+?uSkmR%%1M3#bq(k}BhT zS8p|Kl5i#hnGD7Gn#O!igMZ25r)t2#<+@ZxP-9?8^6wBL2*YRCo{3^iiG-E4L)YXN9rV(|0jbe zqF!T6^P@1uN&4cnM-DTLp_cp2c?jnlIlU2W(X)oe9uDak_9kvY$DkW)jm?-_pwC4T z;#N~rVFEjH7#`?hVWl?22jxA;c%_%zL7`y24S zs{wGiGL<0|0mCeh%Wk&o7{gT*FWXG;v6x^*4dC95G057yzEwNgP3CnEv?9;PYV&$y z3_we5q`*T(Qe{6MY?NVN8P0?Pi(y|lo(f6mA%j1!!T;EdFPSBG7im=)OAY-)a*jg_ z4Ab`c6CBuu)?Pic0Y&4*9*#yquMh=WIA*hPVWEWsrnDU!x~A5~c4z?4&7qC*iB6pk z8ij}_I*E1VQe`HNUsu3HI+YD%W2vM!oQ|c0wG49I2CQXPeBl~kqFk%WSZYkH=HsA= zBKrC%4sxFM!ZQ%`sF&n)D5Qb3WeQHW$Q0Dg$1>SK zG87(6rxM$9Nht%K)!;+T`0`CsAQY)p8BC3aJu(e2`?VfUS7UhBPZ#x25K_ULF}=mm z{0Im0qxq&Mgz(?_jLy=w_O%xU#-?9XAOXN3(n3uC$dgb1#ZqMwUb$H&VbdZ^f@nAn z(?P<2T7!?a;sak1Q$n#ug1>xpq=vsRC6GACObQtLNE{gLRUv}B4W|UGd%r=0;f3$% zo3zW%!P>w63n3IPzW#Fs7DyNC;DBZT&?aT6ddfk$ zRGEYG-7*JtA_t|*Bc#NWUHFh4SSVLZSSXjOvQQESR=V(A!RhJ@U?1!d44*sb_UK~= z)6jsB7HDA@^nR_S`Lcr!0}Dj?x+v919!G0=v!J~!Rp#KWKAD4hk%Qr6C>~4F6;etG z5DaH}@v$C>h9b=>>@8b^k#Qlz^&NZD%N5A8E;iEUk zg2uCs5rgKIN6bHV{Ceq?r?0X7#NnU6^PP!DpL4uCOe*#{*b`O|y{@oxl4>lhq?_1T zE>-4b?_Dx4SBtz1ghJtDHXX}vpCC`XrNI~b@u@q3m~z!B1F8`t#|J0S@I#0Pc-j`| z{;`L~OCP7RXwS2-e6HUC>qll*Z)qLv=KuvlNAFRH4@rfEm6Q^*%B9d>Ax}K+lzA|w zg7Mhy9Bdw^Qxha*6hy}-9R8C7NGR88QQAgTUWb$s;UOsWOlvLO2pm=Z}O4)iQ_yM{3|R0iV0zb1mFuT`8bJV)1rC)Ks#33Q~myMW|59s6tIy zjuG{j%so+B60ZWo#ZE=$`bKNx<%Tcy1{42ZE(-CNd<};yaNC8y{Hvu3T;|}4U7X;G zT}w7`|!u;U8ui*e;Pp%vT>;!5qq%*Q*; z^_Vpuw85FW(P}a9Q&ST&cL=*O%aaBhdAcG4JkL+UhdkSOewlYh_(7hJ@U94zIh{N| zM&4XJKTLk1Ed4@(c8!3H?;_2%ExSS31#0;1r0k49*&P9O{BF{1XcE-$y`=0SIxAH{5+WABC6aOG7kM3B1RDH;`e~N*JkU5(twK?PK<(wzS1|o9e$j-`I(NGe|E&{ z-_@}c?Cf48bzbkP%?5Vy1RgwLsXRGGK~PY6D!>ZTc4)jKl<-IAI(?I&gl{!u_pSIR z-C>*8W`kN6zim~r$E|D+vDKON&kW{#9$z-(@vY3wPJzxH{J@f3uqWb1!0c4F&(r4(B}NwKX|;2kZ9%lhxwB8?N3@4RllgHHS2~wy zU(aA>YG!<=f2O-Vl!)&ndimi|vHvi?!z@Mc1+LDbn)4x9x+~A1EM~O&kzh+ z$mb|22R*)=-#(R^?(`4L&FPsF`L>hGMQqEi0}{S$*(zcAgPJSq4Wk^*EeLEo~F zmuaP-*~v#0_0cE`%n)S}9uhM24~eOhv={Or?a6Q5iY!fwf%NWNID2boaq-qbI-Oc> kS_-E#@INI@9UZr|-_{PtN;tj$&;QUCBm8v={H@Qw0U4CTFaQ7m literal 0 HcmV?d00001 diff --git a/tests/query_planning/test_predicate_pushdown_parquet.py b/tests/query_planning/test_predicate_pushdown_parquet.py index 83d51384d..ce25e0976 100644 --- a/tests/query_planning/test_predicate_pushdown_parquet.py +++ b/tests/query_planning/test_predicate_pushdown_parquet.py @@ -66,6 +66,42 @@ def test_predicate_pushdowns_blobs_parquet(): assert cur.stats["rows_read"] == 711, cur.stats config.ONLY_PUSH_EQUALS_PREDICATES = False + # identifier = identifier isn't pushed - file reader + config.ONLY_PUSH_EQUALS_PREDICATES = True + cur = conn.cursor() + cur.execute( + "SELECT * FROM 'testdata/flat/planets/parquet/planets.parquet' WITH(NO_PARTITION) WHERE rotationPeriod = lengthOfDay;" + ) + assert cur.rowcount == 3, cur.rowcount + assert cur.stats["rows_read"] == 9, cur.stats + config.ONLY_PUSH_EQUALS_PREDICATES = False + + # push the >, not the eq + cur = conn.cursor() + cur.execute( + "SELECT * FROM 'testdata/flat/planets/parquet/planets.parquet' WITH(NO_PARTITION) WHERE rotationPeriod = lengthOfDay AND id > 5;" + ) + assert cur.rowcount == 2, cur.rowcount + assert cur.stats["rows_read"] == 4, cur.stats + + # identifier = identifier isn't pushed - blob reader + config.ONLY_PUSH_EQUALS_PREDICATES = True + cur = conn.cursor() + cur.execute( + "SELECT * FROM 'testdata.flat.planets.parquet' WITH(NO_PARTITION) WHERE rotationPeriod = lengthOfDay;" + ) + assert cur.rowcount == 3, cur.rowcount + assert cur.stats["rows_read"] == 9, cur.stats + config.ONLY_PUSH_EQUALS_PREDICATES = False + + # identifier = identifier isn't pushed - blob reader + cur = conn.cursor() + cur.execute( + "SELECT * FROM 'testdata.flat.planets.parquet' WITH(NO_PARTITION) WHERE rotationPeriod = lengthOfDay AND id > 5;" + ) + assert cur.rowcount == 2, cur.rowcount + assert cur.stats["rows_read"] == 4, cur.stats + conn.close() diff --git a/tests/query_planning/test_predicate_pushdown_postgres.py b/tests/query_planning/test_predicate_pushdown_postgres.py index f7175a5e0..1d045b5c7 100644 --- a/tests/query_planning/test_predicate_pushdown_postgres.py +++ b/tests/query_planning/test_predicate_pushdown_postgres.py @@ -71,6 +71,12 @@ def test_predicate_pushdowns_postgres_eq(): assert cur.rowcount == 1, cur.rowcount assert cur.stats["rows_read"] == 2, cur.stats + # identifier = identifier isn't pushed to SQL engines + cur = conn.cursor() + cur.execute("SELECT * FROM pg.planets WHERE rotation_period = length_of_day AND id > 5;") + assert cur.rowcount == 2, cur.rowcount + assert cur.stats["rows_read"] == 4, cur.stats + conn.close()