diff --git a/README.md b/README.md index 0f3c2bf7..d424d95d 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@

-Opteryx is a SQL Engine designed for embedded and cloud-native environments, and with command-line skills. +Opteryx is an in-process SQL query engine for analysis of distributed datasets. [Documentation](https://opteryx.dev/latest) | [Examples](#examples) | @@ -26,7 +26,7 @@ Opteryx is a SQL Engine designed for embedded and cloud-native environments, and ## What is Opteryx? -Opteryx is a powerful Python library designed for data wrangling and analytics. With Opteryx, users can seamlessly interact with various data platforms, unlocking the full potential of their data. +Opteryx is a Python library designed for data wrangling and analytics. With Opteryx, users can seamlessly interact with various data platforms, unlocking the full potential of their data. Opteryx offers the following features: @@ -34,7 +34,7 @@ Opteryx offers the following features: - A command-line tool for filtering, transforming, and combining files in a flexible and intuitive manner. - Embeddable as a low-cost engine, allowing for hundreds of analysts to leverage ad hoc databases with ease. - Integration with familiar tools like pandas and Polars. -- Unified access to data on disk, in the Cloud and in on-prem databases, not only through the same interface, but in the same query. +- Unified and federated access to data on disk, in the Cloud and in on-prem databases, not only through the same interface, but in the same query. ## Why Use Opteryx? @@ -68,7 +68,7 @@ Opteryx is Open Source Python, it quickly and easily integrates into Python code ### __Time Travel__ -Designed for data analytics in environments where decisions need to be replayable, Opteryx allows you to query data as at a point in time in the past to replay decision algorithms against facts as they were known in the past. 
_(data must be structured to enable temporal queries)_ +Designed for data analytics in environments where decisions need to be replayable, Opteryx allows you to query data as at a point in time in the past to replay decision algorithms against facts as they were known in the past. You can even self-join tables with historic data, great for finding deltas in datasets over time. _(data must be structured to enable temporal queries)_ ### __Fast__ diff --git a/opteryx/__main__.py b/opteryx/__main__.py index a9dff789..51af4b9e 100644 --- a/opteryx/__main__.py +++ b/opteryx/__main__.py @@ -56,15 +56,16 @@ def main( print(f"Opteryx version {opteryx.__version__}") print(" Enter '.help' for usage hints") print(" Enter '.exit' to exit this program") - print() # Start the REPL loop while True: # pragma: no cover # Prompt the user for a SQL statement + print() statement = input('opteryx> ') - # If the user entered "quit", exit the loop - if statement == '.exit': + # If the user entered "exit", exit the loop + # forgive them for 'quit' + if statement in {'.exit', '.quit'}: break if statement == ".help": print(" .exit Exit this program")
+ +from opteryx.__main__ import main diff --git a/opteryx/connection.py b/opteryx/connection.py index 01306128..e433df82 100644 --- a/opteryx/connection.py +++ b/opteryx/connection.py @@ -20,6 +20,7 @@ import typing from uuid import uuid4 +import pyarrow from orso import DataFrame from orso import converters @@ -109,7 +110,7 @@ def id(self): """The unique internal reference for this query""" return self._qid - def execute(self, operation, params=None): + def _inner_execute(self, operation, params=None): if not operation: raise MissingSqlStatement("SQL statement not found") @@ -145,9 +146,21 @@ def execute(self, operation, params=None): results = self._query_planner.execute(self._plan) if results is not None: - self._rows, self._schema = converters.from_arrow(utils.arrow.rename_columns(results)) + return utils.arrow.rename_columns(results) + + def execute(self, operation, params=None): + results = self._inner_execute(operation, params) + if results is not None: + self._rows, self._schema = converters.from_arrow(results) self._cursor = iter(self._rows) + def execute_to_arrow(self, operation, params=None, limit=None): + results = self._inner_execute(operation, params) + if results is not None: + if limit is not None: + return utils.arrow.limit_records(results, limit) + return pyarrow.concat_tables(results, promote=True) + @property def stats(self): """execution statistics""" diff --git a/opteryx/utils/__init__.py b/opteryx/utils/__init__.py index 6092962e..133edb0d 100644 --- a/opteryx/utils/__init__.py +++ b/opteryx/utils/__init__.py @@ -24,7 +24,7 @@ def hasher(vals): This is roughly 2x faster than the previous implementation for lists of strings. Do note though, if you're micro-optimizing, this is faster to create but is - slower for some Python functions to handle, like 'sorted'. + slower for some Python functions to handle the result of, like 'sorted'. 
""" if numpy.issubdtype(vals.dtype, numpy.character): return numpy.array([CityHash64(s.encode()) for s in vals], numpy.uint64) diff --git a/opteryx/version.py b/opteryx/version.py index 49f93a92..0847dfee 100644 --- a/opteryx/version.py +++ b/opteryx/version.py @@ -17,4 +17,4 @@ """ # __version__ = "0.4.0-alpha.6" -__version__ = "0.10.0-alpha.5" +__version__ = "0.10.0-alpha.6" diff --git a/requirements.txt b/requirements.txt index bcc3bb04..f9f2e9ef 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ hadrodb numpy orjson -orso>=0.0.57 +orso>=0.0.61 pyarrow>=11.0.0 typer diff --git a/testdata/duckdb/planets.duckdb b/testdata/duckdb/planets.duckdb new file mode 100644 index 00000000..936d1eba Binary files /dev/null and b/testdata/duckdb/planets.duckdb differ diff --git a/tests/misc/test_cli.py b/tests/misc/test_cli.py index d29c8cfb..8f4da98a 100644 --- a/tests/misc/test_cli.py +++ b/tests/misc/test_cli.py @@ -7,7 +7,7 @@ sys.path.insert(1, os.path.join(sys.path[0], "../..")) -from opteryx.__main__ import main +from opteryx.command import main def test_basic_cli(): @@ -16,6 +16,7 @@ def test_basic_cli(): main(sql="SELECT * FROM $planets;", o="temp.csv") main(sql="SELECT * FROM $planets;", o="temp.jsonl") main(sql="SELECT * FROM $planets;", o="temp.parquet") + main(sql="SELECT * FROM $planets;", o="temp.md") if __name__ == "__main__": # pragma: no cover diff --git a/tests/misc/test_connection_arrow.py b/tests/misc/test_connection_arrow.py index d484aaad..28424aa7 100644 --- a/tests/misc/test_connection_arrow.py +++ b/tests/misc/test_connection_arrow.py @@ -33,8 +33,22 @@ def test_as_arrow_with_limit(): assert len(table.column_names) == 20 +def test_direct_as_arrow_no_limit(): + import opteryx + + conn = opteryx.connect() + cur = conn.cursor() + table = cur.execute_to_arrow("SELECT * FROM $planets") + + assert "name" in table.column_names + assert table.num_rows == 9 + assert len(table.column_names) == 20 + assert cur.stats["rows_read"] == 9, cur.stats 
+ + if __name__ == "__main__": # pragma: no cover test_as_arrow_no_limit() test_as_arrow_with_limit() + test_direct_as_arrow_no_limit() print("✅ okay") diff --git a/tests/requirements.txt b/tests/requirements.txt index 11e4412f..8c6194cd 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -25,5 +25,7 @@ sqlalchemy pymysql psycopg2-binary polars +duckdb +duckdb-engine setuptools_rust \ No newline at end of file diff --git a/tests/requirements_arm.txt b/tests/requirements_arm.txt index a72f38f3..3567a5b0 100644 --- a/tests/requirements_arm.txt +++ b/tests/requirements_arm.txt @@ -18,5 +18,7 @@ firebase-admin sqlalchemy pymysql psycopg2-binary +duckdb +duckdb-engine setuptools_rust \ No newline at end of file diff --git a/tests/storage/test_sql_duckdb.py b/tests/storage/test_sql_duckdb.py new file mode 100644 index 00000000..372d0cc2 --- /dev/null +++ b/tests/storage/test_sql_duckdb.py @@ -0,0 +1,32 @@ +""" +Test we can read from DuckDB - this is a basic exercise of the SQL Connector +""" +import os +import sys + +sys.path.insert(1, os.path.join(sys.path[0], "../..")) + +import opteryx + +from opteryx.connectors import SqlConnector + + +def test_duckdb_storage(): + opteryx.register_store( + "duckdb", + SqlConnector, + remove_prefix=True, + connection="duckdb:///testdata/duckdb/planets.duckdb", + ) + + results = opteryx.query("SELECT * FROM duckdb.planets") + assert results.rowcount == 9, results.rowcount + + # PROCESS THE DATA IN SOME WAY + results = opteryx.query("SELECT COUNT(*) FROM duckdb.planets;") + assert results.rowcount == 1, results.rowcount + + +if __name__ == "__main__": # pragma: no cover + test_duckdb_storage() + print("✅ okay")