diff --git a/README.md b/README.md
index 0f3c2bf7..d424d95d 100644
--- a/README.md
+++ b/README.md
@@ -7,7 +7,7 @@
-Opteryx is a SQL Engine designed for embedded and cloud-native environments, and with command-line skills.
+Opteryx is an in-process SQL query engine for analysis of distributed datasets.
[Documentation](https://opteryx.dev/latest) |
[Examples](#examples) |
@@ -26,7 +26,7 @@ Opteryx is a SQL Engine designed for embedded and cloud-native environments, and
## What is Opteryx?
-Opteryx is a powerful Python library designed for data wrangling and analytics. With Opteryx, users can seamlessly interact with various data platforms, unlocking the full potential of their data.
+Opteryx is a Python library designed for data wrangling and analytics. With Opteryx, users can seamlessly interact with various data platforms, unlocking the full potential of their data.
Opteryx offers the following features:
@@ -34,7 +34,7 @@ Opteryx offers the following features:
- A command-line tool for filtering, transforming, and combining files in a flexible and intuitive manner.
- Embeddable as a low-cost engine, allowing for hundreds of analysts to leverage ad hoc databases with ease.
- Integration with familiar tools like pandas and Polars.
-- Unified access to data on disk, in the Cloud and in on-prem databases, not only through the same interface, but in the same query.
+- Unified and federated access to data on disk, in the Cloud and in on-prem databases, not only through the same interface, but in the same query.
## Why Use Opteryx?
@@ -68,7 +68,7 @@ Opteryx is Open Source Python, it quickly and easily integrates into Python code
### __Time Travel__
-Designed for data analytics in environments where decisions need to be replayable, Opteryx allows you to query data as at a point in time in the past to replay decision algorithms against facts as they were known in the past. _(data must be structured to enable temporal queries)_
+Designed for data analytics in environments where decisions need to be replayable, Opteryx allows you to query data as at a point in time in the past to replay decision algorithms against facts as they were known in the past. You can even self-join tables with historic data, great for finding deltas in datasets over time. _(data must be structured to enable temporal queries)_
### __Fast__
diff --git a/opteryx/__main__.py b/opteryx/__main__.py
index a9dff789..51af4b9e 100644
--- a/opteryx/__main__.py
+++ b/opteryx/__main__.py
@@ -56,15 +56,16 @@ def main(
print(f"Opteryx version {opteryx.__version__}")
print(" Enter '.help' for usage hints")
print(" Enter '.exit' to exit this program")
- print()
# Start the REPL loop
while True: # pragma: no cover
# Prompt the user for a SQL statement
+ print()
statement = input('opteryx> ')
- # If the user entered "quit", exit the loop
- if statement == '.exit':
+ # If the user entered "exit", exit the loop
+ # forgive them for 'quit'
+ if statement in {'.exit', '.quit'}:
break
if statement == ".help":
print(" .exit Exit this program")
diff --git a/opteryx/command.py b/opteryx/command.py
new file mode 100644
index 00000000..72235d66
--- /dev/null
+++ b/opteryx/command.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from opteryx.__main__ import main
diff --git a/opteryx/connection.py b/opteryx/connection.py
index 01306128..e433df82 100644
--- a/opteryx/connection.py
+++ b/opteryx/connection.py
@@ -20,6 +20,7 @@
import typing
from uuid import uuid4
+import pyarrow
from orso import DataFrame
from orso import converters
@@ -109,7 +110,7 @@ def id(self):
"""The unique internal reference for this query"""
return self._qid
- def execute(self, operation, params=None):
+ def _inner_execute(self, operation, params=None):
if not operation:
raise MissingSqlStatement("SQL statement not found")
@@ -145,9 +146,21 @@ def execute(self, operation, params=None):
results = self._query_planner.execute(self._plan)
if results is not None:
- self._rows, self._schema = converters.from_arrow(utils.arrow.rename_columns(results))
+ return utils.arrow.rename_columns(results)
+
+ def execute(self, operation, params=None):
+ results = self._inner_execute(operation, params)
+ if results is not None:
+ self._rows, self._schema = converters.from_arrow(results)
self._cursor = iter(self._rows)
+ def execute_to_arrow(self, operation, params=None, limit=None):
+ results = self._inner_execute(operation, params)
+ if results is not None:
+ if limit is not None:
+ return utils.arrow.limit_records(results, limit)
+ return pyarrow.concat_tables(results, promote=True)
+
@property
def stats(self):
"""execution statistics"""
diff --git a/opteryx/utils/__init__.py b/opteryx/utils/__init__.py
index 6092962e..133edb0d 100644
--- a/opteryx/utils/__init__.py
+++ b/opteryx/utils/__init__.py
@@ -24,7 +24,7 @@ def hasher(vals):
This is roughly 2x faster than the previous implementation for lists of strings.
Do note though, if you're micro-optimizing, this is faster to create but is
- slower for some Python functions to handle, like 'sorted'.
+ slower for some Python functions to handle the result of, like 'sorted'.
"""
if numpy.issubdtype(vals.dtype, numpy.character):
return numpy.array([CityHash64(s.encode()) for s in vals], numpy.uint64)
diff --git a/opteryx/version.py b/opteryx/version.py
index 49f93a92..0847dfee 100644
--- a/opteryx/version.py
+++ b/opteryx/version.py
@@ -17,4 +17,4 @@
"""
# __version__ = "0.4.0-alpha.6"
-__version__ = "0.10.0-alpha.5"
+__version__ = "0.10.0-alpha.6"
diff --git a/requirements.txt b/requirements.txt
index bcc3bb04..f9f2e9ef 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
hadrodb
numpy
orjson
-orso>=0.0.57
+orso>=0.0.61
pyarrow>=11.0.0
typer
diff --git a/testdata/duckdb/planets.duckdb b/testdata/duckdb/planets.duckdb
new file mode 100644
index 00000000..936d1eba
Binary files /dev/null and b/testdata/duckdb/planets.duckdb differ
diff --git a/tests/misc/test_cli.py b/tests/misc/test_cli.py
index d29c8cfb..8f4da98a 100644
--- a/tests/misc/test_cli.py
+++ b/tests/misc/test_cli.py
@@ -7,7 +7,7 @@
sys.path.insert(1, os.path.join(sys.path[0], "../.."))
-from opteryx.__main__ import main
+from opteryx.command import main
def test_basic_cli():
@@ -16,6 +16,7 @@ def test_basic_cli():
main(sql="SELECT * FROM $planets;", o="temp.csv")
main(sql="SELECT * FROM $planets;", o="temp.jsonl")
main(sql="SELECT * FROM $planets;", o="temp.parquet")
+ main(sql="SELECT * FROM $planets;", o="temp.md")
if __name__ == "__main__": # pragma: no cover
diff --git a/tests/misc/test_connection_arrow.py b/tests/misc/test_connection_arrow.py
index d484aaad..28424aa7 100644
--- a/tests/misc/test_connection_arrow.py
+++ b/tests/misc/test_connection_arrow.py
@@ -33,8 +33,22 @@ def test_as_arrow_with_limit():
assert len(table.column_names) == 20
+def test_direct_as_arrow_no_limit():
+ import opteryx
+
+ conn = opteryx.connect()
+ cur = conn.cursor()
+ table = cur.execute_to_arrow("SELECT * FROM $planets")
+
+ assert "name" in table.column_names
+ assert table.num_rows == 9
+ assert len(table.column_names) == 20
+ assert cur.stats["rows_read"] == 9, cur.stats
+
+
if __name__ == "__main__": # pragma: no cover
test_as_arrow_no_limit()
test_as_arrow_with_limit()
+ test_direct_as_arrow_no_limit()
print("✅ okay")
diff --git a/tests/requirements.txt b/tests/requirements.txt
index 11e4412f..8c6194cd 100644
--- a/tests/requirements.txt
+++ b/tests/requirements.txt
@@ -25,5 +25,7 @@ sqlalchemy
pymysql
psycopg2-binary
polars
+duckdb
+duckdb-engine
setuptools_rust
\ No newline at end of file
diff --git a/tests/requirements_arm.txt b/tests/requirements_arm.txt
index a72f38f3..3567a5b0 100644
--- a/tests/requirements_arm.txt
+++ b/tests/requirements_arm.txt
@@ -18,5 +18,7 @@ firebase-admin
sqlalchemy
pymysql
psycopg2-binary
+duckdb
+duckdb-engine
setuptools_rust
\ No newline at end of file
diff --git a/tests/storage/test_sql_duckdb.py b/tests/storage/test_sql_duckdb.py
new file mode 100644
index 00000000..372d0cc2
--- /dev/null
+++ b/tests/storage/test_sql_duckdb.py
@@ -0,0 +1,32 @@
+"""
+Test we can read from DuckDB - this is a basic exercise of the SQL Connector
+"""
+import os
+import sys
+
+sys.path.insert(1, os.path.join(sys.path[0], "../.."))
+
+import opteryx
+
+from opteryx.connectors import SqlConnector
+
+
+def test_duckdb_storage():
+ opteryx.register_store(
+ "duckdb",
+ SqlConnector,
+ remove_prefix=True,
+ connection="duckdb:///testdata/duckdb/planets.duckdb",
+ )
+
+ results = opteryx.query("SELECT * FROM duckdb.planets")
+ assert results.rowcount == 9, results.rowcount
+
+ # PROCESS THE DATA IN SOME WAY
+ results = opteryx.query("SELECT COUNT(*) FROM duckdb.planets;")
+ assert results.rowcount == 1, results.rowcount
+
+
+if __name__ == "__main__": # pragma: no cover
+ test_duckdb_storage()
+ print("✅ okay")