diff --git a/README copy.md b/README copy.md new file mode 100644 index 00000000..397e0650 --- /dev/null +++ b/README copy.md @@ -0,0 +1,56 @@ +# Opteryx README + +[![Opteryx](https://raw.githubusercontent.com/mabel-dev/opteryx/main/opteryx-word-small.png)](https://opteryx.dev/latest) + +Opteryx is a Python library designed for data wrangling and analytics. With Opteryx, users can seamlessly interact with various data platforms, unlocking the full potential of their data. + +## Features + +Opteryx offers the following features: + +- SQL queries on data files generated by other processes, such as logs. +- A command-line tool for filtering, transforming, and combining files in a flexible and intuitive manner. +- Embeddable as a low-cost engine, allowing for hundreds of analysts to leverage ad hoc databases with ease. +- Integration with familiar tools like pandas and Polars. +- Unified and federated access to data on disk, in the Cloud and in on-prem databases, not only through the same interface, but in the same query. + +## Why Use Opteryx? + +### Familiar Interface + +Opteryx supports key parts of the Python DBAPI and SQL92 standards, which many analysts and engineers will already know how to use. + +### Consistent Syntax + +Opteryx creates a common SQL-layer over multiple data platforms, allowing backend systems to be upgraded, migrated or consolidated without changing any Opteryx code. + +### Bring your own Data + +Opteryx supports multiple query engines, dataframe APIs and storage formats. You can mix-and-match sources in a single query. Opteryx can even `JOIN` datasets stored in different formats, such as Parquet and MySQL. + +Opteryx allows you to query your data directly in the systems where they are stored, eliminating the need to duplicate data into a common store for analytics. This saves you the cost and effort of maintaining duplicates. 
+ +Opteryx can push parts of your query to the source query engine, allowing queries to run at the speed of the backend, rather than your local computer. + +And if there's not a connector in the box for your data platform, bespoke connectors can be added. + +### Consumption-Based Billing Friendly + +Opteryx is well-suited for deployments to environments which are pay-as-you-use, like Google Cloud Run. Great for situations where you have low-volume usage, or multiple environments, where the costs of many traditional database deployments can quickly add up. + +### Python Native + +Opteryx is Open Source Python; it quickly and easily integrates into Python code, including Jupyter Notebooks, so you can start querying your data within a few minutes. You can use Opteryx to run SQL against pandas DataFrames, and even execute a `JOIN` on an in-memory DataFrame and a remote dataset. + +### Time Travel + +Designed for data analytics in environments where decisions need to be replayable, Opteryx allows you to query data as at a point in time in the past to replay decision algorithms against facts as they were known in the past. You can even self-join tables with historic data, great for finding deltas in datasets over time. (data must be structured to enable temporal queries) + +### Fast + +Benchmarks on an M1 Pro Mac show an ad hoc `GROUP BY` over a 1Gb parquet file running via the CLI in ~1/5th of a second, from a cold start. (different systems will have different performance characteristics) + +| Rows | Columns | File Size | Query Time | +| ---- | ------- | --------- | ---------- | +| 561225 | 81 | 1Gb | 0.22sec | +| 1064539 | 81 | 2Gb | 0.27sec | \ No newline at end of file diff --git a/opteryx/models/columns.py b/opteryx/models/columns.py index 88080014..361a0b59 100644 --- a/opteryx/models/columns.py +++ b/opteryx/models/columns.py @@ -154,6 +154,7 @@ def get_column_from_alias(self, column, only_one: bool = False): If we're expecting only_one match, we fail if that's not what we find. 
""" matches = [] + column = str(column) for col, att in self._column_metadata.items(): matches.extend([col for alias in att.get("aliases", []) if alias == column]) matches = list(dict.fromkeys(matches)) diff --git a/opteryx/operators/file_reader_node.py b/opteryx/operators/file_reader_node.py index 8b94ee38..6292dd96 100644 --- a/opteryx/operators/file_reader_node.py +++ b/opteryx/operators/file_reader_node.py @@ -111,8 +111,8 @@ def name(self): # pragma: no cover return "File Reader" def execute(self) -> Iterable: - ext = ".".join(self._dataset.split("/")[-1].split(".")[1:]) - parser, kind = KNOWN_EXTENSIONS[ext] + ext = self._dataset.split("/")[-1].split(".")[-1] + parser, _ = KNOWN_EXTENSIONS[ext] time_to_read, blob_bytes, pyarrow_blob = self._read_and_parse( ( diff --git a/testdata/flat/multi/00.01.jsonl b/testdata/flat/multi/00.01.jsonl new file mode 100644 index 00000000..f2a789a2 --- /dev/null +++ b/testdata/flat/multi/00.01.jsonl @@ -0,0 +1 @@ +{"string": "string", "int": 1, "float": 1.2, "once":"true"} \ No newline at end of file diff --git a/tests/sql_battery/test_shapes_and_errors_battery.py b/tests/sql_battery/test_shapes_and_errors_battery.py index 2ac7b359..ca9b4215 100644 --- a/tests/sql_battery/test_shapes_and_errors_battery.py +++ b/tests/sql_battery/test_shapes_and_errors_battery.py @@ -712,6 +712,9 @@ ("SELECT * FROM $RomanGods", None, None, DatasetNotFoundError), # disk dataset doesn't exist ("SELECT * FROM non.existent", None, None, DatasetNotFoundError), + # column doesn't exist + ("SELECT awesomeness_factor FROM $planets;", None, None, ColumnNotFoundError), + ("SELECT * FROM $planets WHERE awesomeness_factor > 'Mega';", None, None, ColumnNotFoundError), # https://trino.io/docs/current/functions/aggregate.html#filtering-during-aggregation ("SELECT LIST(name) FILTER (WHERE name IS NOT NULL) FROM $planets;", None, None, SqlError), # Can't IN an INDENTIFIER @@ -876,7 +879,11 @@ # 999 - subscripting ("SELECT name['n'] FROM $planets", None, None, 
ProgrammingError), ("SELECT id['n'] FROM $planets", None, None, ProgrammingError), - + # [1008] fuzzy search fails on ints + ("SELECT * FROM $planets JOIN $planets ON id = 12;", None, None, ColumnNotFoundError), + ("SELECT * FROM $planets JOIN $planets ON 12 = id;", None, None, ColumnNotFoundError), + # [1006] dots in filenames + ("SELECT * FROM 'testdata/flat/multi/00.01.jsonl'", 1, 4, None), ] # fmt:on