Merge pull request #233 from mabel-dev/FIX/#230

Feature/#230
mabel-dev · Jun 25, 2022 · cbcd0ee · cbcd0ee
2 parents 8e4082f + 392ad91
commit cbcd0ee
Show file tree

Hide file tree

Showing 7 changed files with 42 additions and 0 deletions.
diff --git a/docs/Release Notes/Change Log.md b/docs/Release Notes/Change Log.md
@@ -16,6 +16,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 - [[#203](https://github.com/mabel-dev/opteryx/issues/203)] When reporting that a column doesn't exist, it should suggest likely correct columns. ([@joocer](https://github.com/joocer))
 - Not Regular Expression match operator, `!~` added to supported set of operators. ([@joocer](https://github.com/joocer))
 - [[#226](https://github.com/mabel-dev/opteryx/issues/226)] Implement `DATE_TRUNC` function. ([@joocer](https://github.com/joocer))
+- [[#230](https://github.com/mabel-dev/opteryx/issues/230)] Allow addressing columns by number in `GROUP BY` and `ORDER BY` clauses. ([@joocer](https://github.com/joocer))
 
 **Changed**
 

diff --git a/docs/SQL Reference/02 Statements.md b/docs/SQL Reference/02 Statements.md
@@ -111,6 +111,8 @@ HAVING group_filter
 
 The `GROUP BY` clause specifies which grouping columns should be used to perform any aggregations in the `SELECT` clause. If the `GROUP BY` clause is specified, the query is always an aggregate query, even if no aggregations are present in the `SELECT` clause. The `HAVING` clause specifies filters to apply to aggregated data, `HAVING` clauses require a `GROUP BY` clause.
 
+`GROUP BY` expressions may refer to columns by number, counting from one (for example, `GROUP BY 1`); however, this is not recommended for statements intended for reuse.
+
 ### ORDER BY / LIMIT / OFFSET clauses
 
 ~~~
@@ -125,6 +127,8 @@ LIMIT count
 
 `ORDER BY`, `LIMIT` and `OFFSET` are output modifiers. Logically they are applied at the very end of the query. The `OFFSET` clause discards initial rows from the returned set, the `LIMIT` clause restricts the amount of rows fetched, and the `ORDER BY` clause sorts the rows on the sorting criteria in either ascending or descending order.
 
+`ORDER BY` expressions may refer to columns by number, counting from one (for example, `ORDER BY 1`); however, this is not recommended for statements intended for reuse.
+
 ## SHOW COLUMNS
 
 List the columns in a relation along with their data type and an indication if nulls have been found in the first page of records.

diff --git a/opteryx/engine/planner/operations/aggregate_node.py b/opteryx/engine/planner/operations/aggregate_node.py
@@ -27,6 +27,7 @@
 But, on high cardinality data (nearly unique columns), the performance is much faster,
 on a 10m record set, timings are 1:400 (50s:1220s where 20s is the read time).
 """
+from asyncio import create_subprocess_shell
 from typing import Iterable, List
 
 import numpy as np
@@ -100,6 +101,7 @@ def __init__(
 
         from opteryx.engine.attribute_types import TOKEN_TYPES
 
+        self._positions = []
         self._aggregates = []
         self._groups = config.get("groups", [])
         self._project = self._groups.copy()
@@ -115,10 +117,16 @@ def __init__(
                     self._project.append(column)
                 else:
                     raise SqlError("Can only aggregate on fields in the dataset.")
+                self._positions.append(column)
             elif "column_name" in attribute:
                 self._project.append(attribute["column_name"])
+                if attribute["alias"]:
+                    self._positions.append(attribute["alias"][0])
+                else:
+                    self._positions.append(attribute["column_name"])
             else:
                 self._project.append(attribute["identifier"])
+                self._positions.append(attribute["identifier"])
 
         self._project = [p for p in self._project if p is not None]
 
@@ -194,13 +202,18 @@ def execute(self) -> Iterable:
 
                 for key in self._project:
                     if key != "*":
+                        if isinstance(key, int):
+                            key = self._positions[key - 1]
                         column = columns.get_column_from_alias(key, only_one=True)
                         if column not in self._mapped_project:
                             self._mapped_project.append(column)
                     else:
                         self._mapped_project.append("*")
 
                 for group in self._groups:
+                    # if we have a number, treat it as a one-based column position
+                    if isinstance(group, int):
+                        group = self._positions[group - 1]
                     self._mapped_groups.append(
                         columns.get_column_from_alias(group, only_one=True)
                     )

diff --git a/opteryx/engine/planner/operations/sort_node.py b/opteryx/engine/planner/operations/sort_node.py
@@ -104,6 +104,17 @@ def execute(self) -> Iterable:
                         )
                     )
 
+            # we have an index rather than a column name, it's a natural number but the
+            # list of column names is zero-based, so we subtract one
+            elif isinstance(column, int):
+                column_name = table.column_names[column - 1]
+                self._mapped_order.append(
+                    (
+                        column_name,
+                        direction,
+                    )
+                )
+
             else:
                 self._mapped_order.append(
                     (

diff --git a/opteryx/engine/planner/planner.py b/opteryx/engine/planner/planner.py
@@ -535,6 +535,8 @@ def _extract_order(self, ast):
                     ]
                     alias = f"{func.upper()}({','.join([str(a[0]) for a in args])})"
                     column = {"function": func, "args": args, "alias": alias}
+                if "Value" in column:
+                    column = int(column["Value"]["Number"][0])
                 orders.append(
                     (
                         column,
@@ -570,6 +572,8 @@ def _inner(element):
                     if "Number" in key_dict:
                         key = key_dict["Number"][0]
                     return f"{identifier}[{key}]"
+                if "Value" in element:
+                    return int(element["Value"]["Number"][0])
 
         groups = ast[0]["Query"]["body"]["Select"]["group_by"]
         return [_inner(g) for g in groups]

diff --git a/opteryx/utils/columns.py b/opteryx/utils/columns.py
@@ -183,6 +183,7 @@ def fuzzy_search(self, column_name):
         for k, v in self._column_metadata.items():
             for alias in v.get("aliases"):
                 my_dist = compare(column_name, alias)
+                print(alias)
                 if my_dist > 0 and my_dist < best_match_score:
                     best_match_score = my_dist
                     best_match_column = alias

diff --git a/tests/sql_battery/test_battery_shape.py b/tests/sql_battery/test_battery_shape.py
@@ -135,6 +135,11 @@
         ("SELECT * FROM $satellites LIMIT 50 OFFSET 150", 27, 8),
         ("SELECT * FROM $satellites LIMIT 50 OFFSET 170", 7, 8),
         ("SELECT * FROM $satellites ORDER BY name", 177, 8),
+        ("SELECT * FROM $satellites ORDER BY 1", 177, 8),
+        ("SELECT * FROM $satellites ORDER BY 1 DESC", 177, 8),
+        ("SELECT * FROM $satellites ORDER BY 2", 177, 8),
+        ("SELECT * FROM $satellites ORDER BY 1, 2", 177, 8),
+        ("SELECT * FROM $satellites ORDER BY 1 ASC", 177, 8),
         ("SELECT * FROM $satellites ORDER BY RANDOM()", 177, 8),
 
         ("SELECT MAX(planetId) FROM $satellites", 1, 1),
@@ -158,6 +163,9 @@
         ("SELECT GET(name, 1) FROM $satellites GROUP BY planetId, GET(name, 1)", 56, 1),
         ("SELECT COUNT(*), ROUND(magnitude) FROM $satellites group by ROUND(magnitude)", 27, 2),
         ("SELECT ROUND(magnitude) FROM $satellites group by ROUND(magnitude)", 27, 1),
+        ("SELECT VARCHAR(planetId), COUNT(*) FROM $satellites GROUP BY 1", 7, 2),
+        ("SELECT LEFT(name, 1), COUNT(*) FROM $satellites GROUP BY 1 ORDER BY 2 DESC", 21, 2),
+        ("SELECT LEFT(name, 1) as le, COUNT(*) FROM $satellites GROUP BY 1 ORDER BY 2 DESC", 21, 2),
         ("SELECT round(magnitude) FROM $satellites group by round(magnitude)", 27, 1),
         ("SELECT upper(name) as NAME, id as Identifier FROM $satellites", 177, 2),
         ("SELECT upper(name), lower(name), id as Identifier FROM $satellites", 177, 3),