Skip to content

Commit

Permalink
Merge pull request #1052 from mabel-dev/#1017/5
Browse files Browse the repository at this point in the history
  • Loading branch information
joocer committed May 27, 2023
2 parents 7173bda + 74b035d commit 548fb96
Show file tree
Hide file tree
Showing 6 changed files with 47 additions and 22 deletions.
10 changes: 7 additions & 3 deletions opteryx/components/v2/binder.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@

from orso.logging import get_logger

from opteryx.exceptions import AmbiguousIdentifierError
from opteryx.exceptions import ColumnNotFoundError
from opteryx.exceptions import DatabaseError
from opteryx.managers.expression import NodeType
Expand Down Expand Up @@ -104,13 +105,16 @@ def source_identifiers(node, relations):
find_result = schema.find_column(node.value)
if find_result is not None:
if found_source_relation:
print("I think I found it twice")
raise AmbiguousIdentifierError(identifier=node.value)
found_source_relation = True
print("do something with the result")

if not found_source_relation:
# If we didn't find the relation, get all of the columns it could have been and
# see if we can suggest what the user should have entered in the error message
candidates = []
for a, s in relations.items():
candidates.extend(s.get_all_columns())
for _, schema in relations.items():
candidates.extend(schema.get_all_columns())
from opteryx.utils import fuzzy_search

suggestion = fuzzy_search(node.value, candidates)
Expand Down
22 changes: 12 additions & 10 deletions opteryx/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,6 @@ class Error(Exception):
Python StandardError (defined in the module exceptions).
"""

pass


class DatabaseError(Error):
"""
Expand All @@ -68,8 +66,6 @@ class DatabaseError(Error):
of Error.
"""

pass


class ProgrammingError(DatabaseError):
"""
Expand All @@ -79,8 +75,6 @@ class ProgrammingError(DatabaseError):
must be a subclass of DatabaseError.
"""

pass


# END PEP-0249

Expand All @@ -104,16 +98,17 @@ class CursorInvalidStateError(ProgrammingError):


class ColumnNotFoundError(ProgrammingError):
def __init__(self, column=None, suggestion=None):
def __init__(self, message=None, column=None, suggestion=None):
self.column = column
self.suggestion = suggestion

message = None
if column is not None:
if suggestion is not None:
message = f"'{column}' does not exist, did you mean '{suggestion}'?."
message = f"Column '{column}' does not exist, did you mean '{suggestion}'?."
else:
message = f"'{column}' does not exist."
message = f"Column '{column}' does not exist."
if message is None:
message = "Query contained columns which could not be found."
super().__init__(message)


Expand All @@ -127,6 +122,13 @@ def __init__(self, variable=None):
super().__init__()


class AmbiguousIdentifierError(ProgrammingError):
    """
    Raised when an identifier reference in a query cannot be resolved to a
    single source because it matches more than once.

    Parameters:
        identifier:
            The identifier, as it was written, which was found to be ambiguous.
            It is kept on the exception as the `identifier` attribute.
    """

    def __init__(self, identifier):
        # keep the offending identifier so handlers can inspect it programmatically
        self.identifier = identifier
        # the quotes wrap only the identifier itself; no stray leading quote
        message = f"Identifier reference '{identifier}' is ambiguous."
        super().__init__(message)


class UnsupportedSyntaxError(ProgrammingError):
pass

Expand Down
6 changes: 3 additions & 3 deletions opteryx/models/relation_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,16 +29,16 @@ class FlatColumn:

@dataclass
class ConstantColumn(FlatColumn):
# Rather than pass around columns of constant values, where we can
# replace them with this column type
# Rather than pass around columns of constant values, where we can, we should
# replace them with this column type.
length: int = 0
value: typing.Any = None


@dataclass
class RelationSchema:
table_name: str
aliases: typing.Optional[str] = field(default_factory=list)
aliases: typing.List[str] = field(default_factory=list)
columns: typing.List[FlatColumn] = field(default_factory=list)
temporal_start: typing.Optional[datetime.datetime] = None
temporal_end: typing.Optional[datetime.datetime] = None
Expand Down
23 changes: 20 additions & 3 deletions opteryx/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

import itertools
import random
import re

import numpy
from orso.cityhash import CityHash64
Expand Down Expand Up @@ -46,18 +47,34 @@ def peek(iterable): # type:ignore

def fuzzy_search(name, candidates):
"""
Find closest match using a Levenshtein Distance variation
Find closest match using a variation of Levenshtein Distance
This implementation is limited to searching for distance less than three, is case
insensitive and removes any non-alphanumeric characters.
This is tuned for this use case of quickly identifying likely matches when a user
is entering field or function names and may have minor typos, casing or punctuation
mismatches with the source value.
"""
from opteryx.third_party.mbleven import compare

best_match_column = None
best_match_score = 100

for candidate in candidates:
name = "".join(char for char in name if char.isalnum())
for raw_candidate, candidate in (
(
ca,
"".join(ch for ch in ca if ch.isalnum()),
)
for ca in candidates
):
my_dist = compare(candidate, name)
if my_dist == 0: # if we find an exact match, return that
return raw_candidate
if 0 <= my_dist < best_match_score:
best_match_score = my_dist
best_match_column = candidate
best_match_column = raw_candidate

return best_match_column

Expand Down
4 changes: 2 additions & 2 deletions tests/misc/test_fuzzy_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,9 @@
("", ["", "crackle", "pop"], ""),
("", [], None),
("apple", ["appl", "aple", "aplee", "aplle"], "appl"), # first best match
("a_b_c_d", ["abcd", "a_b_cd", "a_b_c_d_e"], "a_b_cd"),
("a_b_c_d", ["abcd", "a_b_cd", "a_b_c_d_e"], "abcd"),
("a_b_c_d_e", ["abcd", "a_b_cd", "a_b_c_d_e"], "a_b_c_d_e"),
("a-b+c_d", ["abcd", "a_b+cd", "a-b+c_d-e"], "a_b+cd"),
("a-b+c_d", ["abcd", "a_b+cd", "a-b+c_d-e"], "abcd"),
("apple", ["banana", "orange", "pear"], None),
("apple", [], None),
("apple", ["appl", "aple", "aplee", "aplle", "apple"], "apple"),
Expand Down
4 changes: 3 additions & 1 deletion tests/misc/test_suggestions.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@ def test_hint_hints():
conn = opteryx.connect()
cur = conn.cursor()
cur.execute("SELECT * FROM $planets WITH(NO_PARTITIONS)")
assert cur.messages == ["Hint `NO_PARTITIONS` is not recognized, did you mean `NO_PARTITION`?"]
assert cur.messages == [
"Hint `NO_PARTITIONS` is not recognized, did you mean `NO_PARTITION`?"
], cur.messages


if __name__ == "__main__": # pragma: no cover
Expand Down

0 comments on commit 548fb96

Please sign in to comment.