Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle human readable number casting in general #2797

Closed
Changes from 14 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
138 changes: 131 additions & 7 deletions db/types/operations/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from db.types.custom.money import MONEY_ARR_FUNC_NAME

NUMERIC_ARR_FUNC_NAME = "get_numeric_array"
INTEGER_ARR_FUNC_NAME = "get_integer_array"


def get_column_cast_expression(column, target_type, engine, type_options=None):
Expand Down Expand Up @@ -89,8 +90,11 @@ def create_email_casts(engine):


def create_integer_casts(engine):
integer_types = categories.INTEGER_TYPES
for db_type in integer_types:
integer_array_create = _build_integer_array_function()
with engine.begin() as conn:
conn.execute(text(integer_array_create))
integer_number_types = categories.INTEGER_TYPES
for db_type in integer_number_types:
type_body_map = _get_integer_type_body_map(target_type=db_type)
create_cast_functions(db_type, type_body_map, engine)

Expand Down Expand Up @@ -415,8 +419,9 @@ def _get_integer_type_body_map(target_type=PostgresType.INTEGER):
We specifically disallow rounding or truncating when casting from numerics,
etc.
"""
default_behavior_source_types = categories.INTEGER_TYPES | categories.STRING_TYPES
default_behavior_source_types = categories.INTEGER_TYPES
no_rounding_source_types = categories.DECIMAL_TYPES | categories.MONEY_WITHOUT_CURRENCY_TYPES | frozenset([PostgresType.NUMERIC])
text_source_types = categories.STRING_TYPES
target_type_str = target_type.id
cast_loss_exception_str = (
f"RAISE EXCEPTION '% cannot be cast to {target_type_str} without loss', $1;"
Expand All @@ -443,6 +448,12 @@ def _get_no_rounding_cast_to_integer():
for type_name in no_rounding_source_types
}
)
type_body_map.update(
{
text_type: _get_text_to_integer_cast()
for text_type in text_source_types
}
)
type_body_map.update({PostgresType.BOOLEAN: _get_boolean_to_number_cast(target_type)})
return type_body_map

Expand All @@ -458,7 +469,7 @@ def _get_decimal_number_type_body_map(target_type=PostgresType.NUMERIC):
"""

default_behavior_source_types = (
categories.NUMERIC_TYPES | categories.STRING_TYPES | categories.MONEY_WITHOUT_CURRENCY_TYPES
categories.NUMERIC_TYPES | categories.MONEY_WITHOUT_CURRENCY_TYPES | categories.STRING_TYPES
)
type_body_map = _get_default_type_body_map(
default_behavior_source_types, target_type,
Expand Down Expand Up @@ -901,25 +912,48 @@ def _get_text_uri_type_body_map():
return {type_: _get_text_uri_type_body_map() for type_ in source_types}


def _get_numeric_type_body_map():
def _get_numeric_type_body_map(target_type=PostgresType.NUMERIC):
"""
Get SQL strings that create various functions for casting different
types to numeric.
We allow casting any textual type to locale-agnostic numeric.
"""
default_behavior_source_types = categories.NUMERIC_TYPES | frozenset([PostgresType.MONEY])
text_source_types = categories.STRING_TYPES
no_rounding_source_types = categories.DECIMAL_TYPES
target_type_str = target_type.id
cast_loss_exception_str = (
f"RAISE EXCEPTION '% cannot be cast to {target_type_str} without loss', $1;"
)

def _get_no_rounding_cast_to_numeric():
return f"""
DECLARE numeric_res {target_type_str};
BEGIN
SELECT $1::{target_type_str} INTO numeric_res;
IF numeric_res = $1 THEN
RETURN numeric_res;
END IF;
{cast_loss_exception_str}
END;
"""

type_body_map = _get_default_type_body_map(
default_behavior_source_types, PostgresType.NUMERIC
default_behavior_source_types, target_type
)
type_body_map.update(
{
type_name: _get_no_rounding_cast_to_numeric()
for type_name in no_rounding_source_types
}
)
type_body_map.update(
{
text_type: _get_text_to_numeric_cast()
for text_type in text_source_types
}
)
type_body_map.update({PostgresType.BOOLEAN: _get_boolean_to_number_cast(PostgresType.NUMERIC)})
type_body_map.update({PostgresType.BOOLEAN: _get_boolean_to_number_cast(target_type)})
return type_body_map


Expand Down Expand Up @@ -979,6 +1013,7 @@ def _build_numeric_array_function():

inner_number_tree = "|".join(
[

no_separator_big,
no_separator_small,
comma_separator_req_decimal,
Expand Down Expand Up @@ -1027,6 +1062,95 @@ def _build_numeric_array_function():
"""


def _get_text_to_integer_cast():
text_db_type_id = PostgresType.TEXT.id
integer_db_type_id = PostgresType.BIGINT.id

integer_array_function = get_qualified_name(INTEGER_ARR_FUNC_NAME)
cast_exception_str = (
f"RAISE EXCEPTION '% cannot be cast to {PostgresType.BIGINT}', $1;"
)
return rf"""
DECLARE is_negative {PostgresType.BOOLEAN.id};
DECLARE integer_arr {text_db_type_id}[];
DECLARE integer {text_db_type_id};
BEGIN
SELECT {integer_array_function}($1::{text_db_type_id}) INTO integer_arr;
IF integer_arr IS NULL THEN
{cast_exception_str}
END IF;

SELECT integer_arr[1] INTO integer;
SELECT $1::text ~ '^-.*$' INTO is_negative;

IF integer_arr[2] IS NOT NULL THEN
SELECT regexp_replace(integer, integer_arr[2], '', 'gq') INTO integer;
END IF;
IF is_negative THEN
RETURN ('-' || integer)::{integer_db_type_id};
END IF;
RETURN integer::{integer_db_type_id};
END;
"""


def _build_integer_array_function():
"""
The main reason for this function to be separate is for testing. This
does have some performance impact; we should consider inlining later.
"""
qualified_function_name = get_qualified_name(INTEGER_ARR_FUNC_NAME)

single_digit = r"^[0-9]$"
no_separator = r"[0-9]{2,}(?:([,])[0-9]{1,2}|[0-9]{4,})?"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think you need separate expressions for single digit and multi digit non-separated integers. An integer with no separator is just a sequence of digits. No other complexity needed.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm also not sure what the no_separator part should match at this point.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought numbers like 12,345 also have to be matched,that's why i was writing those two regex's ,later i realised 12,345 should not be matched so i think these the regex's should look like this now
no_separator = r"^-?\d+$"
comma_separator = r"[0-9]{1,3}(?:(,)[0-9]{3}){2,}"
period_separator = r"[0-9]{1,3}(?:(.)[0-9]{3}){2,}"
comma_separator_lakh_system = r"[0-9]{1,2}(?:(,)[0-9]{2})+,[0-9]{3}?"
single_quote_separator = r"[0-9]{1,3}(?:('')[0-9]{3})+"
space_separator = r"[0-9]{1,3}(?:( )[0-9]{3})+(?:([,])[0-9]+)?"

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You don't need the beginning or end string markers (or the [-] in the individual expression. Those are handled by the wrapping expression.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have made those changes and added tests,thanks for the help throughout @mathemancer .

comma_separator = r"[0-9]{1,3}(?:(,)[0-9]{3}){2,}"
period_separator = r"[0-9]{1,3}(?:(\.)[0-9]{3}){2,}"
comma_separator_lakh_system = r"[0-9]{1,2}(?:(,)[0-9]{2})+,[0-9]{3}?"
single_quote_separator = r"[0-9]{1,3}(?:(\'')[0-9]{3})+"
space_separator = r"[0-9]{1,3}(?:( )[0-9]{3})+(?:([,])[0-9]+)?"

inner_number_tree = "|".join(
[
single_digit,
no_separator,
comma_separator,
period_separator,
comma_separator_lakh_system,
single_quote_separator,
space_separator

])
integer_finding_regex = f"^(?:[+-]?({inner_number_tree}))$"

actual_number_indices = [1]
group_divider_indices = [4, 6, 8, 10, 12, 14, 16]
actual_numbers_str = ','.join([f'raw_arr[{idx}]' for idx in actual_number_indices])
group_dividers_str = ','.join([f'raw_arr[{idx}]' for idx in group_divider_indices])
text_db_type_id = PostgresType.TEXT.id
return rf"""
CREATE OR REPLACE FUNCTION {qualified_function_name}({text_db_type_id}) RETURNS {text_db_type_id}[]
AS $$
DECLARE
raw_arr {text_db_type_id}[];
actual_number_arr {text_db_type_id}[];
group_divider_arr {text_db_type_id}[];
actual_number {text_db_type_id};
group_divider {text_db_type_id};
BEGIN
SELECT regexp_matches($1, '{integer_finding_regex}') INTO raw_arr;
IF raw_arr IS NULL THEN
RETURN NULL;
END IF;
SELECT array_remove(ARRAY[{actual_numbers_str}], null) INTO actual_number_arr;
SELECT array_remove(ARRAY[{group_dividers_str}], null) INTO group_divider_arr;
SELECT actual_number_arr[1] INTO actual_number;
SELECT group_divider_arr[1] INTO group_divider;
RETURN ARRAY[actual_number, group_divider];
END;
$$ LANGUAGE plpgsql;
"""


def _get_default_type_body_map(source_types, target_type):
default_cast_str = f"""
BEGIN
Expand Down