Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: to_datetime raising on invalid offsets with errors=coerce and infer_datetime_format #48676

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v1.6.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -166,8 +166,10 @@ Categorical
Datetimelike
^^^^^^^^^^^^
- Bug in :func:`pandas.infer_freq`, raising ``TypeError`` when inferred on :class:`RangeIndex` (:issue:`47084`)
- Bug in :func:`to_datetime` was raising on invalid offsets with ``errors='coerce'`` and ``infer_datetime_format=True`` (:issue:`48633`)
- Bug in :class:`DatetimeIndex` constructor failing to raise when ``tz=None`` is explicitly specified in conjunction with timezone-aware ``dtype`` or data (:issue:`48659`)
- Bug in subtracting a ``datetime`` scalar from :class:`DatetimeIndex` failing to retain the original ``freq`` attribute (:issue:`48818`)
-

Timedelta
^^^^^^^^^
Expand Down
9 changes: 7 additions & 2 deletions pandas/_libs/tslibs/parsing.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -943,7 +943,7 @@ def format_is_iso(f: str) -> bint:
return False


def guess_datetime_format(dt_str, bint dayfirst=False):
def guess_datetime_format(dt_str: str, bint dayfirst=False) -> str | None:
"""
Guess the datetime format of a given datetime string.

Expand Down Expand Up @@ -1026,7 +1026,12 @@ def guess_datetime_format(dt_str, bint dayfirst=False):
# This separation will prevent subsequent processing
# from correctly parsing the time zone format.
# So in addition to the format normalization, we rejoin them here.
tokens[offset_index] = parsed_datetime.strftime("%z")
try:
tokens[offset_index] = parsed_datetime.strftime("%z")
except ValueError:
# Invalid offset might not have raised in du_parse
# https://github.com/dateutil/dateutil/issues/188
return None
tokens = tokens[:offset_index + 1 or None]

format_guess = [None] * len(tokens)
Expand Down
30 changes: 27 additions & 3 deletions pandas/tests/tools/test_to_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -1142,13 +1142,37 @@ def test_to_datetime_coerce(self):
)
tm.assert_index_equal(result, expected)

def test_to_datetime_coerce_malformed(self):
@pytest.mark.parametrize("infer_datetime_format", [True, False])
@pytest.mark.parametrize(
"errors, expected",
[
("coerce", Index([NaT, NaT])),
("ignore", Index(["200622-12-31", "111111-24-11"])),
],
)
def test_to_datetime_malformed_no_raise(
self, errors, expected, infer_datetime_format
):
# GH 28299
# GH 48633
ts_strings = ["200622-12-31", "111111-24-11"]
result = to_datetime(ts_strings, errors="coerce")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Curious, do we have a similar test when errors="ignore"?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good catch! indeed, that errors

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

actually it's fine (I may not have rebuilt my c extensions when I left the comment above) - have parametrised over errors in the test now, thanks

expected = Index([NaT, NaT])
result = to_datetime(
ts_strings, errors=errors, infer_datetime_format=infer_datetime_format
)
tm.assert_index_equal(result, expected)

@pytest.mark.parametrize("infer_datetime_format", [True, False])
def test_to_datetime_malformed_raise(self, infer_datetime_format):
    # GH 48633
    # With errors="raise" the malformed input must surface as a ValueError
    # naming the offending string and its position, whether or not format
    # inference is requested.
    malformed = ["200622-12-31", "111111-24-11"]
    msg = r"^hour must be in 0\.\.23: 111111-24-11 present at position 1$"
    with pytest.raises(ValueError, match=msg):
        to_datetime(
            malformed,
            errors="raise",
            infer_datetime_format=infer_datetime_format,
        )

def test_iso_8601_strings_with_same_offset(self):
# GH 17697, 11736
ts_str = "2015-11-18 15:30:00+05:30"
Expand Down
13 changes: 11 additions & 2 deletions pandas/tests/tslibs/test_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,8 +212,6 @@ def test_guess_datetime_format_with_locale_specific_formats(string, fmt):
"1/1/1/1",
"this_is_not_a_datetime",
"51a",
9,
datetime(2011, 1, 1),
],
)
def test_guess_datetime_format_invalid_inputs(invalid_dt):
Expand All @@ -222,6 +220,17 @@ def test_guess_datetime_format_invalid_inputs(invalid_dt):
assert parsing.guess_datetime_format(invalid_dt) is None


@pytest.mark.parametrize("invalid_type_dt", [9, datetime(2011, 1, 1)])
def test_guess_datetime_format_wrong_type_inputs(invalid_type_dt):
    # guess_datetime_format is typed to accept only ``str``; non-string
    # arguments are expected to be rejected with a TypeError rather than
    # being treated as an unguessable string (which would yield None).
    msg = r"^Argument 'dt_str' has incorrect type \(expected str, got .*\)$"
    with pytest.raises(TypeError, match=msg):
        parsing.guess_datetime_format(invalid_type_dt)


@pytest.mark.parametrize(
"string,fmt",
[
Expand Down