Skip to content

Commit

Permalink
DEPR: datetimelike inference with strings (#41731)
Browse files Browse the repository at this point in the history
  • Loading branch information
jbrockmendel committed Jun 1, 2021
1 parent 7b8f638 commit b8ee68b
Show file tree
Hide file tree
Showing 13 changed files with 113 additions and 60 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.0.rst
Expand Up @@ -698,6 +698,7 @@ Deprecations
- Deprecated passing arguments (apart from ``value``) as positional in :meth:`DataFrame.fillna` and :meth:`Series.fillna` (:issue:`41485`)
- Deprecated passing arguments as positional in :meth:`DataFrame.reset_index` (other than ``"level"``) and :meth:`Series.reset_index` (:issue:`41485`)
- Deprecated construction of :class:`Series` or :class:`DataFrame` with ``DatetimeTZDtype`` data and ``datetime64[ns]`` dtype. Use ``Series(data).dt.tz_localize(None)`` instead (:issue:`41555`, :issue:`33401`)
- Deprecated inference of ``timedelta64[ns]``, ``datetime64[ns]``, or ``DatetimeTZDtype`` dtypes in :class:`Series` construction when data containing strings is passed and no ``dtype`` is passed (:issue:`33558`)
- In a future version, constructing :class:`Series` or :class:`DataFrame` with ``datetime64[ns]`` data and ``DatetimeTZDtype`` will treat the data as wall-times instead of as UTC times (matching DatetimeIndex behavior). To treat the data as UTC times, use ``pd.Series(data).dt.tz_localize("UTC").dt.tz_convert(dtype.tz)`` or ``pd.Series(data.view("int64"), dtype=dtype)`` (:issue:`33401`)
- Deprecated passing arguments as positional in :meth:`DataFrame.set_axis` and :meth:`Series.set_axis` (other than ``"labels"``) (:issue:`41485`)
- Deprecated passing arguments as positional in :meth:`DataFrame.where` and :meth:`Series.where` (other than ``"cond"`` and ``"other"``) (:issue:`41485`)
Expand Down
2 changes: 1 addition & 1 deletion pandas/_libs/lib.pyi
Expand Up @@ -153,7 +153,7 @@ def ensure_string_array(

def infer_datetimelike_array(
arr: np.ndarray # np.ndarray[object]
) -> str: ...
) -> tuple[str, bool]: ...

def astype_intsafe(
arr: np.ndarray, # np.ndarray[object]
Expand Down
22 changes: 12 additions & 10 deletions pandas/_libs/lib.pyx
Expand Up @@ -1558,7 +1558,7 @@ def infer_dtype(value: object, skipna: bool = True) -> str:
return "mixed"


def infer_datetimelike_array(arr: ndarray[object]) -> str:
def infer_datetimelike_array(arr: ndarray[object]) -> tuple[str, bool]:
"""
Infer if we have a datetime or timedelta array.
- date: we have *only* date and maybe strings, nulls
Expand All @@ -1576,19 +1576,21 @@ def infer_datetimelike_array(arr: ndarray[object]) -> str:
Returns
-------
str: {datetime, timedelta, date, nat, mixed}
bool: whether a str element was seen before the result was determined
"""
cdef:
Py_ssize_t i, n = len(arr)
bint seen_timedelta = False, seen_date = False, seen_datetime = False
bint seen_tz_aware = False, seen_tz_naive = False
bint seen_nat = False
bint seen_nat = False, seen_str = False
list objs = []
object v

for i in range(n):
v = arr[i]
if isinstance(v, str):
objs.append(v)
seen_str = True

if len(objs) == 3:
break
Expand All @@ -1609,7 +1611,7 @@ def infer_datetimelike_array(arr: ndarray[object]) -> str:
seen_tz_aware = True

if seen_tz_naive and seen_tz_aware:
return "mixed"
return "mixed", seen_str
elif util.is_datetime64_object(v):
# np.datetime64
seen_datetime = True
Expand All @@ -1619,16 +1621,16 @@ def infer_datetimelike_array(arr: ndarray[object]) -> str:
# timedelta, or timedelta64
seen_timedelta = True
else:
return "mixed"
return "mixed", seen_str

if seen_date and not (seen_datetime or seen_timedelta):
return "date"
return "date", seen_str
elif seen_datetime and not seen_timedelta:
return "datetime"
return "datetime", seen_str
elif seen_timedelta and not seen_datetime:
return "timedelta"
return "timedelta", seen_str
elif seen_nat:
return "nat"
return "nat", seen_str

# short-circuit by trying to
# actually convert these strings
Expand All @@ -1637,14 +1639,14 @@ def infer_datetimelike_array(arr: ndarray[object]) -> str:
if len(objs):
try:
array_to_datetime(objs, errors="raise")
return "datetime"
return "datetime", seen_str
except (ValueError, TypeError):
pass

# we are *not* going to infer from strings
# for timedelta as too much ambiguity

return 'mixed'
return "mixed", seen_str


cdef inline bint is_timedelta(object o):
Expand Down
11 changes: 10 additions & 1 deletion pandas/core/dtypes/cast.py
Expand Up @@ -1543,7 +1543,7 @@ def try_timedelta(v: np.ndarray) -> np.ndarray:
else:
return td_values.reshape(shape)

inferred_type = lib.infer_datetimelike_array(ensure_object(v))
inferred_type, seen_str = lib.infer_datetimelike_array(ensure_object(v))

if inferred_type == "datetime":
# error: Incompatible types in assignment (expression has type "ExtensionArray",
Expand Down Expand Up @@ -1572,6 +1572,15 @@ def try_timedelta(v: np.ndarray) -> np.ndarray:
# "ExtensionArray", variable has type "Union[ndarray, List[Any]]")
value = try_datetime(v) # type: ignore[assignment]

if value.dtype.kind in ["m", "M"] and seen_str:
    # The result is datetime64/timedelta64 ("M"/"m") and the input held
    # strings, i.e. the dtype was inferred by parsing strings. That implicit
    # inference is deprecated, so tell the caller how to request it explicitly.
    # NOTE: every fragment that interpolates needs its own ``f`` prefix;
    # implicit concatenation does not extend the prefix to adjacent literals.
    warnings.warn(
        f"Inferring {value.dtype} from data containing strings is deprecated "
        "and will be removed in a future version. To retain the old behavior "
        f"explicitly pass Series(data, dtype={value.dtype})",
        FutureWarning,
        stacklevel=find_stack_level(),
    )
# return v.reshape(shape)
return value


Expand Down
4 changes: 3 additions & 1 deletion pandas/tests/apply/test_series_apply.py
Expand Up @@ -859,7 +859,9 @@ def test_apply_to_timedelta():
list_of_strings = ["00:00:01", np.nan, pd.NaT, pd.NaT]

a = pd.to_timedelta(list_of_strings) # noqa
b = Series(list_of_strings).apply(pd.to_timedelta) # noqa
with tm.assert_produces_warning(FutureWarning, match="Inferring timedelta64"):
ser = Series(list_of_strings)
b = ser.apply(pd.to_timedelta) # noqa
# Can't compare until apply on a Series gives the correct dtype
# assert_series_equal(a, b)

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/arithmetic/test_datetime64.py
Expand Up @@ -328,7 +328,7 @@ def test_dt64arr_timestamp_equality(self, box_with_array):
box_with_array if box_with_array not in [pd.Index, pd.array] else np.ndarray
)

ser = Series([Timestamp("2000-01-29 01:59:00"), Timestamp("2000-01-30"), "NaT"])
ser = Series([Timestamp("2000-01-29 01:59:00"), Timestamp("2000-01-30"), NaT])
ser = tm.box_expected(ser, box_with_array)

result = ser != ser
Expand Down
10 changes: 5 additions & 5 deletions pandas/tests/dtypes/test_inference.py
Expand Up @@ -1169,7 +1169,7 @@ def test_infer_dtype_period_with_na(self, na_value):
],
)
def test_infer_datetimelike_array_datetime(self, data):
assert lib.infer_datetimelike_array(data) == "datetime"
assert lib.infer_datetimelike_array(data) == ("datetime", False)

@pytest.mark.parametrize(
"data",
Expand All @@ -1181,11 +1181,11 @@ def test_infer_datetimelike_array_datetime(self, data):
],
)
def test_infer_datetimelike_array_timedelta(self, data):
assert lib.infer_datetimelike_array(data) == "timedelta"
assert lib.infer_datetimelike_array(data) == ("timedelta", False)

def test_infer_datetimelike_array_date(self):
arr = [date(2017, 6, 12), date(2017, 3, 11)]
assert lib.infer_datetimelike_array(arr) == "date"
assert lib.infer_datetimelike_array(arr) == ("date", False)

@pytest.mark.parametrize(
"data",
Expand All @@ -1200,7 +1200,7 @@ def test_infer_datetimelike_array_date(self):
],
)
def test_infer_datetimelike_array_mixed(self, data):
assert lib.infer_datetimelike_array(data) == "mixed"
assert lib.infer_datetimelike_array(data)[0] == "mixed"

@pytest.mark.parametrize(
"first, expected",
Expand All @@ -1218,7 +1218,7 @@ def test_infer_datetimelike_array_mixed(self, data):
@pytest.mark.parametrize("second", [None, np.nan])
def test_infer_datetimelike_array_nan_nat_like(self, first, second, expected):
first.append(second)
assert lib.infer_datetimelike_array(first) == expected
assert lib.infer_datetimelike_array(first) == (expected, False)

def test_infer_dtype_all_nan_nat_like(self):
arr = np.array([np.nan, np.nan])
Expand Down
45 changes: 24 additions & 21 deletions pandas/tests/resample/test_time_grouper.py
Expand Up @@ -305,27 +305,30 @@ def test_groupby_resample_interpolate():
.resample("1D")
.interpolate(method="linear")
)
expected_ind = pd.MultiIndex.from_tuples(
[
(50, "2018-01-07"),
(50, Timestamp("2018-01-08")),
(50, Timestamp("2018-01-09")),
(50, Timestamp("2018-01-10")),
(50, Timestamp("2018-01-11")),
(50, Timestamp("2018-01-12")),
(50, Timestamp("2018-01-13")),
(50, Timestamp("2018-01-14")),
(50, Timestamp("2018-01-15")),
(50, Timestamp("2018-01-16")),
(50, Timestamp("2018-01-17")),
(50, Timestamp("2018-01-18")),
(50, Timestamp("2018-01-19")),
(50, Timestamp("2018-01-20")),
(50, Timestamp("2018-01-21")),
(60, Timestamp("2018-01-14")),
],
names=["volume", "week_starting"],
)

msg = "containing strings is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
expected_ind = pd.MultiIndex.from_tuples(
[
(50, "2018-01-07"),
(50, Timestamp("2018-01-08")),
(50, Timestamp("2018-01-09")),
(50, Timestamp("2018-01-10")),
(50, Timestamp("2018-01-11")),
(50, Timestamp("2018-01-12")),
(50, Timestamp("2018-01-13")),
(50, Timestamp("2018-01-14")),
(50, Timestamp("2018-01-15")),
(50, Timestamp("2018-01-16")),
(50, Timestamp("2018-01-17")),
(50, Timestamp("2018-01-18")),
(50, Timestamp("2018-01-19")),
(50, Timestamp("2018-01-20")),
(50, Timestamp("2018-01-21")),
(60, Timestamp("2018-01-14")),
],
names=["volume", "week_starting"],
)
expected = DataFrame(
data={
"price": [
Expand Down
1 change: 1 addition & 0 deletions pandas/tests/series/accessors/test_dt_accessor.py
Expand Up @@ -679,6 +679,7 @@ def test_dt_timetz_accessor(self, tz_naive_fixture):
[["2016-01-07", "2016-01-01"], [[2016, 1, 4], [2015, 53, 5]]],
],
)
@pytest.mark.filterwarnings("ignore:Inferring datetime64:FutureWarning")
def test_isocalendar(self, input_series, expected_output):
result = pd.to_datetime(Series(input_series)).dt.isocalendar()
expected_frame = DataFrame(
Expand Down
6 changes: 5 additions & 1 deletion pandas/tests/series/methods/test_combine_first.py
Expand Up @@ -78,7 +78,11 @@ def test_combine_first_dt64(self):
s0 = to_datetime(Series(["2010", np.NaN]))
s1 = Series([np.NaN, "2011"])
rs = s0.combine_first(s1)
xp = Series([datetime(2010, 1, 1), "2011"])

msg = "containing strings is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
xp = Series([datetime(2010, 1, 1), "2011"])

tm.assert_series_equal(rs, xp)

def test_combine_first_dt_tz_values(self, tz_naive_fixture):
Expand Down
7 changes: 5 additions & 2 deletions pandas/tests/series/methods/test_fillna.py
Expand Up @@ -319,8 +319,11 @@ def test_datetime64_fillna(self):

# GH#6587
# make sure that we are treating as integer when filling
# this also tests inference of a datetime-like with NaT's
ser = Series([NaT, NaT, "2013-08-05 15:30:00.000001"])
msg = "containing strings is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
# this also tests inference of a datetime-like with NaT's
ser = Series([NaT, NaT, "2013-08-05 15:30:00.000001"])

expected = Series(
[
"2013-08-05 15:30:00.000001",
Expand Down
49 changes: 33 additions & 16 deletions pandas/tests/series/test_constructors.py
Expand Up @@ -900,14 +900,23 @@ def test_constructor_dtype_datetime64_7(self):

def test_constructor_dtype_datetime64_6(self):
# these will correctly infer a datetime
s = Series([None, NaT, "2013-08-05 15:30:00.000001"])
assert s.dtype == "datetime64[ns]"
s = Series([np.nan, NaT, "2013-08-05 15:30:00.000001"])
assert s.dtype == "datetime64[ns]"
s = Series([NaT, None, "2013-08-05 15:30:00.000001"])
assert s.dtype == "datetime64[ns]"
s = Series([NaT, np.nan, "2013-08-05 15:30:00.000001"])
assert s.dtype == "datetime64[ns]"
msg = "containing strings is deprecated"

with tm.assert_produces_warning(FutureWarning, match=msg):
ser = Series([None, NaT, "2013-08-05 15:30:00.000001"])
assert ser.dtype == "datetime64[ns]"

with tm.assert_produces_warning(FutureWarning, match=msg):
ser = Series([np.nan, NaT, "2013-08-05 15:30:00.000001"])
assert ser.dtype == "datetime64[ns]"

with tm.assert_produces_warning(FutureWarning, match=msg):
ser = Series([NaT, None, "2013-08-05 15:30:00.000001"])
assert ser.dtype == "datetime64[ns]"

with tm.assert_produces_warning(FutureWarning, match=msg):
ser = Series([NaT, np.nan, "2013-08-05 15:30:00.000001"])
assert ser.dtype == "datetime64[ns]"

def test_constructor_dtype_datetime64_5(self):
# tz-aware (UTC and other tz's)
Expand Down Expand Up @@ -1379,14 +1388,22 @@ def test_constructor_dtype_timedelta64(self):
assert td.dtype == "object"

# these will correctly infer a timedelta
s = Series([None, NaT, "1 Day"])
assert s.dtype == "timedelta64[ns]"
s = Series([np.nan, NaT, "1 Day"])
assert s.dtype == "timedelta64[ns]"
s = Series([NaT, None, "1 Day"])
assert s.dtype == "timedelta64[ns]"
s = Series([NaT, np.nan, "1 Day"])
assert s.dtype == "timedelta64[ns]"
msg = "containing strings is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
ser = Series([None, NaT, "1 Day"])
assert ser.dtype == "timedelta64[ns]"

with tm.assert_produces_warning(FutureWarning, match=msg):
ser = Series([np.nan, NaT, "1 Day"])
assert ser.dtype == "timedelta64[ns]"

with tm.assert_produces_warning(FutureWarning, match=msg):
ser = Series([NaT, None, "1 Day"])
assert ser.dtype == "timedelta64[ns]"

with tm.assert_produces_warning(FutureWarning, match=msg):
ser = Series([NaT, np.nan, "1 Day"])
assert ser.dtype == "timedelta64[ns]"

# GH 16406
def test_constructor_mixed_tz(self):
Expand Down
13 changes: 12 additions & 1 deletion pandas/tests/tools/test_to_timedelta.py
Expand Up @@ -187,6 +187,16 @@ def test_to_timedelta_via_apply(self):
result = Series([to_timedelta("00:00:01")])
tm.assert_series_equal(result, expected)

def test_to_timedelta_inference_without_warning(self):
# GH#41731 inference produces a warning in the Series constructor,
# but _not_ in to_timedelta
vals = ["00:00:01", pd.NaT]
with tm.assert_produces_warning(None):
result = to_timedelta(vals)

expected = TimedeltaIndex([pd.Timedelta(seconds=1), pd.NaT])
tm.assert_index_equal(result, expected)

def test_to_timedelta_on_missing_values(self):
# GH5438
timedelta_NaT = np.timedelta64("NaT")
Expand All @@ -197,7 +207,8 @@ def test_to_timedelta_on_missing_values(self):
)
tm.assert_series_equal(actual, expected)

actual = to_timedelta(Series(["00:00:01", pd.NaT]))
with tm.assert_produces_warning(FutureWarning, match="Inferring timedelta64"):
actual = to_timedelta(Series(["00:00:01", pd.NaT]))
tm.assert_series_equal(actual, expected)

actual = to_timedelta(np.nan)
Expand Down

0 comments on commit b8ee68b

Please sign in to comment.