Skip to content

Commit

Permalink
DEPR: stop inferring dt64/td64 from strings in Series constructor (#49319
Browse files Browse the repository at this point in the history
)

* DEPR: stop inferring dt64/td64 from strings in Series constructor

* update pyi
  • Loading branch information
jbrockmendel committed Oct 26, 2022
1 parent 218ab09 commit 6ee0acb
Show file tree
Hide file tree
Showing 12 changed files with 72 additions and 116 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,7 @@ Removal of prior version deprecations/changes
- Removed the ``display.column_space`` option in favor of ``df.to_string(col_space=...)`` (:issue:`47280`)
- Removed the deprecated method ``mad`` from pandas classes (:issue:`11787`)
- Removed the deprecated method ``tshift`` from pandas classes (:issue:`11631`)
- Changed the behavior of the :class:`Series` constructor; it will no longer infer a datetime64 or timedelta64 dtype from string entries (:issue:`41731`)
- Changed behavior of :class:`Index` constructor when passed a ``SparseArray`` or ``SparseDtype`` to retain that dtype instead of casting to ``numpy.ndarray`` (:issue:`43930`)

.. ---------------------------------------------------------------------------
Expand Down
2 changes: 1 addition & 1 deletion pandas/_libs/lib.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ def ensure_string_array(
) -> npt.NDArray[np.object_]: ...
def infer_datetimelike_array(
arr: npt.NDArray[np.object_],
) -> tuple[str, bool]: ...
) -> str: ...
def convert_nans_to_NA(
arr: npt.NDArray[np.object_],
) -> npt.NDArray[np.object_]: ...
Expand Down
50 changes: 15 additions & 35 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,6 @@ from pandas._libs.util cimport (
is_nan,
)

from pandas._libs.tslib import array_to_datetime
from pandas._libs.tslibs import (
OutOfBoundsDatetime,
OutOfBoundsTimedelta,
Expand Down Expand Up @@ -1583,25 +1582,19 @@ def infer_datetimelike_array(arr: ndarray[object]) -> tuple[str, bool]:
Returns
-------
str: {datetime, timedelta, date, nat, mixed}
bool
"""
cdef:
Py_ssize_t i, n = len(arr)
bint seen_timedelta = False, seen_date = False, seen_datetime = False
bint seen_tz_aware = False, seen_tz_naive = False
bint seen_nat = False, seen_str = False
bint seen_nat = False
bint seen_period = False, seen_interval = False
list objs = []
object v

for i in range(n):
v = arr[i]
if isinstance(v, str):
objs.append(v)
seen_str = True

if len(objs) == 3:
break
return "mixed"

elif v is None or util.is_nan(v):
# nan or None
Expand All @@ -1619,7 +1612,7 @@ def infer_datetimelike_array(arr: ndarray[object]) -> tuple[str, bool]:
seen_tz_aware = True

if seen_tz_naive and seen_tz_aware:
return "mixed", seen_str
return "mixed"
elif util.is_datetime64_object(v):
# np.datetime64
seen_datetime = True
Expand All @@ -1635,43 +1628,30 @@ def infer_datetimelike_array(arr: ndarray[object]) -> tuple[str, bool]:
seen_interval = True
break
else:
return "mixed", seen_str
return "mixed"

if seen_period:
if is_period_array(arr):
return "period", seen_str
return "mixed", seen_str
return "period"
return "mixed"

if seen_interval:
if is_interval_array(arr):
return "interval", seen_str
return "mixed", seen_str
return "interval"
return "mixed"

if seen_date and not (seen_datetime or seen_timedelta):
return "date", seen_str
return "date"
elif seen_datetime and not seen_timedelta:
return "datetime", seen_str
return "datetime"
elif seen_timedelta and not seen_datetime:
return "timedelta", seen_str
return "timedelta"
elif seen_datetime and seen_timedelta:
return "mixed"
elif seen_nat:
return "nat", seen_str
return "nat"

# short-circuit by trying to
# actually convert these strings
# this is for performance as we don't need to try
# convert *every* string array
if len(objs):
try:
# require_iso8601 as in maybe_infer_to_datetimelike
array_to_datetime(objs, errors="raise", require_iso8601=True)
return "datetime", seen_str
except (ValueError, TypeError):
pass

# we are *not* going to infer from strings
# for timedelta as too much ambiguity

return "mixed", seen_str
return "mixed"


cdef inline bint is_timedelta(object o):
Expand Down
15 changes: 4 additions & 11 deletions pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -1264,7 +1264,9 @@ def try_timedelta(v: np.ndarray) -> np.ndarray:
else:
return td_values.reshape(shape)

inferred_type, seen_str = lib.infer_datetimelike_array(ensure_object(v))
# TODO: can we just do lib.maybe_convert_objects for this entire function?
inferred_type = lib.infer_datetimelike_array(ensure_object(v))

if inferred_type in ["period", "interval"]:
# Incompatible return value type (got "Union[ExtensionArray, ndarray]",
# expected "Union[ndarray, DatetimeArray, TimedeltaArray, PeriodArray,
Expand All @@ -1280,14 +1282,14 @@ def try_timedelta(v: np.ndarray) -> np.ndarray:
elif inferred_type == "timedelta":
value = try_timedelta(v)
elif inferred_type == "nat":
# only reached if we have at least 1 NaT and the rest (NaT or None or np.nan)

# if all NaT, return as datetime
if isna(v).all():
# error: Incompatible types in assignment (expression has type
# "ExtensionArray", variable has type "Union[ndarray, List[Any]]")
value = try_datetime(v) # type: ignore[assignment]
else:

# We have at least a NaT and a string
# try timedelta first to avoid spurious datetime conversions
# e.g. '00:00:01' is a timedelta but technically is also a datetime
Expand All @@ -1300,15 +1302,6 @@ def try_timedelta(v: np.ndarray) -> np.ndarray:
# "ExtensionArray", variable has type "Union[ndarray, List[Any]]")
value = try_datetime(v) # type: ignore[assignment]

if value.dtype.kind in ["m", "M"] and seen_str:
# TODO(2.0): enforcing this deprecation should close GH#40111
warnings.warn(
f"Inferring {value.dtype} from data containing strings is deprecated "
"and will be removed in a future version. To retain the old behavior "
f"explicitly pass Series(data, dtype={value.dtype})",
FutureWarning,
stacklevel=find_stack_level(),
)
return value


Expand Down
2 changes: 1 addition & 1 deletion pandas/io/parsers/base_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -775,7 +775,7 @@ def _infer_types(
result = BooleanArray(result, bool_mask)
elif result.dtype == np.object_ and use_nullable_dtypes:
# read_excel sends array of datetime objects
inferred_type, _ = lib.infer_datetimelike_array(result)
inferred_type = lib.infer_datetimelike_array(result)
if inferred_type != "datetime":
result = StringDtype().construct_array_type()._from_sequence(values)

Expand Down
3 changes: 1 addition & 2 deletions pandas/tests/apply/test_series_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -859,8 +859,7 @@ def test_apply_to_timedelta():
list_of_strings = ["00:00:01", np.nan, pd.NaT, pd.NaT]

a = pd.to_timedelta(list_of_strings)
with tm.assert_produces_warning(FutureWarning, match="Inferring timedelta64"):
ser = Series(list_of_strings)
ser = Series(list_of_strings)
b = ser.apply(pd.to_timedelta)
tm.assert_series_equal(Series(a), b)

Expand Down
10 changes: 5 additions & 5 deletions pandas/tests/dtypes/test_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -1346,7 +1346,7 @@ def test_infer_dtype_period_with_na(self, na_value):
],
)
def test_infer_datetimelike_array_datetime(self, data):
assert lib.infer_datetimelike_array(data) == ("datetime", False)
assert lib.infer_datetimelike_array(data) == "datetime"

@pytest.mark.parametrize(
"data",
Expand All @@ -1358,11 +1358,11 @@ def test_infer_datetimelike_array_datetime(self, data):
],
)
def test_infer_datetimelike_array_timedelta(self, data):
assert lib.infer_datetimelike_array(data) == ("timedelta", False)
assert lib.infer_datetimelike_array(data) == "timedelta"

def test_infer_datetimelike_array_date(self):
arr = [date(2017, 6, 12), date(2017, 3, 11)]
assert lib.infer_datetimelike_array(arr) == ("date", False)
assert lib.infer_datetimelike_array(arr) == "date"

@pytest.mark.parametrize(
"data",
Expand All @@ -1377,7 +1377,7 @@ def test_infer_datetimelike_array_date(self):
],
)
def test_infer_datetimelike_array_mixed(self, data):
assert lib.infer_datetimelike_array(data)[0] == "mixed"
assert lib.infer_datetimelike_array(data) == "mixed"

@pytest.mark.parametrize(
"first, expected",
Expand All @@ -1395,7 +1395,7 @@ def test_infer_datetimelike_array_mixed(self, data):
@pytest.mark.parametrize("second", [None, np.nan])
def test_infer_datetimelike_array_nan_nat_like(self, first, second, expected):
first.append(second)
assert lib.infer_datetimelike_array(first) == (expected, False)
assert lib.infer_datetimelike_array(first) == expected

def test_infer_dtype_all_nan_nat_like(self):
arr = np.array([np.nan, np.nan])
Expand Down
44 changes: 21 additions & 23 deletions pandas/tests/resample/test_time_grouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -321,29 +321,27 @@ def test_groupby_resample_interpolate():
.interpolate(method="linear")
)

msg = "containing strings is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
expected_ind = pd.MultiIndex.from_tuples(
[
(50, "2018-01-07"),
(50, Timestamp("2018-01-08")),
(50, Timestamp("2018-01-09")),
(50, Timestamp("2018-01-10")),
(50, Timestamp("2018-01-11")),
(50, Timestamp("2018-01-12")),
(50, Timestamp("2018-01-13")),
(50, Timestamp("2018-01-14")),
(50, Timestamp("2018-01-15")),
(50, Timestamp("2018-01-16")),
(50, Timestamp("2018-01-17")),
(50, Timestamp("2018-01-18")),
(50, Timestamp("2018-01-19")),
(50, Timestamp("2018-01-20")),
(50, Timestamp("2018-01-21")),
(60, Timestamp("2018-01-14")),
],
names=["volume", "week_starting"],
)
expected_ind = pd.MultiIndex.from_tuples(
[
(50, Timestamp("2018-01-07")),
(50, Timestamp("2018-01-08")),
(50, Timestamp("2018-01-09")),
(50, Timestamp("2018-01-10")),
(50, Timestamp("2018-01-11")),
(50, Timestamp("2018-01-12")),
(50, Timestamp("2018-01-13")),
(50, Timestamp("2018-01-14")),
(50, Timestamp("2018-01-15")),
(50, Timestamp("2018-01-16")),
(50, Timestamp("2018-01-17")),
(50, Timestamp("2018-01-18")),
(50, Timestamp("2018-01-19")),
(50, Timestamp("2018-01-20")),
(50, Timestamp("2018-01-21")),
(60, Timestamp("2018-01-14")),
],
names=["volume", "week_starting"],
)

expected = DataFrame(
data={
Expand Down
4 changes: 1 addition & 3 deletions pandas/tests/series/methods/test_combine_first.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,9 +79,7 @@ def test_combine_first_dt64(self):
s1 = Series([np.NaN, "2011"])
rs = s0.combine_first(s1)

msg = "containing strings is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
xp = Series([datetime(2010, 1, 1), "2011"])
xp = Series([datetime(2010, 1, 1), "2011"], dtype="datetime64[ns]")

tm.assert_series_equal(rs, xp)

Expand Down
5 changes: 1 addition & 4 deletions pandas/tests/series/methods/test_fillna.py
Original file line number Diff line number Diff line change
Expand Up @@ -365,10 +365,7 @@ def test_datetime64_fillna(self):
def test_datetime64_fillna_backfill(self):
# GH#6587
# make sure that we are treating as integer when filling
msg = "containing strings is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
# this also tests inference of a datetime-like with NaT's
ser = Series([NaT, NaT, "2013-08-05 15:30:00.000001"])
ser = Series([NaT, NaT, "2013-08-05 15:30:00.000001"], dtype="M8[ns]")

expected = Series(
[
Expand Down
48 changes: 20 additions & 28 deletions pandas/tests/series/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -1018,24 +1018,20 @@ def test_constructor_dtype_datetime64_7(self):
assert series1.dtype == object

def test_constructor_dtype_datetime64_6(self):
# these will correctly infer a datetime
msg = "containing strings is deprecated"
# as of 2.0, these no longer infer datetime64 based on the strings,
# matching the Index behavior

with tm.assert_produces_warning(FutureWarning, match=msg):
ser = Series([None, NaT, "2013-08-05 15:30:00.000001"])
assert ser.dtype == "datetime64[ns]"
ser = Series([None, NaT, "2013-08-05 15:30:00.000001"])
assert ser.dtype == object

with tm.assert_produces_warning(FutureWarning, match=msg):
ser = Series([np.nan, NaT, "2013-08-05 15:30:00.000001"])
assert ser.dtype == "datetime64[ns]"
ser = Series([np.nan, NaT, "2013-08-05 15:30:00.000001"])
assert ser.dtype == object

with tm.assert_produces_warning(FutureWarning, match=msg):
ser = Series([NaT, None, "2013-08-05 15:30:00.000001"])
assert ser.dtype == "datetime64[ns]"
ser = Series([NaT, None, "2013-08-05 15:30:00.000001"])
assert ser.dtype == object

with tm.assert_produces_warning(FutureWarning, match=msg):
ser = Series([NaT, np.nan, "2013-08-05 15:30:00.000001"])
assert ser.dtype == "datetime64[ns]"
ser = Series([NaT, np.nan, "2013-08-05 15:30:00.000001"])
assert ser.dtype == object

def test_constructor_dtype_datetime64_5(self):
# tz-aware (UTC and other tz's)
Expand Down Expand Up @@ -1517,23 +1513,19 @@ def test_constructor_dtype_timedelta64(self):
td = Series([timedelta(days=i) for i in range(3)] + ["foo"])
assert td.dtype == "object"

# these will correctly infer a timedelta
msg = "containing strings is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
ser = Series([None, NaT, "1 Day"])
assert ser.dtype == "timedelta64[ns]"
# as of 2.0, these no longer infer timedelta64 based on the strings,
# matching Index behavior
ser = Series([None, NaT, "1 Day"])
assert ser.dtype == object

with tm.assert_produces_warning(FutureWarning, match=msg):
ser = Series([np.nan, NaT, "1 Day"])
assert ser.dtype == "timedelta64[ns]"
ser = Series([np.nan, NaT, "1 Day"])
assert ser.dtype == object

with tm.assert_produces_warning(FutureWarning, match=msg):
ser = Series([NaT, None, "1 Day"])
assert ser.dtype == "timedelta64[ns]"
ser = Series([NaT, None, "1 Day"])
assert ser.dtype == object

with tm.assert_produces_warning(FutureWarning, match=msg):
ser = Series([NaT, np.nan, "1 Day"])
assert ser.dtype == "timedelta64[ns]"
ser = Series([NaT, np.nan, "1 Day"])
assert ser.dtype == object

# GH 16406
def test_constructor_mixed_tz(self):
Expand Down
4 changes: 1 addition & 3 deletions pandas/tests/tools/test_to_timedelta.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,9 +207,7 @@ def test_to_timedelta_on_missing_values(self):
)
tm.assert_series_equal(actual, expected)

with tm.assert_produces_warning(FutureWarning, match="Inferring timedelta64"):
ser = Series(["00:00:01", pd.NaT])
assert ser.dtype == "m8[ns]"
ser = Series(["00:00:01", pd.NaT], dtype="m8[ns]")
actual = to_timedelta(ser)
tm.assert_series_equal(actual, expected)

Expand Down

0 comments on commit 6ee0acb

Please sign in to comment.