Merge branch 'main' into ci/cache/daily

pandas-dev · May 15, 2024 · 655e6ef · 655e6ef
2 parents 17c4a96 + 0fc0336
commit 655e6ef
Show file tree

Hide file tree

Showing 26 changed files with 215 additions and 354 deletions.
diff --git a/ci/code_checks.sh b/ci/code_checks.sh
@@ -146,7 +146,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.Series.cat.rename_categories PR01,PR02" \
         -i "pandas.Series.cat.reorder_categories PR01,PR02" \
         -i "pandas.Series.cat.set_categories PR01,PR02" \
-        -i "pandas.Series.div PR07" \
         -i "pandas.Series.dt.as_unit PR01,PR02" \
         -i "pandas.Series.dt.ceil PR01,PR02" \
         -i "pandas.Series.dt.components SA01" \
@@ -166,33 +165,23 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.Series.dt.tz_convert PR01,PR02" \
         -i "pandas.Series.dt.tz_localize PR01,PR02" \
         -i "pandas.Series.dt.unit GL08" \
-        -i "pandas.Series.eq PR07,SA01" \
-        -i "pandas.Series.floordiv PR07" \
-        -i "pandas.Series.ge PR07,SA01" \
-        -i "pandas.Series.gt PR07,SA01" \
+        -i "pandas.Series.eq SA01" \
+        -i "pandas.Series.ge SA01" \
+        -i "pandas.Series.gt SA01" \
         -i "pandas.Series.kurt RT03,SA01" \
         -i "pandas.Series.kurtosis RT03,SA01" \
-        -i "pandas.Series.le PR07,SA01" \
+        -i "pandas.Series.le SA01" \
         -i "pandas.Series.list.__getitem__ SA01" \
         -i "pandas.Series.list.flatten SA01" \
         -i "pandas.Series.list.len SA01" \
-        -i "pandas.Series.lt PR07,SA01" \
-        -i "pandas.Series.ne PR07,SA01" \
+        -i "pandas.Series.lt SA01" \
+        -i "pandas.Series.ne SA01" \
         -i "pandas.Series.pad PR01,SA01" \
         -i "pandas.Series.plot PR02,SA01" \
         -i "pandas.Series.pop RT03,SA01" \
-        -i "pandas.Series.pow PR07" \
         -i "pandas.Series.prod RT03" \
         -i "pandas.Series.product RT03" \
-        -i "pandas.Series.radd PR07" \
-        -i "pandas.Series.rdiv PR07" \
         -i "pandas.Series.reorder_levels RT03,SA01" \
-        -i "pandas.Series.rfloordiv PR07" \
-        -i "pandas.Series.rmod PR07" \
-        -i "pandas.Series.rmul PR07" \
-        -i "pandas.Series.rpow PR07" \
-        -i "pandas.Series.rsub PR07" \
-        -i "pandas.Series.rtruediv PR07" \
         -i "pandas.Series.sem PR01,RT03,SA01" \
         -i "pandas.Series.skew RT03,SA01" \
         -i "pandas.Series.sparse PR01,SA01" \
@@ -232,14 +221,12 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.Series.str.wrap RT03,SA01" \
         -i "pandas.Series.str.zfill RT03" \
         -i "pandas.Series.struct.dtypes SA01" \
-        -i "pandas.Series.sub PR07" \
         -i "pandas.Series.sum RT03" \
         -i "pandas.Series.swaplevel SA01" \
         -i "pandas.Series.to_dict SA01" \
         -i "pandas.Series.to_frame SA01" \
         -i "pandas.Series.to_markdown SA01" \
         -i "pandas.Series.to_string SA01" \
-        -i "pandas.Series.truediv PR07" \
         -i "pandas.Series.update PR07,SA01" \
         -i "pandas.Series.var PR01,RT03,SA01" \
         -i "pandas.Timedelta PR07,SA01" \
@@ -290,7 +277,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.Timestamp.quarter SA01" \
         -i "pandas.Timestamp.replace PR07,SA01" \
         -i "pandas.Timestamp.resolution PR02,PR07,SA01" \
-        -i "pandas.Timestamp.round SA01" \
         -i "pandas.Timestamp.second GL08" \
         -i "pandas.Timestamp.strptime PR01,SA01" \
         -i "pandas.Timestamp.time SA01" \
@@ -301,7 +287,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.Timestamp.to_julian_date SA01" \
         -i "pandas.Timestamp.to_numpy PR01" \
         -i "pandas.Timestamp.to_period PR01,SA01" \
-        -i "pandas.Timestamp.to_pydatetime PR01,SA01" \
         -i "pandas.Timestamp.today SA01" \
         -i "pandas.Timestamp.toordinal SA01" \
         -i "pandas.Timestamp.tz SA01" \
@@ -548,7 +533,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.tseries.api.guess_datetime_format SA01" \
         -i "pandas.tseries.offsets.BDay PR02,SA01" \
         -i "pandas.tseries.offsets.BMonthBegin PR02" \
-        -i "pandas.tseries.offsets.BMonthEnd PR02" \
         -i "pandas.tseries.offsets.BQuarterBegin PR02" \
         -i "pandas.tseries.offsets.BQuarterBegin.freqstr SA01" \
         -i "pandas.tseries.offsets.BQuarterBegin.is_on_offset GL08" \
@@ -609,7 +593,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.tseries.offsets.BusinessMonthBegin.nanos GL08" \
         -i "pandas.tseries.offsets.BusinessMonthBegin.normalize GL08" \
         -i "pandas.tseries.offsets.BusinessMonthBegin.rule_code GL08" \
-        -i "pandas.tseries.offsets.BusinessMonthEnd PR02" \
         -i "pandas.tseries.offsets.BusinessMonthEnd.freqstr SA01" \
         -i "pandas.tseries.offsets.BusinessMonthEnd.is_on_offset GL08" \
         -i "pandas.tseries.offsets.BusinessMonthEnd.n GL08" \
@@ -754,7 +737,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.tseries.offsets.MonthBegin.nanos GL08" \
         -i "pandas.tseries.offsets.MonthBegin.normalize GL08" \
         -i "pandas.tseries.offsets.MonthBegin.rule_code GL08" \
-        -i "pandas.tseries.offsets.MonthEnd PR02" \
         -i "pandas.tseries.offsets.MonthEnd.freqstr SA01" \
         -i "pandas.tseries.offsets.MonthEnd.is_on_offset GL08" \
         -i "pandas.tseries.offsets.MonthEnd.n GL08" \
@@ -799,7 +781,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.tseries.offsets.SemiMonthBegin.nanos GL08" \
         -i "pandas.tseries.offsets.SemiMonthBegin.normalize GL08" \
         -i "pandas.tseries.offsets.SemiMonthBegin.rule_code GL08" \
-        -i "pandas.tseries.offsets.SemiMonthEnd PR02,SA01" \
+        -i "pandas.tseries.offsets.SemiMonthEnd SA01" \
         -i "pandas.tseries.offsets.SemiMonthEnd.day_of_month GL08" \
         -i "pandas.tseries.offsets.SemiMonthEnd.freqstr SA01" \
         -i "pandas.tseries.offsets.SemiMonthEnd.is_on_offset GL08" \

diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst
@@ -1711,6 +1711,6 @@ Why does assignment fail when using chained indexing?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 :ref:`Copy-on-Write <copy_on_write>` is the new default with pandas 3.0.
-This means than chained indexing will never work.
+This means that chained indexing will never work.
 See :ref:`this section <copy_on_write_chained_assignment>`
 for more context.
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
@@ -1511,7 +1511,6 @@ Currently, options unsupported by the C and pyarrow engines include:
 
 * ``sep`` other than a single character (e.g. regex separators)
 * ``skipfooter``
-* ``sep=None`` with ``delim_whitespace=False``
 
 Specifying any of the above options will produce a ``ParserWarning`` unless the
 python engine is selected explicitly using ``engine='python'``.
@@ -1526,7 +1525,6 @@ Options that are unsupported by the pyarrow engine which are not covered by the
 * ``memory_map``
 * ``dialect``
 * ``on_bad_lines``
-* ``delim_whitespace``
 * ``quoting``
 * ``lineterminator``
 * ``converters``

diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -243,7 +243,9 @@ Removal of prior version deprecations/changes
 - Removed deprecated "method" and "limit" keywords from :meth:`Series.replace` and :meth:`DataFrame.replace` (:issue:`53492`)
 - Removed extension test classes ``BaseNoReduceTests``, ``BaseNumericReduceTests``, ``BaseBooleanReduceTests`` (:issue:`54663`)
 - Removed the "closed" and "normalize" keywords in :meth:`DatetimeIndex.__new__` (:issue:`52628`)
+- Removed the deprecated ``delim_whitespace`` keyword in :func:`read_csv` and :func:`read_table`, use ``sep=r"\s+"`` instead (:issue:`55569`)
 - Require :meth:`SparseDtype.fill_value` to be a valid value for the :meth:`SparseDtype.subtype` (:issue:`53043`)
+- Stopped performing dtype inference when setting a :class:`Index` into a :class:`DataFrame` (:issue:`56102`)
 - Stopped performing dtype inference with in :meth:`Index.insert` with object-dtype index; this often affects the index/columns that result when setting new entries into an empty :class:`Series` or :class:`DataFrame` (:issue:`51363`)
 - Removed the "closed" and "unit" keywords in :meth:`TimedeltaIndex.__new__` (:issue:`52628`, :issue:`55499`)
 - All arguments in :meth:`Index.sort_values` are now keyword only (:issue:`56493`)

diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx
@@ -882,7 +882,30 @@ class NaTType(_NaT):
         """
         Convert a Timestamp object to a native Python datetime object.
 
-        If warn=True, issue a warning if nanoseconds is nonzero.
+        This method is useful for when you need to utilize a pandas Timestamp
+        object in contexts where native Python datetime objects are expected
+        or required. The conversion discards the nanoseconds component, and a
+        warning can be issued in such cases if desired.
+
+        Parameters
+        ----------
+        warn : bool, default True
+            If True, issues a warning when the timestamp includes nonzero
+            nanoseconds, as these will be discarded during the conversion.
+
+        Returns
+        -------
+        datetime.datetime or NaT
+            Returns a datetime.datetime object representing the timestamp,
+            with year, month, day, hour, minute, second, and microsecond components.
+            If the timestamp is NaT (Not a Time), returns NaT.
+
+        See Also
+        --------
+        datetime.datetime : The standard Python datetime class that this method
+            returns.
+        Timestamp.timestamp : Convert a Timestamp object to POSIX timestamp.
+        Timestamp.to_datetime64 : Convert a Timestamp object to numpy.datetime64.
 
         Examples
         --------
@@ -947,6 +970,12 @@ class NaTType(_NaT):
         """
         Round the Timestamp to the specified resolution.
 
+        This method rounds the given Timestamp down to a specified frequency
+        level. It is particularly useful in data analysis to normalize timestamps
+        to regular frequency intervals. For instance, rounding to the nearest
+        minute, hour, or day can help in time series comparisons or resampling
+        operations.
+
         Parameters
         ----------
         freq : str
@@ -981,6 +1010,14 @@ timedelta}, default 'raise'
         ------
         ValueError if the freq cannot be converted
 
+        See Also
+        --------
+        datetime.round : Similar behavior in native Python datetime module.
+        Timestamp.floor : Round the Timestamp downward to the nearest multiple
+            of the specified frequency.
+        Timestamp.ceil : Round the Timestamp upward to the nearest multiple of
+            the specified frequency.
+
         Notes
         -----
         If the Timestamp has a timezone, rounding will take place relative to the

diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx
@@ -2939,7 +2939,7 @@ cdef class MonthEnd(MonthOffset):
 
     MonthEnd goes to the next date which is an end of the month.
 
-    Parameters
+    Attributes
     ----------
     n : int, default 1
         The number of months represented.
@@ -3014,7 +3014,7 @@ cdef class BusinessMonthEnd(MonthOffset):
 
     BusinessMonthEnd goes to the next date which is the last business day of the month.
 
-    Parameters
+    Attributes
     ----------
     n : int, default 1
         The number of months represented.
@@ -3222,7 +3222,7 @@ cdef class SemiMonthEnd(SemiMonthOffset):
     """
     Two DateOffset's per month repeating on the last day of the month & day_of_month.
 
-    Parameters
+    Attributes
     ----------
     n : int, default 1
         The number of months represented.

diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx
@@ -1196,7 +1196,30 @@ cdef class _Timestamp(ABCTimestamp):
         """
         Convert a Timestamp object to a native Python datetime object.
 
-        If warn=True, issue a warning if nanoseconds is nonzero.
+        This method is useful for when you need to utilize a pandas Timestamp
+        object in contexts where native Python datetime objects are expected
+        or required. The conversion discards the nanoseconds component, and a
+        warning can be issued in such cases if desired.
+
+        Parameters
+        ----------
+        warn : bool, default True
+            If True, issues a warning when the timestamp includes nonzero
+            nanoseconds, as these will be discarded during the conversion.
+
+        Returns
+        -------
+        datetime.datetime or NaT
+            Returns a datetime.datetime object representing the timestamp,
+            with year, month, day, hour, minute, second, and microsecond components.
+            If the timestamp is NaT (Not a Time), returns NaT.
+
+        See Also
+        --------
+        datetime.datetime : The standard Python datetime class that this method
+            returns.
+        Timestamp.timestamp : Convert a Timestamp object to POSIX timestamp.
+        Timestamp.to_datetime64 : Convert a Timestamp object to numpy.datetime64.
 
         Examples
         --------
@@ -2015,6 +2038,12 @@ class Timestamp(_Timestamp):
         """
         Round the Timestamp to the specified resolution.
 
+        This method rounds the given Timestamp down to a specified frequency
+        level. It is particularly useful in data analysis to normalize timestamps
+        to regular frequency intervals. For instance, rounding to the nearest
+        minute, hour, or day can help in time series comparisons or resampling
+        operations.
+
         Parameters
         ----------
         freq : str
@@ -2049,6 +2078,14 @@ timedelta}, default 'raise'
         ------
         ValueError if the freq cannot be converted
 
+        See Also
+        --------
+        datetime.round : Similar behavior in native Python datetime module.
+        Timestamp.floor : Round the Timestamp downward to the nearest multiple
+            of the specified frequency.
+        Timestamp.ceil : Round the Timestamp upward to the nearest multiple of
+            the specified frequency.
+
         Notes
         -----
         If the Timestamp has a timezone, rounding will take place relative to the

diff --git a/pandas/core/construction.py b/pandas/core/construction.py
@@ -38,7 +38,6 @@
     ensure_object,
     is_list_like,
     is_object_dtype,
-    is_string_dtype,
     pandas_dtype,
 )
 from pandas.core.dtypes.dtypes import NumpyEADtype
@@ -555,9 +554,7 @@ def sanitize_array(
         # Avoid ending up with a NumpyExtensionArray
         dtype = dtype.numpy_dtype
 
-    object_index = False
-    if isinstance(data, ABCIndex) and data.dtype == object and dtype is None:
-        object_index = True
+    data_was_index = isinstance(data, ABCIndex)
 
     # extract ndarray or ExtensionArray, ensure we have no NumpyExtensionArray
     data = extract_array(data, extract_numpy=True, extract_range=True)
@@ -610,15 +607,8 @@ def sanitize_array(
 
         if dtype is None:
             subarr = data
-            if data.dtype == object:
+            if data.dtype == object and not data_was_index:
                 subarr = maybe_infer_to_datetimelike(data)
-                if (
-                    object_index
-                    and using_pyarrow_string_dtype()
-                    and is_string_dtype(subarr)
-                ):
-                    # Avoid inference when string option is set
-                    subarr = data
             elif data.dtype.kind == "U" and using_pyarrow_string_dtype():
                 from pandas.core.arrays.string_ import StringDtype
 

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -5059,22 +5059,7 @@ def _sanitize_column(self, value) -> tuple[ArrayLike, BlockValuesRefs | None]:
 
         if is_list_like(value):
             com.require_length_match(value, self.index)
-        arr = sanitize_array(value, self.index, copy=True, allow_2d=True)
-        if (
-            isinstance(value, Index)
-            and value.dtype == "object"
-            and arr.dtype != value.dtype
-        ):  #
-            # TODO: Remove kludge in sanitize_array for string mode when enforcing
-            # this deprecation
-            warnings.warn(
-                "Setting an Index with object dtype into a DataFrame will stop "
-                "inferring another dtype in a future version. Cast the Index "
-                "explicitly before setting it into the DataFrame.",
-                FutureWarning,
-                stacklevel=find_stack_level(),
-            )
-        return arr, None
+        return sanitize_array(value, self.index, copy=True, allow_2d=True), None
 
     @property
     def _series(self):

diff --git a/pandas/core/ops/docstrings.py b/pandas/core/ops/docstrings.py
@@ -436,6 +436,7 @@ def make_flex_doc(op_name: str, typ: str) -> str:
 Parameters
 ----------
 other : Series or scalar value
+    The second operand in this operation.
 level : int or name
     Broadcast across a level, matching Index values on the
     passed MultiIndex level.