CLN: assorted (#52569)

pandas-dev · Apr 10, 2023 · commit 2d5ad57
1 parent c537b36
Show file tree

Hide file tree

Showing 28 changed files with 65 additions and 148 deletions.
diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
@@ -248,6 +248,7 @@ Performance improvements
 - Performance improvement in :meth:`Series.combine_first` (:issue:`51777`)
 - Performance improvement in :meth:`MultiIndex.set_levels` and :meth:`MultiIndex.set_codes` when ``verify_integrity=True`` (:issue:`51873`)
 - Performance improvement in :func:`factorize` for object columns not containing strings (:issue:`51921`)
+- Performance improvement in :func:`concat` (:issue:`52291`, :issue:`52290`)
 - Performance improvement in :class:`Series` reductions (:issue:`52341`)
 - Performance improvement in :meth:`Series.to_numpy` when dtype is a numpy float dtype and ``na_value`` is ``np.nan`` (:issue:`52430`)
 - Performance improvement in :meth:`Series.corr` and :meth:`Series.cov` for extension dtypes (:issue:`52502`)

diff --git a/pandas/_config/__init__.py b/pandas/_config/__init__.py
@@ -30,11 +30,11 @@
 from pandas._config.display import detect_console_encoding
 
 
-def using_copy_on_write():
+def using_copy_on_write() -> bool:
     _mode_options = _global_config["mode"]
     return _mode_options["copy_on_write"] and _mode_options["data_manager"] == "block"
 
 
-def using_nullable_dtypes():
+def using_nullable_dtypes() -> bool:
     _mode_options = _global_config["mode"]
     return _mode_options["nullable_dtypes"]
diff --git a/pandas/_libs/tslibs/timestamps.pyi b/pandas/_libs/tslibs/timestamps.pyi
@@ -131,12 +131,13 @@ class Timestamp(datetime):
     def astimezone(self, tz: _tzinfo | None) -> Self: ...  # type: ignore[override]
     def ctime(self) -> str: ...
     def isoformat(self, sep: str = ..., timespec: str = ...) -> str: ...
-    # Return type "datetime" of "strptime" incompatible with return type "Timestamp"
-    # in supertype "datetime"
     @classmethod
-    def strptime(  # type: ignore[override]
-        cls, date_string: str, format: str
-    ) -> datetime: ...
+    def strptime(
+        # Note: strptime is actually disabled and raises NotImplementedError
+        cls,
+        date_string: str,
+        format: str,
+    ) -> Self: ...
     def utcoffset(self) -> timedelta | None: ...
     def tzname(self) -> str | None: ...
     def dst(self) -> timedelta | None: ...

diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py
@@ -547,7 +547,7 @@ def assert_period_array_equal(left, right, obj: str = "PeriodArray") -> None:
     _check_isinstance(left, right, PeriodArray)
 
     assert_numpy_array_equal(left._ndarray, right._ndarray, obj=f"{obj}._ndarray")
-    assert_attr_equal("freq", left, right, obj=obj)
+    assert_attr_equal("dtype", left, right, obj=obj)
 
 
 def assert_datetime_array_equal(

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -471,7 +471,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> npt.NDArray[np.bool_]:
 
         if (
             len(values) > 0
-            and is_numeric_dtype(values)
+            and is_numeric_dtype(values.dtype)
             and not is_signed_integer_dtype(comps)
         ):
             # GH#46485 Use object to avoid upcast to float64 later

diff --git a/pandas/core/apply.py b/pandas/core/apply.py
@@ -236,7 +236,7 @@ def transform(self) -> DataFrame | Series:
         # DataFrameGroupBy, BaseWindow, Resampler]"; expected "Union[DataFrame,
         # Series]"
         if not isinstance(result, (ABCSeries, ABCDataFrame)) or not result.index.equals(
-            obj.index  # type:ignore[arg-type]
+            obj.index  # type: ignore[arg-type]
         ):
             raise ValueError("Function did not transform")
 

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
@@ -1150,7 +1150,7 @@ def _concat_same_type(cls, to_concat) -> Self:
         """
         chunks = [array for ea in to_concat for array in ea._pa_array.iterchunks()]
         if to_concat[0].dtype == "string":
-            # StringDtype has no attrivute pyarrow_dtype
+            # StringDtype has no attribute pyarrow_dtype
             pa_dtype = pa.string()
         else:
             pa_dtype = to_concat[0].dtype.pyarrow_dtype

diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py
@@ -13,10 +13,7 @@
     missing as libmissing,
 )
 
-from pandas.core.dtypes.common import (
-    is_list_like,
-    is_numeric_dtype,
-)
+from pandas.core.dtypes.common import is_list_like
 from pandas.core.dtypes.dtypes import register_extension_dtype
 from pandas.core.dtypes.missing import isna
 
@@ -180,7 +177,7 @@ def coerce_to_array(
     if isinstance(values, np.ndarray) and values.dtype == np.bool_:
         if copy:
             values = values.copy()
-    elif isinstance(values, np.ndarray) and is_numeric_dtype(values.dtype):
+    elif isinstance(values, np.ndarray) and values.dtype.kind in "iufcb":
         mask_values = isna(values)
 
         values_bool = np.zeros(len(values), dtype=bool)

diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
@@ -113,7 +113,6 @@
 from pandas.core import (
     algorithms,
     nanops,
-    ops,
 )
 from pandas.core.algorithms import (
     checked_add_with_arr,
@@ -903,13 +902,7 @@ def _cmp_method(self, other, op):
 
         dtype = getattr(other, "dtype", None)
         if is_object_dtype(dtype):
-            # We have to use comp_method_OBJECT_ARRAY instead of numpy
-            #  comparison otherwise it would fail to raise when
-            #  comparing tz-aware and tz-naive
-            result = ops.comp_method_OBJECT_ARRAY(
-                op, np.asarray(self.astype(object)), other
-            )
-            return result
+            return op(np.asarray(self, dtype=object), other)
 
         if other is NaT:
             if op is operator.ne:

diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py
@@ -1805,21 +1805,6 @@ def _formatter(self, boxed: bool = False):
         # This will infer the correct formatter from the dtype of the values.
         return None
 
-    # ------------------------------------------------------------------------
-    # GroupBy Methods
-
-    def _groupby_op(
-        self,
-        *,
-        how: str,
-        has_dropped_na: bool,
-        min_count: int,
-        ngroups: int,
-        ids: npt.NDArray[np.intp],
-        **kwargs,
-    ):
-        raise NotImplementedError(f"{self.dtype} dtype not supported")
-
 
 def _make_sparse(
     arr: np.ndarray,

diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py
@@ -25,10 +25,8 @@
     DT64NS_DTYPE,
     TD64NS_DTYPE,
     ensure_object,
-    is_bool_dtype,
     is_dtype_equal,
     is_extension_array_dtype,
-    is_integer_dtype,
     is_object_dtype,
     is_scalar,
     is_string_or_object_np_dtype,
@@ -431,23 +429,6 @@ def notna(obj: object) -> bool | npt.NDArray[np.bool_] | NDFrame:
 notnull = notna
 
 
-def isna_compat(arr, fill_value=np.nan) -> bool:
-    """
-    Parameters
-    ----------
-    arr: a numpy array
-    fill_value: fill value, default to np.nan
-
-    Returns
-    -------
-    True if we can fill using this fill_value
-    """
-    if isna(fill_value):
-        dtype = arr.dtype
-        return not (is_bool_dtype(dtype) or is_integer_dtype(dtype))
-    return True
-
-
 def array_equivalent(
     left,
     right,

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -278,15 +278,8 @@
 axis : int or str, optional
     Axis to target. Can be either the axis name ('index', 'columns')
     or number (0, 1).""",
-    "replace_iloc": """
-    This differs from updating with ``.loc`` or ``.iloc``, which require
-    you to specify a location to update with some value.""",
 }
 
-_numeric_only_doc = """numeric_only : bool, default False
-    Include only float, int, boolean data.
-"""
-
 _merge_doc = """
 Merge DataFrame or named Series objects with a database-style join.
 
@@ -5736,7 +5729,7 @@ def set_index(
 
                 # error: Argument 1 to "append" of "list" has incompatible type
                 #  "Union[Index, Series]"; expected "Index"
-                arrays.append(col)  # type:ignore[arg-type]
+                arrays.append(col)  # type: ignore[arg-type]
                 names.append(col.name)
             elif isinstance(col, (list, np.ndarray)):
                 # error: Argument 1 to "append" of "list" has incompatible type
@@ -7791,10 +7784,7 @@ def _flex_arith_method(
             #  through the DataFrame path
             raise NotImplementedError(f"fill_value {fill_value} not supported.")
 
-        other = ops.maybe_prepare_scalar_for_op(
-            other,
-            self.shape,
-        )
+        other = ops.maybe_prepare_scalar_for_op(other, self.shape)
         self, other = self._align_for_op(other, axis, flex=True, level=level)
 
         with np.errstate(all="ignore"):

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -212,16 +212,12 @@
     "axes": "keywords for axes",
     "klass": "Series/DataFrame",
     "axes_single_arg": "{0 or 'index'} for Series, {0 or 'index', 1 or 'columns'} for DataFrame",  # noqa:E501
-    "args_transpose": "axes to permute (int or label for object)",
     "inplace": """
     inplace : bool, default False
         If True, performs operation inplace and returns None.""",
     "optional_by": """
         by : str or list of str
             Name or list of names to sort by""",
-    "replace_iloc": """
-    This differs from updating with ``.loc`` or ``.iloc``, which require
-    you to specify a location to update with some value.""",
 }
 
 
@@ -264,22 +260,11 @@ class NDFrame(PandasObject, indexing.IndexingMixin):
     # ----------------------------------------------------------------------
     # Constructors
 
-    def __init__(
-        self,
-        data: Manager,
-        copy: bool_t = False,
-        attrs: Mapping[Hashable, Any] | None = None,
-    ) -> None:
-        # copy kwarg is retained for mypy compat, is not used
-
+    def __init__(self, data: Manager) -> None:
         object.__setattr__(self, "_is_copy", None)
         object.__setattr__(self, "_mgr", data)
         object.__setattr__(self, "_item_cache", {})
-        if attrs is None:
-            attrs = {}
-        else:
-            attrs = dict(attrs)
-        object.__setattr__(self, "_attrs", attrs)
+        object.__setattr__(self, "_attrs", {})
         object.__setattr__(self, "_flags", Flags(self, allows_duplicate_labels=True))
 
     @final
@@ -313,6 +298,7 @@ def _init_mgr(
                 mgr = mgr.astype(dtype=dtype)
         return mgr
 
+    @final
     def _as_manager(self, typ: str, copy: bool_t = True) -> Self:
         """
         Private helper function to create a DataFrame with specific manager.
@@ -7314,7 +7300,6 @@ def replace(
         _shared_docs["replace"],
         klass=_shared_doc_kwargs["klass"],
         inplace=_shared_doc_kwargs["inplace"],
-        replace_iloc=_shared_doc_kwargs["replace_iloc"],
     )
     def replace(
         self,

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -431,7 +431,7 @@ def _outer_indexer(
 
     @cache_readonly
     def _can_hold_strings(self) -> bool:
-        return not is_numeric_dtype(self)
+        return not is_numeric_dtype(self.dtype)
 
     _engine_types: dict[np.dtype | ExtensionDtype, type[libindex.IndexEngine]] = {
         np.dtype(np.int8): libindex.Int8Engine,
@@ -3307,6 +3307,8 @@ def _wrap_setop_result(self, other: Index, result) -> Index:
 
     @final
     def intersection(self, other, sort: bool = False):
+        # default sort keyword is different here from other setops intentionally
+        #  done in GH#25063
         """
         Form the intersection of two Index objects.
 

diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
@@ -29,7 +29,6 @@
     contains,
 )
 from pandas.core.construction import extract_array
-import pandas.core.indexes.base as ibase
 from pandas.core.indexes.base import (
     Index,
     maybe_extract_name,
@@ -47,8 +46,6 @@
         DtypeObj,
         npt,
     )
-_index_doc_kwargs: dict[str, str] = dict(ibase._index_doc_kwargs)
-_index_doc_kwargs.update({"target_klass": "CategoricalIndex"})
 
 
 @inherit_names(

diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py
@@ -64,7 +64,10 @@
     )
 
     from pandas import Index
-    from pandas.core.internals.blocks import Block
+    from pandas.core.internals.blocks import (
+        Block,
+        BlockPlacement,
+    )
 
 
 def _concatenate_array_managers(
@@ -317,7 +320,9 @@ def _maybe_reindex_columns_na_proxy(
     return new_mgrs_indexers
 
 
-def _get_mgr_concatenation_plan(mgr: BlockManager):
+def _get_mgr_concatenation_plan(
+    mgr: BlockManager,
+) -> list[tuple[BlockPlacement, JoinUnit]]:
     """
     Construct concatenation plan for given block manager.
 

diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
@@ -44,7 +44,6 @@
     needs_i8_conversion,
     pandas_dtype,
 )
-from pandas.core.dtypes.dtypes import PeriodDtype
 from pandas.core.dtypes.missing import (
     isna,
     na_value_for_dtype,
@@ -669,7 +668,6 @@ def _mask_datetimelike_result(
     return result
 
 
-@disallow(PeriodDtype)
 @bottleneck_switch()
 @_datetimelike_compat
 def nanmean(
@@ -808,38 +806,34 @@ def get_median(x, _mask=None):
             # empty set so return nans of shape "everything but the passed axis"
             # since "axis" is where the reduction would occur if we had a nonempty
             # array
-            res = get_empty_reduction_result(values.shape, axis, np.float_, np.nan)
+            res = _get_empty_reduction_result(values.shape, axis)
 
     else:
         # otherwise return a scalar value
         res = get_median(values, mask) if notempty else np.nan
     return _wrap_results(res, dtype)
 
 
-def get_empty_reduction_result(
-    shape: tuple[int, ...],
+def _get_empty_reduction_result(
+    shape: Shape,
     axis: AxisInt,
-    dtype: np.dtype | type[np.floating],
-    fill_value: Any,
 ) -> np.ndarray:
     """
     The result from a reduction on an empty ndarray.
 
     Parameters
     ----------
-    shape : Tuple[int]
+    shape : Tuple[int, ...]
     axis : int
-    dtype : np.dtype
-    fill_value : Any
 
     Returns
     -------
     np.ndarray
     """
     shp = np.array(shape)
     dims = np.arange(len(shape))
-    ret = np.empty(shp[dims != axis], dtype=dtype)
-    ret.fill(fill_value)
+    ret = np.empty(shp[dims != axis], dtype=np.float64)
+    ret.fill(np.nan)
     return ret