Skip to content

Commit

Permalink
Backport PR #35519: REF: StringArray._from_sequence, use less memory (#…
Browse files Browse the repository at this point in the history
…35770)

Co-authored-by: Terji Petersen <contribute@tensortable.com>
  • Loading branch information
meeseeksmachine and topper-123 committed Aug 17, 2020
1 parent ac8845b commit 66d08dc
Show file tree
Hide file tree
Showing 6 changed files with 73 additions and 53 deletions.
15 changes: 15 additions & 0 deletions asv_bench/benchmarks/strings.py
Expand Up @@ -7,6 +7,21 @@
from .pandas_vb_common import tm


class Construction:
    """Benchmarks for constructing a Series from an array of strings.

    Each benchmark is parametrised over ``dtype``: the plain ``"str"``
    dtype and the ``"string"`` extension dtype.
    """

    params = ["str", "string"]
    param_names = ["dtype"]

    def setup(self, dtype):
        """Create the input data used by the benchmarks below."""
        # Presumably yields 10 random strings of 10 ** 5 characters each;
        # confirm against tm.rands_array.
        chars_per_string = 10 ** 5
        n_strings = 10
        self.data = tm.rands_array(nchars=chars_per_string, size=n_strings)

    def time_construction(self, dtype):
        """Build a Series with the requested dtype (timing benchmark)."""
        Series(self.data, dtype=dtype)

    def peakmem_construction(self, dtype):
        """Build a Series with the requested dtype (peak-memory benchmark)."""
        Series(self.data, dtype=dtype)


class Methods:
def setup(self):
self.s = Series(tm.makeStringIndex(10 ** 5))
Expand Down
5 changes: 5 additions & 0 deletions doc/source/whatsnew/v1.1.1.rst
Expand Up @@ -75,6 +75,11 @@ Categorical
- Bug in :class:`DataFrame` constructor failing to raise ``ValueError`` in some cases when data and index have mismatched lengths (:issue:`33437`)
-

**Strings**

- Fixed a memory usage issue when instantiating a large :class:`pandas.arrays.StringArray` (:issue:`35499`)


.. ---------------------------------------------------------------------------
.. _whatsnew_111.contributors:
Expand Down
51 changes: 34 additions & 17 deletions pandas/_libs/lib.pyx
Expand Up @@ -618,35 +618,52 @@ def astype_intsafe(ndarray[object] arr, new_dtype):

@cython.wraparound(False)
@cython.boundscheck(False)
def astype_str(arr: ndarray, skipna: bool=False) -> ndarray[object]:
"""
Convert all elements in an array to string.
cpdef ndarray[object] ensure_string_array(
arr,
object na_value=np.nan,
bint convert_na_value=True,
bint copy=True,
bint skipna=True,
):
"""Returns a new numpy array with object dtype and only strings and na values.
Parameters
----------
arr : ndarray
The array whose elements we are casting.
skipna : bool, default False
arr : array-like
The values to be converted to str, if needed.
na_value : Any
The value to use for na. For example, np.nan or pd.NA.
convert_na_value : bool, default True
If False, existing na values will be used unchanged in the new array.
copy : bool, default True
Whether to ensure that a new array is returned.
skipna : bool, default True
Whether or not to coerce nulls to their stringified form
(e.g. NaN becomes 'nan').
(e.g. if False, NaN becomes 'nan').
Returns
-------
ndarray
A new array with the input array's elements casted.
An array with the input array's elements cast to str or nan-like.
"""
cdef:
object arr_i
Py_ssize_t i, n = arr.size
ndarray[object] result = np.empty(n, dtype=object)

for i in range(n):
arr_i = arr[i]
Py_ssize_t i = 0, n = len(arr)

if not (skipna and checknull(arr_i)):
arr_i = str(arr_i)
result = np.asarray(arr, dtype="object")
if copy and result is arr:
result = result.copy()

result[i] = arr_i
for i in range(n):
val = result[i]
if not checknull(val):
result[i] = str(val)
else:
if convert_na_value:
val = na_value
if skipna:
result[i] = val
else:
result[i] = str(val)

return result

Expand Down
25 changes: 6 additions & 19 deletions pandas/core/arrays/string_.py
Expand Up @@ -178,11 +178,10 @@ class StringArray(PandasArray):

def __init__(self, values, copy=False):
values = extract_array(values)
skip_validation = isinstance(values, type(self))

super().__init__(values, copy=copy)
self._dtype = StringDtype()
if not skip_validation:
if not isinstance(values, type(self)):
self._validate()

def _validate(self):
Expand All @@ -201,23 +200,11 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
assert dtype == "string"

result = np.asarray(scalars, dtype="object")
if copy and result is scalars:
result = result.copy()

# Standardize all missing-like values to NA
# TODO: it would be nice to do this in _validate / lib.is_string_array
# We are already doing a scan over the values there.
na_values = isna(result)
has_nans = na_values.any()
if has_nans and result is scalars:
# force a copy now, if we haven't already
result = result.copy()

# convert to str, then to object to avoid dtype like '<U3', then insert na_value
result = np.asarray(result, dtype=str)
result = np.asarray(result, dtype="object")
if has_nans:
result[na_values] = StringDtype.na_value

# convert non-na-likes to str, and nan-likes to StringDtype.na_value
result = lib.ensure_string_array(
result, na_value=StringDtype.na_value, copy=copy
)

return cls(result)

Expand Down
16 changes: 4 additions & 12 deletions pandas/core/dtypes/cast.py
Expand Up @@ -916,7 +916,7 @@ def astype_nansafe(arr, dtype, copy: bool = True, skipna: bool = False):
dtype = pandas_dtype(dtype)

if issubclass(dtype.type, str):
return lib.astype_str(arr.ravel(), skipna=skipna).reshape(arr.shape)
return lib.ensure_string_array(arr.ravel(), skipna=skipna).reshape(arr.shape)

elif is_datetime64_dtype(arr):
if is_object_dtype(dtype):
Expand Down Expand Up @@ -1608,19 +1608,11 @@ def construct_1d_ndarray_preserving_na(
>>> construct_1d_ndarray_preserving_na([1.0, 2.0, None], dtype=np.dtype('str'))
array(['1.0', '2.0', None], dtype=object)
"""
subarr = np.array(values, dtype=dtype, copy=copy)

if dtype is not None and dtype.kind == "U":
# GH-21083
# We can't just return np.array(subarr, dtype='str') since
# NumPy will convert the non-string objects into strings
# Including NA values. So we have to go
# string -> object -> update NA, which requires an
# additional pass over the data.
na_values = isna(values)
subarr2 = subarr.astype(object)
subarr2[na_values] = np.asarray(values, dtype=object)[na_values]
subarr = subarr2
subarr = lib.ensure_string_array(values, convert_na_value=False, copy=copy)
else:
subarr = np.array(values, dtype=dtype, copy=copy)

return subarr

Expand Down
14 changes: 9 additions & 5 deletions pandas/tests/arrays/string_/test_string.py
Expand Up @@ -206,12 +206,16 @@ def test_constructor_raises():

@pytest.mark.parametrize("copy", [True, False])
def test_from_sequence_no_mutate(copy):
a = np.array(["a", np.nan], dtype=object)
original = a.copy()
result = pd.arrays.StringArray._from_sequence(a, copy=copy)
expected = pd.arrays.StringArray(np.array(["a", pd.NA], dtype=object))
nan_arr = np.array(["a", np.nan], dtype=object)
na_arr = np.array(["a", pd.NA], dtype=object)

result = pd.arrays.StringArray._from_sequence(nan_arr, copy=copy)
expected = pd.arrays.StringArray(na_arr)

tm.assert_extension_array_equal(result, expected)
tm.assert_numpy_array_equal(a, original)

expected = nan_arr if copy else na_arr
tm.assert_numpy_array_equal(nan_arr, expected)


def test_astype_int():
Expand Down

0 comments on commit 66d08dc

Please sign in to comment.