diff --git a/asv_bench/benchmarks/ctors.py b/asv_bench/benchmarks/ctors.py index 6276dc324ca0d..65af7b077d80f 100644 --- a/asv_bench/benchmarks/ctors.py +++ b/asv_bench/benchmarks/ctors.py @@ -1,10 +1,36 @@ import numpy as np -from pandas import DataFrame, Series, Index, DatetimeIndex, Timestamp +from pandas import Series, Index, DatetimeIndex, Timestamp from .pandas_vb_common import setup # noqa -class Constructors(object): +class SeriesConstructors(object): + + goal_time = 0.2 + + param_names = ["data_fmt", "with_index"] + params = [[lambda x: x, + list, + lambda arr: list(arr.astype(str)), + lambda arr: dict(zip(range(len(arr)), arr)), + lambda arr: [(i, -i) for i in arr], + lambda arr: [[i, -i] for i in arr], + lambda arr: ([(i, -i) for i in arr][:-1] + [None]), + lambda arr: ([[i, -i] for i in arr][:-1] + [None])], + [False, True]] + + def setup(self, data_fmt, with_index): + N = 10**4 + np.random.seed(1234) + arr = np.random.randn(N) + self.data = data_fmt(arr) + self.index = np.arange(N) if with_index else None + + def time_series_constructor(self, data_fmt, with_index): + Series(self.data, index=self.index) + + +class SeriesDtypesConstructors(object): goal_time = 0.2 @@ -19,12 +45,6 @@ def setup(self): self.s = Series([Timestamp('20110101'), Timestamp('20120101'), Timestamp('20130101')] * N * 10) - def time_frame_from_ndarray(self): - DataFrame(self.arr) - - def time_series_from_ndarray(self): - Series(self.data, index=self.index) - def time_index_from_array_string(self): Index(self.arr_str) diff --git a/asv_bench/benchmarks/frame_ctor.py b/asv_bench/benchmarks/frame_ctor.py index 6761d48d25919..391a209cb2a89 100644 --- a/asv_bench/benchmarks/frame_ctor.py +++ b/asv_bench/benchmarks/frame_ctor.py @@ -81,3 +81,15 @@ def setup(self, nrows): def time_frame_from_records_generator(self, nrows): # issue-6700 self.df = DataFrame.from_records(self.gen, nrows=nrows) + + +class FromNDArray(object): + + goal_time = 0.2 + + def setup(self): + N = 100000 + self.data = np.random.randn(N) + + def time_frame_from_ndarray(self): + self.df = DataFrame(self.data) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index a39f83d5261c0..5a62203f79642 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -148,23 +148,6 @@ def item_from_zerodim(object val): return util.unbox_if_zerodim(val) -@cython.wraparound(False) -@cython.boundscheck(False) -cpdef ndarray[object] list_to_object_array(list obj): - """ - Convert list to object ndarray. Seriously can\'t believe - I had to write this function. - """ - cdef: - Py_ssize_t i, n = len(obj) - ndarray[object] arr = np.empty(n, dtype=object) - - for i in range(n): - arr[i] = obj[i] - - return arr - - @cython.wraparound(False) @cython.boundscheck(False) def fast_unique(ndarray[object] values): diff --git a/pandas/_libs/src/inference.pyx b/pandas/_libs/src/inference.pyx index e15b4693432d9..8bfed4fe60fed 100644 --- a/pandas/_libs/src/inference.pyx +++ b/pandas/_libs/src/inference.pyx @@ -349,7 +349,9 @@ def infer_dtype(object value, bint skipna=False): else: if not isinstance(value, list): value = list(value) - values = list_to_object_array(value) + from pandas.core.dtypes.cast import ( + construct_1d_object_array_from_listlike) + values = construct_1d_object_array_from_listlike(value) values = getattr(values, 'values', values) val = _try_infer_map(values) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 0ceb8966fd3c8..167f215b6c0ac 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -6,7 +6,8 @@ from warnings import warn, catch_warnings import numpy as np -from pandas.core.dtypes.cast import maybe_promote +from pandas.core.dtypes.cast import ( + maybe_promote, construct_1d_object_array_from_listlike) from pandas.core.dtypes.generic import ( ABCSeries, ABCIndex, ABCIndexClass, ABCCategorical) @@ -171,7 +172,7 @@ def _ensure_arraylike(values): if inferred in ['mixed', 'string', 'unicode']: if isinstance(values, tuple): values = list(values) - values = lib.list_to_object_array(values) + values = construct_1d_object_array_from_listlike(values) else: values = np.asarray(values) return values @@ -401,7 +402,7 @@ def isin(comps, values): .format(values_type=type(values).__name__)) if not isinstance(values, (ABCIndex, ABCSeries, np.ndarray)): - values = lib.list_to_object_array(list(values)) + values = construct_1d_object_array_from_listlike(list(values)) comps, dtype, _ = _ensure_data(comps) values, _, _ = _ensure_data(values, dtype=dtype) diff --git a/pandas/core/common.py b/pandas/core/common.py index 76a69030463ec..35696be5b2a03 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -21,6 +21,7 @@ from pandas.core.dtypes.missing import isna, isnull, notnull # noqa from pandas.api import types from pandas.core.dtypes import common +from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike # compat from pandas.errors import ( # noqa @@ -381,7 +382,7 @@ def _asarray_tuplesafe(values, dtype=None): return values.values if isinstance(values, list) and dtype in [np.object_, object]: - return lib.list_to_object_array(values) + return construct_1d_object_array_from_listlike(values) result = np.asarray(values, dtype=dtype) @@ -389,17 +390,10 @@ def _asarray_tuplesafe(values, dtype=None): result = np.asarray(values, dtype=object) if result.ndim == 2: - if isinstance(values, list): - return lib.list_to_object_array(values) - else: - # Making a 1D array that safely contains tuples is a bit tricky - # in numpy, leading to the following - try: - result = np.empty(len(values), dtype=object) - result[:] = values - except ValueError: - # we have a list-of-list - result[:] = [tuple(x) for x in values] + # Avoid building an array of arrays: + # TODO: verify whether any path hits this except #18819 (invalid) + values = [tuple(x) for x in values] + result = construct_1d_object_array_from_listlike(values) return result diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index a97b84ab9cc5b..87c6fb69f33bf 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -42,7 +42,7 @@ def maybe_convert_platform(values): """ try to do platform conversion, allow ndarray or list here """ if isinstance(values, (list, tuple)): - values = lib.list_to_object_array(list(values)) + values = construct_1d_object_array_from_listlike(list(values)) if getattr(values, 'dtype', None) == np.object_: if hasattr(values, '_values'): values = values._values @@ -1162,3 +1162,28 @@ def construct_1d_arraylike_from_scalar(value, length, dtype): subarr.fill(value) return subarr + + +def construct_1d_object_array_from_listlike(values): + """ + Transform any list-like object in a 1-dimensional numpy array of object + dtype. + + Parameters + ---------- + values : any iterable which has a len() + + Raises + ------ + TypeError + * If `values` does not have a len() + + Returns + ------- + 1-dimensional numpy array of dtype object + """ + # numpy will try to interpret nested lists as further dimensions, hence + # making a 1D array that contains list-likes is a bit tricky: + result = np.empty(len(values), dtype='object') + result[:] = values + return result diff --git a/pandas/core/ops.py b/pandas/core/ops.py index ca6d888625873..e23609b23f529 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -33,7 +33,9 @@ is_list_like, is_offsetlike, is_scalar, _ensure_object) -from pandas.core.dtypes.cast import maybe_upcast_putmask, find_common_type +from pandas.core.dtypes.cast import ( + maybe_upcast_putmask, find_common_type, + construct_1d_object_array_from_listlike) from pandas.core.dtypes.generic import ( ABCSeries, ABCDataFrame, @@ -740,7 +742,7 @@ def wrapper(left, right, name=name, na_op=na_op): def _comp_method_OBJECT_ARRAY(op, x, y): if isinstance(y, list): - y = lib.list_to_object_array(y) + y = construct_1d_object_array_from_listlike(y) if isinstance(y, (np.ndarray, ABCSeries, ABCIndex)): if not is_object_dtype(y.dtype): y = y.astype(np.object_) @@ -891,7 +893,7 @@ def na_op(x, y): result = op(x, y) except TypeError: if isinstance(y, list): - y = lib.list_to_object_array(y) + y = construct_1d_object_array_from_listlike(y) if isinstance(y, (np.ndarray, ABCSeries)): if (is_bool_dtype(x.dtype) and is_bool_dtype(y.dtype)): diff --git a/pandas/tests/dtypes/test_cast.py b/pandas/tests/dtypes/test_cast.py index 82a35fa711e8c..d13d781f03117 100644 --- a/pandas/tests/dtypes/test_cast.py +++ b/pandas/tests/dtypes/test_cast.py @@ -21,7 +21,8 @@ infer_dtype_from_array, maybe_convert_string_to_object, maybe_convert_scalar, - find_common_type) + find_common_type, + construct_1d_object_array_from_listlike) from pandas.core.dtypes.dtypes import ( CategoricalDtype, DatetimeTZDtype, @@ -407,3 +408,17 @@ def test_period_dtype(self): np.dtype('datetime64[ns]'), np.object, np.int64]: assert find_common_type([dtype, dtype2]) == np.object assert find_common_type([dtype2, dtype]) == np.object + + @pytest.mark.parametrize('datum1', [1, 2., "3", (4, 5), [6, 7], None]) + @pytest.mark.parametrize('datum2', [8, 9., "10", (11, 12), [13, 14], None]) + def test_cast_1d_array(self, datum1, datum2): + data = [datum1, datum2] + result = construct_1d_object_array_from_listlike(data) + + # Direct comparison fails: https://github.com/numpy/numpy/issues/10218 + assert result.dtype == 'object' + assert list(result) == data + + @pytest.mark.parametrize('val', [1, 2., None]) + def test_cast_1d_array_invalid_scalar(self, val): + pytest.raises(TypeError, construct_1d_object_array_from_listlike, val) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 22ad2258e70bc..8be6c4875ae24 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -21,8 +21,8 @@ MultiIndex, Timedelta, Timestamp, date_range, Categorical) import pandas as pd -import pandas._libs.lib as lib import pandas.util.testing as tm +from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike from pandas.tests.frame.common import TestData @@ -1199,7 +1199,7 @@ def test_constructor_from_items(self): DataFrame.from_items(row_items, orient='index') # orient='index', but thar be tuples - arr = lib.list_to_object_array( + arr = construct_1d_object_array_from_listlike( [('bar', 'baz')] * len(self.mixed_frame)) self.mixed_frame['foo'] = arr row_items = [(idx, list(self.mixed_frame.xs(idx))) diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 510ca6ac83ec0..7d6937592002d 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -18,7 +18,7 @@ from pandas.errors import PerformanceWarning, UnsortedIndexError from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.indexes.base import InvalidIndexError -from pandas._libs import lib +from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike from pandas._libs.lib import Timestamp import pandas.util.testing as tm @@ -913,7 +913,7 @@ def test_from_product_invalid_input(self): def test_from_product_datetimeindex(self): dt_index = date_range('2000-01-01', periods=2) mi = pd.MultiIndex.from_product([[1, 2], dt_index]) - etalon = lib.list_to_object_array([(1, pd.Timestamp( + etalon = construct_1d_object_array_from_listlike([(1, pd.Timestamp( '2000-01-01')), (1, pd.Timestamp('2000-01-02')), (2, pd.Timestamp( '2000-01-01')), (2, pd.Timestamp('2000-01-02'))]) tm.assert_numpy_array_equal(mi.values, etalon) @@ -938,11 +938,11 @@ def test_values_boxed(self): (1, pd.Timestamp('2000-01-04')), (2, pd.Timestamp('2000-01-02')), (3, pd.Timestamp('2000-01-03'))] - mi = pd.MultiIndex.from_tuples(tuples) - tm.assert_numpy_array_equal(mi.values, - lib.list_to_object_array(tuples)) + result = pd.MultiIndex.from_tuples(tuples) + expected = construct_1d_object_array_from_listlike(tuples) + tm.assert_numpy_array_equal(result.values, expected) # Check that code branches for boxed values produce identical results - tm.assert_numpy_array_equal(mi.values[:4], mi[:4].values) + tm.assert_numpy_array_equal(result.values[:4], result[:4].values) def test_append(self): result = self.index[:3].append(self.index[3:])