Construct 1d array from listlike (#18769)
toobaz authored and jreback committed Dec 19, 2017
1 parent 856d9e5 commit 04db779
Showing 11 changed files with 108 additions and 54 deletions.
36 changes: 28 additions & 8 deletions asv_bench/benchmarks/ctors.py
@@ -1,10 +1,36 @@
import numpy as np
from pandas import DataFrame, Series, Index, DatetimeIndex, Timestamp
from pandas import Series, Index, DatetimeIndex, Timestamp

from .pandas_vb_common import setup # noqa


class Constructors(object):
class SeriesConstructors(object):

    goal_time = 0.2

    param_names = ["data_fmt", "with_index"]
    params = [[lambda x: x,
               list,
               lambda arr: list(arr.astype(str)),
               lambda arr: dict(zip(range(len(arr)), arr)),
               lambda arr: [(i, -i) for i in arr],
               lambda arr: [[i, -i] for i in arr],
               lambda arr: ([(i, -i) for i in arr][:-1] + [None]),
               lambda arr: ([[i, -i] for i in arr][:-1] + [None])],
              [False, True]]

    def setup(self, data_fmt, with_index):
        N = 10**4
        np.random.seed(1234)
        arr = np.random.randn(N)
        self.data = data_fmt(arr)
        self.index = np.arange(N) if with_index else None

    def time_series_constructor(self, data_fmt, with_index):
        Series(self.data, index=self.index)


class SeriesDtypesConstructors(object):

    goal_time = 0.2

@@ -19,12 +45,6 @@ def setup(self):
        self.s = Series([Timestamp('20110101'), Timestamp('20120101'),
                         Timestamp('20130101')] * N * 10)

    def time_frame_from_ndarray(self):
        DataFrame(self.arr)

    def time_series_from_ndarray(self):
        Series(self.data, index=self.index)

    def time_index_from_array_string(self):
        Index(self.arr_str)

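For context (commentary, not part of the diff): the data_fmt lambdas above feed Series() everything from a plain ndarray to dicts and lists of tuples ending in None. A minimal sketch of the kinds of inputs being timed, using illustrative names:

import numpy as np
from pandas import Series

arr = np.random.randn(4)
samples = [
    list(arr),                             # plain list of floats
    dict(zip(range(len(arr)), arr)),       # dict keyed by position
    [(i, -i) for i in arr],                # list of tuples -> object dtype
    [(i, -i) for i in arr][:-1] + [None],  # same, with a trailing None
]
for data in samples:
    print(type(data).__name__, Series(data).dtype)
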
12 changes: 12 additions & 0 deletions asv_bench/benchmarks/frame_ctor.py
@@ -81,3 +81,15 @@ def setup(self, nrows):
    def time_frame_from_records_generator(self, nrows):
        # issue-6700
        self.df = DataFrame.from_records(self.gen, nrows=nrows)


class FromNDArray(object):

    goal_time = 0.2

    def setup(self):
        N = 100000
        self.data = np.random.randn(N)

    def time_frame_from_ndarray(self):
        self.df = DataFrame(self.data)
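
As a rough sanity check (not from the commit), the new FromNDArray benchmark times the single-column frame that a 1-D ndarray produces:

import numpy as np
from pandas import DataFrame

data = np.random.randn(5)
print(DataFrame(data).shape)  # (5, 1): one column built from the 1-D array
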
17 changes: 0 additions & 17 deletions pandas/_libs/lib.pyx
@@ -148,23 +148,6 @@ def item_from_zerodim(object val):
    return util.unbox_if_zerodim(val)


@cython.wraparound(False)
@cython.boundscheck(False)
cpdef ndarray[object] list_to_object_array(list obj):
    """
    Convert list to object ndarray. Seriously can't believe
    I had to write this function.
    """
    cdef:
        Py_ssize_t i, n = len(obj)
        ndarray[object] arr = np.empty(n, dtype=object)

    for i in range(n):
        arr[i] = obj[i]

    return arr


@cython.wraparound(False)
@cython.boundscheck(False)
def fast_unique(ndarray[object] values):
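
A note on the replacement (commentary, not part of the diff): the deleted Cython helper only accepted a Python list (its signature is cpdef ndarray[object] list_to_object_array(list obj)), whereas construct_1d_object_array_from_listlike, added below in pandas/core/dtypes/cast.py, only needs len(). A small sketch, assuming pandas at this revision is importable:

import numpy as np
from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike

# tuples and ndarrays are fine now; only len() is required
print(construct_1d_object_array_from_listlike(('a', 'b')))    # object array of 'a', 'b'
print(construct_1d_object_array_from_listlike(np.arange(3)))  # object array of 0, 1, 2
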
4 changes: 3 additions & 1 deletion pandas/_libs/src/inference.pyx
@@ -349,7 +349,9 @@ def infer_dtype(object value, bint skipna=False):
    else:
        if not isinstance(value, list):
            value = list(value)
        values = list_to_object_array(value)
        from pandas.core.dtypes.cast import (
            construct_1d_object_array_from_listlike)
        values = construct_1d_object_array_from_listlike(value)

    values = getattr(values, 'values', values)
    val = _try_infer_map(values)
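
To illustrate this call site (a sketch, not from the commit): infer_dtype turns anything without a dtype into a list and now builds its scratch object array with the new helper, so the observable behaviour is unchanged:

import pandas as pd

print(pd.api.types.infer_dtype(('a', 'b')))       # 'string'
print(pd.api.types.infer_dtype([1, 2, 'three']))  # 'mixed-integer'
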
7 changes: 4 additions & 3 deletions pandas/core/algorithms.py
@@ -6,7 +6,8 @@
from warnings import warn, catch_warnings
import numpy as np

from pandas.core.dtypes.cast import maybe_promote
from pandas.core.dtypes.cast import (
    maybe_promote, construct_1d_object_array_from_listlike)
from pandas.core.dtypes.generic import (
    ABCSeries, ABCIndex,
    ABCIndexClass, ABCCategorical)
@@ -171,7 +172,7 @@ def _ensure_arraylike(values):
        if inferred in ['mixed', 'string', 'unicode']:
            if isinstance(values, tuple):
                values = list(values)
            values = lib.list_to_object_array(values)
            values = construct_1d_object_array_from_listlike(values)
        else:
            values = np.asarray(values)
    return values
@@ -401,7 +402,7 @@ def isin(comps, values):
                        .format(values_type=type(values).__name__))

    if not isinstance(values, (ABCIndex, ABCSeries, np.ndarray)):
        values = lib.list_to_object_array(list(values))
        values = construct_1d_object_array_from_listlike(list(values))

    comps, dtype, _ = _ensure_data(comps)
    values, _, _ = _ensure_data(values, dtype=dtype)
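
For reference (not part of the diff), the isin change means any list-like values argument that is not already an Index, Series, or ndarray, such as a set or a tuple, is first wrapped into a 1-D object array; behaviour stays the same:

import pandas as pd

s = pd.Series(['a', 'b', 'c'])
print(s.isin({'a', 'c'}).tolist())  # [True, False, True]
print(s.isin(('a', 'z')).tolist())  # [True, False, False]
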
18 changes: 6 additions & 12 deletions pandas/core/common.py
@@ -21,6 +21,7 @@
from pandas.core.dtypes.missing import isna, isnull, notnull # noqa
from pandas.api import types
from pandas.core.dtypes import common
from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike

# compat
from pandas.errors import ( # noqa
@@ -381,25 +382,18 @@ def _asarray_tuplesafe(values, dtype=None):
        return values.values

    if isinstance(values, list) and dtype in [np.object_, object]:
        return lib.list_to_object_array(values)
        return construct_1d_object_array_from_listlike(values)

    result = np.asarray(values, dtype=dtype)

    if issubclass(result.dtype.type, compat.string_types):
        result = np.asarray(values, dtype=object)

    if result.ndim == 2:
        if isinstance(values, list):
            return lib.list_to_object_array(values)
        else:
            # Making a 1D array that safely contains tuples is a bit tricky
            # in numpy, leading to the following
            try:
                result = np.empty(len(values), dtype=object)
                result[:] = values
            except ValueError:
                # we have a list-of-list
                result[:] = [tuple(x) for x in values]
        # Avoid building an array of arrays:
        # TODO: verify whether any path hits this except #18819 (invalid)
        values = [tuple(x) for x in values]
        result = construct_1d_object_array_from_listlike(values)

    return result

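The comment about avoiding an array of arrays is the crux of the patch. A small illustration (assuming this pandas revision is importable): np.asarray turns a list of equal-length tuples into a 2-D array, so _asarray_tuplesafe re-wraps the elements as tuples and hands them to the new helper to keep a 1-D object array:

import numpy as np
from pandas.core.common import _asarray_tuplesafe

values = [('a', 'b'), ('c', 'd')]
print(np.asarray(values).shape)          # (2, 2): the tuples became a second dimension
print(_asarray_tuplesafe(values).shape)  # (2,): one tuple per element, object dtype
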
27 changes: 26 additions & 1 deletion pandas/core/dtypes/cast.py
@@ -42,7 +42,7 @@ def maybe_convert_platform(values):
    """ try to do platform conversion, allow ndarray or list here """

    if isinstance(values, (list, tuple)):
        values = lib.list_to_object_array(list(values))
        values = construct_1d_object_array_from_listlike(list(values))
    if getattr(values, 'dtype', None) == np.object_:
        if hasattr(values, '_values'):
            values = values._values
@@ -1162,3 +1162,28 @@ def construct_1d_arraylike_from_scalar(value, length, dtype):
        subarr.fill(value)

    return subarr


def construct_1d_object_array_from_listlike(values):
    """
    Transform any list-like object into a 1-dimensional numpy array of object
    dtype.

    Parameters
    ----------
    values : any iterable which has a len()

    Raises
    ------
    TypeError
        * If `values` does not have a len()

    Returns
    -------
    1-dimensional numpy array of dtype object
    """
    # numpy will try to interpret nested lists as further dimensions, hence
    # making a 1D array that contains list-likes is a bit tricky:
    result = np.empty(len(values), dtype='object')
    result[:] = values
    return result
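
A quick demonstration of the helper's contract (a sketch mirroring the tests added further down, not part of the diff): it always returns a 1-D object array, keeps list-likes such as tuples as single elements, and raises TypeError for inputs without a len():

from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike

arr = construct_1d_object_array_from_listlike([(1, 2), (3, 4)])
print(arr.shape, arr.dtype)  # (2,) object
print(list(arr))             # [(1, 2), (3, 4)]

try:
    construct_1d_object_array_from_listlike(1)  # no len() -> TypeError
except TypeError as err:
    print('TypeError:', err)
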
8 changes: 5 additions & 3 deletions pandas/core/ops.py
@@ -33,7 +33,9 @@
    is_list_like, is_offsetlike,
    is_scalar,
    _ensure_object)
from pandas.core.dtypes.cast import maybe_upcast_putmask, find_common_type
from pandas.core.dtypes.cast import (
    maybe_upcast_putmask, find_common_type,
    construct_1d_object_array_from_listlike)
from pandas.core.dtypes.generic import (
    ABCSeries,
    ABCDataFrame,
@@ -740,7 +742,7 @@ def wrapper(left, right, name=name, na_op=na_op):

def _comp_method_OBJECT_ARRAY(op, x, y):
    if isinstance(y, list):
        y = lib.list_to_object_array(y)
        y = construct_1d_object_array_from_listlike(y)
    if isinstance(y, (np.ndarray, ABCSeries, ABCIndex)):
        if not is_object_dtype(y.dtype):
            y = y.astype(np.object_)
@@ -891,7 +893,7 @@ def na_op(x, y):
        result = op(x, y)
    except TypeError:
        if isinstance(y, list):
            y = lib.list_to_object_array(y)
            y = construct_1d_object_array_from_listlike(y)

        if isinstance(y, (np.ndarray, ABCSeries)):
            if (is_bool_dtype(x.dtype) and is_bool_dtype(y.dtype)):
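
As a usage note (not from the commit), the ops.py changes cover comparisons and boolean operations of a Series against a plain Python list; the list is wrapped into a 1-D object array before the element-wise operation runs:

import pandas as pd

s = pd.Series(['a', 'b', 'c'])
print((s == ['a', 'x', 'c']).tolist())  # [True, False, True]
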
17 changes: 16 additions & 1 deletion pandas/tests/dtypes/test_cast.py
@@ -21,7 +21,8 @@
    infer_dtype_from_array,
    maybe_convert_string_to_object,
    maybe_convert_scalar,
    find_common_type)
    find_common_type,
    construct_1d_object_array_from_listlike)
from pandas.core.dtypes.dtypes import (
    CategoricalDtype,
    DatetimeTZDtype,
@@ -407,3 +408,17 @@ def test_period_dtype(self):
                       np.dtype('datetime64[ns]'), np.object, np.int64]:
            assert find_common_type([dtype, dtype2]) == np.object
            assert find_common_type([dtype2, dtype]) == np.object

    @pytest.mark.parametrize('datum1', [1, 2., "3", (4, 5), [6, 7], None])
    @pytest.mark.parametrize('datum2', [8, 9., "10", (11, 12), [13, 14], None])
    def test_cast_1d_array(self, datum1, datum2):
        data = [datum1, datum2]
        result = construct_1d_object_array_from_listlike(data)

        # Direct comparison fails: https://github.com/numpy/numpy/issues/10218
        assert result.dtype == 'object'
        assert list(result) == data

    @pytest.mark.parametrize('val', [1, 2., None])
    def test_cast_1d_array_invalid_scalar(self, val):
        pytest.raises(TypeError, construct_1d_object_array_from_listlike, val)
4 changes: 2 additions & 2 deletions pandas/tests/frame/test_constructors.py
@@ -21,8 +21,8 @@
    MultiIndex, Timedelta, Timestamp,
    date_range, Categorical)
import pandas as pd
import pandas._libs.lib as lib
import pandas.util.testing as tm
from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike

from pandas.tests.frame.common import TestData

@@ -1199,7 +1199,7 @@ def test_constructor_from_items(self):
            DataFrame.from_items(row_items, orient='index')

        # orient='index', but thar be tuples
        arr = lib.list_to_object_array(
        arr = construct_1d_object_array_from_listlike(
            [('bar', 'baz')] * len(self.mixed_frame))
        self.mixed_frame['foo'] = arr
        row_items = [(idx, list(self.mixed_frame.xs(idx)))
12 changes: 6 additions & 6 deletions pandas/tests/indexes/test_multi.py
@@ -18,7 +18,7 @@
from pandas.errors import PerformanceWarning, UnsortedIndexError
from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas.core.indexes.base import InvalidIndexError
from pandas._libs import lib
from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
from pandas._libs.lib import Timestamp

import pandas.util.testing as tm
@@ -913,7 +913,7 @@ def test_from_product_invalid_input(self):
    def test_from_product_datetimeindex(self):
        dt_index = date_range('2000-01-01', periods=2)
        mi = pd.MultiIndex.from_product([[1, 2], dt_index])
        etalon = lib.list_to_object_array([(1, pd.Timestamp(
        etalon = construct_1d_object_array_from_listlike([(1, pd.Timestamp(
            '2000-01-01')), (1, pd.Timestamp('2000-01-02')), (2, pd.Timestamp(
            '2000-01-01')), (2, pd.Timestamp('2000-01-02'))])
        tm.assert_numpy_array_equal(mi.values, etalon)
@@ -938,11 +938,11 @@ def test_values_boxed(self):
                  (1, pd.Timestamp('2000-01-04')),
                  (2, pd.Timestamp('2000-01-02')),
                  (3, pd.Timestamp('2000-01-03'))]
        mi = pd.MultiIndex.from_tuples(tuples)
        tm.assert_numpy_array_equal(mi.values,
                                    lib.list_to_object_array(tuples))
        result = pd.MultiIndex.from_tuples(tuples)
        expected = construct_1d_object_array_from_listlike(tuples)
        tm.assert_numpy_array_equal(result.values, expected)
        # Check that code branches for boxed values produce identical results
        tm.assert_numpy_array_equal(mi.values[:4], mi[:4].values)
        tm.assert_numpy_array_equal(result.values[:4], result[:4].values)

    def test_append(self):
        result = self.index[:3].append(self.index[3:])
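
The updated MultiIndex tests capture the end-to-end invariant; a condensed sketch (not part of the diff) of what they assert:

import pandas as pd
import pandas.util.testing as tm
from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike

tuples = [(1, 'a'), (2, 'b')]
mi = pd.MultiIndex.from_tuples(tuples)

# .values is a 1-D object array of tuples, exactly what the helper builds
tm.assert_numpy_array_equal(mi.values,
                            construct_1d_object_array_from_listlike(tuples))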