Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Construct 1d array from listlike #18769

Merged
merged 7 commits into from
Dec 19, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 28 additions & 8 deletions asv_bench/benchmarks/ctors.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,36 @@
import numpy as np
from pandas import DataFrame, Series, Index, DatetimeIndex, Timestamp
from pandas import Series, Index, DatetimeIndex, Timestamp

from .pandas_vb_common import setup # noqa


class Constructors(object):
class SeriesConstructors(object):

goal_time = 0.2

param_names = ["data_fmt", "with_index"]
params = [[lambda x: x,
list,
lambda arr: list(arr.astype(str)),
lambda arr: dict(zip(range(len(arr)), arr)),
lambda arr: [(i, -i) for i in arr],
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

some args are duplicated here

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sorry, was reading the ( as [

lambda arr: [[i, -i] for i in arr],
lambda arr: ([(i, -i) for i in arr][:-1] + [None]),
lambda arr: ([[i, -i] for i in arr][:-1] + [None])],
[False, True]]

def setup(self, data_fmt, with_index):
N = 10**4
np.random.seed(1234)
arr = np.random.randn(N)
self.data = data_fmt(arr)
self.index = np.arange(N) if with_index else None

def time_series_constructor(self, data_fmt, with_index):
    """Time building a Series from ``self.data`` with ``self.index``."""
    # ``data_fmt`` and ``with_index`` are not read in this body.
    Series(self.data, index=self.index)


class SeriesDtypesConstructors(object):

goal_time = 0.2

Expand All @@ -19,12 +45,6 @@ def setup(self):
self.s = Series([Timestamp('20110101'), Timestamp('20120101'),
Timestamp('20130101')] * N * 10)

def time_frame_from_ndarray(self):
DataFrame(self.arr)

def time_series_from_ndarray(self):
Series(self.data, index=self.index)

def time_index_from_array_string(self):
    """Time constructing an Index from ``self.arr_str``."""
    # NOTE(review): ``arr_str`` is assigned outside the visible hunk --
    # presumably an array of strings; confirm in setup().
    Index(self.arr_str)

Expand Down
12 changes: 12 additions & 0 deletions asv_bench/benchmarks/frame_ctor.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,3 +81,15 @@ def setup(self, nrows):
def time_frame_from_records_generator(self, nrows):
    """Time ``DataFrame.from_records`` on ``self.gen``, passing ``nrows``."""
    # issue-6700
    # NOTE(review): ``self.gen`` is created outside the visible hunk --
    # presumably a generator of records built in setup(); confirm.
    self.df = DataFrame.from_records(self.gen, nrows=nrows)


class FromNDArray(object):

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

are there already list-like constructor asv's here? and from dict?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no idea (unrelated to this PR)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

and the answer is?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no idea, as above

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, since you are adding this one, it should be very easy to parametrize it, similar to the Series case, and add at least a few simple ones.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, some simple ones are present in frame_ctor.py, above the code I moved from ctor.py. I can open an issue to remind adding more.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

goal_time = 0.2

def setup(self):
N = 100000
self.data = np.random.randn(N)

def time_frame_from_ndarray(self):
    """Time constructing a DataFrame from ``self.data``."""
    self.df = DataFrame(self.data)
17 changes: 0 additions & 17 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -148,23 +148,6 @@ def item_from_zerodim(object val):
return util.unbox_if_zerodim(val)


@cython.wraparound(False)
@cython.boundscheck(False)
cpdef ndarray[object] list_to_object_array(list obj):
"""
Convert list to object ndarray. Seriously can\'t believe
I had to write this function.
"""
cdef:
Py_ssize_t i, n = len(obj)
ndarray[object] arr = np.empty(n, dtype=object)

for i in range(n):
arr[i] = obj[i]

return arr


@cython.wraparound(False)
@cython.boundscheck(False)
def fast_unique(ndarray[object] values):
Expand Down
4 changes: 3 additions & 1 deletion pandas/_libs/src/inference.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -349,7 +349,9 @@ def infer_dtype(object value, bint skipna=False):
else:
if not isinstance(value, list):
value = list(value)
values = list_to_object_array(value)
from pandas.core.dtypes.cast import (
construct_1d_object_array_from_listlike)
values = construct_1d_object_array_from_listlike(value)

values = getattr(values, 'values', values)
val = _try_infer_map(values)
Expand Down
7 changes: 4 additions & 3 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
from warnings import warn, catch_warnings
import numpy as np

from pandas.core.dtypes.cast import maybe_promote
from pandas.core.dtypes.cast import (
maybe_promote, construct_1d_object_array_from_listlike)
from pandas.core.dtypes.generic import (
ABCSeries, ABCIndex,
ABCIndexClass, ABCCategorical)
Expand Down Expand Up @@ -171,7 +172,7 @@ def _ensure_arraylike(values):
if inferred in ['mixed', 'string', 'unicode']:
if isinstance(values, tuple):
values = list(values)
values = lib.list_to_object_array(values)
values = construct_1d_object_array_from_listlike(values)
else:
values = np.asarray(values)
return values
Expand Down Expand Up @@ -401,7 +402,7 @@ def isin(comps, values):
.format(values_type=type(values).__name__))

if not isinstance(values, (ABCIndex, ABCSeries, np.ndarray)):
values = lib.list_to_object_array(list(values))
values = construct_1d_object_array_from_listlike(list(values))

comps, dtype, _ = _ensure_data(comps)
values, _, _ = _ensure_data(values, dtype=dtype)
Expand Down
18 changes: 6 additions & 12 deletions pandas/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from pandas.core.dtypes.missing import isna, isnull, notnull # noqa
from pandas.api import types
from pandas.core.dtypes import common
from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike

# compat
from pandas.errors import ( # noqa
Expand Down Expand Up @@ -381,25 +382,18 @@ def _asarray_tuplesafe(values, dtype=None):
return values.values

if isinstance(values, list) and dtype in [np.object_, object]:
return lib.list_to_object_array(values)
return construct_1d_object_array_from_listlike(values)

result = np.asarray(values, dtype=dtype)

if issubclass(result.dtype.type, compat.string_types):
result = np.asarray(values, dtype=object)

if result.ndim == 2:
if isinstance(values, list):
return lib.list_to_object_array(values)
else:
# Making a 1D array that safely contains tuples is a bit tricky
# in numpy, leading to the following
try:
result = np.empty(len(values), dtype=object)
result[:] = values
except ValueError:
# we have a list-of-list
result[:] = [tuple(x) for x in values]
# Avoid building an array of arrays:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is there an asv that hits this case?

Copy link
Member Author

@toobaz toobaz Dec 18, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't even think there is any valid code path that hits this case... which indeed should be suppressed in #18626

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If there is no valid code path, then let's remove it or make a new issue; a note inside a PR doesn't help.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add the issue number here with a TODO

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(done)

# TODO: verify whether any path hits this except #18819 (invalid)
values = [tuple(x) for x in values]
result = construct_1d_object_array_from_listlike(values)

return result

Expand Down
27 changes: 26 additions & 1 deletion pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def maybe_convert_platform(values):
""" try to do platform conversion, allow ndarray or list here """

if isinstance(values, (list, tuple)):
values = lib.list_to_object_array(list(values))
values = construct_1d_object_array_from_listlike(list(values))
if getattr(values, 'dtype', None) == np.object_:
if hasattr(values, '_values'):
values = values._values
Expand Down Expand Up @@ -1162,3 +1162,28 @@ def construct_1d_arraylike_from_scalar(value, length, dtype):
subarr.fill(value)

return subarr


def construct_1d_object_array_from_listlike(values):
"""
Transform any list-like object into a 1-dimensional numpy array of object
dtype.

Parameters
----------
values : any iterable which has a len()

Raises
------
TypeError
* If `values` does not have a len()

Returns
-------
1-dimensional numpy array of dtype object
"""
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

would not object to an
assert is_iterable(values) with a nice error message

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The thing is: I can't think of any code path which could be hitting it. Scalar input to a Series() is (considered valid and) recast to 1-d before calling this. Similarly, an operation such as Series([1,2]) + 3 transforms 3 before hitting this. So I don't know what the error message could actually say.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not what i am asking

this is a completely internal
routine
it should fail with invalid input

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it should fail with invalid input

Sure it does, TypeError: object of type 'int' has no len(). Which is pretty clear, considering the docstring, and precisely in light of the fact that this is an internal routine. That said, feel free to suggest an error message which is worth the cost of the additional assert.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add a raises section to the doc-string

# numpy will try to interpret nested lists as further dimensions, hence
# making a 1D array that contains list-likes is a bit tricky:
result = np.empty(len(values), dtype='object')
result[:] = values
return result
8 changes: 5 additions & 3 deletions pandas/core/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,9 @@
is_list_like,
is_scalar,
_ensure_object)
from pandas.core.dtypes.cast import maybe_upcast_putmask, find_common_type
from pandas.core.dtypes.cast import (
maybe_upcast_putmask, find_common_type,
construct_1d_object_array_from_listlike)
from pandas.core.dtypes.generic import (
ABCSeries,
ABCDataFrame,
Expand Down Expand Up @@ -750,7 +752,7 @@ def wrapper(left, right, name=name, na_op=na_op):

def _comp_method_OBJECT_ARRAY(op, x, y):
if isinstance(y, list):
y = lib.list_to_object_array(y)
y = construct_1d_object_array_from_listlike(y)
if isinstance(y, (np.ndarray, ABCSeries, ABCIndex)):
if not is_object_dtype(y.dtype):
y = y.astype(np.object_)
Expand Down Expand Up @@ -901,7 +903,7 @@ def na_op(x, y):
result = op(x, y)
except TypeError:
if isinstance(y, list):
y = lib.list_to_object_array(y)
y = construct_1d_object_array_from_listlike(y)

if isinstance(y, (np.ndarray, ABCSeries)):
if (is_bool_dtype(x.dtype) and is_bool_dtype(y.dtype)):
Expand Down
17 changes: 16 additions & 1 deletion pandas/tests/dtypes/test_cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@
infer_dtype_from_array,
maybe_convert_string_to_object,
maybe_convert_scalar,
find_common_type)
find_common_type,
construct_1d_object_array_from_listlike)
from pandas.core.dtypes.dtypes import (
CategoricalDtype,
DatetimeTZDtype,
Expand Down Expand Up @@ -407,3 +408,17 @@ def test_period_dtype(self):
np.dtype('datetime64[ns]'), np.object, np.int64]:
assert find_common_type([dtype, dtype2]) == np.object
assert find_common_type([dtype2, dtype]) == np.object

@pytest.mark.parametrize('datum1', [1, 2., "3", (4, 5), [6, 7], None])
@pytest.mark.parametrize('datum2', [8, 9., "10", (11, 12), [13, 14], None])
def test_cast_1d_array(self, datum1, datum2):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

are there fail cases? iow where this routine raises?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not once you remove support for non-object dtypes (which is what you just asked me to do).

(well, as long as data has a len() and is iterable...)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

right so can you add a test that fails for scalars then

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(done)

data = [datum1, datum2]
result = construct_1d_object_array_from_listlike(data)

# Direct comparison fails: https://github.com/numpy/numpy/issues/10218
assert result.dtype == 'object'
assert list(result) == data

@pytest.mark.parametrize('val', [1, 2., None])
def test_cast_1d_array_invalid_scalar(self, val):
    # Scalars have no len(), so the helper is expected to raise TypeError.
    with pytest.raises(TypeError):
        construct_1d_object_array_from_listlike(val)
4 changes: 2 additions & 2 deletions pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@
MultiIndex, Timedelta, Timestamp,
date_range, Categorical)
import pandas as pd
import pandas._libs.lib as lib
import pandas.util.testing as tm
from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike

from pandas.tests.frame.common import TestData

Expand Down Expand Up @@ -1199,7 +1199,7 @@ def test_constructor_from_items(self):
DataFrame.from_items(row_items, orient='index')

# orient='index', but thar be tuples
arr = lib.list_to_object_array(
arr = construct_1d_object_array_from_listlike(
[('bar', 'baz')] * len(self.mixed_frame))
self.mixed_frame['foo'] = arr
row_items = [(idx, list(self.mixed_frame.xs(idx)))
Expand Down
12 changes: 6 additions & 6 deletions pandas/tests/indexes/test_multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from pandas.errors import PerformanceWarning, UnsortedIndexError
from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas.core.indexes.base import InvalidIndexError
from pandas._libs import lib
from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
from pandas._libs.lib import Timestamp

import pandas.util.testing as tm
Expand Down Expand Up @@ -913,7 +913,7 @@ def test_from_product_invalid_input(self):
def test_from_product_datetimeindex(self):
dt_index = date_range('2000-01-01', periods=2)
mi = pd.MultiIndex.from_product([[1, 2], dt_index])
etalon = lib.list_to_object_array([(1, pd.Timestamp(
etalon = construct_1d_object_array_from_listlike([(1, pd.Timestamp(
'2000-01-01')), (1, pd.Timestamp('2000-01-02')), (2, pd.Timestamp(
'2000-01-01')), (2, pd.Timestamp('2000-01-02'))])
tm.assert_numpy_array_equal(mi.values, etalon)
Expand All @@ -938,11 +938,11 @@ def test_values_boxed(self):
(1, pd.Timestamp('2000-01-04')),
(2, pd.Timestamp('2000-01-02')),
(3, pd.Timestamp('2000-01-03'))]
mi = pd.MultiIndex.from_tuples(tuples)
tm.assert_numpy_array_equal(mi.values,
lib.list_to_object_array(tuples))
result = pd.MultiIndex.from_tuples(tuples)
expected = construct_1d_object_array_from_listlike(tuples)
tm.assert_numpy_array_equal(result.values, expected)
# Check that code branches for boxed values produce identical results
tm.assert_numpy_array_equal(mi.values[:4], mi[:4].values)
tm.assert_numpy_array_equal(result.values[:4], result[:4].values)

def test_append(self):
result = self.index[:3].append(self.index[3:])
Expand Down