-
-
Notifications
You must be signed in to change notification settings - Fork 19k
ENH: add downcast to pd.to_numeric #13425
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -291,6 +291,83 @@ def test_non_hashable(self): | |
with self.assertRaisesRegexp(TypeError, "Invalid object type"): | ||
pd.to_numeric(s) | ||
|
||
def test_downcast(self): | ||
# see gh-13352 | ||
mixed_data = ['1', 2, 3] | ||
int_data = [1, 2, 3] | ||
date_data = np.array(['1970-01-02', '1970-01-03', | ||
'1970-01-04'], dtype='datetime64[D]') | ||
|
||
invalid_downcast = 'unsigned-integer' | ||
msg = 'invalid downcasting method provided' | ||
|
||
|
||
smallest_int_dtype = np.dtype(np.typecodes['Integer'][0]) | ||
smallest_uint_dtype = np.dtype(np.typecodes['UnsignedInteger'][0]) | ||
|
||
# support below np.float32 is rare and far between | ||
float_32_char = np.dtype(np.float32).char | ||
smallest_float_dtype = float_32_char | ||
|
||
for data in (mixed_data, int_data, date_data): | ||
with self.assertRaisesRegexp(ValueError, msg): | ||
pd.to_numeric(data, downcast=invalid_downcast) | ||
|
||
expected = np.array([1, 2, 3], dtype=np.int64) | ||
|
||
res = pd.to_numeric(data) | ||
tm.assert_numpy_array_equal(res, expected) | ||
|
||
res = pd.to_numeric(data, downcast=None) | ||
tm.assert_numpy_array_equal(res, expected) | ||
|
||
expected = np.array([1, 2, 3], dtype=smallest_int_dtype) | ||
|
||
for signed_downcast in ('integer', 'signed'): | ||
res = pd.to_numeric(data, downcast=signed_downcast) | ||
tm.assert_numpy_array_equal(res, expected) | ||
|
||
expected = np.array([1, 2, 3], dtype=smallest_uint_dtype) | ||
res = pd.to_numeric(data, downcast='unsigned') | ||
tm.assert_numpy_array_equal(res, expected) | ||
|
||
expected = np.array([1, 2, 3], dtype=smallest_float_dtype) | ||
res = pd.to_numeric(data, downcast='float') | ||
tm.assert_numpy_array_equal(res, expected) | ||
|
||
# if we can't successfully cast the given | ||
# data to a numeric dtype, do not bother | ||
# with the downcast parameter | ||
data = ['foo', 2, 3] | ||
expected = np.array(data, dtype=object) | ||
res = pd.to_numeric(data, errors='ignore', | ||
downcast='unsigned') | ||
tm.assert_numpy_array_equal(res, expected) | ||
|
||
# cannot cast to an unsigned integer because | ||
# we have a negative number | ||
data = ['-1', 2, 3] | ||
expected = np.array([-1, 2, 3], dtype=np.int64) | ||
res = pd.to_numeric(data, downcast='unsigned') | ||
tm.assert_numpy_array_equal(res, expected) | ||
|
||
# cannot cast to an integer (signed or unsigned) | ||
# because we have a float number | ||
data = ['1.1', 2, 3] | ||
expected = np.array([1.1, 2, 3], dtype=np.float64) | ||
|
||
for downcast in ('integer', 'signed', 'unsigned'): | ||
res = pd.to_numeric(data, downcast=downcast) | ||
tm.assert_numpy_array_equal(res, expected) | ||
|
||
# the smallest integer dtype need not be np.(u)int8 | ||
data = ['256', 257, 258] | ||
|
||
for downcast, expected_dtype in zip( | ||
['integer', 'signed', 'unsigned'], | ||
[np.int16, np.int16, np.uint16]): | ||
expected = np.array([256, 257, 258], dtype=expected_dtype) | ||
res = pd.to_numeric(data, downcast=downcast) | ||
tm.assert_numpy_array_equal(res, expected) | ||
|
||
if __name__ == '__main__': | ||
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -50,7 +50,7 @@ def compose(*funcs): | |
return reduce(_compose2, funcs) | ||
|
||
|
||
def to_numeric(arg, errors='raise'): | ||
def to_numeric(arg, errors='raise', downcast=None): | ||
""" | ||
Convert argument to a numeric type. | ||
|
||
|
@@ -61,6 +61,27 @@ def to_numeric(arg, errors='raise'): | |
- If 'raise', then invalid parsing will raise an exception | ||
- If 'coerce', then invalid parsing will be set as NaN | ||
- If 'ignore', then invalid parsing will return the input | ||
downcast : {'integer', 'signed', 'unsigned', 'float'} , default None | ||
|
||
If not None, and if the data has been successfully cast to a | ||
numerical dtype (or if the data was numeric to begin with), | ||
downcast that resulting data to the smallest numerical dtype | ||
possible according to the following rules: | ||
|
||
- 'integer' or 'signed': smallest signed int dtype (min.: np.int8) | ||
- 'unsigned': smallest unsigned int dtype (min.: np.uint8) | ||
- 'float': smallest float dtype (min.: np.float32) | ||
|
||
As this behaviour is separate from the core conversion to | ||
numeric values, any errors raised during the downcasting | ||
will be surfaced regardless of the value of the 'errors' input. | ||
|
||
In addition, downcasting will only occur if the size | ||
of the resulting data's dtype is strictly larger than | ||
the dtype it is to be cast to, so if none of the dtypes | ||
checked satisfy that specification, no downcasting will be | ||
performed on the data. | ||
|
||
.. versionadded:: 0.19.0 | ||
|
||
Returns | ||
------- | ||
|
@@ -74,10 +95,37 @@ def to_numeric(arg, errors='raise'): | |
>>> import pandas as pd | ||
>>> s = pd.Series(['1.0', '2', -3]) | ||
>>> pd.to_numeric(s) | ||
0 1.0 | ||
1 2.0 | ||
2 -3.0 | ||
dtype: float64 | ||
>>> pd.to_numeric(s, downcast='float') | ||
0 1.0 | ||
1 2.0 | ||
2 -3.0 | ||
dtype: float32 | ||
>>> pd.to_numeric(s, downcast='signed') | ||
0 1 | ||
1 2 | ||
2 -3 | ||
dtype: int8 | ||
>>> s = pd.Series(['apple', '1.0', '2', -3]) | ||
>>> pd.to_numeric(s, errors='ignore') | ||
0 apple | ||
1 1.0 | ||
2 2 | ||
3 -3 | ||
dtype: object | ||
>>> pd.to_numeric(s, errors='coerce') | ||
0 NaN | ||
1 1.0 | ||
2 2.0 | ||
3 -3.0 | ||
dtype: float64 | ||
""" | ||
if downcast not in (None, 'integer', 'signed', 'unsigned', 'float'): | ||
raise ValueError('invalid downcasting method provided') | ||
|
||
is_series = False | ||
is_index = False | ||
is_scalar = False | ||
|
@@ -102,20 +150,51 @@ def to_numeric(arg, errors='raise'): | |
else: | ||
values = arg | ||
|
||
if com.is_numeric_dtype(values): | ||
pass | ||
elif com.is_datetime_or_timedelta_dtype(values): | ||
values = values.astype(np.int64) | ||
else: | ||
values = com._ensure_object(values) | ||
coerce_numeric = False if errors in ('ignore', 'raise') else True | ||
try: | ||
if com.is_numeric_dtype(values): | ||
|
||
pass | ||
elif com.is_datetime_or_timedelta_dtype(values): | ||
values = values.astype(np.int64) | ||
else: | ||
values = com._ensure_object(values) | ||
coerce_numeric = False if errors in ('ignore', 'raise') else True | ||
|
||
try: | ||
values = lib.maybe_convert_numeric(values, set(), | ||
coerce_numeric=coerce_numeric) | ||
except: | ||
if errors == 'raise': | ||
raise | ||
|
||
|
||
except Exception: | ||
if errors == 'raise': | ||
raise | ||
|
||
# attempt downcast only if the data has been successfully converted | ||
# to a numerical dtype and if a downcast method has been specified | ||
if downcast is not None and com.is_numeric_dtype(values): | ||
typecodes = None | ||
|
||
if downcast in ('integer', 'signed'): | ||
typecodes = np.typecodes['Integer'] | ||
elif downcast == 'unsigned' and np.min(values) > 0: | ||
typecodes = np.typecodes['UnsignedInteger'] | ||
elif downcast == 'float': | ||
typecodes = np.typecodes['Float'] | ||
|
||
# pandas support goes only to np.float32, | ||
# as float dtypes smaller than that are | ||
# extremely rare and not well supported | ||
float_32_char = np.dtype(np.float32).char | ||
float_32_ind = typecodes.index(float_32_char) | ||
typecodes = typecodes[float_32_ind:] | ||
|
||
if typecodes is not None: | ||
# from smallest to largest | ||
for dtype in typecodes: | ||
if np.dtype(dtype).itemsize < values.dtype.itemsize: | ||
values = com._possibly_downcast_to_dtype( | ||
values, dtype) | ||
|
||
# successful conversion | ||
if values.dtype == dtype: | ||
break | ||
|
||
if is_series: | ||
return pd.Series(values, index=arg.index, name=arg.name) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just use lists here; there is no need to use a NumPy array.
Also state that these operate on 1-dimensional inputs (or scalars).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fair enough. Done.