From c069ab1755ca04efc5951d72026ec41d612883a0 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Mon, 24 Jul 2017 12:36:56 -0400 Subject: [PATCH] ENH: Add skipna parameter to infer_dtype --- doc/source/whatsnew/v0.21.0.txt | 2 + pandas/_libs/src/inference.pyx | 134 ++++++++++++++++++-------- pandas/tests/dtypes/test_inference.py | 33 ++++++- 3 files changed, 129 insertions(+), 40 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 096040bb85a10..22965f8d45d4b 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -24,6 +24,8 @@ New features `_ on most readers and writers (:issue:`13823`) - Added ``__fspath__`` method to :class:`~pandas.HDFStore`, :class:`~pandas.ExcelFile`, and :class:`~pandas.ExcelWriter` to work properly with the file system path protocol (:issue:`13823`) +- Added ``skipna`` parameter to :func:`~pandas.api.types.infer_dtype` to support + type inference in the presence of missing values (:issue:`17059`). .. _whatsnew_0210.enhancements.infer_objects: diff --git a/pandas/_libs/src/inference.pyx b/pandas/_libs/src/inference.pyx index 38e95fe6ee652..1b9f268f9a843 100644 --- a/pandas/_libs/src/inference.pyx +++ b/pandas/_libs/src/inference.pyx @@ -222,7 +222,7 @@ cdef _try_infer_map(v): return None -def infer_dtype(object value): +def infer_dtype(object value, bint skipna=False): """ Effeciently infer the type of a passed val, or list-like array of values. Return a string describing the type. @@ -230,6 +230,8 @@ def infer_dtype(object value): Parameters ---------- value : scalar, list, ndarray, or pandas type + skipna : bool, default False + Ignore NaN values when inferring the type. 
Returns ------- @@ -272,6 +274,9 @@ def infer_dtype(object value): >>> infer_dtype(['foo', 'bar']) 'string' + >>> infer_dtype(['a', np.nan, 'b'], skipna=True) + 'string' + >>> infer_dtype([b'foo', b'bar']) 'bytes' @@ -310,7 +315,6 @@ def infer_dtype(object value): >>> infer_dtype(pd.Series(list('aabc')).astype('category')) 'categorical' - """ cdef: Py_ssize_t i, n @@ -356,7 +360,7 @@ def infer_dtype(object value): values = values.ravel() # try to use a valid value - for i from 0 <= i < n: + for i in range(n): val = util.get_value_1d(values, i) # do not use is_nul_datetimelike to keep @@ -403,11 +407,11 @@ def infer_dtype(object value): return 'datetime' elif is_date(val): - if is_date_array(values): + if is_date_array(values, skipna=skipna): return 'date' elif is_time(val): - if is_time_array(values): + if is_time_array(values, skipna=skipna): return 'time' elif is_decimal(val): @@ -420,19 +424,19 @@ def infer_dtype(object value): return 'mixed-integer-float' elif util.is_bool_object(val): - if is_bool_array(values): + if is_bool_array(values, skipna=skipna): return 'boolean' elif PyString_Check(val): - if is_string_array(values): + if is_string_array(values, skipna=skipna): return 'string' elif PyUnicode_Check(val): - if is_unicode_array(values): + if is_unicode_array(values, skipna=skipna): return 'unicode' elif PyBytes_Check(val): - if is_bytes_array(values): + if is_bytes_array(values, skipna=skipna): return 'bytes' elif is_period(val): @@ -593,10 +597,11 @@ cdef inline bint is_timedelta(object o): return PyDelta_Check(o) or util.is_timedelta64_object(o) -cpdef bint is_bool_array(ndarray values): +cpdef bint is_bool_array(ndarray values, bint skipna=False): cdef: Py_ssize_t i, n = len(values) ndarray[object] objbuf + object val if issubclass(values.dtype.type, np.bool_): return True @@ -606,9 +611,16 @@ cpdef bint is_bool_array(ndarray values): if n == 0: return False - for i in range(n): - if not util.is_bool_object(objbuf[i]): - return False + if skipna: + 
for i in range(n): + val = objbuf[i] + if not util._checknull(val) and not util.is_bool_object(val): + return False + else: + for i in range(n): + val = objbuf[i] + if not util.is_bool_object(val): + return False return True else: return False @@ -639,6 +651,7 @@ cpdef bint is_integer_float_array(ndarray values): cdef: Py_ssize_t i, n = len(values) ndarray[object] objbuf + object value if issubclass(values.dtype.type, np.integer): return True @@ -649,9 +662,8 @@ cpdef bint is_integer_float_array(ndarray values): return False for i in range(n): - if not (util.is_integer_object(objbuf[i]) or - util.is_float_object(objbuf[i])): - + val = objbuf[i] + if not (util.is_integer_object(val) or util.is_float_object(val)): return False return True else: @@ -679,10 +691,11 @@ cpdef bint is_float_array(ndarray values): return False -cpdef bint is_string_array(ndarray values): +cpdef bint is_string_array(ndarray values, bint skipna=False): cdef: Py_ssize_t i, n = len(values) ndarray[object] objbuf + object val if ((PY2 and issubclass(values.dtype.type, np.string_)) or not PY2 and issubclass(values.dtype.type, np.unicode_)): @@ -693,18 +706,26 @@ cpdef bint is_string_array(ndarray values): if n == 0: return False - for i in range(n): - if not PyString_Check(objbuf[i]): - return False + if skipna: + for i in range(n): + val = objbuf[i] + if not util._checknull(val) and not PyString_Check(val): + return False + else: + for i in range(n): + val = objbuf[i] + if not PyString_Check(val): + return False return True else: return False -cpdef bint is_unicode_array(ndarray values): +cpdef bint is_unicode_array(ndarray values, bint skipna=False): cdef: Py_ssize_t i, n = len(values) ndarray[object] objbuf + object val if issubclass(values.dtype.type, np.unicode_): return True @@ -714,18 +735,26 @@ cpdef bint is_unicode_array(ndarray values): if n == 0: return False - for i in range(n): - if not PyUnicode_Check(objbuf[i]): - return False + if skipna: + for i in range(n): + val = objbuf[i] + 
if not util._checknull(val) and not PyUnicode_Check(val): + return False + else: + for i in range(n): + val = objbuf[i] + if not PyUnicode_Check(val): + return False return True else: return False -cpdef bint is_bytes_array(ndarray values): +cpdef bint is_bytes_array(ndarray values, bint skipna=False): cdef: Py_ssize_t i, n = len(values) ndarray[object] objbuf + object val if issubclass(values.dtype.type, np.bytes_): return True @@ -735,9 +764,16 @@ cpdef bint is_bytes_array(ndarray values): if n == 0: return False - for i in range(n): - if not PyBytes_Check(objbuf[i]): - return False + if skipna: + for i in range(n): + val = objbuf[i] + if not util._checknull(val) and not PyBytes_Check(val): + return False + else: + for i in range(n): + val = objbuf[i] + if not PyBytes_Check(val): + return False return True else: return False @@ -856,23 +892,45 @@ cpdef bint is_timedelta_or_timedelta64_array(ndarray values): return null_count != n -cpdef bint is_date_array(ndarray[object] values): - cdef Py_ssize_t i, n = len(values) +cpdef bint is_date_array(ndarray[object] values, bint skipna=False): + cdef: + Py_ssize_t i, n = len(values) + object val + if n == 0: return False - for i in range(n): - if not is_date(values[i]): - return False + + if skipna: + for i in range(n): + val = values[i] + if not util._checknull(val) and not is_date(val): + return False + else: + for i in range(n): + val = values[i] + if not is_date(val): + return False return True -cpdef bint is_time_array(ndarray[object] values): - cdef Py_ssize_t i, n = len(values) +cpdef bint is_time_array(ndarray[object] values, bint skipna=False): + cdef: + Py_ssize_t i, n = len(values) + object val + if n == 0: return False - for i in range(n): - if not is_time(values[i]): - return False + + if skipna: + for i in range(n): + val = values[i] + if not util._checknull(val) and not is_time(val): + return False + else: + for i in range(n): + val = values[i] + if not is_time(val): + return False return True diff --git 
a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index ec5fe45d7f610..2ed31d4e2f320 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -240,6 +240,9 @@ def test_infer_dtype_bytes(self): arr = arr.astype(object) assert lib.infer_dtype(arr) == compare + # object array of bytes with missing values + assert lib.infer_dtype([b'a', np.nan, b'c'], skipna=True) == compare + def test_isinf_scalar(self): # GH 11352 assert lib.isposinf_scalar(float('inf')) @@ -445,6 +448,10 @@ def test_bools(self): result = lib.infer_dtype(arr) assert result == 'boolean' + arr = np.array([True, np.nan, False], dtype='O') + result = lib.infer_dtype(arr, skipna=True) + assert result == 'boolean' + def test_floats(self): arr = np.array([1., 2., 3., np.float64(4), np.float32(5)], dtype='O') result = lib.infer_dtype(arr) @@ -473,11 +480,26 @@ def test_decimals(self): result = lib.infer_dtype(arr) assert result == 'mixed' + arr = np.array([Decimal(1), Decimal('NaN'), Decimal(3)]) + result = lib.infer_dtype(arr) + assert result == 'decimal' + + arr = np.array([Decimal(1), np.nan, Decimal(3)], dtype='O') + result = lib.infer_dtype(arr) + assert result == 'decimal' + def test_string(self): pass def test_unicode(self): - pass + arr = [u'a', np.nan, u'c'] + result = lib.infer_dtype(arr) + assert result == 'mixed' + + arr = [u'a', np.nan, u'c'] + result = lib.infer_dtype(arr, skipna=True) + expected = 'unicode' if PY2 else 'string' + assert result == expected def test_datetime(self): @@ -715,10 +737,17 @@ def test_is_datetimelike_array_all_nan_nat_like(self): def test_date(self): - dates = [date(2012, 1, x) for x in range(1, 20)] + dates = [date(2012, 1, day) for day in range(1, 20)] index = Index(dates) assert index.inferred_type == 'date' + dates = [date(2012, 1, day) for day in range(1, 20)] + [np.nan] + result = lib.infer_dtype(dates) + assert result == 'mixed' + + result = lib.infer_dtype(dates, skipna=True) + assert 
result == 'date' + def test_to_object_array_tuples(self): r = (5, 6) values = [r]