From 96b8bb1cf0febfbaa1d5f27ff1545fed6b13b80b Mon Sep 17 00:00:00 2001 From: jschendel Date: Wed, 28 Feb 2018 18:07:15 -0700 Subject: [PATCH 01/16] ENH: Implement DataFrame.astype('category') (#18099) --- doc/source/categorical.rst | 98 ++++++++++++++++++++++++++----- doc/source/whatsnew/v0.23.0.txt | 31 ++++++++++ pandas/core/generic.py | 9 ++- pandas/tests/frame/test_dtypes.py | 25 +++++--- 4 files changed, 139 insertions(+), 24 deletions(-) diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index efcc04d688334..3d4bb8ec57794 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -46,9 +46,14 @@ The categorical data type is useful in the following cases: See also the :ref:`API docs on categoricals`. +.. _categorical.objectcreation: + Object Creation --------------- +Series Creation +~~~~~~~~~~~~~~~ + Categorical ``Series`` or columns in a ``DataFrame`` can be created in several ways: By specifying ``dtype="category"`` when constructing a ``Series``: @@ -77,7 +82,7 @@ discrete bins. See the :ref:`example on tiling ` in the docs df['group'] = pd.cut(df.value, range(0, 105, 10), right=False, labels=labels) df.head(10) -By passing a :class:`pandas.Categorical` object to a `Series` or assigning it to a `DataFrame`. +By passing a :class:`pandas.Categorical` object to a ``Series`` or assigning it to a ``DataFrame``. .. ipython:: python @@ -89,6 +94,55 @@ By passing a :class:`pandas.Categorical` object to a `Series` or assigning it to df["B"] = raw_cat df +Categorical data has a specific ``category`` :ref:`dtype `: + +.. ipython:: python + + df.dtypes + +DataFrame Creation +~~~~~~~~~~~~~~~~~~ + +Similar to the previous section where a single column was converted to categorical, all columns in a +``DataFrame`` can be batch converted to categorical either during or after construction. + +This can be done during construction by specifying ``dtype="category"`` in the ``DataFrame`` constructor: + +.. ipython:: python + + df = pd.DataFrame({'A': list('abca'), 'B': list('bccd')}, dtype="category") + df.dtypes + +Note that the categories present in each column differ; the conversion is done column by column, so +only labels present in a given column are categories: + +.. ipython:: python + + df['A'] + df['B'] + + +.. versionadded:: 0.23.0 + +Analogously, all columns in an existing ``DataFrame`` can be batch converted using :meth:`DataFrame.astype`: + +.. ipython:: python + + df = pd.DataFrame({'A': list('abca'), 'B': list('bccd')}) + df_cat = df.astype('category') + df_cat.dtypes + +This conversion is likewise done column by column: + +.. ipython:: python + + df_cat['A'] + df_cat['B'] + + +Controlling Behavior +~~~~~~~~~~~~~~~~~~~~ + In the examples above where we passed ``dtype='category'``, we used the default behavior: @@ -108,21 +162,36 @@ of :class:`~pandas.api.types.CategoricalDtype`. s_cat = s.astype(cat_type) s_cat -Categorical data has a specific ``category`` :ref:`dtype `: +Similarly, a ``CategoricalDtype`` can be used with a ``DataFrame`` to ensure that categories +are consistent among all columns. .. ipython:: python - df.dtypes + df = pd.DataFrame({'A': list('abca'), 'B': list('bccd')}) + cat_type = CategoricalDtype(categories=list('abcd'), + ordered=True) + df_cat = df.astype(cat_type) + df_cat['A'] + df_cat['B'] .. note:: - In contrast to R's `factor` function, categorical data is not converting input values to - strings and categories will end up the same data type as the original values. 
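A minimal sketch of the behavior described in the note above (an editorial
illustration, assuming only the ``pd.Series`` API already shown in this
document): integer values keep their integer dtype when they become
categories, rather than being converted to strings::

    In [1]: s = pd.Series([1, 2, 3, 1], dtype="category")

    In [2]: s.cat.categories.dtype
    Out[2]: dtype('int64')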
+ To perform table-wise conversion, where all labels in the entire ``DataFrame`` are used as + categories for each column, the ``categories`` parameter can be determined programatically by + ``categories = pd.unique(df.values.ravel())``. -.. note:: +If you already have ``codes`` and ``categories``, you can use the +:func:`~pandas.Categorical.from_codes` constructor to save the factorize step +during normal constructor mode: - In contrast to R's `factor` function, there is currently no way to assign/change labels at - creation time. Use `categories` to change the categories after creation time. +.. ipython:: python + + splitter = np.random.choice([0,1], 5, p=[0.5,0.5]) + s = pd.Series(pd.Categorical.from_codes(splitter, categories=["train", "test"])) + + +Regaining Original Data +~~~~~~~~~~~~~~~~~~~~~~~ To get back to the original ``Series`` or NumPy array, use ``Series.astype(original_dtype)`` or ``np.asarray(categorical)``: @@ -136,14 +205,15 @@ To get back to the original ``Series`` or NumPy array, use s2.astype(str) np.asarray(s2) -If you already have `codes` and `categories`, you can use the -:func:`~pandas.Categorical.from_codes` constructor to save the factorize step -during normal constructor mode: +.. note:: -.. ipython:: python + In contrast to R's `factor` function, categorical data is not converting input values to + strings; categories will end up the same data type as the original values. - splitter = np.random.choice([0,1], 5, p=[0.5,0.5]) - s = pd.Series(pd.Categorical.from_codes(splitter, categories=["train", "test"])) +.. note:: + + In contrast to R's `factor` function, there is currently no way to assign/change labels at + creation time. Use `categories` to change the categories after creation time. .. _categorical.categoricaldtype: diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 542e62aa374be..e9ba073312064 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -268,6 +268,37 @@ The :func:`DataFrame.assign` now accepts dependent keyword arguments for python df.assign(A=df.A+1, C= lambda df: df.A* -1) + +.. _whatsnew_0230.enhancements.astype_category: + +``DataFrame.astype`` performs column-wise conversion to ``Categorical`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:meth:`DataFrame.astype` can now perform column-wise conversion to ``Categorical`` by supplying the string ``'category'`` or +a :class:`~pandas.api.types.CategoricalDtype`. Previously, attempting this would raise a ``NotImplementedError``. See the +:ref:`categorical.objectcreation` section of the documentation for more details and examples. (:issue:`12860`, :issue:`18099`) + +Supplying the string ``'category'`` performs column-wise conversion, with only labels appearing in a given column set as categories: + +.. ipython:: python + + df = pd.DataFrame({'A': list('abca'), 'B': list('bccd')}) + df = df.astype('category') + df['A'].dtype + df['B'].dtype + + +Supplying a ``CategoricalDtype`` will make the categories in each column consistent with the supplied dtype: + +.. ipython:: python + + from pandas.api.types import CategoricalDtype + df = pd.DataFrame({'A': list('abca'), 'B': list('bccd')}) + cdt = CategoricalDtype(categories=list('abcd'), ordered=True) + df = df.astype(cdt) + df['A'].dtype + df['B'].dtype + .. 
_whatsnew_0230.enhancements.other: Other Enhancements diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e1ed6ae9c8a6c..c4eb7dd7e7a7e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -18,6 +18,7 @@ is_number, is_integer, is_bool, is_bool_dtype, + is_categorical_dtype, is_numeric_dtype, is_datetime64_dtype, is_timedelta64_dtype, @@ -4429,14 +4430,18 @@ def astype(self, dtype, copy=True, errors='raise', **kwargs): if col_name not in self: raise KeyError('Only a column name can be used for the ' 'key in a dtype mappings argument.') - from pandas import concat results = [] for col_name, col in self.iteritems(): if col_name in dtype: results.append(col.astype(dtype[col_name], copy=copy)) else: results.append(results.append(col.copy() if copy else col)) - return concat(results, axis=1, copy=False) + return pd.concat(results, axis=1, copy=False) + + elif is_categorical_dtype(dtype) and self.ndim > 1: + # GH 18099: columnwise conversion to categorical + results = (self[col].astype(dtype, copy=copy) for col in self) + return pd.concat(results, axis=1, copy=False) # else, only a single dtype is given new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors, diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index e9e5b2a447a4a..430d43019afc2 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -8,11 +8,11 @@ import numpy as np from pandas import (DataFrame, Series, date_range, Timedelta, Timestamp, - compat, concat, option_context) + Categorical, compat, concat, option_context) from pandas.compat import u from pandas import _np_version_under1p14 -from pandas.core.dtypes.dtypes import DatetimeTZDtype +from pandas.core.dtypes.dtypes import DatetimeTZDtype, CategoricalDtype from pandas.tests.frame.common import TestData from pandas.util.testing import (assert_series_equal, assert_frame_equal, @@ -619,12 +619,21 @@ def test_astype_duplicate_col(self): expected = concat([a1_str, b, a2_str], axis=1) assert_frame_equal(result, expected) - @pytest.mark.parametrize('columns', [['x'], ['x', 'y'], ['x', 'y', 'z']]) - def test_categorical_astype_ndim_raises(self, columns): - # GH 18004 - msg = '> 1 ndim Categorical are not supported at this time' - with tm.assert_raises_regex(NotImplementedError, msg): - DataFrame(columns=columns).astype('category') + @pytest.mark.parametrize('dtype', [ + 'category', + CategoricalDtype(), + CategoricalDtype(ordered=True), + CategoricalDtype(ordered=False), + CategoricalDtype(categories=list('abcdef')), + CategoricalDtype(categories=list('edba'), ordered=False), + CategoricalDtype(categories=list('edcb'), ordered=True)], ids=repr) + def test_astype_categorical(self, dtype): + # GH 18099 + d = {'A': list('abbc'), 'B': list('bccd'), 'C': list('cdde')} + df = DataFrame(d) + result = df.astype(dtype) + expected = DataFrame({k: Categorical(d[k], dtype=dtype) for k in d}) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("cls", [ pd.api.types.CategoricalDtype, From 4a276979f27e7e2bd7ef9965d7c8a166d011f240 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Wed, 28 Feb 2018 17:30:18 -0800 Subject: [PATCH 02/16] Cythonized GroupBy any (#19722) --- asv_bench/benchmarks/groupby.py | 16 +++- doc/source/api.rst | 5 ++ doc/source/whatsnew/v0.23.0.txt | 5 +- pandas/_libs/groupby.pyx | 57 ++++++++++++ pandas/core/groupby.py | 127 +++++++++++++++++++++++++-- pandas/tests/groupby/test_groupby.py | 25 ++++++ 6 files changed, 222 insertions(+), 13 deletions(-) diff --git 
a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index c347442784d41..3e7e5c821b14c 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -11,6 +11,13 @@ from .pandas_vb_common import setup # noqa +method_blacklist = { + 'object': {'median', 'prod', 'sem', 'cumsum', 'sum', 'cummin', 'mean', + 'max', 'skew', 'cumprod', 'cummax', 'rank', 'pct_change', 'min', + 'var', 'mad', 'describe', 'std'} +} + + class ApplyDictReturn(object): goal_time = 0.2 @@ -153,6 +160,7 @@ def time_frame_nth_any(self, df): def time_frame_nth(self, df): df.groupby(0).nth(0) + def time_series_nth_any(self, df): df[1].groupby(df[0]).nth(0, dropna='any') @@ -369,7 +377,7 @@ class GroupByMethods(object): goal_time = 0.2 param_names = ['dtype', 'method'] - params = [['int', 'float'], + params = [['int', 'float', 'object'], ['all', 'any', 'bfill', 'count', 'cumcount', 'cummax', 'cummin', 'cumprod', 'cumsum', 'describe', 'ffill', 'first', 'head', 'last', 'mad', 'max', 'min', 'median', 'mean', 'nunique', @@ -377,15 +385,19 @@ class GroupByMethods(object): 'std', 'sum', 'tail', 'unique', 'value_counts', 'var']] def setup(self, dtype, method): + if method in method_blacklist.get(dtype, {}): + raise NotImplementedError # skip benchmark ngroups = 1000 size = ngroups * 2 rng = np.arange(ngroups) values = rng.take(np.random.randint(0, ngroups, size=size)) if dtype == 'int': key = np.random.randint(0, size, size=size) - else: + elif dtype == 'float': key = np.concatenate([np.random.random(ngroups) * 0.1, np.random.random(ngroups) * 10.0]) + elif dtype == 'object': + key = ['foo'] * size df = DataFrame({'values': values, 'key': key}) self.df_groupby_method = getattr(df.groupby('key')['values'], method) diff --git a/doc/source/api.rst b/doc/source/api.rst index 0e47499a03f3a..a5e26bc948a70 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -2179,8 +2179,12 @@ Computations / Descriptive Stats .. autosummary:: :toctree: generated/ + GroupBy.all + GroupBy.any + GroupBy.bfill GroupBy.count GroupBy.cumcount + GroupBy.ffill GroupBy.first GroupBy.head GroupBy.last @@ -2192,6 +2196,7 @@ Computations / Descriptive Stats GroupBy.nth GroupBy.ohlc GroupBy.prod + GroupBy.rank GroupBy.size GroupBy.sem GroupBy.std diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index e9ba073312064..2e65fc9a44e29 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -729,9 +729,10 @@ Performance Improvements - Improved performance of :func:`DataFrame.median` with ``axis=1`` when bottleneck is not installed (:issue:`16468`) - Improved performance of :func:`MultiIndex.get_loc` for large indexes, at the cost of a reduction in performance for small ones (:issue:`18519`) - Improved performance of pairwise ``.rolling()`` and ``.expanding()`` with ``.cov()`` and ``.corr()`` operations (:issue:`17917`) -- Improved performance of :func:`DataFrameGroupBy.rank` (:issue:`15779`) +- Improved performance of :func:`pandas.core.groupby.GroupBy.rank` (:issue:`15779`) - Improved performance of variable ``.rolling()`` on ``.min()`` and ``.max()`` (:issue:`19521`) -- Improved performance of ``GroupBy.ffill`` and ``GroupBy.bfill`` (:issue:`11296`) +- Improved performance of :func:`pandas.core.groupby.GroupBy.ffill` and :func:`pandas.core.groupby.GroupBy.bfill` (:issue:`11296`) +- Improved performance of :func:`pandas.core.groupby.GroupBy.any` and :func:`pandas.core.groupby.GroupBy.all` (:issue:`15435`) .. 
_whatsnew_0230.docs: diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index e3d208a915225..d3fcd84e5f38d 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -310,5 +310,62 @@ def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels, filled_vals = 0 +@cython.boundscheck(False) +@cython.wraparound(False) +def group_any_all(ndarray[uint8_t] out, + ndarray[int64_t] labels, + ndarray[uint8_t] values, + ndarray[uint8_t] mask, + object val_test, + bint skipna): + """Aggregated boolean values to show truthfulness of group elements + + Parameters + ---------- + out : array of values which this method will write its results to + labels : array containing unique label for each group, with its + ordering matching up to the corresponding record in `values` + values : array containing the truth value of each element + mask : array indicating whether a value is na or not + val_test : str {'any', 'all'} + String object dictating whether to use any or all truth testing + skipna : boolean + Flag to ignore nan values during truth testing + + Notes + ----- + This method modifies the `out` parameter rather than returning an object. + The returned values will either be 0 or 1 (False or True, respectively). + """ + cdef: + Py_ssize_t i, N=len(labels) + int64_t lab + uint8_t flag_val + + if val_test == 'all': + # Because the 'all' value of an empty iterable in Python is True we can + # start with an array full of ones and set to zero when a False value + # is encountered + flag_val = 0 + elif val_test == 'any': + # Because the 'any' value of an empty iterable in Python is False we + # can start with an array full of zeros and set to one only if any + # value encountered is True + flag_val = 1 + else: + raise ValueError("'bool_func' must be either 'any' or 'all'!") + + out.fill(1 - flag_val) + + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0 or (skipna and mask[i]): + continue + + if values[i] == flag_val: + out[lab] = flag_val + + # generated from template include "groupby_helper.pxi" diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 00643614e8803..b8ca104c4b2c7 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1219,6 +1219,53 @@ class GroupBy(_GroupBy): """ _apply_whitelist = _common_apply_whitelist + def _bool_agg(self, val_test, skipna): + """Shared func to call any / all Cython GroupBy implementations""" + + def objs_to_bool(vals): + try: + vals = vals.astype(np.bool) + except ValueError: # for objects + vals = np.array([bool(x) for x in vals]) + + return vals.view(np.uint8) + + def result_to_bool(result): + return result.astype(np.bool, copy=False) + + return self._get_cythonized_result('group_any_all', self.grouper, + aggregate=True, + cython_dtype=np.uint8, + needs_values=True, + needs_mask=True, + pre_processing=objs_to_bool, + post_processing=result_to_bool, + val_test=val_test, skipna=skipna) + + @Substitution(name='groupby') + @Appender(_doc_template) + def any(self, skipna=True): + """Returns True if any value in the group is truthful, else False + + Parameters + ---------- + skipna : bool, default True + Flag to ignore nan values during truth testing + """ + return self._bool_agg('any', skipna) + + @Substitution(name='groupby') + @Appender(_doc_template) + def all(self, skipna=True): + """Returns True if all values in the group are truthful, else False + + Parameters + ---------- + skipna : bool, default True + Flag to ignore nan values during truth testing + """ + return self._bool_agg('all', 
skipna) + @Substitution(name='groupby') @Appender(_doc_template) def count(self): @@ -1485,6 +1532,8 @@ def _fill(self, direction, limit=None): return self._get_cythonized_result('group_fillna_indexer', self.grouper, needs_mask=True, + cython_dtype=np.int64, + result_is_index=True, direction=direction, limit=limit) @Substitution(name='groupby') @@ -1873,18 +1922,40 @@ def cummax(self, axis=0, **kwargs): return self._cython_transform('cummax', numeric_only=False) - def _get_cythonized_result(self, how, grouper, needs_mask=False, - needs_ngroups=False, **kwargs): + def _get_cythonized_result(self, how, grouper, aggregate=False, + cython_dtype=None, needs_values=False, + needs_mask=False, needs_ngroups=False, + result_is_index=False, + pre_processing=None, post_processing=None, + **kwargs): """Get result for Cythonized functions Parameters ---------- how : str, Cythonized function name to be called grouper : Grouper object containing pertinent group info + aggregate : bool, default False + Whether the result should be aggregated to match the number of + groups + cython_dtype : default None + Type of the array that will be modified by the Cython call. If + `None`, the type will be inferred from the values of each slice + needs_values : bool, default False + Whether the values should be a part of the Cython call + signature needs_mask : bool, default False - Whether boolean mask needs to be part of the Cython call signature + Whether boolean mask needs to be part of the Cython call + signature needs_ngroups : bool, default False - Whether number of groups part of the Cython call signature + Whether number of groups is part of the Cython call signature + result_is_index : bool, default False + Whether the result of the Cython operation is an index of + values to be retrieved, instead of the actual values themselves + pre_processing : function, default None + Function to be applied to `values` prior to passing to Cython + Raises if `needs_values` is False + post_processing : function, default None + Function to be applied to result of Cython function **kwargs : dict Extra arguments to be passed back to Cython funcs @@ -1892,14 +1963,40 @@ def _get_cythonized_result(self, how, grouper, needs_mask=False, ------- `Series` or `DataFrame` with filled values """ + if result_is_index and aggregate: + raise ValueError("'result_is_index' and 'aggregate' cannot both " + "be True!") + if post_processing: + if not callable(pre_processing): + raise ValueError("'post_processing' must be a callable!") + if pre_processing: + if not callable(pre_processing): + raise ValueError("'pre_processing' must be a callable!") + if not needs_values: + raise ValueError("Cannot use 'pre_processing' without " + "specifying 'needs_values'!") labels, _, ngroups = grouper.group_info output = collections.OrderedDict() base_func = getattr(libgroupby, how) for name, obj in self._iterate_slices(): - indexer = np.zeros_like(labels, dtype=np.int64) - func = partial(base_func, indexer, labels) + if aggregate: + result_sz = ngroups + else: + result_sz = len(obj.values) + + if not cython_dtype: + cython_dtype = obj.values.dtype + + result = np.zeros(result_sz, dtype=cython_dtype) + func = partial(base_func, result, labels) + if needs_values: + vals = obj.values + if pre_processing: + vals = pre_processing(vals) + func = partial(func, vals) + if needs_mask: mask = isnull(obj.values).view(np.uint8) func = partial(func, mask) @@ -1908,9 +2005,19 @@ def _get_cythonized_result(self, how, grouper, needs_mask=False, func = partial(func, ngroups) 
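            # Editorial comments only (not part of the original patch):
            # at this point `func` is the Cython kernel with its positional
            # arguments accumulated via functools.partial -- the output
            # buffer and labels always, plus values/mask/ngroups whenever the
            # corresponding needs_* flag was set.  The remaining keyword
            # options (e.g. val_test/skipna for group_any_all, periods for
            # group_shift_indexer) arrive through **kwargs in the call below.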
func(**kwargs) # Call func to modify indexer values in place - output[name] = algorithms.take_nd(obj.values, indexer) - return self._wrap_transformed_output(output) + if result_is_index: + result = algorithms.take_nd(obj.values, result) + + if post_processing: + result = post_processing(result) + + output[name] = result + + if aggregate: + return self._wrap_aggregated_output(output) + else: + return self._wrap_transformed_output(output) @Substitution(name='groupby') @Appender(_doc_template) @@ -1930,7 +2037,9 @@ def shift(self, periods=1, freq=None, axis=0): return self.apply(lambda x: x.shift(periods, freq, axis)) return self._get_cythonized_result('group_shift_indexer', - self.grouper, needs_ngroups=True, + self.grouper, cython_dtype=np.int64, + needs_ngroups=True, + result_is_index=True, periods=periods) @Substitution(name='groupby') diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 2429e9975fc8e..0561b3a1d8592 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -9,6 +9,7 @@ from pandas import (date_range, bdate_range, Timestamp, Index, MultiIndex, DataFrame, Series, concat, Panel, DatetimeIndex, read_csv) +from pandas.core.dtypes.missing import isna from pandas.errors import UnsupportedFunctionCall, PerformanceWarning from pandas.util.testing import (assert_frame_equal, assert_index_equal, assert_series_equal, assert_almost_equal) @@ -2116,6 +2117,30 @@ def interweave(list_obj): exp = DataFrame({'key': keys, 'val': _exp_vals}) assert_frame_equal(result, exp) + @pytest.mark.parametrize("agg_func", ['any', 'all']) + @pytest.mark.parametrize("skipna", [True, False]) + @pytest.mark.parametrize("vals", [ + ['foo', 'bar', 'baz'], ['foo', '', ''], ['', '', ''], + [1, 2, 3], [1, 0, 0], [0, 0, 0], + [1., 2., 3.], [1., 0., 0.], [0., 0., 0.], + [True, True, True], [True, False, False], [False, False, False], + [np.nan, np.nan, np.nan] + ]) + def test_groupby_bool_aggs(self, agg_func, skipna, vals): + df = DataFrame({'key': ['a'] * 3 + ['b'] * 3, 'val': vals * 2}) + + # Figure out expectation using Python builtin + exp = getattr(compat.builtins, agg_func)(vals) + + # edge case for missing data with skipna and 'any' + if skipna and all(isna(vals)) and agg_func == 'any': + exp = False + + exp_df = DataFrame([exp] * 2, columns=['val'], index=pd.Index( + ['a', 'b'], name='key')) + result = getattr(df.groupby('key'), agg_func)(skipna=skipna) + assert_frame_equal(result, exp_df) + def test_dont_clobber_name_column(self): df = DataFrame({'key': ['a', 'a', 'a', 'b', 'b', 'b'], 'name': ['foo', 'bar', 'baz'] * 2}) From 52559f535788b3c59c6c78d2cf987b03303b16f6 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 28 Feb 2018 17:32:24 -0800 Subject: [PATCH 03/16] ENH: Allow Timestamp to accept Nanosecond argument (#19889) --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/_libs/tslibs/conversion.pxd | 3 ++- pandas/_libs/tslibs/conversion.pyx | 4 +-- pandas/_libs/tslibs/timestamps.pyx | 25 ++++++++++++++----- .../tests/scalar/timestamp/test_timestamp.py | 21 ++++++++++++++++ 5 files changed, 45 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 2e65fc9a44e29..90ce6b47728fb 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -618,6 +618,7 @@ Datetimelike API Changes - Subtraction of :class:`Series` with timezone-aware ``dtype='datetime64[ns]'`` with mis-matched timezones will raise ``TypeError`` instead of ``ValueError`` 
(:issue:`18817`) - :func:`pandas.merge` provides a more informative error message when trying to merge on timezone-aware and timezone-naive columns (:issue:`15800`) - For :class:`DatetimeIndex` and :class:`TimedeltaIndex` with ``freq=None``, addition or subtraction of integer-dtyped array or ``Index`` will raise ``NullFrequencyError`` instead of ``TypeError`` (:issue:`19895`) +- :class:`Timestamp` constructor now accepts a `nanosecond` keyword or positional argument (:issue:`18898`) .. _whatsnew_0230.api.other: diff --git a/pandas/_libs/tslibs/conversion.pxd b/pandas/_libs/tslibs/conversion.pxd index 868c2641b34db..8f887dc3af203 100644 --- a/pandas/_libs/tslibs/conversion.pxd +++ b/pandas/_libs/tslibs/conversion.pxd @@ -16,7 +16,8 @@ cdef class _TSObject: cdef convert_to_tsobject(object ts, object tz, object unit, - bint dayfirst, bint yearfirst) + bint dayfirst, bint yearfirst, + int32_t nanos=*) cdef _TSObject convert_datetime_to_tsobject(datetime ts, object tz, int32_t nanos=*) diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 4726bd7ea3629..f4841e6abb7e8 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -252,7 +252,7 @@ cpdef int64_t pydt_to_i8(object pydt) except? -1: cdef convert_to_tsobject(object ts, object tz, object unit, - bint dayfirst, bint yearfirst): + bint dayfirst, bint yearfirst, int32_t nanos=0): """ Extract datetime and int64 from any of: - np.int64 (with unit providing a possible modifier) @@ -297,7 +297,7 @@ cdef convert_to_tsobject(object ts, object tz, object unit, obj.value = ts dt64_to_dtstruct(ts, &obj.dts) elif PyDateTime_Check(ts): - return convert_datetime_to_tsobject(ts, tz) + return convert_datetime_to_tsobject(ts, tz, nanos) elif PyDate_Check(ts): # Keep the converter same as PyDateTime's ts = datetime.combine(ts, datetime_time()) diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index ed77916a1d887..421f781483290 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -107,6 +107,7 @@ cdef class _Timestamp(datetime): cdef readonly: int64_t value, nanosecond object freq # frequency reference + list _date_attributes def __hash__(_Timestamp self): if self.nanosecond: @@ -425,6 +426,8 @@ class Timestamp(_Timestamp): .. versionadded:: 0.19.0 hour, minute, second, microsecond : int, optional, default 0 .. versionadded:: 0.19.0 + nanosecond : int, optional, default 0 + .. versionadded:: 0.23.0 tzinfo : datetime.tzinfo, optional, default None .. versionadded:: 0.19.0 @@ -556,7 +559,7 @@ class Timestamp(_Timestamp): object freq=None, tz=None, unit=None, year=None, month=None, day=None, hour=None, minute=None, second=None, microsecond=None, - tzinfo=None): + nanosecond=None, tzinfo=None): # The parameter list folds together legacy parameter names (the first # four) and positional and keyword parameter names from pydatetime. # @@ -580,6 +583,9 @@ class Timestamp(_Timestamp): cdef _TSObject ts + _date_attributes = [year, month, day, hour, minute, second, + microsecond, nanosecond] + if tzinfo is not None: if not PyTZInfo_Check(tzinfo): # tzinfo must be a datetime.tzinfo object, GH#17690 @@ -588,7 +594,14 @@ class Timestamp(_Timestamp): elif tz is not None: raise ValueError('Can provide at most one of tz, tzinfo') - if ts_input is _no_input: + if is_string_object(ts_input): + # User passed a date string to parse. + # Check that the user didn't also pass a date attribute kwarg. 
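+            # Editorial comments only (not part of the original patch):
+            # `_date_attributes` was collected above as [year, month, day,
+            # hour, minute, second, microsecond, nanosecond], so mixing a
+            # date string with any component keyword, e.g.
+            # Timestamp('2010-10-10', nanosecond=1), raises instead of
+            # silently ignoring the keyword (see the test added below).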
+ if any(arg is not None for arg in _date_attributes): + raise ValueError('Cannot pass a date attribute keyword ' + 'argument when passing a date string') + + elif ts_input is _no_input: # User passed keyword arguments. if tz is None: # Handle the case where the user passes `tz` and not `tzinfo` @@ -596,20 +609,20 @@ class Timestamp(_Timestamp): return Timestamp(datetime(year, month, day, hour or 0, minute or 0, second or 0, microsecond or 0, tzinfo), - tz=tz) + nanosecond=nanosecond, tz=tz) elif is_integer_object(freq): # User passed positional arguments: # Timestamp(year, month, day[, hour[, minute[, second[, - # microsecond[, tzinfo]]]]]) + # microsecond[, nanosecond[, tzinfo]]]]]]) return Timestamp(datetime(ts_input, freq, tz, unit or 0, year or 0, month or 0, day or 0, - hour), tz=hour) + minute), nanosecond=hour, tz=minute) if tzinfo is not None: # User passed tzinfo instead of tz; avoid silently ignoring tz, tzinfo = tzinfo, None - ts = convert_to_tsobject(ts_input, tz, unit, 0, 0) + ts = convert_to_tsobject(ts_input, tz, unit, 0, 0, nanosecond or 0) if ts.value == NPY_NAT: return NaT diff --git a/pandas/tests/scalar/timestamp/test_timestamp.py b/pandas/tests/scalar/timestamp/test_timestamp.py index 7695c94409232..504a76f259e55 100644 --- a/pandas/tests/scalar/timestamp/test_timestamp.py +++ b/pandas/tests/scalar/timestamp/test_timestamp.py @@ -385,6 +385,27 @@ def test_constructor_fromordinal(self): ts = Timestamp.fromordinal(dt_tz.toordinal(), tz='US/Eastern') assert ts.to_pydatetime() == dt_tz + @pytest.mark.parametrize('result', [ + Timestamp(datetime(2000, 1, 2, 3, 4, 5, 6), nanosecond=1), + Timestamp(year=2000, month=1, day=2, hour=3, minute=4, second=5, + microsecond=6, nanosecond=1), + Timestamp(year=2000, month=1, day=2, hour=3, minute=4, second=5, + microsecond=6, nanosecond=1, tz='UTC'), + Timestamp(2000, 1, 2, 3, 4, 5, 6, 1, None), + Timestamp(2000, 1, 2, 3, 4, 5, 6, 1, pytz.UTC)]) + def test_constructor_nanosecond(self, result): + # GH 18898 + expected = Timestamp(datetime(2000, 1, 2, 3, 4, 5, 6), tz=result.tz) + expected = expected + Timedelta(nanoseconds=1) + assert result == expected + + @pytest.mark.parametrize('arg', ['year', 'month', 'day', 'hour', 'minute', + 'second', 'microsecond', 'nanosecond']) + def test_invalid_date_kwarg_with_string_input(self, arg): + kwarg = {arg: 1} + with pytest.raises(ValueError): + Timestamp('2010-10-10 12:59:59.999999999', **kwarg) + def test_out_of_bounds_value(self): one_us = np.timedelta64(1).astype('timedelta64[us]') From c8859b57b891701f250fb05f2cc60d2e6cae2d6b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 1 Mar 2018 10:35:05 +0100 Subject: [PATCH 04/16] DOC: script to build single docstring page (#19840) --- doc/make.py | 166 +++++++++++++++++++++++++++------- doc/source/conf.py | 41 +-------- doc/source/contributing.rst | 27 ++++-- doc/source/index.rst.template | 5 + 4 files changed, 161 insertions(+), 78 deletions(-) diff --git a/doc/make.py b/doc/make.py index e3cb29aa3e086..2819a62347627 100755 --- a/doc/make.py +++ b/doc/make.py @@ -14,11 +14,14 @@ import sys import os import shutil -import subprocess +# import subprocess import argparse from contextlib import contextmanager +import webbrowser import jinja2 +import pandas + DOC_PATH = os.path.dirname(os.path.abspath(__file__)) SOURCE_PATH = os.path.join(DOC_PATH, 'source') @@ -26,28 +29,6 @@ BUILD_DIRS = ['doctrees', 'html', 'latex', 'plots', '_static', '_templates'] -def _generate_index(include_api, single_doc=None): - """Create index.rst file with the 
specified sections. - - Parameters - ---------- - include_api : bool - Whether API documentation will be built. - single_doc : str or None - If provided, this single documentation page will be generated. - """ - if single_doc is not None: - single_doc = os.path.splitext(os.path.basename(single_doc))[0] - include_api = False - - with open(os.path.join(SOURCE_PATH, 'index.rst.template')) as f: - t = jinja2.Template(f.read()) - - with open(os.path.join(SOURCE_PATH, 'index.rst'), 'w') as f: - f.write(t.render(include_api=include_api, - single_doc=single_doc)) - - @contextmanager def _maybe_exclude_notebooks(): """Skip building the notebooks if pandoc is not installed. @@ -58,6 +39,7 @@ def _maybe_exclude_notebooks(): 1. nbconvert isn't installed, or 2. nbconvert is installed, but pandoc isn't """ + # TODO move to exclude_pattern base = os.path.dirname(__file__) notebooks = [os.path.join(base, 'source', nb) for nb in ['style.ipynb']] @@ -96,8 +78,110 @@ class DocBuilder: All public methods of this class can be called as parameters of the script. """ - def __init__(self, num_jobs=1): + def __init__(self, num_jobs=1, include_api=True, single_doc=None): self.num_jobs = num_jobs + self.include_api = include_api + self.single_doc = None + self.single_doc_type = None + if single_doc is not None: + self._process_single_doc(single_doc) + self.exclude_patterns = self._exclude_patterns + + self._generate_index() + if self.single_doc_type == 'docstring': + self._run_os('sphinx-autogen', '-o', + 'source/generated_single', 'source/index.rst') + + @property + def _exclude_patterns(self): + """Docs source files that will be excluded from building.""" + # TODO move maybe_exclude_notebooks here + if self.single_doc is not None: + rst_files = [f for f in os.listdir(SOURCE_PATH) + if ((f.endswith('.rst') or f.endswith('.ipynb')) + and (f != 'index.rst') + and (f != '{0}.rst'.format(self.single_doc)))] + if self.single_doc_type != 'api': + rst_files += ['generated/*.rst'] + elif not self.include_api: + rst_files = ['api.rst', 'generated/*.rst'] + else: + rst_files = ['generated_single/*.rst'] + + exclude_patterns = ','.join( + '{!r}'.format(i) for i in ['**.ipynb_checkpoints'] + rst_files) + + return exclude_patterns + + def _process_single_doc(self, single_doc): + """Extract self.single_doc (base name) and self.single_doc_type from + passed single_doc kwarg. + + """ + self.include_api = False + + if single_doc == 'api.rst': + self.single_doc_type = 'api' + self.single_doc = 'api' + elif os.path.exists(os.path.join(SOURCE_PATH, single_doc)): + self.single_doc_type = 'rst' + self.single_doc = os.path.splitext(os.path.basename(single_doc))[0] + elif os.path.exists( + os.path.join(SOURCE_PATH, '{}.rst'.format(single_doc))): + self.single_doc_type = 'rst' + self.single_doc = single_doc + elif single_doc is not None: + try: + obj = pandas + for name in single_doc.split('.'): + obj = getattr(obj, name) + except AttributeError: + raise ValueError('Single document not understood, it should ' + 'be a file in doc/source/*.rst (e.g. ' + '"contributing.rst" or a pandas function or ' + 'method (e.g. "pandas.DataFrame.head")') + else: + self.single_doc_type = 'docstring' + if single_doc.startswith('pandas.'): + self.single_doc = single_doc[len('pandas.'):] + else: + self.single_doc = single_doc + + def _copy_generated_docstring(self): + """Copy existing generated (from api.rst) docstring page because + this is more correct in certain cases (where a custom autodoc + template is used). 
+ + """ + fname = os.path.join(SOURCE_PATH, 'generated', + 'pandas.{}.rst'.format(self.single_doc)) + temp_dir = os.path.join(SOURCE_PATH, 'generated_single') + + try: + os.makedirs(temp_dir) + except OSError: + pass + + if os.path.exists(fname): + try: + # copying to make sure sphinx always thinks it is new + # and needs to be re-generated (to pick source code changes) + shutil.copy(fname, temp_dir) + except: # noqa + pass + + def _generate_index(self): + """Create index.rst file with the specified sections.""" + if self.single_doc_type == 'docstring': + self._copy_generated_docstring() + + with open(os.path.join(SOURCE_PATH, 'index.rst.template')) as f: + t = jinja2.Template(f.read()) + + with open(os.path.join(SOURCE_PATH, 'index.rst'), 'w') as f: + f.write(t.render(include_api=self.include_api, + single_doc=self.single_doc, + single_doc_type=self.single_doc_type)) @staticmethod def _create_build_structure(): @@ -121,7 +205,10 @@ def _run_os(*args): -------- >>> DocBuilder()._run_os('python', '--version') """ - subprocess.check_call(args, stderr=subprocess.STDOUT) + # TODO check_call should be more safe, but it fails with + # exclude patterns, needs investigation + # subprocess.check_call(args, stderr=subprocess.STDOUT) + os.system(' '.join(args)) def _sphinx_build(self, kind): """Call sphinx to build documentation. @@ -142,11 +229,21 @@ def _sphinx_build(self, kind): self._run_os('sphinx-build', '-j{}'.format(self.num_jobs), '-b{}'.format(kind), - '-d{}'.format(os.path.join(BUILD_PATH, - 'doctrees')), + '-d{}'.format(os.path.join(BUILD_PATH, 'doctrees')), + '-Dexclude_patterns={}'.format(self.exclude_patterns), SOURCE_PATH, os.path.join(BUILD_PATH, kind)) + def _open_browser(self): + base_url = os.path.join('file://', DOC_PATH, 'build', 'html') + if self.single_doc_type == 'docstring': + url = os.path.join( + base_url, + 'generated_single', 'pandas.{}.html'.format(self.single_doc)) + else: + url = os.path.join(base_url, '{}.html'.format(self.single_doc)) + webbrowser.open(url, new=2) + def html(self): """Build HTML documentation.""" self._create_build_structure() @@ -156,6 +253,11 @@ def html(self): if os.path.exists(zip_fname): os.remove(zip_fname) + if self.single_doc is not None: + self._open_browser() + shutil.rmtree(os.path.join(SOURCE_PATH, 'generated_single'), + ignore_errors=True) + def latex(self, force=False): """Build PDF documentation.""" self._create_build_structure() @@ -222,8 +324,8 @@ def main(): metavar='FILENAME', type=str, default=None, - help=('filename of section to compile, ' - 'e.g. "indexing"')) + help=('filename of section or method name to ' + 'compile, e.g. 
"indexing", "DataFrame.join"')) argparser.add_argument('--python-path', type=str, default=os.path.join(DOC_PATH, '..'), @@ -235,8 +337,10 @@ def main(): args.command, ', '.join(cmds))) os.environ['PYTHONPATH'] = args.python_path - _generate_index(not args.no_api, args.single) - getattr(DocBuilder(args.num_jobs), args.command)() + + getattr(DocBuilder(args.num_jobs, + not args.no_api, + args.single), args.command)() if __name__ == '__main__': diff --git a/doc/source/conf.py b/doc/source/conf.py index b5fbf096f2626..835127e5094e4 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -18,7 +18,6 @@ import importlib import warnings -from pandas.compat import u, PY3 try: raw_input # Python 2 @@ -86,38 +85,6 @@ if any(re.match("\s*api\s*", l) for l in index_rst_lines): autosummary_generate = True -files_to_delete = [] -for f in os.listdir(os.path.dirname(__file__)): - if (not f.endswith(('.ipynb', '.rst')) or - f.startswith('.') or os.path.basename(f) == 'index.rst'): - continue - - _file_basename = os.path.splitext(f)[0] - _regex_to_match = "\s*{}\s*$".format(_file_basename) - if not any(re.match(_regex_to_match, line) for line in index_rst_lines): - files_to_delete.append(f) - -if files_to_delete: - print("I'm about to DELETE the following:\n{}\n".format( - list(sorted(files_to_delete)))) - sys.stdout.write("WARNING: I'd like to delete those " - "to speed up processing (yes/no)? ") - if PY3: - answer = input() - else: - answer = raw_input() - - if answer.lower().strip() in ('y', 'yes'): - for f in files_to_delete: - f = os.path.join(os.path.join(os.path.dirname(__file__), f)) - f = os.path.abspath(f) - try: - print("Deleting {}".format(f)) - os.unlink(f) - except: - print("Error deleting {}".format(f)) - pass - # Add any paths that contain templates here, relative to this directory. templates_path = ['../_templates'] @@ -131,8 +98,8 @@ master_doc = 'index' # General information about the project. -project = u('pandas') -copyright = u('2008-2014, the pandas development team') +project = u'pandas' +copyright = u'2008-2014, the pandas development team' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -343,8 +310,8 @@ # file, target name, title, author, documentclass [howto/manual]). latex_documents = [ ('index', 'pandas.tex', - u('pandas: powerful Python data analysis toolkit'), - u('Wes McKinney\n\& PyData Development Team'), 'manual'), + u'pandas: powerful Python data analysis toolkit', + u'Wes McKinney\n\& PyData Development Team', 'manual'), ] # The name of an image file (relative to this directory) to place at the top of diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst index 258ab874cafcf..e159af9958fde 100644 --- a/doc/source/contributing.rst +++ b/doc/source/contributing.rst @@ -171,7 +171,7 @@ We'll now kick off a three-step process: # Create and activate the build environment conda env create -f ci/environment-dev.yaml conda activate pandas-dev - + # or with older versions of Anaconda: source activate pandas-dev @@ -388,14 +388,11 @@ If you want to do a full clean build, do:: python make.py html You can tell ``make.py`` to compile only a single section of the docs, greatly -reducing the turn-around time for checking your changes. You will be prompted to -delete ``.rst`` files that aren't required. This is okay because the prior -versions of these files can be checked out from git. 
However, you must make sure -not to commit the file deletions to your Git repository! +reducing the turn-around time for checking your changes. :: - #omit autosummary and API section + # omit autosummary and API section python make.py clean python make.py --no-api @@ -404,10 +401,20 @@ not to commit the file deletions to your Git repository! python make.py clean python make.py --single indexing -For comparison, a full documentation build may take 10 minutes, a ``-no-api`` build -may take 3 minutes and a single section may take 15 seconds. Subsequent builds, which -only process portions you have changed, will be faster. Open the following file in a web -browser to see the full documentation you just built:: + # compile the reference docs for a single function + python make.py clean + python make.py --single DataFrame.join + +For comparison, a full documentation build may take 15 minutes, but a single +section may take 15 seconds. Subsequent builds, which only process portions +you have changed, will be faster. + +You can also specify to use multiple cores to speed up the documentation build:: + + python make.py html --num-jobs 4 + +Open the following file in a web browser to see the full documentation you +just built:: pandas/docs/build/html/index.html diff --git a/doc/source/index.rst.template b/doc/source/index.rst.template index eff1227e98994..cb6cce5edaf79 100644 --- a/doc/source/index.rst.template +++ b/doc/source/index.rst.template @@ -106,8 +106,13 @@ Some other notes See the package overview for more detail about what's in the library. +{% if single_doc_type == 'docstring' -%} +.. autosummary:: + :toctree: generated_single/ +{% else -%} .. toctree:: :maxdepth: 4 +{% endif %} {% if single_doc -%} {{ single_doc }} From 3b4eb8d9bc9a34b055b62f352c957e826386900d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 1 Mar 2018 12:12:35 +0100 Subject: [PATCH 05/16] CLN: remove redundant clean_fill_method calls (#19947) --- pandas/core/generic.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index c4eb7dd7e7a7e..79c783040dc97 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4711,7 +4711,7 @@ def fillna(self, value=None, method=None, axis=None, inplace=False, if axis is None: axis = 0 axis = self._get_axis_number(axis) - method = missing.clean_fill_method(method) + from pandas import DataFrame if value is None: @@ -4732,7 +4732,6 @@ def fillna(self, value=None, method=None, axis=None, inplace=False, # 3d elif self.ndim == 3: - # fill in 2d chunks result = {col: s.fillna(method=method, value=value) for col, s in self.iteritems()} @@ -4742,7 +4741,6 @@ def fillna(self, value=None, method=None, axis=None, inplace=False, else: # 2d or less - method = missing.clean_fill_method(method) new_data = self._data.interpolate(method=method, axis=axis, limit=limit, inplace=inplace, coerce=True, From 9958ce68a19477721d2ba53bde2b17bb52ebebaa Mon Sep 17 00:00:00 2001 From: jschendel Date: Thu, 1 Mar 2018 04:14:19 -0700 Subject: [PATCH 06/16] BUG: Preserve column metadata with DataFrame.astype (#19948) --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/core/generic.py | 16 ++++++++++------ pandas/tests/frame/test_dtypes.py | 9 +++++++++ 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 90ce6b47728fb..fb19fd81fe7c7 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -954,6 +954,7 @@ Reshaping - 
Bug in :func:`qcut` where datetime and timedelta data with ``NaT`` present raised a ``ValueError`` (:issue:`19768`) - Bug in :func:`DataFrame.iterrows`, which would infers strings not compliant to `ISO8601 `_ to datetimes (:issue:`19671`) - Bug in :class:`Series` constructor with ``Categorical`` where a ```ValueError`` is not raised when an index of different length is given (:issue:`19342`) +- Bug in :meth:`DataFrame.astype` where column metadata is lost when converting to categorical or a dictionary of dtypes (:issue:`19920`) Other ^^^^^ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 79c783040dc97..c411e29b5cc02 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4436,17 +4436,21 @@ def astype(self, dtype, copy=True, errors='raise', **kwargs): results.append(col.astype(dtype[col_name], copy=copy)) else: results.append(results.append(col.copy() if copy else col)) - return pd.concat(results, axis=1, copy=False) elif is_categorical_dtype(dtype) and self.ndim > 1: # GH 18099: columnwise conversion to categorical results = (self[col].astype(dtype, copy=copy) for col in self) - return pd.concat(results, axis=1, copy=False) - # else, only a single dtype is given - new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors, - **kwargs) - return self._constructor(new_data).__finalize__(self) + else: + # else, only a single dtype is given + new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors, + **kwargs) + return self._constructor(new_data).__finalize__(self) + + # GH 19920: retain column metadata after concat + result = pd.concat(results, axis=1, copy=False) + result.columns = self.columns + return result def copy(self, deep=True): """ diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index 430d43019afc2..90daa9aa882c8 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -649,6 +649,15 @@ def test_astype_categoricaldtype_class_raises(self, cls): with tm.assert_raises_regex(TypeError, xpr): df['A'].astype(cls) + @pytest.mark.parametrize('dtype', [ + {100: 'float64', 200: 'uint64'}, 'category', 'float64']) + def test_astype_column_metadata(self, dtype): + # GH 19920 + columns = pd.UInt64Index([100, 200, 300], name='foo') + df = DataFrame(np.arange(15).reshape(5, 3), columns=columns) + df = df.astype(dtype) + tm.assert_index_equal(df.columns, columns) + @pytest.mark.parametrize("dtype", ["M8", "m8"]) @pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D']) def test_astype_from_datetimelike_to_objectt(self, dtype, unit): From c5a1ef1505262e50a91dc78a647ff88586858243 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 1 Mar 2018 22:48:39 +0100 Subject: [PATCH 07/16] DOC: remove empty attribute/method lists from class docstrings html page (#19949) --- doc/source/conf.py | 40 ++++++++++++++++++++++++++++++++++ pandas/core/indexes/numeric.py | 2 +- pandas/core/indexes/range.py | 4 ++++ 3 files changed, 45 insertions(+), 1 deletion(-) diff --git a/doc/source/conf.py b/doc/source/conf.py index 835127e5094e4..c81d38db05cca 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -552,6 +552,45 @@ def remove_flags_docstring(app, what, name, obj, options, lines): del lines[:] +def process_class_docstrings(app, what, name, obj, options, lines): + """ + For those classes for which we use :: + + :template: autosummary/class_without_autosummary.rst + + the documented attributes/methods have to be listed in the class + docstring. 
However, if one of those lists is empty, we use 'None', + which then generates warnings in sphinx / ugly html output. + This "autodoc-process-docstring" event connector removes that part + from the processed docstring. + + """ + if what == "class": + joined = '\n'.join(lines) + + templates = [ + """.. rubric:: Attributes + +.. autosummary:: + :toctree: + + None +""", + """.. rubric:: Methods + +.. autosummary:: + :toctree: + + None +""" + ] + + for template in templates: + if template in joined: + joined = joined.replace(template, '') + lines[:] = joined.split('\n') + + suppress_warnings = [ # We "overwrite" autosummary with our PandasAutosummary, but # still want the regular autosummary setup to run. So we just @@ -562,6 +601,7 @@ def remove_flags_docstring(app, what, name, obj, options, lines): def setup(app): app.connect("autodoc-process-docstring", remove_flags_docstring) + app.connect("autodoc-process-docstring", process_class_docstrings) app.add_autodocumenter(AccessorDocumenter) app.add_autodocumenter(AccessorAttributeDocumenter) app.add_autodocumenter(AccessorMethodDocumenter) diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index a4558116bfa63..1fe0c8fa289e6 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -131,7 +131,7 @@ def is_all_dates(self): Attributes ---------- - inferred_type + None Methods ------- diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 9d770cffb0059..7c266dc889368 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -53,6 +53,10 @@ class RangeIndex(Int64Index): Index : The base pandas Index type Int64Index : Index of int64 data + Attributes + ---------- + None + Methods ------- from_range From 9242248068e3f9ba18bef2b83005c3a80b873835 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Thu, 1 Mar 2018 14:50:35 -0800 Subject: [PATCH 08/16] BUG: DataFrame.diff(axis=0) with DatetimeTZ data (#19773) --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/core/internals.py | 29 +++++++++++++++++++++++++++ pandas/tests/frame/test_timeseries.py | 26 ++++++++++++++++++++++++ 3 files changed, 56 insertions(+) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index fb19fd81fe7c7..1ede0310aa902 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -833,6 +833,7 @@ Timezones - Bug in the :class:`DataFrame` constructor, where tz-aware Datetimeindex and a given column name will result in an empty ``DataFrame`` (:issue:`19157`) - Bug in :func:`Timestamp.tz_localize` where localizing a timestamp near the minimum or maximum valid values could overflow and return a timestamp with an incorrect nanosecond value (:issue:`12677`) - Bug when iterating over :class:`DatetimeIndex` that was localized with fixed timezone offset that rounded nanosecond precision to microseconds (:issue:`19603`) +- Bug in :func:`DataFrame.diff` that raised an ``IndexError`` with tz-aware values (:issue:`18578`) Offsets ^^^^^^^ diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 00ef8f9cef598..240c9b1f3377c 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -2905,6 +2905,35 @@ def shift(self, periods, axis=0, mgr=None): return [self.make_block_same_class(new_values, placement=self.mgr_locs)] + def diff(self, n, axis=0, mgr=None): + """1st discrete difference + + Parameters + ---------- + n : int, number of periods to diff + axis : int, axis to diff upon. 
default 0 + mgr : default None + + Return + ------ + A list with a new TimeDeltaBlock. + + Note + ---- + The arguments here are mimicking shift so they are called correctly + by apply. + """ + if axis == 0: + # Cannot currently calculate diff across multiple blocks since this + # function is invoked via apply + raise NotImplementedError + new_values = (self.values - self.shift(n, axis=axis)[0].values).asi8 + + # Reshape the new_values like how algos.diff does for timedelta data + new_values = new_values.reshape(1, len(new_values)) + new_values = new_values.astype('timedelta64[ns]') + return [TimeDeltaBlock(new_values, placement=self.mgr_locs.indexer)] + def concat_same_type(self, to_concat, placement=None): """ Concatenate list of single blocks of the same type. diff --git a/pandas/tests/frame/test_timeseries.py b/pandas/tests/frame/test_timeseries.py index e1bc310e1e934..ceb6c942c81b1 100644 --- a/pandas/tests/frame/test_timeseries.py +++ b/pandas/tests/frame/test_timeseries.py @@ -57,6 +57,32 @@ def test_diff(self): 1), 'z': pd.Series(1)}).astype('float64') assert_frame_equal(result, expected) + @pytest.mark.parametrize('tz', [None, 'UTC']) + def test_diff_datetime_axis0(self, tz): + # GH 18578 + df = DataFrame({0: date_range('2010', freq='D', periods=2, tz=tz), + 1: date_range('2010', freq='D', periods=2, tz=tz)}) + + result = df.diff(axis=0) + expected = DataFrame({0: pd.TimedeltaIndex(['NaT', '1 days']), + 1: pd.TimedeltaIndex(['NaT', '1 days'])}) + assert_frame_equal(result, expected) + + @pytest.mark.parametrize('tz', [None, 'UTC']) + def test_diff_datetime_axis1(self, tz): + # GH 18578 + df = DataFrame({0: date_range('2010', freq='D', periods=2, tz=tz), + 1: date_range('2010', freq='D', periods=2, tz=tz)}) + if tz is None: + result = df.diff(axis=1) + expected = DataFrame({0: pd.TimedeltaIndex(['NaT', 'NaT']), + 1: pd.TimedeltaIndex(['0 days', + '0 days'])}) + assert_frame_equal(result, expected) + else: + with pytest.raises(NotImplementedError): + result = df.diff(axis=1) + def test_diff_timedelta(self): # GH 4533 df = DataFrame(dict(time=[Timestamp('20130101 9:01'), From 87fefe237aa371a56409f5bb08821770fe9ae2ea Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 1 Mar 2018 14:54:42 -0800 Subject: [PATCH 09/16] dispatch Series[datetime64] comparison ops to DatetimeIndex (#19800) --- pandas/core/indexes/datetimes.py | 8 ++--- pandas/core/ops.py | 34 ++++++++++--------- .../indexes/datetimes/test_partial_slicing.py | 4 +-- pandas/tests/series/test_arithmetic.py | 17 ++++++++++ pandas/tests/test_base.py | 23 ++++++++----- 5 files changed, 55 insertions(+), 31 deletions(-) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index eb8133a1bbf97..c9b446b97e956 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -138,11 +138,9 @@ def wrapper(self, other): result = func(np.asarray(other)) result = com._values_from_object(result) - if isinstance(other, Index): - o_mask = other.values.view('i8') == libts.iNaT - else: - o_mask = other.view('i8') == libts.iNaT - + # Make sure to pass an array to result[...]; indexing with + # Series breaks with older version of numpy + o_mask = np.array(isna(other)) if o_mask.any(): result[o_mask] = nat_result diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 83f17a332f4be..931e91b941a7e 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -10,8 +10,7 @@ import numpy as np import pandas as pd -from pandas._libs import (lib, index as libindex, - algos as libalgos, ops as 
libops) +from pandas._libs import algos as libalgos, ops as libops from pandas import compat from pandas.util._decorators import Appender @@ -1127,24 +1126,20 @@ def na_op(x, y): # integer comparisons # we have a datetime/timedelta and may need to convert + assert not needs_i8_conversion(x) mask = None - if (needs_i8_conversion(x) or - (not is_scalar(y) and needs_i8_conversion(y))): - - if is_scalar(y): - mask = isna(x) - y = libindex.convert_scalar(x, com._values_from_object(y)) - else: - mask = isna(x) | isna(y) - y = y.view('i8') + if not is_scalar(y) and needs_i8_conversion(y): + mask = isna(x) | isna(y) + y = y.view('i8') x = x.view('i8') - try: + method = getattr(x, name, None) + if method is not None: with np.errstate(all='ignore'): - result = getattr(x, name)(y) + result = method(y) if result is NotImplemented: raise TypeError("invalid type comparison") - except AttributeError: + else: result = op(x, y) if mask is not None and mask.any(): @@ -1174,6 +1169,14 @@ def wrapper(self, other, axis=None): return self._constructor(res_values, index=self.index, name=res_name) + if is_datetime64_dtype(self) or is_datetime64tz_dtype(self): + # Dispatch to DatetimeIndex to ensure identical + # Series/Index behavior + res_values = dispatch_to_index_op(op, self, other, + pd.DatetimeIndex) + return self._constructor(res_values, index=self.index, + name=res_name) + elif is_timedelta64_dtype(self): res_values = dispatch_to_index_op(op, self, other, pd.TimedeltaIndex) @@ -1191,8 +1194,7 @@ def wrapper(self, other, axis=None): elif isinstance(other, (np.ndarray, pd.Index)): # do not check length of zerodim array # as it will broadcast - if (not is_scalar(lib.item_from_zerodim(other)) and - len(self) != len(other)): + if other.ndim != 0 and len(self) != len(other): raise ValueError('Lengths must match to compare') res_values = na_op(self.values, np.asarray(other)) diff --git a/pandas/tests/indexes/datetimes/test_partial_slicing.py b/pandas/tests/indexes/datetimes/test_partial_slicing.py index 6bb4229883525..f263ac78cd343 100644 --- a/pandas/tests/indexes/datetimes/test_partial_slicing.py +++ b/pandas/tests/indexes/datetimes/test_partial_slicing.py @@ -2,7 +2,7 @@ import pytest -from datetime import datetime, date +from datetime import datetime import numpy as np import pandas as pd import operator as op @@ -349,7 +349,7 @@ def test_loc_datetime_length_one(self): @pytest.mark.parametrize('datetimelike', [ Timestamp('20130101'), datetime(2013, 1, 1), - date(2013, 1, 1), np.datetime64('2013-01-01T00:00', 'ns')]) + np.datetime64('2013-01-01T00:00', 'ns')]) @pytest.mark.parametrize('op,expected', [ (op.lt, [True, False, False, False]), (op.le, [True, True, False, False]), diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index 5b8d9cfab3e0d..ec0d7296e540e 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -88,6 +88,23 @@ def test_ser_cmp_result_names(self, names, op): class TestTimestampSeriesComparison(object): + def test_dt64ser_cmp_date_invalid(self): + # GH#19800 datetime.date comparison raises to + # match DatetimeIndex/Timestamp. 
This also matches the behavior + # of stdlib datetime.datetime + ser = pd.Series(pd.date_range('20010101', periods=10), name='dates') + date = ser.iloc[0].to_pydatetime().date() + assert not (ser == date).any() + assert (ser != date).all() + with pytest.raises(TypeError): + ser > date + with pytest.raises(TypeError): + ser < date + with pytest.raises(TypeError): + ser >= date + with pytest.raises(TypeError): + ser <= date + def test_dt64ser_cmp_period_scalar(self): ser = Series(pd.period_range('2000-01-01', periods=10, freq='D')) val = Period('2000-01-04', freq='D') diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 4b5ad336139b0..6247079e4ac3a 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -10,7 +10,7 @@ import pandas as pd import pandas.compat as compat from pandas.core.dtypes.common import ( - is_object_dtype, is_datetimetz, + is_object_dtype, is_datetimetz, is_datetime64_dtype, needs_i8_conversion) import pandas.util.testing as tm from pandas import (Series, Index, DatetimeIndex, TimedeltaIndex, @@ -296,14 +296,21 @@ def test_none_comparison(self): # result = None != o # noqa # assert result.iat[0] # assert result.iat[1] + if (is_datetime64_dtype(o) or is_datetimetz(o)): + # Following DatetimeIndex (and Timestamp) convention, + # inequality comparisons with Series[datetime64] raise + with pytest.raises(TypeError): + None > o + with pytest.raises(TypeError): + o > None + else: + result = None > o + assert not result.iat[0] + assert not result.iat[1] - result = None > o - assert not result.iat[0] - assert not result.iat[1] - - result = o < None - assert not result.iat[0] - assert not result.iat[1] + result = o < None + assert not result.iat[0] + assert not result.iat[1] def test_ndarray_compat_properties(self): From d44a6ec2a9f6f28b968951d941e38539435c6bec Mon Sep 17 00:00:00 2001 From: Yian Date: Fri, 2 Mar 2018 00:02:31 +0100 Subject: [PATCH 10/16] Making to_datetime('today') and Timestamp('today') consistent (#19937) --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/_libs/tslib.pyx | 3 +-- pandas/tests/indexes/datetimes/test_tools.py | 19 +++++++++++++------ 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 1ede0310aa902..08363cd54c606 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -652,6 +652,7 @@ Other API Changes - Set operations (union, difference...) on :class:`IntervalIndex` with incompatible index types will now raise a ``TypeError`` rather than a ``ValueError`` (:issue:`19329`) - :class:`DateOffset` objects render more simply, e.g. ```` instead of ```` (:issue:`19403`) - ``Categorical.fillna`` now validates its ``value`` and ``method`` keyword arguments. It now raises when both or none are specified, matching the behavior of :meth:`Series.fillna` (:issue:`19682`) +- ``pd.to_datetime('today')`` now returns a datetime, consistent with ``pd.Timestamp('today')``; previously ``pd.to_datetime('today')`` returned a ``.normalized()`` datetime (:issue:`19935`) - :func:`Series.str.replace` now takes an optional `regex` keyword which, when set to ``False``, uses literal string replacement rather than regex replacement (:issue:`16808`) .. 
_whatsnew_0230.deprecations: diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index fec7f21d6e6eb..17453d8af1297 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -755,8 +755,7 @@ cdef inline bint _parse_today_now(str val, int64_t* iresult): iresult[0] = Timestamp.utcnow().value return True elif val == 'today': - # Note: this is *not* the same as Timestamp('today') - iresult[0] = Timestamp.now().normalize().value + iresult[0] = Timestamp.today().value return True return False diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index fbf0977a04d82..0d42b6e9692fe 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -224,27 +224,34 @@ def test_to_datetime_today(self): # this both of these timezones _and_ UTC will all be in the same day, # so this test will not detect the regression introduced in #18666. with tm.set_timezone('Pacific/Auckland'): # 12-13 hours ahead of UTC - nptoday = np.datetime64('today').astype('datetime64[ns]') + nptoday = np.datetime64('today')\ + .astype('datetime64[ns]').astype(np.int64) pdtoday = pd.to_datetime('today') pdtoday2 = pd.to_datetime(['today'])[0] + tstoday = pd.Timestamp('today') + tstoday2 = pd.Timestamp.today() + # These should all be equal with infinite perf; this gives # a generous margin of 10 seconds - assert abs(pdtoday.value - nptoday.astype(np.int64)) < 1e10 - assert abs(pdtoday2.value - nptoday.astype(np.int64)) < 1e10 + assert abs(pdtoday.normalize().value - nptoday) < 1e10 + assert abs(pdtoday2.normalize().value - nptoday) < 1e10 + assert abs(pdtoday.value - tstoday.value) < 1e10 + assert abs(pdtoday.value - tstoday2.value) < 1e10 assert pdtoday.tzinfo is None assert pdtoday2.tzinfo is None with tm.set_timezone('US/Samoa'): # 11 hours behind UTC - nptoday = np.datetime64('today').astype('datetime64[ns]') + nptoday = np.datetime64('today')\ + .astype('datetime64[ns]').astype(np.int64) pdtoday = pd.to_datetime('today') pdtoday2 = pd.to_datetime(['today'])[0] # These should all be equal with infinite perf; this gives # a generous margin of 10 seconds - assert abs(pdtoday.value - nptoday.astype(np.int64)) < 1e10 - assert abs(pdtoday2.value - nptoday.astype(np.int64)) < 1e10 + assert abs(pdtoday.normalize().value - nptoday) < 1e10 + assert abs(pdtoday2.normalize().value - nptoday) < 1e10 assert pdtoday.tzinfo is None assert pdtoday2.tzinfo is None From 072545d213da4a29f0133be78facdb912db6b083 Mon Sep 17 00:00:00 2001 From: David C Hall Date: Thu, 1 Mar 2018 15:06:20 -0800 Subject: [PATCH 11/16] ENH: Add option to disable MathJax (#19824). (#19856) --- doc/source/options.rst | 4 ++++ doc/source/whatsnew/v0.23.0.txt | 1 + pandas/core/config_init.py | 8 ++++++++ pandas/io/formats/format.py | 3 +++ pandas/io/formats/style.py | 12 +++++++++++- pandas/tests/io/formats/test_format.py | 7 +++++++ pandas/tests/io/formats/test_style.py | 7 +++++++ 7 files changed, 41 insertions(+), 1 deletion(-) diff --git a/doc/source/options.rst b/doc/source/options.rst index cce16a5396377..a82be4d84bf3f 100644 --- a/doc/source/options.rst +++ b/doc/source/options.rst @@ -402,6 +402,10 @@ display.html.table_schema False Whether to publish a Table display.html.border 1 A ``border=value`` attribute is inserted in the ```` tag for the DataFrame HTML repr. +display.html.use_mathjax True When True, Jupyter notebook will process + table contents using MathJax, rendering + mathematical expressions enclosed by the + dollar symbol. 
io.excel.xls.writer xlwt The default Excel writer engine for 'xls' files. io.excel.xlsm.writer openpyxl The default Excel writer engine for diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 08363cd54c606..f2c96ba3f53a8 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -337,6 +337,7 @@ Other Enhancements - Added :func:`SeriesGroupBy.is_monotonic_increasing` and :func:`SeriesGroupBy.is_monotonic_decreasing` (:issue:`17015`) - For subclassed ``DataFrames``, :func:`DataFrame.apply` will now preserve the ``Series`` subclass (if defined) when passing the data to the applied function (:issue:`19822`) - :func:`DataFrame.from_dict` now accepts a ``columns`` argument that can be used to specify the column names when ``orient='index'`` is used (:issue:`18529`) +- Added option ``display.html.use_mathjax`` so `MathJax `_ can be disabled when rendering tables in ``Jupyter`` notebooks (:issue:`19856`, :issue:`19824`) .. _whatsnew_0230.api_breaking: diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index da42cdbf10233..0edbf892172a9 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -207,6 +207,12 @@ def use_numexpr_cb(key): (currently both are identical) """ +pc_html_use_mathjax_doc = """\ +: boolean + When True, Jupyter notebook will process table contents using MathJax, + rendering mathematical expressions enclosed by the dollar symbol. + (default: True) +""" pc_width_doc = """ : int @@ -358,6 +364,8 @@ def table_schema_cb(key): validator=is_bool, cb=table_schema_cb) cf.register_option('html.border', 1, pc_html_border_doc, validator=is_int) + cf.register_option('html.use_mathjax', True, pc_html_use_mathjax_doc, + validator=is_bool) with cf.config_prefix('html'): cf.register_option('border', 1, pc_html_border_doc, diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 621641747f376..50b4f11634b78 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1209,6 +1209,9 @@ def write_result(self, buf): frame = self.frame _classes = ['dataframe'] # Default class. 
+ use_mathjax = get_option("display.html.use_mathjax") + if not use_mathjax: + _classes.append('tex2jax_ignore') if self.classes is not None: if isinstance(self.classes, str): self.classes = self.classes.split() diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 525f487d8aa39..f876ceb8a26bf 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -325,9 +325,19 @@ def format_attr(pair): .format(row=r, col=c)}) body.append(row_es) + table_attr = self.table_attributes + use_mathjax = get_option("display.html.use_mathjax") + if not use_mathjax: + table_attr = table_attr or '' + if 'class="' in table_attr: + table_attr = table_attr.replace('class="', + 'class="tex2jax_ignore ') + else: + table_attr += ' class="tex2jax_ignore"' + return dict(head=head, cellstyle=cellstyle, body=body, uuid=uuid, precision=precision, table_styles=table_styles, - caption=caption, table_attributes=self.table_attributes) + caption=caption, table_attributes=table_attr) def format(self, formatter, subset=None): """ diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 03c071dbe4bc5..6c3b75cdfa6df 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -1434,6 +1434,13 @@ def test_repr_html(self): tm.reset_display_options() + def test_repr_html_mathjax(self): + df = DataFrame([[1, 2], [3, 4]]) + assert 'tex2jax_ignore' not in df._repr_html_() + + with pd.option_context('display.html.use_mathjax', False): + assert 'tex2jax_ignore' in df._repr_html_() + def test_repr_html_wide(self): max_cols = get_option('display.max_columns') df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1))) diff --git a/pandas/tests/io/formats/test_style.py b/pandas/tests/io/formats/test_style.py index adf8e14b756c2..c1ab9cd184340 100644 --- a/pandas/tests/io/formats/test_style.py +++ b/pandas/tests/io/formats/test_style.py @@ -46,6 +46,13 @@ def test_init_series(self): def test_repr_html_ok(self): self.styler._repr_html_() + def test_repr_html_mathjax(self): + # gh-19824 + assert 'tex2jax_ignore' not in self.styler._repr_html_() + + with pd.option_context('display.html.use_mathjax', False): + assert 'tex2jax_ignore' in self.styler._repr_html_() + def test_update_ctx(self): self.styler._update_ctx(self.attrs) expected = {(0, 0): ['color: red'], From 5f271eb2e76a78b089879e3528018463ea0eeb1c Mon Sep 17 00:00:00 2001 From: Yian Date: Fri, 2 Mar 2018 00:13:58 +0100 Subject: [PATCH 12/16] BUG: Adding skipna as an option to groupby cumsum and cumprod (#19914) --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/_libs/groupby.pyx | 16 ++++++++++++++-- pandas/core/groupby.py | 6 ++++-- pandas/tests/groupby/test_transform.py | 25 +++++++++++++++++++++++++ 4 files changed, 44 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index f2c96ba3f53a8..7a19f87051746 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -927,6 +927,7 @@ Groupby/Resample/Rolling - Bug in :func:`DataFrame.groupby` passing the `on=` kwarg, and subsequently using ``.apply()`` (:issue:`17813`) - Bug in :func:`DataFrame.resample().aggregate` not raising a ``KeyError`` when aggregating a non-existent column (:issue:`16766`, :issue:`19566`) - Fixed a performance regression for ``GroupBy.nth`` and ``GroupBy.last`` with some object columns (:issue:`19283`) +- Bug in :func:`DataFrameGroupBy.cumsum` and :func:`DataFrameGroupBy.cumprod` when ``skipna`` was passed 
(:issue:`19806`) Sparse ^^^^^^ diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index d3fcd84e5f38d..43afd1e0f5969 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -139,7 +139,8 @@ def group_median_float64(ndarray[float64_t, ndim=2] out, def group_cumprod_float64(float64_t[:, :] out, float64_t[:, :] values, int64_t[:] labels, - bint is_datetimelike): + bint is_datetimelike, + bint skipna=True): """ Only transforms on axis=0 """ @@ -163,6 +164,11 @@ def group_cumprod_float64(float64_t[:, :] out, if val == val: accum[lab, j] *= val out[i, j] = accum[lab, j] + else: + out[i, j] = NaN + if not skipna: + accum[lab, j] = NaN + break @cython.boundscheck(False) @@ -170,7 +176,8 @@ def group_cumprod_float64(float64_t[:, :] out, def group_cumsum(numeric[:, :] out, numeric[:, :] values, int64_t[:] labels, - is_datetimelike): + is_datetimelike, + bint skipna=True): """ Only transforms on axis=0 """ @@ -196,6 +203,11 @@ def group_cumsum(numeric[:, :] out, if val == val: accum[lab, j] += val out[i, j] = accum[lab, j] + else: + out[i, j] = NaN + if not skipna: + accum[lab, j] = NaN + break else: accum[lab, j] += val out[i, j] = accum[lab, j] diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index b8ca104c4b2c7..4a09d636ee320 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1888,7 +1888,8 @@ def rank(self, method='average', ascending=True, na_option='keep', @Appender(_doc_template) def cumprod(self, axis=0, *args, **kwargs): """Cumulative product for each group""" - nv.validate_groupby_func('cumprod', args, kwargs, ['numeric_only']) + nv.validate_groupby_func('cumprod', args, kwargs, + ['numeric_only', 'skipna']) if axis != 0: return self.apply(lambda x: x.cumprod(axis=axis, **kwargs)) @@ -1898,7 +1899,8 @@ def cumprod(self, axis=0, *args, **kwargs): @Appender(_doc_template) def cumsum(self, axis=0, *args, **kwargs): """Cumulative sum for each group""" - nv.validate_groupby_func('cumsum', args, kwargs, ['numeric_only']) + nv.validate_groupby_func('cumsum', args, kwargs, + ['numeric_only', 'skipna']) if axis != 0: return self.apply(lambda x: x.cumsum(axis=axis, **kwargs)) diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index 1be7dfdcc64e6..b418bb0c5fea6 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -498,6 +498,31 @@ def test_cython_transform_series(self, op, args, targop): tm.assert_series_equal(expected, getattr( data.groupby(labels), op)(*args)) + @pytest.mark.parametrize("op", ['cumprod', 'cumsum']) + @pytest.mark.parametrize("skipna", [False, True]) + @pytest.mark.parametrize('input, exp', [ + # When everything is NaN + ({'key': ['b'] * 10, 'value': np.nan}, + pd.Series([np.nan] * 10, name='value')), + # When there is a single NaN + ({'key': ['b'] * 10 + ['a'] * 2, + 'value': [3] * 3 + [np.nan] + [3] * 8}, + {('cumprod', False): [3.0, 9.0, 27.0] + [np.nan] * 7 + [3.0, 9.0], + ('cumprod', True): [3.0, 9.0, 27.0, np.nan, 81., 243., 729., + 2187., 6561., 19683., 3.0, 9.0], + ('cumsum', False): [3.0, 6.0, 9.0] + [np.nan] * 7 + [3.0, 6.0], + ('cumsum', True): [3.0, 6.0, 9.0, np.nan, 12., 15., 18., + 21., 24., 27., 3.0, 6.0]})]) + def test_groupby_cum_skipna(self, op, skipna, input, exp): + df = pd.DataFrame(input) + result = df.groupby('key')['value'].transform(op, skipna=skipna) + if isinstance(exp, dict): + expected = exp[(op, skipna)] + else: + expected = exp + expected = pd.Series(expected, name='value') + 
tm.assert_series_equal(expected, result) + @pytest.mark.parametrize( "op, args, targop", [('cumprod', (), lambda x: x.cumprod()), From d615f86acc441e206b507b5d71033808a981b398 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Fri, 2 Mar 2018 09:39:45 +0000 Subject: [PATCH 13/16] DOC: Adding script to validate docstrings, and generate list of all functions/methods with state (#19898) --- scripts/validate_docstrings.py | 355 +++++++++++++++++++++++++++++++++ 1 file changed, 355 insertions(+) create mode 100755 scripts/validate_docstrings.py diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py new file mode 100755 index 0000000000000..7807785c24751 --- /dev/null +++ b/scripts/validate_docstrings.py @@ -0,0 +1,355 @@ +#!/usr/bin/env python +""" +Analyze docstrings to detect errors. + +If no argument is provided, it does a quick check of docstrings and returns +a csv with all API functions and results of basic checks. + +If a function or method is provided in the form "pandas.function", +"pandas.module.class.method", etc. a list of all errors in the docstring for +the specified function or method. + +Usage:: + $ ./validate_docstrings.py + $ ./validate_docstrings.py pandas.DataFrame.head +""" +import os +import sys +import csv +import re +import functools +import argparse +import contextlib +import inspect +import importlib +import doctest +import textwrap +try: + from io import StringIO +except ImportError: + from cStringIO import StringIO +import numpy + +BASE_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + +sys.path.insert(0, os.path.join(BASE_PATH)) +import pandas + +sys.path.insert(1, os.path.join(BASE_PATH, 'doc', 'sphinxext')) +from numpydoc.docscrape import NumpyDocString + + +def _to_original_callable(obj): + while True: + if inspect.isfunction(obj) or inspect.isclass(obj): + f = inspect.getfile(obj) + if f.startswith('<') and f.endswith('>'): + return None + return obj + if inspect.ismethod(obj): + obj = obj.__func__ + elif isinstance(obj, functools.partial): + obj = obj.func + elif isinstance(obj, property): + obj = obj.fget + else: + return None + + +def _output_header(title, width=80, char='#'): + full_line = char * width + side_len = (width - len(title) - 2) // 2 + adj = '' if len(title) % 2 == 0 else ' ' + title_line = '{side} {title}{adj} {side}'.format(side=char * side_len, + title=title, + adj=adj) + + return '\n{full_line}\n{title_line}\n{full_line}\n\n'.format( + full_line=full_line, title_line=title_line) + + +class Docstring: + def __init__(self, method_name, method_obj): + self.method_name = method_name + self.method_obj = method_obj + self.raw_doc = method_obj.__doc__ or '' + self.raw_doc = textwrap.dedent(self.raw_doc) + self.doc = NumpyDocString(self.raw_doc) + + def __len__(self): + return len(self.raw_doc) + + @property + def source_file_name(self): + fname = inspect.getsourcefile(self.method_obj) + if fname: + fname = os.path.relpath(fname, BASE_PATH) + return fname + + @property + def source_file_def_line(self): + try: + return inspect.getsourcelines(self.method_obj)[-1] + except OSError: + pass + + @property + def github_url(self): + url = 'https://github.com/pandas-dev/pandas/blob/master/' + url += '{}#L{}'.format(self.source_file_name, + self.source_file_def_line) + return url + + @property + def first_line_blank(self): + if self.raw_doc: + return not bool(self.raw_doc.split('\n')[0].strip()) + + @property + def summary(self): + if not self.doc['Extended Summary'] and len(self.doc['Summary']) > 1: + return '' + return ' 
'.join(self.doc['Summary']) + + @property + def extended_summary(self): + if not self.doc['Extended Summary'] and len(self.doc['Summary']) > 1: + return ' '.join(self.doc['Summary']) + return ' '.join(self.doc['Extended Summary']) + + @property + def needs_summary(self): + return not (bool(self.summary) and bool(self.extended_summary)) + + @property + def doc_parameters(self): + return self.doc['Parameters'] + + @property + def signature_parameters(self): + if not inspect.isfunction(self.method_obj): + return tuple() + params = tuple(inspect.signature(self.method_obj).parameters.keys()) + if params and params[0] in ('self', 'cls'): + return params[1:] + return params + + @property + def parameter_mismatches(self): + errs = [] + signature_params = self.signature_parameters + if self.doc_parameters: + doc_params = list(zip(*self.doc_parameters))[0] + else: + doc_params = [] + + missing = set(signature_params) - set(doc_params) + if missing: + errs.append('Parameters {!r} not documented'.format(missing)) + extra = set(doc_params) - set(signature_params) + if extra: + errs.append('Unknown parameters {!r}'.format(extra)) + if not missing and not extra and signature_params != doc_params: + errs.append('Wrong parameters order. ' + + 'Actual: {!r}. '.format(signature_params) + + 'Documented: {!r}'.format(doc_params)) + + return errs + + @property + def correct_parameters(self): + return not bool(self.parameter_mismatches) + + @property + def see_also(self): + return self.doc['See Also'] + + @property + def examples(self): + return self.doc['Examples'] + + @property + def first_line_ends_in_dot(self): + if self.doc: + return self.doc.split('\n')[0][-1] == '.' + + @property + def deprecated(self): + pattern = re.compile('.. deprecated:: ') + return (self.method_name.startswith('pandas.Panel') or + bool(pattern.search(self.summary)) or + bool(pattern.search(self.extended_summary))) + + @property + def examples_errors(self): + flags = doctest.NORMALIZE_WHITESPACE | doctest.IGNORE_EXCEPTION_DETAIL + finder = doctest.DocTestFinder() + runner = doctest.DocTestRunner(optionflags=flags) + context = {'np': numpy, 'pd': pandas} + error_msgs = '' + for test in finder.find(self.raw_doc, self.method_name, globs=context): + f = StringIO() + with contextlib.redirect_stdout(f): + runner.run(test) + error_msgs += f.getvalue() + return error_msgs + + +def get_api_items(): + api_fname = os.path.join(BASE_PATH, 'doc', 'source', 'api.rst') + + position = None + with open(api_fname) as f: + for line in f: + if line.startswith('.. currentmodule::'): + current_module = line.replace('.. currentmodule::', '').strip() + continue + + if line == '.. 
autosummary::\n': + position = 'autosummary' + continue + + if position == 'autosummary': + if line == '\n': + position = 'items' + continue + + if position == 'items': + if line == '\n': + position = None + continue + item = line.strip() + func = importlib.import_module(current_module) + for part in item.split('.'): + func = getattr(func, part) + + yield '.'.join([current_module, item]), func + + +def validate_all(): + writer = csv.writer(sys.stdout) + writer.writerow(['Function or method', + 'Type', + 'File', + 'Code line', + 'GitHub link', + 'Is deprecated', + 'Has summary', + 'Has extended summary', + 'Parameters ok', + 'Has examples', + 'Shared code with']) + seen = {} + for func_name, func in get_api_items(): + obj_type = type(func).__name__ + original_callable = _to_original_callable(func) + if original_callable is None: + writer.writerow([func_name, obj_type] + [''] * 9) + else: + doc = Docstring(func_name, original_callable) + key = doc.source_file_name, doc.source_file_def_line + shared_code = seen.get(key, '') + seen[key] = func_name + writer.writerow([func_name, + obj_type, + doc.source_file_name, + doc.source_file_def_line, + doc.github_url, + int(doc.deprecated), + int(bool(doc.summary)), + int(bool(doc.extended_summary)), + int(doc.correct_parameters), + int(bool(doc.examples)), + shared_code]) + + return 0 + + +def validate_one(func_name): + for maxsplit in range(1, func_name.count('.') + 1): + # TODO when py3 only replace by: module, *func_parts = ... + func_name_split = func_name.rsplit('.', maxsplit=maxsplit) + module = func_name_split[0] + func_parts = func_name_split[1:] + try: + func_obj = importlib.import_module(module) + except ImportError: + pass + else: + continue + + if 'module' not in locals(): + raise ImportError('No module can be imported ' + 'from "{}"'.format(func_name)) + + for part in func_parts: + func_obj = getattr(func_obj, part) + + doc = Docstring(func_name, func_obj) + + sys.stderr.write(_output_header('Docstring ({})'.format(func_name))) + sys.stderr.write('{}\n'.format(doc.raw_doc)) + + errs = [] + if not doc.summary: + errs.append('No summary found') + else: + if not doc.summary[0].isupper(): + errs.append('Summary does not start with capital') + if doc.summary[-1] != '.': + errs.append('Summary does not end with dot') + if doc.summary.split(' ')[0][-1] == 's': + errs.append('Summary must start with infinitive verb, ' + 'not third person (e.g. use "Generate" instead of ' + '"Generates")') + if not doc.extended_summary: + errs.append('No extended summary found') + + param_errs = doc.parameter_mismatches + if param_errs: + errs.append('Errors in parameters section') + for param_err in param_errs: + errs.append('\t{}'.format(param_err)) + + examples_errs = '' + if not doc.examples: + errs.append('No examples section found') + else: + examples_errs = doc.examples_errors + if examples_errs: + errs.append('Examples do not pass tests') + + sys.stderr.write(_output_header('Validation')) + if errs: + sys.stderr.write('Errors found:\n') + for err in errs: + sys.stderr.write('\t{}\n'.format(err)) + else: + sys.stderr.write('Docstring for "{}" correct. 
:)\n'.format(func_name)) + + if examples_errs: + sys.stderr.write(_output_header('Doctests')) + sys.stderr.write(examples_errs) + + return len(errs) + + +def main(function): + if function is None: + return validate_all() + else: + return validate_one(function) + + +if __name__ == '__main__': + argparser = argparse.ArgumentParser( + description='validate pandas docstrings') + argparser.add_argument('function', + nargs='?', + default=None, + help=('function or method to validate ' + '(e.g. pandas.DataFrame.head) ' + 'if not provided, all docstrings ' + 'are validated')) + args = argparser.parse_args() + sys.exit(main(args.function)) From e6c7dea1508592f6ee34310e751b6e084a786fa0 Mon Sep 17 00:00:00 2001 From: topper-123 Date: Fri, 2 Mar 2018 11:19:07 +0000 Subject: [PATCH 14/16] ENH: Let initialisation from dicts use insertion order for python >= 3.6 (part III) (#19884) --- doc/source/dsintro.rst | 37 +++++++++++++-- doc/source/whatsnew/v0.23.0.txt | 57 +++++++++++++++++++++-- pandas/core/common.py | 12 ++++- pandas/core/frame.py | 9 ++-- pandas/core/panel.py | 6 +-- pandas/core/series.py | 9 +++- pandas/core/sparse/frame.py | 7 ++- pandas/core/sparse/series.py | 4 ++ pandas/tests/frame/test_constructors.py | 20 +++++++- pandas/tests/io/test_excel.py | 8 ++-- pandas/tests/io/test_pytables.py | 2 +- pandas/tests/series/test_constructors.py | 14 +++++- pandas/tests/sparse/frame/test_frame.py | 12 +++++ pandas/tests/sparse/series/test_series.py | 14 +++++- pandas/tests/test_panel.py | 14 +++--- 15 files changed, 193 insertions(+), 32 deletions(-) diff --git a/doc/source/dsintro.rst b/doc/source/dsintro.rst index 1ba00b8fb6f23..ca6cefac9e842 100644 --- a/doc/source/dsintro.rst +++ b/doc/source/dsintro.rst @@ -81,9 +81,28 @@ index is passed, one will be created having values ``[0, ..., len(data) - 1]``. **From dict** -If ``data`` is a dict, if **index** is passed the values in data corresponding -to the labels in the index will be pulled out. Otherwise, an index will be -constructed from the sorted keys of the dict, if possible. +Series can be instantiated from dicts: + +.. ipython:: python + + d = {'b' : 1, 'a' : 0, 'c' : 2} + pd.Series(d) + +.. note:: + + When the data is a dict, and an index is not passed, the ``Series`` index + will be ordered by the dict's insertion order, if you're using Python + version >= 3.6 and Pandas version >= 0.23. + + If you're using Python < 3.6 or Pandas < 0.23, and an index is not passed, + the ``Series`` index will be the lexically ordered list of dict keys. + +In the example above, if you were on a Python version lower than 3.6 or a +Pandas version lower than 0.23, the ``Series`` would be ordered by the lexical +order of the dict keys (i.e. ``['a', 'b', 'c']`` rather than ``['b', 'a', 'c']``). + +If an index is passed, the values in data corresponding to the labels in the +index will be pulled out. .. ipython:: python @@ -243,12 +262,22 @@ not matching up to the passed index. If axis labels are not passed, they will be constructed from the input data based on common sense rules. +.. note:: + + When the data is a dict, and ``columns`` is not specified, the ``DataFrame`` + columns will be ordered by the dict's insertion order, if you are using + Python version >= 3.6 and Pandas >= 0.23. + + If you are using Python < 3.6 or Pandas < 0.23, and ``columns`` is not + specified, the ``DataFrame`` columns will be the lexically ordered list of dict + keys. 
+
 From dict of Series or dicts
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 The resulting **index** will be the **union** of the indexes of the various
 Series. If there are any nested dicts, these will first be converted to
-Series. If no columns are passed, the columns will be the sorted list of dict
+Series. If no columns are passed, the columns will be the ordered list of dict
 keys.
 
 .. ipython:: python
diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
index 7a19f87051746..cf2a5de583878 100644
--- a/doc/source/whatsnew/v0.23.0.txt
+++ b/doc/source/whatsnew/v0.23.0.txt
@@ -3,7 +3,7 @@
 v0.23.0
 -------
 
-This is a major release from 0.21.1 and includes a number of API changes,
+This is a major release from 0.22.0 and includes a number of API changes,
 deprecations, new features, enhancements, and performance improvements along
 with a large number of bug fixes. We recommend that all users upgrade to this
 version.
@@ -249,7 +249,7 @@ The :func:`DataFrame.assign` now accepts dependent keyword arguments for python
   using ``.assign()`` to update an existing column. Previously, callables
   referring to other variables being updated would get the "old" values
 
-  Previous Behaviour:
+  Previous Behavior:
 
   .. code-block:: ipython
 
@@ -262,7 +262,7 @@ The :func:`DataFrame.assign` now accepts dependent keyword arguments for python
      1  3 -2
      2  4 -3
 
-  New Behaviour:
+  New Behavior:
 
   .. ipython:: python
 
@@ -361,6 +361,57 @@ If installed, we now require:
 | openpyxl        | 2.4.0           |          |
 +-----------------+-----------------+----------+
 
+.. _whatsnew_0230.api_breaking.dict_insertion_order:
+
+Instantiation from dicts preserves dict insertion order for Python 3.6+
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Until Python 3.6, dicts in Python had no formally defined ordering. For Python
+version 3.6 and later, dicts are ordered by insertion order, see
+`PEP 468 `_.
+Pandas will use the dict's insertion order when creating a ``Series`` or
+``DataFrame`` from a dict, provided you are using Python version 3.6 or
+higher. (:issue:`19884`)
+
+Previous Behavior (and current behavior if on Python < 3.6):
+
+.. code-block:: ipython
+
+   In [1]: pd.Series({'Income': 2000,
+   ...                'Expenses': -1500,
+   ...                'Taxes': -200,
+   ...                'Net result': 300})
+   Expenses     -1500
+   Income        2000
+   Net result     300
+   Taxes         -200
+   dtype: int64
+
+Note the Series above is ordered alphabetically by the index values.
+
+New Behavior (for Python >= 3.6):
+
+.. ipython:: python
+
+   pd.Series({'Income': 2000,
+              'Expenses': -1500,
+              'Taxes': -200,
+              'Net result': 300})
+
+Notice that the Series is now ordered by insertion order. This new behavior is
+used for all relevant pandas types (``Series``, ``DataFrame``, ``SparseSeries``
+and ``SparseDataFrame``).
+
+If you wish to retain the old behavior while using Python >= 3.6, you can use
+``.sort_index()``:
+
+.. ipython:: python
+
+   pd.Series({'Income': 2000,
+              'Expenses': -1500,
+              'Taxes': -200,
+              'Net result': 300}).sort_index()
+
 ..
_whatsnew_0230.api_breaking.deprecate_panel: Deprecate Panel diff --git a/pandas/core/common.py b/pandas/core/common.py index c4fbcf28cbcae..c4890dbd39ef1 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -11,7 +11,7 @@ from pandas._libs import lib, tslib from pandas import compat -from pandas.compat import long, zip, iteritems +from pandas.compat import long, zip, iteritems, PY36, OrderedDict from pandas.core.config import get_option from pandas.core.dtypes.generic import ABCSeries, ABCIndex from pandas.core.dtypes.common import _NS_DTYPE @@ -186,6 +186,16 @@ def _try_sort(iterable): return listed +def _dict_keys_to_ordered_list(mapping): + # when pandas drops support for Python < 3.6, this function + # can be replaced by a simple list(mapping.keys()) + if PY36 or isinstance(mapping, OrderedDict): + keys = list(mapping.keys()) + else: + keys = _try_sort(mapping) + return keys + + def iterpairs(seq): """ Parameters diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ae8fb48a61fce..ff4064b3f8c56 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -252,6 +252,11 @@ class DataFrame(NDFrame): ---------- data : numpy ndarray (structured or homogeneous), dict, or DataFrame Dict can contain Series, arrays, constants, or list-like objects + + .. versionchanged :: 0.23.0 + If data is a dict, argument order is maintained for Python 3.6 + and later. + index : Index or array-like Index to use for resulting frame. Will default to RangeIndex if no indexing information part of input data and no index provided @@ -460,9 +465,7 @@ def _init_dict(self, data, index, columns, dtype=None): arrays.append(v) else: - keys = list(data.keys()) - if not isinstance(data, OrderedDict): - keys = com._try_sort(keys) + keys = com._dict_keys_to_ordered_list(data) columns = data_names = Index(keys) arrays = [data[k] for k in keys] diff --git a/pandas/core/panel.py b/pandas/core/panel.py index fc7fad861df44..052d555df76f1 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -204,10 +204,8 @@ def _init_dict(self, data, axes, dtype=None): for k, v in compat.iteritems(data) if k in haxis) else: - ks = list(data.keys()) - if not isinstance(data, OrderedDict): - ks = com._try_sort(ks) - haxis = Index(ks) + keys = com._dict_keys_to_ordered_list(data) + haxis = Index(keys) for k, v in compat.iteritems(data): if isinstance(v, dict): diff --git a/pandas/core/series.py b/pandas/core/series.py index 660bf3f5d4805..069f0372ab6e1 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -54,7 +54,7 @@ from pandas import compat from pandas.io.formats.terminal import get_terminal_size from pandas.compat import ( - zip, u, OrderedDict, StringIO, range, get_range_parameters) + zip, u, OrderedDict, StringIO, range, get_range_parameters, PY36) from pandas.compat.numpy import function as nv import pandas.core.ops as ops @@ -130,6 +130,11 @@ class Series(base.IndexOpsMixin, generic.NDFrame): ---------- data : array-like, dict, or scalar value Contains data stored in Series + + .. versionchanged :: 0.23.0 + If data is a dict, argument order is maintained for Python 3.6 + and later. + index : array-like or Index (1d) Values must be hashable and have the same length as `data`. Non-unique index values are allowed. 
Will default to @@ -297,7 +302,7 @@ def _init_dict(self, data, index=None, dtype=None): # Now we just make sure the order is respected, if any if index is not None: s = s.reindex(index, copy=False) - elif not isinstance(data, OrderedDict): + elif not PY36 and not isinstance(data, OrderedDict): try: s = s.sort_index() except TypeError: diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index d89b1d681c478..2cefbea722098 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -39,6 +39,10 @@ class SparseDataFrame(DataFrame): Parameters ---------- data : same types as can be passed to DataFrame or scipy.sparse.spmatrix + .. versionchanged :: 0.23.0 + If data is a dict, argument order is maintained for Python 3.6 + and later. + index : array-like, optional column : array-like, optional default_kind : {'block', 'integer'}, default 'block' @@ -138,7 +142,8 @@ def _init_dict(self, data, index, columns, dtype=None): columns = _ensure_index(columns) data = {k: v for k, v in compat.iteritems(data) if k in columns} else: - columns = Index(com._try_sort(list(data.keys()))) + keys = com._dict_keys_to_ordered_list(data) + columns = Index(keys) if index is None: index = extract_index(list(data.values())) diff --git a/pandas/core/sparse/series.py b/pandas/core/sparse/series.py index f8b98a1a40081..714cd09a27294 100644 --- a/pandas/core/sparse/series.py +++ b/pandas/core/sparse/series.py @@ -42,6 +42,10 @@ class SparseSeries(Series): Parameters ---------- data : {array-like, Series, SparseSeries, dict} + .. versionchanged :: 0.23.0 + If data is a dict, argument order is maintained for Python 3.6 + and later. + kind : {'block', 'integer'} fill_value : float Code for missing value. Defaults depends on dtype. diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index e0b94815878dd..499751e864331 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -15,7 +15,7 @@ from pandas.core.dtypes.common import is_integer_dtype from pandas.compat import (lmap, long, zip, range, lrange, lzip, - OrderedDict, is_platform_little_endian) + OrderedDict, is_platform_little_endian, PY36) from pandas import compat from pandas import (DataFrame, Index, Series, isna, MultiIndex, Timedelta, Timestamp, @@ -290,6 +290,24 @@ def test_constructor_dict(self): with tm.assert_raises_regex(ValueError, msg): DataFrame({'a': 0.7}, columns=['b']) + @pytest.mark.skipif(not PY36, reason='Insertion order for Python>=3.6') + def test_constructor_dict_order_insertion(self): + # GH19018 + # initialization ordering: by insertion order if python>= 3.6 + d = {'b': self.ts2, 'a': self.ts1} + frame = DataFrame(data=d) + expected = DataFrame(data=d, columns=list('ba')) + tm.assert_frame_equal(frame, expected) + + @pytest.mark.skipif(PY36, reason='order by value for Python<3.6') + def test_constructor_dict_order_by_values(self): + # GH19018 + # initialization ordering: by value if python<3.6 + d = {'b': self.ts2, 'a': self.ts1} + frame = DataFrame(data=d) + expected = DataFrame(data=d, columns=list('ab')) + tm.assert_frame_equal(frame, expected) + def test_constructor_multi_index(self): # GH 4078 # construction error with mi and all-nan frame diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index 15d3062394d6e..0b80af11520b5 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -762,17 +762,17 @@ def test_read_excel_multiindex_empty_level(self, ext): # GH 12453 with 
ensure_clean('.xlsx') as path: df = DataFrame({ - ('Zero', ''): {0: 0}, ('One', 'x'): {0: 1}, ('Two', 'X'): {0: 3}, - ('Two', 'Y'): {0: 7} + ('Two', 'Y'): {0: 7}, + ('Zero', ''): {0: 0} }) expected = DataFrame({ - ('Zero', 'Unnamed: 3_level_1'): {0: 0}, ('One', u'x'): {0: 1}, ('Two', u'X'): {0: 3}, - ('Two', u'Y'): {0: 7} + ('Two', u'Y'): {0: 7}, + ('Zero', 'Unnamed: 3_level_1'): {0: 0} }) df.to_excel(path) diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index 04da6da74059b..e690b1e302d8b 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -2034,7 +2034,7 @@ def test_table_values_dtypes_roundtrip(self): 'bool': 1, 'int16': 1, 'int8': 1, 'int64': 1, 'object': 1, 'datetime64[ns]': 2}) result = result.sort_index() - result = expected.sort_index() + expected = expected.sort_index() tm.assert_series_equal(result, expected) def test_table_mixed_dtypes(self): diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 25f425ffa0021..e0bfe41645a3f 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -22,7 +22,7 @@ from pandas._libs import lib from pandas._libs.tslib import iNaT -from pandas.compat import lrange, range, zip, long +from pandas.compat import lrange, range, zip, long, PY36 from pandas.util.testing import assert_series_equal import pandas.util.testing as tm @@ -811,6 +811,18 @@ def test_constructor_dict(self): expected.iloc[1] = 1 assert_series_equal(result, expected) + def test_constructor_dict_order(self): + # GH19018 + # initialization ordering: by insertion order if python>= 3.6, else + # order by value + d = {'b': 1, 'a': 0, 'c': 2} + result = Series(d) + if PY36: + expected = Series([1, 0, 2], index=list('bac')) + else: + expected = Series([0, 1, 2], index=list('abc')) + tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("value", [2, np.nan, None, float('nan')]) def test_constructor_dict_nan_key(self, value): # GH 18480 diff --git a/pandas/tests/sparse/frame/test_frame.py b/pandas/tests/sparse/frame/test_frame.py index ee0d63aff7367..1062de3119efc 100644 --- a/pandas/tests/sparse/frame/test_frame.py +++ b/pandas/tests/sparse/frame/test_frame.py @@ -139,6 +139,18 @@ def test_constructor(self): repr(self.frame) + def test_constructor_dict_order(self): + # GH19018 + # initialization ordering: by insertion order if python>= 3.6, else + # order by value + d = {'b': [2, 3], 'a': [0, 1]} + frame = SparseDataFrame(data=d) + if compat.PY36: + expected = SparseDataFrame(data=d, columns=list('ba')) + else: + expected = SparseDataFrame(data=d, columns=list('ab')) + tm.assert_sp_frame_equal(frame, expected) + def test_constructor_ndarray(self): # no index or columns sp = SparseDataFrame(self.frame.values) diff --git a/pandas/tests/sparse/series/test_series.py b/pandas/tests/sparse/series/test_series.py index 3f5d5a59cc540..eb63c87820070 100644 --- a/pandas/tests/sparse/series/test_series.py +++ b/pandas/tests/sparse/series/test_series.py @@ -14,7 +14,7 @@ from pandas.tseries.offsets import BDay import pandas.util.testing as tm import pandas.util._test_decorators as td -from pandas.compat import range +from pandas.compat import range, PY36 from pandas.core.reshape.util import cartesian_product import pandas.core.sparse.frame as spf @@ -114,6 +114,18 @@ def test_constructor_dict_input(self): result = SparseSeries(constructor_dict) tm.assert_sp_series_equal(result, expected) + def test_constructor_dict_order(self): + # 
GH19018 + # initialization ordering: by insertion order if python>= 3.6, else + # order by value + d = {'b': 1, 'a': 0, 'c': 2} + result = SparseSeries(d) + if PY36: + expected = SparseSeries([1, 0, 2], index=list('bac')) + else: + expected = SparseSeries([0, 1, 2], index=list('abc')) + tm.assert_sp_series_equal(result, expected) + def test_constructor_dtype(self): arr = SparseSeries([np.nan, 1, 2, np.nan]) assert arr.dtype == np.float64 diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index 1955fc301be9b..301a7fc437fcf 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -2368,14 +2368,16 @@ def test_update_from_dict(self): pan.update(other) expected = Panel( - {'two': DataFrame([[3.6, 2., 3], - [1.5, np.nan, 7], - [1.5, np.nan, 3.], - [1.5, np.nan, 3.]]), - 'one': DataFrame([[1.5, np.nan, 3.], + {'one': DataFrame([[1.5, np.nan, 3.], [1.5, np.nan, 3.], [1.5, np.nan, 3.], - [1.5, np.nan, 3.]])}) + [1.5, np.nan, 3.]]), + 'two': DataFrame([[3.6, 2., 3], + [1.5, np.nan, 7], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.]]) + } + ) assert_panel_equal(pan, expected) From b167483f08f8827ef97da8b837026090a9980f64 Mon Sep 17 00:00:00 2001 From: Gina Date: Fri, 2 Mar 2018 05:33:49 -0600 Subject: [PATCH 15/16] DOC: update install.rst to include ActivePython distribution (#19908) --- doc/source/install.rst | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/doc/source/install.rst b/doc/source/install.rst index 4ff63d59024b2..e3667221e5166 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -12,7 +12,7 @@ cross platform distribution for data analysis and scientific computing. This is the recommended installation method for most users. Instructions for installing from source, -`PyPI `__, various Linux distributions, or a +`PyPI `__, `ActivePython `__, various Linux distributions, or a `development version `__ are also provided. Python version support @@ -25,8 +25,8 @@ Installing pandas .. _install.anaconda: -Installing pandas with Anaconda -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Installing with Anaconda +~~~~~~~~~~~~~~~~~~~~~~~~ Installing pandas and the rest of the `NumPy `__ and `SciPy `__ stack can be a little @@ -58,8 +58,8 @@ that folder). .. _install.miniconda: -Installing pandas with Miniconda -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Installing with Miniconda +~~~~~~~~~~~~~~~~~~~~~~~~~ The previous section outlined how to get pandas installed as part of the `Anaconda `__ distribution. @@ -134,6 +134,10 @@ pandas can be installed via pip from pip install pandas +Installing with ActivePython +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Installation instructions for `ActivePython `__ can be found `here `__. Installing using your Linux distribution's package manager. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -164,7 +168,7 @@ Installing from source See the :ref:`contributing documentation ` for complete instructions on building from the git source tree. Further, see :ref:`creating a development environment ` if you wish to create a *pandas* development environment. Running the test suite -~~~~~~~~~~~~~~~~~~~~~~ +---------------------- pandas is equipped with an exhaustive set of unit tests, covering about 97% of the codebase as of this writing. To run it on your machine to verify that @@ -299,5 +303,5 @@ Optional Dependencies Without the optional dependencies, many useful features will not work. Hence, it is highly recommended that you install these. 
A packaged
-  distribution like `Anaconda `__, or `Enthought Canopy
+  distribution like `Anaconda `__, `ActivePython `__, or `Enthought Canopy
   `__ may be worth considering.

From a7a7f8c1101aed1a9d37abbbcd80f77da414f0a8 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Fri, 2 Mar 2018 13:49:59 +0100
Subject: [PATCH 16/16] DOC: clarify version of ActivePython that includes
 pandas (#19964)

---
 doc/source/install.rst | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/doc/source/install.rst b/doc/source/install.rst
index e3667221e5166..07f57dbd65709 100644
--- a/doc/source/install.rst
+++ b/doc/source/install.rst
@@ -137,7 +137,10 @@ pandas can be installed via pip from
 Installing with ActivePython
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Installation instructions for `ActivePython `__ can be found `here `__.
+Installation instructions for
+`ActivePython `__ can be found
+`here `__. Versions 2.7
+and 3.5 of ActivePython include pandas.
 
 Installing using your Linux distribution's package manager.
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -303,5 +306,5 @@ Optional Dependencies
 
    Without the optional dependencies, many useful features will not work.
    Hence, it is highly recommended that you install these.  A packaged
-   distribution like `Anaconda `__, `ActivePython `__, or `Enthought Canopy
+   distribution like `Anaconda `__, `ActivePython `__ (version 2.7 or 3.5), or `Enthought Canopy
    `__ may be worth considering.
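
The behavior changes introduced by this series are easy to exercise interactively.
The sketch below is illustrative only and not part of any patch; it assumes a
0.23.0 development build containing these diffs. It demonstrates the new
``skipna`` keyword on groupby ``cumsum``/``cumprod`` (patch 12) and the new
``display.html.use_mathjax`` option (patch 11). ``DataFrame._repr_html_`` is the
private hook Jupyter calls for HTML reprs; it is used here only to inspect the
generated markup.

.. code-block:: python

   import numpy as np
   import pandas as pd

   df = pd.DataFrame({'key': ['a', 'a', 'a'],
                      'value': [1.0, np.nan, 2.0]})

   # Default skipna=True: the NaN position yields NaN, but accumulation
   # resumes afterwards, so the group cumsum is [1.0, NaN, 3.0].
   df.groupby('key')['value'].cumsum()

   # skipna=False: the first NaN poisons the remainder of the group,
   # giving [1.0, NaN, NaN].
   df.groupby('key')['value'].cumsum(skipna=False)

   # Disabling MathJax tags the HTML repr with the 'tex2jax_ignore' CSS
   # class, which stops Jupyter from rendering $...$ content as math.
   with pd.option_context('display.html.use_mathjax', False):
       assert 'tex2jax_ignore' in df._repr_html_()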