diff --git a/README.md b/README.md index c76fbe7df9e6b..8623ee170d154 100644 --- a/README.md +++ b/README.md @@ -123,7 +123,7 @@ conda install pandas - xlrd >= 0.9.0 - [XlsxWriter](https://pypi.python.org/pypi/XlsxWriter) - Alternative Excel writer. -- [Google bq Command Line Tool](https://developers.google.com/bigquery/bq-command-line-tool/) +- [Google bq Command Line Tool](https://cloud.google.com/bigquery/bq-command-line-tool) - Needed for `pandas.io.gbq` - [boto](https://pypi.python.org/pypi/boto): necessary for Amazon S3 access. - One of the following combinations of libraries is needed to use the diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 583b36857c70c..8670ea61dbec2 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -13,9 +13,10 @@ fi if [ x"$DOC_BUILD" != x"" ]; then - # we're running network tests, let's build the docs in the meantim + + # we're running network tests, let's build the docs in the meantime echo "Will build docs" - pip install sphinx==1.1.3 ipython==1.1.0 + conda install sphinx==1.1.3 ipython mv "$TRAVIS_BUILD_DIR"/doc /tmp cd /tmp/doc diff --git a/ci/requirements-3.2.txt b/ci/requirements-3.2.txt index 9ba8fd7ca9393..8c2f675b65603 100644 --- a/ci/requirements-3.2.txt +++ b/ci/requirements-3.2.txt @@ -1,15 +1,4 @@ python-dateutil==2.1 pytz==2013b -xlsxwriter==0.4.6 -xlrd==0.9.2 numpy==1.7.1 cython==0.19.1 -numexpr==2.1 -tables==3.0.0 -matplotlib==1.2.1 -patsy==0.1.0 -lxml==3.2.1 -html5lib -scipy==0.12.0 -beautifulsoup4==4.2.1 -statsmodels==0.5.0 diff --git a/doc/_templates/autosummary/accessor.rst b/doc/_templates/autosummary/accessor.rst new file mode 100644 index 0000000000000..1401121fb51c6 --- /dev/null +++ b/doc/_templates/autosummary/accessor.rst @@ -0,0 +1,6 @@ +{{ fullname }} +{{ underline }} + +.. currentmodule:: {{ module.split('.')[0] }} + +.. automethod:: {{ [module.split('.')[1], objname]|join('.') }} diff --git a/doc/_templates/autosummary/class_without_autosummary.rst b/doc/_templates/autosummary/class_without_autosummary.rst new file mode 100644 index 0000000000000..6676c672b206d --- /dev/null +++ b/doc/_templates/autosummary/class_without_autosummary.rst @@ -0,0 +1,6 @@ +{{ fullname }} +{{ underline }} + +.. currentmodule:: {{ module }} + +.. autoclass:: {{ objname }} diff --git a/doc/source/10min.rst b/doc/source/10min.rst index 94c2d921eb116..1714e00030026 100644 --- a/doc/source/10min.rst +++ b/doc/source/10min.rst @@ -6,18 +6,16 @@ :suppress: import numpy as np - import random + import pandas as pd import os np.random.seed(123456) - from pandas import options - import pandas as pd np.set_printoptions(precision=4, suppress=True) import matplotlib try: matplotlib.style.use('ggplot') except AttributeError: - options.display.mpl_style = 'default' - options.display.max_rows=15 + pd.options.display.mpl_style = 'default' + pd.options.display.max_rows = 15 #### portions of this were borrowed from the #### Pandas cheatsheet @@ -298,7 +296,7 @@ Using the :func:`~Series.isin` method for filtering: .. ipython:: python df2 = df.copy() - df2['E']=['one', 'one','two','three','four','three'] + df2['E'] = ['one', 'one','two','three','four','three'] df2 df2[df2['E'].isin(['two','four'])] @@ -310,7 +308,7 @@ by the indexes .. ipython:: python - s1 = pd.Series([1,2,3,4,5,6],index=pd.date_range('20130102',periods=6)) + s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20130102', periods=6)) s1 df['F'] = s1 @@ -359,7 +357,7 @@ returns a copy of the data. .. 
ipython:: python - df1 = df.reindex(index=dates[0:4],columns=list(df.columns) + ['E']) + df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E']) df1.loc[dates[0]:dates[1],'E'] = 1 df1 @@ -409,9 +407,9 @@ In addition, pandas automatically broadcasts along the specified dimension. .. ipython:: python - s = pd.Series([1,3,5,np.nan,6,8],index=dates).shift(2) + s = pd.Series([1,3,5,np.nan,6,8], index=dates).shift(2) s - df.sub(s,axis='index') + df.sub(s, axis='index') Apply @@ -431,7 +429,7 @@ See more at :ref:`Histogramming and Discretization ` .. ipython:: python - s = pd.Series(np.random.randint(0,7,size=10)) + s = pd.Series(np.random.randint(0, 7, size=10)) s s.value_counts() @@ -516,9 +514,9 @@ See the :ref:`Grouping section ` .. ipython:: python df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], + 'foo', 'bar', 'foo', 'foo'], 'B' : ['one', 'one', 'two', 'three', - 'two', 'two', 'one', 'three'], + 'two', 'two', 'one', 'three'], 'C' : np.random.randn(8), 'D' : np.random.randn(8)}) df diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst index 262c439cde636..850f59c2713eb 100644 --- a/doc/source/advanced.rst +++ b/doc/source/advanced.rst @@ -6,15 +6,10 @@ :suppress: import numpy as np - import random - np.random.seed(123456) - from pandas import * - options.display.max_rows=15 import pandas as pd - randn = np.random.randn - randint = np.random.randint + np.random.seed(123456) np.set_printoptions(precision=4, suppress=True) - from pandas.compat import range, zip + pd.options.display.max_rows=15 ****************************** MultiIndex / Advanced Indexing @@ -80,10 +75,10 @@ demo different ways to initialize MultiIndexes. tuples = list(zip(*arrays)) tuples - index = MultiIndex.from_tuples(tuples, names=['first', 'second']) + index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second']) index - s = Series(randn(8), index=index) + s = pd.Series(np.random.randn(8), index=index) s When you want every pairing of the elements in two iterables, it can be easier @@ -92,7 +87,7 @@ to use the ``MultiIndex.from_product`` function: .. ipython:: python iterables = [['bar', 'baz', 'foo', 'qux'], ['one', 'two']] - MultiIndex.from_product(iterables, names=['first', 'second']) + pd.MultiIndex.from_product(iterables, names=['first', 'second']) As a convenience, you can pass a list of arrays directly into Series or DataFrame to construct a MultiIndex automatically: @@ -101,9 +96,9 @@ DataFrame to construct a MultiIndex automatically: arrays = [np.array(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux']), np.array(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'])] - s = Series(randn(8), index=arrays) + s = pd.Series(np.random.randn(8), index=arrays) s - df = DataFrame(randn(8, 4), index=arrays) + df = pd.DataFrame(np.random.randn(8, 4), index=arrays) df All of the ``MultiIndex`` constructors accept a ``names`` argument which stores @@ -119,9 +114,9 @@ of the index is up to you: .. ipython:: python - df = DataFrame(randn(3, 8), index=['A', 'B', 'C'], columns=index) + df = pd.DataFrame(np.random.randn(3, 8), index=['A', 'B', 'C'], columns=index) df - DataFrame(randn(6, 6), index=index[:6], columns=index[:6]) + pd.DataFrame(np.random.randn(6, 6), index=index[:6], columns=index[:6]) We've "sparsified" the higher levels of the indexes to make the console output a bit easier on the eyes. @@ -131,7 +126,7 @@ tuples as atomic labels on an axis: .. 
ipython:: python - Series(randn(8), index=tuples) + pd.Series(np.random.randn(8), index=tuples) The reason that the ``MultiIndex`` matters is that it can allow you to do grouping, selection, and reshaping operations as we will describe below and in @@ -282,16 +277,16 @@ As usual, **both sides** of the slicers are included as this is label indexing. def mklbl(prefix,n): return ["%s%s" % (prefix,i) for i in range(n)] - miindex = MultiIndex.from_product([mklbl('A',4), - mklbl('B',2), - mklbl('C',4), - mklbl('D',2)]) - micolumns = MultiIndex.from_tuples([('a','foo'),('a','bar'), - ('b','foo'),('b','bah')], - names=['lvl0', 'lvl1']) - dfmi = DataFrame(np.arange(len(miindex)*len(micolumns)).reshape((len(miindex),len(micolumns))), - index=miindex, - columns=micolumns).sortlevel().sortlevel(axis=1) + miindex = pd.MultiIndex.from_product([mklbl('A',4), + mklbl('B',2), + mklbl('C',4), + mklbl('D',2)]) + micolumns = pd.MultiIndex.from_tuples([('a','foo'),('a','bar'), + ('b','foo'),('b','bah')], + names=['lvl0', 'lvl1']) + dfmi = pd.DataFrame(np.arange(len(miindex)*len(micolumns)).reshape((len(miindex),len(micolumns))), + index=miindex, + columns=micolumns).sortlevel().sortlevel(axis=1) dfmi Basic multi-index slicing using slices, lists, and labels. @@ -418,9 +413,9 @@ instance: .. ipython:: python - midx = MultiIndex(levels=[['zero', 'one'], ['x','y']], - labels=[[1,1,0,0],[1,0,1,0]]) - df = DataFrame(randn(4,2), index=midx) + midx = pd.MultiIndex(levels=[['zero', 'one'], ['x','y']], + labels=[[1,1,0,0],[1,0,1,0]]) + df = pd.DataFrame(np.random.randn(4,2), index=midx) df df2 = df.mean(level=0) df2 @@ -471,7 +466,7 @@ labels will be sorted lexicographically! .. ipython:: python import random; random.shuffle(tuples) - s = Series(randn(8), index=MultiIndex.from_tuples(tuples)) + s = pd.Series(np.random.randn(8), index=pd.MultiIndex.from_tuples(tuples)) s s.sortlevel(0) s.sortlevel(1) @@ -509,13 +504,13 @@ an exception. Here is a concrete example to illustrate this: .. ipython:: python tuples = [('a', 'a'), ('a', 'b'), ('b', 'a'), ('b', 'b')] - idx = MultiIndex.from_tuples(tuples) + idx = pd.MultiIndex.from_tuples(tuples) idx.lexsort_depth reordered = idx[[1, 0, 3, 2]] reordered.lexsort_depth - s = Series(randn(4), index=reordered) + s = pd.Series(np.random.randn(4), index=reordered) s.ix['a':'a'] However: @@ -540,7 +535,7 @@ index positions. ``take`` will also accept negative integers as relative positio .. ipython:: python - index = Index(randint(0, 1000, 10)) + index = pd.Index(np.random.randint(0, 1000, 10)) index positions = [0, 9, 3] @@ -548,7 +543,7 @@ index positions. ``take`` will also accept negative integers as relative positio index[positions] index.take(positions) - ser = Series(randn(10)) + ser = pd.Series(np.random.randn(10)) ser.iloc[positions] ser.take(positions) @@ -558,7 +553,7 @@ row or column positions. .. ipython:: python - frm = DataFrame(randn(5, 3)) + frm = pd.DataFrame(np.random.randn(5, 3)) frm.take([1, 4, 3]) @@ -569,11 +564,11 @@ intended to work on boolean indices and may return unexpected results. .. ipython:: python - arr = randn(10) + arr = np.random.randn(10) arr.take([False, False, True, True]) arr[[0, 1]] - ser = Series(randn(10)) + ser = pd.Series(np.random.randn(10)) ser.take([False, False, True, True]) ser.ix[[0, 1]] @@ -583,14 +578,14 @@ faster than fancy indexing. .. 
ipython:: - arr = randn(10000, 5) + arr = np.random.randn(10000, 5) indexer = np.arange(10000) random.shuffle(indexer) timeit arr[indexer] timeit arr.take(indexer, axis=0) - ser = Series(arr[:, 0]) + ser = pd.Series(arr[:, 0]) timeit ser.ix[indexer] timeit ser.take(indexer) @@ -608,10 +603,9 @@ setting the index of a ``DataFrame/Series`` with a ``category`` dtype would conv .. ipython:: python - df = DataFrame({'A' : np.arange(6), - 'B' : Series(list('aabbca')).astype('category', - categories=list('cab')) - }) + df = pd.DataFrame({'A': np.arange(6), + 'B': list('aabbca')}) + df['B'] = df['B'].astype('category', categories=list('cab')) df df.dtypes df.B.cat.categories @@ -669,15 +663,15 @@ values NOT in the categories, similarly to how you can reindex ANY pandas index. .. code-block:: python - In [10]: df3 = DataFrame({'A' : np.arange(6), - 'B' : Series(list('aabbca')).astype('category', - categories=list('abc')) - }).set_index('B') + In [9]: df3 = pd.DataFrame({'A' : np.arange(6), + 'B' : pd.Series(list('aabbca')).astype('category')}) + + In [11]: df3 = df3.set_index('B') In [11]: df3.index Out[11]: CategoricalIndex([u'a', u'a', u'b', u'b', u'c', u'a'], categories=[u'a', u'b', u'c'], ordered=False, name=u'B', dtype='category') - In [12]: pd.concat([df2,df3] + In [12]: pd.concat([df2, df3] TypeError: categories must match existing categories when appending .. _indexing.float64index: @@ -702,9 +696,9 @@ same. .. ipython:: python - indexf = Index([1.5, 2, 3, 4.5, 5]) + indexf = pd.Index([1.5, 2, 3, 4.5, 5]) indexf - sf = Series(range(5),index=indexf) + sf = pd.Series(range(5), index=indexf) sf Scalar selection for ``[],.ix,.loc`` will always be label based. An integer will match an equal float index (e.g. ``3`` is equivalent to ``3.0``) @@ -746,17 +740,17 @@ In non-float indexes, slicing using floats will raise a ``TypeError`` .. code-block:: python - In [1]: Series(range(5))[3.5] + In [1]: pd.Series(range(5))[3.5] TypeError: the label [3.5] is not a proper indexer for this index type (Int64Index) - In [1]: Series(range(5))[3.5:4.5] + In [1]: pd.Series(range(5))[3.5:4.5] TypeError: the slice start [3.5] is not a proper indexer for this index type (Int64Index) Using a scalar float indexer will be deprecated in a future version, but is allowed for now. .. code-block:: python - In [3]: Series(range(5))[3.0] + In [3]: pd.Series(range(5))[3.0] Out[3]: 3 Here is a typical use-case for using this type of indexing. Imagine that you have a somewhat @@ -765,12 +759,12 @@ example be millisecond offsets. .. ipython:: python - dfir = concat([DataFrame(randn(5,2), - index=np.arange(5) * 250.0, - columns=list('AB')), - DataFrame(randn(6,2), - index=np.arange(4,10) * 250.1, - columns=list('AB'))]) + dfir = pd.concat([pd.DataFrame(np.random.randn(5,2), + index=np.arange(5) * 250.0, + columns=list('AB')), + pd.DataFrame(np.random.randn(6,2), + index=np.arange(4,10) * 250.1, + columns=list('AB'))]) dfir Selection operations then will always work on a value basis, for all selection operators. diff --git a/doc/source/api.rst b/doc/source/api.rst index 57ae089e463c8..f5ba03afc9f19 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -358,6 +358,8 @@ Computations / Descriptive Stats Series.median Series.min Series.mode + Series.nlargest + Series.nsmallest Series.pct_change Series.prod Series.quantile @@ -470,6 +472,7 @@ These can be accessed like ``Series.dt.``. 
Series.dt.microsecond Series.dt.nanosecond Series.dt.second + Series.dt.week Series.dt.weekofyear Series.dt.dayofweek Series.dt.weekday @@ -481,6 +484,10 @@ These can be accessed like ``Series.dt.``. Series.dt.is_quarter_end Series.dt.is_year_start Series.dt.is_year_end + Series.dt.daysinmonth + Series.dt.days_in_month + Series.dt.tz + Series.dt.freq **Datetime Methods** @@ -575,6 +582,20 @@ strings and apply several methods to it. These can be acccessed like Series.str.isdecimal Series.str.get_dummies +.. + The following is needed to ensure the generated pages are created with the + correct template (otherwise they would be created in the Series class page) + +.. + .. autosummary:: + :toctree: generated/ + :template: autosummary/accessor.rst + + Series.str + Series.cat + Series.dt + + .. _api.categorical: Categorical @@ -582,22 +603,28 @@ Categorical If the Series is of dtype ``category``, ``Series.cat`` can be used to change the the categorical data. This accessor is similar to the ``Series.dt`` or ``Series.str`` and has the -following usable methods and properties (all available as ``Series.cat.``). +following usable methods and properties: + +.. autosummary:: + :toctree: generated/ + :template: autosummary/accessor_attribute.rst + + Series.cat.categories + Series.cat.ordered + Series.cat.codes .. autosummary:: :toctree: generated/ + :template: autosummary/accessor_method.rst - Categorical.categories - Categorical.ordered - Categorical.rename_categories - Categorical.reorder_categories - Categorical.add_categories - Categorical.remove_categories - Categorical.remove_unused_categories - Categorical.set_categories - Categorical.as_ordered - Categorical.as_unordered - Categorical.codes + Series.cat.rename_categories + Series.cat.reorder_categories + Series.cat.add_categories + Series.cat.remove_categories + Series.cat.remove_unused_categories + Series.cat.set_categories + Series.cat.as_ordered + Series.cat.as_unordered To create a Series of dtype ``category``, use ``cat = s.astype("category")``. @@ -606,8 +633,13 @@ adding ordering information or special categories is need at creation time of th .. autosummary:: :toctree: generated/ + :template: autosummary/class_without_autosummary.rst Categorical + +.. autosummary:: + :toctree: generated/ + Categorical.from_codes ``np.asarray(categorical)`` works by implementing the array interface. Be aware, that this converts @@ -1232,8 +1264,6 @@ Modifying and Computations Index.argmax Index.copy Index.delete - Index.diff - Index.sym_diff Index.drop Index.drop_duplicates Index.duplicated @@ -1279,15 +1309,17 @@ Time-specific operations Index.shift -Combining / joining / merging -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Combining / joining / set operations +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autosummary:: :toctree: generated/ Index.append - Index.intersection Index.join + Index.intersection Index.union + Index.difference + Index.sym_diff Selecting ~~~~~~~~~ diff --git a/doc/source/basics.rst b/doc/source/basics.rst index 76efdc0553c7d..d16feb3a6c448 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -1,16 +1,14 @@ .. currentmodule:: pandas -.. _basics: .. ipython:: python :suppress: import numpy as np - from pandas import * - randn = np.random.randn + import pandas as pd np.set_printoptions(precision=4, suppress=True) - from pandas.compat import lrange - options.display.max_rows=15 + pd.options.display.max_rows = 15 +.. _basics: ============================== Essential Basic Functionality @@ -22,13 +20,13 @@ the previous section: .. 
ipython:: python - index = date_range('1/1/2000', periods=8) - s = Series(randn(5), index=['a', 'b', 'c', 'd', 'e']) - df = DataFrame(randn(8, 3), index=index, - columns=['A', 'B', 'C']) - wp = Panel(randn(2, 5, 4), items=['Item1', 'Item2'], - major_axis=date_range('1/1/2000', periods=5), - minor_axis=['A', 'B', 'C', 'D']) + index = pd.date_range('1/1/2000', periods=8) + s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e']) + df = pd.DataFrame(np.random.randn(8, 3), index=index, + columns=['A', 'B', 'C']) + wp = pd.Panel(np.random.randn(2, 5, 4), items=['Item1', 'Item2'], + major_axis=pd.date_range('1/1/2000', periods=5), + minor_axis=['A', 'B', 'C', 'D']) .. _basics.head_tail: @@ -41,7 +39,7 @@ of elements to display is five, but you may pass a custom number. .. ipython:: python - long_series = Series(randn(1000)) + long_series = pd.Series(np.random.randn(1000)) long_series.head() long_series.tail(3) @@ -143,9 +141,9 @@ either match on the *index* or *columns* via the **axis** keyword: .. ipython:: python - df = DataFrame({'one' : Series(randn(3), index=['a', 'b', 'c']), - 'two' : Series(randn(4), index=['a', 'b', 'c', 'd']), - 'three' : Series(randn(3), index=['b', 'c', 'd'])}) + df = pd.DataFrame({'one' : pd.Series(np.random.randn(3), index=['a', 'b', 'c']), + 'two' : pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd']), + 'three' : pd.Series(np.random.randn(3), index=['b', 'c', 'd'])}) df row = df.ix[1] column = df['two'] @@ -166,8 +164,8 @@ Furthermore you can align a level of a multi-indexed DataFrame with a Series. .. ipython:: python dfmi = df.copy() - dfmi.index = MultiIndex.from_tuples([(1,'a'),(1,'b'),(1,'c'),(2,'a')], - names=['first','second']) + dfmi.index = pd.MultiIndex.from_tuples([(1,'a'),(1,'b'),(1,'c'),(2,'a')], + names=['first','second']) dfmi.sub(column, axis=0, level='second') With Panel, describing the matching behavior is a bit more difficult, so @@ -236,7 +234,7 @@ see :ref:`here` Boolean Reductions ~~~~~~~~~~~~~~~~~~ - You can apply the reductions: :attr:`~DataFrame.empty`, :meth:`~DataFrame.any`, +You can apply the reductions: :attr:`~DataFrame.empty`, :meth:`~DataFrame.any`, :meth:`~DataFrame.all`, and :meth:`~DataFrame.bool` to provide a way to summarize a boolean result. @@ -256,17 +254,17 @@ You can test if a pandas object is empty, via the :attr:`~DataFrame.empty` prope .. ipython:: python df.empty - DataFrame(columns=list('ABC')).empty + pd.DataFrame(columns=list('ABC')).empty To evaluate single-element pandas objects in a boolean context, use the method :meth:`~DataFrame.bool`: .. ipython:: python - Series([True]).bool() - Series([False]).bool() - DataFrame([[True]]).bool() - DataFrame([[False]]).bool() + pd.Series([True]).bool() + pd.Series([False]).bool() + pd.DataFrame([[True]]).bool() + pd.DataFrame([[False]]).bool() .. warning:: @@ -327,8 +325,8 @@ equality to be True: .. ipython:: python - df1 = DataFrame({'col':['foo', 0, np.nan]}) - df2 = DataFrame({'col':[np.nan, 0, 'foo']}, index=[2,1,0]) + df1 = pd.DataFrame({'col':['foo', 0, np.nan]}) + df2 = pd.DataFrame({'col':[np.nan, 0, 'foo']}, index=[2,1,0]) df1.equals(df2) df1.equals(df2.sort()) @@ -348,10 +346,10 @@ which we illustrate: .. 
ipython:: python - df1 = DataFrame({'A' : [1., np.nan, 3., 5., np.nan], - 'B' : [np.nan, 2., 3., np.nan, 6.]}) - df2 = DataFrame({'A' : [5., 2., 4., np.nan, 3., 7.], - 'B' : [np.nan, np.nan, 3., 4., 6., 8.]}) + df1 = pd.DataFrame({'A' : [1., np.nan, 3., 5., np.nan], + 'B' : [np.nan, 2., 3., np.nan, 6.]}) + df2 = pd.DataFrame({'A' : [5., 2., 4., np.nan, 3., 7.], + 'B' : [np.nan, np.nan, 3., 4., 6., 8.]}) df1 df2 df1.combine_first(df2) @@ -368,7 +366,7 @@ So, for instance, to reproduce :meth:`~DataFrame.combine_first` as above: .. ipython:: python - combiner = lambda x, y: np.where(isnull(x), y, x) + combiner = lambda x, y: np.where(pd.isnull(x), y, x) df1.combine(df2, combiner) .. _basics.stats: @@ -467,7 +465,7 @@ number of unique non-null values: .. ipython:: python - series = Series(randn(500)) + series = pd.Series(np.random.randn(500)) series[20:500] = np.nan series[10:20] = 5 series.nunique() @@ -483,10 +481,10 @@ course): .. ipython:: python - series = Series(randn(1000)) + series = pd.Series(np.random.randn(1000)) series[::2] = np.nan series.describe() - frame = DataFrame(randn(1000, 5), columns=['a', 'b', 'c', 'd', 'e']) + frame = pd.DataFrame(np.random.randn(1000, 5), columns=['a', 'b', 'c', 'd', 'e']) frame.ix[::2] = np.nan frame.describe() @@ -503,7 +501,7 @@ summary of the number of unique values and most frequently occurring values: .. ipython:: python - s = Series(['a', 'a', 'b', 'b', 'a', 'a', np.nan, 'c', 'd', 'a']) + s = pd.Series(['a', 'a', 'b', 'b', 'a', 'a', np.nan, 'c', 'd', 'a']) s.describe() Note that on a mixed-type DataFrame object, :meth:`~DataFrame.describe` will @@ -512,7 +510,7 @@ categorical columns: .. ipython:: python - frame = DataFrame({'a': ['Yes', 'Yes', 'No', 'No'], 'b': range(4)}) + frame = pd.DataFrame({'a': ['Yes', 'Yes', 'No', 'No'], 'b': range(4)}) frame.describe() This behaviour can be controlled by providing a list of types as ``include``/``exclude`` @@ -538,11 +536,11 @@ corresponding values: .. ipython:: python - s1 = Series(randn(5)) + s1 = pd.Series(np.random.randn(5)) s1 s1.idxmin(), s1.idxmax() - df1 = DataFrame(randn(5,3), columns=['A','B','C']) + df1 = pd.DataFrame(np.random.randn(5,3), columns=['A','B','C']) df1 df1.idxmin(axis=0) df1.idxmax(axis=1) @@ -553,7 +551,7 @@ matching index: .. ipython:: python - df3 = DataFrame([2, 1, 1, 3, np.nan], columns=['A'], index=list('edcba')) + df3 = pd.DataFrame([2, 1, 1, 3, np.nan], columns=['A'], index=list('edcba')) df3 df3['A'].idxmin() @@ -573,18 +571,18 @@ of a 1D array of values. It can also be used as a function on regular arrays: data = np.random.randint(0, 7, size=50) data - s = Series(data) + s = pd.Series(data) s.value_counts() - value_counts(data) + pd.value_counts(data) Similarly, you can get the most frequently occurring value(s) (the mode) of the values in a Series or DataFrame: .. ipython:: python - s5 = Series([1, 1, 3, 3, 3, 5, 5, 7, 7, 7]) + s5 = pd.Series([1, 1, 3, 3, 3, 5, 5, 7, 7, 7]) s5.mode() - df5 = DataFrame({"A": np.random.randint(0, 7, size=50), - "B": np.random.randint(-10, 15, size=50)}) + df5 = pd.DataFrame({"A": np.random.randint(0, 7, size=50), + "B": np.random.randint(-10, 15, size=50)}) df5.mode() @@ -597,10 +595,10 @@ and :func:`qcut` (bins based on sample quantiles) functions: .. ipython:: python arr = np.random.randn(20) - factor = cut(arr, 4) + factor = pd.cut(arr, 4) factor - factor = cut(arr, [-5, -1, 0, 1, 5]) + factor = pd.cut(arr, [-5, -1, 0, 1, 5]) factor :func:`qcut` computes sample quantiles. 
For example, we could slice up some @@ -609,16 +607,16 @@ normally distributed data into equal-size quartiles like so: .. ipython:: python arr = np.random.randn(30) - factor = qcut(arr, [0, .25, .5, .75, 1]) + factor = pd.qcut(arr, [0, .25, .5, .75, 1]) factor - value_counts(factor) + pd.value_counts(factor) We can also pass infinite values to define the bins: .. ipython:: python arr = np.random.randn(20) - factor = cut(arr, [-np.inf, 0, np.inf]) + factor = pd.cut(arr, [-np.inf, 0, np.inf]) factor .. _basics.apply: @@ -647,8 +645,8 @@ maximum value for each column occurred: .. ipython:: python - tsdf = DataFrame(randn(1000, 3), columns=['A', 'B', 'C'], - index=date_range('1/1/2000', periods=1000)) + tsdf = pd.DataFrame(np.random.randn(1000, 3), columns=['A', 'B', 'C'], + index=pd.date_range('1/1/2000', periods=1000)) tsdf.apply(lambda x: x.idxmax()) You may also pass additional arguments and keyword arguments to the :meth:`~DataFrame.apply` @@ -671,14 +669,14 @@ Series operation on each column or row: .. ipython:: python :suppress: - tsdf = DataFrame(randn(10, 3), columns=['A', 'B', 'C'], - index=date_range('1/1/2000', periods=10)) + tsdf = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C'], + index=pd.date_range('1/1/2000', periods=10)) tsdf.values[3:7] = np.nan .. ipython:: python tsdf - tsdf.apply(Series.interpolate) + tsdf.apply(pd.Series.interpolate) Finally, :meth:`~DataFrame.apply` takes an argument ``raw`` which is False by default, which converts each row or column into a Series before applying the function. When @@ -718,9 +716,9 @@ to :ref:`merging/joining functionality `: .. ipython:: python - s = Series(['six', 'seven', 'six', 'seven', 'six'], - index=['a', 'b', 'c', 'd', 'e']) - t = Series({'six' : 6., 'seven' : 7.}) + s = pd.Series(['six', 'seven', 'six', 'seven', 'six'], + index=['a', 'b', 'c', 'd', 'e']) + t = pd.Series({'six' : 6., 'seven' : 7.}) s s.map(t) @@ -797,7 +795,7 @@ This is equivalent to the following .. ipython:: python - result = Panel(dict([ (ax,f(panel.loc[:,:,ax])) + result = pd.Panel(dict([ (ax, f(panel.loc[:,:,ax])) for ax in panel.minor_axis ])) result result.loc[:,:,'ItemA'] @@ -823,7 +821,7 @@ Here is a simple example: .. ipython:: python - s = Series(randn(5), index=['a', 'b', 'c', 'd', 'e']) + s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e']) s s.reindex(['e', 'b', 'f', 'd']) @@ -909,7 +907,7 @@ It returns a tuple with both of the reindexed Series: .. ipython:: python - s = Series(randn(5), index=['a', 'b', 'c', 'd', 'e']) + s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e']) s1 = s[:4] s2 = s[1:] s1.align(s2) @@ -960,8 +958,8 @@ We illustrate these fill methods on a simple Series: .. ipython:: python - rng = date_range('1/3/2000', periods=8) - ts = Series(randn(8), index=rng) + rng = pd.date_range('1/3/2000', periods=8) + ts = pd.Series(np.random.randn(8), index=rng) ts2 = ts[[0, 3, 6]] ts ts2 @@ -1095,11 +1093,11 @@ For instance, a contrived way to transpose the DataFrame would be: .. ipython:: python - df2 = DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]}) + df2 = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]}) print(df2) print(df2.T) - df2_t = DataFrame(dict((idx,values) for idx, values in df2.iterrows())) + df2_t = pd.DataFrame(dict((idx,values) for idx, values in df2.iterrows())) print(df2_t) .. note:: @@ -1109,7 +1107,7 @@ For instance, a contrived way to transpose the DataFrame would be: .. 
ipython:: python - df_iter = DataFrame([[1, 1.0]], columns=['x', 'y']) + df_iter = pd.DataFrame([[1, 1.0]], columns=['x', 'y']) row = next(df_iter.iterrows())[1] print(row['x'].dtype) print(df_iter['x'].dtype) @@ -1140,7 +1138,7 @@ This will return a Series, indexed like the existing Series. .. ipython:: python # datetime - s = Series(date_range('20130101 09:10:12',periods=4)) + s = pd.Series(pd.date_range('20130101 09:10:12',periods=4)) s s.dt.hour s.dt.second @@ -1171,7 +1169,7 @@ The ``.dt`` accessor works for period and timedelta dtypes. .. ipython:: python # period - s = Series(period_range('20130101',periods=4,freq='D')) + s = pd.Series(pd.period_range('20130101', periods=4,freq='D')) s s.dt.year s.dt.day @@ -1179,7 +1177,7 @@ The ``.dt`` accessor works for period and timedelta dtypes. .. ipython:: python # timedelta - s = Series(timedelta_range('1 day 00:00:05',periods=4,freq='s')) + s = pd.Series(pd.timedelta_range('1 day 00:00:05',periods=4,freq='s')) s s.dt.days s.dt.seconds @@ -1200,7 +1198,7 @@ built-in string methods. For example: .. ipython:: python - s = Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat']) + s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat']) s.str.lower() Powerful pattern-matching methods are provided as well, but note that @@ -1234,7 +1232,7 @@ determine the sort order: .. ipython:: python - df1 = DataFrame({'one':[2,1,1,1],'two':[1,3,2,4],'three':[5,4,3,2]}) + df1 = pd.DataFrame({'one':[2,1,1,1],'two':[1,3,2,4],'three':[5,4,3,2]}) df1.sort_index(by='two') The ``by`` argument can take a list of column names, e.g.: @@ -1265,12 +1263,12 @@ Series has the :meth:`~Series.searchsorted` method, which works similar to .. ipython:: python - ser = Series([1, 2, 3]) + ser = pd.Series([1, 2, 3]) ser.searchsorted([0, 3]) ser.searchsorted([0, 4]) ser.searchsorted([1, 3], side='right') ser.searchsorted([1, 3], side='left') - ser = Series([3, 1, 2]) + ser = pd.Series([3, 1, 2]) ser.searchsorted([0, 3], sorter=np.argsort(ser)) .. _basics.nsorted: @@ -1286,7 +1284,7 @@ faster than sorting the entire Series and calling ``head(n)`` on the result. .. ipython:: python - s = Series(np.random.permutation(10)) + s = pd.Series(np.random.permutation(10)) s s.order() s.nsmallest(3) @@ -1303,7 +1301,7 @@ all levels to ``by``. .. ipython:: python - df1.columns = MultiIndex.from_tuples([('a','one'),('a','two'),('b','three')]) + df1.columns = pd.MultiIndex.from_tuples([('a','one'),('a','two'),('b','three')]) df1.sort_index(by=('a','two')) @@ -1336,13 +1334,13 @@ attribute for DataFrames returns a Series with the data type of each column. .. ipython:: python - dft = DataFrame(dict( A = np.random.rand(3), - B = 1, - C = 'foo', - D = Timestamp('20010102'), - E = Series([1.0]*3).astype('float32'), - F = False, - G = Series([1]*3,dtype='int8'))) + dft = pd.DataFrame(dict(A = np.random.rand(3), + B = 1, + C = 'foo', + D = pd.Timestamp('20010102'), + E = pd.Series([1.0]*3).astype('float32'), + F = False, + G = pd.Series([1]*3,dtype='int8'))) dft dft.dtypes @@ -1359,10 +1357,10 @@ general). .. ipython:: python # these ints are coerced to floats - Series([1, 2, 3, 4, 5, 6.]) + pd.Series([1, 2, 3, 4, 5, 6.]) # string data forces an ``object`` dtype - Series([1, 2, 3, 6., 'foo']) + pd.Series([1, 2, 3, 6., 'foo']) The method :meth:`~DataFrame.get_dtype_counts` will return the number of columns of each type in a ``DataFrame``: @@ -1378,12 +1376,12 @@ different numeric dtypes will **NOT** be combined. The following example will gi .. 
ipython:: python - df1 = DataFrame(randn(8, 1), columns = ['A'], dtype = 'float32') + df1 = pd.DataFrame(np.random.randn(8, 1), columns=['A'], dtype='float32') df1 df1.dtypes - df2 = DataFrame(dict( A = Series(randn(8),dtype='float16'), - B = Series(randn(8)), - C = Series(np.array(randn(8),dtype='uint8')) )) + df2 = pd.DataFrame(dict( A = pd.Series(np.random.randn(8), dtype='float16'), + B = pd.Series(np.random.randn(8)), + C = pd.Series(np.array(np.random.randn(8), dtype='uint8')) )) df2 df2.dtypes @@ -1395,16 +1393,16 @@ By default integer types are ``int64`` and float types are ``float64``, .. ipython:: python - DataFrame([1, 2], columns=['a']).dtypes - DataFrame({'a': [1, 2]}).dtypes - DataFrame({'a': 1 }, index=list(range(2))).dtypes + pd.DataFrame([1, 2], columns=['a']).dtypes + pd.DataFrame({'a': [1, 2]}).dtypes + pd.DataFrame({'a': 1 }, index=list(range(2))).dtypes Numpy, however will choose *platform-dependent* types when creating arrays. The following **WILL** result in ``int32`` on 32-bit platform. .. ipython:: python - frame = DataFrame(np.array([1, 2])) + frame = pd.DataFrame(np.array([1, 2])) upcasting @@ -1473,9 +1471,10 @@ but occasionally has non-dates intermixed and you want to represent as missing. .. ipython:: python - s = Series([datetime(2001,1,1,0,0), - 'foo', 1.0, 1, Timestamp('20010104'), - '20010105'],dtype='O') + import datetime + s = pd.Series([datetime.datetime(2001,1,1,0,0), + 'foo', 1.0, 1, pd.Timestamp('20010104'), + '20010105'], dtype='O') s s.convert_objects(convert_dates='coerce') @@ -1527,14 +1526,14 @@ dtypes: .. ipython:: python - df = DataFrame({'string': list('abc'), - 'int64': list(range(1, 4)), - 'uint8': np.arange(3, 6).astype('u1'), - 'float64': np.arange(4.0, 7.0), - 'bool1': [True, False, True], - 'bool2': [False, True, False], - 'dates': pd.date_range('now', periods=3).values, - 'category': pd.Categorical(list("ABC"))}) + df = pd.DataFrame({'string': list('abc'), + 'int64': list(range(1, 4)), + 'uint8': np.arange(3, 6).astype('u1'), + 'float64': np.arange(4.0, 7.0), + 'bool1': [True, False, True], + 'bool2': [False, True, False], + 'dates': pd.date_range('now', periods=3).values, + 'category': pd.Series(list("ABC")).astype('category')}) df['tdeltas'] = df.dates.diff() df['uint64'] = np.arange(3, 6).astype('u8') df['other_dates'] = pd.date_range('20130101', periods=3).values diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index 11e7fb0fd4117..0c63759201517 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -6,14 +6,10 @@ :suppress: import numpy as np - import random - import os - np.random.seed(123456) - from pandas import options - from pandas import * import pandas as pd + np.random.seed(123456) np.set_printoptions(precision=4, suppress=True) - options.display.max_rows=15 + pd.options.display.max_rows = 15 **************** @@ -65,14 +61,14 @@ By specifying ``dtype="category"`` when constructing a `Series`: .. ipython:: python - s = Series(["a","b","c","a"], dtype="category") + s = pd.Series(["a","b","c","a"], dtype="category") s By converting an existing `Series` or column to a ``category`` dtype: .. ipython:: python - df = DataFrame({"A":["a","b","c","a"]}) + df = pd.DataFrame({"A":["a","b","c","a"]}) df["B"] = df["A"].astype('category') df @@ -80,7 +76,7 @@ By using some special functions: .. 
ipython:: python - df = DataFrame({'value': np.random.randint(0, 100, 20)}) + df = pd.DataFrame({'value': np.random.randint(0, 100, 20)}) labels = [ "{0} - {1}".format(i, i + 9) for i in range(0, 100, 10) ] df['group'] = pd.cut(df.value, range(0, 105, 10), right=False, labels=labels) @@ -92,11 +88,11 @@ By passing a :class:`pandas.Categorical` object to a `Series` or assigning it to .. ipython:: python - raw_cat = Categorical(["a","b","c","a"], categories=["b","c","d"], + raw_cat = pd.Categorical(["a","b","c","a"], categories=["b","c","d"], ordered=False) - s = Series(raw_cat) + s = pd.Series(raw_cat) s - df = DataFrame({"A":["a","b","c","a"]}) + df = pd.DataFrame({"A":["a","b","c","a"]}) df["B"] = raw_cat df @@ -104,7 +100,7 @@ You can also specify differently ordered categories or make the resulting data o .. ipython:: python - s = Series(["a","b","c","a"]) + s = pd.Series(["a","b","c","a"]) s_cat = s.astype("category", categories=["b","c","d"], ordered=False) s_cat @@ -129,7 +125,7 @@ To get back to the original Series or `numpy` array, use ``Series.astype(origina .. ipython:: python - s = Series(["a","b","c","a"]) + s = pd.Series(["a","b","c","a"]) s s2 = s.astype('category') s2 @@ -143,7 +139,7 @@ constructor to save the factorize step during normal constructor mode: .. ipython:: python splitter = np.random.choice([0,1], 5, p=[0.5,0.5]) - s = Series(Categorical.from_codes(splitter, categories=["train", "test"])) + s = pd.Series(pd.Categorical.from_codes(splitter, categories=["train", "test"])) Description ----------- @@ -153,8 +149,8 @@ Using ``.describe()`` on categorical data will produce similar output to a `Seri .. ipython:: python - cat = Categorical(["a","c","c",np.nan], categories=["b","a","c",np.nan] ) - df = DataFrame({"cat":cat, "s":["a","c","c",np.nan]}) + cat = pd.Categorical(["a","c","c",np.nan], categories=["b","a","c",np.nan] ) + df = pd.DataFrame({"cat":cat, "s":["a","c","c",np.nan]}) df.describe() df["cat"].describe() @@ -168,7 +164,7 @@ passed in values. .. ipython:: python - s = Series(["a","b","c","a"], dtype="category") + s = pd.Series(["a","b","c","a"], dtype="category") s.cat.categories s.cat.ordered @@ -176,7 +172,7 @@ It's also possible to pass in the categories in a specific order: .. ipython:: python - s = Series(Categorical(["a","b","c","a"], categories=["c","b","a"])) + s = pd.Series(pd.Categorical(["a","b","c","a"], categories=["c","b","a"])) s.cat.categories s.cat.ordered @@ -194,7 +190,7 @@ by using the :func:`Categorical.rename_categories` method: .. ipython:: python - s = Series(["a","b","c","a"], dtype="category") + s = pd.Series(["a","b","c","a"], dtype="category") s s.cat.categories = ["Group %s" % g for g in s.cat.categories] s @@ -247,7 +243,7 @@ Removing unused categories can also be done: .. ipython:: python - s = Series(Categorical(["a","b","a"], categories=["a","b","c","d"])) + s = pd.Series(pd.Categorical(["a","b","a"], categories=["a","b","c","d"])) s s.cat.remove_unused_categories() @@ -259,7 +255,7 @@ or simply set the categories to a predefined scale, use :func:`Categorical.set_c .. ipython:: python - s = Series(["one","two","four", "-"], dtype="category") + s = pd.Series(["one","two","four", "-"], dtype="category") s s = s.cat.set_categories(["one","two","three","four"]) s @@ -283,9 +279,9 @@ meaning and certain operations are possible. If the categorical is unordered, `` .. 
ipython:: python - s = Series(Categorical(["a","b","c","a"], ordered=False)) + s = pd.Series(pd.Categorical(["a","b","c","a"], ordered=False)) s.sort() - s = Series(["a","b","c","a"]).astype('category', ordered=True) + s = pd.Series(["a","b","c","a"]).astype('category', ordered=True) s.sort() s s.min(), s.max() @@ -303,7 +299,7 @@ This is even true for strings and numeric data: .. ipython:: python - s = Series([1,2,3,1], dtype="category") + s = pd.Series([1,2,3,1], dtype="category") s = s.cat.set_categories([2,3,1], ordered=True) s s.sort() @@ -321,7 +317,7 @@ necessarily make the sort order the same as the categories order. .. ipython:: python - s = Series([1,2,3,1], dtype="category") + s = pd.Series([1,2,3,1], dtype="category") s = s.cat.reorder_categories([2,3,1], ordered=True) s s.sort() @@ -351,8 +347,8 @@ The ordering of the categorical is determined by the ``categories`` of that colu .. ipython:: python - dfs = DataFrame({'A' : Categorical(list('bbeebbaa'), categories=['e','a','b'], ordered=True), - 'B' : [1,2,1,2,2,1,2,1] }) + dfs = pd.DataFrame({'A' : pd.Categorical(list('bbeebbaa'), categories=['e','a','b'], ordered=True), + 'B' : [1,2,1,2,2,1,2,1] }) dfs.sort(['A', 'B']) Reordering the ``categories`` changes a future sort. @@ -385,9 +381,9 @@ categories or a categorical with any list-like object, will raise a TypeError. .. ipython:: python - cat = Series([1,2,3]).astype("category", categories=[3,2,1], ordered=True) - cat_base = Series([2,2,2]).astype("category", categories=[3,2,1], ordered=True) - cat_base2 = Series([2,2,2]).astype("category", ordered=True) + cat = pd.Series([1,2,3]).astype("category", categories=[3,2,1], ordered=True) + cat_base = pd.Series([2,2,2]).astype("category", categories=[3,2,1], ordered=True) + cat_base2 = pd.Series([2,2,2]).astype("category", ordered=True) cat cat_base @@ -443,19 +439,19 @@ present in the data: .. ipython:: python - s = Series(Categorical(["a","b","c","c"], categories=["c","a","b","d"])) + s = pd.Series(pd.Categorical(["a","b","c","c"], categories=["c","a","b","d"])) s.value_counts() Groupby will also show "unused" categories: .. ipython:: python - cats = Categorical(["a","b","b","b","c","c","c"], categories=["a","b","c","d"]) - df = DataFrame({"cats":cats,"values":[1,2,2,2,3,4,5]}) + cats = pd.Categorical(["a","b","b","b","c","c","c"], categories=["a","b","c","d"]) + df = pd.DataFrame({"cats":cats,"values":[1,2,2,2,3,4,5]}) df.groupby("cats").mean() - cats2 = Categorical(["a","a","b","b"], categories=["a","b","c"]) - df2 = DataFrame({"cats":cats2,"B":["c","d","c","d"], "values":[1,2,3,4]}) + cats2 = pd.Categorical(["a","a","b","b"], categories=["a","b","c"]) + df2 = pd.DataFrame({"cats":cats2,"B":["c","d","c","d"], "values":[1,2,3,4]}) df2.groupby(["cats","B"]).mean() @@ -463,8 +459,8 @@ Pivot tables: .. ipython:: python - raw_cat = Categorical(["a","a","b","b"], categories=["a","b","c"]) - df = DataFrame({"A":raw_cat,"B":["c","d","c","d"], "values":[1,2,3,4]}) + raw_cat = pd.Categorical(["a","a","b","b"], categories=["a","b","c"]) + df = pd.DataFrame({"A":raw_cat,"B":["c","d","c","d"], "values":[1,2,3,4]}) pd.pivot_table(df, values='values', index=['A', 'B']) Data munging @@ -482,10 +478,10 @@ the ``category`` dtype is preserved. .. 
ipython:: python - idx = Index(["h","i","j","k","l","m","n",]) - cats = Series(["a","b","b","b","c","c","c"], dtype="category", index=idx) + idx = pd.Index(["h","i","j","k","l","m","n",]) + cats = pd.Series(["a","b","b","b","c","c","c"], dtype="category", index=idx) values= [1,2,2,2,3,4,5] - df = DataFrame({"cats":cats,"values":values}, index=idx) + df = pd.DataFrame({"cats":cats,"values":values}, index=idx) df.iloc[2:4,:] df.iloc[2:4,:].dtypes df.loc["h":"j","cats"] @@ -527,10 +523,10 @@ Setting values in a categorical column (or `Series`) works as long as the value .. ipython:: python - idx = Index(["h","i","j","k","l","m","n"]) - cats = Categorical(["a","a","a","a","a","a","a"], categories=["a","b"]) + idx = pd.Index(["h","i","j","k","l","m","n"]) + cats = pd.Categorical(["a","a","a","a","a","a","a"], categories=["a","b"]) values = [1,1,1,1,1,1,1] - df = DataFrame({"cats":cats,"values":values}, index=idx) + df = pd.DataFrame({"cats":cats,"values":values}, index=idx) df.iloc[2:4,:] = [["b",2],["b",2]] df @@ -543,10 +539,10 @@ Setting values by assigning categorical data will also check that the `categorie .. ipython:: python - df.loc["j":"k","cats"] = Categorical(["a","a"], categories=["a","b"]) + df.loc["j":"k","cats"] = pd.Categorical(["a","a"], categories=["a","b"]) df try: - df.loc["j":"k","cats"] = Categorical(["b","b"], categories=["a","b","c"]) + df.loc["j":"k","cats"] = pd.Categorical(["b","b"], categories=["a","b","c"]) except ValueError as e: print("ValueError: " + str(e)) @@ -554,9 +550,9 @@ Assigning a `Categorical` to parts of a column of other types will use the value .. ipython:: python - df = DataFrame({"a":[1,1,1,1,1], "b":["a","a","a","a","a"]}) - df.loc[1:2,"a"] = Categorical(["b","b"], categories=["a","b"]) - df.loc[2:3,"b"] = Categorical(["b","b"], categories=["a","b"]) + df = pd.DataFrame({"a":[1,1,1,1,1], "b":["a","a","a","a","a"]}) + df.loc[1:2,"a"] = pd.Categorical(["b","b"], categories=["a","b"]) + df.loc[2:3,"b"] = pd.Categorical(["b","b"], categories=["a","b"]) df df.dtypes @@ -569,9 +565,9 @@ but the categories of these categoricals need to be the same: .. ipython:: python - cat = Series(["a","b"], dtype="category") + cat = pd.Series(["a","b"], dtype="category") vals = [1,2] - df = DataFrame({"cats":cat, "vals":vals}) + df = pd.DataFrame({"cats":cat, "vals":vals}) res = pd.concat([df,df]) res res.dtypes @@ -611,12 +607,12 @@ relevant columns back to `category` and assign the right categories and categori .. ipython:: python - s = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'd'])) + s = pd.Series(pd.Categorical(['a', 'b', 'b', 'a', 'a', 'd'])) # rename the categories s.cat.categories = ["very good", "good", "bad"] # reorder the categories and add missing categories s = s.cat.set_categories(["very bad", "bad", "medium", "good", "very good"]) - df = DataFrame({"cats":s, "vals":[1,2,3,4,5,6]}) + df = pd.DataFrame({"cats":s, "vals":[1,2,3,4,5,6]}) csv = StringIO() df.to_csv(csv) df2 = pd.read_csv(StringIO(csv.getvalue())) @@ -643,10 +639,10 @@ available ("missing value") or `np.nan` is a valid category. .. ipython:: python - s = Series(["a","b",np.nan,"a"], dtype="category") + s = pd.Series(["a","b",np.nan,"a"], dtype="category") # only two categories s - s2 = Series(["a","b","c","a"], dtype="category") + s2 = pd.Series(["a","b","c","a"], dtype="category") s2.cat.categories = [1,2,np.nan] # three categories, np.nan included s2 @@ -660,11 +656,11 @@ available ("missing value") or `np.nan` is a valid category. .. 
ipython:: python - c = Series(["a","b",np.nan], dtype="category") + c = pd.Series(["a","b",np.nan], dtype="category") c.cat.set_categories(["a","b",np.nan], inplace=True) # will be inserted as a NA category: c[0] = np.nan - s = Series(c) + s = pd.Series(c) s pd.isnull(s) s.fillna("a") @@ -697,7 +693,7 @@ an ``object`` dtype is a constant times the length of the data. .. ipython:: python - s = Series(['foo','bar']*1000) + s = pd.Series(['foo','bar']*1000) # object dtype s.nbytes @@ -712,7 +708,7 @@ an ``object`` dtype is a constant times the length of the data. .. ipython:: python - s = Series(['foo%04d' % i for i in range(2000)]) + s = pd.Series(['foo%04d' % i for i in range(2000)]) # object dtype s.nbytes @@ -734,7 +730,7 @@ will work with the current pandas version, resulting in subtle bugs: .. code-block:: python - >>> cat = Categorical([1,2], [1,2,3]) + >>> cat = pd.Categorical([1,2], [1,2,3]) >>> # old version >>> cat.get_values() array([2, 3], dtype=int64) @@ -762,7 +758,7 @@ object and not as a low-level `numpy` array dtype. This leads to some problems. except TypeError as e: print("TypeError: " + str(e)) - dtype = Categorical(["a"]).dtype + dtype = pd.Categorical(["a"]).dtype try: np.dtype(dtype) except TypeError as e: @@ -780,15 +776,15 @@ To check if a Series contains Categorical data, with pandas 0.16 or later, use .. ipython:: python - hasattr(Series(['a'], dtype='category'), 'cat') - hasattr(Series(['a']), 'cat') + hasattr(pd.Series(['a'], dtype='category'), 'cat') + hasattr(pd.Series(['a']), 'cat') Using `numpy` functions on a `Series` of type ``category`` should not work as `Categoricals` are not numeric data (even in the case that ``.categories`` is numeric). .. ipython:: python - s = Series(Categorical([1,2,3,4])) + s = pd.Series(pd.Categorical([1,2,3,4])) try: np.sum(s) #same with np.log(s),.. @@ -807,33 +803,36 @@ basic type) and applying along columns will also convert to object. .. ipython:: python - df = DataFrame({"a":[1,2,3,4], - "b":["a","b","c","d"], - "cats":Categorical([1,2,3,2])}) + df = pd.DataFrame({"a":[1,2,3,4], + "b":["a","b","c","d"], + "cats":pd.Categorical([1,2,3,2])}) df.apply(lambda row: type(row["cats"]), axis=1) df.apply(lambda col: col.dtype, axis=0) -No Categorical Index -~~~~~~~~~~~~~~~~~~~~ +Categorical Index +~~~~~~~~~~~~~~~~~ + +.. versionadded:: 0.16.1 -There is currently no index of type ``category``, so setting the index to categorical column will -convert the categorical data to a "normal" dtype first and therefore remove any custom -ordering of the categories: +A new ``CategoricalIndex`` index type is introduced in version 0.16.1. See the +:ref:`advanced indexing docs ` for a more detailed +explanation. + +Setting the index will create a ``CategoricalIndex`` .. ipython:: python - cats = Categorical([1,2,3,4], categories=[4,2,3,1]) + cats = pd.Categorical([1,2,3,4], categories=[4,2,3,1]) strings = ["a","b","c","d"] values = [4,2,3,1] - df = DataFrame({"strings":strings, "values":values}, index=cats) + df = pd.DataFrame({"strings":strings, "values":values}, index=cats) df.index - # This should sort by categories but does not as there is no CategoricalIndex! + # This now sorts by the categories order df.sort_index()
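+For illustration, a minimal sketch of what the new index enables (assuming
+pandas >= 0.16.1, with ``df`` constructed as above; outputs omitted):
+
+.. code-block:: python
+
+    df.index.categories   # the custom order [4, 2, 3, 1] is preserved
+    df.loc[1]             # label-based selection works against the categories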
note:: - This could change if a `CategoricalIndex` is implemented (see - https://github.com/pydata/pandas/issues/7629) - +In previous versions (<0.16.1) there is no index of type ``category``, so +setting the index to categorical column will convert the categorical data to a +"normal" dtype first and therefore remove any custom ordering of the categories. Side Effects ~~~~~~~~~~~~ @@ -843,12 +842,12 @@ means that changes to the `Series` will in most cases change the original `Categ .. ipython:: python - cat = Categorical([1,2,3,10], categories=[1,2,3,4,10]) - s = Series(cat, name="cat") + cat = pd.Categorical([1,2,3,10], categories=[1,2,3,4,10]) + s = pd.Series(cat, name="cat") cat s.iloc[0:2] = 10 cat - df = DataFrame(s) + df = pd.DataFrame(s) df["cat"].cat.categories = [1,2,3,4,5] cat @@ -856,8 +855,8 @@ Use ``copy=True`` to prevent such a behaviour or simply don't reuse `Categorical .. ipython:: python - cat = Categorical([1,2,3,10], categories=[1,2,3,4,10]) - s = Series(cat, name="cat", copy=True) + cat = pd.Categorical([1,2,3,10], categories=[1,2,3,4,10]) + s = pd.Series(cat, name="cat", copy=True) cat s.iloc[0:2] = 10 cat diff --git a/doc/source/computation.rst b/doc/source/computation.rst index 4621d7bd9b216..dfb9fab19bf31 100644 --- a/doc/source/computation.rst +++ b/doc/source/computation.rst @@ -258,7 +258,7 @@ These functions can be applied to ndarrays or Series objects: ts.plot(style='k--') @savefig rolling_mean_ex.png - rolling_mean(ts, 60).plot(style='k') + pd.rolling_mean(ts, 60).plot(style='k') They can also be applied to DataFrame objects. This is really just syntactic sugar for applying the moving window operator to all of the DataFrame's columns: @@ -275,7 +275,7 @@ sugar for applying the moving window operator to all of the DataFrame's columns: df = df.cumsum() @savefig rolling_mean_frame.png - rolling_sum(df, 60).plot(subplots=True) + pd.rolling_sum(df, 60).plot(subplots=True) The ``rolling_apply`` function takes an extra ``func`` argument and performs generic rolling computations. The ``func`` argument should be a single function @@ -286,7 +286,7 @@ compute the mean absolute deviation on a rolling basis: mad = lambda x: np.fabs(x - x.mean()).mean() @savefig rolling_apply_ex.png - rolling_apply(ts, 60, mad).plot(style='k') + pd.rolling_apply(ts, 60, mad).plot(style='k') The ``rolling_window`` function performs a generic rolling window computation on the input data. The weights used in the window are specified by the ``win_type`` @@ -311,21 +311,21 @@ keyword. The list of recognized types are: ser = pd.Series(np.random.randn(10), index=pd.date_range('1/1/2000', periods=10)) - rolling_window(ser, 5, 'triang') + pd.rolling_window(ser, 5, 'triang') Note that the ``boxcar`` window is equivalent to ``rolling_mean``. .. ipython:: python - rolling_window(ser, 5, 'boxcar') + pd.rolling_window(ser, 5, 'boxcar') - rolling_mean(ser, 5) + pd.rolling_mean(ser, 5) For some windowing functions, additional parameters must be specified: .. ipython:: python - rolling_window(ser, 5, 'gaussian', std=0.1) + pd.rolling_window(ser, 5, 'gaussian', std=0.1) By default the labels are set to the right edge of the window, but a ``center`` keyword is available so the labels can be set at the center. @@ -333,11 +333,11 @@ This keyword is available in other rolling functions as well. .. 
ipython:: python - rolling_window(ser, 5, 'boxcar') + pd.rolling_window(ser, 5, 'boxcar') - rolling_window(ser, 5, 'boxcar', center=True) + pd.rolling_window(ser, 5, 'boxcar', center=True) - rolling_mean(ser, 5, center=True) + pd.rolling_mean(ser, 5, center=True) .. _stats.moments.normalization: @@ -376,7 +376,7 @@ For example: .. ipython:: python df2 = df[:20] - rolling_corr(df2, df2['B'], window=5) + pd.rolling_corr(df2, df2['B'], window=5) .. _stats.moments.corr_pairwise: @@ -401,12 +401,12 @@ can even be omitted: .. ipython:: python - covs = rolling_cov(df[['B','C','D']], df[['A','B','C']], 50, pairwise=True) + covs = pd.rolling_cov(df[['B','C','D']], df[['A','B','C']], 50, pairwise=True) covs[df.index[-50]] .. ipython:: python - correls = rolling_corr(df, 50) + correls = pd.rolling_corr(df, 50) correls[df.index[-50]] .. note:: @@ -440,9 +440,9 @@ they are implemented in pandas such that the following two calls are equivalent: .. ipython:: python - rolling_mean(df, window=len(df), min_periods=1)[:5] + pd.rolling_mean(df, window=len(df), min_periods=1)[:5] - expanding_mean(df)[:5] + pd.expanding_mean(df)[:5] Like the ``rolling_`` functions, the following methods are included in the ``pandas`` namespace or can be located in ``pandas.stats.moments``. @@ -501,7 +501,7 @@ relative impact of an individual data point. As an example, here is the ts.plot(style='k--') @savefig expanding_mean_frame.png - expanding_mean(ts).plot(style='k') + pd.expanding_mean(ts).plot(style='k') .. _stats.moments.exponentially_weighted: @@ -583,7 +583,7 @@ Here is an example for a univariate time series: ts.plot(style='k--') @savefig ewma_ex.png - ewma(ts, span=20).plot(style='k') + pd.ewma(ts, span=20).plot(style='k') All the EW functions have a ``min_periods`` argument, which has the same meaning it does for all the ``expanding_`` and ``rolling_`` functions: diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst index 1ece60bf704d6..1f58992dba017 100644 --- a/doc/source/contributing.rst +++ b/doc/source/contributing.rst @@ -113,10 +113,10 @@ This creates the directory `pandas-yourname` and connects your repository to the upstream (main project) *pandas* repository. The testing suite will run automatically on Travis-CI once your Pull Request is -submitted. However, if you wish to run the test suite on a branch prior to +submitted. However, if you wish to run the test suite on a branch prior to submitting the Pull Request, then Travis-CI needs to be hooked up to your GitHub repository. Instructions are for doing so are `here -`_. +`__. Creating a Branch ----------------- @@ -219,7 +219,7 @@ To return to you home root environment: deactivate See the full ``conda`` docs `here -`_. +`__. At this point you can easily do an *in-place* install, as detailed in the next section. @@ -372,7 +372,7 @@ If you want to do a full clean build, do:: Starting with 0.13.1 you can tell ``make.py`` to compile only a single section of the docs, greatly reducing the turn-around time for checking your changes. You will be prompted to delete `.rst` files that aren't required. This is okay -since the prior version can be checked out from git, but make sure to +since the prior version can be checked out from git, but make sure to not commit the file deletions. :: @@ -401,7 +401,7 @@ Built Master Branch Documentation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ When pull-requests are merged into the pandas *master* branch, the main parts of the documentation are -also built by Travis-CI. These docs are then hosted `here `_. 
+also built by Travis-CI. These docs are then hosted `here `__. Contributing to the code base ============================= diff --git a/doc/source/dsintro.rst index adcf2fca9b4c5..9221f2685d79b 100644 --- a/doc/source/dsintro.rst +++ b/doc/source/dsintro.rst @@ -577,10 +577,8 @@ row-wise. For example: df - df.iloc[0] -In the special case of working with time series data, if the Series is a -TimeSeries (which it will be automatically if the index contains datetime -objects), and the DataFrame index also contains dates, the broadcasting will be -column-wise: +In the special case of working with time series data, if the DataFrame index +also contains dates, the broadcasting will be column-wise: .. ipython:: python :okwarning: diff --git a/doc/source/enhancingperf.rst index d007446a5b922..54fd0a2131861 100644 --- a/doc/source/enhancingperf.rst +++ b/doc/source/enhancingperf.rst @@ -7,7 +7,7 @@ import os import csv - from pandas import DataFrame + from pandas import DataFrame, Series import pandas as pd pd.options.display.max_rows=15 @@ -68,9 +68,10 @@ Here's the function in pure python: We achieve our result by using ``apply`` (row-wise): -.. ipython:: python +.. code-block:: python - %timeit df.apply(lambda x: integrate_f(x['a'], x['b'], x['N']), axis=1) + In [7]: %timeit df.apply(lambda x: integrate_f(x['a'], x['b'], x['N']), axis=1) + 10 loops, best of 3: 174 ms per loop But clearly this isn't fast enough for us. Let's take a look and see where the time is spent during this operation (limited to the most time consuming @@ -97,7 +98,7 @@ First we're going to need to import the cython magic function to ipython: .. ipython:: python - %load_ext cythonmagic + %load_ext Cython Now, let's simply copy our functions over to cython as is (the suffix is here to distinguish between function versions): @@ -122,9 +123,10 @@ to be using bleeding edge ipython for paste to play well with cell magics. -.. ipython:: python +.. code-block:: python - %timeit df.apply(lambda x: integrate_f_plain(x['a'], x['b'], x['N']), axis=1) + In [4]: %timeit df.apply(lambda x: integrate_f_plain(x['a'], x['b'], x['N']), axis=1) + 10 loops, best of 3: 85.5 ms per loop Already this has shaved a third off, not too bad for a simple copy and paste. @@ -150,9 +152,10 @@ We get another huge improvement simply by providing type information: ...: return s * dx ...: -.. ipython:: python +.. code-block:: python - %timeit df.apply(lambda x: integrate_f_typed(x['a'], x['b'], x['N']), axis=1) + In [4]: %timeit df.apply(lambda x: integrate_f_typed(x['a'], x['b'], x['N']), axis=1) + 10 loops, best of 3: 20.3 ms per loop Now, we're talking! It's now over ten times faster than the original python implementation, and we haven't *really* modified the code. Let's have another @@ -229,9 +232,10 @@ the rows, applying our ``integrate_f_typed``, and putting this in the zeros arra Loops like this would be *extremely* slow in python, but in Cython looping over numpy arrays is *fast*. -.. ipython:: python +.. code-block:: python - %timeit apply_integrate_f(df['a'].values, df['b'].values, df['N'].values) + In [4]: %timeit apply_integrate_f(df['a'].values, df['b'].values, df['N'].values) + 1000 loops, best of 3: 1.25 ms per loop We've gotten another big improvement. Let's check again where the time is spent: @@ -278,20 +282,70 @@ advanced cython techniques: ...: return res ...: -.. ipython:: python +.. 
code-block:: python - %timeit apply_integrate_f_wrap(df['a'].values, df['b'].values, df['N'].values) + In [4]: %timeit apply_integrate_f_wrap(df['a'].values, df['b'].values, df['N'].values) + 1000 loops, best of 3: 987 us per loop Even faster, with the caveat that a bug in our cython code (an off-by-one error, for example) might cause a segfault because memory access isn't checked. -Further topics -~~~~~~~~~~~~~~ +.. _enhancingperf.numba: + +Using numba +----------- + +A recent alternative to statically compiling cython code is to use a *dynamic jit-compiler*, ``numba``. + +Numba gives you the power to speed up your applications with high performance functions written directly in Python. With a few annotations, array-oriented and math-heavy Python code can be just-in-time compiled to native machine instructions, similar in performance to C, C++ and Fortran, without having to switch languages or Python interpreters. + +Numba works by generating optimized machine code using the LLVM compiler infrastructure at import time, runtime, or statically (using the included pycc tool). Numba supports compilation of Python to run on either CPU or GPU hardware, and is designed to integrate with the Python scientific software stack. + +.. note:: + + You will need to install ``numba``. This is easy with ``conda``, by using: ``conda install numba``, see :ref:`installing using miniconda`. + +We simply take the plain python code from above and annotate it with the ``@jit`` decorator. + +.. code-block:: python + + import numba + + @numba.jit + def f_plain(x): + return x * (x - 1) + + @numba.jit + def integrate_f_numba(a, b, N): + s = 0 + dx = (b - a) / N + for i in range(N): + s += f_plain(a + i * dx) + return s * dx + + @numba.jit + def apply_integrate_f_numba(col_a, col_b, col_N): + n = len(col_N) + result = np.empty(n, dtype='float64') + assert len(col_a) == len(col_b) == n + for i in range(n): + result[i] = integrate_f_numba(col_a[i], col_b[i], col_N[i]) + return result + + def compute_numba(df): + result = apply_integrate_f_numba(df['a'].values, df['b'].values, df['N'].values) + return Series(result, index=df.index, name='result') + +Similar to above, we pass ``numpy`` arrays directly to the numba function. Further, +we wrap the results to provide a nice interface by passing/returning pandas objects. + +.. code-block:: python -- Loading C modules into cython. + In [4]: %timeit compute_numba(df) + 1000 loops, best of 3: 798 us per loop -Read more in the `cython docs `__. +Read more in the `numba docs `__. .. _enhancingperf.eval: diff --git a/doc/source/faq.rst b/doc/source/faq.rst index 20762e3fc039f..1fc8488e92fde 100644 --- a/doc/source/faq.rst +++ b/doc/source/faq.rst @@ -207,9 +207,9 @@ properties. Here are the pandas equivalents: Frequency conversion ~~~~~~~~~~~~~~~~~~~~ -Frequency conversion is implemented using the ``resample`` method on TimeSeries -and DataFrame objects (multiple time series). ``resample`` also works on panels -(3D). Here is some code that resamples daily data to monthly: +Frequency conversion is implemented using the ``resample`` method on Series +and DataFrame objects with a DatetimeIndex or PeriodIndex. ``resample`` also +works on panels (3D). Here is some code that resamples daily data to monthly: .. ipython:: python @@ -369,4 +369,3 @@ just a thin layer around the ``QTableView``. 
mw = MainWidget() mw.show() app.exec_() - diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index 7ad2641dec52a..c9e18b585c764 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -784,11 +784,11 @@ will be (silently) dropped. Thus, this does not pose any problems: df.groupby('A').std() -NA group handling -~~~~~~~~~~~~~~~~~ +NA and NaT group handling +~~~~~~~~~~~~~~~~~~~~~~~~~ -If there are any NaN values in the grouping key, these will be automatically -excluded. So there will never be an "NA group". This was not the case in older +If there are any NaN or NaT values in the grouping key, these will be automatically +excluded. So there will never be an "NA group" or "NaT group". This was not the case in older versions of pandas, but users were generally discarding the NA group anyway (and supporting it was an implementation headache). diff --git a/doc/source/install.rst b/doc/source/install.rst index 79adab0463588..b3f86db5e3e59 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -35,7 +35,7 @@ pandas at all. Simply create an account, and have access to pandas from within your brower via an `IPython Notebook `__ in a few minutes. -.. _install.anaconda +.. _install.anaconda: Installing pandas with Anaconda ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -68,7 +68,7 @@ admin rights to install it, it will install in the user's home directory, and this also makes it trivial to delete Anaconda at a later date (just delete that folder). -.. _install.miniconda +.. _install.miniconda: Installing pandas with Miniconda ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -255,7 +255,7 @@ Optional Dependencies * Alternative Excel writer. * `boto `__: necessary for Amazon S3 access. -* `blosc `__: for msgpack compression using ``blosc`` * One of `PyQt4 `__, `PySide `__, `pygtk diff --git a/doc/source/overview.rst b/doc/source/overview.rst index 49a788def2854..b1addddc2121d 100644 --- a/doc/source/overview.rst +++ b/doc/source/overview.rst @@ -9,7 +9,7 @@ Package overview :mod:`pandas` consists of the following things * A set of labeled array data structures, the primary of which are - Series/TimeSeries and DataFrame + Series and DataFrame * Index objects enabling both simple axis indexing and multi-level / hierarchical axis indexing * An integrated group by engine for aggregating and transforming data sets @@ -32,7 +32,6 @@ Data structures at a glance :widths: 15, 20, 50 1, Series, "1D labeled homogeneously-typed array" - 1, TimeSeries, "Series with index containing datetimes" 2, DataFrame, "General 2D labeled, size-mutable tabular structure with potentially heterogeneously-typed columns" 3, Panel, "General 3D labeled, also size-mutable array" diff --git a/doc/source/r_interface.rst b/doc/source/r_interface.rst index 2207c823f43b1..da37c92c88ecf 100644 --- a/doc/source/r_interface.rst +++ b/doc/source/r_interface.rst @@ -15,7 +15,69 @@ rpy2 / R interface .. warning:: - In v0.16.0, the ``pandas.rpy`` interface has been **deprecated and will be removed in a future version**. Similar functionaility can be accessed thru the `rpy2 `_ project. + In v0.16.0, the ``pandas.rpy`` interface has been **deprecated and will be + removed in a future version**. Similar functionality can be accessed + through the `rpy2 `_ project. + See the :ref:`updating ` section for a guide to port your + code from the ``pandas.rpy`` to ``rpy2`` functions. + + +.. 
_rpy.updating: + +Updating your code to use rpy2 functions +---------------------------------------- + +In v0.16.0, the ``pandas.rpy`` module has been **deprecated** and users are +pointed to the similar functionality in ``rpy2`` itself (rpy2 >= 2.4). + +Instead of ``import pandas.rpy.common as com``, the following imports +should be done to activate the pandas conversion support in rpy2:: + + from rpy2.robjects import pandas2ri + pandas2ri.activate() + +Converting data frames back and forth between rpy2 and pandas should be largely +automated (no need to convert explicitly, it will be done on the fly in most +rpy2 functions). + +To convert explicitly, the functions are ``pandas2ri.py2ri()`` and +``pandas2ri.ri2py()``. So these functions can be used to replace the existing +functions in pandas: + +- ``com.convert_to_r_dataframe(df)`` should be replaced with ``pandas2ri.py2ri(df)`` +- ``com.convert_robj(rdf)`` should be replaced with ``pandas2ri.ri2py(rdf)`` + +Note: these functions are for the latest version (rpy2 2.5.x) and were called +``pandas2ri.pandas2ri()`` and ``pandas2ri.ri2pandas()`` previously. + +Some of the other functionality in `pandas.rpy` can be replaced easily as well. +For example, to load R data as done with the ``load_data`` function, the +current method:: + + df_iris = com.load_data('iris') + +can be replaced with:: + + from rpy2.robjects import r + r.data('iris') + df_iris = pandas2ri.ri2py(r['iris']) + +The ``convert_to_r_matrix`` function can be replaced by the normal +``pandas2ri.py2ri`` to convert dataframes, with a subsequent call to the R +``as.matrix`` function. + +.. warning:: + + Not all conversion functions in rpy2 work exactly the same as the + current methods in pandas. If you experience problems or limitations in + comparison to the ones in pandas, please report this at the + `issue tracker `_. + +See also the documentation of the `rpy2 `_ project. + + +R interface with rpy2 +--------------------- If your computer has R and rpy2 (> 2.2) installed (which will be left to the reader), you will be able to leverage the below functionality. On Windows, diff --git a/doc/source/text.rst b/doc/source/text.rst index 810e3e0146f9f..d40445d8490f7 100644 --- a/doc/source/text.rst +++ b/doc/source/text.rst @@ -82,11 +82,11 @@ Elements in the split lists can be accessed using ``get`` or ``[]`` notation: s2.str.split('_').str.get(1) s2.str.split('_').str[1] -Easy to expand this to return a DataFrame using ``return_type``. +It is easy to expand this to return a DataFrame using ``expand``. .. ipython:: python - s2.str.split('_', return_type='frame') + s2.str.split('_', expand=True) Methods like ``replace`` and ``findall`` take `regular expressions `__, too: diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index b69b523d9c908..ce1035e91391a 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -1008,7 +1008,7 @@ Time series-related instance methods Shifting / lagging ~~~~~~~~~~~~~~~~~~ -One may want to *shift* or *lag* the values in a TimeSeries back and forward in +One may want to *shift* or *lag* the values in a time series back and forward in time. The method for this is ``shift``, which is available on all of the pandas objects. 
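To make the ``shift`` behaviour described above concrete, here is a minimal sketch (the series ``ts`` and its values are hypothetical, not taken from the patch):

.. code-block:: python

   import numpy as np
   import pandas as pd

   ts = pd.Series(np.arange(5.0), index=pd.date_range('2015-01-01', periods=5))
   ts.shift(2)     # first two entries become NaN; the index itself is unchanged
   ts.shift(-1)    # a negative number of periods lags in the opposite direction

Plain ``shift`` moves the values while leaving the dates fixed; moving the dates instead is what the ``freq`` argument and the ``tshift`` method in the next hunk are for. 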
@@ -1026,7 +1026,7 @@ The shift method accepts an ``freq`` argument which can accept a ts.shift(5, freq='BM') Rather than changing the alignment of the data and the index, ``DataFrame`` and -``TimeSeries`` objects also have a ``tshift`` convenience method that changes +``Series`` objects also have a ``tshift`` convenience method that changes all the dates in the index by a specified number of offsets: .. ipython:: python @@ -1569,7 +1569,7 @@ time zones using ``tz_convert``: rng_berlin[5] rng_eastern[5].tz_convert('Europe/Berlin') -Localization of Timestamps functions just like DatetimeIndex and TimeSeries: +Localization of Timestamps functions just like DatetimeIndex and Series: .. ipython:: python @@ -1577,8 +1577,8 @@ Localization of Timestamps functions just like DatetimeIndex and TimeSeries: rng[5].tz_localize('Asia/Shanghai') -Operations between TimeSeries in different time zones will yield UTC -TimeSeries, aligning the data on the UTC timestamps: +Operations between Series in different time zones will yield UTC +Series, aligning the data on the UTC timestamps: .. ipython:: python diff --git a/doc/source/visualization.rst b/doc/source/visualization.rst index 6dfeeadeb0167..51912b5d6b106 100644 --- a/doc/source/visualization.rst +++ b/doc/source/visualization.rst @@ -220,8 +220,8 @@ Histogram can be drawn specifying ``kind='hist'``. .. ipython:: python - df4 = pd.DataFrame({'a': randn(1000) + 1, 'b': randn(1000), - 'c': randn(1000) - 1}, columns=['a', 'b', 'c']) + df4 = pd.DataFrame({'a': np.random.randn(1000) + 1, 'b': np.random.randn(1000), + 'c': np.random.randn(1000) - 1}, columns=['a', 'b', 'c']) plt.figure(); diff --git a/doc/source/whatsnew.rst b/doc/source/whatsnew.rst index d05c19a5e4bea..c8e32ac2a3309 100644 --- a/doc/source/whatsnew.rst +++ b/doc/source/whatsnew.rst @@ -18,6 +18,8 @@ What's New These are new features and improvements of note in each release. +.. include:: whatsnew/v0.16.2.txt + .. include:: whatsnew/v0.16.1.txt .. include:: whatsnew/v0.16.0.txt diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt index 5e893f3c4fd73..79a0c48238be7 100755 --- a/doc/source/whatsnew/v0.16.1.txt +++ b/doc/source/whatsnew/v0.16.1.txt @@ -31,44 +31,6 @@ Highlights include: Enhancements ~~~~~~~~~~~~ -- ``BusinessHour`` offset is now supported, which represents business hours starting from 09:00 - 17:00 on ``BusinessDay`` by default. See :ref:`Here ` for details. (:issue:`7905`) - - .. ipython:: python - - Timestamp('2014-08-01 09:00') + BusinessHour() - Timestamp('2014-08-01 07:00') + BusinessHour() - Timestamp('2014-08-01 16:30') + BusinessHour() - -- ``DataFrame.diff`` now takes an ``axis`` parameter that determines the direction of differencing (:issue:`9727`) - -- Allow ``clip``, ``clip_lower``, and ``clip_upper`` to accept array-like arguments as thresholds (This is a regression from 0.11.0). These methods now have an ``axis`` parameter which determines how the Series or DataFrame will be aligned with the threshold(s). (:issue:`6966`) - -- ``DataFrame.mask()`` and ``Series.mask()`` now support same keywords as ``where`` (:issue:`8801`) - -- ``drop`` function can now accept ``errors`` keyword to suppress ``ValueError`` raised when any of label does not exist in the target data. (:issue:`6736`) - - .. 
ipython:: python - - df = DataFrame(np.random.randn(3, 3), columns=['A', 'B', 'C']) - df.drop(['A', 'X'], axis=1, errors='ignore') - -- Allow conversion of values with dtype ``datetime64`` or ``timedelta64`` to strings using ``astype(str)`` (:issue:`9757`) -- ``get_dummies`` function now accepts ``sparse`` keyword. If set to ``True``, the return ``DataFrame`` is sparse, e.g. ``SparseDataFrame``. (:issue:`8823`) -- ``Period`` now accepts ``datetime64`` as value input. (:issue:`9054`) - -- Allow timedelta string conversion when leading zero is missing from time definition, ie `0:00:00` vs `00:00:00`. (:issue:`9570`) -- Allow ``Panel.shift`` with ``axis='items'`` (:issue:`9890`) - -- Trying to write an excel file now raises ``NotImplementedError`` if the ``DataFrame`` has a ``MultiIndex`` instead of writing a broken Excel file. (:issue:`9794`) -- Allow ``Categorical.add_categories`` to accept ``Series`` or ``np.array``. (:issue:`9927`) - -- Add/delete ``str/dt/cat`` accessors dynamically from ``__dir__``. (:issue:`9910`) -- Add ``normalize`` as a ``dt`` accessor method. (:issue:`10047`) - -- ``DataFrame`` and ``Series`` now have ``_constructor_expanddim`` property as overridable constructor for one higher dimensionality data. This should be used only when it is really needed, see :ref:`here ` - -- ``pd.lib.infer_dtype`` now returns ``'bytes'`` in Python 3 where appropriate. (:issue:`10032`) - .. _whatsnew_0161.enhancements.categoricalindex: CategoricalIndex @@ -188,16 +150,6 @@ String Methods Enhancements :ref:`Continuing from v0.16.0 `, the following enhancements make string operations easier and more consistent with standard python string operations. -- The following new methods are accesible via ``.str`` accessor to apply the function to each values. (:issue:`9766`, :issue:`9773`, :issue:`10031`, :issue:`10045`, :issue:`10052`) - - ================ =============== =============== =============== ================ - .. .. Methods .. .. - ================ =============== =============== =============== ================ - ``capitalize()`` ``swapcase()`` ``normalize()`` ``partition()`` ``rpartition()`` - ``index()`` ``rindex()`` ``translate()`` - ================ =============== =============== =============== ================ - - - Added ``StringMethods`` (``.str`` accessor) to ``Index`` (:issue:`9068`) @@ -220,6 +172,14 @@ enhancements make string operations easier and more consistent with standard pyt idx.str.startswith('a') s[s.index.str.startswith('a')] +- The following new methods are accessible via the ``.str`` accessor to apply the function to each value. (:issue:`9766`, :issue:`9773`, :issue:`10031`, :issue:`10045`, :issue:`10052`) + + ================ =============== =============== =============== ================ + .. .. Methods .. .. + ================ =============== =============== =============== ================ + ``capitalize()`` ``swapcase()`` ``normalize()`` ``partition()`` ``rpartition()`` + ``index()`` ``rindex()`` ``translate()`` + ================ =============== =============== =============== ================ - ``split`` now takes ``expand`` keyword to specify whether to expand dimensionality. ``return_type`` is deprecated. (:issue:`9847`) @@ -244,14 +204,59 @@ enhancements make string operations easier and more consistent with standard pyt - Improved ``extract`` and ``get_dummies`` methods for ``Index.str`` (:issue:`9980`) -.. _whatsnew_0161.api: -API changes -~~~~~~~~~~~ +.. 
_whatsnew_0161.enhancements.other: + +Other Enhancements +^^^^^^^^^^^^^^^^^^ + +- ``BusinessHour`` offset is now supported, which represents business hours starting from 09:00 - 17:00 on ``BusinessDay`` by default. See :ref:`Here ` for details. (:issue:`7905`) + + .. ipython:: python + from pandas.tseries.offsets import BusinessHour + Timestamp('2014-08-01 09:00') + BusinessHour() + Timestamp('2014-08-01 07:00') + BusinessHour() + Timestamp('2014-08-01 16:30') + BusinessHour() +- ``DataFrame.diff`` now takes an ``axis`` parameter that determines the direction of differencing (:issue:`9727`) +- Allow ``clip``, ``clip_lower``, and ``clip_upper`` to accept array-like arguments as thresholds (This is a regression from 0.11.0). These methods now have an ``axis`` parameter which determines how the Series or DataFrame will be aligned with the threshold(s). (:issue:`6966`) + +- ``DataFrame.mask()`` and ``Series.mask()`` now support the same keywords as ``where`` (:issue:`8801`) +- ``drop`` function can now accept ``errors`` keyword to suppress ``ValueError`` raised when any label does not exist in the target data. (:issue:`6736`) + + .. ipython:: python + + df = DataFrame(np.random.randn(3, 3), columns=['A', 'B', 'C']) + df.drop(['A', 'X'], axis=1, errors='ignore') + +- Add support for separating years and quarters using dashes, for + example 2014-Q1. (:issue:`9688`) + +- Allow conversion of values with dtype ``datetime64`` or ``timedelta64`` to strings using ``astype(str)`` (:issue:`9757`) +- ``get_dummies`` function now accepts ``sparse`` keyword. If set to ``True``, the returned ``DataFrame`` is sparse, e.g. ``SparseDataFrame``. (:issue:`8823`) +- ``Period`` now accepts ``datetime64`` as value input. (:issue:`9054`) + +- Allow timedelta string conversion when leading zero is missing from time definition, i.e. `0:00:00` vs `00:00:00`. (:issue:`9570`) +- Allow ``Panel.shift`` with ``axis='items'`` (:issue:`9890`) + +- Trying to write an excel file now raises ``NotImplementedError`` if the ``DataFrame`` has a ``MultiIndex`` instead of writing a broken Excel file. (:issue:`9794`) +- Allow ``Categorical.add_categories`` to accept ``Series`` or ``np.array``. (:issue:`9927`) + +- Add/delete ``str/dt/cat`` accessors dynamically from ``__dir__``. (:issue:`9910`) +- Add ``normalize`` as a ``dt`` accessor method. (:issue:`10047`) + +- ``DataFrame`` and ``Series`` now have ``_constructor_expanddim`` property as an overridable constructor for data of one higher dimension. This should be used only when it is really needed, see :ref:`here ` + +- ``pd.lib.infer_dtype`` now returns ``'bytes'`` in Python 3 where appropriate. (:issue:`10032`) + + +.. _whatsnew_0161.api: + +API changes +~~~~~~~~~~~ - When passing in an ax to ``df.plot( ..., ax=ax)``, the `sharex` kwarg will now default to `False`. The result is that the visibility of xlabels and xticklabels will not anymore be changed. You @@ -260,16 +265,19 @@ API changes If pandas creates the subplots itself (e.g. no passed in `ax` kwarg), then the default is still ``sharex=True`` and the visibility changes are applied. - - -- Add support for separating years and quarters using dashes, for - example 2014-Q1. (:issue:`9688`) - - :meth:`~pandas.DataFrame.assign` now inserts new columns in alphabetical order. Previously the order was arbitrary. (:issue:`9777`) - By default, ``read_csv`` and ``read_table`` will now try to infer the compression type based on the file extension. Set ``compression=None`` to restore the previous behavior (no decompression). 
(:issue:`9770`) +.. _whatsnew_0161.deprecations: + +Deprecations +^^^^^^^^^^^^ + +- ``Series.str.split``'s ``return_type`` keyword was removed in favor of ``expand`` (:issue:`9847`) + + .. _whatsnew_0161.index_repr: Index Representation @@ -303,25 +311,17 @@ New Behavior .. ipython:: python - pd.set_option('display.width',100) - pd.Index(range(4),name='foo') - pd.Index(range(25),name='foo') - pd.Index(range(104),name='foo') - pd.Index(['datetime', 'sA', 'sB', 'sC', 'flow', 'error', 'temp', 'ref', 'a_bit_a_longer_one']*2) - pd.CategoricalIndex(['a','bb','ccc','dddd'],ordered=True,name='foobar') - pd.CategoricalIndex(['a','bb','ccc','dddd']*10,ordered=True,name='foobar') - pd.CategoricalIndex(['a','bb','ccc','dddd']*100,ordered=True,name='foobar') - pd.CategoricalIndex(np.arange(1000),ordered=True,name='foobar') - pd.date_range('20130101',periods=4,name='foo',tz='US/Eastern') - pd.date_range('20130101',periods=25,name='foo',tz='US/Eastern') - pd.date_range('20130101',periods=104,name='foo',tz='US/Eastern') - -.. _whatsnew_0161.deprecations: + pd.set_option('display.width', 80) + pd.Index(range(4), name='foo') + pd.Index(range(30), name='foo') + pd.Index(range(104), name='foo') + pd.CategoricalIndex(['a','bb','ccc','dddd'], ordered=True, name='foobar') + pd.CategoricalIndex(['a','bb','ccc','dddd']*10, ordered=True, name='foobar') + pd.CategoricalIndex(['a','bb','ccc','dddd']*100, ordered=True, name='foobar') + pd.date_range('20130101',periods=4, name='foo', tz='US/Eastern') + pd.date_range('20130101',periods=25, freq='D') + pd.date_range('20130101',periods=104, name='foo', tz='US/Eastern') -Deprecations -^^^^^^^^^^^^ - -- ``Series.str.split``'s ``return_type`` keyword was removed in favor of ``expand`` (:issue:`9847`) .. _whatsnew_0161.performance: @@ -333,7 +333,6 @@ Performance Improvements - Improved the performance of ``pd.lib.max_len_string_array`` by 5-7x (:issue:`10024`) - .. _whatsnew_0161.bug_fixes: Bug Fixes @@ -361,7 +360,6 @@ Bug Fixes - Bug where repeated plotting of ``DataFrame`` with a ``DatetimeIndex`` may raise ``TypeError`` (:issue:`9852`) - Bug in ``setup.py`` that would allow an incompat cython version to build (:issue:`9827`) - Bug in plotting ``secondary_y`` incorrectly attaches ``right_ax`` property to secondary axes specifying itself recursively. (:issue:`9861`) - - Bug in ``Series.quantile`` on empty Series of type ``Datetime`` or ``Timedelta`` (:issue:`9675`) - Bug in ``where`` causing incorrect results when upcasting was required (:issue:`9731`) - Bug in ``FloatArrayFormatter`` where decision boundary for displaying "small" floats in decimal format is off by one order of magnitude for a given display.precision (:issue:`9764`) @@ -372,20 +370,13 @@ Bug Fixes - Bug in index equality comparisons using ``==`` failing on Index/MultiIndex type incompatibility (:issue:`9785`) - Bug in which ``SparseDataFrame`` could not take `nan` as a column name (:issue:`8822`) - Bug in ``to_msgpack`` and ``read_msgpack`` zlib and blosc compression support (:issue:`9783`) - - Bug ``GroupBy.size`` doesn't attach index name properly if grouped by ``TimeGrouper`` (:issue:`9925`) - Bug causing an exception in slice assignments because ``length_of_indexer`` returns wrong results (:issue:`9995`) - Bug in csv parser causing lines with initial whitespace plus one non-space character to be skipped. (:issue:`9710`) - Bug in C csv parser causing spurious NaNs when data started with newline followed by whitespace. 
(:issue:`10022`) - - Bug causing elements with a null group to spill into the final group when grouping by a ``Categorical`` (:issue:`9603`) - Bug where .iloc and .loc behavior is not consistent on empty dataframes (:issue:`9964`) - - Bug in invalid attribute access on a ``TimedeltaIndex`` incorrectly raised ``ValueError`` instead of ``AttributeError`` (:issue:`9680`) - - - - - Bug in unequal comparisons between categorical data and a scalar, which was not in the categories (e.g. ``Series(Categorical(list("abc"), ordered=True)) > "d"``. This returned ``False`` for all elements, but now raises a ``TypeError``. Equality comparisons also now return ``False`` for ``==`` and ``True`` for ``!=``. (:issue:`9848`) - Bug in DataFrame ``__setitem__`` when right hand side is a dictionary (:issue:`9874`) - Bug in ``where`` when dtype is ``datetime64/timedelta64``, but dtype of other is not (:issue:`9804`) @@ -394,25 +385,13 @@ Bug Fixes - Bug in ``DataFrame`` constructor when ``columns`` parameter is set, and ``data`` is an empty list (:issue:`9939`) - Bug in bar plot with ``log=True`` raises ``TypeError`` if all values are less than 1 (:issue:`9905`) - Bug in horizontal bar plot ignores ``log=True`` (:issue:`9905`) - - - - Bug in PyTables queries that did not return proper results using the index (:issue:`8265`, :issue:`9676`) - - - - - Bug where dividing a dataframe containing values of type ``Decimal`` by another ``Decimal`` would raise. (:issue:`9787`) - Bug where using DataFrames asfreq would remove the name of the index. (:issue:`9885`) - Bug causing extra index point when resample BM/BQ (:issue:`9756`) - Changed caching in ``AbstractHolidayCalendar`` to be at the instance level rather than at the class level as the latter can result in unexpected behaviour. (:issue:`9552`) - - Fixed latex output for multi-indexed dataframes (:issue:`9778`) - Bug causing an exception when setting an empty range using ``DataFrame.loc`` (:issue:`9596`) - - - - - Bug in hiding ticklabels with subplots and shared axes when adding a new plot to an existing grid of axes (:issue:`9158`) - Bug in ``transform`` and ``filter`` when grouping on a categorical variable (:issue:`9921`) - Bug in ``transform`` when groups are equal in number and dtype to the input index (:issue:`9700`) diff --git a/doc/source/whatsnew/v0.16.2.txt b/doc/source/whatsnew/v0.16.2.txt new file mode 100644 index 0000000000000..ddfe6fa0b2f74 --- /dev/null +++ b/doc/source/whatsnew/v0.16.2.txt @@ -0,0 +1,90 @@ +.. _whatsnew_0162: + +v0.16.2 (June 12, 2015) +----------------------- + +This is a minor bug-fix release from 0.16.1 and includes a large number of +bug fixes along with several new features, enhancements, and performance improvements. +We recommend that all users upgrade to this version. + +Highlights include: + +- Documentation on how to use ``numba`` with *pandas*, see :ref:`here ` + +Check the :ref:`API Changes ` before updating. + +.. contents:: What's new in v0.16.2 + :local: + :backlinks: none + +.. _whatsnew_0162.enhancements: + +New features +~~~~~~~~~~~~ + +.. _whatsnew_0162.enhancements.other: + +Other enhancements +^^^^^^^^^^^^^^^^^^ + +.. _whatsnew_0162.api: + +Backwards incompatible API changes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. _whatsnew_0162.api_breaking: + +.. _whatsnew_0162.api_breaking.other: + +Other API Changes +^^^^^^^^^^^^^^^^^ + +- ``Holiday`` now raises ``NotImplementedError`` if both ``offset`` and ``observance`` are used in the constructor (see the sketch after the v0.17.0 hunk below). (:issue:`10217`) + +.. 
_whatsnew_0162.performance: + +Performance Improvements +~~~~~~~~~~~~~~~~~~~~~~~~ + +- Improved ``Series.resample`` performance with dtype=datetime64[ns] (:issue:`7754`) +- Modest improvement in datetime writing speed in to_csv (:issue:`10271`) + +.. _whatsnew_0162.bug_fixes: + +Bug Fixes +~~~~~~~~~ + +- Bug where read_hdf store.select modifies the passed columns list when + multi-indexed (:issue:`7212`) +- Bug in ``Categorical`` repr with ``display.width`` of ``None`` in Python 3 (:issue:`10087`) + +- Bug in groupby.apply aggregation for Categorical not preserving categories (:issue:`10138`) +- Bug in ``mean()`` where integer dtypes can overflow (:issue:`10172`) +- Bug where Panel.from_dict does not set dtype when specified (:issue:`10058`) +- Bug in ``Index.union`` raises ``AttributeError`` when passing array-likes. (:issue:`10149`) +- Bug in ``Timestamp``'s ``microsecond``, ``quarter``, ``dayofyear``, ``week`` and ``daysinmonth`` properties return ``np.int`` type, not built-in ``int``. (:issue:`10050`) +- Bug in ``NaT`` raises ``AttributeError`` when accessing the ``daysinmonth``, ``dayofweek`` properties. (:issue:`10096`) + + +- Bug in getting timezone data with ``dateutil`` on various platforms (:issue:`9059`, :issue:`8639`, :issue:`9663`, :issue:`10121`) +- Bug in displaying datetimes with mixed frequencies uniformly; display 'ms' datetimes to the proper precision. (:issue:`10170`) + +- Bug in ``Series`` arithmetic methods that may incorrectly hold names (:issue:`10068`) + +- Bug in ``DatetimeIndex`` and ``TimedeltaIndex`` names are lost after timedelta arithmetic (:issue:`9926`) + + +- Bug in `Series.plot(label="LABEL")` not correctly setting the label (:issue:`10119`) + +- Bug in `plot` not defaulting to matplotlib `axes.grid` setting (:issue:`9792`) + +- Bug in ``Series.align`` resets ``name`` when ``fill_value`` is specified (:issue:`10067`) +- Bug in ``SparseSeries.abs`` resets ``name`` (:issue:`10241`) + + +- Bug in GroupBy.get_group raises ValueError when group key contains NaT (:issue:`6992`) +- Bug in ``SparseSeries`` constructor ignores input data name (:issue:`10258`) + +- Bug where infer_freq infers timerule (WOM-5XXX) unsupported by to_offset (:issue:`9425`) + +- Bug when masking an empty ``DataFrame`` (:issue:`10126`) diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index 0184acce7a46b..87a9d197bd0d1 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -1,9 +1,9 @@ .. _whatsnew_0170: -v0.17.0 (July ??, 2015) +v0.17.0 (July 31, 2015) ----------------------- -This is a major release from 0.16.1 and includes a small number of API changes, several new features, +This is a major release from 0.16.2 and includes a small number of API changes, several new features, enhancements, and performance improvements along with a large number of bug fixes. We recommend that all users upgrade to this version. 
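To make the ``Holiday`` API change noted in the v0.16.2 section above concrete, here is a hedged sketch (the holiday name is made up for illustration; ``Holiday``, ``nearest_workday`` and ``DateOffset`` are existing pandas names):

.. code-block:: python

   from pandas import DateOffset
   from pandas.tseries.holiday import Holiday, nearest_workday

   # offset and observance are two competing ways of adjusting the base
   # date, so supplying both now raises NotImplementedError
   Holiday('Hypothetical Day', month=7, day=4,
           offset=DateOffset(days=1), observance=nearest_workday)

Using only one of the two keywords remains valid, e.g. ``Holiday('Hypothetical Day', month=7, day=4, observance=nearest_workday)``. 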
diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 6be0facf2bffc..2a273629544cb 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -26,6 +26,7 @@ Other items: * OrderedDefaultDict +* platform checker """ # pylint disable=W0611 import functools @@ -754,3 +755,16 @@ def __missing__(self, key): def __reduce__(self): # optional, for pickle support args = self.default_factory if self.default_factory else tuple() return type(self), args, None, None, list(self.items()) + + +# https://github.com/pydata/pandas/pull/9123 +def is_platform_windows(): + return sys.platform == 'win32' or sys.platform == 'cygwin' + + +def is_platform_linux(): + return sys.platform == 'linux2' + + +def is_platform_mac(): + return sys.platform == 'darwin' diff --git a/pandas/core/base.py b/pandas/core/base.py index 2f171cdd6adf3..540b900844a9e 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1,13 +1,10 @@ """ Base and utility classes for pandas objects. """ -import datetime - from pandas import compat import numpy as np from pandas.core import common as com import pandas.core.nanops as nanops -import pandas.tslib as tslib import pandas.lib as lib from pandas.util.decorators import Appender, cache_readonly from pandas.core.strings import StringMethods diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 97368baffd40b..c5cd8390359dc 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -18,7 +18,7 @@ _possibly_infer_to_datetimelike, get_dtype_kinds, is_list_like, is_sequence, is_null_slice, is_bool, _ensure_platform_int, _ensure_object, _ensure_int64, - _coerce_indexer_dtype, _values_from_object, take_1d) + _coerce_indexer_dtype, take_1d) from pandas.util.terminal import get_terminal_size from pandas.core.config import get_option @@ -1310,8 +1310,7 @@ def _repr_categories_info(self): levheader = "Categories (%d, %s): " % (len(self.categories), self.categories.dtype) width, height = get_terminal_size() - max_width = (width if get_option("display.width") == 0 - else get_option("display.width")) + max_width = get_option("display.width") or width if com.in_ipython_frontend(): # 0 = no breaks max_width = 0 diff --git a/pandas/core/common.py b/pandas/core/common.py index 3c92300d1f9a5..1c9326c047a79 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -3322,10 +3322,17 @@ def save(obj, path): # TODO remove in 0.13 def _maybe_match_name(a, b): - a_name = getattr(a, 'name', None) - b_name = getattr(b, 'name', None) - if a_name == b_name: - return a_name + a_has = hasattr(a, 'name') + b_has = hasattr(b, 'name') + if a_has and b_has: + if a.name == b.name: + return a.name + else: + return None + elif a_has: + return a.name + elif b_has: + return b.name return None def _random_state(state=None): diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 7cce560baa1fc..ab6f11a4b8d5b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -26,8 +26,9 @@ from pandas.core.common import (isnull, notnull, PandasError, _try_sort, _default_index, _maybe_upcast, is_sequence, _infer_dtype_from_scalar, _values_from_object, - is_list_like, _get_dtype, _maybe_box_datetimelike, - is_categorical_dtype, is_object_dtype, _possibly_infer_to_datetimelike) + is_list_like, _maybe_box_datetimelike, + is_categorical_dtype, is_object_dtype, + _possibly_infer_to_datetimelike) from pandas.core.generic import NDFrame, _shared_docs from pandas.core.index import Index, MultiIndex, _ensure_index from pandas.core.indexing import 
(maybe_droplevels, @@ -661,6 +662,8 @@ def from_dict(cls, data, orient='columns', dtype=None): The "orientation" of the data. If the keys of the passed dict should be the columns of the resulting DataFrame, pass 'columns' (default). Otherwise if the keys should be rows, pass 'index'. + dtype : dtype, default None + Data type to force, otherwise infer Returns ------- @@ -2148,7 +2151,7 @@ def _setitem_array(self, key, value): def _setitem_frame(self, key, value): # support boolean setting with DataFrame input, e.g. # df[df > df2] = 0 - if key.values.dtype != np.bool_: + if key.values.size and not com.is_bool_dtype(key.values): raise TypeError('Must pass DataFrame with boolean values only') self._check_inplace_setting(value) @@ -2742,7 +2745,7 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None, Parameters ---------- - axis : {0, 1}, or tuple/list thereof + axis : {0 or 'index', 1 or 'columns'}, or tuple/list thereof Pass tuple or list to drop on multiple axes how : {'any', 'all'} * any : if any NA values are present, drop that label @@ -2887,7 +2890,7 @@ def sort(self, columns=None, axis=0, ascending=True, ascending : boolean or list, default True Sort ascending vs. descending. Specify list for multiple sort orders - axis : {0, 1} + axis : {0 or 'index', 1 or 'columns'}, default 0 Sort index/rows versus columns inplace : boolean, default False Sort the DataFrame without creating a new instance @@ -2916,7 +2919,7 @@ def sort_index(self, axis=0, by=None, ascending=True, inplace=False, Parameters ---------- - axis : {0, 1} + axis : {0 or 'index', 1 or 'columns'}, default 0 Sort index/rows versus columns by : object Column name(s) in frame. Accepts a column name or a list @@ -3024,7 +3027,7 @@ def sortlevel(self, level=0, axis=0, ascending=True, Parameters ---------- level : int - axis : {0, 1} + axis : {0 or 'index', 1 or 'columns'}, default 0 ascending : boolean, default True inplace : boolean, default False Sort the DataFrame without creating a new instance @@ -3636,9 +3639,9 @@ def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None, ---------- func : function Function to apply to each column/row - axis : {0, 1} - * 0 : apply function to each column - * 1 : apply function to each row + axis : {0 or 'index', 1 or 'columns'}, default 0 + * 0 or 'index': apply function to each column + * 1 or 'columns': apply function to each row broadcast : boolean, default False For aggregation functions, return object of same size with values propagated @@ -4159,8 +4162,8 @@ def corrwith(self, other, axis=0, drop=False): Parameters ---------- other : DataFrame - axis : {0, 1} - 0 to compute column-wise, 1 for row-wise + axis : {0 or 'index', 1 or 'columns'}, default 0 + 0 or 'index' to compute column-wise, 1 or 'columns' for row-wise drop : boolean, default False Drop missing indices from result, default returns union of all @@ -4211,8 +4214,8 @@ def count(self, axis=0, level=None, numeric_only=False): Parameters ---------- - axis : {0, 1} - 0 for row-wise, 1 for column-wise + axis : {0 or 'index', 1 or 'columns'}, default 0 + 0 or 'index' for row-wise, 1 or 'columns' for column-wise level : int or level name, default None If the axis is a MultiIndex (hierarchical), count along a particular level, collapsing into a DataFrame @@ -4365,8 +4368,8 @@ def idxmin(self, axis=0, skipna=True): Parameters ---------- - axis : {0, 1} - 0 for row-wise, 1 for column-wise + axis : {0 or 'index', 1 or 'columns'}, default 0 + 0 or 'index' for row-wise, 1 or 'columns' for column-wise skipna : 
boolean, default True Exclude NA/null values. If an entire row/column is NA, the result will be NA @@ -4396,8 +4399,8 @@ def idxmax(self, axis=0, skipna=True): Parameters ---------- - axis : {0, 1} - 0 for row-wise, 1 for column-wise + axis : {0 or 'index', 1 or 'columns'}, default 0 + 0 or 'index' for row-wise, 1 or 'columns' for column-wise skipna : boolean, default True Exclude NA/null values. If an entire row/column is NA, the result will be first index. @@ -4443,9 +4446,9 @@ def mode(self, axis=0, numeric_only=False): Parameters ---------- - axis : {0, 1, 'index', 'columns'} (default 0) - * 0/'index' : get mode of each column - * 1/'columns' : get mode of each row + axis : {0 or 'index', 1 or 'columns'}, default 0 + * 0 or 'index' : get mode of each column + * 1 or 'columns' : get mode of each row numeric_only : boolean, default False if True, only apply to numeric columns @@ -4550,7 +4553,7 @@ def rank(self, axis=0, numeric_only=None, method='average', Parameters ---------- - axis : {0, 1}, default 0 + axis : {0 or 'index', 1 or 'columns'}, default 0 Ranks over columns (0) or rows (1) numeric_only : boolean, default None Include only float, int, boolean data @@ -4602,7 +4605,7 @@ def to_timestamp(self, freq=None, how='start', axis=0, copy=True): how : {'s', 'e', 'start', 'end'} Convention for converting period to timestamp; start of period vs. end - axis : {0, 1} default 0 + axis : {0 or 'index', 1 or 'columns'}, default 0 The axis to convert (the index by default) copy : boolean, default True If false then underlying input data is not copied @@ -4633,7 +4636,7 @@ def to_period(self, freq=None, axis=0, copy=True): Parameters ---------- freq : string, default - axis : {0, 1}, default 0 + axis : {0 or 'index', 1 or 'columns'}, default 0 The axis to convert (the index by default) copy : boolean, default True If False then underlying input data is not copied diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 4fb08a7b7e107..d6c7d87bb25b1 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -17,7 +17,7 @@ import pandas.core.common as com import pandas.core.datetools as datetools from pandas import compat -from pandas.compat import map, zip, lrange, string_types, isidentifier, lmap +from pandas.compat import map, zip, lrange, string_types, isidentifier from pandas.core.common import (isnull, notnull, is_list_like, _values_from_object, _maybe_promote, _maybe_box_datetimelike, ABCSeries, @@ -2398,15 +2398,15 @@ def convert_objects(self, convert_dates=True, convert_numeric=False, Parameters ---------- - method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None - Method to use for filling holes in reindexed Series - pad / ffill: propagate last valid observation forward to next valid - backfill / bfill: use NEXT valid observation to fill gap value : scalar, dict, Series, or DataFrame Value to use to fill holes (e.g. 0), alternately a dict/Series/DataFrame of values specifying which value to use for each index (for a Series) or column (for a DataFrame). (values not in the dict/Series/DataFrame will not be filled). This value cannot be a list. + method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None + Method to use for filling holes in reindexed Series + pad / ffill: propagate last valid observation forward to next valid + backfill / bfill: use NEXT valid observation to fill gap axis : %(axes_single_arg)s inplace : boolean, default False If True, fill in place. 
Note: this will modify any @@ -3365,11 +3365,10 @@ def _align_series(self, other, join='outer', axis=None, level=None, level=level, return_indexers=True) - left_result = self._reindex_indexer(join_index, lidx, copy) - right_result = other._reindex_indexer(join_index, ridx, copy) + left = self._reindex_indexer(join_index, lidx, copy) + right = other._reindex_indexer(join_index, ridx, copy) else: - # one has > 1 ndim fdata = self._data if axis == 0: @@ -3399,23 +3398,19 @@ def _align_series(self, other, join='outer', axis=None, level=None, if copy and fdata is self._data: fdata = fdata.copy() - left_result = DataFrame(fdata) + left = DataFrame(fdata) if ridx is None: - right_result = other + right = other else: - right_result = other.reindex(join_index, level=level) + right = other.reindex(join_index, level=level) # fill fill_na = notnull(fill_value) or (method is not None) if fill_na: - return (left_result.fillna(fill_value, method=method, limit=limit, - axis=fill_axis), - right_result.fillna(fill_value, method=method, - limit=limit)) - else: - return (left_result.__finalize__(self), - right_result.__finalize__(other)) + left = left.fillna(fill_value, method=method, limit=limit, axis=fill_axis) + right = right.fillna(fill_value, method=method, limit=limit) + return (left.__finalize__(self), right.__finalize__(other)) _shared_docs['where'] = (""" Return an object of same shape as self and whose corresponding diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 1f76d80c34a90..4abdd1112c721 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -14,7 +14,7 @@ from pandas.core.categorical import Categorical from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame -from pandas.core.index import Index, MultiIndex, CategoricalIndex, _ensure_index, _union_indexes +from pandas.core.index import Index, MultiIndex, CategoricalIndex, _ensure_index from pandas.core.internals import BlockManager, make_block from pandas.core.series import Series from pandas.core.panel import Panel @@ -187,7 +187,7 @@ class Grouper(object): Examples -------- - >>> df.groupby(Grouper(key='A')) : syntatic sugar for df.groupby('A') + >>> df.groupby(Grouper(key='A')) : syntactic sugar for df.groupby('A') >>> df.groupby(Grouper(key='date',freq='60s')) : specify a resample on the column 'date' >>> df.groupby(Grouper(level='date',freq='60s',axis=1)) : specify a resample on the level 'date' on the columns axis with a frequency of 60s @@ -426,7 +426,11 @@ def convert(key, s): return Timestamp(key).asm8 return key - sample = next(iter(self.indices)) + if len(self.indices) > 0: + sample = next(iter(self.indices)) + else: + sample = None # Dummy sample + if isinstance(sample, tuple): if not isinstance(name, tuple): msg = ("must supply a tuple to get_group with multiple" @@ -1496,6 +1500,8 @@ def aggregate(self, values, how, axis=0): if is_datetime_or_timedelta_dtype(values.dtype): values = values.view('int64') + # GH 7754 + is_numeric = True elif is_bool_dtype(values.dtype): values = _algos.ensure_float64(values) elif com.is_integer_dtype(values): @@ -2938,7 +2944,8 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): cd = 'coerce' else: cd = True - return result.convert_objects(convert_dates=cd) + result = result.convert_objects(convert_dates=cd) + return self._reindex_output(result) else: # only coerce dates if we find at least 1 datetime diff --git a/pandas/core/index.py b/pandas/core/index.py index 21f1fed2cd6da..2bd96fcec2e42 100644 --- a/pandas/core/index.py 
+++ b/pandas/core/index.py @@ -8,7 +8,6 @@ from pandas import compat import numpy as np -from math import ceil from sys import getsizeof import pandas.tslib as tslib import pandas.lib as lib @@ -581,8 +580,18 @@ def to_datetime(self, dayfirst=False): return DatetimeIndex(self.values) def _assert_can_do_setop(self, other): + if not com.is_list_like(other): + raise TypeError('Input must be Index or array-like') return True + def _convert_can_do_setop(self, other): + if not isinstance(other, Index): + other = Index(other, name=self.name) + result_name = self.name + else: + result_name = self.name if self.name == other.name else None + return other, result_name + @property def nlevels(self): return 1 @@ -1365,16 +1374,14 @@ def union(self, other): ------- union : Index """ - if not hasattr(other, '__iter__'): - raise TypeError('Input must be iterable.') + self._assert_can_do_setop(other) + other = _ensure_index(other) if len(other) == 0 or self.equals(other): return self if len(self) == 0: - return _ensure_index(other) - - self._assert_can_do_setop(other) + return other if not is_dtype_equal(self.dtype,other.dtype): this = self.astype('O') @@ -1440,11 +1447,7 @@ def intersection(self, other): ------- intersection : Index """ - if not hasattr(other, '__iter__'): - raise TypeError('Input must be iterable!') - self._assert_can_do_setop(other) - other = _ensure_index(other) if self.equals(other): @@ -1493,18 +1496,12 @@ def difference(self, other): >>> index.difference(index2) """ - - if not hasattr(other, '__iter__'): - raise TypeError('Input must be iterable!') + self._assert_can_do_setop(other) if self.equals(other): return Index([], name=self.name) - if not isinstance(other, Index): - other = np.asarray(other) - result_name = self.name - else: - result_name = self.name if self.name == other.name else None + other, result_name = self._convert_can_do_setop(other) theDiff = sorted(set(self) - set(other)) return Index(theDiff, name=result_name) @@ -1518,7 +1515,7 @@ def sym_diff(self, other, result_name=None): Parameters ---------- - other : array-like + other : Index or array-like result_name : str Returns @@ -1546,13 +1543,10 @@ def sym_diff(self, other, result_name=None): >>> idx1 ^ idx2 Int64Index([1, 5], dtype='int64') """ - if not hasattr(other, '__iter__'): - raise TypeError('Input must be iterable!') - - if not isinstance(other, Index): - other = Index(other) - result_name = result_name or self.name - + self._assert_can_do_setop(other) + other, result_name_update = self._convert_can_do_setop(other) + if result_name is None: + result_name = result_name_update the_diff = sorted(set((self.difference(other)).union(other.difference(self)))) return Index(the_diff, name=result_name) @@ -5461,12 +5455,11 @@ def union(self, other): >>> index.union(index2) """ self._assert_can_do_setop(other) + other, result_names = self._convert_can_do_setop(other) if len(other) == 0 or self.equals(other): return self - result_names = self.names if self.names == other.names else None - uniq_tuples = lib.fast_unique_multiple([self.values, other.values]) return MultiIndex.from_arrays(lzip(*uniq_tuples), sortorder=0, names=result_names) @@ -5484,12 +5477,11 @@ def intersection(self, other): Index """ self._assert_can_do_setop(other) + other, result_names = self._convert_can_do_setop(other) if self.equals(other): return self - result_names = self.names if self.names == other.names else None - self_tuples = self.values other_tuples = other.values uniq_tuples = sorted(set(self_tuples) & set(other_tuples)) @@ -5510,18 
+5502,10 @@ def difference(self, other): diff : MultiIndex """ self._assert_can_do_setop(other) + other, result_names = self._convert_can_do_setop(other) - if not isinstance(other, MultiIndex): - if len(other) == 0: + if len(other) == 0: return self - try: - other = MultiIndex.from_tuples(other) - except: - raise TypeError('other must be a MultiIndex or a list of' - ' tuples') - result_names = self.names - else: - result_names = self.names if self.names == other.names else None if self.equals(other): return MultiIndex(levels=[[]] * self.nlevels, @@ -5538,15 +5522,30 @@ def difference(self, other): return MultiIndex.from_tuples(difference, sortorder=0, names=result_names) - def _assert_can_do_setop(self, other): - pass - def astype(self, dtype): if not is_object_dtype(np.dtype(dtype)): raise TypeError('Setting %s dtype to anything other than object ' 'is not supported' % self.__class__) return self._shallow_copy() + def _convert_can_do_setop(self, other): + result_names = self.names + + if not hasattr(other, 'names'): + if len(other) == 0: + other = MultiIndex(levels=[[]] * self.nlevels, + labels=[[]] * self.nlevels, + verify_integrity=False) + else: + msg = 'other must be a MultiIndex or a list of tuples' + try: + other = MultiIndex.from_tuples(other) + except: + raise TypeError(msg) + else: + result_names = self.names if self.names == other.names else None + return other, result_names + def insert(self, loc, item): """ Make new MultiIndex inserting new item at location diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 7c373b0a2b01d..e0f06e22c431b 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1,7 +1,6 @@ # pylint: disable=W0223 -from datetime import datetime -from pandas.core.index import Index, MultiIndex, _ensure_index +from pandas.core.index import Index, MultiIndex from pandas.compat import range, zip import pandas.compat as compat import pandas.core.common as com @@ -10,8 +9,6 @@ is_null_slice, ABCSeries, ABCDataFrame, ABCPanel, is_float, _values_from_object, _infer_fill_value, is_integer) -import pandas.lib as lib - import numpy as np # the supported indexers diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 4121dd8e89bee..c64c50f791edf 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1,7 +1,5 @@ -import sys import itertools import functools - import numpy as np try: @@ -10,7 +8,6 @@ except ImportError: # pragma: no cover _USE_BOTTLENECK = False -import pandas.core.common as com import pandas.hashtable as _hash from pandas import compat, lib, algos, tslib from pandas.compat import builtins @@ -19,11 +16,11 @@ ensure_float, _ensure_float64, _ensure_int64, _ensure_object, is_float, is_integer, is_complex, - is_float_dtype, is_floating_dtype, + is_float_dtype, is_complex_dtype, is_integer_dtype, is_bool_dtype, is_object_dtype, is_datetime64_dtype, is_timedelta64_dtype, - is_datetime_or_timedelta_dtype, + is_datetime_or_timedelta_dtype, _get_dtype, is_int_or_datetime_dtype, is_any_int_dtype) @@ -257,8 +254,16 @@ def nansum(values, axis=None, skipna=True): @bottleneck_switch() def nanmean(values, axis=None, skipna=True): values, mask, dtype, dtype_max = _get_values(values, skipna, 0) - the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_max)) - count = _get_counts(mask, axis) + + dtype_sum = dtype_max + dtype_count = np.float64 + if is_integer_dtype(dtype): + dtype_sum = np.float64 + elif is_float_dtype(dtype): + dtype_sum = dtype + dtype_count = dtype + count = _get_counts(mask, axis, dtype=dtype_count) + 
the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_sum)) if axis is not None and getattr(the_sum, 'ndim', False): the_mean = the_sum / count @@ -368,7 +373,7 @@ def nansem(values, axis=None, skipna=True, ddof=1): var = nanvar(values, axis, skipna, ddof=ddof) mask = isnull(values) - if not is_floating_dtype(values): + if not is_float_dtype(values.dtype): values = values.astype('f8') count, _ = _get_counts_nanvar(mask, axis, ddof) @@ -462,7 +467,7 @@ def nanargmin(values, axis=None, skipna=True): def nanskew(values, axis=None, skipna=True): mask = isnull(values) - if not is_floating_dtype(values): + if not is_float_dtype(values.dtype): values = values.astype('f8') count = _get_counts(mask, axis) @@ -497,7 +502,7 @@ def nanskew(values, axis=None, skipna=True): def nankurt(values, axis=None, skipna=True): mask = isnull(values) - if not is_floating_dtype(values): + if not is_float_dtype(values.dtype): values = values.astype('f8') count = _get_counts(mask, axis) @@ -560,15 +565,16 @@ def _maybe_arg_null_out(result, axis, mask, skipna): return result -def _get_counts(mask, axis): +def _get_counts(mask, axis, dtype=float): + dtype = _get_dtype(dtype) if axis is None: - return float(mask.size - mask.sum()) + return dtype.type(mask.size - mask.sum()) count = mask.shape[axis] - mask.sum(axis) try: - return count.astype(float) + return count.astype(dtype) except AttributeError: - return np.array(count, dtype=float) + return np.array(count, dtype=dtype) def _maybe_null_out(result, axis, mask): diff --git a/pandas/core/ops.py b/pandas/core/ops.py index a4c9bff3dd97f..0b62eb1e53ddb 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -213,7 +213,7 @@ def add_flex_arithmetic_methods(cls, flex_arith_method, radd_func=None, Parameters ---------- - flex_arith_method : function (optional) + flex_arith_method : function factory for special arithmetic methods, with op string: f(op, name, str_rep, default_axis=None, fill_zeros=None, **eval_kwargs) radd_func : function (optional) @@ -703,12 +703,35 @@ def _radd_compat(left, right): return output +_op_descriptions = {'add': {'op': '+', 'desc': 'Addition', 'reversed': False, 'reverse': 'radd'}, + 'sub': {'op': '-', 'desc': 'Subtraction', 'reversed': False, 'reverse': 'rsub'}, + 'mul': {'op': '*', 'desc': 'Multiplication', 'reversed': False, 'reverse': 'rmul'}, + 'mod': {'op': '%', 'desc': 'Modulo', 'reversed': False, 'reverse': 'rmod'}, + 'pow': {'op': '**', 'desc': 'Exponential power', 'reversed': False, 'reverse': 'rpow'}, + 'truediv': {'op': '/', 'desc': 'Floating division', 'reversed': False, 'reverse': 'rtruediv'}, + 'floordiv': {'op': '//', 'desc': 'Integer division', 'reversed': False, 'reverse': 'rfloordiv'}} + +_op_names = list(_op_descriptions.keys()) +for k in _op_names: + reverse_op = _op_descriptions[k]['reverse'] + _op_descriptions[reverse_op] = _op_descriptions[k].copy() + _op_descriptions[reverse_op]['reversed'] = True + _op_descriptions[reverse_op]['reverse'] = k def _flex_method_SERIES(op, name, str_rep, default_axis=None, fill_zeros=None, **eval_kwargs): + op_name = name.replace('__', '') + op_desc = _op_descriptions[op_name] + if op_desc['reversed']: + equiv = 'other ' + op_desc['op'] + ' series' + else: + equiv = 'series ' + op_desc['op'] + ' other' + doc = """ - Binary operator %s with support to substitute a fill_value for missing data - in one of the inputs + %s of series and other, element-wise (binary operator `%s`). + + Equivalent to ``%s``, but with support to substitute a fill_value for + missing data in one of the inputs. 
Parameters ---------- @@ -723,7 +746,11 @@ def _flex_method_SERIES(op, name, str_rep, default_axis=None, Returns ------- result : Series - """ % name + + See also + -------- + Series.%s + """ % (op_desc['desc'], op_name, equiv, op_desc['reverse']) @Appender(doc) def flex_wrapper(self, other, level=None, fill_value=None, axis=0): @@ -813,7 +840,48 @@ def na_op(x, y): return result - @Appender(_arith_doc_FRAME % name) + if name in _op_descriptions: + op_name = name.replace('__', '') + op_desc = _op_descriptions[op_name] + if op_desc['reversed']: + equiv = 'other ' + op_desc['op'] + ' dataframe' + else: + equiv = 'dataframe ' + op_desc['op'] + ' other' + + doc = """ + %s of dataframe and other, element-wise (binary operator `%s`). + + Equivalent to ``%s``, but with support to substitute a fill_value for + missing data in one of the inputs. + + Parameters + ---------- + other : Series, DataFrame, or constant + axis : {0, 1, 'index', 'columns'} + For Series input, axis to match Series index on + fill_value : None or float value, default None + Fill missing (NaN) values with this value. If both DataFrame locations are + missing, the result will be missing + level : int or name + Broadcast across a level, matching Index values on the + passed MultiIndex level + + Notes + ----- + Mismatched indices will be unioned together + + Returns + ------- + result : DataFrame + + See also + -------- + DataFrame.%s + """ % (op_desc['desc'], op_name, equiv, op_desc['reverse']) + else: + doc = _arith_doc_FRAME % name + + @Appender(doc) def f(self, other, axis=default_axis, level=None, fill_value=None): if isinstance(other, pd.DataFrame): # Another DataFrame return self._combine_frame(other, na_op, fill_value, level) diff --git a/pandas/core/panel.py b/pandas/core/panel.py index 2cd2412cfac66..580510829baff 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -6,7 +6,6 @@ from pandas.compat import (map, zip, range, lrange, lmap, u, OrderedDict, OrderedDefaultdict) from pandas import compat -import sys import warnings import numpy as np from pandas.core.common import (PandasError, _try_sort, _default_index, @@ -27,9 +26,10 @@ deprecate_kwarg) import pandas.core.common as com import pandas.core.ops as ops -import pandas.core.nanops as nanops import pandas.computation.expressions as expressions from pandas import lib +from pandas.core.ops import _op_descriptions + _shared_doc_kwargs = dict( axes='items, major_axis, minor_axis', @@ -239,7 +239,8 @@ def from_dict(cls, data, intersect=False, orient='items', dtype=None): (default). 
Otherwise if the columns of the values of the passed DataFrame objects should be the items (which in the case of mixed-dtype data you should do), instead pass 'minor' - + dtype : dtype, default None + Data type to force, otherwise infer Returns ------- @@ -1383,6 +1384,7 @@ def _homogenize_dict(self, frames, intersect=True, dtype=None): result[key] = None axes_dict['data'] = result + axes_dict['dtype'] = dtype return axes_dict @staticmethod @@ -1437,7 +1439,7 @@ def _add_aggregate_operations(cls, use_numexpr=True): ---------- other : %s or %s""" % (cls._constructor_sliced.__name__, cls.__name__) + """ axis : {""" + ', '.join(cls._AXIS_ORDERS) + "}" + """ -Axis to broadcast over + Axis to broadcast over Returns ------- @@ -1459,8 +1461,36 @@ def na_op(x, y): result = com._fill_zeros(result, x, y, name, fill_zeros) return result - @Substitution(name) - @Appender(_agg_doc) + if name in _op_descriptions: + op_name = name.replace('__', '') + op_desc = _op_descriptions[op_name] + if op_desc['reversed']: + equiv = 'other ' + op_desc['op'] + ' panel' + else: + equiv = 'panel ' + op_desc['op'] + ' other' + + _op_doc = """ + %%s of series and other, element-wise (binary operator `%%s`). + Equivalent to ``%%s``. + + Parameters + ---------- + other : %s or %s""" % (cls._constructor_sliced.__name__, cls.__name__) + """ + axis : {""" + ', '.join(cls._AXIS_ORDERS) + "}" + """ + Axis to broadcast over + + Returns + ------- + """ + cls.__name__ + """ + + See also + -------- + """ + cls.__name__ + ".%s\n" + doc = _op_doc % (op_desc['desc'], op_name, equiv, op_desc['reverse']) + else: + doc = _agg_doc % name + + @Appender(doc) def f(self, other, axis=0): return self._combine(other, na_op, axis=axis) f.__name__ = name diff --git a/pandas/core/panelnd.py b/pandas/core/panelnd.py index d021cb2d59ecf..35e6412efc760 100644 --- a/pandas/core/panelnd.py +++ b/pandas/core/panelnd.py @@ -1,6 +1,5 @@ """ Factory methods to create N-D panels """ -import pandas.lib as lib from pandas.compat import zip import pandas.compat as compat diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 9a812ec71b9a2..3225b4aa33ac2 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -14,8 +14,7 @@ from pandas._sparse import IntIndex from pandas.core.categorical import Categorical -from pandas.core.common import (notnull, _ensure_platform_int, _maybe_promote, - isnull) +from pandas.core.common import notnull, _ensure_platform_int, _maybe_promote from pandas.core.groupby import get_group_index, _compress_group_index import pandas.core.common as com diff --git a/pandas/core/series.py b/pandas/core/series.py index 95b6a6aa1e7dd..c54bd96f64c73 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -19,8 +19,8 @@ is_list_like, _values_from_object, _possibly_cast_to_datetime, _possibly_castable, _possibly_convert_platform, _try_sort, - ABCSparseArray, _maybe_match_name, _coerce_to_dtype, - _ensure_object, SettingWithCopyError, + ABCSparseArray, _maybe_match_name, + _coerce_to_dtype, SettingWithCopyError, _maybe_box_datetimelike, ABCDataFrame) from pandas.core.index import (Index, MultiIndex, InvalidIndexError, _ensure_index) @@ -1442,7 +1442,7 @@ def searchsorted(self, v, side='left', sorter=None): def append(self, to_append, verify_integrity=False): """ - Concatenate two or more Series. The indexes must not overlap + Concatenate two or more Series. 
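The `axes_dict['dtype'] = dtype` line above is the functional half of the `Panel.from_dict` fix: together with the new docstring entry, the `dtype` argument is now both documented and actually forwarded to the constructor instead of being dropped in `_homogenize_dict` (the `Series.append` docstring this note interrupts continues just below). A usage sketch against the pandas version this diff targets; `Panel` has since been removed from modern pandas:

    import numpy as np
    import pandas as pd

    df1 = pd.DataFrame(np.random.randn(2, 5))
    df2 = pd.DataFrame(np.random.randn(2, 5))

    # dtype now flows through _homogenize_dict into the Panel constructor
    panel = pd.Panel.from_dict({'a': df1, 'b': df2}, dtype='float32')
    assert panel['a'].values.dtype == np.float32

The constructor test added in pandas/tests/test_panel.py further down exercises the same path for several dtypes.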
Parameters ---------- @@ -1508,7 +1508,12 @@ def _binop(self, other, func, level=None, fill_value=None): result = func(this_vals, other_vals) name = _maybe_match_name(self, other) - return self._constructor(result, index=new_index).__finalize__(self) + result = self._constructor(result, index=new_index, name=name) + result = result.__finalize__(self) + if name is None: + # When name is None, __finalize__ overwrites current name + result.name = None + return result def combine(self, other, func, fill_value=nan): """ diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 8da43c18b989f..f4ac0166cf44b 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -813,7 +813,7 @@ def str_strip(arr, to_strip=None, side='both'): def str_wrap(arr, width, **kwargs): - """ + r""" Wrap long strings in the Series/Index to be formatted in paragraphs with length less than a given width. diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx index 8bdcfb44242ff..c4cd788216018 100644 --- a/pandas/hashtable.pyx +++ b/pandas/hashtable.pyx @@ -211,7 +211,6 @@ cdef class StringHashTable(HashTable): def unique(self, ndarray[object] values): cdef: Py_ssize_t i, n = len(values) - Py_ssize_t idx, count = 0 int ret = 0 object val char *buf @@ -223,12 +222,9 @@ cdef class StringHashTable(HashTable): buf = util.get_c_string(val) k = kh_get_str(self.table, buf) if k == self.table.n_buckets: - k = kh_put_str(self.table, buf, &ret) - # print 'putting %s, %s' % (val, count) - count += 1 + kh_put_str(self.table, buf, &ret) uniques.append(val) - # return None return uniques.to_array() def factorize(self, ndarray[object] values): @@ -258,7 +254,6 @@ cdef class StringHashTable(HashTable): labels[i] = count count += 1 - # return None return reverse, labels cdef class Int32HashTable(HashTable): @@ -319,7 +314,6 @@ cdef class Int32HashTable(HashTable): def lookup(self, ndarray[int32_t] values): cdef: Py_ssize_t i, n = len(values) - int ret = 0 int32_t val khiter_t k ndarray[int32_t] locs = np.empty(n, dtype=np.int64) @@ -357,7 +351,6 @@ cdef class Int32HashTable(HashTable): labels[i] = count count += 1 - # return None return reverse, labels cdef class Int64HashTable: #(HashTable): @@ -518,7 +511,6 @@ cdef class Int64HashTable: #(HashTable): def unique(self, ndarray[int64_t] values): cdef: Py_ssize_t i, n = len(values) - Py_ssize_t idx, count = 0 int ret = 0 ndarray result int64_t val @@ -529,9 +521,8 @@ cdef class Int64HashTable: #(HashTable): val = values[i] k = kh_get_int64(self.table, val) if k == self.table.n_buckets: - k = kh_put_int64(self.table, val, &ret) + kh_put_int64(self.table, val, &ret) uniques.append(val) - count += 1 result = uniques.to_array() @@ -644,7 +635,6 @@ cdef class Float64HashTable(HashTable): def unique(self, ndarray[float64_t] values): cdef: Py_ssize_t i, n = len(values) - Py_ssize_t idx, count = 0 int ret = 0 float64_t val khiter_t k @@ -657,9 +647,8 @@ cdef class Float64HashTable(HashTable): if val == val: k = kh_get_float64(self.table, val) if k == self.table.n_buckets: - k = kh_put_float64(self.table, val, &ret) + kh_put_float64(self.table, val, &ret) uniques.append(val) - count += 1 elif not seen_na: seen_na = 1 uniques.append(ONAN) @@ -786,7 +775,6 @@ cdef class PyObjectHashTable(HashTable): def unique(self, ndarray[object] values): cdef: Py_ssize_t i, n = len(values) - Py_ssize_t idx, count = 0 int ret = 0 object val ndarray result @@ -800,7 +788,7 @@ cdef class PyObjectHashTable(HashTable): if not _checknan(val): k = kh_get_pymap(self.table, val) if k == 
self.table.n_buckets: - k = kh_put_pymap(self.table, val, &ret) + kh_put_pymap(self.table, val, &ret) uniques.append(val) elif not seen_na: seen_na = 1 @@ -918,7 +906,7 @@ cdef class Int64Factorizer: cdef build_count_table_int64(ndarray[int64_t] values, kh_int64_t *table): cdef: - int k + khiter_t k Py_ssize_t i, n = len(values) int ret = 0 @@ -938,7 +926,6 @@ cpdef value_count_int64(ndarray[int64_t] values): cdef: Py_ssize_t i kh_int64_t *table - int ret = 0 int k table = kh_init_int64() @@ -961,7 +948,7 @@ cdef build_count_table_object(ndarray[object] values, ndarray[uint8_t, cast=True] mask, kh_pymap_t *table): cdef: - int k + khiter_t k Py_ssize_t i, n = len(values) int ret = 0 @@ -983,7 +970,7 @@ cdef build_count_table_object(ndarray[object] values, cpdef value_count_object(ndarray[object] values, ndarray[uint8_t, cast=True] mask): cdef: - Py_ssize_t i = len(values) + Py_ssize_t i kh_pymap_t *table int k @@ -1008,9 +995,7 @@ def mode_object(ndarray[object] values, ndarray[uint8_t, cast=True] mask): int count, max_count = 2 int j = -1 # so you can do += int k - Py_ssize_t i, n = len(values) kh_pymap_t *table - int ret = 0 table = kh_init_pymap() build_count_table_object(values, mask, table) @@ -1036,11 +1021,10 @@ def mode_object(ndarray[object] values, ndarray[uint8_t, cast=True] mask): def mode_int64(ndarray[int64_t] values): cdef: - int val, max_val = 2 + int count, max_count = 2 int j = -1 # so you can do += int k kh_int64_t *table - list uniques = [] table = kh_init_int64() @@ -1049,12 +1033,12 @@ def mode_int64(ndarray[int64_t] values): modes = np.empty(table.n_buckets, dtype=np.int64) for k in range(table.n_buckets): if kh_exist_int64(table, k): - val = table.vals[k] + count = table.vals[k] - if val == max_val: + if count == max_count: j += 1 - elif val > max_val: - max_val = val + elif count > max_count: + max_count = count j = 0 else: continue diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 458a245da6bdb..4cbc7aeaa3df7 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -3453,6 +3453,10 @@ def get_blk_items(mgr, blocks): def process_axes(self, obj, columns=None): """ process axes filters """ + # make a copy to avoid side effects + if columns is not None: + columns = list(columns) + # make sure to include levels if we have them if columns is not None and self.is_multi_index: for n in self.levels: diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 6cfd569904097..7d9c3c051344f 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -4617,6 +4617,29 @@ def test_preserve_timedeltaindex_type(self): store['df'] = df assert_frame_equal(store['df'], df) + def test_colums_multiindex_modified(self): + # BUG: 7212 + # read_hdf store.select modified the passed columns parameters + # when multi-indexed. 
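The copy added to `process_axes` above fixes GH 7212: when selecting from a table stored with a MultiIndex, the level names were appended straight onto the `columns` list supplied by the caller, mutating it in place. The regression test continues directly below; a condensed sketch of the same round trip, assuming PyTables is installed and `demo.h5` is a scratch path:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(np.random.rand(4, 5), index=list('abcd'),
                      columns=list('ABCDE'))
    df.index.name = 'letters'
    df = df.set_index('E', append=True)          # MultiIndex (letters, E)
    df.to_hdf('demo.h5', 'df', mode='w', append=True,
              data_columns=list(df.index.names) + df.columns.tolist(),
              index=False)

    cols = list('BCD')
    pd.read_hdf('demo.h5', 'df', columns=cols)
    # before the fix, the index level names leaked into the caller's list
    assert cols == list('BCD')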
+ + df = DataFrame(np.random.rand(4, 5), + index=list('abcd'), + columns=list('ABCDE')) + df.index.name = 'letters' + df = df.set_index(keys='E', append=True) + + data_columns = df.index.names+df.columns.tolist() + with ensure_clean_path(self.path) as path: + df.to_hdf(path, 'df', + mode='a', + append=True, + data_columns=data_columns, + index=False) + cols2load = list('BCD') + cols2load_original = list(cols2load) + df_loaded = read_hdf(path, 'df', columns=cols2load) + self.assertTrue(cols2load_original == cols2load) + def _test_sort(obj): if isinstance(obj, DataFrame): diff --git a/pandas/io/tests/test_sql.py b/pandas/io/tests/test_sql.py index fa7debeb228ce..9576f80696350 100644 --- a/pandas/io/tests/test_sql.py +++ b/pandas/io/tests/test_sql.py @@ -2195,6 +2195,13 @@ def setUp(self): "[pandas] in your system's mysql default file, " "typically located at ~/.my.cnf or /etc/.my.cnf. ") + def tearDown(self): + from pymysql.err import Error + try: + self.db.close() + except Error: + pass + def test_basic(self): _skip_if_no_pymysql() frame = tm.makeTimeDataFrame() diff --git a/pandas/rpy/__init__.py b/pandas/rpy/__init__.py index 899b684ecbff9..bad7ebc580ce2 100644 --- a/pandas/rpy/__init__.py +++ b/pandas/rpy/__init__.py @@ -5,7 +5,10 @@ import warnings warnings.warn("The pandas.rpy module is deprecated and will be " "removed in a future version. We refer to external packages " - "like rpy2, found here: http://rpy.sourceforge.net", FutureWarning) + "like rpy2. " + "\nSee here for a guide on how to port your code to rpy2: " + "http://pandas.pydata.org/pandas-docs/stable/r_interface.html", + FutureWarning) try: from .common import importr, r, load_data diff --git a/pandas/sparse/series.py b/pandas/sparse/series.py index 2c328e51b5090..24d06970f4741 100644 --- a/pandas/sparse/series.py +++ b/pandas/sparse/series.py @@ -121,6 +121,9 @@ def __init__(self, data=None, index=None, sparse_index=None, kind='block', if data is None: data = [] + if isinstance(data, Series) and name is None: + name = data.name + is_sparse_array = isinstance(data, SparseArray) if fill_value is None: if is_sparse_array: @@ -399,7 +402,7 @@ def abs(self): res_sp_values = np.abs(self.sp_values) return self._constructor(res_sp_values, index=self.index, sparse_index=self.sp_index, - fill_value=self.fill_value) + fill_value=self.fill_value).__finalize__(self) def get(self, label, default=None): """ diff --git a/pandas/sparse/tests/test_sparse.py b/pandas/sparse/tests/test_sparse.py index dd1d10f3d15ed..96e5ff87fbb0c 100644 --- a/pandas/sparse/tests/test_sparse.py +++ b/pandas/sparse/tests/test_sparse.py @@ -128,14 +128,15 @@ def setUp(self): date_index = bdate_range('1/1/2011', periods=len(index)) - self.bseries = SparseSeries(arr, index=index, kind='block') - self.bseries.name = 'bseries' + self.bseries = SparseSeries(arr, index=index, kind='block', + name='bseries') self.ts = self.bseries self.btseries = SparseSeries(arr, index=date_index, kind='block') - self.iseries = SparseSeries(arr, index=index, kind='integer') + self.iseries = SparseSeries(arr, index=index, kind='integer', + name='iseries') arr, index = _test_data2() self.bseries2 = SparseSeries(arr, index=index, kind='block') @@ -143,7 +144,7 @@ def setUp(self): arr, index = _test_data1_zero() self.zbseries = SparseSeries(arr, index=index, kind='block', - fill_value=0) + fill_value=0, name='zbseries') self.ziseries = SparseSeries(arr, index=index, kind='integer', fill_value=0) @@ -234,12 +235,21 @@ def test_constructor(self): self.bseries.to_dense().fillna(0).values) # 
pass SparseSeries - s2 = SparseSeries(self.bseries) - s3 = SparseSeries(self.iseries) - s4 = SparseSeries(self.zbseries) - assert_sp_series_equal(s2, self.bseries) - assert_sp_series_equal(s3, self.iseries) - assert_sp_series_equal(s4, self.zbseries) + def _check_const(sparse, name): + # use passed series name + result = SparseSeries(sparse) + assert_sp_series_equal(result, sparse) + self.assertEqual(sparse.name, name) + self.assertEqual(result.name, name) + + # use passed name + result = SparseSeries(sparse, name='x') + assert_sp_series_equal(result, sparse) + self.assertEqual(result.name, 'x') + + _check_const(self.bseries, 'bseries') + _check_const(self.iseries, 'iseries') + _check_const(self.zbseries, 'zbseries') # Sparse time series works date_index = bdate_range('1/1/2000', periods=len(self.bseries)) @@ -509,6 +519,21 @@ def _check_inplace_op(iop, op): _check_inplace_op( getattr(operator, "i%s" % op), getattr(operator, op)) + def test_abs(self): + s = SparseSeries([1, 2, -3], name='x') + expected = SparseSeries([1, 2, 3], name='x') + result = s.abs() + assert_sp_series_equal(result, expected) + self.assertEqual(result.name, 'x') + + result = abs(s) + assert_sp_series_equal(result, expected) + self.assertEqual(result.name, 'x') + + result = np.abs(s) + assert_sp_series_equal(result, expected) + self.assertEqual(result.name, 'x') + def test_reindex(self): def _compare_with_series(sps, new_index): spsre = sps.reindex(new_index) diff --git a/pandas/src/generate_code.py b/pandas/src/generate_code.py index a0cdc0ff5e841..598cdff30e4f7 100644 --- a/pandas/src/generate_code.py +++ b/pandas/src/generate_code.py @@ -37,6 +37,8 @@ cimport util from util cimport is_array, _checknull, _checknan, get_nat +cimport lib +from lib cimport is_null_datetimelike cdef int64_t iNaT = get_nat() @@ -673,7 +675,7 @@ def groupby_%(name)s(ndarray[%(c_type)s] index, ndarray labels): for i in range(length): key = util.get_value_1d(labels, i) - if _checknull(key): + if is_null_datetimelike(key): continue idx = index[i] diff --git a/pandas/src/generated.pyx b/pandas/src/generated.pyx index 79722a26ebedc..428decd4dca10 100644 --- a/pandas/src/generated.pyx +++ b/pandas/src/generated.pyx @@ -28,6 +28,8 @@ ctypedef unsigned char UChar cimport util from util cimport is_array, _checknull, _checknan, get_nat +cimport lib +from lib cimport is_null_datetimelike cdef int64_t iNaT = get_nat() @@ -2096,7 +2098,7 @@ def groupby_float64(ndarray[float64_t] index, ndarray labels): for i in range(length): key = util.get_value_1d(labels, i) - if _checknull(key): + if is_null_datetimelike(key): continue idx = index[i] @@ -2124,7 +2126,7 @@ def groupby_float32(ndarray[float32_t] index, ndarray labels): for i in range(length): key = util.get_value_1d(labels, i) - if _checknull(key): + if is_null_datetimelike(key): continue idx = index[i] @@ -2152,7 +2154,7 @@ def groupby_object(ndarray[object] index, ndarray labels): for i in range(length): key = util.get_value_1d(labels, i) - if _checknull(key): + if is_null_datetimelike(key): continue idx = index[i] @@ -2180,7 +2182,7 @@ def groupby_int32(ndarray[int32_t] index, ndarray labels): for i in range(length): key = util.get_value_1d(labels, i) - if _checknull(key): + if is_null_datetimelike(key): continue idx = index[i] @@ -2208,7 +2210,7 @@ def groupby_int64(ndarray[int64_t] index, ndarray labels): for i in range(length): key = util.get_value_1d(labels, i) - if _checknull(key): + if is_null_datetimelike(key): continue idx = index[i] @@ -2236,7 +2238,7 @@ def groupby_bool(ndarray[uint8_t] 
index, ndarray labels): for i in range(length): key = util.get_value_1d(labels, i) - if _checknull(key): + if is_null_datetimelike(key): continue idx = index[i] diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index b91c46377267a..e9526f9fad1ac 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -244,6 +244,26 @@ def check_ops_properties(self, props, filter=None, ignore_failures=False): else: self.assertRaises(AttributeError, lambda : getattr(o,op)) + def test_binary_ops_docs(self): + from pandas import DataFrame, Panel + op_map = {'add': '+', + 'sub': '-', + 'mul': '*', + 'mod': '%', + 'pow': '**', + 'truediv': '/', + 'floordiv': '//'} + for op_name in ['add', 'sub', 'mul', 'mod', 'pow', 'truediv', 'floordiv']: + for klass in [Series, DataFrame, Panel]: + operand1 = klass.__name__.lower() + operand2 = 'other' + op = op_map[op_name] + expected_str = ' '.join([operand1, op, operand2]) + self.assertTrue(expected_str in getattr(klass, op_name).__doc__) + + # reverse version of the binary ops + expected_str = ' '.join([operand2, op, operand1]) + self.assertTrue(expected_str in getattr(klass, 'r' + op_name).__doc__) class TestIndexOps(Ops): diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index c03fd93f6173f..21b64378cfc24 100755 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -521,6 +521,15 @@ def test_empty_print(self): expected = ("[], Categories (0, object): []") self.assertEqual(expected, repr(factor)) + def test_print_none_width(self): + # GH10087 + a = pd.Series(pd.Categorical([1,2,3,4], name="a")) + exp = u("0 1\n1 2\n2 3\n3 4\n" + + "Name: a, dtype: category\nCategories (4, int64): [1, 2, 3, 4]") + + with option_context("display.width", None): + self.assertEqual(exp, repr(a)) + def test_periodindex(self): idx1 = PeriodIndex(['2014-01', '2014-01', '2014-02', '2014-02', '2014-03', '2014-03'], freq='M') diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index 3282a36bda7b8..c3d39fcdf906f 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -545,6 +545,27 @@ def test_random_state(): com._random_state(5.5) +def test_maybe_match_name(): + + matched = com._maybe_match_name(Series([1], name='x'), Series([2], name='x')) + assert(matched == 'x') + + matched = com._maybe_match_name(Series([1], name='x'), Series([2], name='y')) + assert(matched is None) + + matched = com._maybe_match_name(Series([1]), Series([2], name='x')) + assert(matched is None) + + matched = com._maybe_match_name(Series([1], name='x'), Series([2])) + assert(matched is None) + + matched = com._maybe_match_name(Series([1], name='x'), [2]) + assert(matched == 'x') + + matched = com._maybe_match_name([1], Series([2], name='y')) + assert(matched == 'y') + + class TestTake(tm.TestCase): # standard incompatible fill error fill_error = re.compile("Incompatible type for fill_value") diff --git a/pandas/tests/test_format.py b/pandas/tests/test_format.py index fd9d9546ba235..a7129bca59a7f 100644 --- a/pandas/tests/test_format.py +++ b/pandas/tests/test_format.py @@ -14,7 +14,7 @@ from numpy.random import randn import numpy as np -from pandas import DataFrame, Series, Index, Timestamp, MultiIndex +from pandas import DataFrame, Series, Index, Timestamp, MultiIndex, date_range, NaT import pandas.core.format as fmt import pandas.util.testing as tm @@ -2495,7 +2495,7 @@ def test_to_string(self): def test_freq_name_separation(self): s = Series(np.random.randn(10), - 
index=pd.date_range('1/1/2000', periods=10), name=0)
+                   index=date_range('1/1/2000', periods=10), name=0)
         result = repr(s)
         self.assertTrue('Freq: D, Name: 0' in result)
@@ -2556,7 +2556,6 @@ def test_float_trim_zeros(self):
 
     def test_datetimeindex(self):
 
-        from pandas import date_range, NaT
         index = date_range('20130102',periods=6)
         s = Series(1,index=index)
         result = s.to_string()
@@ -2574,7 +2573,6 @@ def test_datetimeindex(self):
 
     def test_timedelta64(self):
 
-        from pandas import date_range
         from datetime import datetime, timedelta
 
         Series(np.array([1100, 20], dtype='timedelta64[ns]')).to_string()
@@ -3179,6 +3177,44 @@ def test_date_nanos(self):
         result = fmt.Datetime64Formatter(x).get_result()
         self.assertEqual(result[0].strip(), "1970-01-01 00:00:00.000000200")
 
+    def test_dates_display(self):
+
+        # 10170
+        # make sure that we consistently display date formatting
+        x = Series(date_range('20130101 09:00:00',periods=5,freq='D'))
+        x.iloc[1] = np.nan
+        result = fmt.Datetime64Formatter(x).get_result()
+        self.assertEqual(result[0].strip(), "2013-01-01 09:00:00")
+        self.assertEqual(result[1].strip(), "NaT")
+        self.assertEqual(result[4].strip(), "2013-01-05 09:00:00")
+
+        x = Series(date_range('20130101 09:00:00',periods=5,freq='s'))
+        x.iloc[1] = np.nan
+        result = fmt.Datetime64Formatter(x).get_result()
+        self.assertEqual(result[0].strip(), "2013-01-01 09:00:00")
+        self.assertEqual(result[1].strip(), "NaT")
+        self.assertEqual(result[4].strip(), "2013-01-01 09:00:04")
+
+        x = Series(date_range('20130101 09:00:00',periods=5,freq='ms'))
+        x.iloc[1] = np.nan
+        result = fmt.Datetime64Formatter(x).get_result()
+        self.assertEqual(result[0].strip(), "2013-01-01 09:00:00.000")
+        self.assertEqual(result[1].strip(), "NaT")
+        self.assertEqual(result[4].strip(), "2013-01-01 09:00:00.004")
+
+        x = Series(date_range('20130101 09:00:00',periods=5,freq='us'))
+        x.iloc[1] = np.nan
+        result = fmt.Datetime64Formatter(x).get_result()
+        self.assertEqual(result[0].strip(), "2013-01-01 09:00:00.000000")
+        self.assertEqual(result[1].strip(), "NaT")
+        self.assertEqual(result[4].strip(), "2013-01-01 09:00:00.000004")
+
+        x = Series(date_range('20130101 09:00:00',periods=5,freq='N'))
+        x.iloc[1] = np.nan
+        result = fmt.Datetime64Formatter(x).get_result()
+        self.assertEqual(result[0].strip(), "2013-01-01 09:00:00.000000000")
+        self.assertEqual(result[1].strip(), "NaT")
+        self.assertEqual(result[4].strip(), "2013-01-01 09:00:00.000000004")
 
 class TestNaTFormatting(tm.TestCase):
     def test_repr(self):
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
index 4964d13f7ac28..f74cb07557342 100644
--- a/pandas/tests/test_frame.py
+++ b/pandas/tests/test_frame.py
@@ -794,6 +794,19 @@ def test_setitem_empty(self):
         result.loc[result.b.isnull(), 'a'] = result.a
         assert_frame_equal(result, df)
 
+    def test_setitem_empty_frame_with_boolean(self):
+        # Test for issue #10126
+
+        for dtype in ('float', 'int64'):
+            for df in [
+                    pd.DataFrame(dtype=dtype),
+                    pd.DataFrame(dtype=dtype, index=[1]),
+                    pd.DataFrame(dtype=dtype, columns=['A']),
+            ]:
+                df2 = df.copy()
+                df[df > df2] = 47
+                assert_frame_equal(df, df2)
+
     def test_delitem_corner(self):
         f = self.frame.copy()
         del f['D']
@@ -2821,7 +2834,7 @@ def custom_frame_function(self):
         data = {'col1': range(10), 'col2': range(10)}
         cdf = CustomDataFrame(data)
-        
+
         # Did we get back our own DF class?
         self.assertTrue(isinstance(cdf, CustomDataFrame))
 
@@ -2833,7 +2846,7 @@ def custom_frame_function(self):
 
         # Do we get back our own DF class after slicing row-wise?
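`test_setitem_empty_frame_with_boolean` above pins down GH 10126: assigning through an all-empty boolean mask should quietly assign nothing rather than fail, for every flavor of empty frame (no axes, only an index, only columns). The subclass-slicing test this note interrupts resumes below. Behavior sketch:

    import pandas as pd

    for df in [pd.DataFrame(dtype='float64'),
               pd.DataFrame(dtype='float64', index=[1]),
               pd.DataFrame(dtype='float64', columns=['A'])]:
        before = df.copy()
        df[df > before] = 47        # empty mask: a no-op, not an error
        assert df.equals(before)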
cdf_rows = cdf[1:5]
         self.assertTrue(isinstance(cdf_rows, CustomDataFrame))
-        self.assertEqual(cdf_rows.custom_frame_function(), 'OK')
+        self.assertEqual(cdf_rows.custom_frame_function(), 'OK')
 
         # Make sure sliced part of multi-index frame is custom class
         mcol = pd.MultiIndex.from_tuples([('A', 'A'), ('A', 'B')])
diff --git a/pandas/tests/test_graphics.py b/pandas/tests/test_graphics.py
index 33c88b0e3b4b7..82f4b8c05ca06 100644
--- a/pandas/tests/test_graphics.py
+++ b/pandas/tests/test_graphics.py
@@ -439,6 +439,38 @@ def _check_box_return_type(self, returned, return_type, expected_keys=None,
         else:
             raise AssertionError
 
+    def _check_grid_settings(self, obj, kinds, kws={}):
+        # Make sure plot defaults to rcParams['axes.grid'] setting, GH 9792
+
+        import matplotlib as mpl
+
+        def is_grid_on():
+            xoff = all(not g.gridOn for g in self.plt.gca().xaxis.get_major_ticks())
+            yoff = all(not g.gridOn for g in self.plt.gca().yaxis.get_major_ticks())
+            return not(xoff and yoff)
+
+        spndx=1
+        for kind in kinds:
+            self.plt.subplot(1,4*len(kinds),spndx); spndx+=1
+            mpl.rc('axes',grid=False)
+            obj.plot(kind=kind, **kws)
+            self.assertFalse(is_grid_on())
+
+            self.plt.subplot(1,4*len(kinds),spndx); spndx+=1
+            mpl.rc('axes',grid=True)
+            obj.plot(kind=kind, grid=False, **kws)
+            self.assertFalse(is_grid_on())
+
+            if kind != 'pie':
+                self.plt.subplot(1,4*len(kinds),spndx); spndx+=1
+                mpl.rc('axes',grid=True)
+                obj.plot(kind=kind, **kws)
+                self.assertTrue(is_grid_on())
+
+                self.plt.subplot(1,4*len(kinds),spndx); spndx+=1
+                mpl.rc('axes',grid=False)
+                obj.plot(kind=kind, grid=True, **kws)
+                self.assertTrue(is_grid_on())
 
 @tm.mplskip
 class TestSeriesPlots(TestPlotBase):
@@ -553,6 +585,29 @@ def test_ts_area_lim(self):
         self.assertEqual(xmin, line[0])
         self.assertEqual(xmax, line[-1])
 
+    def test_label(self):
+        s = Series([1, 2])
+        ax = s.plot(label='LABEL', legend=True)
+        self._check_legend_labels(ax, labels=['LABEL'])
+        self.plt.close()
+        ax = s.plot(legend=True)
+        self._check_legend_labels(ax, labels=['None'])
+        self.plt.close()
+        # get name from index
+        s.name = 'NAME'
+        ax = s.plot(legend=True)
+        self._check_legend_labels(ax, labels=['NAME'])
+        self.plt.close()
+        # override the default
+        ax = s.plot(legend=True, label='LABEL')
+        self._check_legend_labels(ax, labels=['LABEL'])
+        self.plt.close()
+        # Add label info, but don't draw
+        ax = s.plot(legend=False, label='LABEL')
+        self.assertEqual(ax.get_legend(), None)  # Hasn't been drawn
+        ax.legend()  # draw it
+        self._check_legend_labels(ax, labels=['LABEL'])
+
     def test_line_area_nan_series(self):
         values = [1, 2, np.nan, 3]
         s = Series(values)
@@ -1085,6 +1140,12 @@ def test_table(self):
         _check_plot_works(self.series.plot, table=True)
         _check_plot_works(self.series.plot, table=self.series)
 
+    @slow
+    def test_series_grid_settings(self):
+        # Make sure plot defaults to rcParams['axes.grid'] setting, GH 9792
+        self._check_grid_settings(Series([1,2,3]),
+                                  plotting._series_kinds + plotting._common_kinds)
+
 @tm.mplskip
 class TestDataFramePlots(TestPlotBase):
@@ -3403,6 +3464,12 @@ def test_sharey_and_ax(self):
                             "y label is invisible but shouldn't")
 
+    @slow
+    def test_df_grid_settings(self):
+        # Make sure plot defaults to rcParams['axes.grid'] setting, GH 9792
+        self._check_grid_settings(DataFrame({'a':[1,2,3],'b':[2,3,4]}),
+                                  plotting._dataframe_kinds, kws={'x':'a','y':'b'})
+
 @tm.mplskip
 class TestDataFrameGroupByPlots(TestPlotBase):
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
index c308308603167..ab78bd63a7c94 100644
--- a/pandas/tests/test_groupby.py
+++ b/pandas/tests/test_groupby.py @@ -699,7 +699,6 @@ def test_get_group(self): expected = wp.reindex(major=[x for x in wp.major_axis if x.month == 1]) assert_panel_equal(gp, expected) - # GH 5267 # be datelike friendly df = DataFrame({'DATE' : pd.to_datetime(['10-Oct-2013', '10-Oct-2013', '10-Oct-2013', @@ -2596,6 +2595,35 @@ def get_stats(group): result = self.df.groupby(cats).D.apply(get_stats) self.assertEqual(result.index.names[0], 'C') + def test_apply_categorical_data(self): + # GH 10138 + for ordered in [True, False]: + dense = Categorical(list('abc'), ordered=ordered) + # 'b' is in the categories but not in the list + missing = Categorical(list('aaa'), categories=['a', 'b'], ordered=ordered) + values = np.arange(len(dense)) + df = DataFrame({'missing': missing, + 'dense': dense, + 'values': values}) + grouped = df.groupby(['missing', 'dense']) + + # missing category 'b' should still exist in the output index + idx = MultiIndex.from_product([['a', 'b'], ['a', 'b', 'c']], + names=['missing', 'dense']) + expected = DataFrame([0, 1, 2, np.nan, np.nan, np.nan], + index=idx, + columns=['values']) + + assert_frame_equal(grouped.apply(lambda x: np.mean(x)), expected) + assert_frame_equal(grouped.mean(), expected) + assert_frame_equal(grouped.agg(np.mean), expected) + + # but for transform we should still get back the original index + idx = MultiIndex.from_product([['a'], ['a', 'b', 'c']], + names=['missing', 'dense']) + expected = Series(1, index=idx) + assert_series_equal(grouped.apply(lambda x: 1), expected) + def test_apply_corner_cases(self): # #535, can't use sliding iterator @@ -2837,6 +2865,49 @@ def test_groupby_list_infer_array_like(self): result = df.groupby(['foo', 'bar']).mean() expected = df.groupby([df['foo'], df['bar']]).mean()[['val']] + def test_groupby_nat_exclude(self): + # GH 6992 + df = pd.DataFrame({'values': np.random.randn(8), + 'dt': [np.nan, pd.Timestamp('2013-01-01'), np.nan, pd.Timestamp('2013-02-01'), + np.nan, pd.Timestamp('2013-02-01'), np.nan, pd.Timestamp('2013-01-01')], + 'str': [np.nan, 'a', np.nan, 'a', + np.nan, 'a', np.nan, 'b']}) + grouped = df.groupby('dt') + + expected = [[1, 7], [3, 5]] + keys = sorted(grouped.groups.keys()) + self.assertEqual(len(keys), 2) + for k, e in zip(keys, expected): + # grouped.groups keys are np.datetime64 with system tz + # not to be affected by tz, only compare values + self.assertEqual(grouped.groups[k], e) + + # confirm obj is not filtered + tm.assert_frame_equal(grouped.grouper.groupings[0].obj, df) + self.assertEqual(grouped.ngroups, 2) + expected = {Timestamp('2013-01-01 00:00:00'): np.array([1, 7]), + Timestamp('2013-02-01 00:00:00'): np.array([3, 5])} + for k in grouped.indices: + self.assert_numpy_array_equal(grouped.indices[k], expected[k]) + + tm.assert_frame_equal(grouped.get_group(Timestamp('2013-01-01')), df.iloc[[1, 7]]) + tm.assert_frame_equal(grouped.get_group(Timestamp('2013-02-01')), df.iloc[[3, 5]]) + + self.assertRaises(KeyError, grouped.get_group, pd.NaT) + + nan_df = DataFrame({'nan': [np.nan, np.nan, np.nan], + 'nat': [pd.NaT, pd.NaT, pd.NaT]}) + self.assertEqual(nan_df['nan'].dtype, 'float64') + self.assertEqual(nan_df['nat'].dtype, 'datetime64[ns]') + + for key in ['nan', 'nat']: + grouped = nan_df.groupby(key) + self.assertEqual(grouped.groups, {}) + self.assertEqual(grouped.ngroups, 0) + self.assertEqual(grouped.indices, {}) + self.assertRaises(KeyError, grouped.get_group, np.nan) + self.assertRaises(KeyError, grouped.get_group, pd.NaT) + def test_dictify(self): 
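`test_groupby_nat_exclude` above (GH 6992) establishes that NaT keys are excluded from groups exactly the way NaN keys already were, while leaving the underlying frame untouched; `test_dictify` resumes below. Sketch:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'dt': [pd.NaT, pd.Timestamp('2013-01-01'),
                              pd.NaT, pd.Timestamp('2013-02-01')],
                       'values': np.arange(4.0)})
    grouped = df.groupby('dt')
    assert grouped.ngroups == 2      # the NaT rows form no group
    # grouped.get_group(pd.NaT) raises KeyError, as the test asserts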
dict(iter(self.df.groupby('A'))) dict(iter(self.df.groupby(['A', 'B']))) diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index 444aa2a0bab1e..ed84c9764dd84 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -251,6 +251,136 @@ def test_take(self): expected = ind[indexer] self.assertTrue(result.equals(expected)) + def test_setops_errorcases(self): + for name, idx in compat.iteritems(self.indices): + # # non-iterable input + cases = [0.5, 'xxx'] + methods = [idx.intersection, idx.union, idx.difference, idx.sym_diff] + + for method in methods: + for case in cases: + assertRaisesRegexp(TypeError, + "Input must be Index or array-like", + method, case) + + def test_intersection_base(self): + for name, idx in compat.iteritems(self.indices): + first = idx[:5] + second = idx[:3] + intersect = first.intersection(second) + + if isinstance(idx, CategoricalIndex): + pass + else: + self.assertTrue(tm.equalContents(intersect, second)) + + # GH 10149 + cases = [klass(second.values) for klass in [np.array, Series, list]] + for case in cases: + if isinstance(idx, PeriodIndex): + msg = "can only call with other PeriodIndex-ed objects" + with tm.assertRaisesRegexp(ValueError, msg): + result = first.intersection(case) + elif isinstance(idx, CategoricalIndex): + pass + else: + result = first.intersection(case) + self.assertTrue(tm.equalContents(result, second)) + + if isinstance(idx, MultiIndex): + msg = "other must be a MultiIndex or a list of tuples" + with tm.assertRaisesRegexp(TypeError, msg): + result = first.intersection([1, 2, 3]) + + def test_union_base(self): + for name, idx in compat.iteritems(self.indices): + first = idx[3:] + second = idx[:5] + everything = idx + union = first.union(second) + self.assertTrue(tm.equalContents(union, everything)) + + # GH 10149 + cases = [klass(second.values) for klass in [np.array, Series, list]] + for case in cases: + if isinstance(idx, PeriodIndex): + msg = "can only call with other PeriodIndex-ed objects" + with tm.assertRaisesRegexp(ValueError, msg): + result = first.union(case) + elif isinstance(idx, CategoricalIndex): + pass + else: + result = first.union(case) + self.assertTrue(tm.equalContents(result, everything)) + + if isinstance(idx, MultiIndex): + msg = "other must be a MultiIndex or a list of tuples" + with tm.assertRaisesRegexp(TypeError, msg): + result = first.union([1, 2, 3]) + + def test_difference_base(self): + for name, idx in compat.iteritems(self.indices): + first = idx[2:] + second = idx[:4] + answer = idx[4:] + result = first.difference(second) + + if isinstance(idx, CategoricalIndex): + pass + else: + self.assertTrue(tm.equalContents(result, answer)) + + # GH 10149 + cases = [klass(second.values) for klass in [np.array, Series, list]] + for case in cases: + if isinstance(idx, PeriodIndex): + msg = "can only call with other PeriodIndex-ed objects" + with tm.assertRaisesRegexp(ValueError, msg): + result = first.difference(case) + elif isinstance(idx, CategoricalIndex): + pass + elif isinstance(idx, (DatetimeIndex, TimedeltaIndex)): + self.assertEqual(result.__class__, answer.__class__) + self.assert_numpy_array_equal(result.asi8, answer.asi8) + else: + result = first.difference(case) + self.assertTrue(tm.equalContents(result, answer)) + + if isinstance(idx, MultiIndex): + msg = "other must be a MultiIndex or a list of tuples" + with tm.assertRaisesRegexp(TypeError, msg): + result = first.difference([1, 2, 3]) + + def test_symmetric_diff(self): + for name, idx in compat.iteritems(self.indices): + first 
= idx[1:] + second = idx[:-1] + if isinstance(idx, CategoricalIndex): + pass + else: + answer = idx[[0, -1]] + result = first.sym_diff(second) + self.assertTrue(tm.equalContents(result, answer)) + + # GH 10149 + cases = [klass(second.values) for klass in [np.array, Series, list]] + for case in cases: + if isinstance(idx, PeriodIndex): + msg = "can only call with other PeriodIndex-ed objects" + with tm.assertRaisesRegexp(ValueError, msg): + result = first.sym_diff(case) + elif isinstance(idx, CategoricalIndex): + pass + else: + result = first.sym_diff(case) + self.assertTrue(tm.equalContents(result, answer)) + + if isinstance(idx, MultiIndex): + msg = "other must be a MultiIndex or a list of tuples" + with tm.assertRaisesRegexp(TypeError, msg): + result = first.sym_diff([1, 2, 3]) + + class TestIndex(Base, tm.TestCase): _holder = Index _multiprocess_can_split_ = True @@ -620,16 +750,12 @@ def test_intersection(self): first = self.strIndex[:20] second = self.strIndex[:10] intersect = first.intersection(second) - self.assertTrue(tm.equalContents(intersect, second)) # Corner cases inter = first.intersection(first) self.assertIs(inter, first) - # non-iterable input - assertRaisesRegexp(TypeError, "iterable", first.intersection, 0.5) - idx1 = Index([1, 2, 3, 4, 5], name='idx') # if target has the same name, it is preserved idx2 = Index([3, 4, 5, 6, 7], name='idx') @@ -671,6 +797,12 @@ def test_union(self): union = first.union(second) self.assertTrue(tm.equalContents(union, everything)) + # GH 10149 + cases = [klass(second.values) for klass in [np.array, Series, list]] + for case in cases: + result = first.union(case) + self.assertTrue(tm.equalContents(result, everything)) + # Corner cases union = first.union(first) self.assertIs(union, first) @@ -681,9 +813,6 @@ def test_union(self): union = Index([]).union(first) self.assertIs(union, first) - # non-iterable input - assertRaisesRegexp(TypeError, "iterable", first.union, 0.5) - # preserve names first.name = 'A' second.name = 'A' @@ -792,11 +921,7 @@ def test_difference(self): self.assertEqual(len(result), 0) self.assertEqual(result.name, first.name) - # non-iterable input - assertRaisesRegexp(TypeError, "iterable", first.difference, 0.5) - def test_symmetric_diff(self): - # smoke idx1 = Index([1, 2, 3, 4], name='idx1') idx2 = Index([2, 3, 4, 5]) @@ -842,10 +967,6 @@ def test_symmetric_diff(self): self.assertTrue(tm.equalContents(result, expected)) self.assertEqual(result.name, 'new_name') - # other isn't iterable - with tm.assertRaises(TypeError): - Index(idx1,dtype='object').difference(1) - def test_is_numeric(self): self.assertFalse(self.dateIndex.is_numeric()) self.assertFalse(self.strIndex.is_numeric()) @@ -1786,6 +1907,7 @@ def test_equals(self): self.assertFalse(CategoricalIndex(list('aabca') + [np.nan],categories=['c','a','b',np.nan]).equals(list('aabca'))) self.assertTrue(CategoricalIndex(list('aabca') + [np.nan],categories=['c','a','b',np.nan]).equals(list('aabca') + [np.nan])) + class Numeric(Base): def test_numeric_compat(self): @@ -1858,6 +1980,25 @@ def test_ufunc_compat(self): expected = Float64Index(np.sin(np.arange(5,dtype='int64'))) tm.assert_index_equal(result, expected) + def test_index_groupby(self): + int_idx = Index(range(6)) + float_idx = Index(np.arange(0, 0.6, 0.1)) + obj_idx = Index('A B C D E F'.split()) + dt_idx = pd.date_range('2013-01-01', freq='M', periods=6) + + for idx in [int_idx, float_idx, obj_idx, dt_idx]: + to_groupby = np.array([1, 2, np.nan, np.nan, 2, 1]) + self.assertEqual(idx.groupby(to_groupby), + {1.0: 
[idx[0], idx[5]], 2.0: [idx[1], idx[4]]}) + + to_groupby = Index([datetime(2011, 11, 1), datetime(2011, 12, 1), + pd.NaT, pd.NaT, + datetime(2011, 12, 1), datetime(2011, 11, 1)], tz='UTC').values + + ex_keys = pd.tslib.datetime_to_datetime64(np.array([Timestamp('2011-11-01'), Timestamp('2011-12-01')])) + expected = {ex_keys[0][0]: [idx[0], idx[5]], ex_keys[0][1]: [idx[1], idx[4]]} + self.assertEqual(idx.groupby(to_groupby), expected) + class TestFloat64Index(Numeric, tm.TestCase): _holder = Float64Index @@ -2642,6 +2783,36 @@ def test_time_overflow_for_32bit_machines(self): idx2 = pd.date_range(end='2000', periods=periods, freq='S') self.assertEqual(len(idx2), periods) + def test_intersection(self): + first = self.index + second = self.index[5:] + intersect = first.intersection(second) + self.assertTrue(tm.equalContents(intersect, second)) + + # GH 10149 + cases = [klass(second.values) for klass in [np.array, Series, list]] + for case in cases: + result = first.intersection(case) + self.assertTrue(tm.equalContents(result, second)) + + third = Index(['a', 'b', 'c']) + result = first.intersection(third) + expected = pd.Index([], dtype=object) + self.assert_index_equal(result, expected) + + def test_union(self): + first = self.index[:5] + second = self.index[5:] + everything = self.index + union = first.union(second) + self.assertTrue(tm.equalContents(union, everything)) + + # GH 10149 + cases = [klass(second.values) for klass in [np.array, Series, list]] + for case in cases: + result = first.union(case) + self.assertTrue(tm.equalContents(result, everything)) + class TestPeriodIndex(DatetimeLike, tm.TestCase): _holder = PeriodIndex @@ -2652,7 +2823,7 @@ def setUp(self): self.setup_indices() def create_index(self): - return period_range('20130101',periods=5,freq='D') + return period_range('20130101', periods=5, freq='D') def test_pickle_compat_construction(self): pass diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index 2a605cba8a6c0..1adb8a5d9217c 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -5,7 +5,7 @@ import numpy as np -from pandas.core.common import isnull +from pandas.core.common import isnull, is_integer_dtype import pandas.core.nanops as nanops import pandas.util.testing as tm @@ -323,6 +323,32 @@ def test_nanmean(self): allow_complex=False, allow_obj=False, allow_str=False, allow_date=False, allow_tdelta=True) + def test_nanmean_overflow(self): + # GH 10155 + # In the previous implementation mean can overflow for int dtypes, it + # is now consistent with numpy + from pandas import Series + + # numpy < 1.9.0 is not computing this correctly + from distutils.version import LooseVersion + if LooseVersion(np.__version__) >= '1.9.0': + for a in [2 ** 55, -2 ** 55, 20150515061816532]: + s = Series(a, index=range(500), dtype=np.int64) + result = s.mean() + np_result = s.values.mean() + self.assertEqual(result, a) + self.assertEqual(result, np_result) + self.assertTrue(result.dtype == np.float64) + + # check returned dtype + for dtype in [np.int16, np.int32, np.int64, np.float16, np.float32, np.float64]: + s = Series(range(10), dtype=dtype) + result = s.mean() + if is_integer_dtype(dtype): + self.assertTrue(result.dtype == np.float64) + else: + self.assertTrue(result.dtype == dtype) + def test_nanmedian(self): self.check_funs(nanops.nanmedian, np.median, allow_complex=False, allow_str=False, allow_date=False, diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index 236bdc8a98ff4..57fd465993e14 100644 --- 
a/pandas/tests/test_panel.py
+++ b/pandas/tests/test_panel.py
@@ -404,6 +404,8 @@ def test_abs(self):
         expected = np.abs(s)
         assert_series_equal(result, expected)
         assert_series_equal(result2, expected)
+        self.assertEqual(result.name, 'A')
+        self.assertEqual(result2.name, 'A')
 
 class CheckIndexing(object):
@@ -962,6 +964,12 @@ def _check_dtype(panel, dtype):
             panel = Panel(np.random.randn(2,10,5),items=lrange(2),major_axis=lrange(10),minor_axis=lrange(5),dtype=dtype)
             _check_dtype(panel,dtype)
 
+        for dtype in ['float64', 'float32', 'int64', 'int32', 'object']:
+            df1 = DataFrame(np.random.randn(2, 5), index=lrange(2), columns=lrange(5))
+            df2 = DataFrame(np.random.randn(2, 5), index=lrange(2), columns=lrange(5))
+            panel = Panel.from_dict({'a': df1, 'b': df2}, dtype=dtype)
+            _check_dtype(panel, dtype)
+
     def test_constructor_fails_with_not_3d_input(self):
         with tm.assertRaisesRegexp(ValueError,
                                    "The number of dimensions required is 3"):
diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py
index 22f8aee1e0a4e..eb583f17f3ace 100644
--- a/pandas/tests/test_series.py
+++ b/pandas/tests/test_series.py
@@ -264,10 +264,11 @@ def test_tab_completion(self):
         self.assertTrue('dt' not in dir(s))
 
     def test_binop_maybe_preserve_name(self):
-
         # names match, preserve
         result = self.ts * self.ts
         self.assertEqual(result.name, self.ts.name)
+        result = self.ts.mul(self.ts)
+        self.assertEqual(result.name, self.ts.name)
 
         result = self.ts * self.ts[:-2]
         self.assertEqual(result.name, self.ts.name)
@@ -277,6 +278,22 @@ def test_binop_maybe_preserve_name(self):
         cp.name = 'something else'
         result = self.ts + cp
         self.assertIsNone(result.name)
+        result = self.ts.add(cp)
+        self.assertIsNone(result.name)
+
+        ops = ['add', 'sub', 'mul', 'div', 'truediv', 'floordiv', 'mod', 'pow']
+        ops = ops + ['r' + op for op in ops]
+        for op in ops:
+            # names match, preserve
+            s = self.ts.copy()
+            result = getattr(s, op)(s)
+            self.assertEqual(result.name, self.ts.name)
+
+            # names don't match, don't preserve
+            cp = self.ts.copy()
+            cp.name = 'changed'
+            result = getattr(s, op)(cp)
+            self.assertIsNone(result.name)
 
     def test_combine_first_name(self):
         result = self.ts.combine_first(self.ts[:5])
@@ -2299,7 +2316,7 @@ def test_iteritems(self):
         self.assertFalse(hasattr(self.series.iteritems(), 'reverse'))
 
     def test_sum(self):
-        self._check_stat_op('sum', np.sum)
+        self._check_stat_op('sum', np.sum, check_allna=True)
 
     def test_sum_inf(self):
         import pandas.core.nanops as nanops
@@ -2612,7 +2629,7 @@ def test_npdiff(self):
         r = np.diff(s)
         assert_series_equal(Series([nan, 0, 0, 0, nan]), r)
 
-    def _check_stat_op(self, name, alternate, check_objects=False):
+    def _check_stat_op(self, name, alternate, check_objects=False, check_allna=False):
         import pandas.core.nanops as nanops
 
         def testit():
@@ -2636,7 +2653,17 @@ def testit():
             assert_almost_equal(f(self.series), alternate(nona.values))
 
             allna = self.series * nan
-            self.assertTrue(np.isnan(f(allna)))
+
+            if check_allna:
+                # xref 9422
+                # bottleneck >= 1.0 gives 0.0 for an allna Series sum
+                try:
+                    self.assertTrue(nanops._USE_BOTTLENECK)
+                    import bottleneck as bn
+                    self.assertTrue(bn.__version__ >= LooseVersion('1.0'))
+                    self.assertEqual(f(allna),0.0)
+                except:
+                    self.assertTrue(np.isnan(f(allna)))
 
             # dtype=object with None, it works!
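The `check_allna` branch above deliberately makes the all-NaN `sum` expectation environment-dependent (xref 9422): bottleneck >= 1.0 returns 0.0 for an all-NaN reduction, while the plain numpy path keeps returning NaN; the object-dtype check then follows below. An illustration, where the printed value depends on what is installed:

    import numpy as np
    import pandas as pd

    allna = pd.Series([1.0, 2.0, 3.0]) * np.nan
    total = allna.sum()
    # 0.0 if pandas dispatched to bottleneck >= 1.0, nan otherwise
    print(total)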
s = Series([1, 2, 3, None, 5]) @@ -5398,7 +5425,8 @@ def test_getitem_setitem_datetime_tz_pytz(self): def test_getitem_setitem_datetime_tz_dateutil(self): tm._skip_if_no_dateutil(); from dateutil.tz import tzutc - from dateutil.zoneinfo import gettz + from pandas.tslib import _dateutil_gettz as gettz + tz = lambda x: tzutc() if x == 'UTC' else gettz(x) # handle special case for utc in dateutil from pandas import date_range @@ -5931,6 +5959,10 @@ def _check_align(a, b, how='left', fill=None): assert_series_equal(aa, ea) assert_series_equal(ab, eb) + self.assertEqual(aa.name, 'ts') + self.assertEqual(ea.name, 'ts') + self.assertEqual(ab.name, 'ts') + self.assertEqual(eb.name, 'ts') for kind in JOIN_TYPES: _check_align(self.ts[2:], self.ts[:-5], how=kind) @@ -5938,12 +5970,15 @@ def _check_align(a, b, how='left', fill=None): # empty left _check_align(self.ts[:0], self.ts[:-5], how=kind) + _check_align(self.ts[:0], self.ts[:-5], how=kind, fill=-1) # empty right _check_align(self.ts[:-5], self.ts[:0], how=kind) + _check_align(self.ts[:-5], self.ts[:0], how=kind, fill=-1) # both empty _check_align(self.ts[:0], self.ts[:0], how=kind) + _check_align(self.ts[:0], self.ts[:0], how=kind, fill=-1) def test_align_fill_method(self): def _check_align(a, b, how='left', method='pad', limit=None): diff --git a/pandas/tests/test_tseries.py b/pandas/tests/test_tseries.py index 1b796ed2d83d1..035b3ac07342d 100644 --- a/pandas/tests/test_tseries.py +++ b/pandas/tests/test_tseries.py @@ -9,6 +9,8 @@ import pandas.lib as lib import pandas._period as period import pandas.algos as algos +from pandas.tseries.holiday import Holiday, SA, next_monday +from pandas import DateOffset class TestTseriesUtil(tm.TestCase): @@ -737,6 +739,17 @@ def test_get_period_field_raises_on_out_of_range(self): def test_get_period_field_array_raises_on_out_of_range(self): self.assertRaises(ValueError, period.get_period_field_arr, -1, np.empty(1), 0) + +class TestHolidayConflictingArguments(tm.TestCase): + + # GH 10217 + + def test_both_offset_observance_raises(self): + + with self.assertRaises(NotImplementedError) as cm: + h = Holiday("Cyber Monday", month=11, day=1, + offset=[DateOffset(weekday=SA(4))], observance=next_monday) + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index f92f398d9be94..76685e2589012 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -810,7 +810,7 @@ def __init__(self, data, kind=None, by=None, subplots=False, sharex=None, self.rot = self._default_rot if grid is None: - grid = False if secondary_y else True + grid = False if secondary_y else self.plt.rcParams['axes.grid'] self.grid = grid self.legend = legend @@ -999,7 +999,7 @@ def _compute_plot_data(self): data = self.data if isinstance(data, Series): - label = self.kwds.pop('label', None) + label = self.label if label is None and data.name is None: label = 'None' data = data.to_frame(name=label) diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py index f3a7aa0bfa4c6..88b4117d4807c 100644 --- a/pandas/tseries/base.py +++ b/pandas/tseries/base.py @@ -3,7 +3,7 @@ """ import warnings -from datetime import datetime, time, timedelta +from datetime import datetime, timedelta from pandas import compat import numpy as np @@ -13,11 +13,9 @@ import pandas.lib as lib from pandas.core.index import Index from pandas.util.decorators import Appender, cache_readonly -from pandas.tseries.frequencies import ( - 
infer_freq, to_offset, get_period_alias, - Resolution) +from pandas.tseries.frequencies import infer_freq, to_offset, Resolution import pandas.algos as _algos -from pandas.core.config import get_option + class DatetimeIndexOpsMixin(object): """ common ops mixin to support a unified inteface datetimelike Index """ diff --git a/pandas/tseries/common.py b/pandas/tseries/common.py index 8e468a7701462..c273906ef3d05 100644 --- a/pandas/tseries/common.py +++ b/pandas/tseries/common.py @@ -6,7 +6,7 @@ from pandas.tseries.index import DatetimeIndex from pandas.tseries.period import PeriodIndex from pandas.tseries.tdi import TimedeltaIndex -from pandas import lib, tslib +from pandas import tslib from pandas.core.common import (_NS_DTYPE, _TD_DTYPE, is_period_arraylike, is_datetime_arraylike, is_integer_dtype, is_list_like, get_dtype_kinds) diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index d0d71c63183fa..4af8c68110978 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -935,7 +935,9 @@ def _get_wom_rule(self): return None week_of_months = unique((self.index.day - 1) // 7) - if len(week_of_months) > 1: + # Only attempt to infer up to WOM-4. See #9425 + week_of_months = week_of_months[week_of_months < 4] + if len(week_of_months) == 0 or len(week_of_months) > 1: return None # get which week diff --git a/pandas/tseries/holiday.py b/pandas/tseries/holiday.py index 799be98a329fa..f55569302ca05 100644 --- a/pandas/tseries/holiday.py +++ b/pandas/tseries/holiday.py @@ -148,6 +148,9 @@ class from pandas.tseries.offsets >>> July3rd = Holiday('July 3rd', month=7, day=3, days_of_week=(0, 1, 2, 3)) """ + if offset is not None and observance is not None: + raise NotImplementedError("Cannot use both offset and observance.") + self.name = name self.year = year self.month = month diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index f56b40a70d551..745c536914e47 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -1,13 +1,8 @@ # pylint: disable=E1101 import operator - from datetime import time, datetime from datetime import timedelta - import numpy as np - -import warnings - from pandas.core.common import (_NS_DTYPE, _INT64_DTYPE, _values_from_object, _maybe_box, ABCSeries, is_integer, is_float) @@ -658,14 +653,18 @@ def _sub_datelike(self, other): def _add_delta(self, delta): from pandas import TimedeltaIndex + name = self.name + if isinstance(delta, (Tick, timedelta, np.timedelta64)): new_values = self._add_delta_td(delta) elif isinstance(delta, TimedeltaIndex): new_values = self._add_delta_tdi(delta) + # update name when delta is Index + name = com._maybe_match_name(self, delta) else: new_values = self.astype('O') + delta tz = 'UTC' if self.tz is not None else None - result = DatetimeIndex(new_values, tz=tz, freq='infer') + result = DatetimeIndex(new_values, tz=tz, name=name, freq='infer') utc = _utc() if self.tz is not None and self.tz is not utc: result = result.tz_convert(self.tz) @@ -805,6 +804,7 @@ def union(self, other): ------- y : Index or DatetimeIndex """ + self._assert_can_do_setop(other) if not isinstance(other, DatetimeIndex): try: other = DatetimeIndex(other) @@ -1040,6 +1040,7 @@ def intersection(self, other): ------- y : Index or DatetimeIndex """ + self._assert_can_do_setop(other) if not isinstance(other, DatetimeIndex): try: other = DatetimeIndex(other) diff --git a/pandas/tseries/interval.py b/pandas/tseries/interval.py index 104e088ee4e84..bcce64c3a71bf 100644 --- a/pandas/tseries/interval.py +++ 
b/pandas/tseries/interval.py @@ -1,4 +1,3 @@ -import numpy as np from pandas.core.index import Index diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index 8b7dc90738bd0..6627047f0c335 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -1,10 +1,6 @@ # pylint: disable=E1101,E1103,W0232 -import operator - -from datetime import datetime, date, timedelta +from datetime import datetime, timedelta import numpy as np -from pandas.core.base import PandasObject - import pandas.tseries.frequencies as frequencies from pandas.tseries.frequencies import get_freq_code as _gfc from pandas.tseries.index import DatetimeIndex, Int64Index, Index @@ -114,20 +110,20 @@ class PeriodIndex(DatetimeIndexOpsMixin, Int64Index): Parameters ---------- - data : array-like (1-dimensional), optional + data : array-like (1-dimensional), optional Optional period-like data to construct index with dtype : NumPy dtype (default: i8) - copy : bool + copy : bool Make a copy of input ndarray freq : string or period object, optional One of pandas period strings or corresponding objects start : starting value, period-like, optional If data is None, used as the start point in generating regular period data. - periods : int, optional, > 0 + periods : int, optional, > 0 Number of periods to generate, if generating index. Takes precedence over end argument - end : end value, period-like, optional + end : end value, period-like, optional If periods is none, generated index will extend to first conforming period on or just past end argument year : int, array, or Series, default None @@ -505,7 +501,6 @@ def shift(self, n): ---------- n : int Periods to shift by - freq : freq string Returns ------- @@ -684,6 +679,8 @@ def join(self, other, how='left', level=None, return_indexers=False): return self._apply_meta(result) def _assert_can_do_setop(self, other): + super(PeriodIndex, self)._assert_can_do_setop(other) + if not isinstance(other, PeriodIndex): raise ValueError('can only call with other PeriodIndex-ed objects') @@ -974,8 +971,8 @@ def period_range(start=None, end=None, periods=None, freq='D', name=None): Parameters ---------- - start : - end : + start : starting value, period-like, optional + end : ending value, period-like, optional periods : int, default None Number of periods in the index freq : str/DateOffset, default 'D' diff --git a/pandas/tseries/plotting.py b/pandas/tseries/plotting.py index 899d2bfdc9c76..9d28fa11f646f 100644 --- a/pandas/tseries/plotting.py +++ b/pandas/tseries/plotting.py @@ -5,17 +5,13 @@ #!!! 
TODO: Use the fact that axis can have units to simplify the process from matplotlib import pylab - -import numpy as np - -from pandas import isnull from pandas.tseries.period import Period from pandas.tseries.offsets import DateOffset import pandas.tseries.frequencies as frequencies from pandas.tseries.index import DatetimeIndex import pandas.core.common as com -from pandas.tseries.converter import (PeriodConverter, TimeSeries_DateLocator, +from pandas.tseries.converter import (TimeSeries_DateLocator, TimeSeries_DateFormatter) #---------------------------------------------------------------------- diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py index 942dea84f501a..53c1292204f71 100644 --- a/pandas/tseries/resample.py +++ b/pandas/tseries/resample.py @@ -1,14 +1,11 @@ from datetime import timedelta - import numpy as np - from pandas.core.groupby import BinGrouper, Grouper from pandas.tseries.frequencies import to_offset, is_subperiod, is_superperiod from pandas.tseries.index import DatetimeIndex, date_range from pandas.tseries.tdi import TimedeltaIndex from pandas.tseries.offsets import DateOffset, Tick, Day, _delta_to_nanoseconds from pandas.tseries.period import PeriodIndex, period_range -import pandas.tseries.tools as tools import pandas.core.common as com import pandas.compat as compat diff --git a/pandas/tseries/tdi.py b/pandas/tseries/tdi.py index 80475fc8426db..de68dd763d68c 100644 --- a/pandas/tseries/tdi.py +++ b/pandas/tseries/tdi.py @@ -1,17 +1,13 @@ """ implement the TimedeltaIndex """ -import operator -import datetime from datetime import timedelta import numpy as np - from pandas.core.common import (ABCSeries, _TD_DTYPE, _INT64_DTYPE, is_timedelta64_dtype, _maybe_box, _values_from_object, isnull, is_integer, is_float) from pandas.core.index import Index, Int64Index import pandas.compat as compat from pandas.compat import u -from pandas.core.base import PandasObject from pandas.util.decorators import cache_readonly from pandas.tseries.frequencies import to_offset import pandas.core.common as com @@ -285,12 +281,15 @@ def __setstate__(self, state): def _add_delta(self, delta): if isinstance(delta, (Tick, timedelta, np.timedelta64)): new_values = self._add_delta_td(delta) + name = self.name elif isinstance(delta, TimedeltaIndex): new_values = self._add_delta_tdi(delta) + # update name when delta is index + name = com._maybe_match_name(self, delta) else: raise ValueError("cannot add the type {0} to a TimedeltaIndex".format(type(delta))) - result = TimedeltaIndex(new_values, freq='infer') + result = TimedeltaIndex(new_values, freq='infer', name=name) return result def _evaluate_with_timedelta_like(self, other, op, opstr): @@ -437,12 +436,12 @@ def union(self, other): ------- y : Index or TimedeltaIndex """ - if _is_convertible_to_index(other): + self._assert_can_do_setop(other) + if not isinstance(other, TimedeltaIndex): try: other = TimedeltaIndex(other) - except TypeError: + except (TypeError, ValueError): pass - this, other = self, other if this._can_fast_union(other): @@ -582,6 +581,7 @@ def intersection(self, other): ------- y : Index or TimedeltaIndex """ + self._assert_can_do_setop(other) if not isinstance(other, TimedeltaIndex): try: other = TimedeltaIndex(other) diff --git a/pandas/tseries/tests/test_base.py b/pandas/tseries/tests/test_base.py index d1b986e7a7a1c..55482401a20f4 100644 --- a/pandas/tseries/tests/test_base.py +++ b/pandas/tseries/tests/test_base.py @@ -634,27 +634,27 @@ def test_dti_dti_deprecated_ops(self): def 
test_dti_tdi_numeric_ops(self): # These are normally union/diff set-like ops - tdi = TimedeltaIndex(['1 days',pd.NaT,'2 days'], name='foo') - dti = date_range('20130101',periods=3, name='bar') + tdi = TimedeltaIndex(['1 days', pd.NaT, '2 days'], name='foo') + dti = date_range('20130101', periods=3, name='bar') td = Timedelta('1 days') dt = Timestamp('20130101') result = tdi - tdi expected = TimedeltaIndex(['0 days', pd.NaT, '0 days'], name='foo') - tm.assert_index_equal(result, expected, check_names=False) # must be foo + tm.assert_index_equal(result, expected) result = tdi + tdi expected = TimedeltaIndex(['2 days', pd.NaT, '4 days'], name='foo') - tm.assert_index_equal(result, expected, check_names=False) # must be foo + tm.assert_index_equal(result, expected) - result = dti - tdi + result = dti - tdi # name will be reset expected = DatetimeIndex(['20121231', pd.NaT, '20130101']) tm.assert_index_equal(result, expected) def test_addition_ops(self): # with datetimes/timedelta and tdi/dti - tdi = TimedeltaIndex(['1 days',pd.NaT,'2 days'], name='foo') + tdi = TimedeltaIndex(['1 days', pd.NaT, '2 days'], name='foo') dti = date_range('20130101', periods=3, name='bar') td = Timedelta('1 days') dt = Timestamp('20130101') @@ -669,11 +669,11 @@ def test_addition_ops(self): result = td + tdi expected = TimedeltaIndex(['2 days', pd.NaT, '3 days'], name='foo') - tm.assert_index_equal(result, expected, check_names=False) # must be foo + tm.assert_index_equal(result, expected) result = tdi + td expected = TimedeltaIndex(['2 days', pd.NaT, '3 days'], name='foo') - tm.assert_index_equal(result,expected, check_names=False) # must be foo + tm.assert_index_equal(result, expected) # unequal length self.assertRaises(ValueError, lambda : tdi + dti[0:1]) @@ -685,21 +685,21 @@ def test_addition_ops(self): # this is a union! 
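
These tests can now assert result names directly -- the `check_names=False` escape hatches above became unnecessary because `TimedeltaIndex._add_delta`, patched earlier in this diff, derives the result name via `com._maybe_match_name` when the other operand is an index. A rough sketch of that rule; the standalone helper name and the sample indexes here are illustrative, not pandas API:

```python
import pandas as pd

def maybe_match_name(left, right):
    # Approximation of pandas.core.common._maybe_match_name:
    # a binary op keeps a name only when both operands carry the same one.
    left_name = getattr(left, 'name', None)
    right_name = getattr(right, 'name', None)
    return left_name if left_name == right_name else None

foo = pd.TimedeltaIndex(['1 days', '2 days'], name='foo')
bar = pd.TimedeltaIndex(['3 days', '4 days'], name='bar')

print((foo + foo).name)  # 'foo' -- matching names survive the op
print((foo + bar).name)  # None  -- mismatched names are dropped
```
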
#self.assertRaises(TypeError, lambda : Int64Index([1,2,3]) + tdi) - result = tdi + dti + result = tdi + dti # name will be reset expected = DatetimeIndex(['20130102', pd.NaT, '20130105']) - tm.assert_index_equal(result,expected) + tm.assert_index_equal(result, expected) - result = dti + tdi - expected = DatetimeIndex(['20130102',pd.NaT,'20130105']) - tm.assert_index_equal(result,expected) + result = dti + tdi # name will be reset + expected = DatetimeIndex(['20130102', pd.NaT, '20130105']) + tm.assert_index_equal(result, expected) result = dt + td expected = Timestamp('20130102') - self.assertEqual(result,expected) + self.assertEqual(result, expected) result = td + dt expected = Timestamp('20130102') - self.assertEqual(result,expected) + self.assertEqual(result, expected) def test_value_counts_unique(self): # GH 7735 diff --git a/pandas/tseries/tests/test_daterange.py b/pandas/tseries/tests/test_daterange.py index 841d81c15b4e9..69b1d84670d45 100644 --- a/pandas/tseries/tests/test_daterange.py +++ b/pandas/tseries/tests/test_daterange.py @@ -441,7 +441,7 @@ def test_month_range_union_tz_pytz(self): def test_month_range_union_tz_dateutil(self): _skip_if_windows_python_3() tm._skip_if_no_dateutil() - from dateutil.zoneinfo import gettz as timezone + from pandas.tslib import _dateutil_gettz as timezone tz = timezone('US/Eastern') early_start = datetime(2011, 1, 1) diff --git a/pandas/tseries/tests/test_frequencies.py b/pandas/tseries/tests/test_frequencies.py index 2f2d249539b81..823c762c692e5 100644 --- a/pandas/tseries/tests/test_frequencies.py +++ b/pandas/tseries/tests/test_frequencies.py @@ -212,6 +212,16 @@ def test_week_of_month(self): for i in range(1, 5): self._check_generated_range('1/1/2000', 'WOM-%d%s' % (i, day)) + def test_fifth_week_of_month(self): + # Only supports freq up to WOM-4. See #9425 + func = lambda: date_range('2014-01-01', freq='WOM-5MON') + self.assertRaises(ValueError, func) + + def test_fifth_week_of_month_infer(self): + # Only attempts to infer up to WOM-4. 
See #9425 + index = DatetimeIndex(["2014-03-31", "2014-06-30", "2015-03-30"]) + assert frequencies.infer_freq(index) is None + def test_week_of_month_fake(self): #All of these dates are on same day of week and are 4 or 5 weeks apart index = DatetimeIndex(["2013-08-27","2013-10-01","2013-10-29","2013-11-26"]) diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index 70c706fc66398..0218af63ca7d6 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -101,15 +101,15 @@ def test_timestamp_tz_arg(self): pytz.timezone('Europe/Brussels').normalize(p).tzinfo) def test_timestamp_tz_arg_dateutil(self): - import dateutil + from pandas.tslib import _dateutil_gettz as gettz from pandas.tslib import maybe_get_tz p = Period('1/1/2005', freq='M').to_timestamp(tz=maybe_get_tz('dateutil/Europe/Brussels')) - self.assertEqual(p.tz, dateutil.zoneinfo.gettz('Europe/Brussels')) + self.assertEqual(p.tz, gettz('Europe/Brussels')) def test_timestamp_tz_arg_dateutil_from_string(self): - import dateutil + from pandas.tslib import _dateutil_gettz as gettz p = Period('1/1/2005', freq='M').to_timestamp(tz='dateutil/Europe/Brussels') - self.assertEqual(p.tz, dateutil.zoneinfo.gettz('Europe/Brussels')) + self.assertEqual(p.tz, gettz('Europe/Brussels')) def test_timestamp_nat_tz(self): t = Period('NaT', freq='M').to_timestamp() diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py index 45145eb7ab7e8..948a0be91b276 100644 --- a/pandas/tseries/tests/test_timedeltas.py +++ b/pandas/tseries/tests/test_timedeltas.py @@ -23,6 +23,8 @@ import pandas.util.testing as tm from numpy.random import rand, randn from pandas import _np_version_under1p8 +import pandas.compat as compat + iNaT = tslib.iNaT @@ -309,51 +311,70 @@ class Other: def test_fields(self): + def check(value): + # that we are int/long like + self.assertTrue(isinstance(value, (int, compat.long))) + # compat to datetime.timedelta rng = to_timedelta('1 days, 10:11:12') - self.assertEqual(rng.days,1) - self.assertEqual(rng.seconds,10*3600+11*60+12) - self.assertEqual(rng.microseconds,0) - self.assertEqual(rng.nanoseconds,0) + self.assertEqual(rng.days, 1) + self.assertEqual(rng.seconds, 10*3600+11*60+12) + self.assertEqual(rng.microseconds, 0) + self.assertEqual(rng.nanoseconds, 0) self.assertRaises(AttributeError, lambda : rng.hours) self.assertRaises(AttributeError, lambda : rng.minutes) self.assertRaises(AttributeError, lambda : rng.milliseconds) + # GH 10050 + check(rng.days) + check(rng.seconds) + check(rng.microseconds) + check(rng.nanoseconds) + td = Timedelta('-1 days, 10:11:12') - self.assertEqual(abs(td),Timedelta('13:48:48')) + self.assertEqual(abs(td), Timedelta('13:48:48')) self.assertTrue(str(td) == "-1 days +10:11:12") - self.assertEqual(-td,Timedelta('0 days 13:48:48')) - self.assertEqual(-Timedelta('-1 days, 10:11:12').value,49728000000000) - self.assertEqual(Timedelta('-1 days, 10:11:12').value,-49728000000000) + self.assertEqual(-td, Timedelta('0 days 13:48:48')) + self.assertEqual(-Timedelta('-1 days, 10:11:12').value, 49728000000000) + self.assertEqual(Timedelta('-1 days, 10:11:12').value, -49728000000000) rng = to_timedelta('-1 days, 10:11:12.100123456') - self.assertEqual(rng.days,-1) - self.assertEqual(rng.seconds,10*3600+11*60+12) - self.assertEqual(rng.microseconds,100*1000+123) - self.assertEqual(rng.nanoseconds,456) + self.assertEqual(rng.days, -1) + self.assertEqual(rng.seconds, 10*3600+11*60+12) + 
self.assertEqual(rng.microseconds, 100*1000+123) + self.assertEqual(rng.nanoseconds, 456) self.assertRaises(AttributeError, lambda : rng.hours) self.assertRaises(AttributeError, lambda : rng.minutes) self.assertRaises(AttributeError, lambda : rng.milliseconds) # components tup = pd.to_timedelta(-1, 'us').components - self.assertEqual(tup.days,-1) - self.assertEqual(tup.hours,23) - self.assertEqual(tup.minutes,59) - self.assertEqual(tup.seconds,59) - self.assertEqual(tup.milliseconds,999) - self.assertEqual(tup.microseconds,999) - self.assertEqual(tup.nanoseconds,0) + self.assertEqual(tup.days, -1) + self.assertEqual(tup.hours, 23) + self.assertEqual(tup.minutes, 59) + self.assertEqual(tup.seconds, 59) + self.assertEqual(tup.milliseconds, 999) + self.assertEqual(tup.microseconds, 999) + self.assertEqual(tup.nanoseconds, 0) + + # GH 10050 + check(tup.days) + check(tup.hours) + check(tup.minutes) + check(tup.seconds) + check(tup.milliseconds) + check(tup.microseconds) + check(tup.nanoseconds) tup = Timedelta('-1 days 1 us').components - self.assertEqual(tup.days,-2) - self.assertEqual(tup.hours,23) - self.assertEqual(tup.minutes,59) - self.assertEqual(tup.seconds,59) - self.assertEqual(tup.milliseconds,999) - self.assertEqual(tup.microseconds,999) - self.assertEqual(tup.nanoseconds,0) + self.assertEqual(tup.days, -2) + self.assertEqual(tup.hours, 23) + self.assertEqual(tup.minutes, 59) + self.assertEqual(tup.seconds, 59) + self.assertEqual(tup.milliseconds, 999) + self.assertEqual(tup.microseconds, 999) + self.assertEqual(tup.nanoseconds, 0) def test_timedelta_range(self): diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 0c4961d80a5f4..8412ba8d4aad1 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -417,9 +417,9 @@ def test_timestamp_to_datetime_explicit_pytz(self): def test_timestamp_to_datetime_explicit_dateutil(self): _skip_if_windows_python_3() tm._skip_if_no_dateutil() - import dateutil + from pandas.tslib import _dateutil_gettz as gettz rng = date_range('20090415', '20090519', - tz=dateutil.zoneinfo.gettz('US/Eastern')) + tz=gettz('US/Eastern')) stamp = rng[0] dtval = stamp.to_pydatetime() @@ -791,7 +791,7 @@ def test_series_repr_nat(self): series = Series([0, 1000, 2000, iNaT], dtype='M8[ns]') result = repr(series) - expected = ('0 1970-01-01 00:00:00\n' + expected = ('0 1970-01-01 00:00:00.000000\n' '1 1970-01-01 00:00:00.000001\n' '2 1970-01-01 00:00:00.000002\n' '3 NaT\n' @@ -1807,7 +1807,7 @@ def test_append_concat_tz_explicit_pytz(self): def test_append_concat_tz_dateutil(self): # GH 2938 tm._skip_if_no_dateutil() - from dateutil.zoneinfo import gettz as timezone + from pandas.tslib import _dateutil_gettz as timezone rng = date_range('5/8/2012 1:45', periods=10, freq='5T', tz='dateutil/US/Eastern') diff --git a/pandas/tseries/tests/test_tslib.py b/pandas/tseries/tests/test_tslib.py index e452ddee9d8db..341450f504e2a 100644 --- a/pandas/tseries/tests/test_tslib.py +++ b/pandas/tseries/tests/test_tslib.py @@ -14,6 +14,8 @@ import pandas.tseries.offsets as offsets import pandas.util.testing as tm from pandas.util.testing import assert_series_equal +import pandas.compat as compat + class TestTimestamp(tm.TestCase): @@ -369,6 +371,50 @@ def test_today(self): self.assertTrue(abs(ts_from_string_tz.tz_localize(None) - ts_from_method_tz.tz_localize(None)) < delta) + def test_fields(self): + + def check(value, equal): + # that we are int/long like + self.assertTrue(isinstance(value, (int, 
compat.long))) + self.assertEqual(value, equal) + + # GH 10050 + ts = Timestamp('2015-05-10 09:06:03.000100001') + check(ts.year, 2015) + check(ts.month, 5) + check(ts.day, 10) + check(ts.hour, 9) + check(ts.minute, 6) + check(ts.second, 3) + self.assertRaises(AttributeError, lambda : ts.millisecond) + check(ts.microsecond, 100) + check(ts.nanosecond, 1) + check(ts.dayofweek, 6) + check(ts.quarter, 2) + check(ts.dayofyear, 130) + check(ts.week, 19) + check(ts.daysinmonth, 31) + check(ts.daysinmonth, 31) + + def test_nat_fields(self): + # GH 10050 + ts = Timestamp('NaT') + self.assertTrue(np.isnan(ts.year)) + self.assertTrue(np.isnan(ts.month)) + self.assertTrue(np.isnan(ts.day)) + self.assertTrue(np.isnan(ts.hour)) + self.assertTrue(np.isnan(ts.minute)) + self.assertTrue(np.isnan(ts.second)) + self.assertTrue(np.isnan(ts.microsecond)) + self.assertTrue(np.isnan(ts.nanosecond)) + self.assertTrue(np.isnan(ts.dayofweek)) + self.assertTrue(np.isnan(ts.quarter)) + self.assertTrue(np.isnan(ts.dayofyear)) + self.assertTrue(np.isnan(ts.week)) + self.assertTrue(np.isnan(ts.daysinmonth)) + self.assertTrue(np.isnan(ts.days_in_month)) + + class TestDatetimeParsingWrappers(tm.TestCase): def test_does_not_convert_mixed_integer(self): bad_date_strings = ( diff --git a/pandas/tseries/timedeltas.py b/pandas/tseries/timedeltas.py index 5b353058f0093..624981c5536f5 100644 --- a/pandas/tseries/timedeltas.py +++ b/pandas/tseries/timedeltas.py @@ -3,14 +3,12 @@ """ import re -from datetime import timedelta - import numpy as np import pandas.tslib as tslib from pandas import compat -from pandas.core.common import (ABCSeries, is_integer, is_integer_dtype, - is_timedelta64_dtype, _values_from_object, - is_list_like, isnull, _ensure_object) +from pandas.core.common import (ABCSeries, is_integer_dtype, + is_timedelta64_dtype, is_list_like, + isnull, _ensure_object) def to_timedelta(arg, unit='ns', box=True, coerce=False): """ diff --git a/pandas/tseries/util.py b/pandas/tseries/util.py index 72b12ea495ba0..6c534de0a7aaa 100644 --- a/pandas/tseries/util.py +++ b/pandas/tseries/util.py @@ -1,8 +1,5 @@ from pandas.compat import range, lrange import numpy as np - -import pandas as pd - import pandas.core.common as com from pandas.core.frame import DataFrame import pandas.core.nanops as nanops diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index 40dbbd7584c7a..8fda9bb31061e 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -5,6 +5,7 @@ from numpy cimport (int8_t, int32_t, int64_t, import_array, ndarray, NPY_INT64, NPY_DATETIME, NPY_TIMEDELTA) import numpy as np +from cpython.ref cimport PyObject from cpython cimport ( PyTypeObject, PyFloat_Check, @@ -12,13 +13,14 @@ from cpython cimport ( PyObject_RichCompareBool, PyObject_RichCompare, PyString_Check, - Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE + Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE, ) # Cython < 0.17 doesn't have this in cpython cdef extern from "Python.h": cdef PyTypeObject *Py_TYPE(object) int PySlice_Check(object) + object PyUnicode_FromFormat(const char*, ...) 
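
The `test_fields`/`test_nat_fields` additions above pin down GH 10050: `Timestamp` field accessors return plain Python integers -- see the `int(out[0])` change to `_get_field` in `tslib.pyx` just below -- and each listed field on `NaT` returns `np.nan`. A short usage sketch of the behavior these tests expect, assuming a build with this patch applied:

```python
import numpy as np
import pandas as pd

ts = pd.Timestamp('2015-05-10 09:06:03.000100001')
print(isinstance(ts.dayofyear, int))  # True: a plain int (long on Python 2),
                                      # not a NumPy integer scalar
print(ts.daysinmonth)                 # 31 -- May has 31 days

nat = pd.Timestamp('NaT')             # constructing from 'NaT' yields pd.NaT
print(np.isnan(nat.quarter))          # True: every listed NaT field is nan
```
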
cdef extern from "datetime_helper.h": double total_seconds(object) @@ -41,7 +43,11 @@ from datetime import time as datetime_time # dateutil compat from dateutil.tz import (tzoffset, tzlocal as _dateutil_tzlocal, tzfile as _dateutil_tzfile, tzutc as _dateutil_tzutc) -from dateutil.zoneinfo import gettz as _dateutil_gettz +from pandas.compat import is_platform_windows +if is_platform_windows(): + from dateutil.zoneinfo import gettz as _dateutil_gettz +else: + from dateutil.tz import gettz as _dateutil_gettz from pytz.tzinfo import BaseTzInfo as _pytz_BaseTzInfo from pandas.compat import parse_date, string_types, PY3, iteritems @@ -627,7 +633,7 @@ class NaTType(_NaT): fields = ['year', 'quarter', 'month', 'day', 'hour', 'minute', 'second', 'millisecond', 'microsecond', 'nanosecond', - 'week', 'dayofyear', 'days_in_month'] + 'week', 'dayofyear', 'days_in_month', 'daysinmonth', 'dayofweek'] for field in fields: prop = property(fget=lambda self: np.nan) setattr(NaTType, field, prop) @@ -952,7 +958,7 @@ cdef class _Timestamp(datetime): cpdef _get_field(self, field): out = get_date_field(np.array([self.value], dtype=np.int64), field) - return out[0] + return int(out[0]) cpdef _get_start_end_field(self, field): month_kw = self.freq.kwds.get('startingMonth', self.freq.kwds.get('month', 12)) if self.freq else 12 @@ -1414,6 +1420,8 @@ def format_array_from_datetime(ndarray[int64_t] values, object tz=None, object f """ cdef: int64_t val, ns, N = len(values) + ndarray[int64_t] consider_values + bint show_ms = 0, show_us = 0, show_ns = 0, basic_format = 0 ndarray[object] result = np.empty(N, dtype=object) object ts, res pandas_datetimestruct dts @@ -1421,43 +1429,82 @@ def format_array_from_datetime(ndarray[int64_t] values, object tz=None, object f if na_rep is None: na_rep = 'NaT' + # if we don't have a format nor tz, then choose + # a format based on precision + basic_format = format is None and tz is None + if basic_format: + consider_values = values[values != iNaT] + show_ns = (consider_values%1000).any() + + if not show_ns: + consider_values //= 1000 + show_us = (consider_values%1000).any() + + if not show_ms: + consider_values //= 1000 + show_ms = (consider_values%1000).any() + for i in range(N): - val = values[i] + val = values[i] - if val == iNaT: - result[i] = na_rep - else: - if format is None and tz is None: + if val == iNaT: + result[i] = na_rep + elif basic_format: pandas_datetime_to_datetimestruct(val, PANDAS_FR_ns, &dts) - res = '%d-%.2d-%.2d %.2d:%.2d:%.2d' % (dts.year, - dts.month, - dts.day, - dts.hour, - dts.min, - dts.sec) + if show_ns: + ns = dts.ps / 1000 + res = PyUnicode_FromFormat('%d-%02d-%02d %02d:%02d:%02d.%09d', + dts.year, + dts.month, + dts.day, + dts.hour, + dts.min, + dts.sec, + ns + 1000 * dts.us) + elif show_us: + res = PyUnicode_FromFormat('%d-%02d-%02d %02d:%02d:%02d.%06d', + dts.year, + dts.month, + dts.day, + dts.hour, + dts.min, + dts.sec, + dts.us) + + elif show_ms: + res = PyUnicode_FromFormat('%d-%02d-%02d %02d:%02d:%02d.%03d', + dts.year, + dts.month, + dts.day, + dts.hour, + dts.min, + dts.sec, + dts.us/1000) + else: + res = PyUnicode_FromFormat('%d-%02d-%02d %02d:%02d:%02d', + dts.year, + dts.month, + dts.day, + dts.hour, + dts.min, + dts.sec) - ns = dts.ps / 1000 + result[i] = res - if ns != 0: - res += '.%.9d' % (ns + 1000 * dts.us) - elif dts.us != 0: - res += '.%.6d' % dts.us + else: - result[i] = res + ts = Timestamp(val, tz=tz) + if format is None: + result[i] = str(ts) + else: - else: - ts = Timestamp(val, tz=tz) - if format is None: - result[i] = 
str(ts) - else: - - # invalid format string - # requires dates > 1900 - try: - result[i] = ts.strftime(format) - except ValueError: - result[i] = str(ts) + # invalid format string + # requires dates > 1900 + try: + result[i] = ts.strftime(format) + except ValueError: + result[i] = str(ts) return result diff --git a/pandas/util/decorators.py b/pandas/util/decorators.py index d839437a6fe33..9cd538511e946 100644 --- a/pandas/util/decorators.py +++ b/pandas/util/decorators.py @@ -26,7 +26,7 @@ def deprecate_kwarg(old_arg_name, new_arg_name, mapping=None): Name of prefered argument in function mapping : dict or callable If mapping is present, use it to translate old arguments to - new arguments. A callable must do its own value checking; + new arguments. A callable must do its own value checking; values not found in a dict will be forwarded unchanged. Examples @@ -45,7 +45,7 @@ def deprecate_kwarg(old_arg_name, new_arg_name, mapping=None): should raise warning >>> f(cols='should error', columns="can't pass do both") TypeError: Can only specify 'cols' or 'columns', not both - >>> @deprecate_kwarg('old', 'new', {'yes': True, 'no', False}) + >>> @deprecate_kwarg('old', 'new', {'yes': True, 'no': False}) ... def f(new=False): ... print('yes!' if new else 'no!') ... diff --git a/setup.py b/setup.py index edd3e398c27be..4375aa550f020 100755 --- a/setup.py +++ b/setup.py @@ -195,7 +195,7 @@ def build_extensions(self): MAJOR = 0 MINOR = 16 MICRO = 1 -ISRELEASED = True +ISRELEASED = False VERSION = '%d.%d.%d' % (MAJOR, MINOR, MICRO) QUALIFIER = '' diff --git a/vb_suite/timeseries.py b/vb_suite/timeseries.py index f0c3961ae0277..57fb1ada78691 100644 --- a/vb_suite/timeseries.py +++ b/vb_suite/timeseries.py @@ -135,6 +135,16 @@ def date_range(start=None, end=None, periods=None, freq=None): Benchmark("ts.resample('D', how='mean')", setup, start_date=datetime(2012, 4, 25)) +# GH 7754 +setup = common_setup + """ +rng = date_range(start='2000-01-01 00:00:00', + end='2000-01-01 10:00:00', freq='555000U') +int_ts = Series(5, rng, dtype='int64') +ts = int_ts.astype('datetime64[ns]') +""" + +timeseries_resample_datetime64 = Benchmark("ts.resample('1S', how='last')", setup) + #---------------------------------------------------------------------- # to_datetime
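
A closing note on the largest hunk above: the rewritten `format_array_from_datetime` scans the whole i8 array once and picks a single display precision for every element, which is why `test_series_repr_nat` now expects `1970-01-01 00:00:00.000000` -- one value carries 1000 ns, so the entire column renders at microsecond precision. Setting aside the always-true `show_ms` guard, the selection logic reduces to the following pure-Python sketch; the function name is mine, and `iNaT` is defined to match pandas' int64 NaT sentinel:

```python
import numpy as np

iNaT = np.iinfo(np.int64).min  # pandas' NaT sentinel in int64 nanosecond arrays

def choose_precision(values_ns):
    """Sketch: pick one fractional precision for a whole i8 datetime array."""
    vals = np.asarray(values_ns, dtype=np.int64)
    vals = vals[vals != iNaT]    # NaT entries never influence the precision
    if (vals % 1000).any():      # sub-microsecond remainder anywhere?
        return 'ns'              # -> nine fractional digits (%09d)
    vals //= 1000
    if (vals % 1000).any():      # sub-millisecond remainder anywhere?
        return 'us'              # -> six fractional digits (%06d)
    vals //= 1000
    if (vals % 1000).any():      # sub-second remainder anywhere?
        return 'ms'              # -> three fractional digits (%03d)
    return 's'                   # whole seconds, no fractional part

print(choose_precision([0, 1000, 2000, iNaT]))  # 'us', as in test_series_repr_nat
print(choose_precision([0, 10**9, 2 * 10**9]))  # 's'
```
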