Merge remote-tracking branch 'upstream/master' into feature/flake8-rst
# Conflicts:
#	ci/code_checks.sh
FHaase committed Nov 4, 2018
2 parents c01e22d + 047242b commit 403cc95
Showing 120 changed files with 2,083 additions and 1,125 deletions.
1 change: 0 additions & 1 deletion ci/azure-windows-36.yaml
@@ -7,7 +7,6 @@ dependencies:
- bottleneck
- boost-cpp<1.67
- fastparquet
- feather-format
- matplotlib
- numexpr
- numpy=1.14*
2 changes: 1 addition & 1 deletion ci/azure/windows-py27.yml
@@ -37,7 +37,7 @@ jobs:
displayName: 'Build'
- script: |
call activate %CONDA_ENV%
pytest --junitxml=test-data.xml --skip-slow --skip-network pandas -n 2 -r sxX --strict %*
pytest --junitxml=test-data.xml --skip-slow --skip-network pandas -n 2 -r sxX --strict --durations=10 %*
displayName: 'Test'
- task: PublishTestResults@2
inputs:
2 changes: 1 addition & 1 deletion ci/azure/windows.yml
@@ -28,7 +28,7 @@ jobs:
displayName: 'Build'
- script: |
call activate %CONDA_ENV%
pytest --junitxml=test-data.xml --skip-slow --skip-network pandas -n 2 -r sxX --strict %*
pytest --junitxml=test-data.xml --skip-slow --skip-network pandas -n 2 -r sxX --strict --durations=10 %*
displayName: 'Test'
- task: PublishTestResults@2
inputs:
4 changes: 2 additions & 2 deletions ci/circle/run_circle.sh
@@ -5,5 +5,5 @@ export PATH="$MINICONDA_DIR/bin:$PATH"

source activate pandas

echo "pytest --strict --junitxml=$CIRCLE_TEST_REPORTS/reports/junit.xml $@ pandas"
pytest --strict --color=no --junitxml=$CIRCLE_TEST_REPORTS/reports/junit.xml $@ pandas
echo "pytest --strict --durations=10 --color=no --junitxml=$CIRCLE_TEST_REPORTS/reports/junit.xml $@ pandas"
pytest --strict --durations=10 --color=no --junitxml=$CIRCLE_TEST_REPORTS/reports/junit.xml $@ pandas
12 changes: 8 additions & 4 deletions ci/code_checks.sh
@@ -47,12 +47,16 @@ if [[ -z "$CHECK" || "$CHECK" == "lint" ]]; then
echo "flake8-rst --version"
flake8-rst --version

MSG='Linting code-blocks in .py docstrings' ; echo $MSG
flake8-rst pandas
MSG='Linting code-blocks in .rst documentation' ; echo $MSG
flake8-rst doc/source --filename=*.rst
RET=$(($RET + $?)) ; echo $MSG "DONE"

MSG='Linting code-blocks in .rst documentation' ; echo $MSG
flake8-rst doc --filename=*.rst
# Check that cython casting is of the form `<type>obj` as opposed to `<type> obj`;
# it doesn't make a difference, but we want to be internally consistent.
# Note: this grep pattern is (intended to be) equivalent to the python
# regex r'(?<![ ->])> '
MSG='Linting .pyx code for spacing conventions in casting' ; echo $MSG
! grep -r -E --include '*.pyx' --include '*.pxi.in' '> ' pandas/_libs | grep -v '[ ->]> '
RET=$(($RET + $?)) ; echo $MSG "DONE"

# readability/casting: Warnings about C casting instead of C++ casting
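As a quick illustration of the casting convention that check enforces, here is a minimal Python sketch; the sample lines below are made up rather than taken from the pandas sources:

    import re

    # Intended Python equivalent of the grep / grep -v pipeline above:
    # flag casts written as "<type> obj", but ignore "->" arrows and
    # spaced comparisons such as "a > b".
    pattern = re.compile(r'(?<![ ->])> ')

    samples = [
        "out[i] = <float64_t> value",   # flagged: space between cast and object
        "out[i] = <float64_t>value",    # clean: the preferred spelling
        "x = a > b",                    # clean: '>' preceded by a space
        "def f(x) -> None: ...",        # clean: '>' preceded by '-'
    ]

    for line in samples:
        print("FLAG" if pattern.search(line) else "ok  ", line)
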
3 changes: 1 addition & 2 deletions ci/requirements-optional-conda.txt
@@ -2,7 +2,6 @@ beautifulsoup4>=4.2.1
blosc
bottleneck>=1.2.0
fastparquet
feather-format
gcsfs
html5lib
ipython>=5.6.0
@@ -13,7 +12,7 @@ matplotlib>=2.0.0
nbsphinx
numexpr>=2.6.1
openpyxl
pyarrow
pyarrow>=0.4.1
pymysql
pytables>=3.4.2
pytest-cov
5 changes: 2 additions & 3 deletions ci/requirements-optional-pip.txt
@@ -4,7 +4,6 @@ beautifulsoup4>=4.2.1
blosc
bottleneck>=1.2.0
fastparquet
feather-format
gcsfs
html5lib
ipython>=5.6.0
@@ -15,7 +14,7 @@ matplotlib>=2.0.0
nbsphinx
numexpr>=2.6.1
openpyxl
pyarrow
pyarrow>=0.4.1
pymysql
tables
pytest-cov
@@ -28,4 +27,4 @@ statsmodels
xarray
xlrd
xlsxwriter
xlwt
xlwt
12 changes: 6 additions & 6 deletions ci/script_multi.sh
@@ -27,17 +27,17 @@ if [ "$DOC" ]; then
echo "We are not running pytest as this is a doc-build"

elif [ "$COVERAGE" ]; then
echo pytest -s -n 2 -m "not single" --cov=pandas --cov-report xml:/tmp/cov-multiple.xml --junitxml=test-data-multiple.xml --strict $TEST_ARGS pandas
pytest -s -n 2 -m "not single" --cov=pandas --cov-report xml:/tmp/cov-multiple.xml --junitxml=test-data-multiple.xml --strict $TEST_ARGS pandas
echo pytest -s -n 2 -m "not single" --durations=10 --cov=pandas --cov-report xml:/tmp/cov-multiple.xml --junitxml=test-data-multiple.xml --strict $TEST_ARGS pandas
pytest -s -n 2 -m "not single" --durations=10 --cov=pandas --cov-report xml:/tmp/cov-multiple.xml --junitxml=test-data-multiple.xml --strict $TEST_ARGS pandas

elif [ "$SLOW" ]; then
TEST_ARGS="--only-slow --skip-network"
echo pytest -m "not single and slow" -v --junitxml=test-data-multiple.xml --strict $TEST_ARGS pandas
pytest -m "not single and slow" -v --junitxml=test-data-multiple.xml --strict $TEST_ARGS pandas
echo pytest -m "not single and slow" -v --durations=10 --junitxml=test-data-multiple.xml --strict $TEST_ARGS pandas
pytest -m "not single and slow" -v --durations=10 --junitxml=test-data-multiple.xml --strict $TEST_ARGS pandas

else
echo pytest -n 2 -m "not single" --junitxml=test-data-multiple.xml --strict $TEST_ARGS pandas
pytest -n 2 -m "not single" --junitxml=test-data-multiple.xml --strict $TEST_ARGS pandas # TODO: doctest
echo pytest -n 2 -m "not single" --durations=10 --junitxml=test-data-multiple.xml --strict $TEST_ARGS pandas
pytest -n 2 -m "not single" --durations=10 --junitxml=test-data-multiple.xml --strict $TEST_ARGS pandas # TODO: doctest

fi

8 changes: 4 additions & 4 deletions ci/script_single.sh
@@ -26,13 +26,13 @@ if [ "$DOC" ]; then
echo "We are not running pytest as this is a doc-build"

elif [ "$COVERAGE" ]; then
echo pytest -s -m "single" --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=test-data-single.xml $TEST_ARGS pandas
pytest -s -m "single" --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=test-data-single.xml $TEST_ARGS pandas
echo pytest -s -m "single" --durations=10 --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=test-data-single.xml $TEST_ARGS pandas
pytest -s -m "single" --durations=10 --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=test-data-single.xml $TEST_ARGS pandas
echo pytest -s --strict scripts
pytest -s --strict scripts
else
echo pytest -m "single" --junitxml=test-data-single.xml --strict $TEST_ARGS pandas
pytest -m "single" --junitxml=test-data-single.xml --strict $TEST_ARGS pandas
echo pytest -m "single" --durations=10 --junitxml=test-data-single.xml --strict $TEST_ARGS pandas
pytest -m "single" --durations=10 --junitxml=test-data-single.xml --strict $TEST_ARGS pandas

fi

1 change: 0 additions & 1 deletion ci/travis-27.yaml
@@ -7,7 +7,6 @@ dependencies:
- bottleneck
- cython=0.28.2
- fastparquet
- feather-format
- gcsfs
- html5lib
- ipython
2 changes: 1 addition & 1 deletion ci/travis-36-doc.yaml
@@ -8,7 +8,6 @@ dependencies:
- bottleneck
- cython>=0.28.2
- fastparquet
- feather-format
- html5lib
- hypothesis>=3.58.0
- ipykernel
@@ -24,6 +23,7 @@ dependencies:
- numpy=1.13*
- openpyxl
- pandoc
- pyarrow
- pyqt
- pytables
- python-dateutil
3 changes: 1 addition & 2 deletions ci/travis-36.yaml
@@ -7,7 +7,6 @@ dependencies:
- cython>=0.28.2
- dask
- fastparquet
- feather-format
- flake8>=3.5
- flake8-comprehensions
- flake8-rst
@@ -24,7 +23,7 @@ dependencies:
- numpy
- openpyxl
- psycopg2
- pyarrow
- pyarrow=0.9.0
- pymysql
- pytables
- python-snappy
1 change: 1 addition & 0 deletions ci/travis-37.yaml
@@ -9,6 +9,7 @@ dependencies:
- numpy
- python-dateutil
- nomkl
- pyarrow
- pytz
- pytest
- pytest-xdist
56 changes: 56 additions & 0 deletions doc/source/api.rst
@@ -2104,6 +2104,62 @@ Methods
Timedelta.to_timedelta64
Timedelta.total_seconds

.. _api.dateoffsets:

Date Offsets
------------

.. currentmodule:: pandas.tseries.offsets

.. autosummary::
:toctree: generated/

DateOffset
BusinessDay
BusinessHour
CustomBusinessDay
CustomBusinessHour
MonthOffset
MonthEnd
MonthBegin
BusinessMonthEnd
BusinessMonthBegin
CustomBusinessMonthEnd
CustomBusinessMonthBegin
SemiMonthOffset
SemiMonthEnd
SemiMonthBegin
Week
WeekOfMonth
LastWeekOfMonth
QuarterOffset
BQuarterEnd
BQuarterBegin
QuarterEnd
QuarterBegin
YearOffset
BYearEnd
BYearBegin
YearEnd
YearBegin
FY5253
FY5253Quarter
Easter
Tick
Day
Hour
Minute
Second
Milli
Micro
Nano
BDay
BMonthEnd
BMonthBegin
CBMonthEnd
CBMonthBegin
CDay
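
As a brief, hedged illustration of the offset arithmetic these classes provide (the dates below are arbitrary):

.. code-block:: python

   import pandas as pd
   from pandas.tseries.offsets import BDay, MonthEnd

   ts = pd.Timestamp('2018-11-02')                       # a Friday
   ts + BDay(1)                                          # next business day: Monday 2018-11-05
   ts + MonthEnd(1)                                      # roll forward to the month end, 2018-11-30
   pd.date_range('2018-01-01', periods=4, freq=BDay())   # four consecutive business days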

.. _api.frequencies:

Frequencies
6 changes: 3 additions & 3 deletions doc/source/conf.py
@@ -99,7 +99,7 @@
# JP: added from sphinxdocs
autosummary_generate = False

if any(re.match("\s*api\s*", l) for l in index_rst_lines):
if any(re.match(r"\s*api\s*", l) for l in index_rst_lines):
autosummary_generate = True

# numpydoc
@@ -341,8 +341,8 @@
# file, target name, title, author, documentclass [howto/manual]).
latex_documents = [
('index', 'pandas.tex',
u'pandas: powerful Python data analysis toolkit',
u'Wes McKinney\n\& PyData Development Team', 'manual'),
'pandas: powerful Python data analysis toolkit',
r'Wes McKinney\n\& PyData Development Team', 'manual'),
]

# The name of an image file (relative to this directory) to place at the top of
11 changes: 11 additions & 0 deletions doc/source/cookbook.rst
@@ -1226,6 +1226,17 @@ Computation
Correlation
***********

Often it's useful to obtain the lower (or upper) triangular form of a correlation matrix calculated from :func:`DataFrame.corr`. This can be achieved by passing a boolean mask to ``where`` as follows:

.. ipython:: python
df = pd.DataFrame(np.random.random(size=(100, 5)))
corr_mat = df.corr()
mask = np.tril(np.ones_like(corr_mat, dtype=np.bool), k=-1)
corr_mat.where(mask)
The `method` argument within `DataFrame.corr` can accept a callable in addition to the named correlation types. Here we compute the `distance correlation <https://en.wikipedia.org/wiki/Distance_correlation>`__ matrix for a `DataFrame` object.

.. code-block:: python

   ...
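
A minimal, hedged sketch of such a callable; the ``distcorr`` helper below is illustrative only and is not necessarily the cookbook's exact implementation:

.. code-block:: python

   import numpy as np
   import pandas as pd

   def distcorr(x, y):
       # Distance correlation between two 1-D arrays, using the
       # double-centred pairwise-distance formulation.
       x = np.asarray(x, dtype=float)
       y = np.asarray(y, dtype=float)
       a = np.abs(x[:, None] - x[None, :])
       b = np.abs(y[:, None] - y[None, :])
       A = a - a.mean(axis=0) - a.mean(axis=1)[:, None] + a.mean()
       B = b - b.mean(axis=0) - b.mean(axis=1)[:, None] + b.mean()
       dcov2_xy = (A * B).mean()
       dcov2_xx = (A * A).mean()
       dcov2_yy = (B * B).mean()
       return np.sqrt(dcov2_xy / np.sqrt(dcov2_xx * dcov2_yy))

   df = pd.DataFrame(np.random.normal(size=(100, 3)))
   df.corr(method=distcorr)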
10 changes: 10 additions & 0 deletions doc/source/groupby.rst
@@ -125,6 +125,16 @@ We could naturally group by either the ``A`` or ``B`` columns, or both:
grouped = df.groupby('A')
grouped = df.groupby(['A', 'B'])
.. versionadded:: 0.24

If we also have a MultiIndex on columns ``A`` and ``B``, we can group by all
but the specified columns

.. ipython:: python
df2 = df.set_index(['A', 'B'])
grouped = df2.groupby(level=df2.index.names.difference(['B']))
These will split the DataFrame on its index (rows). We could also split by the
columns:
2 changes: 1 addition & 1 deletion doc/source/install.rst
@@ -258,7 +258,7 @@ Optional Dependencies
* `SciPy <http://www.scipy.org>`__: miscellaneous statistical functions, Version 0.18.1 or higher
* `xarray <http://xarray.pydata.org>`__: pandas like handling for > 2 dims, needed for converting Panels to xarray objects. Version 0.7.0 or higher is recommended.
* `PyTables <http://www.pytables.org>`__: necessary for HDF5-based storage, Version 3.4.2 or higher
* `Feather Format <https://github.com/wesm/feather>`__: necessary for feather-based storage, version 0.3.1 or higher.
* `pyarrow <http://arrow.apache.org/docs/python/>`__ (>= 0.4.1): necessary for feather-based storage.
* `Apache Parquet <https://parquet.apache.org/>`__, either `pyarrow <http://arrow.apache.org/docs/python/>`__ (>= 0.4.1) or `fastparquet <https://fastparquet.readthedocs.io/en/latest>`__ (>= 0.0.6) for parquet-based storage. The `snappy <https://pypi.org/project/python-snappy>`__ and `brotli <https://pypi.org/project/brotlipy>`__ libraries are available for compression support.
* `SQLAlchemy <http://www.sqlalchemy.org>`__: for SQL database support. Version 0.8.1 or higher recommended. Besides SQLAlchemy, you also need a database specific driver. You can find an overview of supported drivers for each SQL dialect in the `SQLAlchemy docs <http://docs.sqlalchemy.org/en/latest/dialects/index.html>`__. Some common drivers are:

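To illustrate the pyarrow-backed feather support mentioned above, a short, hedged sketch (the file name here is arbitrary):

.. code-block:: python

   import pandas as pd

   df = pd.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})

   # Feather round-trip; pandas delegates the (de)serialization to pyarrow.
   df.to_feather('example.feather')
   pd.read_feather('example.feather')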