Merge pull request #10967 from chris-b1/excel-read-multiindex

ENH: read_excel MultiIndex #4679
pandas-dev · Sep 9, 2015 · 0e56279 · 0e56279
2 parents 2bddfcf + 98405f0
commit 0e56279
Show file tree

Hide file tree

Showing 12 changed files with 339 additions and 141 deletions.
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -1989,6 +1989,46 @@ advanced strategies
 Reading Excel Files
 '''''''''''''''''''
 
+.. versionadded:: 0.17
+
+``read_excel`` can read a ``MultiIndex`` index by passing a list of columns to ``index_col``,
+and ``MultiIndex`` columns by passing a list of rows to ``header``.  If either the ``index``
+or ``columns`` has serialized level names, those will be read in as well by specifying
+the rows/columns that make up the levels.
+
+.. ipython:: python
+
+   # MultiIndex index - no names
+   df = pd.DataFrame({'a':[1,2,3,4], 'b':[5,6,7,8]},
+                     index=pd.MultiIndex.from_product([['a','b'],['c','d']]))
+   df.to_excel('path_to_file.xlsx')
+   df = pd.read_excel('path_to_file.xlsx', index_col=[0,1])
+   df
+
+   # MultiIndex index - with names
+   df.index = df.index.set_names(['lvl1', 'lvl2'])
+   df.to_excel('path_to_file.xlsx')
+   df = pd.read_excel('path_to_file.xlsx', index_col=[0,1])
+   df
+
+   # MultiIndex index and column - with names
+   df.columns = pd.MultiIndex.from_product([['a'],['b', 'd']], names=['c1', 'c2'])
+   df.to_excel('path_to_file.xlsx')
+   df = pd.read_excel('path_to_file.xlsx',
+                       index_col=[0,1], header=[0,1])
+   df
+
+.. ipython:: python
+   :suppress:
+
+   import os
+   os.remove('path_to_file.xlsx')
+
+.. warning::
+
+   Excel files saved in version 0.16.2 or prior that had index names will still be able to be read in,
+   but the ``has_index_names`` argument must be set to ``True``.
+
 .. versionadded:: 0.16
 
 ``read_excel`` can read more than one sheet, by setting ``sheetname`` to either

diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
@@ -205,6 +205,53 @@ The support math functions are `sin`, `cos`, `exp`, `log`, `expm1`, `log1p`,
 These functions map to the intrinsics for the NumExpr engine.  For Python
 engine, they are mapped to NumPy calls.
 
+Changes to Excel with ``MultiIndex``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+In version 0.16.2 a ``DataFrame`` with ``MultiIndex`` columns could not be written to Excel via ``to_excel``.
+That functionality has been added (:issue:`10564`), along with updating  ``read_excel`` so that the data can
+be read back with no loss of information by specifying which columns/rows make up the ``MultiIndex``
+in the ``header`` and ``index_col`` parameters (:issue:`4679`).
+
+See the :ref:`documentation <io.excel>` for more details.
+
+.. ipython:: python
+
+   df = pd.DataFrame([[1,2,3,4], [5,6,7,8]],
+                     columns = pd.MultiIndex.from_product([['foo','bar'],['a','b']],
+                                                          names = ['col1', 'col2']),
+                     index = pd.MultiIndex.from_product([['j'], ['l', 'k']],
+                                                        names = ['i1', 'i2']))
+
+   df
+   df.to_excel('test.xlsx')
+
+   df = pd.read_excel('test.xlsx', header=[0,1], index_col=[0,1])
+   df
+
+.. ipython:: python
+   :suppress:
+
+   import os
+   os.remove('test.xlsx')
+
+Previously, it was necessary to specify the ``has_index_names`` argument in ``read_excel``
+if the serialized data had index names.  For version 0.17 the output format of ``to_excel``
+has been changed to make this keyword unnecessary - the change is shown below.
+
+**Old**
+
+.. image:: _static/old-excel-index.png
+
+**New**
+
+.. image:: _static/new-excel-index.png
+
+.. warning::
+
+   Excel files saved in version 0.16.2 or prior that had index names will still be able to be read in,
+   but the ``has_index_names`` argument must be set to ``True``.
+
+
 .. _whatsnew_0170.enhancements.other:
 
 Other enhancements
@@ -764,7 +811,6 @@ Changes to ``Categorical.unique``
    cat
    cat.unique()
 
-
 .. _whatsnew_0170.api_breaking.other:
 
 Other API Changes
@@ -774,7 +820,6 @@ Other API Changes
 - Calling the ``.value_counts`` method on a Series with ``categorical`` dtype now returns a Series with a ``CategoricalIndex`` (:issue:`10704`)
 - Allow passing `kwargs` to the interpolation methods (:issue:`10378`).
 - The metadata properties of subclasses of pandas objects will now be serialized (:issue:`10553`).
-- Allow ``DataFrame`` with ``MultiIndex`` columns to be written to Excel (:issue:`10564`). This was changed in 0.16.2 as the read-back method could not always guarantee perfect fidelity (:issue:`9794`).
 - ``groupby`` using ``Categorical`` follows the same rule as ``Categorical.unique`` described above  (:issue:`10508`)
 - Improved error message when concatenating an empty iterable of dataframes (:issue:`9157`)
 - When constructing ``DataFrame`` with an array of ``complex64`` dtype that meant the corresponding column was automatically promoted to the ``complex128`` dtype. Pandas will now preserve the itemsize of the input for complex data (:issue:`10952`)

diff --git a/pandas/core/format.py b/pandas/core/format.py
@@ -4,7 +4,6 @@
 # pylint: disable=W0141
 
 import sys
-import warnings
 
 from pandas.core.base import PandasObject
 from pandas.core.common import adjoin, notnull
@@ -1641,14 +1640,11 @@ class ExcelFormatter(object):
     inf_rep : string, default `'inf'`
         representation for np.inf values (which aren't representable in Excel)
         A `'-'` sign will be added in front of -inf.
-    verbose: boolean, default True
-        If True, warn user that the resulting output file may not be
-        re-read or parsed directly by pandas.
     """
 
     def __init__(self, df, na_rep='', float_format=None, cols=None,
                  header=True, index=True, index_label=None, merge_cells=False,
-                 inf_rep='inf', verbose=True):
+                 inf_rep='inf'):
         self.df = df
         self.rowcounter = 0
         self.na_rep = na_rep
@@ -1661,7 +1657,6 @@ def __init__(self, df, na_rep='', float_format=None, cols=None,
         self.header = header
         self.merge_cells = merge_cells
         self.inf_rep = inf_rep
-        self.verbose = verbose
 
     def _format_value(self, val):
         if lib.checknull(val):
@@ -1682,10 +1677,6 @@ def _format_header_mi(self):
                 raise NotImplementedError("Writing to Excel with MultiIndex"
                                           " columns and no index ('index'=False) "
                                           "is not yet implemented.")
-            elif self.index and self.verbose:
-                warnings.warn("Writing to Excel with MultiIndex columns is a"
-                              " one way serializable operation. You will not"
-                              " be able to re-read or parse the output file.")
 
         has_aliases = isinstance(self.header, (tuple, list, np.ndarray, Index))
         if not(has_aliases or self.header):
@@ -1796,18 +1787,14 @@ def _format_regular_rows(self):
             else:
                 index_label = self.df.index.names[0]
 
+            if isinstance(self.columns, MultiIndex):
+                self.rowcounter += 1
+
             if index_label and self.header is not False:
-                if self.merge_cells:
-                    yield ExcelCell(self.rowcounter,
-                                    0,
-                                    index_label,
-                                    header_style)
-                    self.rowcounter += 1
-                else:
-                    yield ExcelCell(self.rowcounter - 1,
-                                    0,
-                                    index_label,
-                                    header_style)
+                yield ExcelCell(self.rowcounter - 1,
+                                0,
+                                index_label,
+                                header_style)
 
             # write index_values
             index_values = self.df.index
@@ -1841,19 +1828,21 @@ def _format_hierarchical_rows(self):
                                                (list, tuple, np.ndarray, Index)):
                 index_labels = self.index_label
 
+            # MultiIndex columns require an extra row
+            # with index names (blank if None) for
+            # unambiguous round-trip
+            if isinstance(self.columns, MultiIndex):
+                self.rowcounter += 1
+
             # if index labels are not empty go ahead and dump
             if (any(x is not None for x in index_labels)
                     and self.header is not False):
 
-                if not self.merge_cells:
-                    self.rowcounter -= 1
-
                 for cidx, name in enumerate(index_labels):
-                    yield ExcelCell(self.rowcounter,
+                    yield ExcelCell(self.rowcounter - 1,
                                     cidx,
                                     name,
                                     header_style)
-                self.rowcounter += 1
 
             if self.merge_cells:
                 # Format hierarchical rows as merged cells.

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -1336,9 +1336,6 @@ def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='',
         inf_rep : string, default 'inf'
             Representation for infinity (there is no native representation for
             infinity in Excel)
-        verbose: boolean, default True
-             If True, warn user that the resulting output file may not be
-             re-read or parsed directly by pandas.
 
         Notes
         -----
@@ -1371,7 +1368,7 @@ def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='',
                                        index=index,
                                        index_label=index_label,
                                        merge_cells=merge_cells,
-                                       inf_rep=inf_rep, verbose=verbose)
+                                       inf_rep=inf_rep)
         formatted_cells = formatter.get_formatted_cells()
         excel_writer.write_cells(formatted_cells, sheet_name,
                                  startrow=startrow, startcol=startcol)