diff --git a/doc/source/io.rst b/doc/source/io.rst index 31d0be6151ba4..f3d14b78bbf54 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1989,6 +1989,46 @@ advanced strategies Reading Excel Files ''''''''''''''''''' +.. versionadded:: 0.17 + +``read_excel`` can read a ``MultiIndex`` index, by passing a list of columns to ``index_col`` +and a ``MultiIndex`` column by passing a list of rows to ``header``. If either the ``index`` +or ``columns`` have serialized level names those will be read in as well by specifying +the rows/columns that make up the levels. + +.. ipython:: python + + # MultiIndex index - no names + df = pd.DataFrame({'a':[1,2,3,4], 'b':[5,6,7,8]}, + index=pd.MultiIndex.from_product([['a','b'],['c','d']])) + df.to_excel('path_to_file.xlsx') + df = pd.read_excel('path_to_file.xlsx', index_col=[0,1]) + df + + # MultiIndex index - with names + df.index = df.index.set_names(['lvl1', 'lvl2']) + df.to_excel('path_to_file.xlsx') + df = pd.read_excel('path_to_file.xlsx', index_col=[0,1]) + df + + # MultiIndex index and column - with names + df.columns = pd.MultiIndex.from_product([['a'],['b', 'd']], names=['c1', 'c2']) + df.to_excel('path_to_file.xlsx') + df = pd.read_excel('path_to_file.xlsx', + index_col=[0,1], header=[0,1]) + df + +.. ipython:: python + :suppress: + + import os + os.remove('path_to_file.xlsx') + +.. warning:: + + Excel files saved in version 0.16.2 or prior that had index names will still be able to be read in, + but the ``has_index_names`` argument must be specified as ``True``. + .. versionadded:: 0.16 ``read_excel`` can read more than one sheet, by setting ``sheetname`` to either diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index 8eb7a3e9e8830..38c5593e5911a 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -205,6 +205,53 @@ The support math functions are `sin`, `cos`, `exp`, `log`, `expm1`, `log1p`, These functions map to the intrinsics for the NumExpr engine.
For Python engine, they are mapped to NumPy calls. +Changes to Excel with ``MultiIndex`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +In version 0.16.2 a ``DataFrame`` with ``MultiIndex`` columns could not be written to Excel via ``to_excel``. +That functionality has been added (:issue:`10564`), along with updating ``read_excel`` so that the data can +be read back with no loss of information by specifying which columns/rows make up the ``MultiIndex`` +in the ``header`` and ``index_col`` parameters (:issue:`4679`). + +See the :ref:`documentation <io.excel>` for more details. + +.. ipython:: python + + df = pd.DataFrame([[1,2,3,4], [5,6,7,8]], + columns = pd.MultiIndex.from_product([['foo','bar'],['a','b']], + names = ['col1', 'col2']), + index = pd.MultiIndex.from_product([['j'], ['l', 'k']], + names = ['i1', 'i2'])) + + df + df.to_excel('test.xlsx') + + df = pd.read_excel('test.xlsx', header=[0,1], index_col=[0,1]) + df + +.. ipython:: python + :suppress: + + import os + os.remove('test.xlsx') + +Previously, it was necessary to specify the ``has_index_names`` argument in ``read_excel`` +if the serialized data had index names. For version 0.17 the output format of ``to_excel`` +has been changed to make this keyword unnecessary - the change is shown below. + +**Old** + +.. image:: _static/old-excel-index.png + +**New** + +.. image:: _static/new-excel-index.png + +.. warning:: + + Excel files saved in version 0.16.2 or prior that had index names will still be able to be read in, + but the ``has_index_names`` argument must be specified as ``True``. + + .. _whatsnew_0170.enhancements.other: Other enhancements @@ -764,7 +811,6 @@ Changes to ``Categorical.unique`` cat cat.unique() - .. _whatsnew_0170.api_breaking.other: Other API Changes @@ -774,7 +820,6 @@ Other API Changes - Calling the ``.value_counts`` method on a Series with ``categorical`` dtype now returns a Series with a ``CategoricalIndex`` (:issue:`10704`) - Allow passing `kwargs` to the interpolation methods (:issue:`10378`).
- The metadata properties of subclasses of pandas objects will now be serialized (:issue:`10553`). -- Allow ``DataFrame`` with ``MultiIndex`` columns to be written to Excel (:issue:`10564`). This was changed in 0.16.2 as the read-back method could not always guarantee perfect fidelity (:issue:`9794`). - ``groupby`` using ``Categorical`` follows the same rule as ``Categorical.unique`` described above (:issue:`10508`) - Improved error message when concatenating an empty iterable of dataframes (:issue:`9157`) - When constructing ``DataFrame`` with an array of ``complex64`` dtype that meant the corresponding column was automatically promoted to the ``complex128`` dtype. Pandas will now preserve the itemsize of the input for complex data (:issue:`10952`) diff --git a/pandas/core/format.py b/pandas/core/format.py index 29f1e1efe9f5d..47d0ef37383c4 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -4,7 +4,6 @@ # pylint: disable=W0141 import sys -import warnings from pandas.core.base import PandasObject from pandas.core.common import adjoin, notnull @@ -1641,14 +1640,11 @@ class ExcelFormatter(object): inf_rep : string, default `'inf'` representation for np.inf values (which aren't representable in Excel) A `'-'` sign will be added in front of -inf. - verbose: boolean, default True - If True, warn user that the resulting output file may not be - re-read or parsed directly by pandas. 
""" def __init__(self, df, na_rep='', float_format=None, cols=None, header=True, index=True, index_label=None, merge_cells=False, - inf_rep='inf', verbose=True): + inf_rep='inf'): self.df = df self.rowcounter = 0 self.na_rep = na_rep @@ -1661,7 +1657,6 @@ def __init__(self, df, na_rep='', float_format=None, cols=None, self.header = header self.merge_cells = merge_cells self.inf_rep = inf_rep - self.verbose = verbose def _format_value(self, val): if lib.checknull(val): @@ -1682,10 +1677,6 @@ def _format_header_mi(self): raise NotImplementedError("Writing to Excel with MultiIndex" " columns and no index ('index'=False) " "is not yet implemented.") - elif self.index and self.verbose: - warnings.warn("Writing to Excel with MultiIndex columns is a" - " one way serializable operation. You will not" - " be able to re-read or parse the output file.") has_aliases = isinstance(self.header, (tuple, list, np.ndarray, Index)) if not(has_aliases or self.header): @@ -1796,18 +1787,14 @@ def _format_regular_rows(self): else: index_label = self.df.index.names[0] + if isinstance(self.columns, MultiIndex): + self.rowcounter += 1 + if index_label and self.header is not False: - if self.merge_cells: - yield ExcelCell(self.rowcounter, - 0, - index_label, - header_style) - self.rowcounter += 1 - else: - yield ExcelCell(self.rowcounter - 1, - 0, - index_label, - header_style) + yield ExcelCell(self.rowcounter - 1, + 0, + index_label, + header_style) # write index_values index_values = self.df.index @@ -1841,19 +1828,21 @@ def _format_hierarchical_rows(self): (list, tuple, np.ndarray, Index)): index_labels = self.index_label + # MultiIndex columns require an extra row + # with index names (blank if None) for + # unambigous round-trip + if isinstance(self.columns, MultiIndex): + self.rowcounter += 1 + # if index labels are not empty go ahead and dump if (any(x is not None for x in index_labels) and self.header is not False): - if not self.merge_cells: - self.rowcounter -= 1 - for cidx, name 
in enumerate(index_labels): - yield ExcelCell(self.rowcounter, + yield ExcelCell(self.rowcounter - 1, cidx, name, header_style) - self.rowcounter += 1 if self.merge_cells: # Format hierarchical rows as merged cells. diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e81aaebe77807..b4bb06fe83649 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1336,9 +1336,6 @@ def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='', inf_rep : string, default 'inf' Representation for infinity (there is no native representation for infinity in Excel) - verbose: boolean, default True - If True, warn user that the resulting output file may not be - re-read or parsed directly by pandas. Notes ----- @@ -1371,7 +1368,7 @@ def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='', index=index, index_label=index_label, merge_cells=merge_cells, - inf_rep=inf_rep, verbose=verbose) + inf_rep=inf_rep) formatted_cells = formatter.get_formatted_cells() excel_writer.write_cells(formatted_cells, sheet_name, startrow=startrow, startcol=startcol) diff --git a/pandas/io/excel.py b/pandas/io/excel.py index d5258cb32e6e0..b113cbf057f39 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -18,6 +18,7 @@ BytesIO, string_types) from pandas.core import config from pandas.core.common import pprint_thing +from pandas.util.decorators import Appender import pandas.compat as compat import pandas.compat.openpyxl_compat as openpyxl_compat import pandas.core.common as com @@ -68,15 +69,11 @@ def get_writer(engine_name): raise ValueError("No Excel writer '%s'" % engine_name) -def read_excel(io, sheetname=0, **kwds): - """Read an Excel table into a pandas DataFrame +excel_doc_common = """ + Read an Excel table into a pandas DataFrame Parameters - ---------- - io : string, file-like object, or xlrd workbook. - The string could be a URL. Valid URL schemes include http, ftp, s3, - and file. For file URLs, a host is expected. 
For instance, a local - file could be file://localhost/path/to/workbook.xlsx + ----------%(io)s sheetname : string, int, mixed list of strings/ints, or None, default 0 Strings are used for sheet names, Integers are used in zero-indexed sheet @@ -97,20 +94,23 @@ def read_excel(io, sheetname=0, **kwds): * [0,1,"Sheet5"] -> 1st, 2nd & 5th sheet as a dictionary of DataFrames * None -> All sheets as a dictionary of DataFrames - header : int, default 0 - Row to use for the column labels of the parsed DataFrame + header : int, list of ints, default 0 + Row (0-indexed) to use for the column labels of the parsed + DataFrame. If a list of integers is passed those row positions will + be combined into a ``MultiIndex`` skiprows : list-like Rows to skip at the beginning (0-indexed) skip_footer : int, default 0 Rows at the end to skip (0-indexed) + index_col : int, list of ints, default None + Column (0-indexed) to use as the row labels of the DataFrame. + Pass None if there is no such column. If a list is passed, + those columns will be combined into a ``MultiIndex`` converters : dict, default None Dict of functions for converting values in certain columns. Keys can either be integers or column labels, values are functions that take one input argument, the Excel cell content, and return the transformed content. - index_col : int, default None - Column to use as the row labels of the DataFrame. Pass None if - there is no such column parse_cols : int or list, default None * If None then parse all columns, * If int then indicates last column to be parsed @@ -119,22 +119,21 @@ def read_excel(io, sheetname=0, **kwds): column ranges (e.g. 
"A:E" or "A,C,E:F") na_values : list-like, default None List of additional strings to recognize as NA/NaN + thousands : str, default None + Thousands separator keep_default_na : bool, default True If na_values are specified and keep_default_na is False the default NaN values are overridden, otherwise they're appended to verbose : boolean, default False - Indicate number of NA values placed in non-numeric columns - engine: string, default None - If io is not a buffer or path, this must be set to identify io. - Acceptable values are None or xlrd + Indicate number of NA values placed in non-numeric columns%(eng)s convert_float : boolean, default True convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric data will be read in as floats: Excel stores all numbers as floats internally - has_index_names : boolean, default False - True if the cols defined in index_col have an index name and are - not in the header. Index name will be placed on a separate line below - the header. + has_index_names : boolean, default None + DEPCRECATED: for version 0.17+ index names will be automatically inferred + based on index_col. To read Excel output from 0.16.2 and prior that + had saved index names, use True. Returns ------- @@ -143,6 +142,19 @@ def read_excel(io, sheetname=0, **kwds): for more information on when a Dict of Dataframes is returned. """ +read_excel_kwargs = dict() +read_excel_kwargs['io'] = """ + io : string, file-like object, or xlrd workbook. + The string could be a URL. Valid URL schemes include http, ftp, s3, + and file. For file URLs, a host is expected. For instance, a local + file could be file://localhost/path/to/workbook.xlsx""" +read_excel_kwargs['eng'] = """ + engine: string, default None + If io is not a buffer or path, this must be set to identify io. 
+ Acceptable values are None or xlrd""" + +@Appender(excel_doc_common % read_excel_kwargs) +def read_excel(io, sheetname=0, **kwds): engine = kwds.pop('engine', None) return ExcelFile(io, engine=engine).parse(sheetname=sheetname, **kwds) @@ -193,83 +205,23 @@ def __init__(self, io, **kwds): raise ValueError('Must explicitly set engine if not passing in' ' buffer or path for io.') + @Appender(excel_doc_common % dict(io='', eng='')) def parse(self, sheetname=0, header=0, skiprows=None, skip_footer=0, index_col=None, parse_cols=None, parse_dates=False, date_parser=None, na_values=None, thousands=None, chunksize=None, - convert_float=True, has_index_names=False, converters=None, **kwds): - """Read an Excel table into DataFrame + convert_float=True, has_index_names=None, converters=None, **kwds): - Parameters - ---------- - sheetname : string, int, mixed list of strings/ints, or None, default 0 - - Strings are used for sheet names, Integers are used in zero-indexed sheet - positions. - - Lists of strings/integers are used to request multiple sheets. - - Specify None to get all sheets. - - str|int -> DataFrame is returned. - list|None -> Dict of DataFrames is returned, with keys representing sheets. - - Available Cases - - * Defaults to 0 -> 1st sheet as a DataFrame - * 1 -> 2nd sheet as a DataFrame - * "Sheet1" -> 1st sheet as a DataFrame - * [0,1,"Sheet5"] -> 1st, 2nd & 5th sheet as a dictionary of DataFrames - * None -> All sheets as a dictionary of DataFrames - header : int, default 0 - Row to use for the column labels of the parsed DataFrame - skiprows : list-like - Rows to skip at the beginning (0-indexed) - skip_footer : int, default 0 - Rows at the end to skip (0-indexed) - converters : dict, default None - Dict of functions for converting values in certain columns. Keys can - either be integers or column labels - index_col : int, default None - Column to use as the row labels of the DataFrame. 
Pass None if - there is no such column - parse_cols : int or list, default None - * If None then parse all columns - * If int then indicates last column to be parsed - * If list of ints then indicates list of column numbers to be - parsed - * If string then indicates comma separated list of column names and - column ranges (e.g. "A:E" or "A,C,E:F") - parse_dates : boolean, default False - Parse date Excel values, - date_parser : function default None - Date parsing function - na_values : list-like, default None - List of additional strings to recognize as NA/NaN - thousands : str, default None - Thousands separator - chunksize : int, default None - Size of file chunk to read for lazy evaluation. - convert_float : boolean, default True - convert integral floats to int (i.e., 1.0 --> 1). If False, all - numeric data will be read in as floats: Excel stores all numbers as - floats internally. - has_index_names : boolean, default False - True if the cols defined in index_col have an index name and are - not in the header - verbose : boolean, default False - Set to True to print a single statement when reading each - excel sheet. - - Returns - ------- - parsed : DataFrame or Dict of DataFrames - DataFrame from the passed in Excel file. See notes in sheetname argument - for more information on when a Dict of Dataframes is returned. 
- """ skipfooter = kwds.pop('skipfooter', None) if skipfooter is not None: skip_footer = skipfooter + if has_index_names is not None: + warn("\nThe has_index_names argument is deprecated; index names " + "will be automatically inferred based on index_col.\n" + "This argmument is still necessary if reading Excel output " + "from 0.16.2 or prior with index names.", FutureWarning, + stacklevel=3) + return self._parse_excel(sheetname=sheetname, header=header, skiprows=skiprows, index_col=index_col, @@ -418,8 +370,40 @@ def _parse_cell(cell_contents,cell_typ): if sheet.nrows == 0: return DataFrame() + if com.is_list_like(header) and len(header) == 1: + header = header[0] + + # forward fill and pull out names for MultiIndex column + header_names = None if header is not None: - data[header] = _trim_excel_header(data[header]) + if com.is_list_like(header): + header_names = [] + for row in header: + if com.is_integer(skiprows): + row += skiprows + data[row] = _fill_mi_header(data[row]) + header_name, data[row] = _pop_header_name(data[row], index_col) + header_names.append(header_name) + else: + data[header] = _trim_excel_header(data[header]) + + if com.is_list_like(index_col): + # forward fill values for MultiIndex index + if not com.is_list_like(header): + offset = 1 + header + else: + offset = 1 + max(header) + + for col in index_col: + last = data[offset][col] + for row in range(offset + 1, len(data)): + if data[row][col] == '' or data[row][col] is None: + data[row][col] = last + else: + last = data[row][col] + + if com.is_list_like(header) and len(header) > 1: + has_index_names = True parser = TextParser(data, header=header, index_col=index_col, has_index_names=has_index_names, @@ -433,6 +417,7 @@ def _parse_cell(cell_contents,cell_typ): **kwds) output[asheetname] = parser.read() + output[asheetname].columns = output[asheetname].columns.set_names(header_names) if ret_dict: return output @@ -463,6 +448,29 @@ def _trim_excel_header(row): row = row[1:] return row +def 
_fill_mi_header(row): + # forward fill blanks entries + # from headers if parsing as MultiIndex + last = row[0] + for i in range(1, len(row)): + if row[i] == '' or row[i] is None: + row[i] = last + else: + last = row[i] + return row + +# fill blank if index_col not None +def _pop_header_name(row, index_col): + """ (header, new_data) for header rows in MultiIndex parsing""" + none_fill = lambda x: None if x == '' else x + + if index_col is None: + # no index col specified, trim data for inference path + return none_fill(row[0]), row[1:] + else: + # pop out header name and fill w/ blank + i = index_col if not com.is_list_like(index_col) else max(index_col) + return none_fill(row[i]), row[:i] + [''] + row[i+1:] def _conv_value(val): # Convert numpy types to Python types for the Excel writers. diff --git a/pandas/io/tests/data/test_index_name_pre17.xls b/pandas/io/tests/data/test_index_name_pre17.xls new file mode 100644 index 0000000000000..2ab13105e7925 Binary files /dev/null and b/pandas/io/tests/data/test_index_name_pre17.xls differ diff --git a/pandas/io/tests/data/test_index_name_pre17.xlsm b/pandas/io/tests/data/test_index_name_pre17.xlsm new file mode 100644 index 0000000000000..33c0d7949531c Binary files /dev/null and b/pandas/io/tests/data/test_index_name_pre17.xlsm differ diff --git a/pandas/io/tests/data/test_index_name_pre17.xlsx b/pandas/io/tests/data/test_index_name_pre17.xlsx new file mode 100644 index 0000000000000..ce66c40cda141 Binary files /dev/null and b/pandas/io/tests/data/test_index_name_pre17.xlsx differ diff --git a/pandas/io/tests/data/testmultiindex.xls b/pandas/io/tests/data/testmultiindex.xls new file mode 100644 index 0000000000000..3664c5c8dedcc Binary files /dev/null and b/pandas/io/tests/data/testmultiindex.xls differ diff --git a/pandas/io/tests/data/testmultiindex.xlsm b/pandas/io/tests/data/testmultiindex.xlsm new file mode 100644 index 0000000000000..8f359782b57bb Binary files /dev/null and 
b/pandas/io/tests/data/testmultiindex.xlsm differ diff --git a/pandas/io/tests/data/testmultiindex.xlsx b/pandas/io/tests/data/testmultiindex.xlsx new file mode 100644 index 0000000000000..a70110caf1ec7 Binary files /dev/null and b/pandas/io/tests/data/testmultiindex.xlsx differ diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py index 073fc55357df7..0aee2af6ad166 100644 --- a/pandas/io/tests/test_excel.py +++ b/pandas/io/tests/test_excel.py @@ -14,6 +14,7 @@ import numpy as np from numpy.testing.decorators import slow +import pandas as pd from pandas import DataFrame, Index, MultiIndex from pandas.io.parsers import read_csv from pandas.io.excel import ( @@ -21,7 +22,7 @@ _Openpyxl2Writer, register_writer, _XlsxWriter ) from pandas.io.common import URLError -from pandas.util.testing import ensure_clean +from pandas.util.testing import ensure_clean, makeCustomDataframe as mkdf from pandas.core.config import set_option, get_option import pandas.util.testing as tm @@ -415,11 +416,8 @@ def test_read_xlrd_Book(self): @tm.network def test_read_from_http_url(self): - # TODO: remove this when merging into master - url = ('https://raw.github.com/davidovitch/pandas/master/' + url = ('https://raw.github.com/pydata/pandas/master/' 'pandas/io/tests/data/test1' + self.ext) -# url = ('https://raw.github.com/pydata/pandas/master/' -# 'pandas/io/tests/data/test' + self.ext) url_table = read_excel(url) local_table = self.get_exceldf('test1') tm.assert_frame_equal(url_table, local_table) @@ -518,6 +516,132 @@ def test_reader_seconds(self): actual = self.get_exceldf('times_1904', 'Sheet1') tm.assert_frame_equal(actual, expected) + def test_read_excel_multiindex(self): + #GH 4679 + mi = MultiIndex.from_product([['foo','bar'],['a','b']]) + mi_file = os.path.join(self.dirpath, 'testmultiindex' + self.ext) + + expected = DataFrame([[1, 2.5, pd.Timestamp('2015-01-01'), True], + [2, 3.5, pd.Timestamp('2015-01-02'), False], + [3, 4.5, pd.Timestamp('2015-01-03'), 
False], + [4, 5.5, pd.Timestamp('2015-01-04'), True]], + columns = mi) + + actual = read_excel(mi_file, 'mi_column', header=[0,1]) + tm.assert_frame_equal(actual, expected) + actual = read_excel(mi_file, 'mi_column', header=[0,1], index_col=0) + tm.assert_frame_equal(actual, expected) + + expected.columns = ['a', 'b', 'c', 'd'] + expected.index = mi + actual = read_excel(mi_file, 'mi_index', index_col=[0,1]) + tm.assert_frame_equal(actual, expected, check_names=False) + + expected.columns = mi + actual = read_excel(mi_file, 'both', index_col=[0,1], header=[0,1]) + tm.assert_frame_equal(actual, expected, check_names=False) + + expected.index = mi.set_names(['ilvl1', 'ilvl2']) + expected.columns = ['a', 'b', 'c', 'd'] + actual = read_excel(mi_file, 'mi_index_name', index_col=[0,1]) + tm.assert_frame_equal(actual, expected) + + expected.index = list(range(4)) + expected.columns = mi.set_names(['c1', 'c2']) + actual = read_excel(mi_file, 'mi_column_name', header=[0,1], index_col=0) + tm.assert_frame_equal(actual, expected) + + expected.index = mi.set_names(['ilvl1', 'ilvl2']) + actual = read_excel(mi_file, 'both_name', index_col=[0,1], header=[0,1]) + tm.assert_frame_equal(actual, expected) + + actual = read_excel(mi_file, 'both_name', index_col=[0,1], header=[0,1]) + tm.assert_frame_equal(actual, expected) + + actual = read_excel(mi_file, 'both_name_skiprows', index_col=[0,1], + header=[0,1], skiprows=2) + tm.assert_frame_equal(actual, expected) + + + def test_excel_multindex_roundtrip(self): + #GH 4679 + _skip_if_no_xlsxwriter() + with ensure_clean('.xlsx') as pth: + for c_idx_names in [True, False]: + for r_idx_names in [True, False]: + for c_idx_levels in [1, 3]: + for r_idx_levels in [1, 3]: + # column index name can't be serialized unless MultiIndex + if (c_idx_levels == 1 and c_idx_names): + continue + + # empty name case current read in as unamed levels, not Nones + check_names = True + if not r_idx_names and r_idx_levels > 1: + check_names = False + + df = 
mkdf(5, 5, c_idx_names, + r_idx_names, c_idx_levels, + r_idx_levels) + df.to_excel(pth) + act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), + header=list(range(c_idx_levels))) + tm.assert_frame_equal(df, act, check_names=check_names) + + df.iloc[0, :] = np.nan + df.to_excel(pth) + act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), + header=list(range(c_idx_levels))) + tm.assert_frame_equal(df, act, check_names=check_names) + + df.iloc[-1, :] = np.nan + df.to_excel(pth) + act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), + header=list(range(c_idx_levels))) + tm.assert_frame_equal(df, act, check_names=check_names) + + def test_excel_oldindex_format(self): + #GH 4679 + data = np.array([['R0C0', 'R0C1', 'R0C2', 'R0C3', 'R0C4'], + ['R1C0', 'R1C1', 'R1C2', 'R1C3', 'R1C4'], + ['R2C0', 'R2C1', 'R2C2', 'R2C3', 'R2C4'], + ['R3C0', 'R3C1', 'R3C2', 'R3C3', 'R3C4'], + ['R4C0', 'R4C1', 'R4C2', 'R4C3', 'R4C4']]) + columns = ['C_l0_g0', 'C_l0_g1', 'C_l0_g2', 'C_l0_g3', 'C_l0_g4'] + mi = MultiIndex(levels=[['R_l0_g0', 'R_l0_g1', 'R_l0_g2', 'R_l0_g3', 'R_l0_g4'], + ['R_l1_g0', 'R_l1_g1', 'R_l1_g2', 'R_l1_g3', 'R_l1_g4']], + labels=[[0, 1, 2, 3, 4], [0, 1, 2, 3, 4]], + names=['R0', 'R1']) + si = Index(['R_l0_g0', 'R_l0_g1', 'R_l0_g2', 'R_l0_g3', 'R_l0_g4'], name='R0') + + in_file = os.path.join(self.dirpath, 'test_index_name_pre17' + self.ext) + + expected = pd.DataFrame(data, index=si, columns=columns) + with tm.assert_produces_warning(FutureWarning): + actual = pd.read_excel(in_file, 'single_names', has_index_names=True) + tm.assert_frame_equal(actual, expected) + + expected.index.name = None + actual = pd.read_excel(in_file, 'single_no_names') + tm.assert_frame_equal(actual, expected) + with tm.assert_produces_warning(FutureWarning): + actual = pd.read_excel(in_file, 'single_no_names', has_index_names=False) + tm.assert_frame_equal(actual, expected) + + expected.index = mi + with tm.assert_produces_warning(FutureWarning): + actual = 
pd.read_excel(in_file, 'multi_names', has_index_names=True) + tm.assert_frame_equal(actual, expected) + + expected.index.names = [None, None] + actual = pd.read_excel(in_file, 'multi_no_names', index_col=[0,1]) + tm.assert_frame_equal(actual, expected, check_names=False) + with tm.assert_produces_warning(FutureWarning): + actual = pd.read_excel(in_file, 'multi_no_names', index_col=[0,1], + has_index_names=False) + tm.assert_frame_equal(actual, expected, check_names=False) + + class XlsReaderTests(XlrdTests, tm.TestCase): ext = '.xls' @@ -537,6 +661,8 @@ class XlsmReaderTests(XlrdTests, tm.TestCase): check_skip = staticmethod(_skip_if_no_xlrd) + + class ExcelWriterBase(SharedItems): # Base class for test cases to run with different Excel writers. # To add a writer test, define the following: @@ -781,7 +907,6 @@ def test_roundtrip_indexlabels(self): reader = ExcelFile(path) recons = reader.parse('test1', index_col=0, - has_index_names=self.merge_cells ).astype(np.int64) frame.index.names = ['test'] self.assertEqual(frame.index.names, recons.index.names) @@ -794,7 +919,6 @@ def test_roundtrip_indexlabels(self): reader = ExcelFile(path) recons = reader.parse('test1', index_col=0, - has_index_names=self.merge_cells ).astype(np.int64) frame.index.names = ['test'] self.assertEqual(frame.index.names, recons.index.names) @@ -807,7 +931,6 @@ def test_roundtrip_indexlabels(self): reader = ExcelFile(path) recons = reader.parse('test1', index_col=0, - has_index_names=self.merge_cells ).astype(np.int64) frame.index.names = ['test'] tm.assert_frame_equal(frame, recons.astype(bool)) @@ -837,8 +960,7 @@ def test_excel_roundtrip_indexname(self): xf = ExcelFile(path) result = xf.parse(xf.sheet_names[0], - index_col=0, - has_index_names=self.merge_cells) + index_col=0) tm.assert_frame_equal(result, df) self.assertEqual(result.index.name, 'foo') @@ -925,8 +1047,7 @@ def test_to_excel_multiindex(self): frame.to_excel(path, 'test1', merge_cells=self.merge_cells) reader = ExcelFile(path) 
df = reader.parse('test1', index_col=[0, 1], - parse_dates=False, - has_index_names=self.merge_cells) + parse_dates=False) tm.assert_frame_equal(frame, df) self.assertEqual(frame.index.names, df.index.names) @@ -943,8 +1064,7 @@ def test_to_excel_multiindex_dates(self): tsframe.to_excel(path, 'test1', merge_cells=self.merge_cells) reader = ExcelFile(path) recons = reader.parse('test1', - index_col=[0, 1], - has_index_names=self.merge_cells) + index_col=[0, 1]) tm.assert_frame_equal(tsframe, recons) self.assertEqual(recons.index.names, ('time', 'foo')) @@ -1475,15 +1595,14 @@ def test_excel_raise_error_on_multiindex_columns_and_no_index(self): with ensure_clean(self.ext) as path: df.to_excel(path, index=False) - def test_excel_warns_verbosely_on_multiindex_columns_and_index_true(self): + def test_excel_multiindex_columns_and_index_true(self): _skip_if_no_xlwt() cols = MultiIndex.from_tuples([('site', ''), ('2014', 'height'), ('2014', 'weight')]) - df = DataFrame(np.random.randn(10, 3), columns=cols) - with tm.assert_produces_warning(UserWarning): - with ensure_clean(self.ext) as path: - df.to_excel(path, index=True) + df = pd.DataFrame(np.random.randn(10, 3), columns=cols) + with ensure_clean(self.ext) as path: + df.to_excel(path, index=True) def test_excel_multiindex_index(self): _skip_if_no_xlwt()