diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 9bd4ddbb624d91..3c1439823ea44f 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -820,6 +820,7 @@ Deprecations precision through the ``rtol``, and ``atol`` parameters, thus deprecating the ``check_less_precise`` parameter. (:issue:`13357`). - :func:`DataFrame.melt` accepting a value_name that already exists is deprecated, and will be removed in a future version (:issue:`34731`) +- :func:`read_excel` default engine "xlrd" is replaced by "openpyxl" because "xlrd" is deprecated (:issue:`28547`). .. --------------------------------------------------------------------------- diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 4fa4f158e9c3c4..3a2fe012c2d413 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -4,6 +4,7 @@ import os from textwrap import fill from typing import Union +import warnings from pandas._config import config @@ -810,8 +811,7 @@ def _is_ods_stream(stream: Union[BufferedIOBase, RawIOBase]) -> bool: class ExcelFile: """ Class for parsing tabular excel sheets into DataFrame objects. - - Uses xlrd engine by default. See read_excel for more documentation + Uses xlrd, openpyxl or odf. See read_excel for more documentation Parameters ---------- @@ -822,7 +822,7 @@ class ExcelFile: engine : str, default None If io is not a buffer or path, this must be set to identify io. Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb``, - default ``xlrd``. + default ``openpyxl``, ``xlrd`` for .xls files, ``odf`` for .ods files. Engine compatibility : - ``xlrd`` supports most old/new Excel file formats. - ``openpyxl`` supports newer Excel file formats. @@ -844,7 +844,7 @@ class ExcelFile: def __init__(self, path_or_buffer, engine=None): if engine is None: - engine = "xlrd" + engine = "openpyxl" if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase)): if _is_ods_stream(path_or_buffer): engine = "odf" @@ -852,6 +852,16 @@ def __init__(self, path_or_buffer, engine=None): ext = os.path.splitext(str(path_or_buffer))[-1] if ext == ".ods": engine = "odf" + elif ext == ".xls": + engine = "xlrd" + + elif engine == "xlrd": + warnings.warn( + 'The Excel reader engine "xlrd" is deprecated, use "openpyxl" instead. ' + 'Specify engine="openpyxl" to suppress this warning.', + FutureWarning, + stacklevel=2, + ) if engine not in self._engines: raise ValueError(f"Unknown engine: {engine}") diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 0696d82e51f346..5a3e1119173667 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -1,3 +1,4 @@ +from datetime import datetime from typing import List import numpy as np @@ -511,7 +512,11 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar: # TODO: replace with openpyxl constants if cell.is_date: - return cell.value + try: + # workaround for inaccurate timestamp notation in excel + return datetime.fromtimestamp(round(cell.value.timestamp())) + except (AttributeError, OSError): + return cell.value elif cell.data_type == "e": return np.nan elif cell.data_type == "b": diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 955db982f8300f..c3dac83fde3b50 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -40,6 +40,9 @@ def ignore_xlrd_time_clock_warning(): marks=[ td.skip_if_no("xlrd"), pytest.mark.filterwarnings("ignore:.*(tree\\.iter|html argument)"), + pytest.mark.filterwarnings( + 'ignore:The Excel reader engine "xlrd" is deprecated,' + ), ], ), pytest.param( @@ -52,8 +55,8 @@ def ignore_xlrd_time_clock_warning(): pytest.param( None, marks=[ - td.skip_if_no("xlrd"), - pytest.mark.filterwarnings("ignore:.*(tree\\.iter|html argument)"), + td.skip_if_no("openpyxl"), + pytest.mark.filterwarnings("ignore:.*html argument"), ], ), pytest.param("pyxlsb", marks=td.skip_if_no("pyxlsb")), @@ -69,6 +72,8 @@ def _is_valid_engine_ext_pair(engine, read_ext: str) -> bool: engine = engine.values[0] if engine == "openpyxl" and read_ext == ".xls": return False + if engine is None and read_ext == ".xls": + return False if engine == "odf" and read_ext != ".ods": return False if read_ext == ".ods" and engine != "odf": @@ -579,7 +584,7 @@ def test_date_conversion_overflow(self, read_ext): columns=["DateColWithBigInt", "StringCol"], ) - if pd.read_excel.keywords["engine"] == "openpyxl": + if pd.read_excel.keywords["engine"] in ["openpyxl", None]: pytest.xfail("Maybe not supported by openpyxl") result = pd.read_excel("testdateoverflow" + read_ext) @@ -962,12 +967,28 @@ def test_read_excel_squeeze(self, read_ext): expected = pd.Series([1, 2, 3], name="a") tm.assert_series_equal(actual, expected) - def test_deprecated_kwargs(self, read_ext): + def test_deprecated_kwargs(self, engine, read_ext): + if engine == "xlrd": + pytest.skip("Use of xlrd engine produces a FutureWarning as well") + with tm.assert_produces_warning(FutureWarning, raise_on_extra_warnings=False): pd.read_excel("test1" + read_ext, "Sheet1", 0) pd.read_excel("test1" + read_ext) + def test_excel_high_surrogate(self, engine, read_ext): + # GH 23809 + if read_ext != ".xlsx": + pytest.skip("Test is only applicable to .xlsx file") + if engine in ["openpyxl", None]: + pytest.skip("Test does not work for openpyxl") + + expected = pd.DataFrame(["\udc88"], columns=["Column1"]) + + # should not produce a segmentation violation + actual = pd.read_excel("high_surrogate.xlsx") + tm.assert_frame_equal(expected, actual) + class TestExcelFileRead: @pytest.fixture(autouse=True) @@ -1123,14 +1144,6 @@ def test_excel_read_binary(self, engine, read_ext): actual = pd.read_excel(data, engine=engine) tm.assert_frame_equal(expected, actual) - def test_excel_high_surrogate(self, engine): - # GH 23809 - expected = pd.DataFrame(["\udc88"], columns=["Column1"]) - - # should not produce a segmentation violation - actual = pd.read_excel("high_surrogate.xlsx") - tm.assert_frame_equal(expected, actual) - @pytest.mark.parametrize("filename", ["df_empty.xlsx", "df_equals.xlsx"]) def test_header_with_index_col(self, engine, filename): # GH 33476 diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index e3ee53b63e102a..1063834b8367c1 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -351,12 +351,16 @@ def test_excel_sheet_by_name_raise(self, path, engine): msg = "sheet 0 not found" with pytest.raises(ValueError, match=msg): pd.read_excel(xl, "0") - else: + elif engine == "xlwt": import xlrd msg = "No sheet named <'0'>" with pytest.raises(xlrd.XLRDError, match=msg): pd.read_excel(xl, sheet_name="0") + else: # openpyxl + msg = "Worksheet 0 does not exist." + with pytest.raises(KeyError, match=msg): + pd.read_excel(xl, sheet_name="0") def test_excel_writer_context_manager(self, frame, path): with ExcelWriter(path) as writer: @@ -1199,6 +1203,9 @@ def test_datetimes(self, path): tm.assert_series_equal(write_frame["A"], read_frame["A"]) + @pytest.mark.filterwarnings( + 'ignore:The Excel reader engine "xlrd" is deprecated:FutureWarning' + ) def test_bytes_io(self, engine): # see gh-7074 bio = BytesIO() @@ -1209,8 +1216,15 @@ def test_bytes_io(self, engine): df.to_excel(writer) writer.save() + if engine == "xlwt": + read_engine = "xlrd" + elif engine == "xlsxwriter": + read_engine = "openpyxl" + else: + read_engine = engine + bio.seek(0) - reread_df = pd.read_excel(bio, index_col=0) + reread_df = pd.read_excel(bio, index_col=0, engine=read_engine) tm.assert_frame_equal(df, reread_df) def test_write_lists_dict(self, path): diff --git a/pandas/tests/io/excel/test_xlrd.py b/pandas/tests/io/excel/test_xlrd.py index 1c9c514b20f462..ec9386f3672455 100644 --- a/pandas/tests/io/excel/test_xlrd.py +++ b/pandas/tests/io/excel/test_xlrd.py @@ -17,6 +17,9 @@ def skip_ods_and_xlsb_files(read_ext): pytest.skip("Not valid for xlrd") +@pytest.mark.filterwarnings( + 'ignore:The Excel reader engine "xlrd" is deprecated:FutureWarning' +) def test_read_xlrd_book(read_ext, frame): df = frame @@ -36,8 +39,31 @@ def test_read_xlrd_book(read_ext, frame): # TODO: test for openpyxl as well +@pytest.mark.filterwarnings( + 'ignore:The Excel reader engine "xlrd" is deprecated:FutureWarning' +) def test_excel_table_sheet_by_index(datapath, read_ext): path = datapath("io", "data", "excel", f"test1{read_ext}") - with pd.ExcelFile(path) as excel: + with pd.ExcelFile(path, engine="xlrd") as excel: with pytest.raises(xlrd.XLRDError): pd.read_excel(excel, sheet_name="asdf") + + +def test_excel_file_warning_with_xlsx_file(datapath): + # GH 29375 + path = datapath("io", "data", "excel", "test1.xlsx") + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=True, raise_on_extra_warnings=False + ) as w: + pd.ExcelFile(path, engine="xlrd") + assert '"xlrd" is deprecated, use "openpyxl" instead.' in str(w[0].message) + + +def test_read_excel_warning_with_xlsx_file(tmpdir, datapath): + # GH 29375 + path = datapath("io", "data", "excel", "test1.xlsx") + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False, raise_on_extra_warnings=False + ) as w: + pd.read_excel(path, "Sheet1", engine="xlrd") + assert '"xlrd" is deprecated, use "openpyxl" instead.' in str(w[0].message)