diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 2b35cfa044ae9..6063ac098a4dc 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -780,143 +780,195 @@ def parse( output[asheetname] = DataFrame() continue - is_list_header = False - is_len_one_list_header = False - if is_list_like(header): - assert isinstance(header, Sequence) - is_list_header = True - if len(header) == 1: - is_len_one_list_header = True - - if is_len_one_list_header: - header = cast(Sequence[int], header)[0] - - # forward fill and pull out names for MultiIndex column - header_names = None - if header is not None and is_list_like(header): - assert isinstance(header, Sequence) - - header_names = [] - control_row = [True] * len(data[0]) - - for row in header: - if is_integer(skiprows): - assert isinstance(skiprows, int) - row += skiprows - - if row > len(data) - 1: - raise ValueError( - f"header index {row} exceeds maximum index " - f"{len(data) - 1} of data.", - ) - - data[row], control_row = fill_mi_header(data[row], control_row) - - if index_col is not None: - header_name, _ = pop_header_name(data[row], index_col) - header_names.append(header_name) - - # If there is a MultiIndex header and an index then there is also - # a row containing just the index name(s) - has_index_names = False - if is_list_header and not is_len_one_list_header and index_col is not None: - index_col_list: Sequence[int] - if isinstance(index_col, int): - index_col_list = [index_col] - else: - assert isinstance(index_col, Sequence) - index_col_list = index_col - - # We have to handle mi without names. If any of the entries in the data - # columns are not empty, this is a regular row - assert isinstance(header, Sequence) - if len(header) < len(data): - potential_index_names = data[len(header)] - potential_data = [ - x - for i, x in enumerate(potential_index_names) - if not control_row[i] and i not in index_col_list - ] - has_index_names = all(x == "" or x is None for x in potential_data) - - if is_list_like(index_col): - # Forward fill values for MultiIndex index. - if header is None: - offset = 0 - elif isinstance(header, int): - offset = 1 + header - else: - offset = 1 + max(header) + output = self._parse_sheet( + data=data, + output=output, + asheetname=asheetname, + header=header, + names=names, + index_col=index_col, + usecols=usecols, + dtype=dtype, + skiprows=skiprows, + nrows=nrows, + true_values=true_values, + false_values=false_values, + na_values=na_values, + parse_dates=parse_dates, + date_parser=date_parser, + date_format=date_format, + thousands=thousands, + decimal=decimal, + comment=comment, + skipfooter=skipfooter, + dtype_backend=dtype_backend, + **kwds, + ) - # GH34673: if MultiIndex names present and not defined in the header, - # offset needs to be incremented so that forward filling starts - # from the first MI value instead of the name - if has_index_names: - offset += 1 + if last_sheetname is None: + raise ValueError("Sheet name is an empty list") - # Check if we have an empty dataset - # before trying to collect data. - if offset < len(data): - assert isinstance(index_col, Sequence) + if ret_dict: + return output + else: + return output[last_sheetname] - for col in index_col: - last = data[offset][col] + def _parse_sheet( + self, + data: list, + output: dict, + asheetname: str | int | None = None, + header: int | Sequence[int] | None = 0, + names: SequenceNotStr[Hashable] | range | None = None, + index_col: int | Sequence[int] | None = None, + usecols=None, + dtype: DtypeArg | None = None, + skiprows: Sequence[int] | int | Callable[[int], object] | None = None, + nrows: int | None = None, + true_values: Iterable[Hashable] | None = None, + false_values: Iterable[Hashable] | None = None, + na_values=None, + parse_dates: list | dict | bool = False, + date_parser: Callable | lib.NoDefault = lib.no_default, + date_format: dict[Hashable, str] | str | None = None, + thousands: str | None = None, + decimal: str = ".", + comment: str | None = None, + skipfooter: int = 0, + dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, + **kwds, + ): + is_list_header = False + is_len_one_list_header = False + if is_list_like(header): + assert isinstance(header, Sequence) + is_list_header = True + if len(header) == 1: + is_len_one_list_header = True + + if is_len_one_list_header: + header = cast(Sequence[int], header)[0] + + # forward fill and pull out names for MultiIndex column + header_names = None + if header is not None and is_list_like(header): + assert isinstance(header, Sequence) + + header_names = [] + control_row = [True] * len(data[0]) + + for row in header: + if is_integer(skiprows): + assert isinstance(skiprows, int) + row += skiprows + + if row > len(data) - 1: + raise ValueError( + f"header index {row} exceeds maximum index " + f"{len(data) - 1} of data.", + ) - for row in range(offset + 1, len(data)): - if data[row][col] == "" or data[row][col] is None: - data[row][col] = last - else: - last = data[row][col] + data[row], control_row = fill_mi_header(data[row], control_row) - # GH 12292 : error when read one empty column from excel file - try: - parser = TextParser( - data, - names=names, - header=header, - index_col=index_col, - has_index_names=has_index_names, - dtype=dtype, - true_values=true_values, - false_values=false_values, - skiprows=skiprows, - nrows=nrows, - na_values=na_values, - skip_blank_lines=False, # GH 39808 - parse_dates=parse_dates, - date_parser=date_parser, - date_format=date_format, - thousands=thousands, - decimal=decimal, - comment=comment, - skipfooter=skipfooter, - usecols=usecols, - dtype_backend=dtype_backend, - **kwds, - ) + if index_col is not None: + header_name, _ = pop_header_name(data[row], index_col) + header_names.append(header_name) - output[asheetname] = parser.read(nrows=nrows) + # If there is a MultiIndex header and an index then there is also + # a row containing just the index name(s) + has_index_names = False + if is_list_header and not is_len_one_list_header and index_col is not None: + index_col_list: Sequence[int] + if isinstance(index_col, int): + index_col_list = [index_col] + else: + assert isinstance(index_col, Sequence) + index_col_list = index_col + + # We have to handle mi without names. If any of the entries in the data + # columns are not empty, this is a regular row + assert isinstance(header, Sequence) + if len(header) < len(data): + potential_index_names = data[len(header)] + potential_data = [ + x + for i, x in enumerate(potential_index_names) + if not control_row[i] and i not in index_col_list + ] + has_index_names = all(x == "" or x is None for x in potential_data) + + if is_list_like(index_col): + # Forward fill values for MultiIndex index. + if header is None: + offset = 0 + elif isinstance(header, int): + offset = 1 + header + else: + offset = 1 + max(header) + + # GH34673: if MultiIndex names present and not defined in the header, + # offset needs to be incremented so that forward filling starts + # from the first MI value instead of the name + if has_index_names: + offset += 1 + + # Check if we have an empty dataset + # before trying to collect data. + if offset < len(data): + assert isinstance(index_col, Sequence) + + for col in index_col: + last = data[offset][col] + + for row in range(offset + 1, len(data)): + if data[row][col] == "" or data[row][col] is None: + data[row][col] = last + else: + last = data[row][col] + + # GH 12292 : error when read one empty column from excel file + try: + parser = TextParser( + data, + names=names, + header=header, + index_col=index_col, + has_index_names=has_index_names, + dtype=dtype, + true_values=true_values, + false_values=false_values, + skiprows=skiprows, + nrows=nrows, + na_values=na_values, + skip_blank_lines=False, # GH 39808 + parse_dates=parse_dates, + date_parser=date_parser, + date_format=date_format, + thousands=thousands, + decimal=decimal, + comment=comment, + skipfooter=skipfooter, + usecols=usecols, + dtype_backend=dtype_backend, + **kwds, + ) - if header_names: - output[asheetname].columns = output[asheetname].columns.set_names( - header_names - ) + output[asheetname] = parser.read(nrows=nrows) - except EmptyDataError: - # No Data, return an empty DataFrame - output[asheetname] = DataFrame() + if header_names: + output[asheetname].columns = output[asheetname].columns.set_names( + header_names + ) - except Exception as err: - err.args = (f"{err.args[0]} (sheet: {asheetname})", *err.args[1:]) - raise err + except EmptyDataError: + # No Data, return an empty DataFrame + output[asheetname] = DataFrame() - if last_sheetname is None: - raise ValueError("Sheet name is an empty list") + except Exception as err: + err.args = (f"{err.args[0]} (sheet: {asheetname})", *err.args[1:]) + raise err - if ret_dict: - return output - else: - return output[last_sheetname] + return output @doc(storage_options=_shared_docs["storage_options"])