Skip to content

Commit

Permalink
REF: Read excel parse refactor (#58497)
Browse files: browse the repository at this point in the history
  • Loading branch information
iangainey committed May 1, 2024
1 parent 7320430 commit f6932cb
Showing 1 changed file with 178 additions and 126 deletions.
304 changes: 178 additions & 126 deletions pandas/io/excel/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -780,143 +780,195 @@ def parse(
output[asheetname] = DataFrame()
continue

is_list_header = False
is_len_one_list_header = False
if is_list_like(header):
assert isinstance(header, Sequence)
is_list_header = True
if len(header) == 1:
is_len_one_list_header = True

if is_len_one_list_header:
header = cast(Sequence[int], header)[0]

# forward fill and pull out names for MultiIndex column
header_names = None
if header is not None and is_list_like(header):
assert isinstance(header, Sequence)

header_names = []
control_row = [True] * len(data[0])

for row in header:
if is_integer(skiprows):
assert isinstance(skiprows, int)
row += skiprows

if row > len(data) - 1:
raise ValueError(
f"header index {row} exceeds maximum index "
f"{len(data) - 1} of data.",
)

data[row], control_row = fill_mi_header(data[row], control_row)

if index_col is not None:
header_name, _ = pop_header_name(data[row], index_col)
header_names.append(header_name)

# If there is a MultiIndex header and an index then there is also
# a row containing just the index name(s)
has_index_names = False
if is_list_header and not is_len_one_list_header and index_col is not None:
index_col_list: Sequence[int]
if isinstance(index_col, int):
index_col_list = [index_col]
else:
assert isinstance(index_col, Sequence)
index_col_list = index_col

# We have to handle mi without names. If any of the entries in the data
# columns are not empty, this is a regular row
assert isinstance(header, Sequence)
if len(header) < len(data):
potential_index_names = data[len(header)]
potential_data = [
x
for i, x in enumerate(potential_index_names)
if not control_row[i] and i not in index_col_list
]
has_index_names = all(x == "" or x is None for x in potential_data)

if is_list_like(index_col):
# Forward fill values for MultiIndex index.
if header is None:
offset = 0
elif isinstance(header, int):
offset = 1 + header
else:
offset = 1 + max(header)
output = self._parse_sheet(
data=data,
output=output,
asheetname=asheetname,
header=header,
names=names,
index_col=index_col,
usecols=usecols,
dtype=dtype,
skiprows=skiprows,
nrows=nrows,
true_values=true_values,
false_values=false_values,
na_values=na_values,
parse_dates=parse_dates,
date_parser=date_parser,
date_format=date_format,
thousands=thousands,
decimal=decimal,
comment=comment,
skipfooter=skipfooter,
dtype_backend=dtype_backend,
**kwds,
)

# GH34673: if MultiIndex names present and not defined in the header,
# offset needs to be incremented so that forward filling starts
# from the first MI value instead of the name
if has_index_names:
offset += 1
if last_sheetname is None:
raise ValueError("Sheet name is an empty list")

# Check if we have an empty dataset
# before trying to collect data.
if offset < len(data):
assert isinstance(index_col, Sequence)
if ret_dict:
return output
else:
return output[last_sheetname]

for col in index_col:
last = data[offset][col]
def _parse_sheet(
    self,
    data: list,
    output: dict,
    asheetname: str | int | None = None,
    header: int | Sequence[int] | None = 0,
    names: SequenceNotStr[Hashable] | range | None = None,
    index_col: int | Sequence[int] | None = None,
    usecols=None,
    dtype: DtypeArg | None = None,
    skiprows: Sequence[int] | int | Callable[[int], object] | None = None,
    nrows: int | None = None,
    true_values: Iterable[Hashable] | None = None,
    false_values: Iterable[Hashable] | None = None,
    na_values=None,
    parse_dates: list | dict | bool = False,
    date_parser: Callable | lib.NoDefault = lib.no_default,
    date_format: dict[Hashable, str] | str | None = None,
    thousands: str | None = None,
    decimal: str = ".",
    comment: str | None = None,
    skipfooter: int = 0,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
    **kwds,
) -> dict:
    """
    Parse the raw rows of one sheet and store the result in ``output``.

    Parameters
    ----------
    data : list
        Rows of the sheet, each row itself a list of cell values. Header
        rows and index columns are modified in place.
    output : dict
        Mapping the parsed frame is written into under ``asheetname``.
    asheetname : str, int or None
        Key used in ``output``; also appended to the message of any
        unexpected error.
    header : int, sequence of int or None
        Row(s) holding the column labels. A one-element sequence is
        unwrapped and handled like a plain integer.
    index_col : int, sequence of int or None
        Column(s) used as the index. When list-like, blank cells
        (``""`` or ``None``) in those columns are forward filled.
    skiprows : sequence of int, int, callable or None
        When an integer, it is added to each header row position before
        that row is looked up in ``data``.
    names, usecols, dtype, nrows, true_values, false_values, na_values, \
parse_dates, date_parser, date_format, thousands, decimal, comment, \
skipfooter, dtype_backend, **kwds
        Forwarded unchanged to ``TextParser``.

    Returns
    -------
    dict
        The same ``output`` object, with ``output[asheetname]`` set.

    Raises
    ------
    ValueError
        If a header row position lies beyond the last row of ``data``.
    Exception
        Any error raised while parsing, other than ``EmptyDataError``, is
        re-raised with ``" (sheet: <asheetname>)"`` added to its message.
    """
    is_list_header = False
    is_len_one_list_header = False
    if is_list_like(header):
        assert isinstance(header, Sequence)
        is_list_header = True
        if len(header) == 1:
            is_len_one_list_header = True

    if is_len_one_list_header:
        header = cast(Sequence[int], header)[0]

    # forward fill and pull out names for MultiIndex column
    header_names = None
    if header is not None and is_list_like(header):
        assert isinstance(header, Sequence)

        header_names = []
        control_row = [True] * len(data[0])

        for row in header:
            if is_integer(skiprows):
                assert isinstance(skiprows, int)
                row += skiprows

            if row > len(data) - 1:
                raise ValueError(
                    f"header index {row} exceeds maximum index "
                    f"{len(data) - 1} of data.",
                )

            data[row], control_row = fill_mi_header(data[row], control_row)

            if index_col is not None:
                header_name, _ = pop_header_name(data[row], index_col)
                header_names.append(header_name)

    # If there is a MultiIndex header and an index then there is also
    # a row containing just the index name(s)
    has_index_names = False
    if is_list_header and not is_len_one_list_header and index_col is not None:
        index_col_list: Sequence[int]
        if isinstance(index_col, int):
            index_col_list = [index_col]
        else:
            assert isinstance(index_col, Sequence)
            index_col_list = index_col

        # We have to handle mi without names. If any of the entries in the data
        # columns are not empty, this is a regular row
        assert isinstance(header, Sequence)
        if len(header) < len(data):
            potential_index_names = data[len(header)]
            potential_data = [
                x
                for i, x in enumerate(potential_index_names)
                if not control_row[i] and i not in index_col_list
            ]
            has_index_names = all(x == "" or x is None for x in potential_data)

    if is_list_like(index_col):
        # Forward fill values for MultiIndex index.
        if header is None:
            offset = 0
        elif isinstance(header, int):
            offset = 1 + header
        else:
            offset = 1 + max(header)

        # GH34673: if MultiIndex names present and not defined in the header,
        # offset needs to be incremented so that forward filling starts
        # from the first MI value instead of the name
        if has_index_names:
            offset += 1

        # Check if we have an empty dataset
        # before trying to collect data.
        if offset < len(data):
            assert isinstance(index_col, Sequence)

            for col in index_col:
                last = data[offset][col]

                for row in range(offset + 1, len(data)):
                    if data[row][col] == "" or data[row][col] is None:
                        data[row][col] = last
                    else:
                        last = data[row][col]

    # GH 12292 : error when read one empty column from excel file
    try:
        parser = TextParser(
            data,
            names=names,
            header=header,
            index_col=index_col,
            has_index_names=has_index_names,
            dtype=dtype,
            true_values=true_values,
            false_values=false_values,
            skiprows=skiprows,
            nrows=nrows,
            na_values=na_values,
            skip_blank_lines=False,  # GH 39808
            parse_dates=parse_dates,
            date_parser=date_parser,
            date_format=date_format,
            thousands=thousands,
            decimal=decimal,
            comment=comment,
            skipfooter=skipfooter,
            usecols=usecols,
            dtype_backend=dtype_backend,
            **kwds,
        )

        output[asheetname] = parser.read(nrows=nrows)

        if header_names:
            output[asheetname].columns = output[asheetname].columns.set_names(
                header_names
            )

    except EmptyDataError:
        # No Data, return an empty DataFrame
        output[asheetname] = DataFrame()

    except Exception as err:
        # Tag the error with the sheet it came from. An exception may carry
        # no args at all; indexing args[0] unconditionally would raise
        # IndexError here and hide the real failure.
        sheet_note = f"(sheet: {asheetname})"
        if err.args:
            err.args = (f"{err.args[0]} {sheet_note}", *err.args[1:])
        else:
            err.args = (sheet_note,)
        raise

    return output


@doc(storage_options=_shared_docs["storage_options"])
Expand Down

0 comments on commit f6932cb

Please sign in to comment.