Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

REF: Use more lazy iterators #58808

Merged
merged 5 commits into from
May 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pandas/core/apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -664,7 +664,7 @@ def _apply_str(self, obj, func: str, *args, **kwargs):
# people may aggregate on a non-callable attribute
# but don't let them think they can pass args to it
assert len(args) == 0
assert len([kwarg for kwarg in kwargs if kwarg not in ["axis"]]) == 0
assert all(kwarg == "axis" for kwarg in kwargs)  # NOTE(review): `not any(kwarg == "axis" ...)` inverts the original check — the removed line allowed ONLY "axis" as a kwarg; the added line forbids "axis" and allows everything else
return f
elif hasattr(np, func) and hasattr(obj, "__array__"):
# in particular exclude Window
Expand Down
12 changes: 9 additions & 3 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1750,19 +1750,25 @@ def _get_label_or_level_values(self, key: Level, axis: AxisInt = 0) -> ArrayLike
if `key` matches multiple labels
"""
axis = self._get_axis_number(axis)
other_axes = [ax for ax in range(self._AXIS_LEN) if ax != axis]
first_other_axes = next(
(ax for ax in range(self._AXIS_LEN) if ax != axis), None
)

if self._is_label_reference(key, axis=axis):
self._check_label_or_level_ambiguity(key, axis=axis)
values = self.xs(key, axis=other_axes[0])._values
if first_other_axes is None:
raise ValueError("axis matched all axes")
values = self.xs(key, axis=first_other_axes)._values
elif self._is_level_reference(key, axis=axis):
values = self.axes[axis].get_level_values(key)._values
else:
raise KeyError(key)

# Check for duplicates
if values.ndim > 1:
if other_axes and isinstance(self._get_axis(other_axes[0]), MultiIndex):
if first_other_axes is not None and isinstance(
self._get_axis(first_other_axes), MultiIndex
):
multi_message = (
"\n"
"For a multi-index, the label must be a "
Expand Down
19 changes: 9 additions & 10 deletions pandas/io/excel/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -857,24 +857,23 @@ def _parse_sheet(
# a row containing just the index name(s)
has_index_names = False
if is_list_header and not is_len_one_list_header and index_col is not None:
index_col_list: Sequence[int]
index_col_set: set[int]
if isinstance(index_col, int):
index_col_list = [index_col]
index_col_set = {index_col}
else:
assert isinstance(index_col, Sequence)
index_col_list = index_col
index_col_set = set(index_col)

# We have to handle mi without names. If any of the entries in the data
# columns are not empty, this is a regular row
assert isinstance(header, Sequence)
if len(header) < len(data):
potential_index_names = data[len(header)]
potential_data = [
x
has_index_names = all(
x == "" or x is None
for i, x in enumerate(potential_index_names)
if not control_row[i] and i not in index_col_list
]
has_index_names = all(x == "" or x is None for x in potential_data)
if not control_row[i] and i not in index_col_set
)

if is_list_like(index_col):
# Forward fill values for MultiIndex index.
Expand Down Expand Up @@ -1457,9 +1456,9 @@ def inspect_excel_format(
with zipfile.ZipFile(stream) as zf:
# Workaround for some third party files that use forward slashes and
# lower case names.
component_names = [
component_names = {
name.replace("\\", "/").lower() for name in zf.namelist()
]
}

if "xl/workbook.xml" in component_names:
return "xlsx"
Expand Down
36 changes: 16 additions & 20 deletions pandas/io/excel/_odfreader.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,29 +122,25 @@ def get_sheet_data(
table: list[list[Scalar | NaTType]] = []

for sheet_row in sheet_rows:
sheet_cells = [
x
for x in sheet_row.childNodes
if hasattr(x, "qname") and x.qname in cell_names
]
empty_cells = 0
table_row: list[Scalar | NaTType] = []

for sheet_cell in sheet_cells:
if sheet_cell.qname == table_cell_name:
value = self._get_cell_value(sheet_cell)
else:
value = self.empty_value

column_repeat = self._get_column_repeat(sheet_cell)

# Queue up empty values, writing only if content succeeds them
if value == self.empty_value:
empty_cells += column_repeat
else:
table_row.extend([self.empty_value] * empty_cells)
empty_cells = 0
table_row.extend([value] * column_repeat)
for sheet_cell in sheet_row.childNodes:
if hasattr(sheet_cell, "qname") and sheet_cell.qname in cell_names:
if sheet_cell.qname == table_cell_name:
value = self._get_cell_value(sheet_cell)
else:
value = self.empty_value

column_repeat = self._get_column_repeat(sheet_cell)

# Queue up empty values, writing only if content succeeds them
if value == self.empty_value:
empty_cells += column_repeat
else:
table_row.extend([self.empty_value] * empty_cells)
empty_cells = 0
table_row.extend([value] * column_repeat)

if max_row_len < len(table_row):
max_row_len = len(table_row)
Expand Down
11 changes: 4 additions & 7 deletions pandas/io/excel/_xlrd.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,16 +128,13 @@ def _parse_cell(cell_contents, cell_typ):
cell_contents = val
return cell_contents

data = []

nrows = sheet.nrows
if file_rows_needed is not None:
nrows = min(nrows, file_rows_needed)
for i in range(nrows):
row = [
return [
[
_parse_cell(value, typ)
for value, typ in zip(sheet.row_values(i), sheet.row_types(i))
]
data.append(row)

return data
for i in range(nrows)
]
7 changes: 4 additions & 3 deletions pandas/io/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,7 @@ def _convert_arrays_to_dataframe(
dtype_backend: DtypeBackend | Literal["numpy"] = "numpy",
) -> DataFrame:
content = lib.to_object_array_tuples(data)
idx_len = content.shape[0]
arrays = convert_object_array(
list(content.T),
dtype=None,
Expand All @@ -177,9 +178,9 @@ def _convert_arrays_to_dataframe(
result_arrays.append(ArrowExtensionArray(pa_array))
arrays = result_arrays # type: ignore[assignment]
if arrays:
df = DataFrame(dict(zip(range(len(columns)), arrays)))
df.columns = columns
return df
return DataFrame._from_arrays(
arrays, columns=columns, index=range(idx_len), verify_integrity=False
)
else:
return DataFrame(columns=columns)

Expand Down