Skip to content

Commit

Permalink
REF: Break up stack_v3
Browse files Browse the repository at this point in the history
  • Loading branch information
rhshadrach committed May 19, 2024
1 parent 3f8f704 commit 281f288
Showing 1 changed file with 78 additions and 51 deletions.
129 changes: 78 additions & 51 deletions pandas/core/reshape/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -925,27 +925,99 @@ def _reorder_for_extension_array_stack(
def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame:
    """
    Stack the requested column levels of ``frame`` into its index.

    The data itself is rearranged by ``stack_reshape``; this function then
    builds the resulting MultiIndex (the frame's index followed by the
    stacked column levels), reorders the rows, and collapses the result to
    a Series when every column level has been stacked.

    Parameters
    ----------
    frame : DataFrame
        DataFrame to stack. Its columns must not contain duplicates.
    level : list of int
        Level numbers of ``frame.columns`` to stack. The order given here is
        the order in which the levels are appended to the index.

    Returns
    -------
    Series or DataFrame
        A Series (with ``name=None``) when ``level`` covers all column
        levels, otherwise a DataFrame.

    Raises
    ------
    ValueError
        If ``frame.columns`` contains duplicate values.
    """
    if frame.columns.nunique() != len(frame.columns):
        raise ValueError("Columns with duplicate values are not supported in stack")

    set_levels = set(level)
    # Drop every column level that is *not* being stacked (highest level
    # number first); what remains are the labels that move into the index.
    stack_cols = frame.columns._drop_level_numbers(
        [k for k in range(frame.columns.nlevels - 1, -1, -1) if k not in set_levels]
    )

    result = stack_reshape(frame, level, set_levels, stack_cols)

    # Construct the correct MultiIndex by combining the frame's index and
    # stacked columns.
    # ``ratio`` is how many times the frame's index must be repeated to span
    # ``result``; guard the division for an empty frame.
    ratio = 0 if frame.empty else len(result) // len(frame)

    index_levels: list | FrozenList
    if isinstance(frame.index, MultiIndex):
        index_levels = frame.index.levels
        index_codes = list(np.tile(frame.index.codes, (1, ratio)))
    else:
        codes, uniques = factorize(frame.index, use_na_sentinel=False)
        index_levels = [uniques]
        index_codes = list(np.tile(codes, (1, ratio)))

    if len(level) > 1:
        # Arrange columns in the order we want to take them, e.g. level=[2, 0, 1]
        sorter = np.argsort(level)
        assert isinstance(stack_cols, MultiIndex)
        ordered_stack_cols = stack_cols._reorder_ilevels(sorter)
    else:
        ordered_stack_cols = stack_cols

    ordered_stack_cols_unique = ordered_stack_cols.unique()
    if isinstance(ordered_stack_cols, MultiIndex):
        column_levels = ordered_stack_cols.levels
        column_codes = ordered_stack_cols.drop_duplicates().codes
    else:
        column_levels = [ordered_stack_cols_unique]
        column_codes = [factorize(ordered_stack_cols_unique, use_na_sentinel=False)[0]]

    # Each stacked label applies to one full pass over the frame's rows.
    # error: Incompatible types in assignment (expression has type "list[ndarray[Any,
    # dtype[Any]]]", variable has type "FrozenList")
    column_codes = [np.repeat(codes, len(frame)) for codes in column_codes]  # type: ignore[assignment]
    result.index = MultiIndex(
        levels=index_levels + column_levels,
        codes=index_codes + column_codes,
        names=frame.index.names + list(ordered_stack_cols.names),
        verify_integrity=False,
    )

    # sort result, but faster than calling sort_index since we know the order we need
    # Position ``j * len_df + i`` is taken for each frame row ``i`` and each
    # unique stacked label ``j``, with ``i`` varying slowest, so all entries
    # belonging to one original row end up adjacent.
    len_df = len(frame)
    n_uniques = len(ordered_stack_cols_unique)
    indexer = np.arange(n_uniques)
    idxs = np.tile(len_df * indexer, len_df) + np.repeat(np.arange(len_df), n_uniques)
    result = result.take(idxs)

    # Reshape/rename if needed and dropna
    if result.ndim == 2 and frame.columns.nlevels == len(level):
        # Every column level was stacked, so the result is one-dimensional.
        if len(result.columns) == 0:
            result = Series(index=result.index)
        else:
            result = result.iloc[:, 0]
    if result.ndim == 1:
        result.name = None

    return result


def stack_reshape(
frame: DataFrame, level: list[int], set_levels: set[int], stack_cols: Index
) -> Series | DataFrame:
"""Reshape the data of a frame for stack.
This function takes care of most of the work that stack needs to do. Caller
will sort the result once the appropriate index is set.
Parameters
----------
frame: DataFrame
DataFrame that is to be stacked.
level: list of ints.
Levels of the columns to stack.
set_levels: set of ints.
Same as level, but as a set.
stack_cols: Index.
Columns of the result when the DataFrame is stacked.
Returns
-------
The data behind the stacked DataFrame.
"""
# If we need to drop `level` from columns, it needs to be in descending order
drop_levnums = sorted(level, reverse=True)

# Grab data for each unique index to be stacked
buf = []
for idx in stack_cols_unique:
for idx in stack_cols.unique():
if len(frame.columns) == 1:
data = frame.copy()
else:
Expand All @@ -972,10 +1044,8 @@ def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame:
data.columns = RangeIndex(len(data.columns))
buf.append(data)

result: Series | DataFrame
if len(buf) > 0 and not frame.empty:
result = concat(buf, ignore_index=True)
ratio = len(result) // len(frame)
else:
# input is empty
if len(level) < frame.columns.nlevels:
Expand All @@ -984,54 +1054,11 @@ def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame:
else:
new_columns = [0]
result = DataFrame(columns=new_columns, dtype=frame._values.dtype)
ratio = 0

if len(level) < frame.columns.nlevels:
# concat column order may be different from dropping the levels
desired_columns = frame.columns._drop_level_numbers(drop_levnums).unique()
if not result.columns.equals(desired_columns):
result = result[desired_columns]

# Construct the correct MultiIndex by combining the frame's index and
# stacked columns.
index_levels: list | FrozenList
if isinstance(frame.index, MultiIndex):
index_levels = frame.index.levels
index_codes = list(np.tile(frame.index.codes, (1, ratio)))
else:
codes, uniques = factorize(frame.index, use_na_sentinel=False)
index_levels = [uniques]
index_codes = list(np.tile(codes, (1, ratio)))
if isinstance(ordered_stack_cols, MultiIndex):
column_levels = ordered_stack_cols.levels
column_codes = ordered_stack_cols.drop_duplicates().codes
else:
column_levels = [ordered_stack_cols.unique()]
column_codes = [factorize(ordered_stack_cols_unique, use_na_sentinel=False)[0]]
# error: Incompatible types in assignment (expression has type "list[ndarray[Any,
# dtype[Any]]]", variable has type "FrozenList")
column_codes = [np.repeat(codes, len(frame)) for codes in column_codes] # type: ignore[assignment]
result.index = MultiIndex(
levels=index_levels + column_levels,
codes=index_codes + column_codes,
names=frame.index.names + list(ordered_stack_cols.names),
verify_integrity=False,
)

# sort result, but faster than calling sort_index since we know the order we need
len_df = len(frame)
n_uniques = len(ordered_stack_cols_unique)
indexer = np.arange(n_uniques)
idxs = np.tile(len_df * indexer, len_df) + np.repeat(np.arange(len_df), n_uniques)
result = result.take(idxs)

# Reshape/rename if needed and dropna
if result.ndim == 2 and frame.columns.nlevels == len(level):
if len(result.columns) == 0:
result = Series(index=result.index)
else:
result = result.iloc[:, 0]
if result.ndim == 1:
result.name = None

return result

0 comments on commit 281f288

Please sign in to comment.