Skip to content

Commit

Permalink
PERF: remove use of Panel & perf in rolling corr/cov (#19257)
Browse files Browse the repository at this point in the history
* PERF: remove use of Panel & perf in rolling corr/cov

closes #17917
  • Loading branch information
jreback committed Feb 1, 2018
1 parent 4eb0cec commit b5dd6a3
Show file tree
Hide file tree
Showing 5 changed files with 69 additions and 35 deletions.
25 changes: 23 additions & 2 deletions asv_bench/benchmarks/rolling.py
Expand Up @@ -11,8 +11,8 @@ class Methods(object):
[10, 1000],
['int', 'float'],
['median', 'mean', 'max', 'min', 'std', 'count', 'skew', 'kurt',
'sum', 'corr', 'cov'])
param_names = ['constructor', 'window', 'dtype', 'method']
'sum'])
param_names = ['contructor', 'window', 'dtype', 'method']

def setup(self, constructor, window, dtype, method):
N = 10**5
Expand All @@ -23,6 +23,27 @@ def time_rolling(self, constructor, window, dtype, method):
getattr(self.roll, method)()


class Pairwise(object):
    """ASV benchmark timing ``corr``/``cov`` with an explicit ``pairwise`` flag.

    The benchmark is parameterized over:

    * ``window``   -- rolling window size; ``None`` selects ``expanding()``
    * ``method``   -- name of the window method to call
    * ``pairwise`` -- value forwarded as the ``pairwise`` keyword
    """

    sample_time = 0.2
    params = ([10, 1000, None],
              ['corr', 'cov'],
              [True, False])
    param_names = ['window', 'method', 'pairwise']

    def setup(self, window, method, pairwise):
        # The parameters are unused here; the signature mirrors
        # ``param_names`` so every parameter combination gets the same
        # kind of fixture: a frame built from a 1-D array of random floats.
        n_rows = 10**4
        self.df = pd.DataFrame(np.random.random(n_rows))

    def time_pairwise(self, window, method, pairwise):
        # ``window is None`` stands for the expanding variant.
        windowed = (self.df.expanding() if window is None
                    else self.df.rolling(window=window))
        compute = getattr(windowed, method)
        compute(self.df, pairwise=pairwise)


class Quantile(object):

sample_time = 0.2
Expand Down
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.23.0.txt
Expand Up @@ -383,7 +383,7 @@ Performance Improvements
- :func:`Series` / :func:`DataFrame` tab completion limits to 100 values, for better performance. (:issue:`18587`)
- Improved performance of :func:`DataFrame.median` with ``axis=1`` when bottleneck is not installed (:issue:`16468`)
- Improved performance of :func:`MultiIndex.get_loc` for large indexes, at the cost of a reduction in performance for small ones (:issue:`18519`)

- Improved performance of pairwise ``.rolling()`` and ``.expanding()`` with ``.cov()`` and ``.corr()`` operations (:issue:`17917`)

.. _whatsnew_0230.docs:

Expand Down
8 changes: 2 additions & 6 deletions pandas/core/reshape/pivot.py
Expand Up @@ -99,19 +99,15 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean',

if not dropna:
from pandas import MultiIndex
try:
if table.index.nlevels > 1:
m = MultiIndex.from_arrays(cartesian_product(table.index.levels),
names=table.index.names)
table = table.reindex(m, axis=0)
except AttributeError:
pass # it's a single level

try:
if table.columns.nlevels > 1:
m = MultiIndex.from_arrays(cartesian_product(table.columns.levels),
names=table.columns.names)
table = table.reindex(m, axis=1)
except AttributeError:
pass # it's a single level or a series

if isinstance(table, ABCDataFrame):
table = table.sort_index(axis=1)
Expand Down
47 changes: 30 additions & 17 deletions pandas/core/window.py
Expand Up @@ -1863,25 +1863,38 @@ def dataframe_from_int_dict(data, frame_template):
results[i][j] = f(*_prep_binary(arg1.iloc[:, i],
arg2.iloc[:, j]))

# TODO: not the most efficient (perf-wise)
# though not bad code-wise
from pandas import Panel, MultiIndex, concat

with warnings.catch_warnings(record=True):
p = Panel.from_dict(results).swapaxes('items', 'major')
if len(p.major_axis) > 0:
p.major_axis = arg1.columns[p.major_axis]
if len(p.minor_axis) > 0:
p.minor_axis = arg2.columns[p.minor_axis]

if len(p.items):
from pandas import MultiIndex, concat

result_index = arg1.index.union(arg2.index)
if len(result_index):

# construct result frame
result = concat(
[p.iloc[i].T for i in range(len(p.items))],
keys=p.items)
[concat([results[i][j]
for j, c in enumerate(arg2.columns)],
ignore_index=True)
for i, c in enumerate(arg1.columns)],
ignore_index=True,
axis=1)
result.columns = arg1.columns

# set the index and reorder
if arg2.columns.nlevels > 1:
result.index = MultiIndex.from_product(
arg2.columns.levels + [result_index])
result = result.reorder_levels([2, 0, 1]).sort_index()
else:
result.index = MultiIndex.from_product(
[range(len(arg2.columns)),
range(len(result_index))])
result = result.swaplevel(1, 0).sort_index()
result.index = MultiIndex.from_product(
[result_index] + [arg2.columns])
else:

# empty result
result = DataFrame(
index=MultiIndex(levels=[arg1.index, arg1.columns],
index=MultiIndex(levels=[arg1.index, arg2.columns],
labels=[[], []]),
columns=arg2.columns,
dtype='float64')
Expand All @@ -1890,9 +1903,9 @@ def dataframe_from_int_dict(data, frame_template):
# reset our column names to arg2 names
# careful not to mutate the original names
result.columns = result.columns.set_names(
arg2.columns.names)
arg1.columns.names)
result.index = result.index.set_names(
arg1.index.names + arg1.columns.names)
result_index.names + arg2.columns.names)

return result

Expand Down
22 changes: 13 additions & 9 deletions pandas/tests/test_window.py
Expand Up @@ -14,6 +14,7 @@
import pandas.tseries.offsets as offsets
from pandas.core.base import SpecificationError
from pandas.errors import UnsupportedFunctionCall
from pandas.core.sorting import safe_sort
import pandas.util.testing as tm
import pandas.util._test_decorators as td
from pandas.compat import range, zip
Expand Down Expand Up @@ -1645,7 +1646,7 @@ def compare(self, result, expected):
result = result.dropna().values
expected = expected.dropna().values

tm.assert_numpy_array_equal(result, expected)
tm.assert_numpy_array_equal(result, expected, check_dtype=False)

@pytest.mark.parametrize('f', [lambda x: x.cov(), lambda x: x.corr()])
def test_no_flex(self, f):
Expand All @@ -1670,15 +1671,19 @@ def test_no_flex(self, f):
def test_pairwise_with_self(self, f):
    """Check pairwise ``f(df)`` for every frame in ``self.df1s``.

    For each frame the result must have the frame's index as the first
    level of its MultiIndex, the frame's unique column labels (in any
    order) as the second level, and the frame's columns as its columns.
    All results after the first are then compared against the first
    via ``self.compare``.
    """
    # DataFrame with itself, pairwise=True
    # note that we may construct the 1st level of the MI
    # in a non-monotonic way, so compare accordingly
    results = []
    for df in self.df1s:
        result = f(df)
        tm.assert_index_equal(result.index.levels[0],
                              df.index,
                              check_names=False)
        # level order is not guaranteed, so compare the sorted labels
        tm.assert_numpy_array_equal(safe_sort(result.index.levels[1]),
                                    safe_sort(df.columns.unique()))
        tm.assert_index_equal(result.columns, df.columns)
        # Collect the computed result (not the input frame) so that the
        # cross-frame comparison below actually exercises ``f``.
        results.append(result)

    for result in results[1:]:
        self.compare(result, results[0])
Expand Down Expand Up @@ -1716,9 +1721,8 @@ def test_pairwise_with_other(self, f):
tm.assert_index_equal(result.index.levels[0],
df.index,
check_names=False)
tm.assert_index_equal(result.index.levels[1],
self.df2.columns,
check_names=False)
tm.assert_numpy_array_equal(safe_sort(result.index.levels[1]),
safe_sort(self.df2.columns.unique()))
for i, result in enumerate(results):
if i > 0:
self.compare(result, results[0])
Expand Down

0 comments on commit b5dd6a3

Please sign in to comment.