ENH/DOC: wide_to_long performance and functionality improvements (#14779)

Speed up by avoiding big copies, and regex on categorical column

Add functionality to deal with "pathological" input

Add docstring examples and more test cases
erikcs committed Dec 10, 2016
1 parent 06f26b5 commit 5747a25
Showing 5 changed files with 406 additions and 36 deletions.
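For orientation before the per-file diffs, here is a minimal sketch of the two keyword arguments this commit adds to pd.wide_to_long (sep and numeric_suffix); the frame and column names below are illustrative and not taken from the diff:

import pandas as pd

# Hypothetical wide frame: stub 'ht' with a hyphen separator and numeric suffixes.
df = pd.DataFrame({'famid': [1, 2, 3],
                   'ht-1': [2.8, 2.0, 2.2],
                   'ht-2': [3.4, 3.2, 3.3]})

# sep='-' strips the hyphen from the long-format suffix; numeric_suffix=True
# (the new default) keeps columns whose suffix is not numeric as id variables.
long_df = pd.wide_to_long(df, stubnames='ht', i='famid', j='age', sep='-')

With this change, stubnames and i may also be passed as plain strings rather than lists, as in the call above.
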
27 changes: 26 additions & 1 deletion asv_bench/benchmarks/reshape.py
@@ -1,5 +1,5 @@
from .pandas_vb_common import *
from pandas.core.reshape import melt
from pandas.core.reshape import melt, wide_to_long


class melt_dataframe(object):
@@ -74,3 +74,28 @@ def setup(self):

    def time_unstack_sparse_keyspace(self):
        self.idf.unstack()


class wide_to_long_big(object):
    goal_time = 0.2

    def setup(self):
        vars = 'ABCD'
        nyrs = 20
        nidvars = 20
        N = 5000
        yrvars = []
        for var in vars:
            for yr in range(1, nyrs + 1):
                yrvars.append(var + str(yr))

        yearobs = dict(zip(yrvars, np.random.randn(len(yrvars), N)))
        idobs = dict(zip(range(nidvars), np.random.rand(nidvars, N)))

        self.df = pd.concat([pd.DataFrame(idobs), pd.DataFrame(yearobs)],
                            axis=1)
        self.vars = vars

    def time_wide_to_long_big(self):
        self.df['id'] = self.df.index
        wide_to_long(self.df, list(self.vars), i='id', j='year')
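For a rough standalone check of the same scenario, the setup above can be timed with timeit instead of asv (the sizes mirror the benchmark; this snippet is not part of the diff):

import timeit

import numpy as np
import pandas as pd

nyrs, nidvars, N = 20, 20, 5000
yrvars = [var + str(yr) for var in 'ABCD' for yr in range(1, nyrs + 1)]
idobs = dict(zip(range(nidvars), np.random.rand(nidvars, N)))
yearobs = dict(zip(yrvars, np.random.randn(len(yrvars), N)))

df = pd.concat([pd.DataFrame(idobs), pd.DataFrame(yearobs)], axis=1)
df['id'] = df.index

# Time a few runs of the same call the benchmark makes.
print(timeit.timeit(
    lambda: pd.wide_to_long(df, list('ABCD'), i='id', j='year'), number=3))
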
1 change: 1 addition & 0 deletions doc/source/api.rst
@@ -156,6 +156,7 @@ Data manipulations
concat
get_dummies
factorize
wide_to_long

Top-level missing data
~~~~~~~~~~~~~~~~~~~~~~
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.20.0.txt
@@ -88,6 +88,7 @@ Removal of prior version deprecations/changes
Performance Improvements
~~~~~~~~~~~~~~~~~~~~~~~~

- Improved performance of ``pd.wide_to_long()`` (:issue:`14779`)



213 changes: 179 additions & 34 deletions pandas/core/reshape.py
@@ -3,6 +3,7 @@
from pandas.compat import range, zip
from pandas import compat
import itertools
import re

import numpy as np

@@ -875,29 +876,45 @@ def lreshape(data, groups, dropna=True, label=None):
return DataFrame(mdata, columns=id_cols + pivot_cols)


def wide_to_long(df, stubnames, i, j):
def wide_to_long(df, stubnames, i, j, sep="", numeric_suffix=True):
"""
Wide panel to long format. Less flexible but more user-friendly than melt.
With stubnames ['A', 'B'], this function expects to find one or more
groups of columns with format Asuffix1, Asuffix2,..., Bsuffix1, Bsuffix2,...
You specify what you want to call this suffix in the resulting long format
with `j` (for example `j`='year').
Each row of these wide variables is assumed to be uniquely identified by
`i` (which can be a single column name or a list of column names).
All remaining variables in the data frame are left intact.
Parameters
----------
df : DataFrame
The wide-format DataFrame
stubnames : list
A list of stub names. The wide format variables are assumed to
stubnames : list or string
The stub name(s). The wide format variables are assumed to
start with the stub names.
i : str
The name of the id variable.
i : list or string
Column(s) to use as id variable(s)
j : str
The name of the subobservation variable.
stubend : str
Regex to match for the end of the stubs.
The name of the subobservation variable. What you wish to name your
suffix in the long format.
sep : str, default ""
A character indicating the separation of the variable names
in the wide format, to be stripped from the names in the long format.
For example, if your column names are A-suffix1, A-suffix2, you
can strip the hyphen by specifying `sep`='-'
numeric_suffix : bool, default True
Whether the stub suffix is assumed to be numeric or not.
Returns
-------
DataFrame
A DataFrame that contains each stub name as a variable as well as
variables for i and j.
A DataFrame that contains each stub name as a variable, with a new
index (i, j).
Examples
--------
@@ -916,7 +933,7 @@ def wide_to_long(df, stubnames, i, j):
0 a d 2.5 3.2 -1.085631 0
1 b e 1.2 1.3 0.997345 1
2 c f 0.7 0.1 0.282978 2
>>> wide_to_long(df, ["A", "B"], i="id", j="year")
>>> pd.wide_to_long(df, ["A", "B"], i="id", j="year")
X A B
id year
0 1970 -1.085631 a 2.5
@@ -926,38 +943,166 @@ def wide_to_long(df, stubnames, i, j):
1 1980 0.997345 e 1.3
2 1980 0.282978 f 0.1
With multiple id columns
>>> df = pd.DataFrame({
... 'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
... 'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
... 'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
... 'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
... })
>>> df
birth famid ht1 ht2
0 1 1 2.8 3.4
1 2 1 2.9 3.8
2 3 1 2.2 2.9
3 1 2 2.0 3.2
4 2 2 1.8 2.8
5 3 2 1.9 2.4
6 1 3 2.2 3.3
7 2 3 2.3 3.4
8 3 3 2.1 2.9
>>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age')
>>> l
ht
famid birth age
1 1 1 2.8
2 3.4
2 1 2.9
2 3.8
3 1 2.2
2 2.9
2 1 1 2.0
2 3.2
2 1 1.8
2 2.8
3 1 1.9
2 2.4
3 1 1 2.2
2 3.3
2 1 2.3
2 3.4
3 1 2.1
2 2.9
Going from long back to wide just takes some creative use of `unstack`
>>> w = l.reset_index().set_index(['famid', 'birth', 'age']).unstack()
>>> w.columns = [name + suffix for name, suffix in w.columns.tolist()]
>>> w.reset_index()
famid birth ht1 ht2
0 1 1 2.8 3.4
1 1 2 2.9 3.8
2 1 3 2.2 2.9
3 2 1 2.0 3.2
4 2 2 1.8 2.8
5 2 3 1.9 2.4
6 3 1 2.2 3.3
7 3 2 2.3 3.4
8 3 3 2.1 2.9
Less wieldy column names are also handled
>>> df = pd.DataFrame({'A(quarterly)-2010': np.random.rand(3),
... 'A(quarterly)-2011': np.random.rand(3),
... 'B(quarterly)-2010': np.random.rand(3),
... 'B(quarterly)-2011': np.random.rand(3),
... 'X' : np.random.randint(3, size=3)})
>>> df['id'] = df.index
>>> df
A(quarterly)-2010 A(quarterly)-2011 B(quarterly)-2010 B(quarterly)-2011
0 0.531828 0.724455 0.322959 0.293714
1 0.634401 0.611024 0.361789 0.630976
2 0.849432 0.722443 0.228263 0.092105
\
X id
0 0 0
1 1 1
2 2 2
>>> pd.wide_to_long(df, ['A(quarterly)', 'B(quarterly)'],
...                 i='id', j='year', sep='-')
X A(quarterly) B(quarterly)
id year
0 2010 0 0.531828 0.322959
1 2010 1 0.634401 0.361789
2 2010 2 0.849432 0.228263
0 2011 0 0.724455 0.293714
1 2011 1 0.611024 0.630976
2 2011 2 0.722443 0.092105
If we have many columns, we could also use a regex to find our
stubnames and pass that list on to wide_to_long
>>> stubnames = set([match[0] for match in
...     df.columns.str.findall('[A-B]\(.*\)').values
...     if match != []])
>>> list(stubnames)
['B(quarterly)', 'A(quarterly)']
Notes
-----
All extra variables are treated as extra id variables. This simply uses
All extra variables are left untouched. This simply uses
`pandas.melt` under the hood, but is hard-coded to "do the right thing"
in a typical case.
"""

    def get_var_names(df, regex):
        return df.filter(regex=regex).columns.tolist()

    def melt_stub(df, stub, i, j):
        varnames = get_var_names(df, "^" + stub)
        newdf = melt(df, id_vars=i, value_vars=varnames, value_name=stub,
                     var_name=j)
        newdf_j = newdf[j].str.replace(stub, "")
        try:
            newdf_j = newdf_j.astype(int)
        except ValueError:
            pass
        newdf[j] = newdf_j
        return newdf

    id_vars = get_var_names(df, "^(?!%s)" % "|".join(stubnames))
    if i not in id_vars:
        id_vars += [i]

    newdf = melt_stub(df, stubnames[0], id_vars, j)

    for stub in stubnames[1:]:
        new = melt_stub(df, stub, id_vars, j)
        newdf = newdf.merge(new, how="outer", on=id_vars + [j], copy=False)
    return newdf.set_index([i, j])
    def melt_stub(df, stub, i, j, value_vars, sep):
        newdf = melt(df, id_vars=i, value_vars=value_vars,
                     value_name=stub.rstrip(sep), var_name=j)
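        # Converting to Categorical first means the regex replace below runs
        # once per distinct column name (the categories) rather than once per
        # row; this is the "regex on categorical column" speed-up described
        # in the commit message.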
        newdf[j] = Categorical(newdf[j])
        newdf[j] = newdf[j].str.replace(re.escape(stub), "")

        return newdf.set_index(i + [j])

    if any(map(lambda s: s in df.columns.tolist(), stubnames)):
        raise ValueError("stubname can't be identical to a column name")

    if not isinstance(stubnames, list):
        stubnames = [stubnames]

    if not isinstance(i, list):
        i = [i]

    stubs = list(map(lambda x: x + sep, stubnames))

    # This regex is needed to avoid multiple "greedy" matches with stubs
    # that have overlapping substrings
    # For example A2011, A2012 are separate from AA2011, AA2012
    # And BBone, BBtwo is different from Bone, Btwo, and BBBrating
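    # For instance, stub 'A' with sep="" yields the pattern "^A(?!A)", which
    # matches 'A2011' but not 'AA2011'.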
    value_vars = list(map(lambda x: get_var_names(
        df, "^{0}(?!{1})".format(re.escape(x), re.escape(x[-1]))), stubs))

    value_vars_flattened = [e for sublist in value_vars for e in sublist]
    id_vars = list(set(df.columns.tolist()).difference(value_vars_flattened))

    # If we know the stub end type is a number we can disambiguate potential
    # misclassified value_vars, for ex, with stubname A: A2011, A2012 and
    # Arating would all be found as value_vars. If the suffix is numeric we
    # know the last one should be an id_var. (Note the converse disambiguation
    # is not possible)
    if numeric_suffix:
        for s, v in zip(stubs, value_vars):
            for vname in v[:]:
                end = vname.replace(s, "")
                if not end.isdigit():
                    v.remove(vname)
                    id_vars.append(vname)

    melted = []
    for s, v in zip(stubs, value_vars):
        melted.append(melt_stub(df, s, i, j, v, sep))
    melted = melted[0].join(melted[1:], how='outer')

    if len(i) == 1:
        new = df[id_vars].set_index(i).join(melted)
        return new

    new = df[id_vars].merge(melted.reset_index(), on=i).set_index(i + [j])

    return new


def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
