diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py
index ab235e085986c..6ebf414de52f3 100644
--- a/asv_bench/benchmarks/reshape.py
+++ b/asv_bench/benchmarks/reshape.py
@@ -1,5 +1,5 @@
 from .pandas_vb_common import *
-from pandas.core.reshape import melt
+from pandas.core.reshape import melt, wide_to_long


 class melt_dataframe(object):
@@ -74,3 +74,28 @@ def setup(self):

     def time_unstack_sparse_keyspace(self):
         self.idf.unstack()
+
+
+class wide_to_long_big(object):
+    goal_time = 0.2
+
+    def setup(self):
+        vars = 'ABCD'
+        nyrs = 20
+        nidvars = 20
+        N = 5000
+        yrvars = []
+        for var in vars:
+            for yr in range(1, nyrs + 1):
+                yrvars.append(var + str(yr))
+
+        yearobs = dict(zip(yrvars, np.random.randn(len(yrvars), N)))
+        idobs = dict(zip(range(nidvars), np.random.rand(nidvars, N)))
+
+        self.df = pd.concat([pd.DataFrame(idobs), pd.DataFrame(yearobs)],
+                            axis=1)
+        self.vars = vars
+
+    def time_wide_to_long_big(self):
+        self.df['id'] = self.df.index
+        wide_to_long(self.df, list(self.vars), i='id', j='year')
diff --git a/doc/source/api.rst b/doc/source/api.rst
index a510f663d19ee..3766093cabed0 100644
--- a/doc/source/api.rst
+++ b/doc/source/api.rst
@@ -156,6 +156,7 @@ Data manipulations
    concat
    get_dummies
    factorize
+   wide_to_long

 Top-level missing data
 ~~~~~~~~~~~~~~~~~~~~~~
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index ff086380fdb05..5d99308abfb63 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -88,6 +88,7 @@ Removal of prior version deprecations/changes
 Performance Improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~

+- Improved performance of ``pd.wide_to_long()`` (:issue:`14779`)
diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py
index 055a0041b181a..e231aabfce8ae 100644
--- a/pandas/core/reshape.py
+++ b/pandas/core/reshape.py
@@ -3,6 +3,7 @@
 from pandas.compat import range, zip
 from pandas import compat
 import itertools
+import re

 import numpy as np

@@ -875,29 +876,45 @@ def lreshape(data, groups, dropna=True, label=None):
     return DataFrame(mdata, columns=id_cols + pivot_cols)


-def wide_to_long(df, stubnames, i, j):
+def wide_to_long(df, stubnames, i, j, sep="", numeric_suffix=True):
     """
     Wide panel to long format. Less flexible but more user-friendly than melt.

+    With stubnames ['A', 'B'], this function expects to find one or more
+    groups of columns with format Asuffix1, Asuffix2,..., Bsuffix1, Bsuffix2,...
+    You specify what you want to call this suffix in the resulting long format
+    with `j` (for example `j`='year')
+
+    Each row of these wide variables is assumed to be uniquely identified by
+    `i` (can be a single column name or a list of column names)
+
+    All remaining variables in the data frame are left intact.
+
     Parameters
     ----------
     df : DataFrame
         The wide-format DataFrame
-    stubnames : list
-        A list of stub names. The wide format variables are assumed to
+    stubnames : list or string
+        The stub name(s). The wide format variables are assumed to
         start with the stub names.
-    i : str
-        The name of the id variable.
+    i : list or string
+        Column(s) to use as id variable(s)
     j : str
-        The name of the subobservation variable.
-    stubend : str
-        Regex to match for the end of the stubs.
+        The name of the subobservation variable. What you wish to name your
+        suffix in the long format.
+    sep : str, default ""
+        A character indicating the separation of the variable names
+        in the wide format, to be stripped from the names in the long format.
+        For example, if your column names are A-suffix1, A-suffix2, you
+        can strip the hyphen by specifying `sep`='-'
+    numeric_suffix : bool, default True
+        Whether the stub suffix is assumed to be numeric or not.

     Returns
     -------
     DataFrame
-        A DataFrame that contains each stub name as a variable as well as
-        variables for i and j.
+        A DataFrame that contains each stub name as a variable, with new index
+        (i, j)

     Examples
     --------
@@ -916,7 +933,7 @@ def wide_to_long(df, stubnames, i, j):
     0    a  d  2.5  3.2 -1.085631   0
     1    b  e  1.2  1.3  0.997345   1
     2    c  f  0.7  0.1  0.282978   2
-    >>> wide_to_long(df, ["A", "B"], i="id", j="year")
+    >>> pd.wide_to_long(df, ["A", "B"], i="id", j="year")
                     X  A    B
     id year
     0  1970 -1.085631  a  2.5
@@ -926,9 +943,105 @@ def wide_to_long(df, stubnames, i, j):
     1  1980  0.997345  e  1.3
     2  1980  0.282978  f  0.1

+    With multiple id columns
+
+    >>> df = pd.DataFrame({
+    ...     'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
+    ...     'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
+    ...     'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
+    ...     'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
+    ... })
+    >>> df
+       birth  famid  ht1  ht2
+    0      1      1  2.8  3.4
+    1      2      1  2.9  3.8
+    2      3      1  2.2  2.9
+    3      1      2  2.0  3.2
+    4      2      2  1.8  2.8
+    5      3      2  1.9  2.4
+    6      1      3  2.2  3.3
+    7      2      3  2.3  3.4
+    8      3      3  2.1  2.9
+    >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age')
+    >>> l
+                      ht
+    famid birth age
+    1     1     1    2.8
+                2    3.4
+          2     1    2.9
+                2    3.8
+          3     1    2.2
+                2    2.9
+    2     1     1    2.0
+                2    3.2
+          2     1    1.8
+                2    2.8
+          3     1    1.9
+                2    2.4
+    3     1     1    2.2
+                2    3.3
+          2     1    2.3
+                2    3.4
+          3     1    2.1
+                2    2.9
+
+    Going from long back to wide just takes some creative use of `unstack`
+
+    >>> w = l.reset_index().set_index(['famid', 'birth', 'age']).unstack()
+    >>> w.columns = [name + suffix for name, suffix in w.columns.tolist()]
+    >>> w.reset_index()
+       famid  birth  ht1  ht2
+    0      1      1  2.8  3.4
+    1      1      2  2.9  3.8
+    2      1      3  2.2  2.9
+    3      2      1  2.0  3.2
+    4      2      2  1.8  2.8
+    5      2      3  1.9  2.4
+    6      3      1  2.2  3.3
+    7      3      2  2.3  3.4
+    8      3      3  2.1  2.9
+
+    Less wieldy column names are also handled
+
+    >>> df = pd.DataFrame({'A(quarterly)-2010': np.random.rand(3),
+    ...                    'A(quarterly)-2011': np.random.rand(3),
+    ...                    'B(quarterly)-2010': np.random.rand(3),
+    ...                    'B(quarterly)-2011': np.random.rand(3),
+    ...                    'X' : np.random.randint(3, size=3)})
+    >>> df['id'] = df.index
+    >>> df
+       A(quarterly)-2010  A(quarterly)-2011  B(quarterly)-2010  B(quarterly)-2011
+    0           0.531828           0.724455           0.322959           0.293714
+    1           0.634401           0.611024           0.361789           0.630976
+    2           0.849432           0.722443           0.228263           0.092105
+    \
+       X  id
+    0  0   0
+    1  1   1
+    2  2   2
+    >>> pd.wide_to_long(df, ['A(quarterly)', 'B(quarterly)'],
+    ...                 i='id', j='year', sep='-')
+             X  A(quarterly)  B(quarterly)
+    id year
+    0  2010  0      0.531828      0.322959
+    1  2010  2      0.634401      0.361789
+    2  2010  2      0.849432      0.228263
+    0  2011  0      0.724455      0.293714
+    1  2011  2      0.611024      0.630976
+    2  2011  2      0.722443      0.092105
+
+    If we have many columns, we could also use a regex to find our
+    stubnames and pass that list on to wide_to_long
+
+    >>> stubnames = set([match[0] for match in
+    ...                  df.columns.str.findall('[A-B]\(.*\)').values
+    ...                  if match != [] ])
+    >>> list(stubnames)
+    ['B(quarterly)', 'A(quarterly)']
+
     Notes
     -----
-    All extra variables are treated as extra id variables. This simply uses
+    All extra variables are left untouched. This simply uses
     `pandas.melt` under the hood, but is hard-coded to "do the right thing"
     in a typicaly case.
""" @@ -936,28 +1049,60 @@ def wide_to_long(df, stubnames, i, j): def get_var_names(df, regex): return df.filter(regex=regex).columns.tolist() - def melt_stub(df, stub, i, j): - varnames = get_var_names(df, "^" + stub) - newdf = melt(df, id_vars=i, value_vars=varnames, value_name=stub, - var_name=j) - newdf_j = newdf[j].str.replace(stub, "") - try: - newdf_j = newdf_j.astype(int) - except ValueError: - pass - newdf[j] = newdf_j - return newdf - - id_vars = get_var_names(df, "^(?!%s)" % "|".join(stubnames)) - if i not in id_vars: - id_vars += [i] - - newdf = melt_stub(df, stubnames[0], id_vars, j) - - for stub in stubnames[1:]: - new = melt_stub(df, stub, id_vars, j) - newdf = newdf.merge(new, how="outer", on=id_vars + [j], copy=False) - return newdf.set_index([i, j]) + def melt_stub(df, stub, i, j, value_vars, sep): + newdf = melt(df, id_vars=i, value_vars=value_vars, + value_name=stub.rstrip(sep), var_name=j) + newdf[j] = Categorical(newdf[j]) + newdf[j] = newdf[j].str.replace(re.escape(stub), "") + + return newdf.set_index(i + [j]) + + if any(map(lambda s: s in df.columns.tolist(), stubnames)): + raise ValueError("stubname can't be identical to a column name") + + if not isinstance(stubnames, list): + stubnames = [stubnames] + + if not isinstance(i, list): + i = [i] + + stubs = list(map(lambda x: x + sep, stubnames)) + + # This regex is needed to avoid multiple "greedy" matches with stubs + # that have overlapping substrings + # For example A2011, A2012 are separate from AA2011, AA2012 + # And BBone, BBtwo is different from Bone, Btwo, and BBBrating + value_vars = list(map(lambda x: get_var_names( + df, "^{0}(?!{1})".format(re.escape(x), re.escape(x[-1]))), stubs)) + + value_vars_flattened = [e for sublist in value_vars for e in sublist] + id_vars = list(set(df.columns.tolist()).difference(value_vars_flattened)) + + # If we know the stub end type is a number we can disambiguate potential + # misclassified value_vars, for ex, with stubname A: A2011, A2012 and + # Arating would all be found as value_vars. If the suffix is numeric we + # know the last one should be an id_var. 
+    # (Note the converse disambiguation is not possible.)
+    if numeric_suffix:
+        for s, v in zip(stubs, value_vars):
+            for vname in v[:]:
+                end = vname.replace(s, "")
+                if not end.isdigit():
+                    v.remove(vname)
+                    id_vars.append(vname)
+
+    melted = []
+    for s, v in zip(stubs, value_vars):
+        melted.append(melt_stub(df, s, i, j, v, sep))
+    melted = melted[0].join(melted[1:], how='outer')
+
+    if len(i) == 1:
+        new = df[id_vars].set_index(i).join(melted)
+        return new
+
+    new = df[id_vars].merge(melted.reset_index(), on=i).set_index(i + [j])
+
+    return new


 def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
diff --git a/pandas/tests/test_reshape.py b/pandas/tests/test_reshape.py
index 80d1f5f76e5a9..a7d7dbb2e0a24 100644
--- a/pandas/tests/test_reshape.py
+++ b/pandas/tests/test_reshape.py
@@ -698,7 +698,7 @@ def test_simple(self):
         exp_data = {"X": x.tolist() + x.tolist(),
                     "A": ['a', 'b', 'c', 'd', 'e', 'f'],
                     "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
-                    "year": [1970, 1970, 1970, 1980, 1980, 1980],
+                    "year": ['1970', '1970', '1970', '1980', '1980', '1980'],
                     "id": [0, 1, 2, 0, 1, 2]}
         exp_frame = DataFrame(exp_data)
         exp_frame = exp_frame.set_index(['id', 'year'])[["X", "A", "B"]]
@@ -716,6 +716,204 @@ def test_stubs(self):

         self.assertEqual(stubs, ['inc', 'edu'])

+    def test_separating_character(self):
+        np.random.seed(123)
+        x = np.random.randn(3)
+        df = pd.DataFrame({"A.1970": {0: "a",
+                                      1: "b",
+                                      2: "c"},
+                           "A.1980": {0: "d",
+                                      1: "e",
+                                      2: "f"},
+                           "B.1970": {0: 2.5,
+                                      1: 1.2,
+                                      2: .7},
+                           "B.1980": {0: 3.2,
+                                      1: 1.3,
+                                      2: .1},
+                           "X": dict(zip(
+                               range(3), x))})
+        df["id"] = df.index
+        exp_data = {"X": x.tolist() + x.tolist(),
+                    "A": ['a', 'b', 'c', 'd', 'e', 'f'],
+                    "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
+                    "year": ['1970', '1970', '1970', '1980', '1980', '1980'],
+                    "id": [0, 1, 2, 0, 1, 2]}
+        exp_frame = DataFrame(exp_data)
+        exp_frame = exp_frame.set_index(['id', 'year'])[["X", "A", "B"]]
+        long_frame = wide_to_long(df, ["A", "B"], i="id", j="year", sep=".")
+        tm.assert_frame_equal(long_frame, exp_frame)
+
+    def test_escapable_characters(self):
+        np.random.seed(123)
+        x = np.random.randn(3)
+        df = pd.DataFrame({"A(quarterly)1970": {0: "a",
+                                                1: "b",
+                                                2: "c"},
+                           "A(quarterly)1980": {0: "d",
+                                                1: "e",
+                                                2: "f"},
+                           "B(quarterly)1970": {0: 2.5,
+                                                1: 1.2,
+                                                2: .7},
+                           "B(quarterly)1980": {0: 3.2,
+                                                1: 1.3,
+                                                2: .1},
+                           "X": dict(zip(
+                               range(3), x))})
+        df["id"] = df.index
+        exp_data = {"X": x.tolist() + x.tolist(),
+                    "A(quarterly)": ['a', 'b', 'c', 'd', 'e', 'f'],
+                    "B(quarterly)": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
+                    "year": ['1970', '1970', '1970', '1980', '1980', '1980'],
+                    "id": [0, 1, 2, 0, 1, 2]}
+        exp_frame = DataFrame(exp_data)
+        exp_frame = exp_frame.set_index(
+            ['id', 'year'])[["X", "A(quarterly)", "B(quarterly)"]]
+        long_frame = wide_to_long(df, ["A(quarterly)", "B(quarterly)"],
+                                  i="id", j="year")
+        tm.assert_frame_equal(long_frame, exp_frame)
+
+    def test_unbalanced(self):
+        # test that we can have a varying number of time variables
+        df = pd.DataFrame({'A2010': [1.0, 2.0],
+                           'A2011': [3.0, 4.0],
+                           'B2010': [5.0, 6.0],
+                           'X': ['X1', 'X2']})
+        df['id'] = df.index
+        exp_data = {'X': ['X1', 'X1', 'X2', 'X2'],
+                    'A': [1.0, 3.0, 2.0, 4.0],
+                    'B': [5.0, np.nan, 6.0, np.nan],
+                    'id': [0, 0, 1, 1],
+                    'year': ['2010', '2011', '2010', '2011']}
+        exp_frame = pd.DataFrame(exp_data)
+        exp_frame = exp_frame.set_index(['id', 'year'])[["X", "A", "B"]]
+        long_frame = wide_to_long(df, ['A', 'B'], i='id', j='year')
+        tm.assert_frame_equal(long_frame, exp_frame)
+
+    def test_character_overlap(self):
+        # Test that
+        # we handle overlapping characters in both id_vars and value_vars
+        df = pd.DataFrame({
+            'A11': ['a11', 'a22', 'a33'],
+            'A12': ['a21', 'a22', 'a23'],
+            'B11': ['b11', 'b12', 'b13'],
+            'B12': ['b21', 'b22', 'b23'],
+            'BB11': [1, 2, 3],
+            'BB12': [4, 5, 6],
+            'BBBX': [91, 92, 93],
+            'BBBZ': [91, 92, 93]
+        })
+        df['id'] = df.index
+        exp_frame = pd.DataFrame({
+            'BBBX': [91, 92, 93, 91, 92, 93],
+            'BBBZ': [91, 92, 93, 91, 92, 93],
+            'A': ['a11', 'a22', 'a33', 'a21', 'a22', 'a23'],
+            'B': ['b11', 'b12', 'b13', 'b21', 'b22', 'b23'],
+            'BB': [1, 2, 3, 4, 5, 6],
+            'id': [0, 1, 2, 0, 1, 2],
+            'year': ['11', '11', '11', '12', '12', '12']})
+        exp_frame = exp_frame.set_index(['id', 'year'])[
+            ['BBBX', 'BBBZ', 'A', 'B', 'BB']]
+        long_frame = wide_to_long(df, ['A', 'B', 'BB'], i='id', j='year')
+        tm.assert_frame_equal(long_frame.sort_index(axis=1),
+                              exp_frame.sort_index(axis=1))
+
+    def test_invalid_separator(self):
+        # if an invalid separator is supplied, an empty data frame is returned
+        sep = 'nope!'
+        df = pd.DataFrame({'A2010': [1.0, 2.0],
+                           'A2011': [3.0, 4.0],
+                           'B2010': [5.0, 6.0],
+                           'X': ['X1', 'X2']})
+        df['id'] = df.index
+        exp_data = {'X': '',
+                    'A2010': [],
+                    'A2011': [],
+                    'B2010': [],
+                    'id': [],
+                    'year': [],
+                    'A': [],
+                    'B': []}
+        exp_frame = pd.DataFrame(exp_data)
+        exp_frame = exp_frame.set_index(['id', 'year'])[[
+            'X', 'A2010', 'A2011', 'B2010', 'A', 'B']]
+        exp_frame.index.set_levels([[0, 1], []], inplace=True)
+        long_frame = wide_to_long(df, ['A', 'B'], i='id', j='year', sep=sep)
+        tm.assert_frame_equal(long_frame.sort_index(axis=1),
+                              exp_frame.sort_index(axis=1))
+
+    def test_num_string_disambiguation(self):
+        # Test that we can disambiguate number value_vars from
+        # string value_vars
+        df = pd.DataFrame({
+            'A11': ['a11', 'a22', 'a33'],
+            'A12': ['a21', 'a22', 'a23'],
+            'B11': ['b11', 'b12', 'b13'],
+            'B12': ['b21', 'b22', 'b23'],
+            'BB11': [1, 2, 3],
+            'BB12': [4, 5, 6],
+            'Arating': [91, 92, 93],
+            'Arating_old': [91, 92, 93]
+        })
+        df['id'] = df.index
+        exp_frame = pd.DataFrame({
+            'Arating': [91, 92, 93, 91, 92, 93],
+            'Arating_old': [91, 92, 93, 91, 92, 93],
+            'A': ['a11', 'a22', 'a33', 'a21', 'a22', 'a23'],
+            'B': ['b11', 'b12', 'b13', 'b21', 'b22', 'b23'],
+            'BB': [1, 2, 3, 4, 5, 6],
+            'id': [0, 1, 2, 0, 1, 2],
+            'year': ['11', '11', '11', '12', '12', '12']})
+        exp_frame = exp_frame.set_index(['id', 'year'])[
+            ['Arating', 'Arating_old', 'A', 'B', 'BB']]
+        long_frame = wide_to_long(df, ['A', 'B', 'BB'], i='id', j='year')
+        tm.assert_frame_equal(long_frame.sort_index(axis=1),
+                              exp_frame.sort_index(axis=1))
+
+    def test_invalid_suffixtype(self):
+        # If all stub names end with a string, but a numeric suffix is
+        # assumed, an empty data frame is returned
+        df = pd.DataFrame({'Aone': [1.0, 2.0],
+                           'Atwo': [3.0, 4.0],
+                           'Bone': [5.0, 6.0],
+                           'X': ['X1', 'X2']})
+        df['id'] = df.index
+        exp_data = {'X': '',
+                    'Aone': [],
+                    'Atwo': [],
+                    'Bone': [],
+                    'id': [],
+                    'year': [],
+                    'A': [],
+                    'B': []}
+        exp_frame = pd.DataFrame(exp_data)
+        exp_frame = exp_frame.set_index(['id', 'year'])[[
+            'X', 'Aone', 'Atwo', 'Bone', 'A', 'B']]
+        exp_frame.index.set_levels([[0, 1], []], inplace=True)
+        long_frame = wide_to_long(df, ['A', 'B'], i='id', j='year')
+        tm.assert_frame_equal(long_frame.sort_index(axis=1),
+                              exp_frame.sort_index(axis=1))
+
+    def test_multiple_id_columns(self):
+        # Taken from http://www.ats.ucla.edu/stat/stata/modules/reshapel.htm
+        df = pd.DataFrame({
+            'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
+            'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
+            'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9,
+                    2.2, 2.3, 2.1],
+            'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
+        })
+        exp_frame = pd.DataFrame({
+            'ht': [2.8, 3.4, 2.9, 3.8, 2.2, 2.9, 2.0, 3.2, 1.8,
+                   2.8, 1.9, 2.4, 2.2, 3.3, 2.3, 3.4, 2.1, 2.9],
+            'famid': [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3],
+            'birth': [1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3],
+            'age': ['1', '2', '1', '2', '1', '2', '1', '2', '1',
+                    '2', '1', '2', '1', '2', '1', '2', '1', '2']
+        })
+        exp_frame = exp_frame.set_index(['famid', 'birth', 'age'])[['ht']]
+        long_frame = wide_to_long(df, 'ht', i=['famid', 'birth'], j='age')
+        tm.assert_frame_equal(long_frame, exp_frame)
+

 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
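For reference, a minimal usage sketch of the keyword arguments this patch adds (`sep` and `numeric_suffix`); the small frame below is invented for illustration and mirrors the docstring example, so the exact numbers will differ from the documented output:

    import numpy as np
    import pandas as pd

    np.random.seed(0)
    df = pd.DataFrame({'A(quarterly)-2010': np.random.rand(3),
                       'A(quarterly)-2011': np.random.rand(3),
                       'B(quarterly)-2010': np.random.rand(3),
                       'B(quarterly)-2011': np.random.rand(3),
                       'X': np.random.randint(3, size=3)})
    df['id'] = df.index

    # sep='-' strips the hyphen between the stub and the suffix; with
    # numeric_suffix=True (the default) a hypothetical column such as
    # 'A(quarterly)-notes' would be kept as an id column because its
    # suffix is not all digits.
    long_df = pd.wide_to_long(df, ['A(quarterly)', 'B(quarterly)'],
                              i='id', j='year', sep='-')
    print(long_df)

The result is indexed by ('id', 'year') with 'X' carried along untouched, which is the behaviour the new tests in pandas/tests/test_reshape.py exercise.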