Skip to content

Commit

Permalink
ENH: add melt function, speed up DataFrame.apply
Browse files Browse the repository at this point in the history
  • Loading branch information
wesm committed Oct 31, 2011
1 parent e34c8a9 commit cf32be2
Show file tree
Hide file tree
Showing 5 changed files with 82 additions and 11 deletions.
8 changes: 8 additions & 0 deletions RELEASE.rst
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,14 @@ pandas 0.5.1

**Release date:** Not yet released

**New features / modules**

- Add `melt` function to `pandas.core.reshape`

**Improvements to existing features**

- Improved `DataFrame.apply` performance in most cases

**Bug fixes**

- Fix bug in `DataFrame.to_csv` when writing a DataFrame with an index
Expand Down
22 changes: 13 additions & 9 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
# pylint: disable=E1101,E1103
# pylint: disable=W0212,W0231,W0703,W0622

from itertools import izip
from StringIO import StringIO
import csv
import operator
Expand Down Expand Up @@ -1994,26 +1995,29 @@ def apply(self, func, axis=0, broadcast=False):

def _apply_standard(self, func, axis):
if axis == 0:
target = self
agg_index = self.columns
series_gen = ((c, self[c]) for c in self.columns)
res_index = self.columns
res_columns = self.index
elif axis == 1:
target = self.T
agg_index = self.index
res_index = self.index
res_columns = self.columns
series_gen = ((i, Series(v, self.columns))
for i, v in izip(self.index, self.values))

results = {}
for k in target.columns:
results[k] = func(target[k])
for k, v in series_gen:
results[k] = func(v)

if hasattr(results.values()[0], '__iter__'):
result = self._constructor(data=results, index=target.index,
columns=target.columns)
result = self._constructor(data=results, index=res_columns,
columns=res_index)

if axis == 1:
result = result.T

return result
else:
return Series(results, index=agg_index)
return Series(results, index=res_index)

def _apply_broadcast(self, func, axis):
if axis == 0:
Expand Down
49 changes: 49 additions & 0 deletions pandas/core/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -391,3 +391,52 @@ def _stack_multi_columns(frame, level=-1, dropna=True):

return result


def melt(frame, id_vars=None, value_vars=None):
    """
    "Unpivots" a DataFrame from wide format to long format, optionally leaving
    id variables set

    Parameters
    ----------
    frame : DataFrame
    id_vars : sequence of column labels, optional
        Columns to keep as identifiers. Each one is tiled once per
        remaining (unpivoted) column.
    value_vars : optional
        Accepted but not used by this implementation: every column that is
        not listed in `id_vars` is unpivoted.

    Returns
    -------
    DataFrame
        Columns are `id_vars` (in the order given) followed by 'variable'
        (the original column label) and 'value'. `frame` itself is not
        modified.

    Examples
    --------
    >>> df
    A  B  C
    a  1  2
    b  3  4
    c  5  6

    >>> melt(df, ['A'])
    A  variable  value
    a  B         1
    b  B         3
    c  B         5
    a  C         2
    b  C         4
    c  C         6
    """
    # TODO: what about the existing index?

    N, K = frame.shape

    mdata = {}

    if id_vars is not None:
        idvars = list(id_vars)
        # Pop from a copy so the caller's frame keeps its id columns.
        frame = frame.copy()
        # K becomes the number of columns left to unpivot.
        K -= len(idvars)
        for col in idvars:
            mdata[col] = np.tile(frame.pop(col).values, K)
    else:
        idvars = []

    mcolumns = idvars + ['variable', 'value']

    # Column-major ravel lays the remaining columns end to end, which lines
    # up with repeating each column label N times.
    mdata['value'] = frame.values.ravel('F')
    mdata['variable'] = np.asarray(frame.columns).repeat(N)
    return DataFrame(mdata, columns=mcolumns)
4 changes: 2 additions & 2 deletions pandas/core/sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -1162,7 +1162,7 @@ def transpose(self):
T = property(transpose)

def count(self, axis=0, **kwds):
    # No docstring here: ``__doc__`` is assigned from DataFrame.count below.
    # Extra keyword arguments are accepted and ignored.
    # The lambda calls ``count`` on whatever object ``apply`` hands over,
    # rather than going through an unbound SparseSeries method.
    return self.apply(lambda x: x.count(), axis=axis)
count.__doc__ = DataFrame.count.__doc__

def cumsum(self, axis=0):
Expand All @@ -1178,7 +1178,7 @@ def cumsum(self, axis=0):
-------
y : SparseDataFrame
"""
return self.apply(SparseSeries.cumsum, axis=axis)
return self.apply(lambda x: x.cumsum(), axis=axis)

def shift(self, periods, offset=None, timeRule=None):
"""
Expand Down
10 changes: 10 additions & 0 deletions pandas/rpy/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,9 @@ def _convert_array(obj):
def _convert_vector(obj):
    """
    Convert an R vector, delegating to a type-specific converter.

    ``robj.IntVector`` instances go to ``_convert_int_vector`` and
    ``robj.StrVector`` instances to ``_convert_str_vector``; anything else
    is returned as a plain ``list`` of its elements.
    """
    # Checked in order; the first matching vector type wins.
    dispatch = (
        (robj.IntVector, _convert_int_vector),
        (robj.StrVector, _convert_str_vector),
    )
    for vector_type, converter in dispatch:
        if isinstance(obj, vector_type):
            return converter(obj)

    return list(obj)

NA_INTEGER = -2147483648
Expand All @@ -86,6 +89,13 @@ def _convert_int_vector(obj):
arr[mask] = np.nan
return arr

def _convert_str_vector(obj):
    """
    Convert ``obj`` to an object-dtype ndarray, replacing every element that
    compares equal to ``robj.NA_Character`` with ``np.nan``.
    """
    strings = np.asarray(obj, dtype=object)
    is_na = strings == robj.NA_Character
    if not is_na.any():
        # Nothing to replace; hand the array back untouched.
        return strings
    strings[is_na] = np.nan
    return strings

def _convert_DataFrame(rdf):
columns = list(rdf.colnames)
rows = np.array(rdf.rownames)
Expand Down

0 comments on commit cf32be2

Please sign in to comment.