ENH: add melt function, speed up DataFrame.apply

neurodebian · Oct 31, 2011 · cf32be2 · cf32be2
1 parent e34c8a9
commit cf32be2
Show file tree

Hide file tree

Showing 5 changed files with 82 additions and 11 deletions.
diff --git a/RELEASE.rst b/RELEASE.rst
@@ -27,6 +27,14 @@ pandas 0.5.1
 
 **Release date:** Not yet released
 
+**New features / modules**
+
+  - Add `melt` function to `pandas.core.reshape`
+
+**Improvements to existing features**
+
+  - Sped up `DataFrame.apply` performance in most cases
+
 **Bug fixes**
 
   - Fix bug in `DataFrame.to_csv` when writing a DataFrame with an index

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -12,6 +12,7 @@
 # pylint: disable=E1101,E1103
 # pylint: disable=W0212,W0231,W0703,W0622
 
+from itertools import izip
 from StringIO import StringIO
 import csv
 import operator
@@ -1994,26 +1995,29 @@ def apply(self, func, axis=0, broadcast=False):
 
     def _apply_standard(self, func, axis):
         if axis == 0:
-            target = self
-            agg_index = self.columns
+            series_gen = ((c, self[c]) for c in self.columns)  # lazy (label, column) pairs
+            res_index = self.columns  # labels that key the results dict
+            res_columns = self.index  # the other axis of the frame
         elif axis == 1:
-            target = self.T
-            agg_index = self.index
+            res_index = self.index  # labels that key the results dict
+            res_columns = self.columns  # the other axis of the frame
+            series_gen = ((i, Series(v, self.columns))
+                          for i, v in izip(self.index, self.values))  # wraps each item of self.values; no self.T
 
         results = {}
-        for k in target.columns:
-            results[k] = func(target[k])
+        for k, v in series_gen:  # k: label, v: Series handed to func
+            results[k] = func(v)
 
         if hasattr(results.values()[0], '__iter__'):
-            result = self._constructor(data=results, index=target.index,
-                                       columns=target.columns)
+            result = self._constructor(data=results, index=res_columns,
+                                       columns=res_index)  # keys of results become the columns
 
             if axis == 1:
                 result = result.T
 
             return result
         else:
-            return Series(results, index=agg_index)
+            return Series(results, index=res_index)  # non-iterable results: one entry per label
 
     def _apply_broadcast(self, func, axis):
         if axis == 0:

diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py
@@ -391,3 +391,52 @@ def _stack_multi_columns(frame, level=-1, dropna=True):
 
     return result
 
+
+def melt(frame, id_vars=None, value_vars=None):
+    """
+    "Unpivots" a DataFrame from wide format to long format, optionally leaving
+    id variables set. Returns a new DataFrame: id columns, 'variable', 'value'
+
+    Parameters
+    ----------
+    frame : DataFrame
+    id_vars : iterable of column names kept as identifiers, or None (default)
+    value_vars : currently unused; every non-id column is melted
+
+    Examples
+    --------
+    >>> df
+    A B C
+    a 1 2
+    b 3 4
+    c 5 6
+
+    >>> melt(df, ['A'])
+    A variable value
+    a B        1
+    b B        3
+    c B        5
+    a C        2
+    b C        4
+    c C        6
+    """
+    # TODO: what about the existing index?
+
+    N, K = frame.shape
+
+    mdata = {}
+
+    if id_vars is not None:
+        idvars = list(id_vars)
+        frame = frame.copy()  # pop() below must not mutate the caller's frame
+        K -= len(idvars)  # K is now the number of melted (non-id) columns
+        for col in idvars:
+            mdata[col] = np.tile(frame.pop(col).values, K)  # repeat ids per melted column
+    else:
+        idvars = []
+
+    mcolumns = idvars + ['variable', 'value']
+
+    mdata['value'] = frame.values.ravel('F')  # column-major: one column after another
+    mdata['variable'] = np.asarray(frame.columns).repeat(N)
+    return DataFrame(mdata, columns=mcolumns)
diff --git a/pandas/core/sparse.py b/pandas/core/sparse.py
@@ -1162,7 +1162,7 @@ def transpose(self):
     T = property(transpose)
 
     def count(self, axis=0, **kwds):
-        return self.apply(SparseSeries.count, axis=axis)
+        return self.apply(lambda x: x.count(), axis=axis)  # dispatch on x's own count()
     count.__doc__ = DataFrame.count.__doc__
 
     def cumsum(self, axis=0):
@@ -1178,7 +1178,7 @@ def cumsum(self, axis=0):
         -------
         y : SparseDataFrame
         """
-        return self.apply(SparseSeries.cumsum, axis=axis)
+        return self.apply(lambda x: x.cumsum(), axis=axis)
 
     def shift(self, periods, offset=None, timeRule=None):
         """

diff --git a/pandas/rpy/common.py b/pandas/rpy/common.py
@@ -74,6 +74,9 @@ def _convert_array(obj):
 def _convert_vector(obj):
     if isinstance(obj, robj.IntVector):
         return _convert_int_vector(obj)
+    elif isinstance(obj, robj.StrVector):  # string vectors get their own NA handling
+        return _convert_str_vector(obj)
+
     return list(obj)
 
 NA_INTEGER = -2147483648
@@ -86,6 +89,13 @@ def _convert_int_vector(obj):
         arr[mask] = np.nan
     return arr
 
+def _convert_str_vector(obj):
+    arr = np.asarray(obj, dtype=object)  # object dtype so NaN can sit next to strings
+    mask = arr == robj.NA_Character  # presumably elementwise: True where the R value is NA -- confirm
+    if mask.any():
+        arr[mask] = np.nan  # R's NA string marker is replaced by NaN
+    return arr
+
 def _convert_DataFrame(rdf):
     columns = list(rdf.colnames)
     rows = np.array(rdf.rownames)