Doc: Added warning to treat group chunks as immutable when using apply #19114

Closed (wants to merge 4 commits)
94 changes: 94 additions & 0 deletions doc/source/gotchas.rst
@@ -332,3 +332,97 @@ using something similar to the following:
See `the NumPy documentation on byte order
<https://docs.scipy.org/doc/numpy/user/basics.byteswapping.html>`__ for more
details.


Alternative to storing lists in ``DataFrame`` cells
---------------------------------------------------

Review comment (Contributor): just DataFrame
Storing nested lists or arrays inside a pandas object should be avoided for
performance and memory use reasons. Instead, such values should be "exploded"
into a flat ``DataFrame`` structure.

Review comment (Contributor): use double backticks around DataFrame
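To see why this matters, a minimal sketch (the column names below are illustrative, not from the document): a column holding Python lists falls back to the generic ``object`` dtype, so each cell is a plain Python object and fast vectorized operations cannot be applied to it.

```python
import pandas as pd

# Hypothetical illustration: a list-valued column is stored as "object" dtype,
# so pandas keeps each cell as an ordinary Python list on the heap instead of
# a contiguous, vectorizable array.
df = pd.DataFrame({'name': ['A.J. Price'] * 3,
                   'neighbors': [['Zach LaVine', 'Jeremy Lin']] * 3})
print(df['neighbors'].dtype)          # object
print(type(df.loc[0, 'neighbors']))   # a plain Python list
```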


Example of exploding nested lists into a ``DataFrame``:

.. ipython:: python

    from collections import OrderedDict
    df = pd.DataFrame(OrderedDict([('name', ['A.J. Price'] * 3),
                                   ('opponent', ['76ers', 'blazers', 'bobcats']),
                                   ('attribute x', ['A', 'B', 'C'])]))
    df

Review comment (Contributor): use dict construction directly; if you want column ordering then pass ``columns``
    nn = [['Zach LaVine', 'Jeremy Lin', 'Nate Robinson', 'Isaia']] * 3
    nn

Review comment (Contributor): call this something more apparent

    # Step 1: Combine the "parent" columns with a DataFrame built from the
    # nested lists, so both are included in the final DataFrame
    df2 = pd.concat([df[['name', 'opponent']], pd.DataFrame(nn)], axis=1)
    df2

Review comment (Contributor): you can use sphinx to number these

    # Step 2: Set the "parent" columns as the index; the list-derived
    # columns remain as data columns, while any column that is neither in
    # the index nor list-derived will not be part of the result
    df3 = df2.set_index(['name', 'opponent'])
    df3

    # Step 3: Stack the new columns as rows; this creates a new index level
    # we'll want to drop in the next step.
    # Note that at this point we have a Series, not a DataFrame
    ser = df3.stack()
    ser

    # Step 4: Drop the extraneous index level created by the stack
    ser.reset_index(level=2, drop=True, inplace=True)
    ser

    # Step 5: Create a DataFrame from the Series
    df4 = ser.to_frame('nearest_neighbors')
    df4

    # All steps in one stack
    df4 = (df2.set_index(['name', 'opponent'])
              .stack()
              .reset_index(level=2, drop=True)
              .to_frame('nearest_neighbors'))
    df4
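The step-by-step pipeline above can be checked end to end. The sketch below rebuilds the same inputs with plain ``dict`` construction (as the review suggests) and is an illustration, not part of the committed doc:

```python
import pandas as pd

# Rebuild the inputs; column order is not important for this check.
df = pd.DataFrame({'name': ['A.J. Price'] * 3,
                   'opponent': ['76ers', 'blazers', 'bobcats']})
nn = [['Zach LaVine', 'Jeremy Lin', 'Nate Robinson', 'Isaia']] * 3

# Same pipeline as Steps 1-5: attach the nested lists as numbered columns,
# index by the parent columns, stack the numbered columns into rows, drop
# the stacked column-label level, and name the resulting column.
df4 = (pd.concat([df, pd.DataFrame(nn)], axis=1)
         .set_index(['name', 'opponent'])
         .stack()
         .reset_index(level=2, drop=True)
         .to_frame('nearest_neighbors'))

# 3 original rows x 4 list entries -> 12 exploded rows
```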

Example of exploding a list embedded in a ``DataFrame``:

.. ipython:: python

    df = pd.DataFrame(OrderedDict([
        ('name', ['A.J. Price'] * 3),
        ('opponent', ['76ers', 'blazers', 'bobcats']),
        ('attribute x', ['A', 'B', 'C']),
        ('nearest_neighbors',
         [['Zach LaVine', 'Jeremy Lin', 'Nate Robinson', 'Isaia']] * 3)
    ]))

    df

    # Step 1: Create an index with the "parent" columns to be included
    # in the final DataFrame
    df2 = df.set_index(['name', 'opponent'])
    df2

    # Step 2: Transform the column with lists into series, which become
    # columns in a new DataFrame.
    # Note that only the index from the original df is retained -
    # any other columns in the original df are not part of the new df
    df3 = df2.nearest_neighbors.apply(pd.Series)
    df3

    # Step 3: Stack the new columns as rows; this creates a new index level
    # we'll want to drop in the next step.
    # Note that at this point we have a Series, not a DataFrame
    ser = df3.stack()
    ser

    # Step 4: Drop the extraneous index level created by the stack
    ser.reset_index(level=2, drop=True, inplace=True)
    ser

    # Step 5: Create a DataFrame from the Series
    df4 = ser.to_frame('nearest_neighbors')
    df4

    # All steps in one stack
    df4 = (df.set_index(['name', 'opponent'])
             .nearest_neighbors.apply(pd.Series)
             .stack()
             .reset_index(level=2, drop=True)
             .to_frame('nearest_neighbors'))
    df4
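As a side note, not part of this PR: newer pandas releases (0.25 and later) provide ``DataFrame.explode``, which performs the same list-to-rows transformation in a single call. A minimal sketch, assuming such a version is available:

```python
import pandas as pd

df = pd.DataFrame({'name': ['A.J. Price'] * 3,
                   'opponent': ['76ers', 'blazers', 'bobcats'],
                   'nearest_neighbors': [['Zach LaVine', 'Jeremy Lin',
                                          'Nate Robinson', 'Isaia']] * 3})

# Each list element becomes its own row; the other columns are repeated.
exploded = df.explode('nearest_neighbors')
```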
44 changes: 42 additions & 2 deletions doc/source/groupby.rst
@@ -915,13 +915,17 @@ The dimension of the returned result can also change:
So, depending on the path taken and exactly what you are grouping, the grouped
column(s) may be included in the output and may also be set as the index.

.. warning::

    * In the current implementation apply calls func twice on the
      first group to decide whether it can take a fast or slow code
      path. This can lead to unexpected behavior if func has
      side-effects, as they will take effect twice for the first
      group.

    * apply should not perform in-place operations on the group chunk.
      Group chunks should be treated as immutable, and changes to a
      group chunk may produce unexpected results.
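A small sketch of why the first bullet matters (the frame and function here are illustrative, not from the PR): a function with side effects can observe extra invocations, because ``apply`` may evaluate the first group more than once to choose a code path. The exact count depends on the pandas version, so the only safe claim is "at least once per group".

```python
import pandas as pd

df = pd.DataFrame({'A': ['x', 'x', 'y'], 'B': [1, 2, 3]})

calls = []

def func(group):
    # Side effect: record every invocation. Depending on the pandas version,
    # the first group may be evaluated twice, so len(calls) can exceed the
    # number of groups. Mutating `group` in place here would be similarly
    # unsafe, since the same chunk may be reused.
    calls.append(group['B'].sum())
    return group['B'].sum()

result = df.groupby('A').apply(func)
```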

.. ipython:: python

@@ -955,6 +959,42 @@ will be (silently) dropped. Thus, this does not pose any problems:

    df.groupby('A').std()

.. note::

Review comment (Contributor): this is for another issue?

Review comment (Contributor): PR #18953 ?

Reply (Author): Yes, I should have made this a separate branch on my fork and a
separate pull request. I will make the updates per your notes above.

Reply (Author): I created a clean pull request for this fix: #19215

    Decimal columns are also "nuisance" columns. They are excluded from
    aggregate functions automatically in groupby.

    If you do wish to include decimal columns in the aggregation, you must
    do so explicitly:

.. ipython:: python

    from decimal import Decimal
    dec = pd.DataFrame(
        {'name': ['foo', 'bar', 'foo', 'bar'],
         'title': ['boo', 'far', 'boo', 'far'],
         'id': [123, 456, 123, 456],
         'int_column': [1, 2, 3, 4],
         'dec_column1': [Decimal('0.50'), Decimal('0.15'), Decimal('0.25'), Decimal('0.40')],
         'dec_column2': [Decimal('0.20'), Decimal('0.30'), Decimal('0.55'), Decimal('0.60')]},
        columns=['name', 'title', 'id', 'int_column', 'dec_column1', 'dec_column2']
    )

    dec.head()

    dec.dtypes

    # Decimal columns are excluded from sum by default
    dec.groupby(['name', 'title', 'id'], as_index=False).sum()

    # Decimal columns can be summed explicitly by themselves...
    dec.groupby(['name', 'title', 'id'], as_index=False)['dec_column1', 'dec_column2'].sum()

    # ...but cannot be combined with standard data types or they will be excluded
    dec.groupby(['name', 'title', 'id'], as_index=False)['int_column', 'dec_column1', 'dec_column2'].sum()

    # Use the .agg function to aggregate over standard and "nuisance" data types at the same time
    dec.groupby(['name', 'title', 'id'], as_index=False).agg(
        {'int_column': 'sum', 'dec_column1': 'sum', 'dec_column2': 'sum'})
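A standalone sketch (with an illustrative subset of the columns above) confirming that the explicit ``agg`` route keeps exact ``Decimal`` arithmetic rather than converting to floating point:

```python
import pandas as pd
from decimal import Decimal

dec = pd.DataFrame({'name': ['foo', 'bar', 'foo', 'bar'],
                    'int_column': [1, 2, 3, 4],
                    'dec_column1': [Decimal('0.50'), Decimal('0.15'),
                                    Decimal('0.25'), Decimal('0.40')]})

# Aggregating explicitly sums Decimal + Decimal exactly (no float rounding):
# the 'foo' rows give 0.50 + 0.25 = 0.75 as a Decimal.
result = dec.groupby('name', as_index=False).agg(
    {'int_column': 'sum', 'dec_column1': 'sum'})
```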


.. _groupby.missing:

NA and NaT group handling