In [1]:
import pandas as pd
import numpy as np

# This is the third go around for groupby

Though I'm pretty good with DataFrames now, I do still go to the documentation quite often. I'd like to make one more pass over the groupby section and then probably go look at the braodcasting. On with it.

# ex.0a Create a `DataFrame` from dictionary

In [2]:
df = pd.DataFrame({
    'a': np.random.randn(5),
    'b': np.random.randn(5),
    'c': np.random.randn(5)
})

In [3]:
df

Unnamed: 0,a,b,c
0,0.966273,0.299399,-1.044924
1,1.327576,1.278127,1.013426
2,2.515124,0.160159,0.879753
3,-1.156102,1.537934,-0.638214
4,-0.320623,-1.022967,0.28838


# ex.0b Select with a boolean array

In [4]:
mask = df['b'].values > 0

In [5]:
mask

array([ True,  True,  True,  True, False], dtype=bool)

In [6]:
g = df.groupby(mask)
g.size()

False    1
True     4
dtype: int64

# ex.1 Use `groupby` with a single key

In [7]:
df = pd.DataFrame({
    'key1': 'a a b b a'.split(),
    'key2': 'one two one two one'.split(),
    'data1': np.random.randn(5),
    'data2': np.random.randn(5)
})

In [8]:
df

Unnamed: 0,data1,data2,key1,key2
0,-0.124489,0.31811,a,one
1,1.213962,0.947897,a,two
2,-0.118728,-1.011203,b,one
3,0.155637,-1.156875,b,two
4,0.986994,0.109572,a,one


In [9]:
g = df['data1'].groupby(df['key1'])

In [10]:
g.mean()

key1
a    0.692155
b    0.018454
Name: data1, dtype: float64

# ex.2 Use `groupby` with arrays not in the `DataFrame`

In [11]:
states = np.array('Ohio California California Ohio Ohio'.split())
years = np.array([2005, 2005, 2006, 2005, 2006])

In [12]:
df['data1'].groupby([states, years]).mean()

California  2005    1.213962
            2006   -0.118728
Ohio        2005    0.015574
            2006    0.986994
Name: data1, dtype: float64

# ex.3 Iterate over `grouped` object

In [13]:
for name, group in df.groupby('key1'):
    print(name)
    print(group)

a
      data1     data2 key1 key2
0 -0.124489  0.318110    a  one
1  1.213962  0.947897    a  two
4  0.986994  0.109572    a  one
b
      data1     data2 key1 key2
2 -0.118728 -1.011203    b  one
3  0.155637 -1.156875    b  two


In [14]:
for (k1, k2), group in df.groupby(['key1', 'key2']):
    print((k1, k2))
    print(group)

('a', 'one')
      data1     data2 key1 key2
0 -0.124489  0.318110    a  one
4  0.986994  0.109572    a  one
('a', 'two')
      data1     data2 key1 key2
1  1.213962  0.947897    a  two
('b', 'one')
      data1     data2 key1 key2
2 -0.118728 -1.011203    b  one
('b', 'two')
      data1     data2 key1 key2
3  0.155637 -1.156875    b  two


# ex.4 Groub by columns based on `dtype`

In [15]:
df.dtypes

data1    float64
data2    float64
key1      object
key2      object
dtype: object

In [16]:
g = df.groupby(df.dtypes, axis=1)

In [17]:
for dtype, group in g:
    print(dtype)
    print(group)

float64
      data1     data2
0 -0.124489  0.318110
1  1.213962  0.947897
2 -0.118728 -1.011203
3  0.155637 -1.156875
4  0.986994  0.109572
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one


# ex.5 Return `DataFrame` or `Series` from `groupby`

In [18]:
df.groupby(['key1', 'key2'])[['data2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,0.213841
a,two,0.947897
b,one,-1.011203
b,two,-1.156875


In [19]:
df.groupby(['key1', 'key2'])['data2'].mean()

key1  key2
a     one     0.213841
      two     0.947897
b     one    -1.011203
      two    -1.156875
Name: data2, dtype: float64

Well that's interesting. I wonder why that happens.

In [20]:
# DataFrame
df.groupby(['key1', 'key2'])[['data2']]

<pandas.core.groupby.DataFrameGroupBy object at 0x7f44f034dfd0>

In [21]:
# Series
df.groupby(['key1', 'key2'])['data2']

<pandas.core.groupby.SeriesGroupBy object at 0x7f44f05a05f8>

# ex.6a Use `iloc` for selecting cells

In [22]:
people = pd.DataFrame(np.random.randn(5, 5),
                     columns = 'a b c d e'.split(),
                     index='Joe Steve Wes Jim Travis'.split())
people.iloc[2:3, [1, 3]] = np.nan
people

Unnamed: 0,a,b,c,d,e
Joe,-1.149193,-0.670221,-0.299238,0.191345,0.178385
Steve,-0.071161,0.865419,-0.314956,0.140077,-0.55849
Wes,-0.515224,,0.867893,,-0.335958
Jim,0.767961,0.751226,1.718989,0.742853,0.234961
Travis,-1.030884,-0.647332,-0.148332,-0.451267,-0.068723


# ex.6b Same thing

In [23]:
df = pd.DataFrame()

for i in range(10):
    df[i] = np.arange(10)

df.iloc[[4, 8], 2:7]

Unnamed: 0,2,3,4,5,6
4,4,4,4,4,4
8,8,8,8,8,8


# ex.7a Dict comprehensions to create a mapping

In [37]:
mapping = {i:j for i, j in zip(
    'a b c d e f'.split(),
    'red red blue blue red orange'.split())
  }
mapping

{'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red', 'f': 'orange'}

# ex.7b Use a mapping to group a `DataFrame`

In [25]:
people

Unnamed: 0,a,b,c,d,e
Joe,-1.149193,-0.670221,-0.299238,0.191345,0.178385
Steve,-0.071161,0.865419,-0.314956,0.140077,-0.55849
Wes,-0.515224,,0.867893,,-0.335958
Jim,0.767961,0.751226,1.718989,0.742853,0.234961
Travis,-1.030884,-0.647332,-0.148332,-0.451267,-0.068723


In [26]:
by_col = people.groupby(mapping, axis=1)
by_col.sum()

Unnamed: 0,blue,red
Joe,-0.107892,-1.641029
Steve,-0.17488,0.235768
Wes,0.867893,-0.851181
Jim,2.461843,1.754148
Travis,-0.599599,-1.746939


# ex.7c New mapping

In [40]:
new_map = {i:j for i, j in zip(
    people.columns,
    'fee fi foe fi fum'.split()
)}
g = people.groupby(new_map, axis=1)
g.sum()

Unnamed: 0,fee,fi,foe,fum
Joe,-1.149193,-0.478876,-0.299238,0.178385
Steve,-0.071161,1.005495,-0.314956,-0.55849
Wes,-0.515224,,0.867893,-0.335958
Jim,0.767961,1.494079,1.718989,0.234961
Travis,-1.030884,-1.098599,-0.148332,-0.068723


# ex.8a Convert `dict` to `Series`

In [27]:
map_series = pd.Series(mapping)
map_series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

# ex.8b Use `Series` to group a `DataFrame`

In [28]:
people.groupby(map_series, axis=1).count()

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Travis,2,3


# ex.9 Use `groupby` with a function

ie. the function takes as an argument, the index value of the record.

In [29]:
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,-0.896455,0.081005,2.287645,0.934199,0.077388
5,-0.071161,0.865419,-0.314956,0.140077,-0.55849
6,-1.030884,-0.647332,-0.148332,-0.451267,-0.068723


Any function passed as a group key will be called once per index value, with the return values being used as the group names.

# ex.10 Use `groupby` with mixed grouping types

In [30]:
key_list = 'one one one two two'.split()
people.groupby([len, key_list]).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,-1.149193,-0.670221,-0.299238,0.191345,-0.335958
3,two,0.767961,0.751226,1.718989,0.742853,0.234961
5,one,-0.071161,0.865419,-0.314956,0.140077,-0.55849
6,two,-1.030884,-0.647332,-0.148332,-0.451267,-0.068723


# ex.11a Create a hierarchical index

In [31]:
columns = pd.MultiIndex.from_arrays([
    'US US US JP JP'.split(),
    [1, 3, 5, 1, 3]],
    names=['city', 'tenor'])

In [32]:
hier_df = pd.DataFrame(np.random.randn(4, 5), columns=columns)
hier_df

city,US,US,US,JP,JP
tenor,1,3,5,1,3
0,-0.930593,-0.450047,0.905084,0.040623,0.325154
1,-0.160713,-1.602184,1.722683,-1.251997,1.509847
2,1.682043,-0.403944,-1.034016,-0.32588,-0.774049
3,0.898256,1.755736,0.629347,-1.355565,0.20868


# ex. 11b Use `groupby` with hierarchical index

In [33]:
hier_df.groupby(level='city', axis=1).count()

city,JP,US
0,2,3
1,2,3
2,2,3
3,2,3


# ex.12 Get a group's `quantile`

The key here is that `quantile()` is a method of the `Series` object, not the `grouped` object. However, it is available as the result of a `groupby` operation.

In [34]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0,0,0,0,0,0,0,0,0,0
1,1,1,1,1,1,1,1,1,1,1
2,2,2,2,2,2,2,2,2,2,2
3,3,3,3,3,3,3,3,3,3,3
4,4,4,4,4,4,4,4,4,4,4
5,5,5,5,5,5,5,5,5,5,5
6,6,6,6,6,6,6,6,6,6,6
7,7,7,7,7,7,7,7,7,7,7
8,8,8,8,8,8,8,8,8,8,8
9,9,9,9,9,9,9,9,9,9,9


In [35]:
g = df.groupby('key1')
g['data1'].quantile(0.9)

KeyError: 'key1'

# ex.13 Create your own aggregation function

In [None]:
def peak_to_peak(arr):
    return arr.max() - arr.min()

In [None]:
g.agg(peak_to_peak)

# ex.14 Create a new column from existing columns

In [None]:
tips = pd.read_csv('../data/tips.csv')

In [None]:
tips.head(1)

In [None]:
tips['tip_pct'] = tips['tip'] / tips['total_bill']
tips.head(1)

# ex.15 Aggregate with multiple functions at a time

In [None]:
g = tips.groupby(['day', 'smoker'])
g_pct = g['tip_pct']
g_pct.agg('mean')

In [None]:
g_pct.agg(['mean', 'std', peak_to_peak], as_index=False)

Left off on page 302