In [2]:
import pandas as pd
import numpy as np

# This is the third go around for groupby

Though I'm pretty good with DataFrames now, I still go to the documentation quite often. I'd like to make one more pass over the groupby section and then probably go look at broadcasting. On with it.

# ex.0

In [21]:
# Build a small frame of standard-normal samples to experiment on.
# The draws come from a dedicated, seeded RandomState so that the values are
# the same on every Restart & Run All instead of changing with each execution.
rng = np.random.RandomState(0)
df = pd.DataFrame({
    'a': rng.randn(5),
    'b': rng.randn(5),
    'c': rng.randn(5)
})

In [22]:
# Display the freshly built frame (three float columns, five rows).
df

Unnamed: 0,a,b,c
0,1.129468,0.112991,-0.674198
1,0.189847,-0.238003,0.977634
2,0.470459,-0.673462,0.80518
3,0.722056,0.032414,-0.844928
4,0.483032,-1.105151,0.076161


In [23]:
# Boolean array marking the rows whose 'b' value is positive.
# Going through .values yields a plain NumPy array rather than a Series.
mask = df['b'].values > 0

In [24]:
# Inspect the mask: one True/False entry per row of df.
mask

array([ True, False, False,  True, False], dtype=bool)

In [25]:
# An array with one entry per row can be used directly as the group key;
# here the rows are split into the False and True groups of the mask.
g = df.groupby(mask)
# Number of rows that landed in each group.
g.size()

False    3
True     2
dtype: int64

# ex.1

In [30]:
# Two categorical key columns plus two numeric data columns.
# The numeric columns are drawn from a seeded RandomState so the frame is
# reproducible from a fresh kernel rather than different on every run.
rng = np.random.RandomState(1)
df = pd.DataFrame({
    'key1': 'a a b b a'.split(),
    'key2': 'one two one two one'.split(),
    'data1': rng.randn(5),
    'data2': rng.randn(5)
})

In [31]:
# Display the keyed frame used by the following examples.
df

Unnamed: 0,data1,data2,key1,key2
0,-0.628585,0.494158,a,one
1,-1.052151,-0.125664,a,two
2,0.216189,0.103146,b,one
3,0.029706,-0.484219,b,two
4,-0.108002,0.129474,a,one


In [32]:
# Group a single column (a Series) using another column of the same frame as the key.
g = df['data1'].groupby(df['key1'])

In [33]:
# Mean of data1 within each key1 group; the result is a Series indexed by key1.
g.mean()

key1
a   -0.596246
b    0.122947
Name: data1, dtype: float64

# ex.2

In [34]:
# Two key arrays, one entry per row of df, that are not columns of the frame.
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005] + [2005, 2006] * 2)

In [35]:
# Group keys may be a list of arrays of the right length; the result is
# indexed by the (state, year) pairs.
df['data1'].groupby([states, years]).mean()

California  2005   -1.052151
            2006    0.216189
Ohio        2005   -0.299440
            2006   -0.108002
Name: data1, dtype: float64

# ex.3

In [39]:
# Iterating a GroupBy yields (group key, sub-frame) pairs.
for key, rows in df.groupby('key1'):
    print(key, rows, sep='\n')

a
      data1     data2 key1 key2
0 -0.628585  0.494158    a  one
1 -1.052151 -0.125664    a  two
4 -0.108002  0.129474    a  one
b
      data1     data2 key1 key2
2  0.216189  0.103146    b  one
3  0.029706 -0.484219    b  two


In [40]:
# With several keys, each group key arrives as a tuple of the key values.
for keys, rows in df.groupby(['key1', 'key2']):
    print(keys, rows, sep='\n')

('a', 'one')
      data1     data2 key1 key2
0 -0.628585  0.494158    a  one
4 -0.108002  0.129474    a  one
('a', 'two')
      data1     data2 key1 key2
1 -1.052151 -0.125664    a  two
('b', 'one')
      data1     data2 key1 key2
2  0.216189  0.103146    b  one
('b', 'two')
      data1     data2 key1 key2
3  0.029706 -0.484219    b  two


# ex.4

In [41]:
# Column dtypes; used as the grouping key in the next cell.
df.dtypes

data1    float64
data2    float64
key1      object
key2      object
dtype: object

In [43]:
# Group the *columns* (axis=1) by their dtype instead of grouping the rows.
# NOTE(review): the `axis` keyword of groupby is reported as deprecated in
# recent pandas releases -- confirm against the installed version before relying on it.
g = df.groupby(df.dtypes, axis=1)

In [44]:
# Each iteration gives a dtype and the sub-frame of columns having that dtype.
for kind, cols in g:
    print(kind, cols, sep='\n')

float64
      data1     data2
0 -0.628585  0.494158
1 -1.052151 -0.125664
2  0.216189  0.103146
3  0.029706 -0.484219
4 -0.108002  0.129474
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one


# ex.5

In [45]:
# Selecting with a *list* of column labels: the aggregate comes back as a DataFrame.
df.groupby(['key1', 'key2'])[['data2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,0.311816
a,two,-0.125664
b,one,0.103146
b,two,-0.484219


In [46]:
# Selecting with a *single* column label: the aggregate comes back as a Series.
df.groupby(['key1', 'key2'])['data2'].mean()

key1  key2
a     one     0.311816
      two    -0.125664
b     one     0.103146
      two    -0.484219
Name: data2, dtype: float64

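Well, that's interesting: selecting with a list (`[['data2']]`) returns a DataFrame, while selecting with a bare label (`['data2']`) returns a Series. I wonder why that happens; looking at the intermediate groupby objects should explain it.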

In [50]:
# List selection -> a DataFrameGroupBy object, hence DataFrame results.
df.groupby(['key1', 'key2'])[['data2']]

<pandas.core.groupby.DataFrameGroupBy object at 0x000000000968C7B8>

In [49]:
# Single-label selection -> a SeriesGroupBy object, hence Series results.
df.groupby(['key1', 'key2'])['data2']

<pandas.core.groupby.SeriesGroupBy object at 0x00000000094C6C18>

# ex.6

In [52]:
# 5x5 frame of standard-normal samples, rows labelled by person and columns a-e.
# A seeded RandomState keeps the values stable across kernel restarts instead
# of producing a different table on every run.
rng = np.random.RandomState(2)
people = pd.DataFrame(rng.randn(5, 5),
                      columns='a b c d e'.split(),
                      index='Joe Steve Wes Jim Travis'.split())
# Knock out columns 'b' and 'd' of the third row (Wes) so the later
# aggregations have missing values to deal with.
people.iloc[2:3, [1, 3]] = np.nan
people

Unnamed: 0,a,b,c,d,e
Joe,-0.744554,0.100793,0.474875,1.151924,-0.405654
Steve,0.211749,1.075391,-0.912954,-0.995649,0.241971
Wes,-0.799246,,-2.218327,,-1.457126
Jim,0.550608,-0.462659,-1.323681,-0.276781,1.876627
Travis,-1.597903,0.651608,-0.223674,0.186746,-0.008754


# ex.7

In [54]:
# Column label -> colour group. 'f' is deliberately included even though
# `people` has no such column.
mapping = dict(zip(
    'abcdef',
    ['red', 'red', 'blue', 'blue', 'red', 'orange'],
))
mapping

{'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red', 'f': 'orange'}

In [55]:
# Sum each person's values within every colour group of columns.
# groupby(..., axis=1) is deprecated in pandas 2.1, so group the transposed
# frame instead (the column labels become the index that `mapping` is applied
# to) and transpose the aggregate back: rows are people, columns are colours.
by_col = people.T.groupby(mapping)
by_col.sum().T

Unnamed: 0,blue,red
Joe,1.626799,-1.049415
Steve,-1.908604,1.529111
Wes,-2.218327,-2.256372
Jim,-1.600462,1.964575
Travis,-0.036928,-0.955048


# ex.8

In [56]:
# The same label -> colour mapping, held as a Series instead of a dict.
map_series = pd.Series(mapping)
map_series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [57]:
# Count the non-missing values per person within each colour group.
# groupby(..., axis=1) is deprecated in pandas 2.1; grouping the transposed
# frame and transposing the result back gives the same people-by-colour table.
people.T.groupby(map_series).count().T

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Travis,2,3


# ex.9

In [58]:
# A function used as the key is applied to each index label; with `len` the
# rows are grouped by the length of the person's name (3, 5 or 6 characters here).
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,-0.993192,-0.361866,-3.067133,0.875142,0.013847
5,0.211749,1.075391,-0.912954,-0.995649,0.241971
6,-1.597903,0.651608,-0.223674,0.186746,-0.008754


Any function passed as a group key will be called once per index value, with the return values being used as the group names.

# ex.10

In [59]:
# One label per row of `people`; combined with `len` this groups on
# (name length, label) pairs.
key_list = ['one'] * 3 + ['two'] * 2
people.groupby([len, key_list]).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,-0.799246,0.100793,-2.218327,1.151924,-1.457126
3,two,0.550608,-0.462659,-1.323681,-0.276781,1.876627
5,one,0.211749,1.075391,-0.912954,-0.995649,0.241971
6,two,-1.597903,0.651608,-0.223674,0.186746,-0.008754


left off on page 295