In [1]:
import pandas as pd
import numpy as np

# This is the third go around for groupby

Though I'm pretty good with DataFrames now, I still go to the documentation quite often. I'd like to make one more pass over the groupby section and then probably go look at broadcasting. On with it.

# ex.0a Create a `DataFrame` from dictionary

In [2]:
# Three columns ('a', 'b', 'c') of standard-normal noise, five rows each.
df = pd.DataFrame({label: np.random.randn(5) for label in ('a', 'b', 'c')})

In [3]:
# Display the frame to see the random values drawn above.
df

Unnamed: 0,a,b,c
0,-0.585493,0.567396,-2.028321
1,-0.031684,0.183487,-0.304099
2,0.20155,0.182747,0.421581
3,-0.570686,-1.502682,0.51999
4,-1.428974,-0.667893,-0.30328


# ex.0b Group by a boolean array

In [4]:
# Boolean NumPy array: True for the rows where column 'b' is positive.
mask = df['b'].values > 0

In [5]:
# Display the mask to check which rows it marks as True.
mask

array([ True,  True,  True, False, False], dtype=bool)

In [6]:
# Group the rows by the boolean mask; each distinct mask value becomes a group.
g = df.groupby(mask)
# Number of rows that landed in each group.
g.size()

False    2
True     3
dtype: int64

# ex.1 Use `groupby` with a single key

In [7]:
# Two label columns to group on, plus two columns of random data to aggregate.
df = pd.DataFrame({
    'key1': ['a', 'a', 'b', 'b', 'a'],
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data1': np.random.randn(5),
    'data2': np.random.randn(5),
})

In [8]:
# Display the frame that the following groupby exercises operate on.
df

Unnamed: 0,data1,data2,key1,key2
0,0.579549,-0.592059,a,one
1,-0.481855,0.416635,a,two
2,-0.969665,1.007255,b,one
3,0.230472,0.893711,b,two
4,1.53406,-0.936078,a,one


In [9]:
# Group the 'data1' column by the labels found in 'key1'.
g = df['data1'].groupby(df['key1'])

In [10]:
# Mean of 'data1' within each 'key1' group.
g.mean()

key1
a    0.543918
b   -0.369596
Name: data1, dtype: float64

# ex.2 Use `groupby` with arrays not in the `DataFrame`

In [11]:
# Group keys kept outside the DataFrame: one state and one year per row.
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])

In [12]:
# The keys are plain arrays, not columns of `df`; they line up with the rows
# by position and together form a two-level group index.
df['data1'].groupby([states, years]).mean()

California  2005   -0.481855
            2006   -0.969665
Ohio        2005    0.405011
            2006    1.534060
Name: data1, dtype: float64

# ex.3 Iterate over `grouped` object

In [13]:
# Iterating a groupby yields (group key, sub-frame) pairs.
for key, rows in df.groupby('key1'):
    print(key)
    print(rows)

a
      data1     data2 key1 key2
0  0.579549 -0.592059    a  one
1 -0.481855  0.416635    a  two
4  1.534060 -0.936078    a  one
b
      data1     data2 key1 key2
2 -0.969665  1.007255    b  one
3  0.230472  0.893711    b  two


In [14]:
# With several keys, the group key arrives as a tuple with one entry per key.
for pair, rows in df.groupby(['key1', 'key2']):
    print(pair)
    print(rows)

('a', 'one')
      data1     data2 key1 key2
0  0.579549 -0.592059    a  one
4  1.534060 -0.936078    a  one
('a', 'two')
      data1     data2 key1 key2
1 -0.481855  0.416635    a  two
('b', 'one')
      data1     data2 key1 key2
2 -0.969665  1.007255    b  one
('b', 'two')
      data1     data2 key1 key2
3  0.230472  0.893711    b  two


# ex.4 Group by columns based on `dtype`

In [15]:
# One dtype per column; this Series is used as the grouping key in the next step.
df.dtypes

data1    float64
data2    float64
key1      object
key2      object
dtype: object

In [16]:
# axis=1 groups the columns (rather than the rows), keyed here by each column's dtype.
# NOTE(review): recent pandas releases deprecate `axis=1` in groupby — confirm
# against the installed version before re-running.
g = df.groupby(df.dtypes, axis=1)

In [17]:
# Each iteration yields a dtype and the sub-frame of columns having that dtype.
for kind, block in g:
    print(kind)
    print(block)

float64
      data1     data2
0  0.579549 -0.592059
1 -0.481855  0.416635
2 -0.969665  1.007255
3  0.230472  0.893711
4  1.534060 -0.936078
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one


# ex.5 Return `DataFrame` or `Series` from `groupby`

In [18]:
# Selecting with a list of column names ([['data2']]) keeps the result a DataFrame.
df.groupby(['key1', 'key2'])[['data2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,-0.764068
a,two,0.416635
b,one,1.007255
b,two,0.893711


In [19]:
# Selecting with a single column name (['data2']) gives a Series instead.
df.groupby(['key1', 'key2'])['data2'].mean()

key1  key2
a     one    -0.764068
      two     0.416635
b     one     1.007255
      two     0.893711
Name: data2, dtype: float64

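Well, that's interesting: selecting with a list (`[['data2']]`) gives a `DataFrame`, while selecting with a bare column name (`['data2']`) gives a `Series`. Looking at the intermediate groupby objects shows why.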

In [20]:
# List selector -> DataFrameGroupBy, which is why the aggregate came back as a DataFrame.
df.groupby(['key1', 'key2'])[['data2']]

<pandas.core.groupby.DataFrameGroupBy object at 0x7fee85fa0ba8>

In [21]:
# Single-name selector -> SeriesGroupBy, which is why the aggregate came back as a Series.
df.groupby(['key1', 'key2'])['data2']

<pandas.core.groupby.SeriesGroupBy object at 0x7fee85fa06a0>

# ex.6a Use `iloc` for selecting cells

In [22]:
# 5x5 frame of random values: one row per person, columns 'a' through 'e'.
people = pd.DataFrame(
    np.random.randn(5, 5),
    index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'],
    columns=list('abcde'),
)
# Positional assignment: row 2 ('Wes'), columns 1 and 3 ('b' and 'd') become NaN.
people.iloc[2:3, [1, 3]] = np.nan
people

Unnamed: 0,a,b,c,d,e
Joe,0.840707,-1.003906,-0.571512,-0.476063,0.547562
Steve,1.394323,-0.705663,0.113912,3.141251,-1.980103
Wes,-0.393769,,-0.50009,,1.723688
Jim,-0.655611,-0.141937,0.136639,1.256529,0.447758
Travis,1.22215,0.065389,1.480143,1.268961,-0.662691


# ex.6b Use `iloc` with a list of rows and a slice of columns

In [34]:
# Scratch frame for the iloc demo: 10 columns labelled 0..9, each holding 0..9.
# It gets its own name on purpose. Rebinding `df` here would replace the
# key1/key2 frame from ex.1, and ex.12 further down still groups that frame
# by 'key1' — so a top-to-bottom run would fail there.
grid = pd.DataFrame({col: np.arange(10) for col in range(10)})

# Rows picked by a list of positions, columns by a positional slice
# (end-exclusive, so columns 2 through 6).
grid.iloc[[4, 8], 2:7]

Unnamed: 0,2,3,4,5,6
4,4,4,4,4,4
8,8,8,8,8,8


# ex.7a Dict comprehensions to create a mapping

In [23]:
# Column label -> colour group, built with a dict comprehension over paired lists.
mapping = {
    label: colour
    for label, colour in zip(
        list('abcdef'),
        ['red', 'red', 'blue', 'blue', 'red', 'orange'],
    )
}
mapping

{'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red', 'f': 'orange'}

# ex.7b Use a mapping to group a `DataFrame`

In [35]:
# Re-display `people` as a reminder of its columns before grouping them.
people

Unnamed: 0,a,b,c,d,e
Joe,0.840707,-1.003906,-0.571512,-0.476063,0.547562
Steve,1.394323,-0.705663,0.113912,3.141251,-1.980103
Wes,-0.393769,,-0.50009,,1.723688
Jim,-0.655611,-0.141937,0.136639,1.256529,0.447758
Travis,1.22215,0.065389,1.480143,1.268961,-0.662691


In [24]:
# Group the columns (axis=1) by the colour that `mapping` assigns to each column label.
by_col = people.groupby(mapping, axis=1)
# Per-row totals within each colour group.
by_col.sum()

Unnamed: 0,blue,red
Joe,-1.047575,0.384362
Steve,3.255164,-1.291443
Wes,-0.50009,1.329919
Jim,1.393168,-0.34979
Travis,2.749104,0.624848


# ex.8a Convert `dict` to `Series`

In [25]:
# Series built from the dict: the keys become the index, the colours the values.
map_series = pd.Series(mapping)
map_series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

# ex.8b Use `Series` to group a `DataFrame`

In [26]:
# A Series works as the column-to-group mapping too; count() tallies the
# non-NaN cells in each group, so the row with NaNs shows smaller counts.
people.groupby(map_series, axis=1).count()

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Travis,2,3


# ex.9 Use `groupby` with a function

i.e. the function is called with each index value (here, each person's name), and its return value becomes that row's group key.

In [27]:
# `len` is applied to each index label (the names), so rows are grouped by name length.
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,-0.208673,-1.145844,-0.934963,0.780465,2.719008
5,1.394323,-0.705663,0.113912,3.141251,-1.980103
6,1.22215,0.065389,1.480143,1.268961,-0.662691


Any function passed as a group key will be called once per index value, with the return values being used as the group names.

# ex.10 Use `groupby` with mixed grouping types

In [28]:
# One label per row of `people`, in row order.
key_list = ['one', 'one', 'one', 'two', 'two']
# Different kinds of key can be combined in one list: a function of the index
# label (len) alongside a plain list of labels.
people.groupby([len, key_list]).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,-0.393769,-1.003906,-0.571512,-0.476063,0.547562
3,two,-0.655611,-0.141937,0.136639,1.256529,0.447758
5,one,1.394323,-0.705663,0.113912,3.141251,-1.980103
6,two,1.22215,0.065389,1.480143,1.268961,-0.662691


# ex.11a Create a hierarchical index

In [29]:
# Two-level column index: outer level 'city', inner level 'tenor'.
columns = pd.MultiIndex.from_arrays(
    [
        ['US', 'US', 'US', 'JP', 'JP'],
        [1, 3, 5, 1, 3],
    ],
    names=['city', 'tenor'],
)

In [30]:
# Four rows of random data under the two-level (city, tenor) column index.
hier_df = pd.DataFrame(np.random.randn(4, 5), columns=columns)
hier_df

city,US,US,US,JP,JP
tenor,1,3,5,1,3
0,-0.701307,-1.154742,-0.719456,-0.275203,-1.675678
1,-0.977431,-1.145637,-1.038227,-0.029095,0.445898
2,1.514782,0.641052,0.127103,0.458284,0.208249
3,-0.522998,0.965964,1.166711,1.283251,-0.238757


# ex.11b Use `groupby` with hierarchical index

In [31]:
# Group the columns by the 'city' level of the column index and count the
# values in each group, row by row.
hier_df.groupby(level='city', axis=1).count()

city,JP,US
0,2,3
1,2,3
2,2,3
3,2,3


# ex.12 Get a group's `quantile`

In [32]:
# This exercise expects `df` to be the key1/key2/data1/data2 frame from ex.1
# (as in the output shown here), not any frame bound to `df` in between.
df

Unnamed: 0,data1,data2,key1,key2
0,0.579549,-0.592059,a,one
1,-0.481855,0.416635,a,two
2,-0.969665,1.007255,b,one
3,0.230472,0.893711,b,two
4,1.53406,-0.936078,a,one


In [33]:
# Group the rows by 'key1'.
g = df.groupby('key1')
# 0.9 quantile (90th percentile) of 'data1' within each group.
g['data1'].quantile(0.9)

key1
a    1.343158
b    0.110459
Name: data1, dtype: float64