In [1]:
import pandas as pd
import numpy as np

# This is the third go-around for groupby

Though I'm pretty good with DataFrames now, I still go to the documentation quite often. I'd like to make one more pass over the groupby section and then probably go look at broadcasting. On with it.

# ex.0

In [2]:
# Three columns ('a', 'b', 'c'), each holding five standard-normal draws.
# Columns are drawn one after another, in label order.
df = pd.DataFrame({label: np.random.randn(5) for label in 'abc'})

In [3]:
# Display the random three-column frame.
df

Unnamed: 0,a,b,c
0,-0.25239,1.440717,-1.026496
1,-0.666791,0.246711,0.543587
2,-0.50272,-0.475237,0.362402
3,1.763368,0.810041,-0.020765
4,1.541223,0.192489,-1.15092


In [4]:
# Boolean NumPy array flagging the rows whose 'b' value is positive.
mask = (df['b'] > 0).values

In [5]:
# Display the boolean array: one flag per row of df.
mask

array([ True,  True, False,  True,  True], dtype=bool)

In [6]:
# Group the rows by the boolean array itself: its values (False / True)
# become the group keys, and size() reports how many rows fall in each.
g = df.groupby(mask)
g.size()

False    1
True     4
dtype: int64

# ex.1

In [7]:
# Rebind df to a small frame with two categorical key columns and two
# columns of standard-normal data (data1 is drawn before data2).
df = pd.DataFrame(dict(
    key1='a a b b a'.split(),
    key2='one two one two one'.split(),
    data1=np.random.randn(5),
    data2=np.random.randn(5),
))

In [8]:
# Display the keyed frame.
df

Unnamed: 0,data1,data2,key1,key2
0,0.428534,0.352563,a,one
1,0.270488,-0.949608,a,two
2,-0.499462,-0.655156,b,one
3,-0.082271,-1.034241,b,two
4,1.266016,0.646985,a,one


In [9]:
# Group a single column (data1) using another column (key1) as the key.
g = df['data1'].groupby(df['key1'])

In [10]:
# Mean of data1 within each key1 group.
g.mean()

key1
a    0.655013
b   -0.290866
Name: data1, dtype: float64

# ex.2

In [11]:
# Two parallel label arrays, one entry per row of df, to use as external
# group keys.
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([
    2005,
    2005,
    2006,
    2005,
    2006,
])

In [12]:
# Group keys do not have to be columns of df: plain arrays of matching length
# work too, and passing two of them yields a two-level result index.
df['data1'].groupby([states, years]).mean()

California  2005    0.270488
            2006   -0.499462
Ohio        2005    0.173132
            2006    1.266016
Name: data1, dtype: float64

# ex.3

In [13]:
# Iterating a GroupBy yields (group key, sub-frame) pairs; print the key and
# then its rows on the following lines.
for name, group in df.groupby('key1'):
    print(name, group, sep='\n')

a
      data1     data2 key1 key2
0  0.428534  0.352563    a  one
1  0.270488 -0.949608    a  two
4  1.266016  0.646985    a  one
b
      data1     data2 key1 key2
2 -0.499462 -0.655156    b  one
3 -0.082271 -1.034241    b  two


In [14]:
# With two grouping columns each key is a (key1, key2) tuple; print the tuple
# and then the rows belonging to it.
for (k1, k2), group in df.groupby(['key1', 'key2']):
    print((k1, k2), group, sep='\n')

('a', 'one')
      data1     data2 key1 key2
0  0.428534  0.352563    a  one
4  1.266016  0.646985    a  one
('a', 'two')
      data1     data2 key1 key2
1  0.270488 -0.949608    a  two
('b', 'one')
      data1     data2 key1 key2
2 -0.499462 -0.655156    b  one
('b', 'two')
      data1     data2 key1 key2
3 -0.082271 -1.034241    b  two


# ex.4

In [15]:
# Per-column dtypes: a Series indexed by column name.
df.dtypes

data1    float64
data2    float64
key1      object
key2      object
dtype: object

In [16]:
# Partition the columns of df by dtype.
# ``DataFrame.groupby(..., axis=1)`` is deprecated (pandas 2.1) and removed in
# pandas 3.0, so build the (dtype, sub-frame) pairs explicitly instead.  The
# result can be iterated exactly like the grouped object it replaces, and the
# dtypes are ordered by name so the iteration order is deterministic.
g = [
    (dtype, df.loc[:, df.dtypes == dtype])
    for dtype in sorted(df.dtypes.unique(), key=str)
]

In [17]:
# Show each dtype followed by the columns of that dtype.
for dtype, group in g:
    print(dtype, group, sep='\n')

float64
      data1     data2
0  0.428534  0.352563
1  0.270488 -0.949608
2 -0.499462 -0.655156
3 -0.082271 -1.034241
4  1.266016  0.646985
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one


# ex.5

In [18]:
# Selecting with a *list* of column names keeps the result two-dimensional:
# the aggregated output is a DataFrame with a single 'data2' column.
df.groupby(['key1', 'key2'])[['data2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,0.499774
a,two,-0.949608
b,one,-0.655156
b,two,-1.034241


In [19]:
# Selecting with a single column name gives a one-dimensional result:
# the aggregated output is a Series named 'data2'.
df.groupby(['key1', 'key2'])['data2'].mean()

key1  key2
a     one     0.499774
      two    -0.949608
b     one    -0.655156
      two    -1.034241
Name: data2, dtype: float64

Well, that's interesting: selecting with `[['data2']]` gives a DataFrame, while selecting with `['data2']` gives a Series. I wonder why that happens.

In [20]:
# List selection -> DataFrameGroupBy, so aggregations return a DataFrame.
df.groupby(['key1', 'key2'])[['data2']]

<pandas.core.groupby.DataFrameGroupBy object at 0x7ff8859a3898>

In [21]:
# Scalar selection -> SeriesGroupBy, so aggregations return a Series.
df.groupby(['key1', 'key2'])['data2']

<pandas.core.groupby.SeriesGroupBy object at 0x7ff8859a3320>

# ex.6

In [22]:
# 5x5 frame of standard-normal values, rows labelled by person and columns
# labelled 'a'..'e'.
people = pd.DataFrame(
    np.random.randn(5, 5),
    index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'],
    columns=list('abcde'),
)
# Knock out two cells in Wes's row (columns 'b' and 'd') so the later
# aggregations have missing values to deal with.
people.loc['Wes', ['b', 'd']] = np.nan
people

Unnamed: 0,a,b,c,d,e
Joe,0.895551,1.213096,0.412801,-0.13892,0.916721
Steve,-1.663056,1.515942,-0.78187,1.33881,0.297856
Wes,3.006434,,0.068785,,-1.553862
Jim,0.764662,-0.50558,-1.350002,-0.428992,-1.005325
Travis,-0.121403,-0.762929,0.35416,-0.027214,0.255774


# ex.7

In [23]:
# Column label -> colour.  'f' is deliberately included even though the
# people frame has no such column.
mapping = dict(zip(
    'a b c d e f'.split(),
    'red red blue blue red orange'.split(),
))
mapping

{'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red', 'f': 'orange'}

In [24]:
# Sum the columns of people per colour.
# ``groupby(..., axis=1)`` is deprecated (pandas 2.1) and removed in pandas
# 3.0, so group the transposed frame by its index (the original column labels)
# and transpose the aggregate back.  Mapping keys that match no column
# (here 'f') are ignored.
by_col = people.T.groupby(mapping)
by_col.sum().T

Unnamed: 0,blue,red
Joe,0.273881,3.025368
Steve,0.556939,0.150742
Wes,0.068785,1.452572
Jim,-1.778994,-0.746243
Travis,0.326946,-0.628558


# ex.8

In [25]:
# The same label -> colour mapping as a Series, which is also accepted as a
# group key.
map_series = pd.Series(mapping)
map_series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [26]:
# Count the non-missing values per colour for every person.
# ``groupby(..., axis=1)`` is deprecated (pandas 2.1) and removed in pandas
# 3.0; grouping the transpose by its index and transposing back gives the same
# table.
people.T.groupby(map_series).count().T

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Travis,2,3


# ex.9

In [27]:
# A function used as the key is applied to each index label, so rows are
# grouped by the length of the person's name.
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,4.666648,0.707516,-0.868416,-0.567913,-1.642466
5,-1.663056,1.515942,-0.78187,1.33881,0.297856
6,-0.121403,-0.762929,0.35416,-0.027214,0.255774


Any function passed as a group key will be called once per index value, with the return values being used as the group names.

# ex.10

In [28]:
# Functions and plain lists can be mixed as group keys: name length forms the
# first level of the result, the entries of key_list the second.
key_list = ['one', 'one', 'one', 'two', 'two']
people.groupby([len, key_list]).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,0.895551,1.213096,0.068785,-0.13892,-1.553862
3,two,0.764662,-0.50558,-1.350002,-0.428992,-1.005325
5,one,-1.663056,1.515942,-0.78187,1.33881,0.297856
6,two,-0.121403,-0.762929,0.35416,-0.027214,0.255774


# ex.11

In [29]:
# Two-level column index: 'city' on the outer level, 'tenor' on the inner one.
columns = pd.MultiIndex.from_arrays(
    [
        ['US', 'US', 'US', 'JP', 'JP'],
        [1, 3, 5, 1, 3],
    ],
    names=['city', 'tenor'],
)

In [30]:
# Four rows of standard-normal values under the two-level (city, tenor)
# column index.
hier_df = pd.DataFrame(np.random.randn(4, 5), columns=columns)
hier_df

city,US,US,US,JP,JP
tenor,1,3,5,1,3
0,0.537685,-0.37541,0.172259,1.230566,0.556403
1,0.896546,0.154765,0.972543,0.771345,-0.858178
2,1.338857,0.802659,-1.04813,-1.398645,-1.353674
3,-1.20479,-0.432103,0.371022,0.21208,-1.040149


In [31]:
# Count the columns per 'city' level for every row.
# ``groupby(..., axis=1)`` is deprecated (pandas 2.1) and removed in pandas
# 3.0; group the transposed frame on its 'city' index level and transpose the
# counts back.
hier_df.T.groupby(level='city').count().T

city,JP,US
0,2,3
1,2,3
2,2,3
3,2,3


# ex.12

In [32]:
# Display the keyed frame (key1, key2, data1, data2) again for reference.
df

Unnamed: 0,data1,data2,key1,key2
0,0.428534,0.352563,a,one
1,0.270488,-0.949608,a,two
2,-0.499462,-0.655156,b,one
3,-0.082271,-1.034241,b,two
4,1.266016,0.646985,a,one


In [33]:
# 0.9 quantile of data1 within each key1 group.
g = df.groupby('key1')
g['data1'].quantile(0.9)

key1
a    1.09852
b   -0.12399
Name: data1, dtype: float64

# ex.13

In [35]:
# Rebind df to a 10x10 integer frame in which every column (labelled 0..9)
# holds 0..9.  The frame is built in one shot from a dict rather than grown
# column by column, which avoids repeated insertions into an empty frame.
df = pd.DataFrame({col: np.arange(10) for col in range(10)})

# Positional selection: rows 4 and 8, columns at positions 2 through 6.
df.iloc[[4, 8], 2:7]

Unnamed: 0,2,3,4,5,6
4,4,4,4,4,4
8,8,8,8,8,8
