In [1]:
import numpy as np
import pandas as pd

In [2]:
# Seeded generator so the frame is identical on every
# Restart Kernel -> Run All (stored outputs need one refresh after this change).
rng = np.random.default_rng(12345)

# Small demo frame: two categorical key columns plus two columns of
# standard-normal draws, five rows in total.
df = pd.DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],
                   'key2': ['one', 'two', 'one', 'two', 'one'],
                   'data1': rng.standard_normal(5),
                   'data2': rng.standard_normal(5)})

In [3]:
# Bare last expression: display the whole five-row demo frame.
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-0.499259,-0.128157
1,a,two,-1.941311,-0.11642
2,b,one,0.078492,1.07564
3,b,two,0.481779,-0.130795
4,a,one,-1.912944,-0.69045


In [4]:
# Group the data1 values by the labels held in the key1 column. The key is
# passed as a Series rather than a column name.
key1_labels = df['key1']
grouped = df['data1'].groupby(key1_labels)

# Display the resulting SeriesGroupBy object itself (no aggregation yet).
grouped

<pandas.core.groupby.groupby.SeriesGroupBy object at 0x00000216C88D7860>

In [5]:
# Mean of data1 within each key1 group ('a' and 'b').
grouped.mean()

key1
a   -1.451171
b    0.280135
Name: data1, dtype: float64

In [6]:
# Two grouping keys passed as a list of Series: the result is a Series of
# data1 means indexed by the (key1, key2) pairs that occur in the data.
group_keys = [df['key1'], df['key2']]
means = df['data1'].groupby(group_keys).mean()

In [7]:
# Display the data1 means per (key1, key2) pair.
means

key1  key2
a     one    -1.206102
      two    -1.941311
b     one     0.078492
      two     0.481779
Name: data1, dtype: float64

In [8]:
# Group keys do not have to come from the frame: any arrays with one entry
# per row of df (five here) can be used.
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])

# Mean of data1 for each (state, year) combination.
external_keys = [states, years]
df['data1'].groupby(external_keys).mean()

California  2005   -1.941311
            2006    0.078492
Ohio        2005   -0.008740
            2006   -1.912944
Name: data1, dtype: float64

In [9]:
# Number of rows falling into each (key1, key2) group.
df.groupby(['key1', 'key2']).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

In [12]:
# Iterating a groupby yields (group key, sub-frame) pairs; print the key on
# its own line followed by the rows belonging to it.
for name, group in df.groupby('key1'):
    print(name, group, sep='\n')

a
  key1 key2     data1     data2
0    a  one -0.499259 -0.128157
1    a  two -1.941311 -0.116420
4    a  one -1.912944 -0.690450
b
  key1 key2     data1     data2
2    b  one  0.078492  1.075640
3    b  two  0.481779 -0.130795


In [13]:
# With two grouping columns the group key is a tuple, unpacked here into
# k1 / k2. Print the key tuple, then that group's rows.
for (k1, k2), group in df.groupby(['key1', 'key2']):
    print((k1, k2), group, sep='\n')

('a', 'one')
  key1 key2     data1     data2
0    a  one -0.499259 -0.128157
4    a  one -1.912944 -0.690450
('a', 'two')
  key1 key2     data1    data2
1    a  two -1.941311 -0.11642
('b', 'one')
  key1 key2     data1    data2
2    b  one  0.078492  1.07564
('b', 'two')
  key1 key2     data1     data2
3    b  two  0.481779 -0.130795


In [14]:
# Show the demo frame again for reference before the column-selection examples.
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-0.499259,-0.128157
1,a,two,-1.941311,-0.11642
2,b,one,0.078492,1.07564
3,b,two,0.481779,-0.130795
4,a,one,-1.912944,-0.69045


In [15]:
# Selecting a single column from the grouped frame gives a SeriesGroupBy
# for data1, grouped by key1.
df.groupby('key1')['data1']

<pandas.core.groupby.groupby.SeriesGroupBy object at 0x00000216C88F6080>

In [17]:
# Double brackets select data2 as a one-column DataFrame, so the aggregated
# result is a DataFrame (indexed by key1, key2) rather than a Series.
df.groupby(['key1', 'key2'])[['data2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,-0.409304
a,two,-0.11642
b,one,1.07564
b,two,-0.130795


In [18]:
# Locally seeded generator so `people` is identical on every
# Restart Kernel -> Run All (stored outputs need one refresh after this change).
people_rng = np.random.default_rng(2024)

# 5x5 frame of standard-normal draws: rows are labelled by person name,
# columns by the letters 'a'..'e'.
people = pd.DataFrame(people_rng.standard_normal((5, 5)),
                      columns=['a', 'b', 'c', 'd', 'e'],
                      index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
people

Unnamed: 0,a,b,c,d,e
Joe,-0.67797,1.30705,1.374535,-0.488495,0.322755
Steve,0.373143,0.853888,-0.916482,-0.35811,0.57259
Wes,-0.17348,0.691674,0.042524,0.61461,-0.624164
Jim,0.527311,0.243718,1.020112,0.4651,1.009559
Travis,-0.084168,-1.483793,1.452733,-1.158965,1.163507


In [19]:
# Set row position 2 ('Wes') at column positions 1 and 2 ('b', 'c') to NaN.
# This mutates `people` in place, so the frame displayed earlier is stale.
people.iloc[2:3, [1, 2]] = np.nan

In [20]:
# Display `people` again to show the NaN values just inserted.
people

Unnamed: 0,a,b,c,d,e
Joe,-0.67797,1.30705,1.374535,-0.488495,0.322755
Steve,0.373143,0.853888,-0.916482,-0.35811,0.57259
Wes,-0.17348,,,0.61461,-0.624164
Jim,0.527311,0.243718,1.020112,0.4651,1.009559
Travis,-0.084168,-1.483793,1.452733,-1.158965,1.163507


In [21]:
# Column label -> colour group. 'f' is not one of the people columns
# ('a'..'e'); it is included as an extra, unused key.
mapping = {
    'a': 'red',
    'b': 'red',
    'c': 'blue',
    'd': 'blue',
    'e': 'red',
    'f': 'orange',
}

In [22]:
# Sum the columns of `people` within each colour group given by `mapping`.
# DataFrame.groupby(axis=1) is deprecated in pandas 2.1 and rejected in 3.0,
# so group the transposed frame by its row labels (the original column
# labels) and transpose the result back. Mapping keys with no matching
# column (here 'f') are simply unused.
# NOTE: `by_column` is now a groupby over the transposed frame.
by_column = people.T.groupby(mapping)
by_column.sum().T

Unnamed: 0,blue,red
Joe,0.886039,0.951835
Steve,-1.274592,1.79962
Wes,0.61461,-0.797645
Jim,1.485212,1.780588
Travis,0.293769,-0.404454


In [23]:
# The same column -> colour mapping as a Series indexed by column label.
map_series = pd.Series(mapping)
map_series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [24]:
# Count the non-missing values per colour group for each person.
# DataFrame.groupby(axis=1) is deprecated in pandas 2.1 and rejected in 3.0,
# so group the transposed frame by `map_series` and transpose the counts back.
people.T.groupby(map_series).count().T

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Travis,2,3


In [25]:
# Show `people` once more for reference before grouping by functions of the index.
people

Unnamed: 0,a,b,c,d,e
Joe,-0.67797,1.30705,1.374535,-0.488495,0.322755
Steve,0.373143,0.853888,-0.916482,-0.35811,0.57259
Wes,-0.17348,,,0.61461,-0.624164
Jim,0.527311,0.243718,1.020112,0.4651,1.009559
Travis,-0.084168,-1.483793,1.452733,-1.158965,1.163507


In [26]:
# One positional label per row of `people`: the first three rows are 'one',
# the last two are 'two'.
key_list = ['one'] * 3 + ['two'] * 2

# Mixed keys: `len` is applied to each index label (the name length), and
# key_list is matched to rows by position. Take the column-wise minimum
# within each (length, label) group.
people.groupby([len, key_list]).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,-0.67797,1.30705,1.374535,-0.488495,-0.624164
3,two,0.527311,0.243718,1.020112,0.4651,1.009559
5,one,0.373143,0.853888,-0.916482,-0.35811,0.57259
6,two,-0.084168,-1.483793,1.452733,-1.158965,1.163507


In [27]:
# Load the tips dataset. The path is relative to the notebook's working
# directory, so the notebook must be run from its own folder.
tips = pd.read_csv('../examples/tips.csv')

In [28]:
# Tip as a fraction of the total bill (e.g. 1.01 / 16.99 for the first row).
# Adds the column to `tips` in place.
tips['tip_pct'] = tips['tip'] / tips['total_bill']

In [29]:
# Preview the first six rows, including the new tip_pct column.
tips.head(6)

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
0,16.99,1.01,No,Sun,Dinner,2,0.059447
1,10.34,1.66,No,Sun,Dinner,3,0.160542
2,21.01,3.5,No,Sun,Dinner,3,0.166587
3,23.68,3.31,No,Sun,Dinner,2,0.13978
4,24.59,3.61,No,Sun,Dinner,4,0.146808
5,25.29,4.71,No,Sun,Dinner,4,0.18624


In [30]:
# Group the tips rows by (day, smoker).
# NOTE: this rebinds `grouped`, which earlier in the notebook held the
# df['data1'] grouping by key1.
grouped = tips.groupby(['day', 'smoker'])

In [31]:
# Select the tip_pct column from the (day, smoker) grouping, then aggregate
# it by passing the function name as a string to agg.
grouped_pct = grouped['tip_pct']
grouped_pct.agg('mean')

day   smoker
Fri   No        0.151650
      Yes       0.174783
Sat   No        0.158048
      Yes       0.147906
Sun   No        0.160113
      Yes       0.187250
Thur  No        0.160298
      Yes       0.163863
Name: tip_pct, dtype: float64