# The `pandas` Groupby

I'm just going through the examples in the book here. Nothing groundbreaking or anything.

In [1]:
# The maths, graphs, stats and style libs

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy import stats
import matplotlib.style as mplstyle
%matplotlib inline
mplstyle.use('fivethirtyeight')

In [2]:
df = pd.DataFrame({
    'key1': 'a a b b a'.split(),
    'key2': 'one two one two one'.split(),
    'data1': np.random.chisquare(100, 5),
    'data2': np.random.chisquare(100, 5)
})

In [3]:
df

Unnamed: 0,data1,data2,key1,key2
0,98.073066,104.17562,a,one
1,91.486775,88.745638,a,two
2,119.443616,81.841537,b,one
3,96.612368,103.424475,b,two
4,110.714988,97.652918,a,one


In [4]:
g = df['data1'].groupby(df['key1'])

In [5]:
g

<pandas.core.groupby.SeriesGroupBy object at 0x7fb90bcee438>

In [6]:
g.mean()

key1
a    100.091610
b    108.027992
Name: data1, dtype: float64

In [7]:
g.std()

key1
a     9.771742
b    16.144130
Name: data1, dtype: float64

## Multiple layers of grouping?

In [8]:
m = df['data1'].groupby([df['key1'], df['key2']])

In [9]:
m.median()

key1  key2
a     one     104.394027
      two      91.486775
b     one     119.443616
      two      96.612368
Name: data1, dtype: float64

In this summary we have the word 'one' appearing twice. Same with the word 'two'. That is visually inefficient because we have this stack of ones and twos there and we can't quickly compare side by side...

## And check this out...

In [10]:
m.mean().unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,104.394027,91.486775
b,119.443616,96.612368


Naturally this would only work nicely with two dimensions. I wonder what happens with three.

In [11]:
df2 = pd.DataFrame({
    'key1': 'a a b b a'.split(),
    'key2': 'one two one two one'.split(),
    'key3': 'fee fi foe foe fum'.split(),
    'data1': np.random.chisquare(100, 5),
    'data2': np.random.chisquare(100, 5),
    'data3': np.random.chisquare(100, 5)
})

In [12]:
df2

Unnamed: 0,data1,data2,data3,key1,key2,key3
0,111.382896,91.022385,114.365499,a,one,fee
1,116.187856,104.051808,75.837945,a,two,fi
2,124.62343,98.036837,124.448139,b,one,foe
3,96.88191,79.842177,90.42365,b,two,foe
4,122.676174,111.990521,97.545838,a,one,fum


In [13]:
t = df2['data1'].groupby([df2['key1'], df2['key2'], df2['key3']])

In [14]:
t.mean()

key1  key2  key3
a     one   fee     111.382896
            fum     122.676174
      two   fi      116.187856
b     one   foe     124.623430
      two   foe      96.881910
Name: data1, dtype: float64

In [15]:
t.mean().unstack()

Unnamed: 0_level_0,key3,fee,fi,foe,fum
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
a,one,111.382896,,,122.676174
a,two,,116.187856,,
b,one,,,124.62343,
b,two,,,96.88191,


Well I'll be damned it still behaves nicely. But still doesn't work as well as the two dimensional example.

## Group keys

They don't have to be part of the dataframe. They just have to be arrays of the right length.

In [16]:
states = np.array('Ohio California California Ohio Ohio'.split())

In [17]:
years = np.array([2005, 2005, 2006, 2005, 2006])

In [18]:
df['data1'].groupby([states, years]).mean()

California  2005     91.486775
            2006    119.443616
Ohio        2005     97.342717
            2006    110.714988
Name: data1, dtype: float64

Wow. I'm amazed. This is too easy.

In [19]:
# But if they are part of the dataframe, there is a shortcut

df.groupby('key1').mean()

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,100.09161,96.858059
b,108.027992,92.633006


In [20]:
df.groupby(['key1', 'key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,104.394027,100.914269
a,two,91.486775,88.745638
b,one,119.443616,81.841537
b,two,96.612368,103.424475


In [21]:
# And a useful aggregator is 

df.groupby(['key1', 'key2']).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

## Iterating over groups

In [22]:
# With a single group key

for name, group in df.groupby('key1'):
    print(name)
    print(group.std())

a
data1    9.771742
data2    7.745640
dtype: float64
b
data1    16.144130
data2    15.261442
dtype: float64


In [23]:
# With multiple group keys, the first element is always a tuple

for (k1, k2), group in df.groupby(['key1', 'key2']):
    print((k1, k2))
    print(group.mean(), '\n')

('a', 'one')
data1    104.394027
data2    100.914269
dtype: float64 

('a', 'two')
data1    91.486775
data2    88.745638
dtype: float64 

('b', 'one')
data1    119.443616
data2     81.841537
dtype: float64 

('b', 'two')
data1     96.612368
data2    103.424475
dtype: float64 



### Nice recipe here

In [24]:
pieces = dict(list(df.groupby('key1')))

In [25]:
pieces['b']

Unnamed: 0,data1,data2,key1,key2
2,119.443616,81.841537,b,one
3,96.612368,103.424475,b,two


In [26]:
df

Unnamed: 0,data1,data2,key1,key2
0,98.073066,104.17562,a,one
1,91.486775,88.745638,a,two
2,119.443616,81.841537,b,one
3,96.612368,103.424475,b,two
4,110.714988,97.652918,a,one


In [27]:
df.dtypes

data1    float64
data2    float64
key1      object
key2      object
dtype: object

In [28]:
g = df.groupby(df.dtypes, axis=1)

In [29]:
for dtype, group in g:
    print(dtype)
    print(group, '\n')

float64
        data1       data2
0   98.073066  104.175620
1   91.486775   88.745638
2  119.443616   81.841537
3   96.612368  103.424475
4  110.714988   97.652918 

object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one 



## Selecting a column or subset of columns

In [30]:
# This,

a = df.groupby('key1')['data1']
a

<pandas.core.groupby.SeriesGroupBy object at 0x7fb90bc3fa20>

In [31]:
# is the same as this

b = df['data1'].groupby(df['key1'])
b

<pandas.core.groupby.SeriesGroupBy object at 0x7fb90bc3f438>

In [32]:
# check it

print(a.mean(), '\n')
print(b.mean())

key1
a    100.091610
b    108.027992
Name: data1, dtype: float64 

key1
a    100.091610
b    108.027992
Name: data1, dtype: float64


In [33]:
# Getting fancy with it

df.groupby(['key1', 'key2'])[['data2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,100.914269
a,two,88.745638
b,one,81.841537
b,two,103.424475


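The result is a `pd.DataFrame` when the columns are selected with a list (even a one-element list such as `[['data2']]` above). When a single column name is passed on its own, as in `['data2']` below, the result is a Series.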

In [34]:
s_grouped = df.groupby(['key1', 'key2'])['data2']

s_grouped

<pandas.core.groupby.SeriesGroupBy object at 0x7fb90bcad400>

In [35]:
s_grouped.mean()

key1  key2
a     one     100.914269
      two      88.745638
b     one      81.841537
      two     103.424475
Name: data2, dtype: float64

## Grouping with Dicts and Series

You can create a mapping of columns. Maybe a few columns are similar and should be aggregated together, but you need something to aggregate them by. A dictionary works for that. And because this is a way of grouping columns, it makes sense that we use `axis=1`.

In [36]:
people = pd.DataFrame(np.random.randn(5, 5),
                     columns='a b c d e'.split(),
                     index='Joe Steve Wes Jim Travis'.split())
people

Unnamed: 0,a,b,c,d,e
Joe,0.625,0.955645,0.733409,0.789165,0.894043
Steve,-0.738202,-0.663504,0.222396,-0.69256,1.067821
Wes,-2.560253,-0.305034,0.552343,-0.455979,-0.071548
Jim,-0.215582,-0.049461,-1.351012,0.733767,0.967355
Travis,2.148819,-0.720668,-0.178876,-0.494203,0.917753


In [37]:
people.iloc[2:3, [1, 2]] = np.nan

people

Unnamed: 0,a,b,c,d,e
Joe,0.625,0.955645,0.733409,0.789165,0.894043
Steve,-0.738202,-0.663504,0.222396,-0.69256,1.067821
Wes,-2.560253,,,-0.455979,-0.071548
Jim,-0.215582,-0.049461,-1.351012,0.733767,0.967355
Travis,2.148819,-0.720668,-0.178876,-0.494203,0.917753


In [38]:
mapping = {
    'a': 'red',
    'b': 'red',
    'c': 'blue',
    'd': 'blue',
    'e': 'red',
    'f': 'orange'
}

In [39]:
by_col = people.groupby(mapping, axis=1)

In [40]:
by_col.sum()

Unnamed: 0,blue,red
Joe,1.522574,2.474687
Steve,-0.470164,-0.333885
Wes,-0.455979,-2.631802
Jim,-0.617245,0.702313
Travis,-0.673079,2.345905


In [41]:
map_series = pd.Series(mapping)
map_series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [42]:
people.groupby(map_series, axis=1).count()

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Travis,2,3


## Grouping with functions

Ok what??? This is black magic.

In [43]:
people.index

Index(['Joe', 'Steve', 'Wes', 'Jim', 'Travis'], dtype='object')

In [44]:
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,-2.150835,0.906183,-0.617603,1.066954,1.78985
5,-0.738202,-0.663504,0.222396,-0.69256,1.067821
6,2.148819,-0.720668,-0.178876,-0.494203,0.917753


In [45]:
key_list = 'one one one two two'.split()
key_list

['one', 'one', 'one', 'two', 'two']

Mix and match:

In [46]:
people.groupby([len, key_list]).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,-2.560253,0.955645,0.733409,-0.455979,-0.071548
3,two,-0.215582,-0.049461,-1.351012,0.733767,0.967355
5,one,-0.738202,-0.663504,0.222396,-0.69256,1.067821
6,two,2.148819,-0.720668,-0.178876,-0.494203,0.917753


## Grouping by index levels

In [47]:
cols = pd.MultiIndex.from_arrays(['US US US JP JP'.split(),
                                  [1, 3, 5, 1, 3]],
                                names=['city', 'tenor'])

In [48]:
hier_df = pd.DataFrame(np.random.randn(4, 5), columns=cols)

hier_df

city,US,US,US,JP,JP
tenor,1,3,5,1,3
0,0.393396,0.04017,0.788026,0.018686,-0.091766
1,0.634656,1.928694,-0.128317,-0.021943,-0.041879
2,-0.261851,-0.804695,-0.072753,0.213046,-0.064592
3,0.185771,-0.135472,-0.780823,-0.295686,-1.382279


In [49]:
hier_df.groupby(level='city', axis=1).min()

city,JP,US
0,-0.091766,0.04017
1,-0.041879,-0.128317
2,-0.064592,-0.804695
3,-1.382279,-0.780823


Here we've created a column index with two layers. We named one layer `city` and the other layer `tenor`. Those are the names we use to refer to those layers, as the `groupby` statement above shows with `level='city'`.

## Data aggregation

In [50]:
# Quantile is available for Series objects, thus also available for groupby objects

df

Unnamed: 0,data1,data2,key1,key2
0,98.073066,104.17562,a,one
1,91.486775,88.745638,a,two
2,119.443616,81.841537,b,one
3,96.612368,103.424475,b,two
4,110.714988,97.652918,a,one


In [51]:
g = df.groupby('key1')

g['data1'].quantile(0.9)

key1
a    108.186604
b    117.160492
Name: data1, dtype: float64

### DIY aggregation with the `agg` method

Just write a function that aggregates arrays, then pass it to the grouped object's `agg` method.

In [52]:
def peak_to_peak(arr):
    """Return the spread of ``arr``: its largest value minus its smallest.

    ``arr`` only needs to provide ``min()`` and ``max()`` methods.
    """
    lowest = arr.min()
    highest = arr.max()
    return highest - lowest

In [53]:
g.agg(peak_to_peak)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,19.228213,15.429982
b,22.831248,21.582938


### Other methods

In [54]:
g.describe()

Unnamed: 0_level_0,data1,data1,data1,data1,data1,data1,data1,data1,data2,data2,data2,data2,data2,data2,data2,data2
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
a,3.0,100.09161,9.771742,91.486775,94.77992,98.073066,104.394027,110.714988,3.0,96.858059,7.74564,88.745638,93.199278,97.652918,100.914269,104.17562
b,2.0,108.027992,16.14413,96.612368,102.32018,108.027992,113.735804,119.443616,2.0,92.633006,15.261442,81.841537,87.237272,92.633006,98.028741,103.424475


`describe` is not an aggregation function. But it still works.

## Column-wise and multiple function application

Here we use the `tips.csv` dataset provided by Wes on the GitHub for the book.

In [55]:
tips = pd.read_csv('data/tips.csv')

tips.head()

Unnamed: 0,total_bill,tip,smoker,day,time,size
0,16.99,1.01,No,Sun,Dinner,2
1,10.34,1.66,No,Sun,Dinner,3
2,21.01,3.5,No,Sun,Dinner,3
3,23.68,3.31,No,Sun,Dinner,2
4,24.59,3.61,No,Sun,Dinner,4


In [56]:
tips['tip_pct'] = tips['tip'] / tips['total_bill']

tips.head(6)

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
0,16.99,1.01,No,Sun,Dinner,2,0.059447
1,10.34,1.66,No,Sun,Dinner,3,0.160542
2,21.01,3.5,No,Sun,Dinner,3,0.166587
3,23.68,3.31,No,Sun,Dinner,2,0.13978
4,24.59,3.61,No,Sun,Dinner,4,0.146808
5,25.29,4.71,No,Sun,Dinner,4,0.18624


In [57]:
g = tips.groupby(['day', 'smoker'])

In [58]:
g_pct = g['tip_pct']

In [59]:
g_pct.agg('mean')

day   smoker
Fri   No        0.151650
      Yes       0.174783
Sat   No        0.158048
      Yes       0.147906
Sun   No        0.160113
      Yes       0.187250
Thur  No        0.160298
      Yes       0.163863
Name: tip_pct, dtype: float64

This is black magic. I swear it's too easy!! I'm not doing any work here!

In [60]:
g_pct.agg(['mean', 'std', peak_to_peak])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,peak_to_peak
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,No,0.15165,0.028123,0.067349
Fri,Yes,0.174783,0.051293,0.159925
Sat,No,0.158048,0.039767,0.235193
Sat,Yes,0.147906,0.061375,0.290095
Sun,No,0.160113,0.042347,0.193226
Sun,Yes,0.18725,0.154134,0.644685
Thur,No,0.160298,0.038774,0.19335
Thur,Yes,0.163863,0.039389,0.15124


But maybe you want different names for the columns?

In [61]:
# You can pass a list of ('name', func) tuples to name the result columns

g_pct.agg([('Average', 'mean'), ('Std. Dev', 'std'), ('Range', peak_to_peak)])

Unnamed: 0_level_0,Unnamed: 1_level_0,Average,Std. Dev,Range
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,No,0.15165,0.028123,0.067349
Fri,Yes,0.174783,0.051293,0.159925
Sat,No,0.158048,0.039767,0.235193
Sat,Yes,0.147906,0.061375,0.290095
Sun,No,0.160113,0.042347,0.193226
Sun,Yes,0.18725,0.154134,0.644685
Thur,No,0.160298,0.038774,0.19335
Thur,Yes,0.163863,0.039389,0.15124


In [62]:
funcs = 'count mean max'.split()
funcs

['count', 'mean', 'max']

In [63]:
# Select the two columns with a list. Indexing a groupby with a bare tuple of
# names (g['a', 'b']) has been deprecated since pandas 1.0 and is rejected by
# pandas 2.0; the list form works on every version.
result = g[['tip_pct', 'total_bill']].agg(funcs)
result

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,total_bill,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,max,count,mean,max
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Fri,No,4,0.15165,0.187735,4,18.42,22.75
Fri,Yes,15,0.174783,0.26348,15,16.813333,40.17
Sat,No,45,0.158048,0.29199,45,19.661778,48.33
Sat,Yes,42,0.147906,0.325733,42,21.276667,50.81
Sun,No,57,0.160113,0.252672,57,20.506667,48.17
Sun,Yes,19,0.18725,0.710345,19,24.12,45.35
Thur,No,45,0.160298,0.266312,45,17.113111,41.19
Thur,Yes,17,0.163863,0.241255,17,19.190588,43.11


I swear that's just black magic. Really? All that as a one liner? That line is selecting just two columns from the original dataset. Then it is running three aggregation functions on each of them. And it gives you detail on day of the week and smoker/non-smoker?

Ok maybe that took three lines.

1. Group
1. List of functions
1. Aggregation

But still. Nice.

In [64]:
ftuples = [('Durchschnitt', 'mean'), ('Abweichung', np.var)]
ftuples

[('Durchschnitt', 'mean'),
 ('Abweichung', <function numpy.core.fromnumeric.var>)]

In [65]:
# Select the two columns with a list. Indexing a groupby with a bare tuple of
# names (g['a', 'b']) has been deprecated since pandas 1.0 and is rejected by
# pandas 2.0; the list form works on every version.
result = g[['tip_pct', 'total_bill']].agg(ftuples)

result

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,Durchschnitt,Abweichung,Durchschnitt,Abweichung
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Fri,No,0.15165,0.000791,18.42,25.596333
Fri,Yes,0.174783,0.002631,16.813333,82.562438
Sat,No,0.158048,0.001581,19.661778,79.908965
Sat,Yes,0.147906,0.003767,21.276667,101.387535
Sun,No,0.160113,0.001793,20.506667,66.09998
Sun,Yes,0.18725,0.023757,24.12,109.046044
Thur,No,0.160298,0.001503,17.113111,59.625081
Thur,Yes,0.163863,0.001551,19.190588,69.808518


In [66]:
result['tip_pct']

Unnamed: 0_level_0,Unnamed: 1_level_0,Durchschnitt,Abweichung
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,No,0.15165,0.000791
Fri,Yes,0.174783,0.002631
Sat,No,0.158048,0.001581
Sat,Yes,0.147906,0.003767
Sun,No,0.160113,0.001793
Sun,Yes,0.18725,0.023757
Thur,No,0.160298,0.001503
Thur,Yes,0.163863,0.001551


### What happens with a `dict`?

In [67]:
g.agg({
    'tip': np.max,
    'size': 'sum'
})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip,size
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,No,3.5,9
Fri,Yes,4.73,31
Sat,No,9.0,115
Sat,Yes,10.0,104
Sun,No,6.0,167
Sun,Yes,6.5,49
Thur,No,6.7,112
Thur,Yes,5.0,40


In [68]:
g.agg({
    'tip_pct': 'min max mean std'.split(),
    'size': 'sum'
})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,tip_pct,size
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,std,sum
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Fri,No,0.120385,0.187735,0.15165,0.028123,9
Fri,Yes,0.103555,0.26348,0.174783,0.051293,31
Sat,No,0.056797,0.29199,0.158048,0.039767,115
Sat,Yes,0.035638,0.325733,0.147906,0.061375,104
Sun,No,0.059447,0.252672,0.160113,0.042347,167
Sun,Yes,0.06566,0.710345,0.18725,0.154134,49
Thur,No,0.072961,0.266312,0.160298,0.038774,112
Thur,Yes,0.090014,0.241255,0.163863,0.039389,40


### Return data with non-hierarchical index

Sometimes the index doesn't need to be fancy.

In [69]:
tips.groupby(['day', 'smoker'], as_index=False).mean()

Unnamed: 0,day,smoker,total_bill,tip,size,tip_pct
0,Fri,No,18.42,2.8125,2.25,0.15165
1,Fri,Yes,16.813333,2.714,2.066667,0.174783
2,Sat,No,19.661778,3.102889,2.555556,0.158048
3,Sat,Yes,21.276667,2.875476,2.47619,0.147906
4,Sun,No,20.506667,3.167895,2.929825,0.160113
5,Sun,Yes,24.12,3.516842,2.578947,0.18725
6,Thur,No,17.113111,2.673778,2.488889,0.160298
7,Thur,Yes,19.190588,3.03,2.352941,0.163863


## Apply: General split-apply-combine

In [70]:
# Top five values by group

def top(df, n=5, column='tip_pct'):
    """Return the ``n`` rows of ``df`` with the largest values in ``column``.

    Parameters
    ----------
    df : DataFrame to pick rows from (a whole frame or one group of it).
    n : number of rows to keep; ``0`` yields an empty frame.
    column : name of the column to rank by.

    Returns
    -------
    DataFrame sorted ascending by ``column``, so the largest value is last.
    """
    # Use tail(n) rather than the slice [-n:]: [-0:] is [0:], so the slice
    # would hand back every row when n == 0 instead of none.
    return df.sort_values(by=column).tail(n)

In [71]:
top(tips, n=6)

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
109,14.31,4.0,Yes,Sat,Dinner,2,0.279525
183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
232,11.61,3.39,No,Sat,Dinner,2,0.29199
67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


### Top `n` rows by group using `apply`

In [72]:
tips.groupby('smoker').apply(top)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,88,24.71,5.85,No,Thur,Lunch,2,0.236746
No,185,20.69,5.0,No,Sun,Dinner,5,0.241663
No,51,10.29,2.6,No,Sun,Dinner,2,0.252672
No,149,7.51,2.0,No,Thur,Lunch,2,0.266312
No,232,11.61,3.39,No,Sat,Dinner,2,0.29199
Yes,109,14.31,4.0,Yes,Sat,Dinner,2,0.279525
Yes,183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
Yes,67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
Yes,178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
Yes,172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


In [73]:
# With args

tips.groupby(['smoker', 'day']).apply(top, n=1, column='total_bill')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
No,Fri,94,22.75,3.25,No,Fri,Dinner,2,0.142857
No,Sat,212,48.33,9.0,No,Sat,Dinner,4,0.18622
No,Sun,156,48.17,5.0,No,Sun,Dinner,6,0.103799
No,Thur,142,41.19,5.0,No,Thur,Lunch,5,0.121389
Yes,Fri,95,40.17,4.73,Yes,Fri,Dinner,4,0.11775
Yes,Sat,170,50.81,10.0,Yes,Sat,Dinner,3,0.196812
Yes,Sun,182,45.35,3.5,Yes,Sun,Dinner,3,0.077178
Yes,Thur,197,43.11,5.0,Yes,Thur,Lunch,4,0.115982


## Examples

### Describe by group

In [74]:
result = tips.groupby('smoker')['tip_pct'].describe()
result

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,151.0,0.159328,0.03991,0.056797,0.136906,0.155625,0.185014,0.29199
Yes,93.0,0.163196,0.085119,0.035638,0.106771,0.153846,0.195059,0.710345


In [75]:
result.unstack('smoker')

       smoker
count  No        151.000000
       Yes        93.000000
mean   No          0.159328
       Yes         0.163196
std    No          0.039910
       Yes         0.085119
min    No          0.056797
       Yes         0.035638
25%    No          0.136906
       Yes         0.106771
50%    No          0.155625
       Yes         0.153846
75%    No          0.185014
       Yes         0.195059
max    No          0.291990
       Yes         0.710345
dtype: float64

### Suppressing the group keys

In [76]:
tips.groupby('smoker', group_keys=False).apply(top)

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
88,24.71,5.85,No,Thur,Lunch,2,0.236746
185,20.69,5.0,No,Sun,Dinner,5,0.241663
51,10.29,2.6,No,Sun,Dinner,2,0.252672
149,7.51,2.0,No,Thur,Lunch,2,0.266312
232,11.61,3.39,No,Sat,Dinner,2,0.29199
109,14.31,4.0,Yes,Sat,Dinner,2,0.279525
183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


### Quantile and bucket analysis

In [77]:
frame = pd.DataFrame({
    'data1': np.random.randn(1000),
    'data2': np.random.randn(1000)
})
frame.head()

Unnamed: 0,data1,data2
0,2.481046,0.81874
1,0.506036,-0.162651
2,1.549301,-0.331843
3,0.672003,0.944377
4,0.109757,-0.648359


In [78]:
quartiles = pd.cut(frame.data1, 4)
quartiles[:10]

0    (1.979, 3.723]
1    (0.235, 1.979]
2    (0.235, 1.979]
3    (0.235, 1.979]
4    (-1.51, 0.235]
5    (-1.51, 0.235]
6    (0.235, 1.979]
7    (-1.51, 0.235]
8    (0.235, 1.979]
9    (-1.51, 0.235]
Name: data1, dtype: category
Categories (4, interval[float64]): [(-3.261, -1.51] < (-1.51, 0.235] < (0.235, 1.979] < (1.979, 3.723]]

In [79]:
def get_stats(group):
    """Summarise ``group`` as a dict keyed by 'min', 'max', 'count' and 'mean'.

    Each value is the result of calling the method of the same name on
    ``group``.
    """
    stat_names = ('min', 'max', 'count', 'mean')
    return {stat: getattr(group, stat)() for stat in stat_names}

In [80]:
g = frame.data2.groupby(quartiles)

In [81]:
g.apply(get_stats).unstack()

Unnamed: 0_level_0,count,max,mean,min
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(-3.261, -1.51]",66.0,2.298369,-0.205017,-2.442728
"(-1.51, 0.235]",512.0,2.720298,0.038238,-2.818363
"(0.235, 1.979]",393.0,3.106901,0.05687,-2.859846
"(1.979, 3.723]",29.0,2.34149,0.042806,-1.800225


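Above are equal-length buckets (`pd.cut` splits the value range into intervals of the same width). Below are equal-size buckets (`pd.qcut` puts the same number of rows in each).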

In [82]:
quantiles = pd.qcut(frame.data1, 10, labels=False)

In [83]:
g2 = frame.data2.groupby(quantiles)

In [84]:
g2.apply(get_stats).unstack()

Unnamed: 0_level_0,count,max,mean,min
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,100.0,2.298369,-0.216447,-2.442728
1,100.0,2.234411,-0.133343,-2.705703
2,100.0,2.279797,0.029806,-2.818363
3,100.0,2.392729,0.013145,-2.526324
4,100.0,2.720298,0.213209,-1.660718
5,100.0,2.591239,0.190432,-1.92427
6,100.0,2.995725,-0.122513,-2.859846
7,100.0,2.82889,0.097003,-2.500367
8,100.0,3.106901,0.066778,-2.302912
9,100.0,2.34149,0.158309,-2.074274


### Fill missing values with group specific values

In [85]:
s = pd.Series(np.random.randn(6))
s[::2] = np.nan
s

0         NaN
1    1.422159
2         NaN
3    1.430462
4         NaN
5   -0.547534
dtype: float64

In [86]:
s.fillna(s.mean())

0    0.768362
1    1.422159
2    0.768362
3    1.430462
4    0.768362
5   -0.547534
dtype: float64

In [87]:
states = 'Ohio NewYork Vermont Florida Oregon Nevada California Idaho'.split()
states[1] = 'New York'
states

['Ohio',
 'New York',
 'Vermont',
 'Florida',
 'Oregon',
 'Nevada',
 'California',
 'Idaho']

In [88]:
group_key = ['East'] * 4 + ['West'] * 4
group_key

['East', 'East', 'East', 'East', 'West', 'West', 'West', 'West']

In [89]:
data = pd.Series(np.random.randn(8), index=states)
data

Ohio          0.282326
New York     -1.770923
Vermont      -0.347095
Florida       1.228223
Oregon        0.725817
Nevada        1.404449
California   -0.266275
Idaho         0.232256
dtype: float64

In [90]:
data[['Vermont', 'Nevada', 'Idaho']] = np.nan
data

Ohio          0.282326
New York     -1.770923
Vermont            NaN
Florida       1.228223
Oregon        0.725817
Nevada             NaN
California   -0.266275
Idaho              NaN
dtype: float64

In [91]:
data.groupby(group_key).mean()

East   -0.086791
West    0.229771
dtype: float64

In [92]:
def fill_mean(g):
    """Fill the missing entries of ``g`` with the mean of ``g``."""
    group_mean = g.mean()
    return g.fillna(group_mean)

In [93]:
data.groupby(group_key).apply(fill_mean)

Ohio          0.282326
New York     -1.770923
Vermont      -0.086791
Florida       1.228223
Oregon        0.725817
Nevada        0.229771
California   -0.266275
Idaho         0.229771
dtype: float64

And maybe we just have the fill value hard coded somewhere...

In [94]:
# Hard-coded replacement value for each group label.
fill_values = {'East': 0.5, 'West': -1}


def fill_func(g):
    """Fill the missing entries of ``g`` with the fixed value for its group.

    ``g.name`` is used as the key into ``fill_values``.
    """
    replacement = fill_values[g.name]
    return g.fillna(replacement)

In [95]:
data.groupby(group_key).apply(fill_func)

Ohio          0.282326
New York     -1.770923
Vermont       0.500000
Florida       1.228223
Oregon        0.725817
Nevada       -1.000000
California   -0.266275
Idaho        -1.000000
dtype: float64

### Random sampling and permutation

A French deck with `pandas`. Aka, picking random cards.

In [99]:
# Build the 52-card deck as a Series: index = card label, value = card value.
suits = 'H S C D'.split()
# One suit's values are 1..10 followed by three 10s (for J, Q, K), repeated per suit.
card_val = [*range(1, 11), 10, 10, 10] * 4
base_names = ['A', *range(2, 11), *'J Q K'.split()]
# Labels are rank + suit, grouped suit by suit in the order of `suits`.
cards = [str(num) + suit for suit in suits for num in base_names]
deck = pd.Series(card_val, index=cards)
deck[:13]

AH      1
2H      2
3H      3
4H      4
5H      5
6H      6
7H      7
8H      8
9H      9
10H    10
JH     10
QH     10
KH     10
dtype: int64

In [100]:
def draw(deck, n=5, random_state=None):
    """Draw ``n`` entries at random from ``deck``.

    Parameters
    ----------
    deck : pandas object to sample from (here a Series of card values).
    n : number of entries to draw.
    random_state : optional seed or random state forwarded to ``sample``.
        Leave it as ``None`` for a fresh random draw; pass a fixed value to
        make the draw repeatable.

    Returns
    -------
    The sampled entries, as returned by ``deck.sample``.
    """
    return deck.sample(n, random_state=random_state)

draw(deck)

7D      7
8S      8
10C    10
4D      4
JH     10
dtype: int64

In [108]:
def get_suit(card):
    """Return the suit of a card label, i.e. its last character ('10H' -> 'H')."""
    return card[-1]

deck.groupby(get_suit).apply(draw, n=2)

C  3C     3
   2C     2
D  3D     3
   7D     7
H  QH    10
   4H     4
S  KS    10
   QS    10
dtype: int64

In [109]:
deck.groupby(get_suit, group_keys=False).apply(draw, n=2)

AC     1
9C     9
5D     5
3D     3
4H     4
3H     3
4S     4
QS    10
dtype: int64

## Group weighted average and correlation

In [110]:
df = pd.DataFrame({
    'category': 'a a a a b b b b'.split(),
    'data': np.random.randn(8),
    'weights': np.random.rand(8)
})
df

Unnamed: 0,category,data,weights
0,a,0.681669,0.068617
1,a,0.01426,0.671362
2,a,-0.534757,0.884725
3,a,-1.455699,0.806743
4,b,0.09287,0.533737
5,b,-0.16243,0.38754
6,b,-0.575126,0.507369
7,b,1.693126,0.742446


In [112]:
g = df.groupby('category')

def get_wavg(g):
    """Return the average of ``g['data']`` weighted by ``g['weights']``."""
    values = g['data']
    weights = g['weights']
    return np.average(values, weights=weights)

In [113]:
g.apply(get_wavg)

category
a   -0.654400
b    0.438431
dtype: float64

Left off on page 311