# The `pandas` Groupby

I'm just going through the examples in the book here. Nothing groundbreaking or anything.

In [1]:
# The maths, graphs, stats and style libs

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy import stats
import matplotlib.style as mplstyle
%matplotlib inline
mplstyle.use('fivethirtyeight')

In [2]:
# Small demo frame: two string key columns plus two numeric data columns.
# NOTE(review): no random seed is set before these chi-square draws, so the
# numbers differ on every fresh run of this cell.
df = pd.DataFrame({
    'key1': 'a a b b a'.split(),
    'key2': 'one two one two one'.split(),
    'data1': np.random.chisquare(100, 5),
    'data2': np.random.chisquare(100, 5)
})

In [3]:
df

Unnamed: 0,data1,data2,key1,key2
0,88.929747,119.198595,a,one
1,105.49544,112.556045,a,two
2,92.350768,74.731556,b,one
3,91.565894,103.066093,b,two
4,93.769236,81.58769,a,one


In [4]:
g = df['data1'].groupby(df['key1'])

In [5]:
g

<pandas.core.groupby.SeriesGroupBy object at 0x7fa35c220518>

In [6]:
g.mean()

key1
a    96.064807
b    91.958331
Name: data1, dtype: float64

In [7]:
g.std()

key1
a    8.518086
b    0.554989
Name: data1, dtype: float64

## Multiple layers of grouping?

In [8]:
m = df['data1'].groupby([df['key1'], df['key2']])

In [9]:
m.median()

key1  key2
a     one      91.349491
      two     105.495440
b     one      92.350768
      two      91.565894
Name: data1, dtype: float64

In this summary we have the word 'one' appearing twice. Same with the word 'two'. That is visually inefficient because we have this stack of ones and twos there and we can't quickly compare side by side...

## And check this out...

In [10]:
m.mean().unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,91.349491,105.49544
b,92.350768,91.565894


Naturally this would only work nicely with two dimensions. I wonder what happens with three.

In [11]:
df2 = pd.DataFrame({
    'key1': 'a a b b a'.split(),
    'key2': 'one two one two one'.split(),
    'key3': 'fee fi foe foe fum'.split(),
    'data1': np.random.chisquare(100, 5),
    'data2': np.random.chisquare(100, 5),
    'data3': np.random.chisquare(100, 5)
})

In [12]:
df2

Unnamed: 0,data1,data2,data3,key1,key2,key3
0,90.213662,103.744094,104.014927,a,one,fee
1,70.967124,125.536304,82.580021,a,two,fi
2,104.530486,97.036733,125.646992,b,one,foe
3,91.550514,75.67487,105.720274,b,two,foe
4,109.021724,120.929474,118.741643,a,one,fum


In [13]:
t = df2['data1'].groupby([df2['key1'], df2['key2'], df2['key3']])

In [14]:
t.mean()

key1  key2  key3
a     one   fee      90.213662
            fum     109.021724
      two   fi       70.967124
b     one   foe     104.530486
      two   foe      91.550514
Name: data1, dtype: float64

In [15]:
t.mean().unstack()

Unnamed: 0_level_0,key3,fee,fi,foe,fum
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
a,one,90.213662,,,109.021724
a,two,,70.967124,,
b,one,,,104.530486,
b,two,,,91.550514,


Well I'll be damned it still behaves nicely. But still doesn't work as well as the two dimensional example.

## Group keys

They don't have to be part of the dataframe. They just have to be arrays of the right length.

In [16]:
states = np.array('Ohio California California Ohio Ohio'.split())

In [17]:
years = np.array([2005, 2005, 2006, 2005, 2006])

In [18]:
df['data1'].groupby([states, years]).mean()

California  2005    105.495440
            2006     92.350768
Ohio        2005     90.247820
            2006     93.769236
Name: data1, dtype: float64

Wow. I'm amazed. This is too easy.

In [19]:
# But if they are part of the dataframe, there is a shortcut.
# numeric_only=True keeps the string column `key2` out of the mean; recent
# pandas raises a TypeError rather than silently dropping such columns.

df.groupby('key1').mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,96.064807,104.447443
b,91.958331,88.898824


In [20]:
df.groupby(['key1', 'key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,91.349491,100.393143
a,two,105.49544,112.556045
b,one,92.350768,74.731556
b,two,91.565894,103.066093


In [21]:
# And a useful aggregator is 

df.groupby(['key1', 'key2']).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

## Iterating over groups

In [22]:
# With a single group key

for name, group in df.groupby('key1'):
    print(name)
    # Each group still carries the string key columns; numeric_only=True
    # restricts the reduction to the numeric ones so it does not fail on them.
    print(group.std(numeric_only=True))

a
data1     8.518086
data2    20.073792
dtype: float64
b
data1     0.554989
data2    20.035543
dtype: float64


In [23]:
# With multiple group keys, the first element is always a tuple

for (k1, k2), group in df.groupby(['key1', 'key2']):
    print((k1, k2))
    # Each group still carries the string key columns; numeric_only=True
    # restricts the reduction to the numeric ones so it does not fail on them.
    print(group.mean(numeric_only=True), '\n')

('a', 'one')
data1     91.349491
data2    100.393143
dtype: float64 

('a', 'two')
data1    105.495440
data2    112.556045
dtype: float64 

('b', 'one')
data1    92.350768
data2    74.731556
dtype: float64 

('b', 'two')
data1     91.565894
data2    103.066093
dtype: float64 



### Nice recipe here

In [24]:
pieces = dict(list(df.groupby('key1')))

In [25]:
pieces['b']

Unnamed: 0,data1,data2,key1,key2
2,92.350768,74.731556,b,one
3,91.565894,103.066093,b,two


In [26]:
df

Unnamed: 0,data1,data2,key1,key2
0,88.929747,119.198595,a,one
1,105.49544,112.556045,a,two
2,92.350768,74.731556,b,one
3,91.565894,103.066093,b,two
4,93.769236,81.58769,a,one


In [27]:
df.dtypes

data1    float64
data2    float64
key1      object
key2      object
dtype: object

In [28]:
g = df.groupby(df.dtypes, axis=1)

In [29]:
for dtype, group in g:
    print(dtype)
    print(group, '\n')

float64
        data1       data2
0   88.929747  119.198595
1  105.495440  112.556045
2   92.350768   74.731556
3   91.565894  103.066093
4   93.769236   81.587690 

object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one 



## Selecting a column or subset of columns

In [30]:
# This,

a = df.groupby('key1')['data1']
a

<pandas.core.groupby.SeriesGroupBy object at 0x7fa35c20b3c8>

In [31]:
# is the same as this

b = df['data1'].groupby(df['key1'])
b

<pandas.core.groupby.SeriesGroupBy object at 0x7fa35c201978>

In [32]:
# check it

print(a.mean(), '\n')
print(b.mean())

key1
a    96.064807
b    91.958331
Name: data1, dtype: float64 

key1
a    96.064807
b    91.958331
Name: data1, dtype: float64


In [33]:
# Getting fancy with it

df.groupby(['key1', 'key2'])[['data2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,100.393143
a,two,112.556045
b,one,74.731556
b,two,103.066093


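The result is a `pd.DataFrame` when the columns are selected with a list, even a one-element list such as `[['data2']]`. Selecting a single column by its bare name, as in `['data2']`, gives a `Series` instead.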

In [34]:
s_grouped = df.groupby(['key1', 'key2'])['data2']

s_grouped

<pandas.core.groupby.SeriesGroupBy object at 0x7fa35c2011d0>

In [35]:
s_grouped.mean()

key1  key2
a     one     100.393143
      two     112.556045
b     one      74.731556
      two     103.066093
Name: data2, dtype: float64

## Grouping with Dicts and Series

You can create a mapping of columns. Maybe a few columns are similar and should be aggregated together, but you need something to aggregate them by. You can use a dictionary for that. And because this is a way of grouping columns, it makes sense that we use `axis=1`.

In [36]:
people = pd.DataFrame(np.random.randn(5, 5),
                     columns='a b c d e'.split(),
                     index='Joe Steve Wes Jim Travis'.split())
people

Unnamed: 0,a,b,c,d,e
Joe,-0.702408,0.549243,-0.128486,0.445961,0.013229
Steve,0.277814,-0.500106,-0.755237,-0.208514,0.805241
Wes,0.832879,-0.369921,-0.764429,-2.323744,-0.162126
Jim,-0.705547,-2.762352,-1.271556,2.428349,0.205313
Travis,-0.898226,0.471578,0.534362,-0.707398,0.006639


In [37]:
people.iloc[2:3, [1, 2]] = np.nan

people

Unnamed: 0,a,b,c,d,e
Joe,-0.702408,0.549243,-0.128486,0.445961,0.013229
Steve,0.277814,-0.500106,-0.755237,-0.208514,0.805241
Wes,0.832879,,,-2.323744,-0.162126
Jim,-0.705547,-2.762352,-1.271556,2.428349,0.205313
Travis,-0.898226,0.471578,0.534362,-0.707398,0.006639


In [38]:
mapping = {
    'a': 'red',
    'b': 'red',
    'c': 'blue',
    'd': 'blue',
    'e': 'red',
    'f': 'orange'
}

In [39]:
by_col = people.groupby(mapping, axis=1)

In [40]:
by_col.sum()

Unnamed: 0,blue,red
Joe,0.317475,-0.139936
Steve,-0.963751,0.58295
Wes,-2.323744,0.670753
Jim,1.156793,-3.262586
Travis,-0.173036,-0.420009


In [41]:
map_series = pd.Series(mapping)
map_series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [42]:
people.groupby(map_series, axis=1).count()

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Travis,2,3


## Grouping with functions

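Ok what??? This is black magic. When the group key is a function, pandas calls it once per index label and groups by the return value. Here `len` groups the rows by the length of each person's name.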

In [44]:
people.index

Index(['Joe', 'Steve', 'Wes', 'Jim', 'Travis'], dtype='object')

In [43]:
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,-0.575077,-2.213109,-1.400041,0.550565,0.056416
5,0.277814,-0.500106,-0.755237,-0.208514,0.805241
6,-0.898226,0.471578,0.534362,-0.707398,0.006639


In [45]:
key_list = 'one one one two two'.split()
key_list

['one', 'one', 'one', 'two', 'two']

Mix and match:

In [46]:
people.groupby([len, key_list]).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,-0.702408,0.549243,-0.128486,-2.323744,-0.162126
3,two,-0.705547,-2.762352,-1.271556,2.428349,0.205313
5,one,0.277814,-0.500106,-0.755237,-0.208514,0.805241
6,two,-0.898226,0.471578,0.534362,-0.707398,0.006639


## Grouping by index levels

In [51]:
cols = pd.MultiIndex.from_arrays(['US US US JP JP'.split(),
                                  [1, 3, 5, 1, 3]],
                                names=['city', 'tenor'])

In [52]:
hier_df = pd.DataFrame(np.random.randn(4, 5), columns=cols)

hier_df

city,US,US,US,JP,JP
tenor,1,3,5,1,3
0,-0.160615,1.629688,0.875059,-0.976987,1.773561
1,-0.030164,0.157077,-0.207484,0.324323,-1.660056
2,-1.030605,0.663487,0.244933,-0.246046,-0.081602
3,0.561404,-1.095277,0.157359,-2.13683,0.075684


In [56]:
hier_df.groupby(level='city', axis=1).min()

city,JP,US
0,-0.976987,-0.160615
1,-1.660056,-0.207484
2,-0.246046,-1.030605
3,-2.13683,-1.095277


Here we've created a column index with two levels. We named one level `city` and the other `tenor`, and those are the names we use to refer to them. The `groupby` statement shows how this is done: `level='city'` picks the level and `axis=1` says the levels live on the columns.

## Data aggregation

In [57]:
# Quantile is available for Series objects, thus also available for groupby objects

df

Unnamed: 0,data1,data2,key1,key2
0,88.929747,119.198595,a,one
1,105.49544,112.556045,a,two
2,92.350768,74.731556,b,one
3,91.565894,103.066093,b,two
4,93.769236,81.58769,a,one


In [60]:
g = df.groupby('key1')

g['data1'].quantile(0.9)

key1
a    103.150199
b     92.272280
Name: data1, dtype: float64

### DIY aggregation with the `agg` method

Just write a function that aggregates arrays, then pass it to the grouped object's `agg` method.

In [61]:
def peak_to_peak(arr):
    """Return the spread of ``arr``: its largest value minus its smallest."""
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest

In [62]:
# Restrict to the numeric columns: `peak_to_peak` subtracts values, which is
# undefined for the string column `key2` that each group still contains.
g[['data1', 'data2']].agg(peak_to_peak)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,16.565693,37.610905
b,0.784874,28.334537


### Other methods

In [63]:
g.describe()

Unnamed: 0_level_0,data1,data1,data1,data1,data1,data1,data1,data1,data2,data2,data2,data2,data2,data2,data2,data2
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
a,3.0,96.064807,8.518086,88.929747,91.349491,93.769236,99.632338,105.49544,3.0,104.447443,20.073792,81.58769,97.071868,112.556045,115.87732,119.198595
b,2.0,91.958331,0.554989,91.565894,91.762112,91.958331,92.154549,92.350768,2.0,88.898824,20.035543,74.731556,81.81519,88.898824,95.982459,103.066093


`describe` is not an aggregation function. But it still works.

## Column-wise and multiple function application

Here we use the `tips.csv` dataset provided by Wes on the GitHub for the book.

In [67]:
tips = pd.read_csv('data/tips.csv')

tips.head()

Unnamed: 0,total_bill,tip,smoker,day,time,size
0,16.99,1.01,No,Sun,Dinner,2
1,10.34,1.66,No,Sun,Dinner,3
2,21.01,3.5,No,Sun,Dinner,3
3,23.68,3.31,No,Sun,Dinner,2
4,24.59,3.61,No,Sun,Dinner,4


In [68]:
tips['tip_pct'] = tips['tip'] / tips['total_bill']

tips.head(6)

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
0,16.99,1.01,No,Sun,Dinner,2,0.059447
1,10.34,1.66,No,Sun,Dinner,3,0.160542
2,21.01,3.5,No,Sun,Dinner,3,0.166587
3,23.68,3.31,No,Sun,Dinner,2,0.13978
4,24.59,3.61,No,Sun,Dinner,4,0.146808
5,25.29,4.71,No,Sun,Dinner,4,0.18624


In [69]:
g = tips.groupby(['day', 'smoker'])

In [70]:
g_pct = g['tip_pct']

In [71]:
g_pct.agg('mean')

day   smoker
Fri   No        0.151650
      Yes       0.174783
Sat   No        0.158048
      Yes       0.147906
Sun   No        0.160113
      Yes       0.187250
Thur  No        0.160298
      Yes       0.163863
Name: tip_pct, dtype: float64

This is black magic. I swear it's too easy!! I'm not doing any work here!

In [72]:
g_pct.agg(['mean', 'std', peak_to_peak])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,peak_to_peak
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,No,0.15165,0.028123,0.067349
Fri,Yes,0.174783,0.051293,0.159925
Sat,No,0.158048,0.039767,0.235193
Sat,Yes,0.147906,0.061375,0.290095
Sun,No,0.160113,0.042347,0.193226
Sun,Yes,0.18725,0.154134,0.644685
Thur,No,0.160298,0.038774,0.19335
Thur,Yes,0.163863,0.039389,0.15124


But maybe you want different names for the columns?

In [74]:
# You can pass a tuple with ('name', 'func') elements

g_pct.agg([('Average', 'mean'), ('Std. Dev', 'std'), ('Range', peak_to_peak)])

Unnamed: 0_level_0,Unnamed: 1_level_0,Average,Std. Dev,Range
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,No,0.15165,0.028123,0.067349
Fri,Yes,0.174783,0.051293,0.159925
Sat,No,0.158048,0.039767,0.235193
Sat,Yes,0.147906,0.061375,0.290095
Sun,No,0.160113,0.042347,0.193226
Sun,Yes,0.18725,0.154134,0.644685
Thur,No,0.160298,0.038774,0.19335
Thur,Yes,0.163863,0.039389,0.15124


In [75]:
funcs = 'count mean max'.split()
funcs

['count', 'mean', 'max']

In [77]:
# Select several columns with a list; indexing a groupby with a bare tuple of
# column names is rejected by recent pandas.
result = g[['tip_pct', 'total_bill']].agg(funcs)
result

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,total_bill,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,max,count,mean,max
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Fri,No,4,0.15165,0.187735,4,18.42,22.75
Fri,Yes,15,0.174783,0.26348,15,16.813333,40.17
Sat,No,45,0.158048,0.29199,45,19.661778,48.33
Sat,Yes,42,0.147906,0.325733,42,21.276667,50.81
Sun,No,57,0.160113,0.252672,57,20.506667,48.17
Sun,Yes,19,0.18725,0.710345,19,24.12,45.35
Thur,No,45,0.160298,0.266312,45,17.113111,41.19
Thur,Yes,17,0.163863,0.241255,17,19.190588,43.11


I swear that's just black magic. Really? All that as a one liner? That line is selecting just two columns from the original dataset. Then it is running three aggregation functions on each of them. And it gives you detail on day of the week and smoker/non-smoker?

Ok maybe that took three lines.

1. Group
1. List of functions
1. Aggregation

But still. Nice.

In [84]:
ftuples = [('Durchschnitt', 'mean'), ('Abweichung', np.var)]
ftuples

[('Durchschnitt', 'mean'),
 ('Abweichung', <function numpy.core.fromnumeric.var>)]

In [87]:
# Select several columns with a list; indexing a groupby with a bare tuple of
# column names is rejected by recent pandas.
result = g[['tip_pct', 'total_bill']].agg(ftuples)

result

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,Durchschnitt,Abweichung,Durchschnitt,Abweichung
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Fri,No,0.15165,0.000791,18.42,25.596333
Fri,Yes,0.174783,0.002631,16.813333,82.562438
Sat,No,0.158048,0.001581,19.661778,79.908965
Sat,Yes,0.147906,0.003767,21.276667,101.387535
Sun,No,0.160113,0.001793,20.506667,66.09998
Sun,Yes,0.18725,0.023757,24.12,109.046044
Thur,No,0.160298,0.001503,17.113111,59.625081
Thur,Yes,0.163863,0.001551,19.190588,69.808518


In [88]:
result['tip_pct']

Unnamed: 0_level_0,Unnamed: 1_level_0,Durchschnitt,Abweichung
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,No,0.15165,0.000791
Fri,Yes,0.174783,0.002631
Sat,No,0.158048,0.001581
Sat,Yes,0.147906,0.003767
Sun,No,0.160113,0.001793
Sun,Yes,0.18725,0.023757
Thur,No,0.160298,0.001503
Thur,Yes,0.163863,0.001551


### What happens with a `dict`?

In [89]:
g.agg({
    'tip': np.max,
    'size': 'sum'
})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip,size
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,No,3.5,9
Fri,Yes,4.73,31
Sat,No,9.0,115
Sat,Yes,10.0,104
Sun,No,6.0,167
Sun,Yes,6.5,49
Thur,No,6.7,112
Thur,Yes,5.0,40


In [90]:
g.agg({
    'tip_pct': 'min max mean std'.split(),
    'size': 'sum'
})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,tip_pct,size
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,std,sum
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Fri,No,0.120385,0.187735,0.15165,0.028123,9
Fri,Yes,0.103555,0.26348,0.174783,0.051293,31
Sat,No,0.056797,0.29199,0.158048,0.039767,115
Sat,Yes,0.035638,0.325733,0.147906,0.061375,104
Sun,No,0.059447,0.252672,0.160113,0.042347,167
Sun,Yes,0.06566,0.710345,0.18725,0.154134,49
Thur,No,0.072961,0.266312,0.160298,0.038774,112
Thur,Yes,0.090014,0.241255,0.163863,0.039389,40


### Return data with non-hierarchical index

Sometimes the index doesn't need to be fancy.

In [91]:
# numeric_only=True leaves the string column `time` out of the mean; recent
# pandas raises a TypeError rather than silently dropping it.
tips.groupby(['day', 'smoker'], as_index=False).mean(numeric_only=True)

Unnamed: 0,day,smoker,total_bill,tip,size,tip_pct
0,Fri,No,18.42,2.8125,2.25,0.15165
1,Fri,Yes,16.813333,2.714,2.066667,0.174783
2,Sat,No,19.661778,3.102889,2.555556,0.158048
3,Sat,Yes,21.276667,2.875476,2.47619,0.147906
4,Sun,No,20.506667,3.167895,2.929825,0.160113
5,Sun,Yes,24.12,3.516842,2.578947,0.18725
6,Thur,No,17.113111,2.673778,2.488889,0.160298
7,Thur,Yes,19.190588,3.03,2.352941,0.163863


## Apply: General split-apply-combine

In [93]:
# Top five values by group

def top(df, n=5, column='tip_pct'):
    """Return the ``n`` rows of ``df`` holding the largest values of ``column``.

    Parameters
    ----------
    df : DataFrame to pick rows from.
    n : number of rows to keep (default 5).
    column : name of the column to rank by (default ``'tip_pct'``).

    Returns
    -------
    The selected rows, sorted ascending by ``column`` so the largest is last.
    """
    # tail(n) rather than [-n:]: the slice [-0:] is the same as [0:] and would
    # hand back every row when n == 0, whereas tail(0) is empty.
    return df.sort_values(by=column).tail(n)

In [94]:
top(tips, n=6)

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
109,14.31,4.0,Yes,Sat,Dinner,2,0.279525
183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
232,11.61,3.39,No,Sat,Dinner,2,0.29199
67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


### Top `n` rows by group using `apply`

In [95]:
tips.groupby('smoker').apply(top)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,88,24.71,5.85,No,Thur,Lunch,2,0.236746
No,185,20.69,5.0,No,Sun,Dinner,5,0.241663
No,51,10.29,2.6,No,Sun,Dinner,2,0.252672
No,149,7.51,2.0,No,Thur,Lunch,2,0.266312
No,232,11.61,3.39,No,Sat,Dinner,2,0.29199
Yes,109,14.31,4.0,Yes,Sat,Dinner,2,0.279525
Yes,183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
Yes,67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
Yes,178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
Yes,172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


In [97]:
# With args

tips.groupby(['smoker', 'day']).apply(top, n=1, column='total_bill')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
No,Fri,94,22.75,3.25,No,Fri,Dinner,2,0.142857
No,Sat,212,48.33,9.0,No,Sat,Dinner,4,0.18622
No,Sun,156,48.17,5.0,No,Sun,Dinner,6,0.103799
No,Thur,142,41.19,5.0,No,Thur,Lunch,5,0.121389
Yes,Fri,95,40.17,4.73,Yes,Fri,Dinner,4,0.11775
Yes,Sat,170,50.81,10.0,Yes,Sat,Dinner,3,0.196812
Yes,Sun,182,45.35,3.5,Yes,Sun,Dinner,3,0.077178
Yes,Thur,197,43.11,5.0,Yes,Thur,Lunch,4,0.115982


## Examples

### Describe by group

In [99]:
result = tips.groupby('smoker')['tip_pct'].describe()
result

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,151.0,0.159328,0.03991,0.056797,0.136906,0.155625,0.185014,0.29199
Yes,93.0,0.163196,0.085119,0.035638,0.106771,0.153846,0.195059,0.710345


In [100]:
result.unstack('smoker')

       smoker
count  No        151.000000
       Yes        93.000000
mean   No          0.159328
       Yes         0.163196
std    No          0.039910
       Yes         0.085119
min    No          0.056797
       Yes         0.035638
25%    No          0.136906
       Yes         0.106771
50%    No          0.155625
       Yes         0.153846
75%    No          0.185014
       Yes         0.195059
max    No          0.291990
       Yes         0.710345
dtype: float64

### Suppressing the group keys

In [102]:
tips.groupby('smoker', group_keys=False).apply(top)

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
88,24.71,5.85,No,Thur,Lunch,2,0.236746
185,20.69,5.0,No,Sun,Dinner,5,0.241663
51,10.29,2.6,No,Sun,Dinner,2,0.252672
149,7.51,2.0,No,Thur,Lunch,2,0.266312
232,11.61,3.39,No,Sat,Dinner,2,0.29199
109,14.31,4.0,Yes,Sat,Dinner,2,0.279525
183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


### Quantile and bucket analysis

In [103]:
frame = pd.DataFrame({
    'data1': np.random.randn(1000),
    'data2': np.random.randn(1000)
})
frame.head()

Unnamed: 0,data1,data2
0,-0.39919,-0.849692
1,-0.012028,0.089293
2,0.560203,0.183646
3,-0.270896,-0.342244
4,0.379988,-0.848556


In [105]:
quartiles = pd.cut(frame.data1, 4)
quartiles[:10]

0    (-1.494, -0.0043]
1    (-1.494, -0.0043]
2     (-0.0043, 1.486]
3    (-1.494, -0.0043]
4     (-0.0043, 1.486]
5    (-1.494, -0.0043]
6    (-1.494, -0.0043]
7     (-0.0043, 1.486]
8       (1.486, 2.976]
9    (-1.494, -0.0043]
Name: data1, dtype: category
Categories (4, interval[float64]): [(-2.99, -1.494] < (-1.494, -0.0043] < (-0.0043, 1.486] < (1.486, 2.976]]

In [106]:
def get_stats(group):
    """Summarise ``group`` as a dict holding its min, max, count and mean."""
    stat_names = ('min', 'max', 'count', 'mean')
    # Each name is both the dict key and the method called on the group.
    return {stat: getattr(group, stat)() for stat in stat_names}

In [107]:
g = frame.data2.groupby(quartiles)

In [108]:
g.apply(get_stats).unstack()

Unnamed: 0_level_0,count,max,mean,min
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(-2.99, -1.494]",69.0,2.432903,0.04593,-1.770942
"(-1.494, -0.0043]",429.0,2.407081,0.003552,-2.420095
"(-0.0043, 1.486]",442.0,3.217749,0.011094,-2.339463
"(1.486, 2.976]",60.0,2.487929,-0.112333,-2.829044


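Above, `pd.cut` produced equal-width buckets: each covers the same range of values, so the counts differ. Below, `pd.qcut` produces equal-size buckets: each holds the same number of rows, so the ranges differ.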

In [110]:
quantiles = pd.qcut(frame.data1, 10, labels=False)

In [111]:
g2 = frame.data2.groupby(quantiles)

In [112]:
g2.apply(get_stats).unstack()

Unnamed: 0_level_0,count,max,mean,min
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,100.0,2.432903,-0.046249,-2.202923
1,100.0,2.169054,-0.119736,-2.420095
2,100.0,2.407081,0.160601,-2.384459
3,100.0,2.164791,0.152721,-2.222238
4,100.0,2.030675,-0.103884,-1.873006
5,100.0,2.468457,0.027379,-2.156809
6,100.0,1.981655,-0.060126,-1.735148
7,100.0,2.268994,0.04745,-2.339463
8,100.0,3.217749,0.080313,-1.90771
9,100.0,2.487929,-0.109899,-2.829044


### Fill missing values with group specific values

In [113]:
s = pd.Series(np.random.randn(6))
s[::2] = np.nan
s

0         NaN
1   -0.121250
2         NaN
3   -1.135249
4         NaN
5   -0.499479
dtype: float64

In [114]:
s.fillna(s.mean())

0   -0.585326
1   -0.121250
2   -0.585326
3   -1.135249
4   -0.585326
5   -0.499479
dtype: float64

In [116]:
# Spelled out as a literal so the two-word name needs no patching afterwards.
states = [
    'Ohio',
    'New York',
    'Vermont',
    'Florida',
    'Oregon',
    'Nevada',
    'California',
    'Idaho',
]
states

['Ohio',
 'New York',
 'Vermont',
 'Florida',
 'Oregon',
 'Nevada',
 'California',
 'Idaho']

In [117]:
group_key = ['East'] * 4 + ['West'] * 4
group_key

['East', 'East', 'East', 'East', 'West', 'West', 'West', 'West']

In [118]:
data = pd.Series(np.random.randn(8), index=states)
data

Ohio         -0.090136
New York     -0.074366
Vermont      -0.260029
Florida       0.078719
Oregon        0.760067
Nevada        0.585331
California    1.215953
Idaho         0.210420
dtype: float64

In [119]:
data[['Vermont', 'Nevada', 'Idaho']] = np.nan
data

Ohio         -0.090136
New York     -0.074366
Vermont            NaN
Florida       0.078719
Oregon        0.760067
Nevada             NaN
California    1.215953
Idaho              NaN
dtype: float64

In [121]:
data.groupby(group_key).mean()

East   -0.028594
West    0.988010
dtype: float64

In [122]:
def fill_mean(g):
    """Return ``g`` with its missing values replaced by the mean of ``g``.

    Written as a named function rather than a lambda bound to a name, so it
    carries a docstring and a meaningful ``__name__``.
    """
    return g.fillna(g.mean())

In [123]:
data.groupby(group_key).apply(fill_mean)

Ohio         -0.090136
New York     -0.074366
Vermont      -0.028594
Florida       0.078719
Oregon        0.760067
Nevada        0.988010
California    1.215953
Idaho         0.988010
dtype: float64

And maybe we just have the fill value hard coded somewhere...

In [124]:
# Constant used to fill missing values, keyed by group label.
fill_values = {'East': 0.5, 'West': -1}


def fill_func(g):
    """Return ``g`` with missing values replaced by the constant for its group.

    The constant is looked up in ``fill_values`` under ``g.name``; a name that
    is not a key of ``fill_values`` raises ``KeyError``.
    """
    return g.fillna(fill_values[g.name])

In [125]:
data.groupby(group_key).apply(fill_func)

Ohio         -0.090136
New York     -0.074366
Vermont       0.500000
Florida       0.078719
Oregon        0.760067
Nevada       -1.000000
California    1.215953
Idaho        -1.000000
dtype: float64

### Random sampling and permutation

on page 308