# The `pandas` Groupby

I'm just going through the examples in the book here. Nothing groundbreaking or anything.

In [1]:
# The maths, graphs, stats and style libs

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy import stats
import matplotlib.style as mplstyle
%matplotlib inline
mplstyle.use('fivethirtyeight')

In [2]:
# Toy frame: two string key columns plus two columns of chi-square(100) noise.
# The random draws are made in the same order as before (data1 first, then data2).
df = pd.DataFrame({
    'key1': list('aabba'),
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data1': np.random.chisquare(100, size=5),
    'data2': np.random.chisquare(100, size=5),
})

In [3]:
df

Unnamed: 0,data1,data2,key1,key2
0,88.929747,119.198595,a,one
1,105.49544,112.556045,a,two
2,92.350768,74.731556,b,one
3,91.565894,103.066093,b,two
4,93.769236,81.58769,a,one


In [4]:
# Group the data1 Series by the values of the key1 column.
# The result is a SeriesGroupBy object; statistics come from calling methods on it.
g = df['data1'].groupby(df['key1'])

In [5]:
g

<pandas.core.groupby.SeriesGroupBy object at 0x7fa35c220518>

In [6]:
g.mean()

key1
a    96.064807
b    91.958331
Name: data1, dtype: float64

In [7]:
g.std()

key1
a    8.518086
b    0.554989
Name: data1, dtype: float64

## Multiple layers of grouping?

In [8]:
# Passing a list of key Series groups data1 by every (key1, key2) combination.
m = df['data1'].groupby([df['key1'], df['key2']])

In [9]:
m.median()

key1  key2
a     one      91.349491
      two     105.495440
b     one      92.350768
      two      91.565894
Name: data1, dtype: float64

In this summary we have the word 'one' appearing twice. Same with the word 'two'. That is visually inefficient because we have this stack of ones and twos there and we can't quickly compare side by side...

## And check this out...

In [10]:
m.mean().unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,91.349491,105.49544
b,92.350768,91.565894


Naturally this would only work nicely with two dimensions. I wonder what happens with three.

In [11]:
# Same idea as df, extended to three key columns and three chi-square(100) data columns.
# The random draws are made in the same order as before (data1, data2, data3).
df2 = pd.DataFrame({
    'key1': list('aabba'),
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'key3': ['fee', 'fi', 'foe', 'foe', 'fum'],
    'data1': np.random.chisquare(100, size=5),
    'data2': np.random.chisquare(100, size=5),
    'data3': np.random.chisquare(100, size=5),
})

In [12]:
df2

Unnamed: 0,data1,data2,data3,key1,key2,key3
0,90.213662,103.744094,104.014927,a,one,fee
1,70.967124,125.536304,82.580021,a,two,fi
2,104.530486,97.036733,125.646992,b,one,foe
3,91.550514,75.67487,105.720274,b,two,foe
4,109.021724,120.929474,118.741643,a,one,fum


In [13]:
# Three grouping keys: data1 is grouped by every (key1, key2, key3) combination.
t = df2['data1'].groupby([df2['key1'], df2['key2'], df2['key3']])

In [14]:
t.mean()

key1  key2  key3
a     one   fee      90.213662
            fum     109.021724
      two   fi       70.967124
b     one   foe     104.530486
      two   foe      91.550514
Name: data1, dtype: float64

In [15]:
t.mean().unstack()

Unnamed: 0_level_0,key3,fee,fi,foe,fum
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
a,one,90.213662,,,109.021724
a,two,,70.967124,,
b,one,,,104.530486,
b,two,,,91.550514,


Well I'll be damned, it still behaves nicely. But it doesn't read as well as the two-dimensional example: only the innermost level (`key3`) moves to the columns, so most of the cells are NaN.

## Group keys

They don't have to be part of the dataframe. They just have to be arrays of the right length.

In [16]:
# One state label per row of df; used as an external grouping key.
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])

In [17]:
# One year per row of df; used as a second external grouping key.
years = np.array((2005, 2005, 2006, 2005, 2006))

In [18]:
df['data1'].groupby([states, years]).mean()

California  2005    105.495440
            2006     92.350768
Ohio        2005     90.247820
            2006     93.769236
Name: data1, dtype: float64

Wow. I'm amazed. This is too easy.

In [19]:
# But if they are part of the dataframe, there is a shortcut: pass the column name.
# key2 holds strings, so the mean is restricted to numeric columns explicitly;
# pandas >= 2.0 raises a TypeError here instead of silently dropping key2.

df.groupby('key1').mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,96.064807,104.447443
b,91.958331,88.898824


In [20]:
df.groupby(['key1', 'key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,91.349491,100.393143
a,two,105.49544,112.556045
b,one,92.350768,74.731556
b,two,91.565894,103.066093


In [21]:
# And a useful aggregator is 

df.groupby(['key1', 'key2']).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

## Iterating over groups

In [22]:
# With a single group key

# Each group still carries the string columns key1/key2, so ask for numeric
# columns only; pandas >= 2.0 raises instead of silently skipping them.
for name, group in df.groupby('key1'):
    print(name)
    print(group.std(numeric_only=True))

a
data1     8.518086
data2    20.073792
dtype: float64
b
data1     0.554989
data2    20.035543
dtype: float64


In [23]:
# With multiple group keys, the first element is always a tuple

# Each group still carries the string columns key1/key2, so ask for numeric
# columns only; pandas >= 2.0 raises instead of silently skipping them.
for (k1, k2), group in df.groupby(['key1', 'key2']):
    print((k1, k2))
    print(group.mean(numeric_only=True), '\n')

('a', 'one')
data1     91.349491
data2    100.393143
dtype: float64 

('a', 'two')
data1    105.495440
data2    112.556045
dtype: float64 

('b', 'one')
data1    92.350768
data2    74.731556
dtype: float64 

('b', 'two')
data1     91.565894
data2    103.066093
dtype: float64 



### Nice recipe here

In [24]:
# Map each key1 value to the sub-frame holding that group's rows.
pieces = {label: rows for label, rows in df.groupby('key1')}

In [25]:
pieces['b']

Unnamed: 0,data1,data2,key1,key2
2,92.350768,74.731556,b,one
3,91.565894,103.066093,b,two


In [26]:
df

Unnamed: 0,data1,data2,key1,key2
0,88.929747,119.198595,a,one
1,105.49544,112.556045,a,two
2,92.350768,74.731556,b,one
3,91.565894,103.066093,b,two
4,93.769236,81.58769,a,one


In [27]:
df.dtypes

data1    float64
data2    float64
key1      object
key2      object
dtype: object

In [28]:
# Group the columns (axis=1) rather than the rows, keyed by each column's dtype.
# NOTE(review): groupby(axis=1) is deprecated in recent pandas releases — confirm
# against the installed version before re-running this cell.
g = df.groupby(by=df.dtypes, axis=1)

In [29]:
# Show each dtype followed by the columns that share it.
for dtype, group in g:
    print('{}\n{} \n'.format(dtype, group))

float64
        data1       data2
0   88.929747  119.198595
1  105.495440  112.556045
2   92.350768   74.731556
3   91.565894  103.066093
4   93.769236   81.587690 

object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one 



## Selecting a column or subset of columns

In [30]:
# This,

a = df.groupby('key1')['data1']
a

<pandas.core.groupby.SeriesGroupBy object at 0x7fa35c20b3c8>

In [31]:
# is the same as this

b = df['data1'].groupby(df['key1'])
b

<pandas.core.groupby.SeriesGroupBy object at 0x7fa35c201978>

In [32]:
# check it

print(a.mean(), '\n')
print(b.mean())

key1
a    96.064807
b    91.958331
Name: data1, dtype: float64 

key1
a    96.064807
b    91.958331
Name: data1, dtype: float64


In [33]:
# Getting fancy with it

df.groupby(['key1', 'key2'])[['data2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,100.393143
a,two,112.556045
b,one,74.731556
b,two,103.066093


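The object returned is a grouped DataFrame when a list of column names is passed (even a one-element list such as `[['data2']]` above), and a grouped Series when a single column name is passed as a scalar.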

In [34]:
s_grouped = df.groupby(['key1', 'key2'])['data2']

s_grouped

<pandas.core.groupby.SeriesGroupBy object at 0x7fa35c2011d0>

In [35]:
s_grouped.mean()

key1  key2
a     one     100.393143
      two     112.556045
b     one      74.731556
      two     103.066093
Name: data2, dtype: float64

## Grouping with Dicts and Series

You can create a mapping of columns. Maybe a few columns are similar and they should be aggregated together, but you need something to aggregate them by. You can use a dictionary for that. And because this is a way of grouping columns, it makes sense that we use `axis=1`.

In [36]:
# 5x5 frame of standard-normal noise: one row per person, columns a..e.
people = pd.DataFrame(
    np.random.standard_normal((5, 5)),
    columns=list('abcde'),
    index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'],
)
people

Unnamed: 0,a,b,c,d,e
Joe,-0.702408,0.549243,-0.128486,0.445961,0.013229
Steve,0.277814,-0.500106,-0.755237,-0.208514,0.805241
Wes,0.832879,-0.369921,-0.764429,-2.323744,-0.162126
Jim,-0.705547,-2.762352,-1.271556,2.428349,0.205313
Travis,-0.898226,0.471578,0.534362,-0.707398,0.006639


In [37]:
# Blank out the third row's second and third columns (positional indexing)
# so the aggregations below have missing values to deal with.
people.iloc[[2], 1:3] = np.nan

people

Unnamed: 0,a,b,c,d,e
Joe,-0.702408,0.549243,-0.128486,0.445961,0.013229
Steve,0.277814,-0.500106,-0.755237,-0.208514,0.805241
Wes,0.832879,,,-2.323744,-0.162126
Jim,-0.705547,-2.762352,-1.271556,2.428349,0.205313
Travis,-0.898226,0.471578,0.534362,-0.707398,0.006639


In [38]:
# Column label -> group label. Note that 'f' is not a column of people.
mapping = dict(
    a='red',
    b='red',
    c='blue',
    d='blue',
    e='red',
    f='orange',
)

In [39]:
# Group the columns of people by the colour each column label maps to.
# NOTE(review): groupby(axis=1) is deprecated in recent pandas releases — confirm
# against the installed version before re-running this cell.
by_col = people.groupby(by=mapping, axis=1)

In [40]:
by_col.sum()

Unnamed: 0,blue,red
Joe,0.317475,-0.139936
Steve,-0.963751,0.58295
Wes,-2.323744,0.670753
Jim,1.156793,-3.262586
Travis,-0.173036,-0.420009


In [41]:
map_series = pd.Series(mapping)
map_series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [42]:
# Count the non-missing values per colour group for each person.
# Grouping the transposed frame by rows and transposing back gives the same
# table as groupby(map_series, axis=1), without the deprecated axis argument.
people.T.groupby(map_series).count().T

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Travis,2,3


## Grouping with functions

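Ok what??? This is black magic. (A function passed to `groupby` is called once per index label and its return value becomes the group key, so `len` groups the people by the length of their names.)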

In [44]:
people.index

Index(['Joe', 'Steve', 'Wes', 'Jim', 'Travis'], dtype='object')

In [43]:
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,-0.575077,-2.213109,-1.400041,0.550565,0.056416
5,0.277814,-0.500106,-0.755237,-0.208514,0.805241
6,-0.898226,0.471578,0.534362,-0.707398,0.006639


In [45]:
# One label per row of people: three 'one's followed by two 'two's.
key_list = ['one'] * 3 + ['two'] * 2
key_list

['one', 'one', 'one', 'two', 'two']

Mix and match:

In [46]:
people.groupby([len, key_list]).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,-0.702408,0.549243,-0.128486,-2.323744,-0.162126
3,two,-0.705547,-2.762352,-1.271556,2.428349,0.205313
5,one,0.277814,-0.500106,-0.755237,-0.208514,0.805241
6,two,-0.898226,0.471578,0.534362,-0.707398,0.006639


## Grouping by index levels

In [51]:
# Two-level column labels: the outer level is named 'city', the inner one 'tenor'.
cols = pd.MultiIndex.from_arrays(
    [
        ['US', 'US', 'US', 'JP', 'JP'],
        [1, 3, 5, 1, 3],
    ],
    names=['city', 'tenor'],
)

In [52]:
# 4x5 frame of standard-normal noise whose columns carry the two-level labels in cols.
hier_df = pd.DataFrame(np.random.standard_normal((4, 5)), columns=cols)

hier_df

city,US,US,US,JP,JP
tenor,1,3,5,1,3
0,-0.160615,1.629688,0.875059,-0.976987,1.773561
1,-0.030164,0.157077,-0.207484,0.324323,-1.660056
2,-1.030605,0.663487,0.244933,-0.246046,-0.081602
3,0.561404,-1.095277,0.157359,-2.13683,0.075684


In [56]:
# Minimum per row within each city, grouping on the 'city' level of the column index.
# Transposing, grouping the rows by level, and transposing back gives the same
# table as groupby(level='city', axis=1), without the deprecated axis argument.
hier_df.T.groupby(level='city').min().T

city,JP,US
0,-0.976987,-0.160615
1,-1.660056,-0.207484
2,-0.246046,-1.030605
3,-2.13683,-1.095277


Here we've created a column index with two levels. We named one level `city` and the other `tenor`. Those are the names we use to refer to the levels: passing `level='city'` to `groupby` groups the columns by their city label.

## Data aggregation

In [57]:
# Quantile is available for Series objects, thus also available for groupby objects

df

Unnamed: 0,data1,data2,key1,key2
0,88.929747,119.198595,a,one
1,105.49544,112.556045,a,two
2,92.350768,74.731556,b,one
3,91.565894,103.066093,b,two
4,93.769236,81.58769,a,one


In [60]:
# Rebind g to a row-wise groupby of the whole frame on key1
# (it was last assigned the dtype-based column grouping).
g = df.groupby('key1')

# 0.9 quantile of data1 within each key1 group.
g['data1'].quantile(0.9)

key1
a    103.150199
b     92.272280
Name: data1, dtype: float64

### DIY aggregation with the `agg` method

Just write a function that aggregates arrays, then pass it to the grouped object's `agg` method.

In [61]:
def peak_to_peak(arr):
    """Return the spread of ``arr``: its maximum minus its minimum.

    ``arr`` only needs to provide ``max()`` and ``min()`` methods whose
    results can be subtracted.
    """
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest

In [62]:
# Apply the custom aggregator to the numeric columns only. g also contains the
# string column key2, and subtracting strings fails; pandas >= 2.0 raises a
# TypeError instead of silently dropping such columns.
g[['data1', 'data2']].agg(peak_to_peak)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,16.565693,37.610905
b,0.784874,28.334537


### Other methods

In [63]:
g.describe()

Unnamed: 0_level_0,data1,data1,data1,data1,data1,data1,data1,data1,data2,data2,data2,data2,data2,data2,data2,data2
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
a,3.0,96.064807,8.518086,88.929747,91.349491,93.769236,99.632338,105.49544,3.0,104.447443,20.073792,81.58769,97.071868,112.556045,115.87732,119.198595
b,2.0,91.958331,0.554989,91.565894,91.762112,91.958331,92.154549,92.350768,2.0,88.898824,20.035543,74.731556,81.81519,88.898824,95.982459,103.066093


`describe` is not an aggregation function. But it still works. 