# The `pandas` Groupby

I'm just going through the examples in the book here. Nothing groundbreaking or anything.

In [1]:
# The maths, graphs, stats and style libs

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy import stats
import matplotlib.style as mplstyle
# Render figures inside the notebook and switch to the 'fivethirtyeight' style sheet
%matplotlib inline
mplstyle.use('fivethirtyeight')

In [2]:
# Toy frame: two categorical key columns plus two numeric columns.
# The numeric values are unseeded chi-square draws (100 degrees of freedom),
# so they differ on every run.
df = pd.DataFrame({
    'key1': ['a', 'a', 'b', 'b', 'a'],
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data1': np.random.chisquare(100, size=5),
    'data2': np.random.chisquare(100, size=5),
})

In [3]:
# Display the frame built above
df

Unnamed: 0,data1,data2,key1,key2
0,110.534413,87.064785,a,one
1,109.722563,92.842673,a,two
2,98.777993,115.828911,b,one
3,98.585044,109.121345,b,two
4,103.015888,89.436595,a,one


In [4]:
# Group the `data1` column by the values of `key1`; this only builds the
# grouping object, no statistic is computed yet
g = df['data1'].groupby(df['key1'])

In [5]:
# The bare repr is just a SeriesGroupBy handle, not a result
g

<pandas.core.groupby.SeriesGroupBy object at 0x000000000A7C97B8>

In [6]:
# Mean of data1 within each key1 group
g.mean()

key1
a    107.757621
b     98.681519
Name: data1, dtype: float64

In [7]:
# Standard deviation of data1 within each key1 group
g.std()

key1
a    4.126476
b    0.136435
Name: data1, dtype: float64

## Multiple layers of grouping?

In [8]:
# Group data1 by two key Series at once: key1 first, then key2
m = df['data1'].groupby([df[key] for key in ('key1', 'key2')])

In [9]:
# Median of data1 for every (key1, key2) combination
m.median()

key1  key2
a     one     106.775151
      two     109.722563
b     one      98.777993
      two      98.585044
Name: data1, dtype: float64

In this summary the word 'one' appears twice, and so does the word 'two'. That is visually inefficient: the ones and twos are stacked on top of each other, so we can't quickly compare them side by side...

## And check this out...

In [10]:
# unstack() moves the inner index level (key2) into columns for a side-by-side view
m.mean().unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,106.775151,109.722563
b,98.777993,98.585044


Naturally this would only work nicely with two dimensions. I wonder what happens with three.

In [11]:
# Same idea as `df`, extended to three key columns and three numeric columns.
# The numeric values are again unseeded chi-square draws.
df2 = pd.DataFrame({
    'key1': ['a', 'a', 'b', 'b', 'a'],
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'key3': ['fee', 'fi', 'foe', 'foe', 'fum'],
    'data1': np.random.chisquare(100, size=5),
    'data2': np.random.chisquare(100, size=5),
    'data3': np.random.chisquare(100, size=5),
})

In [12]:
# Display the three-key frame
df2

Unnamed: 0,data1,data2,data3,key1,key2,key3
0,108.015516,89.195856,106.929298,a,one,fee
1,98.625413,108.621694,110.341848,a,two,fi
2,99.799879,96.551635,94.379668,b,one,foe
3,101.66995,97.389008,87.059394,b,two,foe
4,97.819453,110.437041,94.904733,a,one,fum


In [13]:
# Three levels of grouping: key1, then key2, then key3
t = df2['data1'].groupby([df2[key] for key in ('key1', 'key2', 'key3')])

In [14]:
# Mean of data1 for every (key1, key2, key3) combination that occurs in df2
t.mean()

key1  key2  key3
a     one   fee     108.015516
            fum      97.819453
      two   fi       98.625413
b     one   foe      99.799879
      two   foe     101.669950
Name: data1, dtype: float64

In [15]:
# unstack() moves the innermost level (key3) into columns; combinations that
# never occur in df2 have no value in the resulting table
t.mean().unstack()

Unnamed: 0_level_0,key3,fee,fi,foe,fum
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
a,one,108.015516,,,97.819453
a,two,,98.625413,,
b,one,,,99.799879,
b,two,,,101.66995,


Well, I'll be damned: it still behaves nicely. But it still doesn't work as well as the two-dimensional example.

## Group keys

They don't have to be part of the dataframe. They just have to be arrays of the right length.

In [16]:
# One state label per row of `df`
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])

In [17]:
# One year per row of `df`
years = np.array((2005, 2005, 2006, 2005, 2006))

In [18]:
# Group by two external arrays rather than columns of df; each array has one
# entry per row of df
df['data1'].groupby([states, years]).mean()

California  2005    109.722563
            2006     98.777993
Ohio        2005    104.559729
            2006    103.015888
Name: data1, dtype: float64

Wow. I'm amazed. This is too easy.

In [19]:
# But if they are part of the dataframe, there is a shortcut: pass the column name.
# `numeric_only=True` keeps the string column `key2` out of the mean; newer
# pandas releases no longer drop such columns silently and raise a TypeError.

df.groupby('key1').mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,107.757621,89.781351
b,98.681519,112.475128


In [20]:
# Same shortcut with several key columns: pass a list of column names
df.groupby(['key1', 'key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,106.775151,88.25069
a,two,109.722563,92.842673
b,one,98.777993,115.828911
b,two,98.585044,109.121345


In [21]:
# And a useful aggregator is size(): the number of rows in each group

df.groupby(['key1', 'key2']).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

## Iterating over groups

In [22]:
# With a single group key, iteration yields (group name, sub-frame) pairs

for name, group in df.groupby('key1'):
    print(name)
    # Each sub-frame still carries the string key columns; restrict the
    # reduction to numeric columns so newer pandas does not raise on them.
    print(group.std(numeric_only=True))

a
data1    4.126476
data2    2.904331
dtype: float64
b
data1    0.136435
data2    4.742966
dtype: float64


In [24]:
# With multiple group keys, the first element is always a tuple

for (k1, k2), group in df.groupby(['key1', 'key2']):
    print((k1, k2))
    # The sub-frame includes the string columns key1/key2; average only the
    # numeric columns so newer pandas does not raise on them.
    print(group.mean(numeric_only=True), '\n')

('a', 'one')
data1    106.775151
data2     88.250690
dtype: float64 

('a', 'two')
data1    109.722563
data2     92.842673
dtype: float64 

('b', 'one')
data1     98.777993
data2    115.828911
dtype: float64 

('b', 'two')
data1     98.585044
data2    109.121345
dtype: float64 



### Nice recipe here

In [25]:
# Materialise the groups as a {key1 value: sub-frame} dictionary
pieces = {name: group for name, group in df.groupby('key1')}

In [26]:
# Look up one group's rows by its key
pieces['b']

Unnamed: 0,data1,data2,key1,key2
2,98.777993,115.828911,b,one
3,98.585044,109.121345,b,two


In [30]:
# The full frame again, for reference
df

Unnamed: 0,data1,data2,key1,key2
0,110.534413,87.064785,a,one
1,109.722563,92.842673,a,two
2,98.777993,115.828911,b,one
3,98.585044,109.121345,b,two
4,103.015888,89.436595,a,one


In [27]:
# Per-column dtypes: the data columns are float64, the key columns are object
df.dtypes

data1    float64
data2    float64
key1      object
key2      object
dtype: object

In [28]:
# Group the *columns* (axis=1) by their dtype; note this rebinds `g`, which
# earlier held a SeriesGroupBy.
# NOTE(review): grouping with `axis=1` is deprecated in recent pandas
# releases — confirm against the installed version.
g = df.groupby(df.dtypes, axis=1)

In [29]:
# Walk the dtype-based column groups: print the dtype, then its columns
for column_dtype, columns in g:
    print(column_dtype)
    print(columns, '\n')

float64
        data1       data2
0  110.534413   87.064785
1  109.722563   92.842673
2   98.777993  115.828911
3   98.585044  109.121345
4  103.015888   89.436595 

object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one 



## Selecting a column or subset of columns

In [37]:
# This (group the whole frame, then select the column),

a = df.groupby('key1')['data1']
a

<pandas.core.groupby.SeriesGroupBy object at 0x000000000AB0A828>

In [41]:
# is the same as this (group the column by the key Series directly)

b = df['data1'].groupby(df['key1'])
b

<pandas.core.groupby.SeriesGroupBy object at 0x000000000AB0AF98>

In [45]:
# check it: both grouped means, separated by a blank line

print(a.mean(), b.mean(), sep=' \n\n')

key1
a    107.757621
b     98.681519
Name: data1, dtype: float64 

key1
a    107.757621
b     98.681519
Name: data1, dtype: float64


In [46]:
# Getting fancy with it: selecting with a list of columns (here a one-element
# list) before aggregating

df.groupby(['key1', 'key2'])[['data2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,88.25069
a,two,92.842673
b,one,115.828911
b,two,109.121345


The result is a `pd.DataFrame` when a list of columns is selected (even a one-element list such as `[['data2']]`). When a single column name is selected, as in `['data2']`, it is a `pd.Series`.

In [49]:
# Selecting one column by name from the grouped frame gives a SeriesGroupBy
s_grouped = df.groupby(['key1', 'key2'])['data2']

s_grouped

<pandas.core.groupby.SeriesGroupBy object at 0x000000000AB0ACF8>

In [50]:
# Aggregating it returns a Series indexed by (key1, key2)
s_grouped.mean()

key1  key2
a     one      88.250690
      two      92.842673
b     one     115.828911
      two     109.121345
Name: data2, dtype: float64

## Grouping with Dicts and Series

You can create a mapping of columns. Maybe a few columns are similar and should be aggregated together, but you need something to aggregate them by. You can use a dictionary for that. And because this is a way of grouping columns, it makes sense that we use `axis=1`.

In [53]:
# 5x5 frame of unseeded standard-normal draws, labelled by person (rows)
# and by letter (columns)
people = pd.DataFrame(
    np.random.randn(5, 5),
    columns=['a', 'b', 'c', 'd', 'e'],
    index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'],
)
people

Unnamed: 0,a,b,c,d,e
Joe,0.583193,0.952418,1.680842,0.22115,-0.525014
Steve,0.960286,-0.369092,-0.342649,0.330165,0.936775
Wes,-1.152404,1.085094,-0.504872,-0.083372,-1.154828
Jim,0.753973,-1.426262,0.597402,1.253065,1.209532
Travis,-0.711706,0.859084,-0.055814,0.581312,-1.275963


In [54]:
# Blank out columns 1 and 2 ('b' and 'c') in row 2 ('Wes') to introduce missing values
people.iloc[2, [1, 2]] = np.nan

people

Unnamed: 0,a,b,c,d,e
Joe,0.583193,0.952418,1.680842,0.22115,-0.525014
Steve,0.960286,-0.369092,-0.342649,0.330165,0.936775
Wes,-1.152404,,,-0.083372,-1.154828
Jim,0.753973,-1.426262,0.597402,1.253065,1.209532
Travis,-0.711706,0.859084,-0.055814,0.581312,-1.275963


In [55]:
# Column label -> group name. 'f' is not a column of `people`.
mapping = dict(
    a='red',
    b='red',
    c='blue',
    d='blue',
    e='red',
    f='orange',
)

In [56]:
# Group the columns of `people` (axis=1) by the colour each label maps to.
# NOTE(review): grouping with `axis=1` is deprecated in recent pandas
# releases — confirm against the installed version.
by_col = people.groupby(mapping, axis=1)

In [57]:
# Row-wise sum within each colour group; the unused 'f' -> 'orange' entry
# produces no column in the result
by_col.sum()

Unnamed: 0,blue,red
Joe,1.901992,1.010596
Steve,-0.012484,1.52797
Wes,-0.083372,-2.307232
Jim,1.850467,0.537244
Travis,0.525498,-1.128586


In [58]:
# The same mapping as a Series: index = column label, value = group name
map_series = pd.Series(mapping)
map_series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [61]:
# Count the non-null entries per colour group for every person.
# Grouping the transposed frame by rows and transposing back gives the same
# table as `groupby(map_series, axis=1)`, which recent pandas releases deprecate.
people.T.groupby(map_series).count().T

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Travis,2,3


## Grouping with functions

Ok what??? This is black magic.