In [1]:
import pandas as pd
import numpy as np

# This is the third go around for groupby

Though I'm pretty good with DataFrames now, I still go to the documentation quite often. I'd like to make one more pass over the groupby section and then probably go look at broadcasting. On with it.

## ex.0a Create a `DataFrame` from dictionary

In [2]:
df = pd.DataFrame({
    'a': np.random.randn(5),
    'b': np.random.randn(5),
    'c': np.random.randn(5)
})

In [3]:
df

Unnamed: 0,a,b,c
0,1.22482,-0.851045,-1.315811
1,-0.575095,-0.698654,2.495318
2,-0.926109,-1.50906,-0.478715
3,-0.48738,0.088879,0.472669
4,1.007726,0.748271,-0.193971


## ex.0b Select with a boolean array

In [4]:
mask = df['b'].values > 0

In [5]:
mask

array([False, False, False,  True,  True], dtype=bool)

In [6]:
g = df.groupby(mask)
g.size()

False    3
True     2
dtype: int64

## ex.1 Use `groupby` with a single key

In [7]:
df = pd.DataFrame({
    'key1': 'a a b b a'.split(),
    'key2': 'one two one two one'.split(),
    'data1': np.random.randn(5),
    'data2': np.random.randn(5)
})

In [8]:
df

Unnamed: 0,data1,data2,key1,key2
0,-0.555074,1.82501,a,one
1,0.916096,-1.670291,a,two
2,1.398979,0.853161,b,one
3,-1.222626,-0.531674,b,two
4,-0.183152,-1.105303,a,one


In [9]:
g = df['data1'].groupby(df['key1'])

In [10]:
g.mean()

key1
a    0.059290
b    0.088177
Name: data1, dtype: float64

## ex.2 Use `groupby` with arrays not in the `DataFrame`

In [11]:
states = np.array('Ohio California California Ohio Ohio'.split())
years = np.array([2005, 2005, 2006, 2005, 2006])

In [12]:
df['data1'].groupby([states, years]).mean()

California  2005    0.916096
            2006    1.398979
Ohio        2005   -0.888850
            2006   -0.183152
Name: data1, dtype: float64

## ex.3 Iterate over `grouped` object

In [13]:
for name, group in df.groupby('key1'):
    print(name)
    print(group)

a
      data1     data2 key1 key2
0 -0.555074  1.825010    a  one
1  0.916096 -1.670291    a  two
4 -0.183152 -1.105303    a  one
b
      data1     data2 key1 key2
2  1.398979  0.853161    b  one
3 -1.222626 -0.531674    b  two


In [14]:
for (k1, k2), group in df.groupby(['key1', 'key2']):
    print((k1, k2))
    print(group)

('a', 'one')
      data1     data2 key1 key2
0 -0.555074  1.825010    a  one
4 -0.183152 -1.105303    a  one
('a', 'two')
      data1     data2 key1 key2
1  0.916096 -1.670291    a  two
('b', 'one')
      data1     data2 key1 key2
2  1.398979  0.853161    b  one
('b', 'two')
      data1     data2 key1 key2
3 -1.222626 -0.531674    b  two


## ex.4 Group by columns based on `dtype`

In [15]:
df.dtypes

data1    float64
data2    float64
key1      object
key2      object
dtype: object

In [16]:
# Group the columns (axis=1) by their dtype, so that each group is the
# sub-frame of columns sharing one dtype.
# NOTE(review): recent pandas releases deprecate `axis=1` in groupby — confirm
# against the installed version before re-running this cell.
g = df.groupby(df.dtypes, axis=1)

In [17]:
for dtype, group in g:
    print(dtype)
    print(group)

float64
      data1     data2
0 -0.555074  1.825010
1  0.916096 -1.670291
2  1.398979  0.853161
3 -1.222626 -0.531674
4 -0.183152 -1.105303
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one


## ex.5 Return `DataFrame` or `Series` from `groupby`

In [18]:
df.groupby(['key1', 'key2'])[['data2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,0.359853
a,two,-1.670291
b,one,0.853161
b,two,-0.531674


In [19]:
df.groupby(['key1', 'key2'])['data2'].mean()

key1  key2
a     one     0.359853
      two    -1.670291
b     one     0.853161
      two    -0.531674
Name: data2, dtype: float64

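Well, that's interesting. I wonder why that happens. The next two cells show the reason: selecting with a list (`[['data2']]`) keeps a `DataFrameGroupBy`, while selecting with a single label (`['data2']`) gives a `SeriesGroupBy`.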

In [20]:
# DataFrame
df.groupby(['key1', 'key2'])[['data2']]

<pandas.core.groupby.DataFrameGroupBy object at 0x7f44914ca2b0>

In [21]:
# Series
df.groupby(['key1', 'key2'])['data2']

<pandas.core.groupby.SeriesGroupBy object at 0x7f44914cad30>

## ex.6a Use `iloc` for selecting cells

In [22]:
# 5x5 frame of random normals, rows labelled by name and columns by letter.
people = pd.DataFrame(np.random.randn(5, 5),
                     columns = 'a b c d e'.split(),
                     index='Joe Steve Wes Jim Travis'.split())
# Positional selection: row 2 only (the slice end is exclusive) and columns
# 1 and 3, i.e. the 'b' and 'd' cells of the 'Wes' row.
people.iloc[2:3, [1, 3]] = np.nan
people

Unnamed: 0,a,b,c,d,e
Joe,0.246512,-0.288287,0.237057,-0.627203,0.276669
Steve,0.310082,0.118249,-1.853393,-0.390722,-0.799456
Wes,2.121669,,0.045952,,-0.563896
Jim,0.415003,-0.93536,2.064717,-0.474141,1.294856
Travis,-0.196392,-0.605758,1.438467,-1.384634,-0.019129


## ex.6b Same thing

In [23]:
# Build all ten columns in a single constructor call instead of inserting them
# one at a time into an empty frame. Column i holds 0..9, so row r is all r's.
df = pd.DataFrame({i: np.arange(10) for i in range(10)})

# Rows 4 and 8 (positions given as a list), columns 2 through 6 (the slice end
# is exclusive).
df.iloc[[4, 8], 2:7]

Unnamed: 0,2,3,4,5,6
4,4,4,4,4,4
8,8,8,8,8,8


## ex.7a Dict comprehensions to create a mapping

In [24]:
# dict() consumes the (letter, colour) pairs produced by zip directly.
mapping = dict(zip(
    'a b c d e f'.split(),
    'red red blue blue red orange'.split(),
))
mapping

{'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red', 'f': 'orange'}

## ex.7b Use a mapping to group a `DataFrame`

In [25]:
people

Unnamed: 0,a,b,c,d,e
Joe,0.246512,-0.288287,0.237057,-0.627203,0.276669
Steve,0.310082,0.118249,-1.853393,-0.390722,-0.799456
Wes,2.121669,,0.045952,,-0.563896
Jim,0.415003,-0.93536,2.064717,-0.474141,1.294856
Travis,-0.196392,-0.605758,1.438467,-1.384634,-0.019129


In [26]:
by_col = people.groupby(mapping, axis=1)
by_col.sum()

Unnamed: 0,blue,red
Joe,-0.390145,0.234893
Steve,-2.244115,-0.371126
Wes,0.045952,1.557773
Jim,1.590576,0.774498
Travis,0.053833,-0.821279


## ex.7c New mapping

In [27]:
# Label each column of `people` with a group name, position by position, then
# sum the columns that share a label.
new_map = dict(zip(people.columns, 'fee fi foe fi fum'.split()))
g = people.groupby(new_map, axis=1)
g.sum()

Unnamed: 0,fee,fi,foe,fum
Joe,0.246512,-0.91549,0.237057,0.276669
Steve,0.310082,-0.272473,-1.853393,-0.799456
Wes,2.121669,,0.045952,-0.563896
Jim,0.415003,-1.409501,2.064717,1.294856
Travis,-0.196392,-1.990392,1.438467,-0.019129


## ex.8a Convert `dict` to `Series`

In [28]:
map_series = pd.Series(mapping)
map_series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

## ex.8b Use `Series` to group a `DataFrame`

In [29]:
people.groupby(map_series, axis=1).count()

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Travis,2,3


## ex.9 Use `groupby` with a function

i.e. the function takes the index value of each record as its argument.

In [30]:
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,2.783183,-1.223647,2.347727,-1.101344,1.007629
5,0.310082,0.118249,-1.853393,-0.390722,-0.799456
6,-0.196392,-0.605758,1.438467,-1.384634,-0.019129


Any function passed as a group key will be called once per index value, with the return values being used as the group names.

## ex.10 Use `groupby` with mixed grouping types

In [31]:
key_list = 'one one one two two'.split()
people.groupby([len, key_list]).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,0.246512,-0.288287,0.045952,-0.627203,-0.563896
3,two,0.415003,-0.93536,2.064717,-0.474141,1.294856
5,one,0.310082,0.118249,-1.853393,-0.390722,-0.799456
6,two,-0.196392,-0.605758,1.438467,-1.384634,-0.019129


## ex.11a Create a hierarchical index

In [32]:
columns = pd.MultiIndex.from_arrays([
    'US US US JP JP'.split(),
    [1, 3, 5, 1, 3]],
    names=['city', 'tenor'])

In [33]:
hier_df = pd.DataFrame(np.random.randn(4, 5), columns=columns)
hier_df

city,US,US,US,JP,JP
tenor,1,3,5,1,3
0,-2.007758,1.590489,1.355574,-0.469388,-0.781467
1,1.371647,-1.058999,-0.341345,0.378692,-0.900661
2,0.917231,0.17737,0.327168,0.838609,-0.798915
3,-1.610783,0.791264,-0.647849,0.103629,-0.026357


## ex.11b Use `groupby` with hierarchical index

In [34]:
hier_df.groupby(level='city', axis=1).count()

city,JP,US
0,2,3
1,2,3
2,2,3
3,2,3


## ex.12 Get a group's `quantile`

The key here is that `quantile()` is a method of the `Series` object, not the `grouped` object. However, it is available as the result of a `groupby` operation.

In [35]:
df = pd.DataFrame({
    'key1': 'a a b b a'.split(),
    'key2': 'one two one two one'.split(),
    'data1': np.random.randn(5),
    'data2': np.random.randn(5)
})

In [36]:
df

Unnamed: 0,data1,data2,key1,key2
0,-0.188869,1.247236,a,one
1,1.044754,0.90257,a,two
2,0.289516,1.373465,b,one
3,-1.251932,1.141547,b,two
4,-0.218773,-0.188184,a,one


In [37]:
g = df.groupby('key1')
g['data1'].quantile(0.9)

key1
a    0.798029
b    0.135371
Name: data1, dtype: float64

## ex.13 Create your own aggregation function

In [38]:
def peak_to_peak(arr):
    """Return the spread of ``arr``: its largest value minus its smallest."""
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest

In [39]:
g.agg(peak_to_peak)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1.263527,1.43542
b,1.541448,0.231917


## ex.14 Create a new column from existing columns

In [40]:
tips = pd.read_csv('../data/tips.csv')

In [41]:
tips.head(1)

Unnamed: 0,total_bill,tip,smoker,day,time,size
0,16.99,1.01,No,Sun,Dinner,2


In [42]:
tips['tip_pct'] = tips['tip'] / tips['total_bill']
tips.head(1)

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
0,16.99,1.01,No,Sun,Dinner,2,0.059447


## ex.15 Aggregate with multiple functions at a time

In [43]:
g = tips.groupby(['day', 'smoker'])
g_pct = g['tip_pct']
g_pct.agg('mean')

day   smoker
Fri   No        0.151650
      Yes       0.174783
Sat   No        0.158048
      Yes       0.147906
Sun   No        0.160113
      Yes       0.187250
Thur  No        0.160298
      Yes       0.163863
Name: tip_pct, dtype: float64

In [44]:
# `as_index` is an argument of `groupby()`, not of `agg()`; extra keyword
# arguments given to `agg` are meant for the aggregation functions themselves,
# so it is dropped here. The result keeps the (day, smoker) index either way.
g_pct.agg(['mean', 'std', peak_to_peak])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,peak_to_peak
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,No,0.15165,0.028123,0.067349
Fri,Yes,0.174783,0.051293,0.159925
Sat,No,0.158048,0.039767,0.235193
Sat,Yes,0.147906,0.061375,0.290095
Sun,No,0.160113,0.042347,0.193226
Sun,Yes,0.18725,0.154134,0.644685
Thur,No,0.160298,0.038774,0.19335
Thur,Yes,0.163863,0.039389,0.15124


## ex.16 The `apply` method

In [45]:
def top(df, n=5, column='tip_pct'):
    """Return the ``n`` rows of ``df`` with the largest values in ``column``.

    Rows are returned in ascending order of ``column`` (largest last).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to select rows from.
    n : int, default 5
        Number of rows to keep; ``0`` yields an empty frame.
    column : str, default 'tip_pct'
        Column to sort by.

    Returns
    -------
    pandas.DataFrame
        The last ``n`` rows of ``df`` after sorting by ``column``.
    """
    # Use ``.tail(n)`` rather than ``[-n:]``: with n=0 the slice ``[-0:]`` is
    # ``[0:]`` and would return every row, whereas ``tail(0)`` returns none.
    return df.sort_values(by=column).tail(n)

In [46]:
top(tips, n=6)

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
109,14.31,4.0,Yes,Sat,Dinner,2,0.279525
183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
232,11.61,3.39,No,Sat,Dinner,2,0.29199
67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


## ex.17 `apply` can be used on `groupby` objects

In [47]:
tips.groupby('smoker').apply(top)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,88,24.71,5.85,No,Thur,Lunch,2,0.236746
No,185,20.69,5.0,No,Sun,Dinner,5,0.241663
No,51,10.29,2.6,No,Sun,Dinner,2,0.252672
No,149,7.51,2.0,No,Thur,Lunch,2,0.266312
No,232,11.61,3.39,No,Sat,Dinner,2,0.29199
Yes,109,14.31,4.0,Yes,Sat,Dinner,2,0.279525
Yes,183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
Yes,67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
Yes,178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
Yes,172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


## ex.18 Keyword arguments in `apply`

In [48]:
tips.groupby(['smoker', 'day']).apply(top, n=1, column='total_bill')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
No,Fri,94,22.75,3.25,No,Fri,Dinner,2,0.142857
No,Sat,212,48.33,9.0,No,Sat,Dinner,4,0.18622
No,Sun,156,48.17,5.0,No,Sun,Dinner,6,0.103799
No,Thur,142,41.19,5.0,No,Thur,Lunch,5,0.121389
Yes,Fri,95,40.17,4.73,Yes,Fri,Dinner,4,0.11775
Yes,Sat,170,50.81,10.0,Yes,Sat,Dinner,3,0.196812
Yes,Sun,182,45.35,3.5,Yes,Sun,Dinner,3,0.077178
Yes,Thur,197,43.11,5.0,Yes,Thur,Lunch,4,0.115982


## ex.19 Suppress the group keys

In [49]:
foo = tips.groupby('smoker', group_keys=False).apply(top)
foo

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
88,24.71,5.85,No,Thur,Lunch,2,0.236746
185,20.69,5.0,No,Sun,Dinner,5,0.241663
51,10.29,2.6,No,Sun,Dinner,2,0.252672
149,7.51,2.0,No,Thur,Lunch,2,0.266312
232,11.61,3.39,No,Sat,Dinner,2,0.29199
109,14.31,4.0,Yes,Sat,Dinner,2,0.279525
183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


## ex.20 Off topic: the `pd.cut()` function

This is not a `groupby` thing and requires its own notebook. More on this another time.

In [65]:
# bins = [0, 5, 10, 15, 20, 999]  # You can specify your own buckets
# bar = pd.cut(foo['total_bill'], bins)

bar = pd.cut(foo['total_bill'], 3, precision=2)  # Or you can let pandas pick them

In [66]:
bar[:3]

88     (17.5, 24.71]
185    (17.5, 24.71]
51     (10.28, 17.5]
Name: total_bill, dtype: category
Categories (3, interval[float64]): [(3.05, 10.28] < (10.28, 17.5] < (17.5, 24.71]]

In [67]:
bar.values.codes

array([2, 2, 1, 0, 1, 1, 2, 0, 0, 0], dtype=int8)

In [68]:
# Use the Series method: the top-level pd.value_counts() is deprecated in
# recent pandas releases, and the method form gives the same counts.
bar.value_counts()

(3.05, 10.28]    4
(17.5, 24.71]    3
(10.28, 17.5]    3
Name: total_bill, dtype: int64

# Part Two: Recipes

## ex.21 Quantile bucket analysis

In [89]:
frame = pd.DataFrame({
    'data1': np.random.randn(1000),
    'data2': np.random.randn(1000)
})

In [90]:
# NOTE: despite its name, `quartiles` comes from pd.cut, i.e. four bins of
# equal width over the range of data1. `quantiles` comes from pd.qcut, i.e.
# four bins holding an equal number of rows; labels=False yields integer bin
# codes instead of interval labels.
quartiles = pd.cut(frame['data1'], 4)                 # Equal length buckets
quantiles = pd.qcut(frame['data1'], 4, labels=False)  # Equal size buckets

In [91]:
quartiles[:3]

0    (-1.859, -0.2]
1    (-1.859, -0.2]
2    (1.458, 3.117]
Name: data1, dtype: category
Categories (4, interval[float64]): [(-3.524, -1.859] < (-1.859, -0.2] < (-0.2, 1.458] < (1.458, 3.117]]

In [92]:
def get_stats(group):
    """Summarise ``group`` as a dict holding its min, max, count and mean."""
    summary = {}
    # Each statistic is the result of calling the same-named method on the group.
    for stat in ('min', 'max', 'count', 'mean'):
        summary[stat] = getattr(group, stat)()
    return summary

In [93]:
gl = frame['data2'].groupby(quartiles)
gs = frame['data2'].groupby(quantiles)

In [94]:
gl.apply(get_stats).unstack()

Unnamed: 0_level_0,count,max,mean,min
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(-3.524, -1.859]",30.0,2.629565,0.245635,-1.892015
"(-1.859, -0.2]",392.0,2.745324,-0.069998,-2.775437
"(-0.2, 1.458]",506.0,2.663575,-0.003343,-2.727033
"(1.458, 3.117]",72.0,2.332388,0.149044,-3.008348


In [95]:
gs.apply(get_stats).unstack()

Unnamed: 0_level_0,count,max,mean,min
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,250.0,2.629565,-0.028048,-2.775437
1,250.0,2.745324,-0.037911,-2.36393
2,250.0,2.469914,-0.060536,-2.727033
3,250.0,2.663575,0.082373,-3.008348


## ex.22 Filling in missing values with group specific values

In [96]:
s = pd.Series(np.random.randn(6))
s[::2] = np.nan
s

0         NaN
1    1.187519
2         NaN
3   -0.513823
4         NaN
5   -0.147717
dtype: float64

In [97]:
s.fillna(s.mean())

0    0.175326
1    1.187519
2    0.175326
3   -0.513823
4    0.175326
5   -0.147717
dtype: float64

In [98]:
states = ['Ohio', 'New York', 'Vermont', 'Florida', 'Oregon', 'Nevada', 'California', 'Idaho']
group_key = ['East'] * 4 + ['West'] * 4

In [101]:
data = pd.Series(np.random.randn(8), index=states)
data[['Vermont', 'Nevada', 'Idaho']] = np.nan
data

Ohio          0.076907
New York     -0.196786
Vermont            NaN
Florida      -0.995140
Oregon        1.228786
Nevada             NaN
California   -0.892534
Idaho              NaN
dtype: float64