In [1]:
import pandas as pd
import numpy as np

# This is the third go around for groupby

Though I'm pretty good with DataFrames now, I still go to the documentation quite often. I'd like to make one more pass over the groupby section and then probably go look at broadcasting. On with it.

## ex.0a Create a `DataFrame` from dictionary

In [2]:
df = pd.DataFrame({
    'a': np.random.randn(5),
    'b': np.random.randn(5),
    'c': np.random.randn(5)
})

In [3]:
df

Unnamed: 0,a,b,c
0,1.752542,0.408068,-0.222211
1,-1.281449,-1.190172,0.800039
2,-0.249958,0.133799,-1.729016
3,-0.67788,-0.270429,0.387776
4,1.459083,-2.948788,0.619667


## ex.0b Select with a boolean array

In [4]:
mask = df['b'].values > 0

In [5]:
mask

array([ True, False,  True, False, False], dtype=bool)

In [6]:
df[mask]

Unnamed: 0,a,b,c
0,1.752542,0.408068,-0.222211
2,-0.249958,0.133799,-1.729016


## ex.1 Use `groupby` with a single key

In [7]:
df = pd.DataFrame({
    'key1': 'a a b b a'.split(),
    'key2': 'one two one two one'.split(),
    'data1': np.random.randn(5),
    'data2': np.random.randn(5)
})

In [8]:
df

Unnamed: 0,data1,data2,key1,key2
0,0.174921,-1.045024,a,one
1,2.881136,-1.26641,a,two
2,-0.280491,-0.589809,b,one
3,-0.87623,-1.10054,b,two
4,2.046099,-0.076936,a,one


In [9]:
g = df['data1'].groupby(df['key1'])

In [10]:
g.mean()

key1
a    1.700719
b   -0.578361
Name: data1, dtype: float64

## ex.2 Use `groupby` with arrays not in the `DataFrame`

In [11]:
states = np.array('Ohio California California Ohio Ohio'.split())
years = np.array([2005, 2005, 2006, 2005, 2006])

In [12]:
df['data1'].groupby([states, years]).mean()

California  2005    2.881136
            2006   -0.280491
Ohio        2005   -0.350655
            2006    2.046099
Name: data1, dtype: float64

## ex.3 Iterate over `grouped` object

In [13]:
for name, group in df.groupby('key1'):
    print(name)
    print(group)

a
      data1     data2 key1 key2
0  0.174921 -1.045024    a  one
1  2.881136 -1.266410    a  two
4  2.046099 -0.076936    a  one
b
      data1     data2 key1 key2
2 -0.280491 -0.589809    b  one
3 -0.876230 -1.100540    b  two


In [14]:
for (k1, k2), group in df.groupby(['key1', 'key2']):
    print((k1, k2))
    print(group)

('a', 'one')
      data1     data2 key1 key2
0  0.174921 -1.045024    a  one
4  2.046099 -0.076936    a  one
('a', 'two')
      data1    data2 key1 key2
1  2.881136 -1.26641    a  two
('b', 'one')
      data1     data2 key1 key2
2 -0.280491 -0.589809    b  one
('b', 'two')
     data1    data2 key1 key2
3 -0.87623 -1.10054    b  two


## ex.4 Group by columns based on `dtype`

In [15]:
df.dtypes

data1    float64
data2    float64
key1      object
key2      object
dtype: object

In [16]:
g = df.groupby(df.dtypes, axis=1)

In [17]:
for dtype, group in g:
    print(dtype)
    print(group)

float64
      data1     data2
0  0.174921 -1.045024
1  2.881136 -1.266410
2 -0.280491 -0.589809
3 -0.876230 -1.100540
4  2.046099 -0.076936
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one


## ex.5 Return `DataFrame` or `Series` from `groupby`

In [18]:
df.groupby(['key1', 'key2'])[['data2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,-0.56098
a,two,-1.26641
b,one,-0.589809
b,two,-1.10054


In [19]:
df.groupby(['key1', 'key2'])['data2'].mean()

key1  key2
a     one    -0.560980
      two    -1.266410
b     one    -0.589809
      two    -1.100540
Name: data2, dtype: float64

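Well, that's interesting: selecting with a list of column names (`[['data2']]`) gives back a `DataFrame`, while selecting with a single column name (`['data2']`) gives back a `Series`. Looking at the intermediate grouped objects shows why.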

In [20]:
# DataFrame
df.groupby(['key1', 'key2'])[['data2']]

<pandas.core.groupby.DataFrameGroupBy object at 0x00000000095B4278>

In [21]:
# Series
df.groupby(['key1', 'key2'])['data2']

<pandas.core.groupby.SeriesGroupBy object at 0x0000000009578B00>

## ex.6a Use `iloc` for selecting cells

In [22]:
people = pd.DataFrame(np.random.randn(5, 5),
                     columns = 'a b c d e'.split(),
                     index='Joe Steve Wes Jim Travis'.split())
people.iloc[2:3, [1, 3]] = np.nan
people

Unnamed: 0,a,b,c,d,e
Joe,2.537695,-1.012514,0.946977,0.099186,-0.674226
Steve,1.0681,-0.545905,-0.969333,-0.08582,0.011308
Wes,-0.721032,,-1.7841,,0.278285
Jim,1.003784,-0.080338,-0.484385,-0.281639,0.137039
Travis,0.527543,0.295669,0.321226,0.192492,-1.339071


## ex.6b Same thing

In [23]:
# Build the 10x10 frame in one constructor call instead of growing an empty
# DataFrame one column at a time; every column i holds the values 0..9.
df = pd.DataFrame({i: np.arange(10) for i in range(10)})

# Positional selection: rows 4 and 8, columns 2 through 6 (slice end is exclusive).
df.iloc[[4, 8], 2:7]

Unnamed: 0,2,3,4,5,6
4,4,4,4,4,4
8,8,8,8,8,8


## ex.7a Dict comprehensions to create a mapping

In [24]:
# Pair every column label with its colour group via a dict comprehension.
mapping = {
    label: colour
    for label, colour in zip('a b c d e f'.split(),
                             'red red blue blue red orange'.split())
}
mapping

{'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red', 'f': 'orange'}

## ex.7b Use a mapping to group a `DataFrame`

In [25]:
people

Unnamed: 0,a,b,c,d,e
Joe,2.537695,-1.012514,0.946977,0.099186,-0.674226
Steve,1.0681,-0.545905,-0.969333,-0.08582,0.011308
Wes,-0.721032,,-1.7841,,0.278285
Jim,1.003784,-0.080338,-0.484385,-0.281639,0.137039
Travis,0.527543,0.295669,0.321226,0.192492,-1.339071


In [26]:
by_col = people.groupby(mapping, axis=1)
by_col.sum()

Unnamed: 0,blue,red
Joe,1.046163,0.850954
Steve,-1.055153,0.533503
Wes,-1.7841,-0.442748
Jim,-0.766024,1.060484
Travis,0.513718,-0.515859


## ex.7c New mapping

In [27]:
new_map = {i:j for i, j in zip(
    people.columns,
    'fee fi foe fi fum'.split()
)}
g = people.groupby(new_map, axis=1)
g.sum()

Unnamed: 0,fee,fi,foe,fum
Joe,2.537695,-0.913328,0.946977,-0.674226
Steve,1.0681,-0.631725,-0.969333,0.011308
Wes,-0.721032,,-1.7841,0.278285
Jim,1.003784,-0.361978,-0.484385,0.137039
Travis,0.527543,0.488161,0.321226,-1.339071


## ex.8a Convert `dict` to `Series`

In [28]:
map_series = pd.Series(mapping)
map_series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

## ex.8b Use `Series` to group a `DataFrame`

In [29]:
people.groupby(map_series, axis=1).count()

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Travis,2,3


## ex.9 Use `groupby` with a function

i.e., the function is called with the index value of each record as its argument.

In [30]:
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,2.820446,-1.092852,-1.321509,-0.182453,-0.258902
5,1.0681,-0.545905,-0.969333,-0.08582,0.011308
6,0.527543,0.295669,0.321226,0.192492,-1.339071


Any function passed as a group key will be called once per index value, with the return values being used as the group names.

## ex.10 Use `groupby` with mixed grouping types

In [31]:
key_list = 'one one one two two'.split()
people.groupby([len, key_list]).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,-0.721032,-1.012514,-1.7841,0.099186,-0.674226
3,two,1.003784,-0.080338,-0.484385,-0.281639,0.137039
5,one,1.0681,-0.545905,-0.969333,-0.08582,0.011308
6,two,0.527543,0.295669,0.321226,0.192492,-1.339071


## ex.11a Create a hierarchical index

In [32]:
columns = pd.MultiIndex.from_arrays([
    'US US US JP JP'.split(),
    [1, 3, 5, 1, 3]],
    names=['city', 'tenor'])

In [33]:
hier_df = pd.DataFrame(np.random.randn(4, 5), columns=columns)
hier_df

city,US,US,US,JP,JP
tenor,1,3,5,1,3
0,-1.381718,-0.599527,0.940986,0.385178,0.325185
1,2.570295,-0.28688,-0.561581,-1.399685,1.15464
2,0.75618,-0.662956,-1.377985,0.088131,0.196116
3,-0.100253,0.161697,-0.885625,-0.387383,0.934958


## ex.11b Use `groupby` with hierarchical index

In [34]:
hier_df.groupby(level='city', axis=1).count()

city,JP,US
0,2,3
1,2,3
2,2,3
3,2,3


## ex.12 Get a group's `quantile`

The key here is that `quantile()` is a method of the `Series` object, not the `grouped` object. However, it is available as the result of a `groupby` operation.

In [35]:
df = pd.DataFrame({
    'key1': 'a a b b a'.split(),
    'key2': 'one two one two one'.split(),
    'data1': np.random.randn(5),
    'data2': np.random.randn(5)
})

In [36]:
df

Unnamed: 0,data1,data2,key1,key2
0,1.030604,-0.119197,a,one
1,-0.729972,-0.713189,a,two
2,1.421431,1.848685,b,one
3,1.902811,-1.813832,b,two
4,-0.415328,0.690393,a,one


In [37]:
g = df.groupby('key1')
g['data1'].quantile(0.9)

key1
a    0.741418
b    1.854673
Name: data1, dtype: float64

## ex.13 Create your own aggregation function

In [38]:
def peak_to_peak(arr):
    """Return the spread of ``arr``: its largest value minus its smallest."""
    lowest = arr.min()
    highest = arr.max()
    return highest - lowest

In [39]:
g.agg(peak_to_peak)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1.760576,1.403582
b,0.48138,3.662518


## ex.14 Create a new column from existing columns

In [40]:
tips = pd.read_csv('../data/tips.csv')

In [41]:
tips.head(1)

Unnamed: 0,total_bill,tip,smoker,day,time,size
0,16.99,1.01,No,Sun,Dinner,2


In [42]:
tips['tip_pct'] = tips['tip'] / tips['total_bill']
tips.head(1)

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
0,16.99,1.01,No,Sun,Dinner,2,0.059447


## ex.15 Aggregate with multiple functions at a time

In [43]:
g = tips.groupby(['day', 'smoker'])
g_pct = g['tip_pct']
g_pct.agg('mean')

day   smoker
Fri   No        0.151650
      Yes       0.174783
Sat   No        0.158048
      Yes       0.147906
Sun   No        0.160113
      Yes       0.187250
Thur  No        0.160298
      Yes       0.163863
Name: tip_pct, dtype: float64

In [44]:
# `as_index` is an option of `groupby()`, not of `agg()`; passed here it had no
# effect on the result (still indexed by day/smoker), so it is dropped.
g_pct.agg(['mean', 'std', peak_to_peak])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,peak_to_peak
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,No,0.15165,0.028123,0.067349
Fri,Yes,0.174783,0.051293,0.159925
Sat,No,0.158048,0.039767,0.235193
Sat,Yes,0.147906,0.061375,0.290095
Sun,No,0.160113,0.042347,0.193226
Sun,Yes,0.18725,0.154134,0.644685
Thur,No,0.160298,0.038774,0.19335
Thur,Yes,0.163863,0.039389,0.15124


## ex.16 The `apply` method

In [45]:
def top(df, n=5, column='tip_pct'):
    """Return the ``n`` rows of ``df`` with the largest values in ``column``.

    Rows come back sorted ascending by ``column``, so the largest is last.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to rank.
    n : int, default 5
        Number of rows to keep.
    column : str, default 'tip_pct'
        Name of the column to sort by.

    Returns
    -------
    pandas.DataFrame
        The last ``n`` rows of the sorted frame.
    """
    # ``tail`` rather than ``[-n:]``: with n=0 the slice ``[-0:]`` is ``[0:]``
    # and would return every row instead of none.
    return df.sort_values(by=column).tail(n)

In [46]:
top(tips, n=6)

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
109,14.31,4.0,Yes,Sat,Dinner,2,0.279525
183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
232,11.61,3.39,No,Sat,Dinner,2,0.29199
67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


## ex.17 `apply` can be used on `groupby` objects

In [47]:
tips.groupby('smoker').apply(top)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,88,24.71,5.85,No,Thur,Lunch,2,0.236746
No,185,20.69,5.0,No,Sun,Dinner,5,0.241663
No,51,10.29,2.6,No,Sun,Dinner,2,0.252672
No,149,7.51,2.0,No,Thur,Lunch,2,0.266312
No,232,11.61,3.39,No,Sat,Dinner,2,0.29199
Yes,109,14.31,4.0,Yes,Sat,Dinner,2,0.279525
Yes,183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
Yes,67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
Yes,178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
Yes,172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


## ex.18 Keyword arguments in `apply`

In [48]:
tips.groupby(['smoker', 'day']).apply(top, n=1, column='total_bill')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
No,Fri,94,22.75,3.25,No,Fri,Dinner,2,0.142857
No,Sat,212,48.33,9.0,No,Sat,Dinner,4,0.18622
No,Sun,156,48.17,5.0,No,Sun,Dinner,6,0.103799
No,Thur,142,41.19,5.0,No,Thur,Lunch,5,0.121389
Yes,Fri,95,40.17,4.73,Yes,Fri,Dinner,4,0.11775
Yes,Sat,170,50.81,10.0,Yes,Sat,Dinner,3,0.196812
Yes,Sun,182,45.35,3.5,Yes,Sun,Dinner,3,0.077178
Yes,Thur,197,43.11,5.0,Yes,Thur,Lunch,4,0.115982


## ex.19 Suppress the group keys

In [49]:
foo = tips.groupby('smoker', group_keys=False).apply(top)
foo

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
88,24.71,5.85,No,Thur,Lunch,2,0.236746
185,20.69,5.0,No,Sun,Dinner,5,0.241663
51,10.29,2.6,No,Sun,Dinner,2,0.252672
149,7.51,2.0,No,Thur,Lunch,2,0.266312
232,11.61,3.39,No,Sat,Dinner,2,0.29199
109,14.31,4.0,Yes,Sat,Dinner,2,0.279525
183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


## ex.20 Off topic: the `pd.cut()` function

This is not a `groupby` thing and requires its own notebook. More on this another time.

In [50]:
# bins = [0, 5, 10, 15, 20, 999]  # You can specify your own buckets
# bar = pd.cut(foo['total_bill'], bins)

bar = pd.cut(foo['total_bill'], 3, precision=2)  # Or you can let pandas pick them

In [51]:
bar[:3]

88     (17.5, 24.71]
185    (17.5, 24.71]
51     (10.28, 17.5]
Name: total_bill, dtype: category
Categories (3, interval[float64]): [(3.05, 10.28] < (10.28, 17.5] < (17.5, 24.71]]

In [52]:
bar.values.codes

array([2, 2, 1, 0, 1, 1, 2, 0, 0, 0], dtype=int8)

In [53]:
# Count how many values fall in each bin. The Series method replaces the
# top-level `pd.value_counts` helper, which newer pandas deprecates.
bar.value_counts()

(3.05, 10.28]    4
(17.5, 24.71]    3
(10.28, 17.5]    3
Name: total_bill, dtype: int64

# Part Two: Recipes

## ex.21 Quantile bucket analysis

In [54]:
frame = pd.DataFrame({
    'data1': np.random.randn(1000),
    'data2': np.random.randn(1000)
})

In [55]:
quartiles = pd.cut(frame['data1'], 4)                 # Equal length buckets
quantiles = pd.qcut(frame['data1'], 4, labels=False)  # Equal size buckets

In [56]:
quartiles[:3]

0    (-2.657, -1.066]
1     (-1.066, 0.519]
2     (-1.066, 0.519]
Name: data1, dtype: category
Categories (4, interval[float64]): [(-2.657, -1.066] < (-1.066, 0.519] < (0.519, 2.103] < (2.103, 3.688]]

In [57]:
def get_stats(group):
    """Summarise ``group`` as a dict holding its min, max, count and mean."""
    stat_names = ('min', 'max', 'count', 'mean')
    # Each key doubles as the name of the method that computes it.
    return {stat: getattr(group, stat)() for stat in stat_names}

In [58]:
gl = frame['data2'].groupby(quartiles)  # Equal length buckets
gs = frame['data2'].groupby(quantiles)  # Equal size buckets

In [59]:
gl.apply(get_stats).unstack()

Unnamed: 0_level_0,count,max,mean,min
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(-2.657, -1.066]",155.0,2.493077,-0.008709,-2.835377
"(-1.066, 0.519]",545.0,3.292861,0.066049,-3.010666
"(0.519, 2.103]",290.0,2.946447,-0.038269,-2.798617
"(2.103, 3.688]",10.0,0.903823,0.064984,-1.031332


In [60]:
gs.apply(get_stats).unstack()

Unnamed: 0_level_0,count,max,mean,min
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,250.0,2.493077,0.035639,-2.835377
1,250.0,3.292861,0.059379,-3.010666
2,250.0,2.734204,0.021232,-2.549536
3,250.0,2.946447,-0.019456,-2.798617


## ex.22 Filling in missing values with group specific values

In [61]:
states = ['Ohio', 'New York', 'Vermont', 'Florida', 'Oregon', 'Nevada', 'California', 'Idaho']
group_key = ['East'] * 4 + ['West'] * 4

data = pd.Series(np.random.randn(8), index=states)
data[['Vermont', 'Nevada', 'Idaho']] = np.nan
data

Ohio         -0.008536
New York     -0.678688
Vermont            NaN
Florida      -0.717572
Oregon       -0.484572
Nevada             NaN
California    0.801628
Idaho              NaN
dtype: float64

In [62]:
data.groupby(group_key).mean()

East   -0.468265
West    0.158528
dtype: float64

Fill values based on group data

In [63]:
def fill_mean(g):
    """Replace the missing values in ``g`` with the mean of ``g``."""
    group_mean = g.mean()
    return g.fillna(group_mean)

data.groupby(group_key).apply(fill_mean)

Ohio         -0.008536
New York     -0.678688
Vermont      -0.468265
Florida      -0.717572
Oregon       -0.484572
Nevada        0.158528
California    0.801628
Idaho         0.158528
dtype: float64

Fill with predefined values

In [64]:
fill_values = {'East': 0.5, 'West': -1}


def fill_func(g):
    """Fill the gaps in ``g`` with the preset value looked up by ``g.name``."""
    replacement = fill_values[g.name]
    return g.fillna(replacement)

data.groupby(group_key).apply(fill_func)

Ohio         -0.008536
New York     -0.678688
Vermont       0.500000
Florida      -0.717572
Oregon       -0.484572
Nevada       -1.000000
California    0.801628
Idaho        -1.000000
dtype: float64

## ex.23 Random sampling and permutation

In [65]:
suits = 'H S C D'.split()
card_val = (list(range(1,11)) + [10] * 3) * 4
base_names = ['A'] + list(range(2,11)) + ['J', 'K', 'Q']
cards = []

In [66]:
# Build the card labels from scratch in this cell (rank + suit, e.g. 'AH',
# '10H') so that re-running the cell does not keep appending to `cards` and
# break the length match with `card_val`.
cards = [str(num) + suit for suit in suits for num in base_names]

# One entry per card: label as the index, point value as the data.
deck = pd.Series(card_val, index=cards)

In [67]:
deck[:13]

AH      1
2H      2
3H      3
4H      4
5H      5
6H      6
7H      7
8H      8
9H      9
10H    10
JH     10
KH     10
QH     10
dtype: int64

In [68]:
def draw(deck, n=5, random_state=None):
    """Draw ``n`` entries at random from ``deck``.

    Parameters
    ----------
    deck : pandas.Series
        Series to sample from.
    n : int, default 5
        Number of entries to draw.
    random_state : int or numpy.random.RandomState, optional
        Forwarded to ``Series.sample`` so a draw can be reproduced. The
        default ``None`` keeps the original unseeded behaviour.

    Returns
    -------
    pandas.Series
        The sampled entries, keeping their original index labels.
    """
    return deck.sample(n=n, random_state=random_state)

draw(deck)

2D     2
JD    10
QC    10
AD     1
6D     6
dtype: int64

Sample two cards from each suit

In [69]:
def get_suit(card):
    """Return the suit of ``card``: the final character of its label."""
    return card[-1]

deck.groupby(get_suit).apply(draw, n=2)

C  8C     8
   JC    10
D  3D     3
   5D     5
H  7H     7
   5H     5
S  8S     8
   6S     6
dtype: int64

In [70]:
# Also this
deck.groupby(get_suit, group_keys=False).apply(draw, n=2)

8C      8
9C      9
4D      4
JD     10
10H    10
JH     10
JS     10
7S      7
dtype: int64