In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy import stats
import matplotlib.style as mplstyle
%matplotlib inline
mplstyle.use('fivethirtyeight')

# Advanced `groupby`

In [2]:
# Twelve rows: the keys cycle a, b, c while the values count up from 0.0 to 11.0.
df = pd.DataFrame({'key': ['a', 'b', 'c'] * 4,
                   'value': np.arange(12, dtype=float)})
df

Unnamed: 0,key,value
0,a,0.0
1,b,1.0
2,c,2.0
3,a,3.0
4,b,4.0
5,c,5.0
6,a,6.0
7,b,7.0
8,c,8.0
9,a,9.0


In [3]:
# group_keys=False keeps the result of g.apply(...) on the original row index
# (newer pandas versions would otherwise prepend the group key), so the apply
# and transform outputs shown later stay directly comparable.
g = df.groupby('key', group_keys=False)['value']

In [4]:
# Aggregation: each group collapses to a single mean, so the result is indexed by key.
g.mean()

key
a    4.5
b    5.5
c    6.5
Name: value, dtype: float64

In [6]:
# Same result as g.transform(lambda x: x.mean()), spelled with the string alias.
# Unlike g.mean(), the group mean is broadcast back onto every original row.
g.transform('mean')

0     4.5
1     5.5
2     6.5
3     4.5
4     5.5
5     6.5
6     4.5
7     5.5
8     6.5
9     4.5
10    5.5
11    6.5
Name: value, dtype: float64

In [7]:
# The function may also return a result as long as the group itself;
# here every value is simply doubled.
g.transform(lambda x: x*2)

0      0.0
1      2.0
2      4.0
3      6.0
4      8.0
5     10.0
6     12.0
7     14.0
8     16.0
9     18.0
10    20.0
11    22.0
Name: value, dtype: float64

In [9]:
# Ranks restart within each key (the largest value of a group gets rank 1)
# because `g` was built by grouping on 'key'.

df['rank'] = g.transform(lambda x: x.rank(ascending=False))
df

Unnamed: 0,key,value,rank
0,a,0.0,4.0
1,b,1.0,4.0
2,c,2.0,4.0
3,a,3.0,3.0
4,b,4.0,3.0
5,c,5.0,3.0
6,a,6.0,2.0
7,b,7.0,2.0
8,c,8.0,2.0
9,a,9.0,1.0


In the case below `transform` and `apply` return the same thing.

In [10]:
def normalize(x):
    """Standardize ``x``: subtract its mean, then divide by its standard deviation.

    ``x`` only needs to provide ``mean()`` and ``std()`` and to support
    arithmetic with their results (e.g. a pandas ``Series``).
    """
    centered = x - x.mean()
    return centered / x.std()

In [11]:
# Standardize each value against its own key group; show the first five rows.
g.transform(normalize)[:5]

0   -1.161895
1   -1.161895
2   -1.161895
3   -0.387298
4   -0.387298
Name: value, dtype: float64

In [12]:
# The same per-group standardization, this time routed through apply.
g.apply(normalize)[:5]

0   -1.161895
1   -1.161895
2   -1.161895
3   -0.387298
4   -0.387298
Name: value, dtype: float64

### Grouped time resampling

In [13]:
# Number of one-minute timestamps generated for the resampling examples.
N = 15

In [14]:
# N consecutive one-minute timestamps, paired with an integer counter.
times = pd.date_range(start='2017-05-20 00:00', periods=N, freq='1min')

df = pd.DataFrame({'time': times, 'value': np.arange(N)})
df

Unnamed: 0,time,value
0,2017-05-20 00:00:00,0
1,2017-05-20 00:01:00,1
2,2017-05-20 00:02:00,2
3,2017-05-20 00:03:00,3
4,2017-05-20 00:04:00,4
5,2017-05-20 00:05:00,5
6,2017-05-20 00:06:00,6
7,2017-05-20 00:07:00,7
8,2017-05-20 00:08:00,8
9,2017-05-20 00:09:00,9


In [15]:
# Move the timestamps into the index, then count the rows in each 5-minute bin.
df.set_index('time').resample('5min').count()

Unnamed: 0_level_0,value
time,Unnamed: 1_level_1
2017-05-20 00:00:00,5
2017-05-20 00:05:00,5
2017-05-20 00:10:00,5


In [16]:
# Three rows (keys a, b, c) for every timestamp; the values count up from 0.0.
df2 = pd.DataFrame({
    'time': times.repeat(3),
    'key': np.tile(['a', 'b', 'c'], N),
    'value': np.arange(3 * N, dtype=float),
})
df2.head(7)

Unnamed: 0,key,time,value
0,a,2017-05-20 00:00:00,0.0
1,b,2017-05-20 00:00:00,1.0
2,c,2017-05-20 00:00:00,2.0
3,a,2017-05-20 00:01:00,3.0
4,b,2017-05-20 00:01:00,4.0
5,c,2017-05-20 00:01:00,5.0
6,a,2017-05-20 00:02:00,6.0


### The `pd.TimeGrouper` (spelled `pd.Grouper(freq=...)` in current pandas)

In [17]:
# pd.Grouper(freq=...) is the supported spelling of the old pd.TimeGrouper,
# which has been removed from pandas; it bins a datetime index into 5-minute windows.
time_key = pd.Grouper(freq='5min')

In [18]:
# Sum per key within each time bin; the time column is moved into the index
# first so that `time_key` can bin on it.
resampled = (
    df2.set_index('time')
    .groupby(['key', time_key])
    .sum()
)
resampled

Unnamed: 0_level_0,Unnamed: 1_level_0,value
key,time,Unnamed: 2_level_1
a,2017-05-20 00:00:00,30.0
a,2017-05-20 00:05:00,105.0
a,2017-05-20 00:10:00,180.0
b,2017-05-20 00:00:00,35.0
b,2017-05-20 00:05:00,110.0
b,2017-05-20 00:10:00,185.0
c,2017-05-20 00:00:00,40.0
c,2017-05-20 00:05:00,115.0
c,2017-05-20 00:10:00,190.0


I haven't needed to do this yet, but it is a very handy thing to be able to do. One caveat: for this to work, the time values must be the index of the `Series` or `DataFrame`.

# Techniques for Method Chaining

How about assigning values to a column?

In [None]:
# Usual non-functional way
df2 = df.copy()
df2['k'] = v

# Functional assign way
df2 = df.assign(k=v)

This allows for nicer method chaining

In [None]:
# Illustrative sketch: assumes a `df2` with `col1` and `key` columns (not the
# df2 built earlier in this notebook).
# col1 is centred on its own mean, as the column name says, before taking the
# per-key standard deviation.
result = (df2.assign(col1_demeaned=df2.col1 - df2.col1.mean())
         .groupby('key')
         .col1_demeaned.std())

### Callables

In [None]:
# Usual non-functional way
df = load_data()
df2 = df[df['col2'] < 0]

# Using callables
df = (load_data()
     [lambda x: x['col2'] < 0])

So these outer parentheses... they are quite cool: they let a method chain span several lines without backslash continuations.

In [None]:
# Load, keep the rows with negative col2, centre col1 on the mean of the
# filtered rows, then take the per-key standard deviation.
result = (
    load_data()
    [lambda frame: frame['col2'] < 0]
    .assign(col1_demeaned=lambda frame: frame['col1'] - frame['col1'].mean())
    .groupby('key')['col1_demeaned']
    .std()
)

**Disclaimer from Wes**: Whether you prefer to write code in this style is a matter of taste, and splitting up the expression into multiple steps may make your code more readable.

## The `pipe` method

In [None]:
# Consider this
a = f(df, arg1=v1)
b = g(a, v2, arg3=v3)
c = h(b, arg4=v4)

# How about this then?
result = (
    df.pipe(f, arg1=v1)
    .pipe(g, v2, arg3=v3)
    .pipe(h, arg4=v4)
)

So `f(df)` is equivalent to `df.pipe(f)`. This is useful for generalizing sequences of operations into reusable functions. Consider this.

In [None]:
# Old way
g = df.groupby(['key1', 'key2'])
df['col1'] = df['col1'] - g.transform('mean')

In [None]:
# An example implementation that is more flexible
def group_demean(df, by, cols):
    result = df.copy()
    g = df.groupby(by)
    for c in cols:
        result[c] = df[c] - g[c].transform('mean')
    return result

In [None]:
# Then it is possible to do this
result = (df[df.col1 < 0]
         .pipe(group_demean, ['key1', 'key2'], ['col1']))

Ok that's cool.