# HSA Accelerated Pandas GroupBy Demo

In [1]:
# Move the working directory up two levels; -q suppresses the directory echo.
# NOTE(review): presumably needed so `numba_roc_examples` is importable below - confirm.
%cd -q ../..

In [2]:
from __future__ import print_function, division

import pandas as pd
import numpy as np
from timeit import default_timer as timer

from numba_roc_examples.pandas_groupby.groupby import ROCGrouper

## Make DataFrame

In [3]:
# Problem size: `nelem` rows spread over `ngroups` distinct integer keys.
ngroups = 8
nelem = 2 * 10 ** 7

# Draw from a locally seeded generator so that every Restart & Run All
# benchmarks exactly the same data and the global NumPy RNG state is untouched.
rng = np.random.RandomState(0)

# 'key' is the grouping column (cast to the platform index type);
# 'one' and 'two' are uniform [0, 1) value columns to aggregate.
df = pd.DataFrame({'key': rng.randint(0, ngroups, nelem).astype(np.intp),
                   'one': rng.random_sample(nelem),
                   'two': rng.random_sample(nelem)})


## Optimized Groupby

Compare the default pandas grouper with the HSA GPU-accelerated grouper (`ROCGrouper`). Under the hood, the dataframe is sorted by 'key' and binned into groups.

In [4]:
grouped = df.groupby(pd.Grouper('key', sort=True))
# Redo for timing
time_grouping = %timeit -o df.groupby(pd.Grouper('key', sort=True))

2.16 s ± 17 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [6]:
grouped_hsa = df.groupby(ROCGrouper('key', sort=True))
# Redo for timing
time_grouping_hsa = %timeit -o df.groupby(HSAGrouper('key', sort=True))

TypeError: _get_grouper() got an unexpected keyword argument 'validate'

## Optimized Aggregation

Compare the speed of aggregation functions on the groups produced by the default grouper and by the HSA GPU-accelerated grouper.

### `.mean()`

In [None]:
# Per-group mean on the default pandas grouping; printed as the reference result.
res_mean = grouped.mean()
print(res_mean)
# Redo for timing; -o keeps the timing result, whose .best is read by the speedup plot.
time_mean = %timeit -o grouped.mean()

In [None]:
# Per-group mean on the ROCGrouper-based grouping; printed for comparison with `res_mean`.
res_mean_hsa = grouped_hsa.mean()
print(res_mean_hsa)
# Redo for timing; -o keeps the timing result, whose .best is read by the speedup plot.
time_mean_hsa = %timeit -o grouped_hsa.mean()

### `.var()`

In [None]:
# Per-group variance on the default pandas grouping; printed as the reference result.
res_var = grouped.var()
print(res_var)
# Redo for timing; -o keeps the timing result, whose .best is read by the speedup plot.
time_var = %timeit -o grouped.var()

In [None]:
# Per-group variance on the ROCGrouper-based grouping; printed for comparison with `res_var`.
res_var_hsa = grouped_hsa.var()
print(res_var_hsa)
# Redo for timing; -o keeps the timing result, whose .best is read by the speedup plot.
time_var_hsa = %timeit -o grouped_hsa.var()

## Plot Speedup

In [None]:
from collections import OrderedDict

# `bokeh.charts` is no longer shipped with Bokeh, so the bar chart is built
# from the core plotting API instead.
from bokeh.io import output_notebook, show
from bokeh.plotting import figure

# Render Bokeh output inline in the notebook.
output_notebook()

# Speedup per operation = best default-grouper time / best accelerated time,
# so values above 1 mean the GPU path was faster. Insertion order is kept so
# the bars appear as groupby, mean, var.
data = OrderedDict()
data['groupby'] = time_grouping.best / time_grouping_hsa.best
data['mean'] = time_mean.best / time_mean_hsa.best
data['var'] = time_var.best / time_var_hsa.best

labels = list(data.keys())
speedups = list(data.values())

# A list of strings as x_range gives a categorical axis with one slot per operation.
p = figure(x_range=labels,
           title="Speedup of HSA GPU Accelerated Groupby",
           y_axis_label='GPU speedup over CPU')
p.vbar(x=labels, top=speedups, width=0.8)
# Anchor the bars at zero so their heights are visually comparable.
p.y_range.start = 0

show(p)