# Groupby Aggregations with Dask and cuDF


## Use a DGX

In [None]:
from dask.distributed import Client, wait
from dask_cuda import LocalCUDACluster

# Start a local CUDA cluster and connect a client to it.
# `dashboard_address` supersedes the deprecated `diagnostics_port` keyword;
# the diagnostics dashboard is still served on port 9000.
cluster = LocalCUDACluster(dashboard_address=':9000')
client = Client(cluster)
client

## Create Random Dataset

We create a random dataset with an integer column, `id`, on which we will group, and a float column, `x`, on which we will perform aggregations.

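You can change the spread of values in the integer column, and hence the number of distinct groups, with the `id_lam=` parameter (the mean of the Poisson distribution from which the ids are drawn).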

In [None]:
import dask

# Synthetic time series covering the year 2000 at one row per second,
# split into two-day partitions. Only the grouping key (`id`) and the value
# column (`x`) are generated.
# `seed` pins the random state so that Restart & Run All regenerates the
# same data (and therefore the same outputs in the cells below).
df = (
    dask.datasets.timeseries(
        start='2000',
        end='2001',
        dtypes={'id': int, 'x': float},
        freq='1s',
        partition_freq='2d',
        seed=0,
        id_lam=1000,
    )
    .reset_index(drop=True)  # discard the timestamp index; it is not used here
    .persist()  # materialise the partitions in cluster memory
)


In [None]:
# Count the distinct group keys that the aggregations below will produce.
df['id'].nunique().compute()

In [None]:
# Preview the first few rows to confirm the two-column layout.
df.head(n=5)

## Convert data to GPU and persist in device memory

In [None]:
import cudf

# Convert every pandas partition with cudf.from_pandas and keep the result
# persisted on the cluster.
# NOTE(review): `df` is rebound here, so this cell is not idempotent --
# re-running it without first re-running the dataset cell would hand cuDF
# partitions to cudf.from_pandas. Confirm whether that matters for your use.
df = (
    df.map_partitions(cudf.from_pandas)
    .persist()
)

## Groupby Aggregations

In [None]:
import time

# Wall-clock timestamp taken just before the aggregations run; it is passed
# to client.profile(start=...) at the end of the notebook.
start = time.time()

In [None]:
%time df.groupby('id').x.mean().compute()

In [None]:
%%time 
dask.compute(
    df.groupby('id').min(),
    df.groupby('id').max(),
    df.groupby('id').mean(),
    df.groupby('id').count(),
)

In [None]:
# Write the profile collected since `start` to an HTML file; the return value
# is bound to `_` so the cell produces no output.
_ = client.profile(
    start=start,
    filename='dask-cudf-groupby-aggregations-profile.html',
)