# Indexed joins with Dask and cuDF



## Use a DGX

In [None]:
from dask.distributed import Client, wait
from dask_cuda import LocalCUDACluster

# Start a local CUDA cluster and connect a client to it.
# `dashboard_address` replaces the `diagnostics_port` keyword, which was
# deprecated and later removed from dask.distributed; the dashboard is still
# served on port 9000.
cluster = LocalCUDACluster(dashboard_address=':9000')
client = Client(cluster)
client  # last expression in the cell: shows the client summary

## Create Random Dataset

This runs on the CPU, and so is a little bit slow

Also, cuDF doesn't handle datetime indexes well yet, so we convert the index to an integer dtype.

In [None]:
import dask

def _int_indexed_timeseries(dtypes, freq, partition_freq):
    """Build a persisted random timeseries for 2000-2001 with an integer index.

    Parameters
    ----------
    dtypes : dict
        Mapping of column name to dtype, forwarded to
        ``dask.datasets.timeseries``.
    freq : str
        Spacing between consecutive rows (e.g. ``'10ms'``).
    partition_freq : str
        Time span covered by each partition (e.g. ``'2d'``).

    Returns
    -------
    The persisted Dask dataframe.
    """
    frame = dask.datasets.timeseries(
        '2000', '2001',
        dtypes=dtypes,
        freq=freq,
        partition_freq=partition_freq,
    )
    # The datetime index is cast to integers before the data is handed to cudf.
    frame.index = frame.index.astype(int)
    return frame.persist()


left = _int_indexed_timeseries({'id': int, 'x': float, 'y': float}, '10ms', '2d')
right = _int_indexed_timeseries({'z': float}, '100ms', '5d')

## Convert data to GPU and persist in device memory

In [None]:
import dask
import cudf

# Turn every pandas partition into a cudf dataframe, then persist both
# collections in one call so the converted data is held in device memory.
gleft, gright = dask.persist(
    left.map_partitions(cudf.from_pandas),
    right.map_partitions(cudf.from_pandas),
)

## Join on the index

The indexes of both dataframes are co-sorted, so relatively little communication has to happen.  We just need to do a bit of rearrangement so that the 2-day partitioned dataframe aligns with the 5-day partitioned dataframe.

In [None]:
# Inner join that matches rows on the index of both dataframes.
# This only builds the lazy result; nothing is computed yet.
out = gleft.merge(
    gright,
    how='inner',
    left_index=True,
    right_index=True,
)
out

In [None]:
import time
# Wall-clock timestamp taken before the join is executed; it is passed as the
# `start` argument of client.profile() further down.
start = time.time()

In [None]:
# Kick off the join, which so far has only been defined lazily.
out = out.persist()
# Block until the persisted result is finished; the IPython %time magic
# reports how long that takes.
%time _ = wait(out)

In [None]:
# Number of rows in the joined result.
len(out)

In [None]:
# Write a profile covering activity since `start` to an HTML file.
# The return value is bound to `_` so the notebook does not display it.
_ = client.profile(start=start, filename='dask-cudf-join-profile.html')

## Inspect output

In [None]:
# Peek at the first rows of the partition at position 1, converted to a
# pandas dataframe for display.
out.partitions[1].head().to_pandas()

In [None]:
from distributed.utils import format_bytes

In [None]:
# Rough size estimate for `left`: rows x columns x 8 bytes per value.
# TODO: cudf needs `.memory_usage()` method
left_rows, left_cols = len(left), len(left.columns)
format_bytes(left_rows * left_cols * 8)

In [None]:
# Rough size estimate for `right`: rows x columns x 8 bytes per value.
# TODO: cudf needs `.memory_usage()` method
right_rows, right_cols = len(right), len(right.columns)
format_bytes(right_rows * right_cols * 8)

In [None]:
# Rough size estimate for the joined result: rows x columns x 8 bytes per value.
# TODO: cudf needs `.memory_usage()` method
out_rows, out_cols = len(out), len(out.columns)
format_bytes(out_rows * out_cols * 8)