# Dask DataFrame and cuDF on NYC Taxi CSV data

### Start Dask Cluster on an Eight-GPU DGX Machine

In [None]:
# Imports for the distributed client and the GPU-backed local cluster.
from dask.distributed import Client
from dask_cuda import LocalCUDACluster

# Start a LocalCUDACluster with default arguments and connect a client to it.
# NOTE(review): presumably this launches one worker per visible GPU -- confirm
# against the dask-cuda documentation.
cluster = LocalCUDACluster()
client = Client(cluster)

# Last expression of the cell: display the client.
client

### Previously we ran the following to split the files into smaller shards for cudf.read_csv

```python
import dask.dataframe as dd
# Read every 2017 yellow-taxi CSV matching the glob, parsing the pickup and
# dropoff timestamp columns as datetimes.
pdf = dd.read_csv('data/nyc/yellow_tripdata_2017-*.csv',
                 parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'])

# Repartition into 100 partitions and write them back out as CSV files under
# data/nyc/many/ (index column omitted).
pdf.repartition(npartitions=100).to_csv('data/nyc/many/*.csv', index=False)
```

## Read CSV files into Dask-GPU-DataFrame

In [None]:
import dask_cudf

# Alternative input (currently disabled): the re-sharded files under data/nyc/many/.
# gdf = dask_cudf.read_csv('data/nyc/many/*.csv')
# Read the 2017 yellow-taxi CSV files matching the glob with dask_cudf.
gdf = dask_cudf.read_csv('data/nyc/yellow_tripdata_2017-*.csv')
# Last expression of the cell: display gdf.
gdf

In [None]:
# Take the first rows of gdf and convert them with .to_pandas(); as the cell's
# last expression, the result is displayed.
gdf.head().to_pandas()

In [None]:
# Shell escape: print the first lines of the February 2017 CSV to inspect the raw file.
!head data/nyc/yellow_tripdata_2017-02.csv

### Time a full-pass computation

Most of the time here is spent reading data from disk and parsing it.

In [None]:
# Time summing the passenger_count column of gdf; .compute() triggers the evaluation.
%time gdf.passenger_count.sum().compute()

### Single GPU

In [None]:
# Same passenger_count sum, but with scheduler='single-threaded' passed to compute().
# NOTE(review): presumably this bypasses the cluster and runs in the notebook
# process on a single GPU -- confirm.
%time gdf.passenger_count.sum().compute(scheduler='single-threaded')

### Single CPU

In [None]:
import dask.dataframe as dd

# CPU baseline: read the CSV files under data/nyc/many/ with dask.dataframe.
# NOTE(review): the GPU cell reads 'data/nyc/yellow_tripdata_2017-*.csv' rather
# than these files -- confirm the two inputs are meant to differ.
df = dd.read_csv('data/nyc/many/*.csv')
# Display the type of the object returned by df.head().
type(df.head())

In [None]:
# Time the passenger_count sum over df with scheduler='single-threaded' passed to compute().
%time df.passenger_count.sum().compute(scheduler='single-threaded')

### Eight CPUs, one per process

In [None]:
# Time the same sum over df with compute()'s default scheduler.
# NOTE(review): presumably this runs on the cluster behind the client created
# earlier in the notebook -- confirm.
%time df.passenger_count.sum().compute()

### Eighty CPU threads: ten worker processes with eight threads each

In [None]:
# Close the current client and cluster before a new client is created.
client.close()
cluster.close()

In [None]:
# Create a new client with 10 workers and 8 threads per worker (10 * 8 = 80 threads).
# NOTE(review): no cluster object is passed; presumably Client starts a local
# cluster from these keyword arguments -- confirm.
client = Client(n_workers=10, threads_per_worker=8)
# Last expression of the cell: display the new client.
client

In [None]:
# Time the passenger_count sum over df again, now that the 10-worker client exists.
%time df.passenger_count.sum().compute()