# PyArrow Strings in Dask DataFrames

Blog post: https://www.coiled.io/blog/pyarrow-strings-in-dask-dataframes

In [None]:
import dask

# Turn automatic PyArrow strings on.
# This is set before the dataset is read below, so the first run of the
# benchmark uses PyArrow-backed string columns. The trailing ";" keeps the
# notebook from echoing the return value of `dask.config.set`.
dask.config.set({"dataframe.convert-string": True});

In [None]:
import coiled

In [None]:
%%time
# Start a Coiled cluster named "pyarrow-dtypes-demo" with 15 workers in the
# AWS region us-east-2. This is the smaller of the two cluster sizes used in
# this notebook; it is scaled up later for the run without PyArrow strings.
cluster = coiled.Cluster(
    name="pyarrow-dtypes-demo",
    n_workers=15,
    backend_options={"region": "us-east-2"},
)

In [None]:
# Get a Dask client connected to the Coiled cluster; the later cells use it
# to wait for workers, and the DataFrame work below runs on this cluster.
client = cluster.get_client()
# Bare last expression: show the client's summary as the cell output.
client

In [None]:
import dask.dataframe as dd
from dask.distributed import wait

In [None]:
# Define the DataFrame over the Parquet dataset in the public
# "coiled-datasets" bucket, accessed anonymously (no AWS credentials).
df = dd.read_parquet(
    "s3://coiled-datasets/uber-lyft-tlc/", storage_options={"anon": True}
)

# Ask the cluster to load the data into memory, then block until that has
# finished so the timed cell that follows does not include the load.
df = df.persist()
wait(df);  # ~95 GB

In [None]:
%%time

# Flag each row whose tip amount is non-zero.
df = df.assign(tipped=df.tips != 0)

# The mean of a boolean flag is the fraction of flagged rows, so this gives
# the share of tipped rows for every value of `hvfhs_license_num`.
df.groupby(df.hvfhs_license_num)["tipped"].mean().compute()

## Turn off PyArrow Strings

In [None]:
# Turn automatic PyArrow strings off.
# The dataset is read again further down, so the second run of the benchmark
# is done with this setting disabled. The trailing ";" suppresses the echoed
# return value of `dask.config.set`.

dask.config.set({"dataframe.convert-string": False});

Without PyArrow strings, the same dataset takes up roughly 225 GB of cluster memory instead of roughly 95 GB, so we need a bigger cluster.

In [None]:
%%time
# Grow the cluster from its initial 15 workers to 45 and block until all of
# them have joined. The target is named once so the scale request and the
# wait always agree on the same number.
n_workers = 45
cluster.scale(n_workers)
client.wait_for_workers(n_workers)

This is exactly the same computation as before; the only difference is that PyArrow strings are now turned off.

In [None]:
# Re-create the DataFrame from the same public Parquet dataset (anonymous
# access). The "dataframe.convert-string" option is now False, so this read
# does not use automatic PyArrow strings.
df = dd.read_parquet(
    "s3://coiled-datasets/uber-lyft-tlc/", storage_options={"anon": True}
)

# Load the data into cluster memory and block until loading has finished, so
# the timed cell that follows does not include the load.
df = df.persist()
wait(df);  # ~225 GB

In [None]:
%%time

# Flag each row whose tip amount is non-zero.
df = df.assign(tipped=df.tips != 0)

# Share of tipped rows for every value of `hvfhs_license_num` (the mean of a
# boolean flag is the fraction of rows where it is True).
df.groupby(df.hvfhs_license_num)["tipped"].mean().compute()

## Conclusion

PyArrow strings are a significant win for the entire PyData community. They're also a work in progress.

Please try them out and let us know about your experience in this GitHub issue: https://github.com/dask/dask/issues/10139