# Benchmark: RAPIDS cuDF vs. pandas on NYC yellow taxi trip data

In [1]:
# Record the GPU, driver version and CUDA version of the machine running the benchmark.
!nvidia-smi

Fri Sep 16 06:48:37 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.00       Driver Version: 525.26       CUDA Version: 11.8     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:2B:00.0  On |                  N/A |
| 30%   49C    P0    58W / 250W |    600MiB / 11264MiB |      3%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Download Dataset

In [2]:
import requests
import pathlib
import cudf
import pandas as pd
import time

In [3]:
# Benchmark configuration: the year of trip data to use and the folder holding its files.
year = "2021"
directory = "data/" + year + "/"

In [4]:
def download_files(year, download_directory):
    """Download the twelve monthly yellow-taxi parquet files for ``year``.

    Parameters
    ----------
    year : str or int
        Year interpolated into the file names
        (``yellow_tripdata_{year}-01.parquet`` ... ``-12.parquet``).
    download_directory : str or path-like
        Destination folder; created (including parents) if it does not exist.

    Raises
    ------
    requests.HTTPError
        If the server answers any file request with an error status.
        Without this check an error page would be saved under a ``.parquet``
        name and only fail later when the directory is read.
    requests.Timeout
        If the server stops responding while a file is being fetched.
    """
    # Make directory if it does not exist
    target_dir = pathlib.Path(download_directory)
    target_dir.mkdir(parents=True, exist_ok=True)

    # Download yellow taxi data
    # Data from: https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page
    url = "https://d37ci6vzurychx.cloudfront.net/trip-data/"
    files = [f"yellow_tripdata_{year}-{n:02d}.parquet" for n in range(1, 13)]
    for file in files:
        # The timeout bounds connect/read stalls so a dead connection cannot hang the cell.
        r = requests.get(f"{url}{file}", allow_redirects=True, timeout=60)
        # Fail before touching the disk so no bogus parquet file is left behind.
        r.raise_for_status()
        # Joining with pathlib avoids a doubled slash when the directory ends in "/".
        with open(target_dir / file, "wb") as f:
            f.write(r.content)

    print("Finished downloading files...")

# uncomment to download files
# download_files(year, directory)

## RAPIDS cuDF vs. pandas

## cuDF

In [5]:
%%time

# Load the parquet data found under `directory` with cuDF; %%time reports the cost of the read.
rapids_df = cudf.read_parquet(directory)

CPU times: user 1.18 s, sys: 302 ms, total: 1.48 s
Wall time: 1.62 s


In [6]:
%%time

# Timed cuDF workload: filter, derive two columns, then aggregate.
# Keep only trips with a positive fare, tip and passenger count and a non-zero distance.
rapids_df = rapids_df.query(
    "fare_amount > 0 and tip_amount > 0 and passenger_count > 0 and trip_distance != 0"
)
# Despite the column name this is a ratio (tip / fare), not a value scaled to 0-100.
rapids_df["tip_percentage"] = rapids_df["tip_amount"] / rapids_df["fare_amount"]
# Hour component of the pickup timestamp, used as the grouping key below.
rapids_df["pickup_hour"] = rapids_df["tpep_pickup_datetime"].dt.hour
# Mean tip ratio per pickup hour, with the group keys sorted.
hours = rapids_df.groupby("pickup_hour", sort=True)["tip_percentage"].mean()


CPU times: user 575 ms, sys: 79.3 ms, total: 654 ms
Wall time: 799 ms


## Pandas

In [7]:
%%time

# Load the same parquet data from `directory` with pandas; %%time reports the cost of the read.
pandas_df = pd.read_parquet(directory)

CPU times: user 13.3 s, sys: 11.7 s, total: 25.1 s
Wall time: 2.78 s


In [8]:
%%time

pandas_df = pandas_df.query(
    "fare_amount > 0 and tip_amount > 0 and passenger_count > 0 and trip_distance != 0"
)
pandas_df["tip_percentage"] = pandas_df["tip_amount"] / pandas_df["fare_amount"]
pandas_df["pickup_hour"] = pandas_df["tpep_pickup_datetime"].dt.hour
hours = pandas_df.groupby("passenger_count", sort=True)["tip_percentage"].mean()


CPU times: user 3.2 s, sys: 1.23 s, total: 4.43 s
Wall time: 4.34 s
