In [5]:
import sys
import os
import time

sys.path.append(os.path.abspath(".."))


from cluster_config.local_cluster import *
from dask import dataframe as dd

In [6]:
client = get_local_cluster(
    n_workers=4,
    threads_per_worker=2,
    memory_limit='6GB'
)

Local cluster created with 4 workers, 2 threads per worker, and 6GB memory limit.
Dashboard link: http://127.0.0.1:8787/status




In [7]:
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads: 8,Total memory: 22.35 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:60989,Workers: 0
Dashboard: http://127.0.0.1:8787/status,Total threads: 0
Started: Just now,Total memory: 0 B

0,1
Comm: tcp://127.0.0.1:61011,Total threads: 2
Dashboard: http://127.0.0.1:61017/status,Memory: 5.59 GiB
Nanny: tcp://127.0.0.1:60992,
Local directory: C:\Users\natal\AppData\Local\Temp\dask-scratch-space\worker-t5y07wta,Local directory: C:\Users\natal\AppData\Local\Temp\dask-scratch-space\worker-t5y07wta

0,1
Comm: tcp://127.0.0.1:61010,Total threads: 2
Dashboard: http://127.0.0.1:61015/status,Memory: 5.59 GiB
Nanny: tcp://127.0.0.1:60994,
Local directory: C:\Users\natal\AppData\Local\Temp\dask-scratch-space\worker-qnl9xipo,Local directory: C:\Users\natal\AppData\Local\Temp\dask-scratch-space\worker-qnl9xipo

0,1
Comm: tcp://127.0.0.1:61012,Total threads: 2
Dashboard: http://127.0.0.1:61019/status,Memory: 5.59 GiB
Nanny: tcp://127.0.0.1:60996,
Local directory: C:\Users\natal\AppData\Local\Temp\dask-scratch-space\worker-4z54g941,Local directory: C:\Users\natal\AppData\Local\Temp\dask-scratch-space\worker-4z54g941

0,1
Comm: tcp://127.0.0.1:61009,Total threads: 2
Dashboard: http://127.0.0.1:61013/status,Memory: 5.59 GiB
Nanny: tcp://127.0.0.1:60998,
Local directory: C:\Users\natal\AppData\Local\Temp\dask-scratch-space\worker-c4892km6,Local directory: C:\Users\natal\AppData\Local\Temp\dask-scratch-space\worker-c4892km6


## Reading the data

In [8]:
time_start = time.perf_counter()
df = dd.read_parquet("../data/yellow_tripdata_2023-*.parquet", blocksize="16MB")
time_elapsed = time.perf_counter() - time_start
print(f'Time to read data: {time_elapsed:.2f} seconds')

Time to read data: 0.24 seconds


In [9]:
n_rows = df.shape[0].compute()
n_cols = df.shape[1]
print(f"{n_rows:,} rows × {n_cols} columns")

38,310,226 rows × 19 columns


In [10]:
print("Columns:", df.columns)
print("Partition number:", df.npartitions)

Columns: Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'airport_fee'],
      dtype='object')
Partition number: 25


In [11]:
df.head(5)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0


In [12]:
print(df.dtypes)


VendorID                           int64
tpep_pickup_datetime      datetime64[us]
tpep_dropoff_datetime     datetime64[us]
passenger_count                  float64
trip_distance                    float64
RatecodeID                       float64
store_and_fwd_flag       string[pyarrow]
PULocationID                       int64
DOLocationID                       int64
payment_type                       int64
fare_amount                      float64
extra                            float64
mta_tax                          float64
tip_amount                       float64
tolls_amount                     float64
improvement_surcharge            float64
total_amount                     float64
congestion_surcharge             float64
airport_fee                      float64
dtype: object


### Simple data analysis operations

In [13]:
df['month'] = df['tpep_pickup_datetime'].dt.month
df['year'] = df['tpep_pickup_datetime'].dt.year
df['trip_distance_km'] = df['trip_distance'] * 1.60934

In [14]:
# Grouping
time_start = time.perf_counter()
courses_per_month_and_year = df.groupby(['month', 'year']).size().compute()
time_elapsed = time.perf_counter() - time_start
print(f'Time to group data: {time_elapsed:.2f} seconds')

Time to group data: 5.31 seconds


In [15]:
print(df['year'].value_counts().compute())

year
2008          23
2022          36
2023    38310122
2003           6
2009          15
2001           6
2014           1
2002          11
2024           6
Name: count, dtype: int64


In [16]:
df["passenger_count"].value_counts().compute()

passenger_count
0.0      583005
3.0     1394693
5.0      483233
6.0      316969
2.0     5609105
8.0         261
9.0          55
1.0    27823459
4.0      789997
7.0          93
Name: count, dtype: int64

In [17]:
time_start = time.perf_counter()
df['trip_duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds() / 60
df['trip_duration'].mean().compute()
time_elapsed = time.perf_counter() - time_start
print(f'Time to calculate mean: {time_elapsed:.2f} seconds')

Time to calculate mean: 2.89 seconds


In [19]:
close_local_cluster(client)