In [None]:
import sys
import os

sys.path.append(os.path.abspath(".."))


from cluster_config.local_cluster import *
from utils.measure import measure_time_and_memory
from utils.csv_saver import CSVSaver
from dask import dataframe as dd

In [36]:
# Start the local cluster that every benchmark in this notebook runs against.
cluster_options = {
    "n_workers": 4,
    "threads_per_worker": 2,
    "memory_limit": '4GB',
}
client = get_local_cluster(**cluster_options)

Local cluster created with 4 workers, 2 threads per worker, and 4GB memory limit.
Dashboard link: http://127.0.0.1:8787/status


In [17]:
# Bare expression: show the client's rich repr (scheduler and per-worker summary).
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads: 8,Total memory: 14.90 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:37423,Workers: 4
Dashboard: http://127.0.0.1:8787/status,Total threads: 8
Started: Just now,Total memory: 14.90 GiB

0,1
Comm: tcp://127.0.0.1:37663,Total threads: 2
Dashboard: http://127.0.0.1:39219/status,Memory: 3.73 GiB
Nanny: tcp://127.0.0.1:33477,
Local directory: /tmp/dask-scratch-space/worker-j2sekmt2,Local directory: /tmp/dask-scratch-space/worker-j2sekmt2

0,1
Comm: tcp://127.0.0.1:46723,Total threads: 2
Dashboard: http://127.0.0.1:38049/status,Memory: 3.73 GiB
Nanny: tcp://127.0.0.1:44049,
Local directory: /tmp/dask-scratch-space/worker-dta_6k_8,Local directory: /tmp/dask-scratch-space/worker-dta_6k_8

0,1
Comm: tcp://127.0.0.1:38499,Total threads: 2
Dashboard: http://127.0.0.1:46321/status,Memory: 3.73 GiB
Nanny: tcp://127.0.0.1:42243,
Local directory: /tmp/dask-scratch-space/worker-4ipx9ka5,Local directory: /tmp/dask-scratch-space/worker-4ipx9ka5

0,1
Comm: tcp://127.0.0.1:38207,Total threads: 2
Dashboard: http://127.0.0.1:36037/status,Memory: 3.73 GiB
Nanny: tcp://127.0.0.1:33115,
Local directory: /tmp/dask-scratch-space/worker-a4wptsvg,Local directory: /tmp/dask-scratch-space/worker-a4wptsvg


In [18]:
# Shared saver handed to every measure_time_and_memory decorator below.
# NOTE(review): presumably persists each measurement to a CSV file — confirm in utils.csv_saver.
csv_saver = CSVSaver()

In [19]:
# Cluster label passed to every benchmark decorator in this notebook.
CLUSTER_TYPE = "LOCAL"

# 40 mln dataset analysis

In [20]:
# Dataset-size label for this section. Decorator arguments are evaluated when a
# function is defined, so the benchmark functions below must be (re)defined
# after this assignment to pick up the label.
INPUT_SIZE = "40 MLN"

In [21]:
@measure_time_and_memory("READ DATA", CLUSTER_TYPE, INPUT_SIZE, csv_saver)
def read_data():
    """Open the 2023 ``yellow_tripdata`` parquet files as a Dask DataFrame.

    Returns:
        The collection returned by ``dd.read_parquet``; this function does not
        call ``compute`` itself.
    """
    parquet_glob = "../data/yellow_tripdata_2023*.parquet"
    trips = dd.read_parquet(parquet_glob, blocksize="16MB")
    return trips

df = read_data()

[READ DATA] Time: 0.0139s | RAM usage: 0.00


In [22]:
@measure_time_and_memory("HEAD-5", CLUSTER_TYPE, INPUT_SIZE, csv_saver)
def head():
    """Fetch the first five rows of the notebook-level ``df``."""
    first_rows = df.head(n=5)
    return first_rows

head()

[HEAD-5] Time: 55.8717s | RAM usage: 0.00


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0


In [23]:
# Add the pickup month/year and a kilometre distance column in one step.
# 1.60934 is the kilometres-per-mile factor — assumes trip_distance is in miles.
df = df.assign(
    month=df['tpep_pickup_datetime'].dt.month,
    year=df['tpep_pickup_datetime'].dt.year,
    trip_distance_km=df['trip_distance'] * 1.60934,
)

In [24]:
@measure_time_and_memory("GROUP_BY_MONTH_AND_YEAR", CLUSTER_TYPE, INPUT_SIZE, csv_saver)
def group_by_month_and_year(df):
    """Count rows for every (month, year) pair and materialise the result.

    Args:
        df: Dask DataFrame that already carries ``month`` and ``year`` columns.

    Returns:
        The computed result of ``groupby(['month', 'year']).size()``.
    """
    group_keys = ['month', 'year']
    lazy_counts = df.groupby(group_keys).size()
    return lazy_counts.compute()

group_by_month_and_year(df)

[GROUP_BY_MONTH_AND_YEAR] Time: 253.4607s | RAM usage: 1.05


month  year
1      2001          6
       2003          6
       2009         15
       2023    3066726
2      2023    2914003
3      2023    3403660
4      2023    3288248
5      2023    3513664
6      2023    3307259
7      2023    2907093
8      2023    2824201
9      2023    2846741
10     2022         11
11     2014          1
12     2002         11
       2008         23
       2022         25
10     2023    3522269
11     2023    3339731
1      2024          6
12     2023    3376527
dtype: int64

In [25]:
@measure_time_and_memory("MEAN_TRIP_DURATION", CLUSTER_TYPE, INPUT_SIZE, csv_saver)
def get_mean_trip_duration(df):
    """Compute the mean trip duration in minutes.

    The duration is built as a standalone lazy series instead of being written
    back into ``df``, so the caller's frame is not mutated by the benchmark and
    re-running the cell always starts from the same input.

    Args:
        df: Dask DataFrame with ``tpep_pickup_datetime`` and
            ``tpep_dropoff_datetime`` columns.

    Returns:
        The computed mean of (dropoff - pickup), expressed in minutes.
    """
    elapsed = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
    trip_duration_min = elapsed.dt.total_seconds() / 60  # seconds -> minutes
    return trip_duration_min.mean().compute()

mean = get_mean_trip_duration(df)

[MEAN_TRIP_DURATION] Time: 210.4896s | RAM usage: 1.31


# 80 mln dataset analysis

In [26]:
# Dataset-size label for the second section. The benchmark functions are
# re-defined below so their decorators are evaluated with this new value.
INPUT_SIZE = "80 MLN"

In [27]:
@measure_time_and_memory("READ DATA", CLUSTER_TYPE, INPUT_SIZE, csv_saver)
def read_data():
    """Open the 2023 and 2024 ``yellow_tripdata`` parquet files as a Dask DataFrame.

    Returns:
        The collection returned by ``dd.read_parquet``; this function does not
        call ``compute`` itself.
    """
    parquet_glob = "../data/yellow_tripdata_202[3-4]*.parquet"
    trips = dd.read_parquet(parquet_glob, blocksize="16MB")
    return trips

df = read_data()

[READ DATA] Time: 0.0131s | RAM usage: 0.00


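Reading the data is a lazy operation: `dd.read_parquet` only sets up the DataFrame, so the timing above does not include loading any rows.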

In [28]:
# The row count is lazy and has to be computed; the column count is available immediately.
lazy_row_count, n_cols = df.shape
n_rows = lazy_row_count.compute()
print(f"{n_rows:,} rows × {n_cols} columns")

79,479,946 rows × 19 columns


In [29]:
# Inspect the column index and how many partitions the frame is split into.
print("Columns:", df.columns)
print("Partition number:", df.npartitions)

Columns: Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'airport_fee'],
      dtype='object')
Partition number: 69


In [30]:
@measure_time_and_memory("HEAD-5", CLUSTER_TYPE, INPUT_SIZE, csv_saver)
def head():
    """Fetch the first five rows of the notebook-level ``df``."""
    first_rows = df.head(n=5)
    return first_rows

head()

[HEAD-5] Time: 57.3984s | RAM usage: 0.52


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0


In [31]:
# Show the dtype of every column.
print(df.dtypes)

VendorID                           int64
tpep_pickup_datetime      datetime64[us]
tpep_dropoff_datetime     datetime64[us]
passenger_count                  float64
trip_distance                    float64
RatecodeID                       float64
store_and_fwd_flag       string[pyarrow]
PULocationID                       int64
DOLocationID                       int64
payment_type                       int64
fare_amount                      float64
extra                            float64
mta_tax                          float64
tip_amount                       float64
tolls_amount                     float64
improvement_surcharge            float64
total_amount                     float64
congestion_surcharge             float64
airport_fee                      float64
dtype: object


In [32]:
# Add the pickup month/year and a kilometre distance column in one step.
# 1.60934 is the kilometres-per-mile factor — assumes trip_distance is in miles.
df = df.assign(
    month=df['tpep_pickup_datetime'].dt.month,
    year=df['tpep_pickup_datetime'].dt.year,
    trip_distance_km=df['trip_distance'] * 1.60934,
)

In [33]:
@measure_time_and_memory("GROUP_BY_MONTH_AND_YEAR", CLUSTER_TYPE, INPUT_SIZE, csv_saver)
def group_by_month_and_year(df):
    """Count rows for every (month, year) pair and materialise the result.

    Args:
        df: Dask DataFrame that already carries ``month`` and ``year`` columns.

    Returns:
        The computed result of ``groupby(['month', 'year']).size()``.
    """
    group_keys = ['month', 'year']
    lazy_counts = df.groupby(group_keys).size()
    return lazy_counts.compute()

grouped_one = group_by_month_and_year(df)

[GROUP_BY_MONTH_AND_YEAR] Time: 675.1720s | RAM usage: 3.41


In [34]:
@measure_time_and_memory("MEAN_TRIP_DURATION", CLUSTER_TYPE, INPUT_SIZE, csv_saver)
def get_mean_trip_duration(df):
    """Compute the mean trip duration in minutes.

    The duration is built as a standalone lazy series instead of being written
    back into ``df``, so the caller's frame is not mutated by the benchmark and
    re-running the cell always starts from the same input.

    Args:
        df: Dask DataFrame with ``tpep_pickup_datetime`` and
            ``tpep_dropoff_datetime`` columns.

    Returns:
        The computed mean of (dropoff - pickup), expressed in minutes.
    """
    elapsed = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
    trip_duration_min = elapsed.dt.total_seconds() / 60  # seconds -> minutes
    return trip_duration_min.mean().compute()

mean = get_mean_trip_duration(df)

[MEAN_TRIP_DURATION] Time: 431.6492s | RAM usage: 1.31


# Extremely small dataset analysis

In [37]:
# Dataset-size label for the tiny in-memory dataset section.
# NOTE(review): the table built in the next cell has 4 rows, not 3 — confirm
# whether this label should read "4" before comparing results across runs.
INPUT_SIZE = "3"

In [38]:
# Column-oriented records for the small in-memory table.
data = dict(
    Name=['John', 'Emma', 'Michael', 'Sophia'],
    Age=[28, 32, 25, 30],
    Department=['Sales', 'Marketing', 'Finance', 'HR'],
)

In [39]:
# Build a two-partition Dask DataFrame from the dict; note that this rebinds
# `df`, replacing the parquet-backed frame from the previous section.
df = dd.from_dict(data, npartitions=2)

In [None]:
@measure_time_and_memory("MEAN AND FILTER", CLUSTER_TYPE, INPUT_SIZE, csv_saver)
def calculate_mean_and_filter(df):
    """Compute the mean age and the rows whose age is above 27.

    Both results are returned so the timed work is not silently discarded and
    can be inspected after the benchmark.

    Args:
        df: Dask DataFrame with a numeric ``Age`` column.

    Returns:
        tuple: ``(average_age, filtered_df)`` — the computed mean of ``Age``
        and the computed frame of rows where ``Age > 27``.
    """
    average_age = df['Age'].mean().compute()
    filtered_df = df[df['Age'] > 27].compute()
    return average_age, filtered_df

# Bind the results so they can be inspected; the cell still displays nothing.
average_age, filtered_df = calculate_mean_and_filter(df)

[MEAN AND FILTER] Time: 0.0970s | RAM usage: 0.26


# Close local cluster

In [45]:
# Release the cluster created at the top of the notebook.
# NOTE(review): helper arrives via the star import from cluster_config.local_cluster;
# presumably it shuts down both the workers and the client — confirm.
close_local_cluster(client)