In [60]:
import sys
import os

sys.path.append(os.path.abspath(".."))


from dask import dataframe as dd
from cluster_config.slurm_cluster import *
from utils.measure import measure_time_and_memory
from utils.csv_saver import CSVSaver

In [61]:
# Obtain the handle used for the rest of the notebook; the same object is
# passed to close_slurm_cluster() in the final cell.
# NOTE(review): get_slurm_cluster presumably comes from the wildcard import of
# cluster_config.slurm_cluster — confirm and consider importing it by name.
client = get_slurm_cluster()

Dashboard: http://172.23.30.9:8787/status


In [42]:
# Show the cluster summary (dashboard link, worker/thread/memory totals).
# NOTE(review): the captured output reports 0 workers at this point; if the
# benchmarks below should only start once workers are up, wait for them first
# (e.g. with client.wait_for_workers, assuming this is a distributed.Client).
client

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.SLURMCluster
Dashboard: http://172.23.30.9:8787/status,

0,1
Dashboard: http://172.23.30.9:8787/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://172.23.30.9:33779,Workers: 0
Dashboard: http://172.23.30.9:8787/status,Total threads: 0
Started: Just now,Total memory: 0 B


In [43]:
# Single CSVSaver instance handed to every measure_time_and_memory decorator
# below (presumably collects the timing/RAM rows — confirm in utils.csv_saver).
csv_saver = CSVSaver()

In [None]:
# Label passed to every measure_time_and_memory decorator in this notebook to
# tag the measurements with the cluster backend in use.
CLUSTER_TYPE = "SLURM"

# 40 million row dataset analysis

In [45]:
# Dataset-size label for the measurements in this section. The decorators
# below read it when the functions are defined, so it must be set before
# those cells are run.
INPUT_SIZE = "40 MLN"

In [46]:
@measure_time_and_memory("READ DATA", CLUSTER_TYPE, INPUT_SIZE, csv_saver)
def read_data():
    """Open every 2023 yellow-taxi parquet file under ../data as one Dask DataFrame.

    NOTE(review): the recorded time for this step is a few milliseconds, which
    suggests only the lazy frame is built here and no rows are loaded yet.
    """
    parquet_glob = "../data/yellow_tripdata_2023*.parquet"
    return dd.read_parquet(parquet_glob, blocksize="16MB")

df = read_data()

[READ DATA] Time: 0.0147s | RAM usage: 0.00


In [47]:
@measure_time_and_memory("HEAD-5", CLUSTER_TYPE, INPUT_SIZE, csv_saver)
def head():
    """Fetch the first five rows of the notebook-level ``df``."""
    first_rows = df.head(5)
    return first_rows

head()

[HEAD-5] Time: 1.1759s | RAM usage: 0.26


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0


In [48]:
# Derive the grouping keys from the pickup timestamp and a kilometre version of
# the trip distance (1 mile = 1.60934 km) in a single assign call.
df = df.assign(
    month=df['tpep_pickup_datetime'].dt.month,
    year=df['tpep_pickup_datetime'].dt.year,
    trip_distance_km=df['trip_distance'] * 1.60934,
)

@measure_time_and_memory("GROUP_BY_MONTH_AND_YEAR", CLUSTER_TYPE, INPUT_SIZE, csv_saver)
def group_by_month_and_year(df):
    """Count the rows in each (month, year) group and compute the result."""
    rows_per_group = df.groupby(['month', 'year']).size()
    return rows_per_group.compute()

group_by_month_and_year(df)

[GROUP_BY_MONTH_AND_YEAR] Time: 1.8599s | RAM usage: 0.00


month  year
1      2001          6
       2003          6
       2009         15
       2023    3066726
2      2023    2914003
3      2023    3403660
4      2023    3288248
5      2023    3513664
6      2023    3307259
7      2023    2907093
8      2023    2824201
9      2023    2846741
10     2022         11
11     2014          1
12     2002         11
       2008         23
       2022         25
10     2023    3522269
11     2023    3339731
1      2024          6
12     2023    3376527
dtype: int64

In [49]:
@measure_time_and_memory("MEAN_TRIP_DURATION", CLUSTER_TYPE, INPUT_SIZE, csv_saver)
def get_mean_trip_duration(df):
    """Return the mean trip duration in minutes.

    The duration is the drop-off timestamp minus the pickup timestamp,
    converted from seconds to minutes.

    The per-trip series is kept local instead of being written back as a
    ``trip_duration`` column, so timing this function no longer modifies the
    frame the caller passed in.
    """
    elapsed = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
    duration_minutes = elapsed.dt.total_seconds() / 60
    return duration_minutes.mean().compute()

mean = get_mean_trip_duration(df)

[MEAN_TRIP_DURATION] Time: 1.3324s | RAM usage: 0.00


# 80 million row dataset analysis

In [50]:
# Dataset-size label for the measurements in this section. The functions are
# redefined below so that their decorators pick up this new value.
INPUT_SIZE = "80 MLN"

In [51]:
@measure_time_and_memory("READ DATA", CLUSTER_TYPE, INPUT_SIZE, csv_saver)
def read_data():
    """Open the 2023 and 2024 yellow-taxi parquet files under ../data as one Dask DataFrame.

    The ``202[3-4]`` part of the glob matches file names for either year.
    """
    parquet_glob = "../data/yellow_tripdata_202[3-4]*.parquet"
    return dd.read_parquet(parquet_glob, blocksize="16MB")

df = read_data()

[READ DATA] Time: 0.0109s | RAM usage: 0.00


In [52]:
@measure_time_and_memory("HEAD-5", CLUSTER_TYPE, INPUT_SIZE, csv_saver)
def head():
    """Fetch the first five rows of the notebook-level ``df``."""
    preview = df.head(5)
    return preview

head()

[HEAD-5] Time: 1.1191s | RAM usage: 0.00


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0


In [53]:
# Derive the grouping keys from the pickup timestamp and a kilometre version of
# the trip distance (1 mile = 1.60934 km) in a single assign call.
df = df.assign(
    month=df['tpep_pickup_datetime'].dt.month,
    year=df['tpep_pickup_datetime'].dt.year,
    trip_distance_km=df['trip_distance'] * 1.60934,
)

@measure_time_and_memory("GROUP_BY_MONTH_AND_YEAR", CLUSTER_TYPE, INPUT_SIZE, csv_saver)
def group_by_month_and_year(df):
    """Count the rows in each (month, year) group and compute the result."""
    rows_per_group = df.groupby(['month', 'year']).size()
    return rows_per_group.compute()

grouped_one = group_by_month_and_year(df)

[GROUP_BY_MONTH_AND_YEAR] Time: 2.5499s | RAM usage: 0.26


In [54]:
@measure_time_and_memory("MEAN_TRIP_DURATION", CLUSTER_TYPE, INPUT_SIZE, csv_saver)
def get_mean_trip_duration(df):
    """Return the mean trip duration in minutes.

    The duration is the drop-off timestamp minus the pickup timestamp,
    converted from seconds to minutes.

    The per-trip series is kept local instead of being written back as a
    ``trip_duration`` column, so timing this function no longer modifies the
    frame the caller passed in.
    """
    elapsed = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
    duration_minutes = elapsed.dt.total_seconds() / 60
    return duration_minutes.mean().compute()

mean = get_mean_trip_duration(df)

[MEAN_TRIP_DURATION] Time: 1.9550s | RAM usage: 0.00


# Extremely small dataset analysis

In [55]:
# Dataset-size label for the measurements in this section.
# NOTE(review): the dataset defined in the next cell has 4 records, not 3 —
# confirm whether this label is intentional (it is written to the results via
# csv_saver, so changing it affects comparisons with earlier runs).
INPUT_SIZE = "3"

In [56]:
# Tiny in-memory table: one list per column, four records each.
data = {
    'Name': ['John', 'Emma', 'Michael', 'Sophia'],
    'Age': [28, 32, 25, 30],
    'Department': ['Sales', 'Marketing', 'Finance', 'HR'],
}

In [57]:
# Build a Dask DataFrame from the in-memory dict, split into two partitions.
# This rebinds `df`, replacing the taxi frame from the previous section.
df = dd.from_dict(data, npartitions=2)

In [58]:
@measure_time_and_memory("MEAN AND FILTER", CLUSTER_TYPE, INPUT_SIZE, csv_saver)
def calculate_mean_and_filter(df, min_age=27):
    """Compute the mean of ``Age`` and the rows whose ``Age`` exceeds ``min_age``.

    Parameters
    ----------
    df : Dask DataFrame with an ``Age`` column.
    min_age : exclusive lower bound for the filter; defaults to 27, the value
        that was previously hard-coded.

    Returns
    -------
    tuple
        ``(average_age, filtered_df)`` — both already computed. Earlier the two
        results were computed and then dropped, so the function returned None.
    """
    average_age = df['Age'].mean().compute()
    filtered_df = df[df['Age'] > min_age].compute()
    return average_age, filtered_df

# Bind the results so they can be inspected in later cells; assigning also
# keeps this cell from printing the returned tuple.
average_age, filtered_df = calculate_mean_and_filter(df)

[MEAN AND FILTER] Time: 0.5840s | RAM usage: 0.00


# Close SLURM cluster

In [62]:
# Hand the handle obtained from get_slurm_cluster() back for shutdown.
# NOTE(review): presumably this releases the SLURM jobs held by the cluster —
# confirm in cluster_config.slurm_cluster. Run this cell last.
close_slurm_cluster(client)