Step 1: download and convert dataset from CSV to Parquet

In [1]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

In [2]:
import dask.dataframe as dd
import pandas as pd
import numpy as np
from dask.distributed import Client, LocalCluster
import dask
from dask.distributed import get_task_stream

In [3]:
import time

In [4]:
# Report the versions of the core libraries this notebook was run with.
version_lines = (
    ('pandas version: %s', pd.__version__),
    ('numpy version: %s', np.__version__),
    ('dask version: %s', dask.__version__),
)
for version_template, version_value in version_lines:
    print(version_template % version_value)

pandas version: 1.3.4
numpy version: 1.20.3
dask version: 2022.04.2


Reference (Databricks Koalas distributed-execution benchmark notebook): https://docs.databricks.com/_static/notebooks/koalas-benchmark-distributed-execution.html?_ga=2.216403934.95291449.1648935555-599276868.1645477063

In [5]:
# Text file that lists the dataset CSV URLs, one per line (it is read line by
# line in the next cell).
filename = "taxi_dataset.txt"

In [6]:
# Read every listed entry, stripping trailing whitespace/newlines from each line.
with open(filename) as file:
    csv_files = list(map(str.rstrip, file))
# only choose yellow taxis: keep the entries whose text contains "yellow"
yellow = [entry for entry in csv_files if "yellow" in entry]

In [7]:
# make the list small for now: this replaces the filtered list from the
# previous cell with a single file (November 2013), so only that one CSV is
# converted below.
yellow = ['https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2013-11.csv']

In [8]:
yellow

['https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2013-11.csv']

In [9]:
import re
# NOTE: `pattern` is a JavaScript-style regex literal kept for reference only;
# nothing in this cell uses it. It is written as a raw string so that `\/` and
# `\.` are not parsed as (invalid) Python escape sequences; the resulting
# string value is identical to the previous non-raw literal.
pattern = r'/[^\/]*\.csv$/gm'

for csv_url in yellow:
#     get the filename only so I can sort by month / year
    # NOTE(review): re.findall returns a *list* of matches, so the f-string
    # below embeds the list's repr and the output directory is literally named
    # "./tmp/trip_data_['<file>.csv']". A later cell reads that exact path, so
    # the name is deliberately left unchanged here; switch to csv_name[0] only
    # together with that reader.
    csv_name = re.findall(r"[^\/]*\.csv$",csv_url)
    # tolls_amount is pinned to float64 — presumably to avoid a dtype-inference
    # mismatch while reading the CSV; confirm against the data.
    df = dd.read_csv(csv_url,dtype={'tolls_amount': 'float64'})

    # Re-split into 4 partitions before writing the Parquet dataset (index not written).
    df = df.repartition(npartitions=4)
    df.to_parquet(f'./tmp/trip_data_{csv_name}', write_index=False)


  df = pandas_read_text(
  df = pandas_read_text(
  df = pandas_read_text(


Read in parquet to dask

In [5]:
from dask.distributed import get_task_stream

# Create a distributed Client with default arguments (no scheduler address is
# given). The benchmark helper defined below records task history through
# get_task_stream, so this must run before any benchmark.
client = Client()

2022-05-10 11:38:02,894 - distributed.diskutils - INFO - Found stale lock file and directory '/Users/mk/projects/dask_rsds_scheduler_research/dask-worker-space/worker-ri7ijjkq', purging
2022-05-10 11:38:02,895 - distributed.diskutils - INFO - Found stale lock file and directory '/Users/mk/projects/dask_rsds_scheduler_research/dask-worker-space/worker-482xi5pb', purging


# set up testbench

In [6]:
def benchmark(f, df, benchmarks, task_name, **kwargs):
    """Time one task and record its duration and Dask task-stream history.

    Parameters
    ----------
    f : callable
        Task to benchmark. It is called as ``f(df, **kwargs)``; its return
        value is discarded.
    df : data frame
        Passed to ``f`` as its first positional argument.
    benchmarks : dict-like
        Container for benchmark results. ``benchmarks[task_name]`` is replaced
        by ``{'raw_duration': <seconds>, 'history': <task-stream records>}``.
    task_name : str
        Key under which the results are stored; also used in the printed
        summary line.
    **kwargs
        Extra keyword arguments forwarded unchanged to ``f``.

    Returns
    -------
    float
        Duration (in seconds) of the given operation.

    Notes
    -----
    The task-stream plot is always saved to ``task-stream.html``, so every
    call overwrites the plot written by the previous call.
    """
    with get_task_stream(plot='save', filename="task-stream.html") as ts:
        # perf_counter is monotonic and high-resolution, so the measured
        # interval cannot be skewed by wall-clock adjustments mid-run.
        start_time = time.perf_counter()
        f(df, **kwargs)
        # Stop the clock inside the block so saving the plot is not timed.
        raw_duration = time.perf_counter() - start_time
        # Keep the list object itself rather than a copy.
        # NOTE(review): the task stream appears to fill this list in when the
        # context exits, so a copy taken here could miss records — confirm.
        history = ts.data
    benchmarks[task_name] = {'raw_duration': raw_duration, 'history': history}
    print(f"{task_name} took: {raw_duration} seconds")
    return raw_duration

In [7]:
import collections
# Results registry keyed by task name. defaultdict(dict) hands back an empty
# dict for a task name that has not been recorded yet instead of raising KeyError.
dask_benchmarks = collections.defaultdict(dict)
# benchmarks = {"task1" : {"stat1": val, "stat2": val}}

# Define benchmark tasks

In [8]:
# Ordered list of benchmark task functions; each task cell below appends its
# function here, and the run cell iterates over it.
all_tasks = []

In [9]:
# sum, then means = simple mapreduce
def read_to_basic_ETL(df = None):
    """Read the Parquet data under ``./tmp/`` and return mean(fare_amount + tip_amount).

    ``df`` is accepted so the function can be called like the other benchmark
    tasks, but it is ignored: the dataset is re-read inside the function
    (presumably so that the read is part of what gets timed).
    """
    parquet_options = {"anon": True, 'use_ssl': True}
    trips = dd.read_parquet("./tmp/", storage_options=parquet_options)
    fare_plus_tip = trips.fare_amount + trips.tip_amount
    return fare_plus_tip.mean().compute()
all_tasks.append(read_to_basic_ETL)

In [10]:
# counts of values seen = simple map, groupby, reduce
def count_values(df):
    """Return how often each distinct ``fare_amount`` value occurs in ``df``."""
    fare_counts = df.fare_amount.value_counts()
    return fare_counts.compute()
all_tasks.append(count_values)

In [11]:
# cpu heavy arithmetic : mapreduce
def complicated_arithmetic_operation(df):
    """Evaluate a trigonometry-heavy expression over the pickup/dropoff coordinates.

    Reads the four coordinate columns of ``df`` (treated as degrees and
    converted with pi/180) and returns the computed result of
    ``2 * arctan2(sqrt(t), sqrt(1 - t))``.

    NOTE(review): the expression has the shape of a haversine central angle,
    but with longitude and latitude in swapped roles compared with the
    textbook form. That does not matter for a CPU-load benchmark; confirm
    before reusing the values as distances.
    """
    theta_1, phi_1 = df.pickup_longitude, df.pickup_latitude
    theta_2, phi_2 = df.dropoff_longitude, df.dropoff_latitude
    # The three factors are kept as separate names; the arithmetic order is the
    # same as in a single combined expression.
    theta_term = np.sin((theta_2-theta_1)/2*np.pi/180)**2
    cos_product = np.cos(theta_1*np.pi/180)*np.cos(theta_2*np.pi/180)
    phi_term = np.sin((phi_2-phi_1)/2*np.pi/180)**2
    temp = theta_term + cos_product * phi_term
    angle = 2 * np.arctan2(np.sqrt(temp), np.sqrt(1-temp))
    return angle.compute()
all_tasks.append(complicated_arithmetic_operation)

In [12]:
def groupby_statistics(df):
    """Return the mean and std of ``total_amount`` and ``tip_amount`` per ``passenger_count``."""
    stats_per_column = {
        'total_amount': ['mean', 'std'],
        'tip_amount': ['mean', 'std'],
    }
    grouped = df.groupby(by='passenger_count')
    return grouped.agg(stats_per_column).compute()
all_tasks.append(groupby_statistics)

In [13]:
# join two datasets
def join_data(df):
    """Merge ``df`` with itself on the index (both sides) and return the computed result."""
    self_joined = dd.merge(df, df, left_index=True, right_index=True)
    return self_joined.compute()
all_tasks.append(join_data)

In [14]:
all_tasks

[<function __main__.read_to_basic_ETL(df=None)>,
 <function __main__.count_values(df)>,
 <function __main__.complicated_arithmetic_operation(df)>,
 <function __main__.groupby_statistics(df)>,
 <function __main__.join_data(df)>]

# run the tasks

In [15]:
# Open the Parquet data under ./tmp/ as a Dask DataFrame; this frame is the
# `df` argument handed to every benchmark task in the run cell below.
# NOTE(review): "anon"/"use_ssl" look like S3-style storage options although
# the path is local — confirm they are needed.
df = dd.read_parquet(
    "./tmp/", 
    storage_options={"anon": True, 'use_ssl': True})

In [16]:
# for task in all_tasks:
#     benchmark(task, df=df, benchmarks = dask_benchmarks, task_name = task.__name__)

In [17]:
# Run every registered task in order. benchmark() prints each duration and
# stores the results in dask_benchmarks under the task function's name; the
# list of returned durations is the cell's displayed value.
[
    benchmark(task, df=df, benchmarks=dask_benchmarks, task_name=task.__name__)
    for task in all_tasks
]

read_to_basic_ETL took: 6.0014448165893555 seconds
count_values took: 3.2534680366516113 seconds
complicated_arithmetic_operation took: 65.53360509872437 seconds
groupby_statistics took: 310.51419281959534 seconds


2022-05-10 11:21:53,154 - distributed.spill - ERROR - Spill to disk failed; keeping data in memory
Traceback (most recent call last):
  File "/Users/mk/opt/anaconda3/lib/python3.9/site-packages/distributed/spill.py", line 115, in handle_errors
    yield
  File "/Users/mk/opt/anaconda3/lib/python3.9/site-packages/distributed/spill.py", line 189, in __setitem__
    super().__setitem__(key, value)
  File "/Users/mk/opt/anaconda3/lib/python3.9/site-packages/zict/buffer.py", line 87, in __setitem__
    self.fast[key] = value
  File "/Users/mk/opt/anaconda3/lib/python3.9/site-packages/zict/lru.py", line 70, in __setitem__
    self.evict()
  File "/Users/mk/opt/anaconda3/lib/python3.9/site-packages/zict/lru.py", line 89, in evict
    cb(k, v)
  File "/Users/mk/opt/anaconda3/lib/python3.9/site-packages/zict/buffer.py", line 60, in fast_to_slow
    self.slow[key] = value
  File "/Users/mk/opt/anaconda3/lib/python3.9/site-packages/distributed/spill.py", line 312, in __setitem__
    self.d[key] =

AssertionError: 

2022-05-10 11:22:28,812 - distributed.worker - ERROR - Exception during execution of task ('group-shuffle-0-6a69a571b550c1fb52466e67cabd61ce', (9, 6)).
Traceback (most recent call last):
  File "/Users/mk/opt/anaconda3/lib/python3.9/site-packages/distributed/worker.py", line 3422, in execute
    args2, kwargs2 = self._prepare_args_for_execution(ts, args, kwargs)
  File "/Users/mk/opt/anaconda3/lib/python3.9/site-packages/distributed/worker.py", line 3549, in _prepare_args_for_execution
    data[k] = self.data[k]
  File "/Users/mk/opt/anaconda3/lib/python3.9/site-packages/zict/buffer.py", line 78, in __getitem__
    return self.slow_to_fast(key)
  File "/Users/mk/opt/anaconda3/lib/python3.9/site-packages/zict/buffer.py", line 69, in slow_to_fast
    self.fast[key] = value
  File "/Users/mk/opt/anaconda3/lib/python3.9/site-packages/zict/lru.py", line 70, in __setitem__
    self.evict()
  File "/Users/mk/opt/anaconda3/lib/python3.9/site-packages/zict/lru.py", line 89, in evict
    cb(k, v

# history grokking

In [18]:
# add the analyzed dataframes
# For every benchmarked task, turn its raw task-stream history into two summary
# tables and store them next to the raw data in the same per-task dict.

for task_name, output_values in dask_benchmarks.items():
    # Raw task-stream records saved by benchmark() under the "history" key.
    dask_hx = output_values.get("history")
    # One row per task record, restricted to the listed record fields.
    hx_df = pd.DataFrame (dask_hx, columns = ['worker','status','nbytes', 'thread', 'type', 'typename', 'metadata', 'startstops', 'key'])
    hx_ddf = dd.from_pandas(hx_df, npartitions=1)
    # startstops holds a tuple of {'action', 'start', 'stop'} dicts per task
    # (see the sample output further down); explode gives one row per entry.
    exploded_df = hx_ddf.explode("startstops")
    # Pull the dict fields out into flat columns; meta declares the expected
    # name/dtype of each apply() result.
    exploded_df['action'] = exploded_df['startstops'].apply(lambda x: x['action'], meta = ("action", str))
    exploded_df['start'] = exploded_df['startstops'].apply(lambda x: x['start'], meta = ("start", np.float64))
    # The record's 'stop' value is stored in a column named 'end'.
    exploded_df['end'] = exploded_df['startstops'].apply(lambda x: x['stop'], meta = ("stop", np.float64))
    exploded_df['action_duration'] = exploded_df['end'] - exploded_df['start']
    exploded_df_only_agg_fields = exploded_df[['worker', 'action', 'action_duration']]
    # Total seconds spent per (worker, action) pair.
    time_per_worker_and_action = exploded_df_only_agg_fields.groupby(['worker','action']).agg("sum")
    # Summed nbytes per worker, taken from the un-exploded frame so each task
    # record is counted once.
    nbytes_per_worker = hx_ddf[['worker', 'nbytes']].groupby(["worker"]).agg("sum")
    # Materialize both summaries and attach them to this task's results.
    output_values["time_per_worker_and_action"] = time_per_worker_and_action.compute()
    output_values["nbytes_per_worker"] = nbytes_per_worker.compute()

In [19]:
# access the analyzed dataframes like so:
dask_benchmarks['read_to_basic_ETL']["time_per_worker_and_action"]
# dask_benchmarks['read_to_basic_ETL']["nbytes_per_worker"]

Unnamed: 0_level_0,Unnamed: 1_level_0,action_duration
worker,action,Unnamed: 2_level_1
tcp://127.0.0.1:53954,compute,9.168895
tcp://127.0.0.1:53954,deserialize,0.42163
tcp://127.0.0.1:53955,compute,9.665516
tcp://127.0.0.1:53955,deserialize,0.413303
tcp://127.0.0.1:53955,transfer,0.038979
tcp://127.0.0.1:53956,compute,9.552206
tcp://127.0.0.1:53956,deserialize,0.404499
tcp://127.0.0.1:53956,transfer,0.027879
tcp://127.0.0.1:53963,compute,9.758512
tcp://127.0.0.1:53963,deserialize,0.406832


In [20]:
dask_benchmarks['count_values']["time_per_worker_and_action"]

Unnamed: 0_level_0,Unnamed: 1_level_0,action_duration
worker,action,Unnamed: 2_level_1
tcp://127.0.0.1:53954,compute,5.984203
tcp://127.0.0.1:53955,compute,5.917521
tcp://127.0.0.1:53955,transfer,0.011927
tcp://127.0.0.1:53956,compute,6.015985
tcp://127.0.0.1:53963,compute,5.79045
tcp://127.0.0.1:53963,transfer,0.054998


In [21]:
dask_benchmarks['complicated_arithmetic_operation']["time_per_worker_and_action"]

Unnamed: 0_level_0,Unnamed: 1_level_0,action_duration
worker,action,Unnamed: 2_level_1
tcp://127.0.0.1:53954,compute,75.514493
tcp://127.0.0.1:53955,compute,76.29231
tcp://127.0.0.1:53956,compute,74.938793
tcp://127.0.0.1:53963,compute,75.167953


In [22]:
dask_benchmarks['groupby_statistics']["time_per_worker_and_action"]

Unnamed: 0_level_0,Unnamed: 1_level_0,action_duration
worker,action,Unnamed: 2_level_1
tcp://127.0.0.1:53954,compute,616.461249
tcp://127.0.0.1:53954,deserialize,0.241957
tcp://127.0.0.1:53955,compute,614.769063
tcp://127.0.0.1:53955,deserialize,0.053788
tcp://127.0.0.1:53955,transfer,0.08212
tcp://127.0.0.1:53956,compute,615.409129
tcp://127.0.0.1:53956,deserialize,0.095333
tcp://127.0.0.1:53963,compute,619.227722
tcp://127.0.0.1:53963,deserialize,0.039012
tcp://127.0.0.1:53963,transfer,0.005876


# try to do something fancy with the history

In [257]:
# Raw task-stream records captured for the read_to_basic_ETL benchmark run;
# the exploratory cells below work on this one history.
hx = dask_benchmarks.get("read_to_basic_ETL").get("history")

The history saving thread hit an unexpected error (OperationalError('unable to open database file')).History will not be written to the database.


use dask to do dask :D

In [86]:
import pandas as pd
# Do not truncate cell contents when displaying frames, so the long nested
# values (e.g. startstops) are shown in full.
pd.set_option('display.max_colwidth', None)

In [88]:
# One row per task record from the history, restricted to the listed record fields.
hx_df = pd.DataFrame (hx, columns = ['worker','status','nbytes', 'thread', 'type', 'typename', 'metadata', 'startstops', 'key'])

In [97]:
# Wrap the pandas frame as a single-partition Dask DataFrame so the rest of
# the analysis can be written with Dask operations.
hx_ddf = dd.from_pandas(hx_df, npartitions=1)

In [133]:
# the startstops are nested. we need to unnest this for action stuff only. but use nested for all other
hx_ddf.head(20)

Unnamed: 0,worker,status,nbytes,thread,type,typename,metadata,startstops,key
0,tcp://127.0.0.1:58544,OK,27191704,123145529507840,b'\x80\x04\x95!\x00\x00\x00\x00\x00\x00\x00\x8c\x12pandas.core.series\x94\x8c\x06Series\x94\x93\x94.',pandas.core.series.Series,{},"({'action': 'compute', 'start': 1649539629.0290596, 'stop': 1649539629.2514455},)","('add-42c91fcd244541e3b459e8913af2d07a', 0)"
1,tcp://127.0.0.1:58544,OK,32,123145529507840,b'\x80\x04\x95\x15\x00\x00\x00\x00\x00\x00\x00\x8c\x05numpy\x94\x8c\x07float64\x94\x93\x94.',numpy.float64,{},"({'action': 'compute', 'start': 1649539629.256208, 'stop': 1649539629.266967},)","('series-sum-chunk-fbd709dae3d9d13e17866642e9f8e505', 0, 0, 0)"
2,tcp://127.0.0.1:58544,OK,32,123145546297344,b'\x80\x04\x95\x13\x00\x00\x00\x00\x00\x00\x00\x8c\x05numpy\x94\x8c\x05int64\x94\x93\x94.',numpy.int64,{},"({'action': 'compute', 'start': 1649539629.2559612, 'stop': 1649539629.268448},)","('series-count-chunk-33f4ebe1295ecdc32b8301290d7befce', 0, 0, 0)"
3,tcp://127.0.0.1:58545,OK,30486864,123145657192448,b'\x80\x04\x95!\x00\x00\x00\x00\x00\x00\x00\x8c\x12pandas.core.series\x94\x8c\x06Series\x94\x93\x94.',pandas.core.series.Series,{},"({'action': 'compute', 'start': 1649539629.02956, 'stop': 1649539629.2818692},)","('add-42c91fcd244541e3b459e8913af2d07a', 1)"
4,tcp://127.0.0.1:58545,OK,32,123145657192448,b'\x80\x04\x95\x15\x00\x00\x00\x00\x00\x00\x00\x8c\x05numpy\x94\x8c\x07float64\x94\x93\x94.',numpy.float64,{},"({'action': 'compute', 'start': 1649539629.285232, 'stop': 1649539629.2956278},)","('series-sum-chunk-fbd709dae3d9d13e17866642e9f8e505', 0, 1, 0)"
5,tcp://127.0.0.1:58545,OK,32,123145640402944,b'\x80\x04\x95\x13\x00\x00\x00\x00\x00\x00\x00\x8c\x05numpy\x94\x8c\x05int64\x94\x93\x94.',numpy.int64,{},"({'action': 'compute', 'start': 1649539629.2851522, 'stop': 1649539629.3018951},)","('series-count-chunk-33f4ebe1295ecdc32b8301290d7befce', 0, 1, 0)"
6,tcp://127.0.0.1:58542,OK,27417264,123145619705856,b'\x80\x04\x95!\x00\x00\x00\x00\x00\x00\x00\x8c\x12pandas.core.series\x94\x8c\x06Series\x94\x93\x94.',pandas.core.series.Series,{},"({'action': 'compute', 'start': 1649539629.0302684, 'stop': 1649539629.3189504},)","('add-42c91fcd244541e3b459e8913af2d07a', 2)"
7,tcp://127.0.0.1:58542,OK,32,123145619705856,b'\x80\x04\x95\x15\x00\x00\x00\x00\x00\x00\x00\x8c\x05numpy\x94\x8c\x07float64\x94\x93\x94.',numpy.float64,{},"({'action': 'compute', 'start': 1649539629.3196013, 'stop': 1649539629.3293502},)","('series-sum-chunk-fbd709dae3d9d13e17866642e9f8e505', 0, 2, 0)"
8,tcp://127.0.0.1:58542,OK,32,123145602916352,b'\x80\x04\x95\x13\x00\x00\x00\x00\x00\x00\x00\x8c\x05numpy\x94\x8c\x05int64\x94\x93\x94.',numpy.int64,{},"({'action': 'compute', 'start': 1649539629.3195152, 'stop': 1649539629.335808},)","('series-count-chunk-33f4ebe1295ecdc32b8301290d7befce', 0, 2, 0)"
9,tcp://127.0.0.1:58543,OK,30016288,123145528463360,b'\x80\x04\x95!\x00\x00\x00\x00\x00\x00\x00\x8c\x12pandas.core.series\x94\x8c\x06Series\x94\x93\x94.',pandas.core.series.Series,{},"({'action': 'compute', 'start': 1649539629.0295646, 'stop': 1649539629.3822947},)","('add-42c91fcd244541e3b459e8913af2d07a', 3)"


In [188]:
# the startstops are nested. we need to unnest this for action stuff only.
# explode() gives one row per entry of each task's startstops tuple and
# repeats the other columns, so each row then carries a single action dict.
exploded_df = hx_ddf.explode("startstops")

In [226]:
# AAAAGH THIS TOOK FOREVER TO FIGURE OUT O_O
# Each exploded startstops value is a dict with 'action', 'start' and 'stop'
# keys; apply() pulls one field out per row, and meta=(name, dtype) declares
# the expected name/dtype of each result.
exploded_df['action'] = exploded_df['startstops'].apply(lambda x: x['action'], meta = ("action", str))
exploded_df['start'] = exploded_df['startstops'].apply(lambda x: x['start'], meta = ("start", np.float64))
# The record's 'stop' value is stored in a column named 'end'.
exploded_df['end'] = exploded_df['startstops'].apply(lambda x: x['stop'], meta = ("stop", np.float64))
exploded_df['action_duration'] = exploded_df['end'] - exploded_df['start']

In [247]:
# Keep only the grouping keys and the value to be summed.
exploded_df_only_agg_fields = exploded_df[['worker', 'action', 'action_duration']]

In [251]:
# Total action_duration (seconds) for each (worker, action) pair.
time_per_worker_and_action = exploded_df_only_agg_fields.groupby(['worker','action']).agg("sum")

This is the final form of `time_per_worker_and_action`.

In [252]:
time_per_worker_and_action.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,action_duration
worker,action,Unnamed: 2_level_1
tcp://127.0.0.1:58542,compute,0.314724
tcp://127.0.0.1:58543,compute,0.377976
tcp://127.0.0.1:58543,transfer,0.005016
tcp://127.0.0.1:58544,compute,0.245632
tcp://127.0.0.1:58545,compute,0.280231
tcp://127.0.0.1:58545,transfer,0.006547


In [None]:
# now get nbytes per worker

In [255]:
# Summed nbytes per worker, taken from the un-exploded frame so each task
# record is counted once.
nbytes_per_worker = hx_ddf[['worker', 'nbytes']].groupby(["worker"]).agg("sum")

In [256]:
nbytes_per_worker.head(20)

Unnamed: 0_level_0,nbytes
worker,Unnamed: 1_level_1
tcp://127.0.0.1:58542,27417328
tcp://127.0.0.1:58543,30016384
tcp://127.0.0.1:58544,27191768
tcp://127.0.0.1:58545,30486992


In [16]:
# Open the single converted month directly. The directory name contains
# "['...']" because the Step 1 conversion cell formats the re.findall result
# (a list) into the output path.
# NOTE(review): "anon"/"use_ssl" look like S3-style storage options although
# the path is local — confirm they are needed.
dfx = dd.read_parquet(
    "./tmp/trip_data_['yellow_tripdata_2013-11.csv']", 
    storage_options={"anon": True, 'use_ssl': True})

In [17]:
dfx.head(10)

Unnamed: 0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,rate_code,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,surcharge,mta_tax,tip_amount,tolls_amount,total_amount
0,CMT,2013-11-25 15:53:33,2013-11-25 16:00:51,1,0.6,-73.978104,40.752966,1,N,-73.985756,40.762685,CRD,6.0,1.0,0.5,1.0,0.0,8.5
1,CMT,2013-11-25 15:24:41,2013-11-25 15:30:18,1,0.5,-73.982313,40.764827,1,N,-73.982129,40.758889,CRD,5.5,0.0,0.5,3.0,0.0,9.0
2,CMT,2013-11-25 09:43:42,2013-11-25 10:02:57,1,3.3,-73.982013,40.762507,1,N,-74.006854,40.719582,CRD,15.0,0.0,0.5,2.0,0.0,17.5
3,CMT,2013-11-25 06:49:58,2013-11-25 07:04:22,1,3.8,-73.976005,40.744481,1,N,-74.016063,40.717298,CRD,14.0,0.0,0.5,2.9,0.0,17.4
4,CMT,2013-11-25 10:02:12,2013-11-25 10:17:15,1,2.2,-73.952625,40.780962,1,N,-73.98163,40.777978,CRD,12.0,0.0,0.5,2.0,0.0,14.5
5,CMT,2013-11-25 15:18:07,2013-11-25 15:33:25,1,1.0,-73.992423,40.749517,1,N,-73.98816,40.746557,CRD,10.0,0.0,0.5,2.22,0.0,12.72
6,CMT,2013-11-25 21:20:50,2013-11-25 21:26:22,1,1.1,-73.946371,40.775369,1,N,-73.95309,40.785103,CRD,6.5,0.5,0.5,1.5,0.0,9.0
7,CMT,2013-11-25 07:00:55,2013-11-25 07:04:37,1,1.2,-73.983357,40.767193,1,N,-73.978394,40.75558,CRD,5.5,0.0,0.5,1.0,0.0,7.0
8,CMT,2013-11-25 05:34:37,2013-11-25 05:48:15,1,3.6,-73.971555,40.794548,1,N,-73.975399,40.755404,CRD,14.5,0.5,0.5,1.0,0.0,16.5
9,CMT,2013-11-25 08:31:21,2013-11-25 08:55:05,1,5.9,-73.94764,40.830465,1,N,-73.972323,40.76332,CRD,21.0,0.0,0.5,3.0,0.0,24.5
