# PIP INSTALL DEPENDENCIES - ONLY RUN THE FIRST TIME IN FASRC

In [None]:
!pip install pyarrow

In [None]:
!pip install git+https://github.com/gjoseph92/scheduler-profilers.git


In [None]:
!pip install git+https://github.com/mlkimmins/distributed.git@explore-worker-queue-2

In [None]:
!pip install -r requirements.txt

In [None]:
!pip install py-spy 

# MUST RESTART THE KERNEL FROM HERE

Step 1: download and convert dataset from CSV to Parquet

In [1]:
from scheduler_profilers import pyspy_on_scheduler, viztrace_scheduler

In [2]:
from dask.distributed import Client, LocalCluster
import dask.distributed

In [3]:
# Record which dask.distributed build is installed.
print(f'dask distributed version: {dask.distributed.__version__}')

dask distributed version: 2022.3.0+44.gcc96a43d


In [4]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

In [5]:
import dask.dataframe as dd
import pandas as pd
import numpy as np
from dask.distributed import Client, LocalCluster
import dask
from dask.distributed import get_task_stream

In [6]:
import time

In [7]:
# Record the library versions used for this run.
print(f'pandas version: {pd.__version__}')
print(f'numpy version: {np.__version__}')
print(f'dask version: {dask.__version__}')

pandas version: 1.3.2
numpy version: 1.19.5
dask version: 2022.04.0


In [8]:
# Record which dask.distributed build is installed.
print(f'dask distributed version: {dask.distributed.__version__}')

dask distributed version: 2022.3.0+44.gcc96a43d


Reference (Databricks Koalas benchmark, distributed execution): https://docs.databricks.com/_static/notebooks/koalas-benchmark-distributed-execution.html?_ga=2.216403934.95291449.1648935555-599276868.1645477063

In [None]:
# Text file that lists the dataset entries, one per line (it is read line by line in the next cell).
filename = "taxi_dataset.txt"

In [None]:
# Read the listing file: one entry per line, trailing whitespace removed.
with open(filename) as file:
    csv_files = [raw_line.rstrip() for raw_line in file]
# Keep only the entries that mention yellow taxis.
yellow = [entry for entry in csv_files if "yellow" in entry]

In [None]:
# make the list small for now: this replaces whatever was loaded into `yellow`
# with a single month (2013-11) so the conversion step stays quick
yellow = ['https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2013-11.csv']

In [None]:
# Display the list of CSV locations that the conversion loop iterates over.
yellow

In [None]:
import re

# Convert every CSV in `yellow` into its own Parquet dataset under ./tmp/.
for csv_url in yellow:
    # Get the file name only (last path segment, without the .csv suffix) so the
    # output directories sort by year / month. The name must be a plain string:
    # re.findall() returns a list, which previously ended up verbatim in the
    # output path as "trip_data_['...csv']".
    csv_name = re.sub(r"\.csv$", "", csv_url.rsplit("/", 1)[-1])
    # tolls_amount is forced to float64 instead of relying on dtype inference.
    df = dd.read_csv(csv_url, dtype={'tolls_amount': 'float64'})

    df = df.repartition(npartitions=4)
    df.to_parquet(f'./tmp/trip_data_{csv_name}', write_index=False)


Read in parquet to dask

In [9]:
from dask.distributed import get_task_stream

# Create the dask.distributed client used by the rest of the notebook.
# No address is given, so this relies on Client()'s default behaviour.
client = Client()

# set up testbench

In [None]:
def benchmark(f, df, benchmarks, task_name, **kwargs):
    """Time one benchmark task and record its task-stream history.

    Parameters
    ----------
    f: function to benchmark; it is called as ``f(df, **kwargs)``
    df: data frame handed to ``f``
    benchmarks: container for benchmark results; ``benchmarks[task_name]`` is
        set to ``{'raw_duration': <seconds>, 'history': <task-stream records>}``
    task_name: key under which the results are stored and reported
    **kwargs: extra keyword arguments forwarded unchanged to ``f``

    Returns
    -------
    Duration (in seconds) of the given operation

    Notes
    -----
    The task-stream plot is requested with the fixed file name
    "task-stream.html", so successive calls target the same file.
    """
    with get_task_stream(plot='save', filename="task-stream.html") as ts:
        # perf_counter() is monotonic, so the measured interval cannot be
        # skewed by system clock adjustments the way time.time() can.
        start_time = time.perf_counter()
        f(df, **kwargs)  # the task's result is discarded; only timing/history are kept
        raw_duration = time.perf_counter() - start_time
    # Read the collected records once the context manager has exited, which is
    # how the dask documentation example consumes `ts.data`.
    benchmarks[task_name] = {'raw_duration': raw_duration, 'history': ts.data}
    print(f"{task_name} took: {raw_duration} seconds")
    return raw_duration

In [10]:
import collections
# Results container handed to benchmark(); keyed by task name.
dask_benchmarks = collections.defaultdict(dict)
# benchmarks = {"task1" : {"stat1": val, "stat2": val}}
# benchmark() stores 'raw_duration' and 'history' under each task name.

# Define benchmark tasks

In [None]:
# Registry of benchmark task functions; each task cell appends to this list.
all_tasks = []

In [None]:
# sum, then means = simple mapreduce
def read_to_basic_ETL(df = None):
    """Read the Parquet data under ./tmp/ and return the mean of fare + tip.

    The ``df`` argument is accepted but not used: the data is always re-read
    from disk inside the function.
    """
    trips = dd.read_parquet("./tmp/", storage_options={"anon": True, 'use_ssl': True})
    fare_plus_tip = trips.fare_amount + trips.tip_amount
    return fare_plus_tip.mean().compute()
all_tasks.append(read_to_basic_ETL)

In [None]:
# counts of values seen = simple map, groupby, reduce
def count_values(df):
    """Return the computed count of each distinct ``fare_amount`` value in ``df``."""
    fare_counts = df.fare_amount.value_counts()
    return fare_counts.compute()
all_tasks.append(count_values)

In [None]:
# cpu heavy arithmetic : mapreduce
def complicated_arithmetic_operation(df):
    """Evaluate a trigonometry-heavy expression over pickup/dropoff coordinates.

    Reads the pickup/dropoff longitude and latitude columns of ``df``, converts
    degrees to radians inline, and returns the computed result.

    NOTE(review): the expression resembles the haversine central angle, but with
    the roles of latitude and longitude swapped relative to the textbook form.
    It is kept exactly as written since it serves as a CPU-load benchmark —
    confirm whether the swap is intentional.
    """
    lon_start, lat_start = df.pickup_longitude, df.pickup_latitude
    lon_end, lat_end = df.dropoff_longitude, df.dropoff_latitude

    sin_half_dlon = np.sin((lon_end-lon_start)/2*np.pi/180)
    cos_product = np.cos(lon_start*np.pi/180)*np.cos(lon_end*np.pi/180)
    sin_half_dlat = np.sin((lat_end-lat_start)/2*np.pi/180)

    temp = sin_half_dlon**2 + cos_product * sin_half_dlat**2
    angle = 2 * np.arctan2(np.sqrt(temp), np.sqrt(1-temp))
    return angle.compute()
all_tasks.append(complicated_arithmetic_operation)

In [None]:
def groupby_statistics(df):
    """Return mean and standard deviation of total_amount and tip_amount per passenger_count."""
    return df.groupby(by='passenger_count').agg(
      {
        'total_amount': ['mean', 'std'],
        'tip_amount': ['mean', 'std']
      }
    ).compute()
# Register this task itself. The cell used to append
# complicated_arithmetic_operation again, so the groupby was never benchmarked.
all_tasks.append(groupby_statistics)

In [None]:
# join two datasets
def join_data(df, other=None):
    """Merge ``df`` with ``other`` on their indexes and return the computed result.

    Parameters
    ----------
    df: left-hand data frame
    other: right-hand data frame. When omitted, a notebook-level variable named
        ``other`` is used if one exists (the previous implicit behaviour).

    Raises
    ------
    ValueError: if no right-hand data frame is available.
    """
    if other is None:
        # Backward compatibility: the function used to read a global `other`.
        other = globals().get("other")
    if other is None:
        raise ValueError(
            "join_data needs a right-hand dataset: pass other=<DataFrame>, e.g. "
            "benchmark(join_data, df, dask_benchmarks, 'join_data', other=...)"
        )
    return dd.merge(df, other, left_index=True, right_index=True).compute()
# Deliberately not appended to all_tasks: the generic benchmark loop calls each
# task with `df` only, and this task cannot run without a right-hand dataset.
# (The cell used to append complicated_arithmetic_operation a third time instead.)

# run the tasks

In [11]:
# Open everything under ./tmp/ as one dask DataFrame; the benchmark tasks run against it.
# NOTE(review): "anon"/"use_ssl" look like remote-storage options although the path is local — confirm they are needed.
df = dd.read_parquet(
    "./tmp/", 
    storage_options={"anon": True, 'use_ssl': True})

In [12]:
# for task in all_tasks:
#     benchmark(task, df=df, benchmarks = dask_benchmarks, task_name = task.__name__)

In [13]:
# Profile the scheduler while the value_counts job runs.
# A plain `with` is used: the parenthesised with-items form needs the newer
# Python parser (3.9+), and only one context manager is active here.
# Optional second manager (currently disabled):
#     pyspy_on_scheduler("pyspy2.json")
#     ^ Saves a speedscope profile to `pyspy.json` locally
with viztrace_scheduler(
    "viztracer.json", trace_sparse="distributed.Scheduler.update_graph_hlg"
):
    # ^ Saves a Chrome trace to `viztracer.json` locally
    df.fare_amount.value_counts().compute()

Saving report to /tmp/tmp4w3mw6x2viztracer..json ...Loading finish                                        

Dumping trace data to json, total entries: 1211, estimated json file size: 141.9KiB
Report saved.


In [None]:
import sys
# Report which interpreter the kernel is running on.
for interpreter_detail in (sys.executable, sys.version, sys.version_info):
    print(interpreter_detail)

In [None]:
# Run the fare_amount value_counts job directly, without any profiler attached.
df.fare_amount.value_counts().compute()

In [None]:
# Run every registered task through the testbench; the cell output is the list of durations.
[
    benchmark(task, df=df, benchmarks=dask_benchmarks, task_name=task.__name__)
    for task in all_tasks
]

# history grokking

In [None]:
# add the analyzed dataframes
# For each benchmarked task, summarise its task-stream history into two tables
# and store them alongside the raw history in dask_benchmarks.
for task_name, output_values in dask_benchmarks.items():
    dask_hx = output_values.get("history")
    hx_df = pd.DataFrame(
        dask_hx,
        columns=[
            'worker', 'status', 'nbytes', 'thread', 'type',
            'typename', 'metadata', 'startstops', 'key',
        ],
    )
    hx_ddf = dd.from_pandas(hx_df, npartitions=1)

    # One row per startstops entry; each entry is indexed by 'action', 'start' and 'stop'.
    exploded_df = hx_ddf.explode("startstops")
    exploded_df['action'] = exploded_df['startstops'].apply(
        lambda x: x['action'], meta=("action", str)
    )
    exploded_df['start'] = exploded_df['startstops'].apply(
        lambda x: x['start'], meta=("start", np.float64)
    )
    exploded_df['end'] = exploded_df['startstops'].apply(
        lambda x: x['stop'], meta=("stop", np.float64)
    )
    exploded_df['action_duration'] = exploded_df['end'] - exploded_df['start']

    # Total duration per (worker, action) pair.
    exploded_df_only_agg_fields = exploded_df[['worker', 'action', 'action_duration']]
    time_per_worker_and_action = (
        exploded_df_only_agg_fields
        .groupby(['worker', 'action'])
        .agg("sum")
    )

    # Total nbytes per worker, taken from the un-exploded history.
    nbytes_per_worker = hx_ddf[['worker', 'nbytes']].groupby(["worker"]).agg("sum")

    output_values["time_per_worker_and_action"] = time_per_worker_and_action.compute()
    output_values["nbytes_per_worker"] = nbytes_per_worker.compute()

In [None]:
# access the analyzed dataframes like so:
# A cell only renders its last bare expression, so both tables are passed to
# display() (available without an import inside an IPython kernel).
display(
    dask_benchmarks['read_to_basic_ETL']["time_per_worker_and_action"],
    dask_benchmarks['read_to_basic_ETL']["nbytes_per_worker"],
)

# try to do something fancy with the history

In [None]:
# Raw task-stream history recorded for the read_to_basic_ETL benchmark.
# The outer .get() returns None when that task has no entry, in which case the chained .get() fails.
hx = dask_benchmarks.get("read_to_basic_ETL").get("history")

Use dask itself to analyze dask's task-stream history :D

In [None]:
import pandas as pd
# Remove the column-width limit so long cell values (e.g. the nested startstops) are shown in full.
pd.set_option('display.max_colwidth', None)

In [None]:
# Tabulate the raw history records, one column per listed field.
hx_df = pd.DataFrame(
    hx,
    columns=[
        'worker', 'status', 'nbytes', 'thread', 'type',
        'typename', 'metadata', 'startstops', 'key',
    ],
)

In [None]:
# Wrap the pandas frame as a single-partition dask DataFrame.
hx_ddf = dd.from_pandas(hx_df, npartitions=1)

In [None]:
# The startstops column is nested. It only needs un-nesting for the per-action
# analysis; everything else uses the nested frame as it is.
hx_ddf.head(20)

In [None]:
# The startstops column is nested; un-nest it (for the per-action analysis only)
# by exploding on that column.
exploded_df = hx_ddf.explode("startstops")

In [None]:
# Pull 'action', 'start' and 'stop' out of each startstops entry into their own columns.
# Every apply() is given an explicit `meta` (name, dtype) — this was the hard-won part.
exploded_df['action'] = exploded_df['startstops'].apply(lambda x: x['action'], meta = ("action", str))
exploded_df['start'] = exploded_df['startstops'].apply(lambda x: x['start'], meta = ("start", np.float64))
exploded_df['end'] = exploded_df['startstops'].apply(lambda x: x['stop'], meta = ("stop", np.float64))
# Length of each action: end minus start.
exploded_df['action_duration'] = exploded_df['end'] - exploded_df['start']

In [None]:
# Keep only the columns needed for the per-worker / per-action aggregation.
exploded_df_only_agg_fields = exploded_df[['worker', 'action', 'action_duration']]

In [None]:
# Sum action_duration for every (worker, action) pair.
time_per_worker_and_action = exploded_df_only_agg_fields.groupby(['worker','action']).agg("sum")

This is the final result for `time_per_worker_and_action`.

In [None]:
# Preview up to 20 rows of the per-worker, per-action totals.
time_per_worker_and_action.head(20)

In [None]:
# now get nbytes per worker

In [None]:
# Sum nbytes per worker, using the un-exploded history frame.
nbytes_per_worker = hx_ddf[['worker', 'nbytes']].groupby(["worker"]).agg("sum")

In [None]:
# Preview up to 20 rows of the per-worker byte totals.
nbytes_per_worker.head(20)