## An example of using Dask to speed up pandas operations on Hyperplane
- The task is to group and sort 3 GB of data stored in an S3 bucket.

In [1]:
import warnings
import os
import sys
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import dask
import dask.dataframe as dd
from dask.distributed import Client
from typing import List, Set, Dict, Tuple, Optional
import types

# Suppress all Python warnings for the rest of the session so cell output stays uncluttered.
warnings.filterwarnings('ignore')

from hyperplane import notebook_common as nc

In [2]:
# Base S3 prefix of the airline dataset; per-year CSVs are read from f"{data_url}/{year}.csv".
data_url = "s3://dask-data/airline-data"

## Use pandas

In [3]:
%%time
import pandas as pd

files = [f"{data_url}/{year}.csv" for year in range (1987, 1990)]
df_list = []
for f in tqdm(files):
    df = pd.read_csv(f, storage_options = {'anon': True},
                usecols = ['DepTime','FlightNum','DepDelay','Origin', 'Dest','Distance'],
                dtype={'Distance': 'float64',
                      'DepTime':'float64',
                      'FlightNum':'int64',
                      'DepDelay':'float64',
                      'Dest':'object',
                      'Origin':'object'}, 
                encoding = "ISO-8859-1")
    df_list.append(df)
df = pd.concat(df_list)
print(f'size of data {df.memory_usage().sum()/1e9 } G')

df_sort = df.groupby('Origin').apply(lambda x : x.nlargest(n = 10, columns = 'Distance'))
df_sort.shape

  0%|          | 0/3 [00:00<?, ?it/s]

size of data 0.647086832 G
CPU times: user 20.1 s, sys: 7.5 s, total: 27.6 s
Wall time: 1min 7s


(2424, 6)

## Use Dask

In [4]:
# Start a Dask cluster through the Hyperplane helper; it hands back both the
# client and the cluster handle used by the cells below.
num_workers = 2

# NOTE(review): nprocs (5) x nthreads (3) equals cores_per_worker (15), which
# presumably is intentional — confirm the parameter semantics against the
# Hyperplane documentation before changing any one of them.
client, cluster = nc.initialize_cluster(
    num_workers=num_workers,
    nprocs=5,
    nthreads=3,
    ram_gb_per_proc=2.4,
    cores_per_worker=15,
)


👉 Hyperplane: selecting worker node pool
👉 Hyperplane: selecting scheduler node pool
Creating scheduler pod on cluster. This may take some time.
👉 Hyperplane: spinning up a dask cluster with a scheduler as a standalone container.
👉 Hyperplane: In a few minutes you'll be able to access the dashboard at https://ds.hyperplane.dev/dask-cluster-961fa205-1399-4261-a7cc-d45ee6253032/status
👉 Hyperplane: to get logs from all workers, do `cluster.get_logs()`


In [5]:
%%time
df = dd.read_csv(files, 
                 storage_options = {'anon': True},
                usecols = ['DepTime','FlightNum','DepDelay','Origin', 'Dest','Distance'],
                dtype={'Distance': 'float64',
                      'DepTime':'float64',
                      'FlightNum':'int64',
                      'DepDelay':'float64',
                      'Dest':'object',
                      'Origin':'object'}, 
                encoding = "ISO-8859-1")

# print(f"number of rows, {df.map_partitions(len).compute().sum()}")
# print(f"total size {df.memory_usage_per_partition().compute().sum()/1e9} G")
# df.head(2)


CPU times: user 118 ms, sys: 34.8 ms, total: 153 ms
Wall time: 865 ms


In [6]:
%%time
# lazy groupby and sorting to get the 10 largest trade per ticker
df_sort = df.groupby('Origin').apply(lambda x : x.nlargest(n = 10, columns = 'Distance'))
df_sort

CPU times: user 21.2 ms, sys: 3.04 ms, total: 24.2 ms
Wall time: 22.7 ms


Unnamed: 0_level_0,DepTime,FlightNum,DepDelay,Origin,Dest,Distance
npartitions=18,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
,float64,int64,float64,object,object,float64
,...,...,...,...,...,...
...,...,...,...,...,...,...
,...,...,...,...,...,...
,...,...,...,...,...,...


In [7]:
%%time
# Trigger the actual computation of the lazy groupby/nlargest result and bring
# it back into the notebook as `df_sort_local`.
df_sort_local = df_sort.compute()
df_sort_local

CPU times: user 242 ms, sys: 17.3 ms, total: 260 ms
Wall time: 6.6 s


Unnamed: 0_level_0,Unnamed: 1_level_0,DepTime,FlightNum,DepDelay,Origin,Dest,Distance
Origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ALB,113602,1730.0,689,0.0,ALB,ATL,852.0
ALB,113603,1730.0,689,0.0,ALB,ATL,852.0
ALB,113604,1730.0,689,0.0,ALB,ATL,852.0
ALB,113605,1743.0,689,13.0,ALB,ATL,852.0
ALB,113606,1730.0,689,0.0,ALB,ATL,852.0
...,...,...,...,...,...,...,...
YAP,235540,1225.0,953,10.0,YAP,GUM,531.0
YAP,235541,950.0,953,-5.0,YAP,GUM,531.0
YAP,235542,1229.0,953,14.0,YAP,GUM,531.0
YAP,235543,1000.0,953,5.0,YAP,GUM,531.0


## Close the cluster when done to release resources
Dask clusters are short-lived and tied to the computation. Hyperplane also automatically garbage-collects the Dask nodes after they have been idle for a few minutes.

In [8]:
# Disconnect the client before tearing down the cluster so the notebook does
# not keep a connection open to a scheduler that is going away.
# NOTE(review): assumes `client` (returned by nc.initialize_cluster) is a
# dask.distributed Client — confirm.
client.close()
cluster.close()