## Train and Hyperparameter optimization with Ray Tune on 250G of data

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import ray; print(f'ray version {ray.__version__}')
import xgboost_ray; print('xgboost_ray', xgboost_ray.__version__)
import xgboost; print('xgboost', xgboost.__version__)
import lightgbm_ray; print('lightgbm_ray', lightgbm_ray.__version__)
import pandas as pd; print('pandas version', pd.__version__)
from lightgbm_ray import RayDMatrix, RayFileType,  RayParams, train
from ray import tune

ray version 1.9.0
xgboost_ray 0.1.5
xgboost 1.5.1
lightgbm_ray 0.1.2
pandas version 1.2.3


In [3]:
from hyperplane.ray_common import initialize_ray_cluster, stop_ray_cluster, find_ray_workers
num_workers = 18
cpu_core_per_worker = 15
ram_gb_per_worker = 110 #110 GB allocatible for 16_128 nodes, 12 for 16_16 nodes, 27 for 32_32 nodes
ray_cluster = initialize_ray_cluster(num_workers, cpu_core_per_worker, ram_gb_per_worker)

👉 Hyperplane: selecting worker node pool


2021-12-29 17:06:42,078	INFO services.py:1338 -- View the Ray dashboard at [1m[32mhttp://10.1.242.3:8787[39m[22m


ray dashboard available at https://shakdemo.hyperplane.dev/ray-stella2/#/
Waiting for worker ray-worker-cf4a3bf0-2d97-4912-a521-8953a7431bb2...
Waiting for worker ray-worker-fb059daa-789e-4f1e-9d82-5873dc6aac10...
Waiting for worker ray-worker-5f702103-d8d9-463d-a75e-30b99a9c7330...
Waiting for worker ray-worker-3e7de624-3c8f-43a1-9587-056daa6ca192...
Waiting for worker ray-worker-c7bfd73e-355c-40d7-9d7a-aa799d8ea19b...
Waiting for worker ray-worker-c9946459-dea6-4e60-8803-de19273b0a8f...
Waiting for worker ray-worker-fd901a26-d5a1-470a-9cba-026ed9ab72b2...
Waiting for worker ray-worker-0540e083-66ec-466b-b7a2-6cbf0e09a7fd...
Waiting for worker ray-worker-11a60c02-f5d6-4bac-8330-ebb6b7706593...
Waiting for worker ray-worker-c8fc444f-ae1a-45fb-8401-7a246bf3c688...
Waiting for worker ray-worker-0a20c182-4823-490d-b186-62a66f66e337...
Waiting for worker ray-worker-2a0e5919-52de-42c9-84ec-b6d736d95c24...
Waiting for worker ray-worker-259568ff-9c76-4f35-9028-c400388aeb5c...
Waiting for work

In [None]:
# wrapper train and HPO function for one liner train, MLFlow logging
# sklearn like api 

In [None]:
def train(config):
    train_set = (RayDMatrix(file_path))
    lightgbm_ray.train(train_set, config)

In [None]:
from ray import tune

config = {}

analysis = tune(train, params=config)



In [None]:
## list of models ray support 
lightgbm_ray
xgboost_ray
sklearn
tensorflow, pytorch, pytorch_lighting (simple model ) 


In [4]:
from google.cloud import storage
import os
storage_client = storage.Client()
bucket = os.environ['HYPERPLANE_GCP_BUCKET']
print(bucket)
blobs = storage_client.list_blobs(bucket, prefix = 'data/crypto2/')  #fake_stock_train
blob_names= [i.name for i in blobs]
print(f'number of files {len(blob_names)}')
file_list = [f"gs://{bucket}/{i}" for i in blob_names if '.parquet' in i]
file_list[:2]

shakdemo-hyperplane
number of files 4168


['gs://shakdemo-hyperplane/data/crypto2/0.parquet',
 'gs://shakdemo-hyperplane/data/crypto2/1.parquet']

In [5]:
## estimate file size

In [6]:
n_files = len(file_list)
# n_files = 4000
df_test = pd.read_parquet(file_list[0], columns = ['order', 'type', 'takerOrMaker', 'amount', 'cost', 'fee','price'])
one_file_size= df_test.memory_usage(deep = True, index = True).sum()/1024/1024
print(f'per file size {one_file_size} MB')
total_file_size = one_file_size * n_files

print(f'total file size {total_file_size/1024.} GB')

per file size 61.416748046875 MB
total file size 249.9853572845459 GB


## train model

In [7]:
## working combo 
## num_of_actors 60 , 10 of 12_16 workers, 500 files, ~ 15G data 
## num_of_actors 112, 18 of 12_128 workers, n_files, ~ 250G data

In [7]:
num_of_actors = 112

In [8]:
config = {
            "objective": "regression_l2",
            "metric": ["rmse"],
        }

ray_params = RayParams(num_actors=num_of_actors, cpus_per_actor=2)

In [9]:
use_cols =  ['order', 'type', 'takerOrMaker', 'amount', 'cost', 'fee','price']

In [10]:
def train_model(config):
    
    train_set = RayDMatrix(
        file_list[:n_files],
        num_actors = num_of_actors,
        distributed = True,
        columns=use_cols, 
        label="price",  # Will select this column as the label
        filetype=RayFileType.PARQUET)
    
    # train_set.assert_enough_shards_for_actors(num_actors=6)

    evals_result = {}
    
    bst = train(
        params=config,
        dtrain = train_set,
        evals_result=evals_result,
        valid_sets=[train_set],
        valid_names=["train"],
        verbose_eval=False,
        ray_params= ray_params,
        verbose = 0
    )


In [11]:
%%time
train_model(config)



[2m[36m(_RemoteRayLightGBMActor pid=86, ip=10.2.25.3)[0m [LightGBM] [Info] Trying to bind port 59889...
[2m[36m(_RemoteRayLightGBMActor pid=86, ip=10.2.25.3)[0m [LightGBM] [Info] Binding port 59889 succeeded
[2m[36m(_RemoteRayLightGBMActor pid=86, ip=10.2.25.3)[0m [LightGBM] [Info] Listening...
[2m[36m(_RemoteRayLightGBMActor pid=83, ip=10.2.23.3)[0m [LightGBM] [Info] Trying to bind port 57525...
[2m[36m(_RemoteRayLightGBMActor pid=83, ip=10.2.23.3)[0m [LightGBM] [Info] Binding port 57525 succeeded
[2m[36m(_RemoteRayLightGBMActor pid=83, ip=10.2.23.3)[0m [LightGBM] [Info] Listening...
[2m[36m(_RemoteRayLightGBMActor pid=85, ip=10.2.23.3)[0m [LightGBM] [Info] Trying to bind port 51759...
[2m[36m(_RemoteRayLightGBMActor pid=85, ip=10.2.23.3)[0m [LightGBM] [Info] Binding port 51759 succeeded
[2m[36m(_RemoteRayLightGBMActor pid=85, ip=10.2.23.3)[0m [LightGBM] [Info] Listening...
[2m[36m(_RemoteRayLightGBMActor pid=86, ip=10.2.23.3)[0m [LightGBM] [Info] Trying 



[2m[36m(_RemoteRayLightGBMActor pid=87, ip=10.2.25.3)[0m [LightGBM] [Info] Trying to bind port 43195...
[2m[36m(_RemoteRayLightGBMActor pid=87, ip=10.2.25.3)[0m [LightGBM] [Info] Binding port 43195 succeeded
[2m[36m(_RemoteRayLightGBMActor pid=87, ip=10.2.25.3)[0m [LightGBM] [Info] Listening...
[2m[36m(_RemoteRayLightGBMActor pid=83, ip=10.2.31.2)[0m [LightGBM] [Info] Trying to bind port 54825...
[2m[36m(_RemoteRayLightGBMActor pid=83, ip=10.2.31.2)[0m [LightGBM] [Info] Binding port 54825 succeeded
[2m[36m(_RemoteRayLightGBMActor pid=83, ip=10.2.31.2)[0m [LightGBM] [Info] Listening...
[2m[36m(_RemoteRayLightGBMActor pid=82, ip=10.2.28.2)[0m [LightGBM] [Info] Trying to bind port 52843...
[2m[36m(_RemoteRayLightGBMActor pid=82, ip=10.2.28.2)[0m [LightGBM] [Info] Binding port 52843 succeeded
[2m[36m(_RemoteRayLightGBMActor pid=82, ip=10.2.28.2)[0m [LightGBM] [Info] Listening...
[2m[36m(_RemoteRayLightGBMActor pid=83, ip=10.2.28.2)[0m [LightGBM] [Info] Trying 



[2m[36m(_RemoteRayLightGBMActor pid=84, ip=10.2.18.3)[0m [LightGBM] [Info] Trying to bind port 33981...
[2m[36m(_RemoteRayLightGBMActor pid=84, ip=10.2.18.3)[0m [LightGBM] [Info] Binding port 33981 succeeded
[2m[36m(_RemoteRayLightGBMActor pid=84, ip=10.2.18.3)[0m [LightGBM] [Info] Listening...
[2m[36m(_RemoteRayLightGBMActor pid=84, ip=10.2.14.3)[0m [LightGBM] [Info] Trying to bind port 44437...
[2m[36m(_RemoteRayLightGBMActor pid=84, ip=10.2.14.3)[0m [LightGBM] [Info] Binding port 44437 succeeded
[2m[36m(_RemoteRayLightGBMActor pid=84, ip=10.2.14.3)[0m [LightGBM] [Info] Listening...
[2m[36m(_RemoteRayLightGBMActor pid=84, ip=10.2.17.3)[0m [LightGBM] [Info] Trying to bind port 38241...
[2m[36m(_RemoteRayLightGBMActor pid=84, ip=10.2.17.3)[0m [LightGBM] [Info] Binding port 38241 succeeded
[2m[36m(_RemoteRayLightGBMActor pid=84, ip=10.2.17.3)[0m [LightGBM] [Info] Listening...
[2m[36m(_RemoteRayLightGBMActor pid=82, ip=10.2.17.3)[0m [LightGBM] [Info] Trying 




[2m[36m(_RemoteRayLightGBMActor pid=83, ip=10.2.21.3)[0m [LightGBM] [Info] Connected to rank 20
[2m[36m(_RemoteRayLightGBMActor pid=83, ip=10.2.21.3)[0m [LightGBM] [Info] Connected to rank 21
[2m[36m(_RemoteRayLightGBMActor pid=83, ip=10.2.21.3)[0m [LightGBM] [Info] Connected to rank 22
[2m[36m(_RemoteRayLightGBMActor pid=83, ip=10.2.21.3)[0m [LightGBM] [Info] Connected to rank 23
[2m[36m(_RemoteRayLightGBMActor pid=83, ip=10.2.21.3)[0m [LightGBM] [Info] Connected to rank 24
[2m[36m(_RemoteRayLightGBMActor pid=83, ip=10.2.21.3)[0m [LightGBM] [Info] Connected to rank 25
[2m[36m(_RemoteRayLightGBMActor pid=83, ip=10.2.21.3)[0m [LightGBM] [Info] Connected to rank 26
[2m[36m(_RemoteRayLightGBMActor pid=83, ip=10.2.21.3)[0m [LightGBM] [Info] Connected to rank 27
[2m[36m(_RemoteRayLightGBMActor pid=83, ip=10.2.21.3)[0m [LightGBM] [Info] Connected to rank 28
[2m[36m(_RemoteRayLightGBMActor pid=83, ip=10.2.21.3)[0m [LightGBM] [Info] Connected to rank 29
[2m[36m




[2m[36m(_RemoteRayLightGBMActor pid=1498)[0m [LightGBM] [Info] Connected to rank 79
[2m[36m(_RemoteRayLightGBMActor pid=1498)[0m [LightGBM] [Info] Connected to rank 80
[2m[36m(_RemoteRayLightGBMActor pid=1498)[0m [LightGBM] [Info] Connected to rank 81
[2m[36m(_RemoteRayLightGBMActor pid=1498)[0m [LightGBM] [Info] Connected to rank 82
[2m[36m(_RemoteRayLightGBMActor pid=1498)[0m [LightGBM] [Info] Connected to rank 83
[2m[36m(_RemoteRayLightGBMActor pid=1498)[0m [LightGBM] [Info] Connected to rank 84
[2m[36m(_RemoteRayLightGBMActor pid=1498)[0m [LightGBM] [Info] Connected to rank 85
[2m[36m(_RemoteRayLightGBMActor pid=1498)[0m [LightGBM] [Info] Connected to rank 86
[2m[36m(_RemoteRayLightGBMActor pid=1498)[0m [LightGBM] [Info] Connected to rank 87
[2m[36m(_RemoteRayLightGBMActor pid=1498)[0m [LightGBM] [Info] Connected to rank 88
[2m[36m(_RemoteRayLightGBMActor pid=1498)[0m [LightGBM] [Info] Connected to rank 89
[2m[36m(_RemoteRayLightGBMActor pid=1498



[2m[36m(_RemoteRayLightGBMActor pid=81, ip=10.2.17.3)[0m [LightGBM] [Info] Connected to rank 107
[2m[36m(_RemoteRayLightGBMActor pid=81, ip=10.2.17.3)[0m [LightGBM] [Info] Connected to rank 108
[2m[36m(_RemoteRayLightGBMActor pid=81, ip=10.2.17.3)[0m [LightGBM] [Info] Connected to rank 109
[2m[36m(_RemoteRayLightGBMActor pid=81, ip=10.2.17.3)[0m [LightGBM] [Info] Connected to rank 110
[2m[36m(_RemoteRayLightGBMActor pid=81, ip=10.2.17.3)[0m [LightGBM] [Info] Connected to rank 111
[2m[36m(_RemoteRayLightGBMActor pid=81, ip=10.2.17.3)[0m [LightGBM] [Info] Local rank: 94, total number of machines: 112
[2m[36m(_RemoteRayLightGBMActor pid=83, ip=10.2.17.3)[0m [LightGBM] [Info] Connected to rank 101
[2m[36m(_RemoteRayLightGBMActor pid=83, ip=10.2.17.3)[0m [LightGBM] [Info] Connected to rank 102
[2m[36m(_RemoteRayLightGBMActor pid=83, ip=10.2.17.3)[0m [LightGBM] [Info] Connected to rank 103
[2m[36m(_RemoteRayLightGBMActor pid=83, ip=10.2.17.3)[0m [LightGBM] [Info

## Train with Hyperplarameter optimization with Ray Tune 

In [13]:
# num_of_actors = 28

In [14]:

# Specify the hyperparameter search space.
config = {
    "objective": "regression_l2",
    "metric": ["rmse"],    
    "eta": tune.loguniform(1e-4, 1e-2),
    # "subsample": tune.uniform(0.8, 1.0),
    "max_depth": tune.randint(7, 9)
}

ray_params = RayParams(num_actors=num_of_actors, cpus_per_actor=2)

def train_model(config):
    
    train_set = RayDMatrix(
        file_list[:n_files],
        num_actors = num_of_actors,
        distributed = True,
        columns=use_cols, 
        label="price",  # Will select this column as the label
        filetype=RayFileType.PARQUET)

    evals_result = {}
    
    bst = train(
        params=config,
        dtrain = train_set,
        evals_result=evals_result,
        valid_sets=[train_set],
        valid_names=["train"],
        verbose_eval=False,
        ray_params= ray_params,
        verbose = 1
    )


In [12]:
%%time


# Make sure to use the `get_tune_resources` method to set the `resources_per_trial`
analysis = tune.run(
    train_model,
    config=config,
    metric="train-rmse",
    mode="min",
    resources_per_trial=ray_params.get_tune_resources(),
    verbose = 1)

print("Best hyperparameters", analysis.best_config)

2021-12-29 17:19:19,226	INFO tune.py:626 -- Total run time: 57.84 seconds (57.10 seconds for the tuning loop).


Best hyperparameters {'objective': 'regression_l2', 'metric': ['rmse']}
CPU times: user 4.31 s, sys: 1.34 s, total: 5.65 s
Wall time: 57.9 s


In [16]:
analysis.results_df

Unnamed: 0_level_0,train-rmse,time_this_iter_s,done,timesteps_total,episodes_total,training_iteration,experiment_id,date,timestamp,time_total_s,pid,hostname,node_ip,time_since_restore,timesteps_since_restore,iterations_since_restore,experiment_tag,config.objective,config.metric
trial_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
528f7_00000,185.497261,2.434112,True,,,10,4fdcaee6abea4fb0b98c60c43596e09d,2021-12-29_17-19-19,1640798359,54.145826,514,ray-worker-70069876-9a77-4703-b565-aa248a37a9d8,10.2.14.3,54.145826,0,10,0,regression_l2,[rmse]


In [16]:
# bst.booster_.save_model("model.lgbm")

In [13]:
stop_ray_cluster(ray_cluster)

Deleting ray-worker-cf4a3bf0-2d97-4912-a521-8953a7431bb2
Deleting ray-worker-fb059daa-789e-4f1e-9d82-5873dc6aac10
Deleting ray-worker-5f702103-d8d9-463d-a75e-30b99a9c7330
Deleting ray-worker-3e7de624-3c8f-43a1-9587-056daa6ca192
Deleting ray-worker-c7bfd73e-355c-40d7-9d7a-aa799d8ea19b
Deleting ray-worker-c9946459-dea6-4e60-8803-de19273b0a8f
Deleting ray-worker-fd901a26-d5a1-470a-9cba-026ed9ab72b2
Deleting ray-worker-0540e083-66ec-466b-b7a2-6cbf0e09a7fd
Deleting ray-worker-11a60c02-f5d6-4bac-8330-ebb6b7706593
Deleting ray-worker-c8fc444f-ae1a-45fb-8401-7a246bf3c688
Deleting ray-worker-0a20c182-4823-490d-b186-62a66f66e337
Deleting ray-worker-2a0e5919-52de-42c9-84ec-b6d736d95c24
Deleting ray-worker-259568ff-9c76-4f35-9028-c400388aeb5c
Deleting ray-worker-70069876-9a77-4703-b565-aa248a37a9d8
Deleting ray-worker-c6f5bd2f-e23d-42aa-9cf0-517bee1133e2
Deleting ray-worker-aab63b9b-0581-4ca9-889c-099ab25ee657
Deleting ray-worker-0165ee76-dd0e-445b-8ff8-6e2d04d81522
Deleting ray-worker-3d59b1ac-92