# https://docs.ray.io/en/latest/tune/tutorials/tune-sklearn.html

In [1]:
import ray

# Keep this here for https://github.com/ray-project/ray/issues/11547
from sklearn.model_selection import GridSearchCV
# Replace above line with:
from ray.tune.sklearn import TuneGridSearchCV

In [2]:
# Other imports
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.datasets import make_classification
import numpy as np
import pandas as pd
import time  # Just to compare fit times

## Create dataset

In [3]:
# Build a synthetic 10-class classification problem.
# Both the generator and the split are seeded so that the dataset and the
# train/test partition are identical on every Restart & Run All; without a
# seed each run would tune on different data and the fit times and scores
# reported further down could not be compared between runs.
X, y = make_classification(
    n_samples=11000,
    n_features=1000,
    n_informative=50,
    n_redundant=0,
    n_classes=10,
    class_sep=2.5,
    random_state=0,
)
# Hold out 1000 of the 11000 samples for testing; the other 10000 feed the searches.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=1000, random_state=0
)

# Example parameters to tune from SGDClassifier
parameter_grid = {"alpha": [1e-4, 1e-1, 1], "epsilon": [0.01, 0.1]}

## Sklearn's Hyperparameter Search (Fixed Grid)

In [4]:
from sklearn.model_selection import GridSearchCV
# Baseline: scikit-learn's grid search over every combination in `parameter_grid`.
# n_jobs=-1 enables use of all cores like Tune does
sklearn_search = GridSearchCV(SGDClassifier(), parameter_grid, n_jobs=-1)

# Measure the fit with time.perf_counter(): it is a monotonic, high-resolution
# clock meant for elapsed-time measurement, whereas time.time() is wall-clock
# time and can jump if the system clock is adjusted during the run.
start = time.perf_counter()
sklearn_search.fit(x_train, y_train)
end = time.perf_counter()
print("Sklearn Fit Time:", end - start)
# Reference timing: 47.48055911064148 s (for an 8 core laptop); expect a
# different figure on other hardware.

Sklearn Fit Time: 73.79787921905518


In [5]:
# Show the scikit-learn search results as a table: one row per parameter
# combination, with fit/score timings, per-split test scores, mean/std test
# score and rank. Left as the last expression so Jupyter renders it richly.
pd.DataFrame(sklearn_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_epsilon,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,39.719502,4.205468,0.013365,0.000488,0.0001,0.01,"{'alpha': 0.0001, 'epsilon': 0.01}",0.838,0.822,0.8375,0.825,0.823,0.8291,0.00713,5
1,38.110991,4.946879,0.016968,0.009211,0.0001,0.1,"{'alpha': 0.0001, 'epsilon': 0.1}",0.8375,0.8215,0.8385,0.8235,0.8205,0.8283,0.007985,6
2,4.935084,0.488759,0.013962,0.001095,0.1,0.01,"{'alpha': 0.1, 'epsilon': 0.01}",0.862,0.8495,0.857,0.8525,0.8495,0.8541,0.00481,3
3,4.959681,0.684633,0.018748,0.008402,0.1,0.1,"{'alpha': 0.1, 'epsilon': 0.1}",0.8645,0.846,0.864,0.853,0.8425,0.854,0.009028,4
4,2.68353,0.210801,0.013364,0.000489,1.0,0.01,"{'alpha': 1, 'epsilon': 0.01}",0.866,0.852,0.8685,0.845,0.856,0.8575,0.008741,1
5,2.81515,0.269548,0.013354,0.000804,1.0,0.1,"{'alpha': 1, 'epsilon': 0.1}",0.864,0.852,0.865,0.8465,0.8545,0.8564,0.007109,2


## Ray's Hyperparameter Search (Fixed Grid)

In [6]:
########################
#### Early Stopping ####
########################

# For models that support partial fit, it splits the data into 10 folds and fits on each fold successively.
# If, after say 5 folds, there is no improvement in the metrics, it terminates that hyperparameter combination run.
# If a model does not support partial fit, it is trained on a single fold as usual.

# For tree-based models, if we specify n_estimators as 100, it divides that by 10.
# So each iteration adds 10 more estimators to the model and tries again.
# Early stopping is done if adding more estimators does not improve the performance.
tune_search = TuneGridSearchCV(SGDClassifier(), parameter_grid, early_stopping=True, max_iters=10)

# Measure the fit with time.perf_counter(): it is a monotonic, high-resolution
# clock meant for elapsed-time measurement, whereas time.time() is wall-clock
# time and can jump if the system clock is adjusted during the run.
start = time.perf_counter()
tune_search.fit(x_train, y_train)
end = time.perf_counter()
print("Tune GridSearch Fit Time:", end - start)
# Reference timing: 15.436315774917603 s (for an 8 core laptop); expect a
# different figure on other hardware.

Log sync requires rsync to be installed.


Tune GridSearch Fit Time: 37.970200538635254


In [7]:
# Show the Tune grid-search results as a table: one row per parameter
# combination with per-split test scores, mean/std test score and rank.
# The `time_total_s` and `training_iteration` columns report the time spent and
# the number of iterations run per combination (max_iters=10 was requested above).
pd.DataFrame(tune_search.cv_results_)

Unnamed: 0,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score,time_total_s,training_iteration,param_alpha,param_epsilon
0,"{'alpha': 0.0001, 'epsilon': 0.01}",0.752,0.732,0.733,0.7315,0.727,0.7351,0.008697,5,1.753353,1,0.0001,0.01
1,"{'alpha': 0.1, 'epsilon': 0.01}",0.865,0.8485,0.862,0.852,0.854,0.8563,0.00621,2,12.25369,10,0.1,0.01
2,"{'alpha': 1, 'epsilon': 0.01}",0.872,0.8595,0.858,0.852,0.8475,0.8578,0.008298,1,12.743038,10,1.0,0.01
3,"{'alpha': 0.0001, 'epsilon': 0.1}",0.74,0.6905,0.7515,0.731,0.711,0.7248,0.021676,6,1.710468,1,0.0001,0.1
4,"{'alpha': 0.1, 'epsilon': 0.1}",0.8405,0.842,0.847,0.844,0.838,0.8423,0.003059,4,1.727421,1,0.1,0.1
5,"{'alpha': 1, 'epsilon': 0.1}",0.8625,0.858,0.859,0.8455,0.8465,0.8543,0.006947,3,5.574522,4,1.0,0.1


## Bayesian Hyperparameter Optimization (Random Grid) using Ray and scikit-optimize

In [8]:
# Prerequisite: run `pip install scikit-optimize` first. search_optimization="bayesian" is backed by scikit-optimize, not by the `bayesian-optimization` package.
from ray.tune.sklearn import TuneSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split
import numpy as np

# Load scikit-learn's bundled digits dataset as features `x` and labels `y`.
digits = datasets.load_digits()
x = digits.data
y = digits.target
# Hold out 20% for testing. The split is seeded so the same samples are held
# out on every run.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

clf = SGDClassifier()
# Unlike the fixed grid used earlier, each tuple here is a (low, high) range
# from which the search samples values, rather than a list of discrete choices.
parameter_grid = {"alpha": (1e-4, 1), "epsilon": (0.01, 0.1)}

tune_search = TuneSearchCV(
    clf,
    parameter_grid,
    search_optimization="bayesian",
    n_trials=3,
    early_stopping=True,
    max_iters=10,
    # Seed the search, matching the regression example later in this notebook,
    # so the sampled candidates do not change from run to run.
    random_state=42,
)
tune_search.fit(x_train, y_train)
print(tune_search.best_params_)
# Example of the output format (exact values depend on seeds and library versions):
# {'alpha': 0.37460266483547777, 'epsilon': 0.09556428757689246}

Exception in thread ray_print_logs:
Traceback (most recent call last):
  File "C:\Users\Nikhil\.conda\envs\ray-test\lib\threading.py", line 932, in _bootstrap_inner
    self.run()
  File "C:\Users\Nikhil\.conda\envs\ray-test\lib\threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "C:\Users\Nikhil\.conda\envs\ray-test\lib\site-packages\ray\worker.py", line 439, in print_logs
    pubsub_client.subscribe(gcs_utils.LOG_FILE_CHANNEL)
  File "C:\Users\Nikhil\.conda\envs\ray-test\lib\site-packages\redis\client.py", line 1527, in subscribe
    ret_val = self.execute_command("SUBSCRIBE", *new_channels.keys())
  File "C:\Users\Nikhil\.conda\envs\ray-test\lib\site-packages\redis\client.py", line 1368, in execute_command
    self.connection = self.connection_pool.get_connection(
  File "C:\Users\Nikhil\.conda\envs\ray-test\lib\site-packages\redis\connection.py", line 1317, in get_connection
    if connection.can_read():
  File "C:\Users\Nikhil\.conda\envs\ray-test\

{'alpha': 0.4951394688015905, 'epsilon': 0.07876526823214007}


In [9]:
# Uncomment the next line to display the TuneSearchCV docstring in Jupyter:
# ?TuneSearchCV

## Hyperparameter Search works for Regression Tasks as well

In [10]:
from sklearn import datasets
from sklearn.linear_model import SGDRegressor

n_samples = 1000

# Synthetic single-feature regression problem with Gaussian noise.
# coef=True makes make_regression also return the generating coefficient;
# `coef` is not read again in the cells shown here.
x, y, coef = datasets.make_regression(
    n_samples=n_samples,
    n_features=1,
    n_informative=1,
    noise=10,
    coef=True,
    random_state=0,
)

# Seed the split as well: data generation and the search were already seeded,
# but an unseeded split still changed the training data on every run.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

# Note: despite the name, `clf` holds a regressor in this cell.
clf = SGDRegressor()
# Each tuple is a (low, high) range from which the search samples values.
parameter_grid = {"alpha": (1e-4, 1), "epsilon": (0.01, 0.1)}

tune_search = TuneSearchCV(
    clf,
    parameter_grid,
    search_optimization="bayesian",
    n_trials=3,
    early_stopping=True,
    max_iters=10,
    random_state=42,
    scoring="r2",  # rank candidates by R^2 instead of the estimator's default scorer
)
tune_search.fit(x_train, y_train)
print(tune_search.best_params_)

Exception in thread ray_listen_error_messages:
Traceback (most recent call last):
  File "C:\Users\Nikhil\.conda\envs\ray-test\lib\threading.py", line 932, in _bootstrap_inner
Exception in thread ray_print_logs:
Traceback (most recent call last):
  File "C:\Users\Nikhil\.conda\envs\ray-test\lib\threading.py", line 932, in _bootstrap_inner
    self.run()
  File "C:\Users\Nikhil\.conda\envs\ray-test\lib\threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "C:\Users\Nikhil\.conda\envs\ray-test\lib\site-packages\ray\worker.py", line 439, in print_logs
    self.run()
  File "C:\Users\Nikhil\.conda\envs\ray-test\lib\threading.py", line 870, in run
        self._target(*self._args, **self._kwargs)
  File "C:\Users\Nikhil\.conda\envs\ray-test\lib\site-packages\ray\worker.py", line 1211, in listen_error_messages_raylet
pubsub_client.subscribe(gcs_utils.LOG_FILE_CHANNEL)
  File "C:\Users\Nikhil\.conda\envs\ray-test\lib\site-packages\redis\client.py", line 1527, in

{'alpha': 0.4593029670766707, 'epsilon': 0.04003377500251198}


In [11]:
# Show the regression search results as a table: one row per sampled
# parameter combination (n_trials=3 above) with per-split test scores,
# mean/std test score and rank. Scores are R^2 because scoring="r2" was requested.
pd.DataFrame(tune_search.cv_results_)

Unnamed: 0,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score,time_total_s,training_iteration,param_alpha,param_epsilon
0,"{'alpha': 0.796563332561547, 'epsilon': 0.0265...",0.794856,0.790276,0.781741,0.778047,0.768422,0.782669,0.009291,3,0.089756,10,0.796563,0.026509
1,"{'alpha': 0.5968904729306924, 'epsilon': 0.050...",0.852038,0.844185,0.842019,0.841013,0.831326,0.842116,0.006639,2,0.06782,10,0.59689,0.050125
2,"{'alpha': 0.4593029670766707, 'epsilon': 0.040...",0.892612,0.885211,0.883034,0.881985,0.871944,0.882957,0.006641,1,0.07081,10,0.459303,0.040034


In [12]:
#######################################
#### Don't forget to shut down Ray ####
#######################################
ray.shutdown()
# Sanity check that the shutdown took effect. A plain truthiness test replaces
# the identity comparison against False, and the message makes a failure
# self-explanatory.
assert not ray.is_initialized(), "Ray is still initialized after ray.shutdown()"