In [None]:
# !pip install scikit-learn==0.24.2
# !pip install scikit-optimize
# !pip install --user git+https://github.com/scikit-optimize/scikit-optimize.git

In [None]:
# Check that the expected versions of skopt (0.9.dev0) and sklearn are installed.
# Each version string gets its own alias: importing both as `__version__`
# makes the second import silently overwrite the first.
from skopt import __version__ as skopt_version
from sklearn import __version__ as sklearn_version

print(f"skopt: {skopt_version}")
print(f"sklearn: {sklearn_version}")

# SVM HYPER-PARAMETER TUNING

- **C :** float, default=1.0
Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive.
    
- **gamma :** float, default=’scale’
Kernel coefficient for ‘rbf’

- There are more hyper-parameters, but those two are the important ones: 
  - https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html
  - https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html



First, data is loaded, inputs go to X, outputs to y.

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_boston
from scipy.stats import sem

# Boston housing data: the feature matrix goes to X, the target vector to y.
# NOTE: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2;
# this notebook targets the older release pinned in the install cell.
boston = load_boston()
X, y = boston.data, boston.target

## COMBINING HYPER-PARAMETER TUNING AND MODEL EVALUATION

The combination of model evaluation and hyper-parameter tuning can be understood as an external (outer) loop that trains and tests a model, and an internal (inner) loop in which training consists of searching for the best hyper-parameters and then fitting the model with those best hyper-parameters.

First, we are going to use **Holdout** (train/test) for model evaluation (external loop or **outer**), and **3-fold crossvalidation** for hyper-parameter tuning (internal loop or **inner**). Hyper-parameters will be adjusted with **Gridsearch**.

#### GRIDSEARCH

First of all, let's define our own Python function for RMSE.

In [None]:
def rmse(y_test, y_test_pred):
  """Root Mean Squared Error between true targets and predictions.

  Computed with NumPy only, so the function does not depend on
  `sklearn.metrics` having been imported by another cell.

  Parameters
  ----------
  y_test : array-like of shape (n_samples,) or (n_samples, n_outputs)
      Ground-truth target values.
  y_test_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)
      Predicted target values.

  Returns
  -------
  numpy.float64
      Square root of the mean squared difference over all elements.

  Raises
  ------
  ValueError
      If the inputs are empty or their shapes do not match.
  """
  y_true = np.asarray(y_test, dtype=float)
  y_pred = np.asarray(y_test_pred, dtype=float)

  # Treat 1-D inputs as a single output column, so (n,) and (n, 1) are
  # compared element by element instead of being broadcast to (n, n).
  if y_true.ndim == 1:
    y_true = y_true.reshape(-1, 1)
  if y_pred.ndim == 1:
    y_pred = y_pred.reshape(-1, 1)

  if y_true.shape != y_pred.shape:
    raise ValueError(
        f"Inconsistent shapes: y_test {y_true.shape} vs y_test_pred {y_pred.shape}")
  if y_true.size == 0:
    raise ValueError("rmse requires at least one sample")

  return np.sqrt(np.mean((y_true - y_pred) ** 2))

In [None]:
from sklearn.model_selection import train_test_split

# Outer loop (model evaluation): hold out 33% of the available data for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.33,
    random_state=42,
)

First, let's recall the RMSE obtained with the default hyper-parameters.

In [None]:
from sklearn import metrics
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Two-step pipeline: standardize the inputs (SVMs need scaling), then fit the SVR
scaler = StandardScaler()
svr = SVR()
pipe_regr = Pipeline(steps=[('scale', scaler), ('SVM', svr)])

# Baseline: train with default hyper-parameters and evaluate on the test partition
np.random.seed(42)
pipe_regr.fit(X_train, y_train)
print(f"RMSE of SVR with default hyper-pars: {rmse(y_test, pipe_regr.predict(X_test))}")


In [None]:
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.model_selection import KFold


# Candidate values explored exhaustively by grid search (4 x 3 = 12 combinations).
# The 'SVM__' prefix routes each parameter to the 'SVM' step of the pipeline.
param_grid = dict(
    SVM__C=[0.1, 1, 10, 100],
    SVM__gamma=[0.01, 0.1, 1],
)

# Inner loop: 3-fold cross-validation used to score every combination
inner = KFold(n_splits=3, shuffle=True, random_state=42)

# Wrapping the pipeline in GridSearchCV yields an estimator whose fit()
# first tunes the two hyper-parameters and then trains the final model
hpo_regr = GridSearchCV(
    estimator=pipe_regr,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=inner,
    n_jobs=4,
    verbose=1,
)

# After fitting, hpo_regr holds the model with the best hyper-parameters
# found by grid search, trained on the complete X_train
np.random.seed(42)
hpo_regr.fit(X_train, y_train)

Let's visualize:
- The best hyper-parameters and their (inner!) score. 
- The outer evaluation (model evaluation) on the test partition of the model with the best hyper-parameters

In [None]:
# Inner result: best hyper-parameters and their cross-validated score on X_train
print(
    f"Best params: {hpo_regr.best_params_}, "
    f"best score (inner!): {np.sqrt(-hpo_regr.best_score_)}"
)
# Outer result: performance of the tuned model on the held-out test partition
print(
    "RMSE (outer!) of SVR with hyper-parameter tuning (grid-search): "
    f"{rmse(y_test, hpo_regr.predict(X=X_test))}"
)

We observe that the best value of C is 100, which lies on the border of the search space. We may consider extending the search space to see whether the results improve.

In [None]:
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.model_selection import KFold


# Search space with C extended upwards (6 x 3 = 18 combinations)
param_grid = dict(
    SVM__C=[0.1, 1, 10, 100, 1000, 10000],
    SVM__gamma=[0.01, 0.1, 1],
)

# Inner loop: 3-fold cross-validation used to score every combination
inner = KFold(n_splits=3, shuffle=True, random_state=42)

# Self-tuning estimator: fit() searches the grid, then trains the best model
hpo_regr = GridSearchCV(
    estimator=pipe_regr,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=inner,
    n_jobs=4,
    verbose=1,
)

# Run the search on the training partition
np.random.seed(42)
hpo_regr.fit(X_train, y_train)

In [None]:
# Inner result: best hyper-parameters and their cross-validated score on X_train
print(
    f"Best params: {hpo_regr.best_params_}, "
    f"best score (inner!): {np.sqrt(-hpo_regr.best_score_)}"
)
# Outer result: performance of the tuned model on the held-out test partition
print(
    "RMSE (outer!) of SVR with hyper-parameter tuning (grid-search): "
    f"{rmse(y_test, hpo_regr.predict(X=X_test))}"
)

Now, maybe gamma values smaller than 0.01 could be better. Let's extend the space again.

In [None]:
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.model_selection import KFold


# Search space with gamma extended downwards as well (6 x 5 = 30 combinations)
param_grid = dict(
    SVM__C=[0.1, 1, 10, 100, 1000, 10000],
    SVM__gamma=[0.0001, 0.001, 0.01, 0.1, 1],
)

# Inner loop: 3-fold cross-validation used to score every combination
inner = KFold(n_splits=3, shuffle=True, random_state=42)

# Self-tuning estimator: fit() searches the grid, then trains the best model
hpo_regr = GridSearchCV(
    estimator=pipe_regr,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=inner,
    n_jobs=4,
    verbose=1,
)

# Run the search on the training partition
np.random.seed(42)
hpo_regr.fit(X_train, y_train)

In [None]:
# Inner result: best hyper-parameters and their cross-validated score on X_train
print(
    f"Best params: {hpo_regr.best_params_}, "
    f"best score (inner!): {np.sqrt(-hpo_regr.best_score_)}"
)
# Outer result: performance of the tuned model on the held-out test partition
print(
    "RMSE (outer!) of SVR with hyper-parameter tuning (grid-search): "
    f"{rmse(y_test, hpo_regr.predict(X=X_test))}"
)

#### RANDOMIZED SEARCH

Now, let's use **Randomized Search** instead of gridsearch. Only 20 hyper-parameter value combinations will be tried (budget=20)

In [None]:
from sklearn.model_selection import RandomizedSearchCV, KFold
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Preprocessing + model pipeline: SVMs need scaling, so inputs are
# standardized before the SVR is fitted
scaler = StandardScaler()
svr = SVR()

pipe_regr = Pipeline([
    ('scale', scaler),
    ('SVM', svr)])

# Search space: the same 6 x 5 = 30 combinations used in the last grid search
param_grid = {'SVM__C': [0.1, 1, 10, 100, 1000, 10000],
              'SVM__gamma': [0.0001, 0.001, 0.01, 0.1, 1]}

# Inner loop: 3-fold cross-validation used to score each sampled combination
inner = KFold(n_splits=3, shuffle=True, random_state=42)

# Randomized search evaluates only `budget` of the 30 possible combinations.
# The search object is built BEFORE it is fitted, so this cell never refits
# an hpo_regr left over from a previous cell.
budget = 20
hpo_regr = RandomizedSearchCV(pipe_regr,
                              param_grid,
                              scoring='neg_mean_squared_error',
                              cv=inner,
                              n_jobs=4, verbose=1,
                              n_iter=budget,
                              # seed the sampler explicitly instead of relying
                              # only on the global NumPy random state
                              random_state=42
                              )
np.random.seed(42)
hpo_regr.fit(X=X_train, y=y_train)

In [None]:
# Inner result: best hyper-parameters and their cross-validated score on X_train
print(f"Best params: {hpo_regr.best_params_}, best score (inner!): {np.sqrt(-hpo_regr.best_score_)}")
# Outer result: performance of the tuned model on the held-out test partition.
# The label names the search method actually used in this section.
print(f"RMSE (outer!) of SVR with hyper-parameter tuning (randomized search): {rmse(y_test, hpo_regr.predict(X=X_test))}")

We have obtained the same results, but exploring fewer possibilities than with grid-search.

For **Randomized Search**, we can define the search space with statistical distributions, rather than using particular values as we did before. Below you can see how to use a loguniform distribution.

In [None]:
from sklearn.model_selection import RandomizedSearchCV, KFold
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.utils.fixes import loguniform

# Preprocessing + model pipeline: SVMs need scaling, so inputs are
# standardized before the SVR is fitted
scaler = StandardScaler()
svr = SVR()

pipe_regr = Pipeline([
    ('scale', scaler),
    ('SVM', svr)])

# Search space given as distributions instead of explicit value lists.
# The bounds match the discrete grids used before:
#   C     in [0.1, 1, 10, 100, 1000, 10000]
#   gamma in [0.0001, 0.001, 0.01, 0.1, 1]
param_grid = {'SVM__C': loguniform(1e-1, 1e4),
              'SVM__gamma': loguniform(1e-4, 1e0)}

# Inner loop: 3-fold cross-validation used to score each sampled combination
inner = KFold(n_splits=3, shuffle=True, random_state=42)

# Number of hyper-parameter combinations sampled from the distributions.
# The search object is built BEFORE it is fitted, so this cell never refits
# an hpo_regr left over from a previous cell.
budget = 20
hpo_regr = RandomizedSearchCV(pipe_regr,
                              param_grid,
                              scoring='neg_mean_squared_error',
                              cv=inner,
                              n_jobs=4, verbose=0,
                              n_iter=budget,
                              # seed the sampler explicitly instead of relying
                              # only on the global NumPy random state
                              random_state=42
                              )
np.random.seed(42)
hpo_regr.fit(X=X_train, y=y_train)

In [None]:
# Inner result: best hyper-parameters and their cross-validated score on X_train
print(f"Best params: {hpo_regr.best_params_}, best score (inner!): {np.sqrt(-hpo_regr.best_score_)}")
# Outer result: performance of the tuned model on the held-out test partition.
# The label names the search method actually used in this section.
print(f"RMSE (outer!) of SVR with hyper-parameter tuning (randomized search): {rmse(y_test, hpo_regr.predict(X=X_test))}")

#### OBTAINING THE FINAL MODEL (FOR DEPLOYMENT, OR FOR SENDING TO A COMPETITION, ...)

If, in the end, we need a final model, we can get it by fitting hpo_regr on all the available data. Recall that hpo_regr performs hyper-parameter tuning as part of its fit.

In [None]:
# Re-run the hyper-parameter search, this time on ALL the available data
# (no holdout); the fitted search object is kept as the final model.
np.random.seed(42)
regrFinal = hpo_regr.fit(X=X, y=y)

In [None]:
# Best hyper-parameters of the final model and its inner (cross-validated) RMSE
(regrFinal.best_params_, np.sqrt(-regrFinal.best_score_))

#### MODEL BASED OPTIMIZATION (BAYESIAN OPTIMIZATION)

scikit-optimize (skopt) will be used for this: https://scikit-optimize.github.io. **Holdout** for model evaluation and **3-fold crossvalidation** for hyper-parameter tuning (with **Model Based Optimization** )

In [None]:
from sklearn.model_selection import RandomizedSearchCV, KFold
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.utils.fixes import loguniform
from skopt import BayesSearchCV
# Real defines a continuous search dimension for BayesSearchCV
from skopt.space import Real

# Preprocessing + model pipeline: SVMs need scaling, so inputs are
# standardized before the SVR is fitted
scaler = StandardScaler()
svr = SVR()

pipe_regr = Pipeline([
    ('scale', scaler),
    ('SVM', svr)])

# Search space as skopt dimensions. The bounds match the discrete grids used before:
#   C     in [0.1, 1, 10, 100, 1000, 10000]
#   gamma in [0.0001, 0.001, 0.01, 0.1, 1]
# skopt spells the log-scale prior "log-uniform" (with a hyphen).
param_grid = {'SVM__C': Real(1e-1, 1e4, prior="log-uniform"),
              'SVM__gamma': Real(1e-4, 1e0, prior="log-uniform")}

# Inner loop: 3-fold cross-validation used to score each proposed combination
inner = KFold(n_splits=3, shuffle=True, random_state=42)

# Number of hyper-parameter combinations evaluated by the optimizer.
# The search object is built BEFORE it is fitted, so this cell never refits
# an hpo_regr left over from a previous cell.
budget = 20
hpo_regr = BayesSearchCV(pipe_regr,
                         param_grid,
                         scoring='neg_mean_squared_error',
                         cv=inner,
                         n_jobs=4, verbose=0,
                         n_iter=budget
                         )
np.random.seed(42)
hpo_regr.fit(X=X_train, y=y_train)

In [None]:
# Inner result: best hyper-parameters and their cross-validated score on X_train
print(f"Best params: {hpo_regr.best_params_}, best score (inner!): {np.sqrt(-hpo_regr.best_score_)}")
# Outer result: performance of the tuned model on the held-out test partition.
# The label names the search method actually used in this section.
print(f"RMSE (outer!) of SVR with hyper-parameter tuning (Bayesian optimization): {rmse(y_test, hpo_regr.predict(X=X_test))}")

We can check if the optimization has converged

In [None]:
# The plotting helper lives in skopt.plots and must be imported before use
from skopt.plots import plot_convergence

# Convergence trace of the optimization run for the (single) search space
_ = plot_convergence(hpo_regr.optimizer_results_[0])
plt.show()

In [None]:
# The plotting helper lives in skopt.plots and must be imported before use
from skopt.plots import plot_objective

# `dimensions` supplies the axis labels, so it must name the hyper-parameters
# that were actually searched (C and gamma of the 'SVM' pipeline step).
# NOTE(review): labels are given in sorted key order, which is presumably how
# BayesSearchCV orders its dimensions - confirm against the rendered plot.
_ = plot_objective(hpo_regr.optimizer_results_[0],
                   dimensions=['SVM__C', 'SVM__gamma'],
                   n_minimum_search=int(1e8))
plt.show()