### Sample program for grid search of hyperparameters  

#### Import libraries  

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVR
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.pipeline import Pipeline
from joblib import dump

#### Parameters  

In [None]:
# Relative path of the input data file that is passed to pd.read_csv
csv_in = "../ai-04/winequality-red_modified-utf8.txt"

#### Read CSV file  

In [None]:
# Load the whitespace-delimited table: the first 13 lines of the file are
# skipped and the next line is used as the column header.
# The separator is a regular expression, so it is written as a raw string;
# a plain '\s+' is an invalid escape sequence and triggers a warning on
# recent Python versions.
df_all = pd.read_csv(csv_in, sep=r'\s+', skiprows=13, header=0)
print(df_all.shape)
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would emit an extra "None" line).
df_all.info()
display(df_all.head())

#### Check rows with missing values  

In [None]:
# Boolean mask: True for every row that contains at least one missing value
has_missing = df_all.isnull().any(axis=1)
display(df_all[has_missing])

#### Drop rows with missing values  

In [None]:
# Keep only complete rows, then renumber the index from 0 without keeping
# the old index as a column.
df = df_all.dropna()
df = df.reset_index(drop=True)
print(df.shape)
display(df.head())

#### Get X and y  

In [None]:
# Explanatory variables: every column except the target.
# The target is removed by name (consistent with how y is selected) rather
# than by position, so 'quality' can never leak into X if the column order
# of the input file changes.
X = df.drop(columns='quality')
# Objective variable
y = df['quality']
print('X:', X.shape)
display(X.head())
print('y:', y.shape)
print(y.head())

#### Make pipeline and set parameters for grid search   

In [None]:
# Two-step pipeline: scale every feature to [0, 1], then fit an RBF-kernel SVR
scaler_step = ('scaler', MinMaxScaler(feature_range=(0, 1), copy=True))
svr_step = ('svr', SVR(kernel='rbf'))
pipe = Pipeline(steps=[scaler_step, svr_step])

# Candidate values in decade steps for each SVR hyperparameter
C_grid = [1.0, 10.0, 100.0, 1000.0]       # C:       10^0  .. 10^3
g_grid = [0.0001, 0.001, 0.01, 0.1]       # gamma:   10^-4 .. 10^-1
e_grid = [0.01, 0.1, 1.0, 10.0, 100.0]    # epsilon: 10^-2 .. 10^2

# Search space for the grid search; keys follow the
# '<pipeline step name>__<parameter name>' convention and address the
# 'svr' step of the pipeline.
param_grid = dict(
    svr__C=C_grid,
    svr__gamma=g_grid,
    svr__epsilon=e_grid,
)

#### Preparation of objects for cross validation  

In [None]:
# Both splitters use 4 shuffled folds; they differ only in the random seed
# so that the inner and outer loops do not share the same partition.
kfold_options = dict(n_splits=4, shuffle=True)
# Inner loop: hyperparameter selection in the grid search
grid_cv = KFold(random_state=7, **kfold_options)
# Outer loop: estimation of generalization performance
gen_cv = KFold(random_state=11, **kfold_options)

#### Define the grid search for hyperparameters  

In [None]:
# Exhaustive search over param_grid; each candidate is scored by the
# negative mean squared error on the grid_cv folds.
gs = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=grid_cv,
    scoring='neg_mean_squared_error',
)

#### Estimation of generalization performance  

In [None]:
%%time
# Nested cross-validation: the whole grid search is re-run inside every
# fold of gen_cv, and the tuned model is scored on that fold's held-out part.
nested_score = cross_val_score(
    gs,
    X=X,
    y=y,
    cv=gen_cv,
    scoring='neg_mean_squared_error',
)
print(nested_score)
# Scores are negative MSE values: flip the sign of their mean and take the
# square root to report an RMSE.
print(np.sqrt(-np.mean(nested_score)))

#### Cross-validation to obtain the model with the best hyperparameter set (best estimator)  
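- Note: with the default `refit=True`, `GridSearchCV` refits the best pipeline on the whole data (X, y) at the end of `gs.fit(X, y)`, so `gs_best` is already trained and ready for prediction.  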

In [None]:
%%time
# Run the grid search on the full data set (fit() returns the search object
# itself) and keep the pipeline with the best hyperparameter set.
gs_best = gs.fit(X, y).best_estimator_

In [None]:
# Show the selected hyperparameters, then the corresponding pipeline
print(gs.best_params_, gs_best, sep='\n')

#### Check the predictions during cross-validation   

In [None]:
# Build a fresh, unfitted pipeline with the same steps as the one that was
# tuned by the grid search.
pipe = Pipeline([('scaler', MinMaxScaler(feature_range=(0, 1), copy=True)),
                 ('svr', SVR(kernel='rbf'))])
# Apply the hyperparameters actually selected by the grid search instead of
# hard-coded values, so this check cannot go stale when the data or the
# parameter grid changes.
pipe.set_params(**gs.best_params_)
# Out-of-fold predictions on the same folds that the grid search used:
# each sample is predicted by a model that did not see it during fitting.
y_pred = cross_val_predict(pipe, X=X, y=y, cv=grid_cv)

In [None]:
# Common axis range: the span of all predicted and true values, widened by
# 10 % of that span on each side.
y_both = np.concatenate([np.ravel(y_pred), np.ravel(y)])
y_min, y_max = y_both.min(), y_both.max()
y_margin = 0.1 * (y_max - y_min)
y_min = y_min - y_margin
y_max = y_max + y_margin
axis_range = [y_min, y_max]

# Predicted vs. true values; points on the diagonal are perfect predictions
plt.scatter(y_pred, y, alpha=0.1)
plt.plot(axis_range, axis_range)
# Same scale and same limits on both axes so the diagonal sits at 45 degrees
plt.gca().set_aspect('equal', adjustable='box')
plt.xlim(*axis_range)
plt.ylim(*axis_range)
plt.xlabel('Pred')
plt.ylabel('True')
plt.show()

#### Save the best predictor  

In [None]:
# Short label for the data set; it becomes part of the model file name
tag = 'wine'
model_file = f'svr_best_{tag}.joblib'
# Persist the best (already fitted) pipeline to disk with joblib
dump(value=gs_best, filename=model_file)

#### Demo for loading the best SVR model from file and using it for prediction  

In [None]:
from joblib import load

# Restore the saved pipeline. joblib files are pickle-based, so only load
# files that come from a trusted source (here: the file written above).
svr_from_file = load(filename=model_file)
# Predict with the restored pipeline on the full feature table
y_pred_from_file = svr_from_file.predict(X)

#### Compare the results with original prediction results  

In [None]:
from sklearn.metrics import mean_squared_error

# Predictions of the in-memory best estimator on the same data
y_pred = gs_best.predict(X)
# Mean squared difference between the in-memory predictions and the ones
# made by the model loaded from file
print(mean_squared_error(y_true=y_pred, y_pred=y_pred_from_file))

#### Do prediction  

In [None]:
# Two hand-written samples; values are listed in the column order of X
test_rows = [
    [8.0, 0.8, 0.3, 4.0, 0.08, 21, 64, 0.99, 3.2, 0.5, 9.0],
    [6.5, 0.6, 0.1, 2.4, 0.06, 13, 29, 0.98, 3.4, 0.6, 11.7],
]
X_test = pd.DataFrame(test_rows, columns=X.columns)
# Predict with the pipeline that was loaded from file
y_pred = svr_from_file.predict(X_test)
print(y_pred)