In [None]:
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
# Load the Boston housing regression dataset as a dict-like bunch exposing
# 'data' (feature matrix) and 'target' (regression target).
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 -- this cell needs an older scikit-learn; confirm the pinned version.
data = load_boston()

# Hold out a test set. random_state is fixed so the split -- and therefore
# every score reported further down -- is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    data['data'], data['target'], random_state=0
)

In [None]:
from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge

In [None]:
"""
The pipeline we are going to set up is composed of the following tasks:

Data Normalization: in this tutorial we have selected three different normalization methods: StandardScaler, RobustScaler, and QuantileTransformer (check out the documentation).
Dimensionality Reduction: we selected Principal Component Analysis (PCA) and a univariate feature selection algorithm as possible candidates.
Regression: we apply a simple regularized linear method, although the method is easily extendable to other learning algorithms.
"""

In [None]:
# Stand-alone estimators (all with default settings) used for the manual,
# step-by-step version of the workflow: scaling, projection, regression.
scaler, pca, ridge = StandardScaler(), PCA(), Ridge()

In [None]:
# Manual, step-by-step version of what the Pipeline defined later automates.
# Each stage writes to a NEW variable so that X_train remains the raw training
# matrix: later cells fit complete pipelines on X_train and score them on the
# raw X_test, so X_train must not already be scaled/projected. Keeping the
# input untouched also makes this cell idempotent when re-run.
X_train_scaled = scaler.fit_transform(X_train)
X_train_reduced = pca.fit_transform(X_train_scaled)
ridge.fit(X_train_reduced, y_train)

In [None]:
from sklearn.pipeline import Pipeline
# Chain the three stages into a single estimator. The step names
# ('scaler', 'reduce_dim', 'regressor') are the handles used later to
# address each stage's parameters in the grid search.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('reduce_dim', PCA()),
    ('regressor', Ridge()),
])

In [None]:
# Fit every stage of the pipeline on the training split (fit() returns the
# pipeline itself, so rebinding the name is not required), then report the
# regressor's score on the held-out split.
pipe.fit(X_train, y_train)
test_score = pipe.score(X_test, y_test)
print('Testing score: ', test_score)

In [None]:
# Inspect the fitted PCA stage (the second pipeline step, named 'reduce_dim')
# and show the variance captured by each component.
print(pipe.named_steps['reduce_dim'].explained_variance_)

In [None]:
"""
Hyper-parameters are parameters that are not learned from the data during fitting: they are set before training and are tuned, either manually or automatically (for example through a grid search), to maximize the model performance against a validation set.
"""

# For PCA we sweep the number of retained components from 1 through 10
# to see how the score varies with dimensionality.
import numpy as np
n_features_to_test = np.arange(1, 11)

# Candidate ridge regularization strengths: powers of two, 2**-6 .. 2**5.
alpha_to_test = np.power(2.0, np.arange(-6, 6))

# Search grid; each key is '<pipeline step name>__<parameter name>'.
params = {
    'reduce_dim__n_components': n_features_to_test,
    'regressor__alpha': alpha_to_test,
}

"""
It is worth noting the convention used to name the parameters: the name of the pipeline step, followed by a double underscore (__), and finally the name of the parameter within that step (for example, 'regressor__alpha').
"""

from sklearn.model_selection import GridSearchCV
# Exhaustively evaluate every combination in `params` with cross-validation
# on the training split; fit() returns the search object itself, so the
# construction and the fit can be written as separate statements.
gridsearch = GridSearchCV(pipe, params, verbose=1)
gridsearch.fit(X_train, y_train)

# Score the search object on the held-out split.
final_score = gridsearch.score(X_test, y_test)
print('Final score is: ', final_score)

In [None]:
"""
We can follow the same approach, this time to decide which algorithm we should use, for example, to perform data normalization:
"""

# Candidate normalizers: a whole pipeline step can itself be a grid dimension.
scalers_to_test = [
    StandardScaler(),
    RobustScaler(),
    QuantileTransformer(),
]

# Using the bare step name ('scaler') as a key swaps the entire step, while
# the '<step>__<param>' keys keep tuning parameters inside the other steps.
params = {
    'scaler': scalers_to_test,
    'reduce_dim__n_components': n_features_to_test,
    'regressor__alpha': alpha_to_test,
}

"""
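If we also wanted to choose between dimensionality-reduction methods, a single parameter dictionary would not be enough, because PCA and SelectKBest expose different parameters (n_components vs. k).
Luckily, GridSearchCV also accepts a list of parameter dictionaries, which solves this issue: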
"""

# One sub-grid per dimensionality-reduction method, because each method
# exposes its own size parameter (n_components for PCA, k for SelectKBest).
pca_grid = {
    'scaler': scalers_to_test,
    'reduce_dim': [PCA()],
    'reduce_dim__n_components': n_features_to_test,
    'regressor__alpha': alpha_to_test,
}
kbest_grid = {
    'scaler': scalers_to_test,
    'reduce_dim': [SelectKBest(f_regression)],
    'reduce_dim__k': n_features_to_test,
    'regressor__alpha': alpha_to_test,
}
params = [pca_grid, kbest_grid]

# Search over both sub-grids on the training split, then report the score
# of the search object on the held-out split.
gridsearch = GridSearchCV(pipe, params, verbose=1)
gridsearch.fit(X_train, y_train)
print('Final score is: ', gridsearch.score(X_test, y_test))