#### Import required packages

In [20]:
import warnings
import numpy as np
from sklearn import linear_model, decomposition, datasets
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler

In [21]:
warnings.filterwarnings("ignore")

#### Load data

In [22]:
# Load the breast cancer data
dataset = datasets.load_breast_cancer()

# Create X from the dataset's features
X = dataset.data

# Create y from the dataset's output
y = dataset.target

#### Create pipelines

In [23]:
# Create an scaler object
sc = StandardScaler()

# Create a pca object
pca = decomposition.PCA()

# Create a logistic regression object with an L2 penalty
logistic = linear_model.LogisticRegression()

# Create a pipeline of three steps. First, standardize the data.
# Second, tranform the data with PCA.
# Third, train a logistic regression on the data.
pipe = Pipeline(steps=[('sc', sc), 
                       ('pca', pca), 
                       ('logistic', logistic)])

#### Create parameter space

In [24]:
# Create a list of a sequence of integers from 1 to 30 (the number of features in X + 1)
n_components = list(range(1,X.shape[1]+1,1))

# Create a list of values of the regularization parameter
C = np.logspace(-4, 4, 50)

# Create a list of options for the regularization penalty
penalty = ['l1', 'l2']

# Create a dictionary of all the parameter options 
# Note has you can access the parameters of steps of a pipeline by using '__’
parameters = dict(pca__n_components=n_components, 
                  logistic__C=C,
                  logistic__penalty=penalty)

#### Conduct parameter optmization with pipeline

In [25]:
# Create a grid search object
clf = GridSearchCV(pipe, parameters, cv = 5)

# Fit the grid search
clf.fit(X, y)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('sc',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('pca',
                                        PCA(copy=True, iterated_power='auto',
                                            n_components=None,
                                            random_state=None,
                                            svd_solver='auto', tol=0.0,
                                            whiten=False)),
                                       ('logistic',
                                        LogisticRegression(C=1.0,
                                                           class_weight=None,
                                                           dual=False

In [26]:
# View The Best Parameters
print('Best Penalty:', clf.best_estimator_.get_params()['logistic__penalty'])
print('Best C:', clf.best_estimator_.get_params()['logistic__C'])
print('Best Number Of Components:', clf.best_estimator_.get_params()['pca__n_components'])

Best Penalty: l2
Best C: 0.08685113737513521
Best Number Of Components: 18


#### Use cross validation to evaluate model

In [27]:
# Fit the grid search using 5-Fold cross validation
cross_val_score(clf, X, y)

array([0.97894737, 0.97368421, 0.97354497])