# Building Pipelines

In [1]:
# Building pipelines with preprocessing
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Load the breast cancer dataset and hold out a test split;
# random_state=0 fixes the shuffle so the split is repeatable.
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    random_state=0,
)

# Chain the preprocessing step and the classifier into one estimator.
# A Pipeline exposes fit, predict and score like any other estimator.
pipe = Pipeline(
    steps=[
        ("scaler", MinMaxScaler()),
        ("svm", SVC()),
    ]
)

# fit() first fits the MinMaxScaler on the training data, transforms that
# data, and then fits the SVC on the scaled result.
pipe.fit(X_train, y_train)

# score() pushes the test data through the already-fitted scaler before
# evaluating the SVC on it.
print("Test score {}".format(pipe.score(X_test, y_test)))

Test score 0.972027972027972


# Using Pipeline in GridSearches

In [4]:
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameter values to search over. Keys follow the
# "<step name>__<parameter name>" convention, so "svm__C" addresses the
# C parameter of the pipeline step named "svm".
param_grid = {
    "svm__C": [0.001, 0.01, 0.1, 1, 10, 100],
    "svm__gamma": [0.001, 0.01, 0.1, 1, 10, 100],
}

# The pipeline itself is the estimator handed to the grid search, so the
# scaler and the SVC are cross-validated together (5 folds).
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)

# Report the best mean cross-validation score, the held-out test score,
# and the parameter combination that achieved the best CV score.
print("Best cross validation accuracy {}".format(grid.best_score_))
print("Test set score {}".format(grid.score(X_test, y_test)))
print("Best Parameters {}".format(grid.best_params_))

Best cross validation accuracy 0.9812311901504789
Test set score 0.972027972027972
Best Parameters {'svm__C': 1, 'svm__gamma': 1}


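What makes this step different from what we did in the "parameter selection with preprocessing" section is that the data is now split before any preprocessing happens: within each cross-validation split, the scaler is fit only on the training part and then applied to the validation part, so no information leaks from the validation data into the preprocessing.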