### [Pipelining](http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html#sklearn.pipeline.Pipeline) | [demo](plot_digits_pipe.ipynb)

* chains estimators together
* use case #1: only need to call fit or predict once per pipeline
* use case #2: you can grid search over parameters for all estimators in the pipeline

* demos:
   [Anova SVM](feature_selection_pipeline.ipynb)
   [text feature extract](grid_search_text_feature_extraction.ipynb)
   [digits](plot_digits_pipe.ipynb)
   [RBF kernel approx](plot_kernel_approximation.ipynb)
   [Anova SVM, univariate features](plot_svm_anova.ipynb)

In [15]:
# pipelining
# A Pipeline is built from a list of (step name, estimator) pairs;
# here PCA feeds its output into an SVC classifier.
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

estimators = [('reduce_dim', PCA()), ('clf', SVC())]
pipe = Pipeline(steps=estimators)
print(pipe)

Pipeline(steps=[('reduce_dim', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('clf', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])


In [16]:
# make_pipeline
# Shorthand constructor: takes the estimators directly and names the steps
# for you (see 'binarizer' / 'multinomialnb' in the output below).
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Binarizer

make_pipeline(
    Binarizer(),
    MultinomialNB())

Pipeline(steps=[('binarizer', Binarizer(copy=True, threshold=0.0)), ('multinomialnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [17]:
# The estimators are stored in the `steps` attribute as the same list of
# (name, estimator) tuples the Pipeline was built from; index 0 is the
# first step ('reduce_dim').
pipe.steps[0]

('reduce_dim',
 PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
   svd_solver='auto', tol=0.0, whiten=False))

In [18]:
# `named_steps` allows looking up a step's estimator by its name instead of
# by its position in `steps`.
pipe.named_steps['reduce_dim']

PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [19]:
# Setting a parameter of a step's estimator: the keyword is
# '<step name>__<parameter name>', so clf__C=10 sets C on the 'clf' step
# (the SVC in the output below now shows C=10).
pipe.set_params(clf__C=10)

Pipeline(steps=[('reduce_dim', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('clf', SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [20]:
from sklearn.model_selection import GridSearchCV

# Candidate values for parameters of individual pipeline steps, keyed as
# '<step name>__<parameter name>'.
params = {
    'reduce_dim__n_components': [2, 5, 10],
    'clf__C': [0.1, 10, 100],
}

grid_search = GridSearchCV(pipe, param_grid=params)

In [21]:
from sklearn.linear_model import LogisticRegression

# Whole steps can be searched over as well: a key that is just the step name
# lists candidate estimators for that step, alongside the usual
# '<step name>__<parameter name>' keys.
params = {
    'reduce_dim': [None, PCA(5), PCA(10)],
    'clf': [SVC(), LogisticRegression()],
    'clf__C': [0.1, 10, 100],
}

grid_search = GridSearchCV(pipe, param_grid=params)
grid_search

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(steps=[('reduce_dim', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('clf', SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'clf': [SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False), LogisticRegression(C=1.0, class_weight=None, dual=False...nents=10, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)], 'clf__C': [0.1, 10, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [22]:
# pipelining: univariate (ANOVA) feature selection followed by a linear SVM
from sklearn import svm
# Import make_classification from sklearn.datasets directly: the
# sklearn.datasets.samples_generator module path is deprecated and is not
# importable in recent scikit-learn releases, while this form works on both
# old and new versions.
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.pipeline import Pipeline

# generate some data to play with
X, y = make_classification(
    n_informative=5,
    n_redundant=0,
    random_state=42)

# ANOVA SVM-C
# NOTE(review): f_regression is the regression scoring function, but y here
# holds class labels; f_classif looks like the intended score for an "ANOVA"
# filter. Left unchanged so the output shown below stays valid -- confirm
# before switching.
anova_filter = SelectKBest(
    f_regression,
    k=5)
clf = svm.SVC(
    kernel='linear')
anova_svm = Pipeline([
        ('anova', anova_filter), ('svc', clf)])

# Parameters of a step are addressed as '<step name>__<parameter name>'.
# Here k (5 when the filter was created above) is overridden to 10 on the
# 'anova' step and C is set to 0.1 on the 'svc' step, then the whole
# pipeline is fitted in one call.
anova_svm.set_params(
    anova__k=10,
    svc__C=.1).fit(X, y)

# predict / score run the data through every step of the fitted pipeline
prediction = anova_svm.predict(X)
anova_svm.score(X, y)

# getting the selected features chosen by anova_filter
# (boolean mask over the input features; 10 entries are True, matching k=10)
anova_svm.named_steps['anova'].get_support()

array([ True,  True,  True, False, False,  True, False,  True,  True,
        True, False, False,  True, False,  True, False, False, False,
       False,  True], dtype=bool)

### Feature Union

[API](http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.FeatureUnion.html#sklearn.pipeline.FeatureUnion) |
[demo](hetero_feature_union.ipynb) |
[demo: feature stacker](feature_stacker.ipynb)

In [23]:
# feature union
# A FeatureUnion is built, like a Pipeline, from (name, transformer) pairs.
from sklearn.decomposition import KernelPCA
from sklearn.decomposition import PCA
from sklearn.pipeline import FeatureUnion

estimators = [
    ('linear_pca', PCA()),
    ('kernel_pca', KernelPCA()),
]
combined = FeatureUnion(transformer_list=estimators)
combined

FeatureUnion(n_jobs=1,
       transformer_list=[('linear_pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('kernel_pca', KernelPCA(alpha=1.0, coef0=1, copy_X=True, degree=3, eigen_solver='auto',
     fit_inverse_transform=False, gamma=None, kernel='linear',
     kernel_params=None, max_iter=None, n_components=None, n_jobs=1,
     random_state=None, remove_zero_eig=False, tol=0))],
       transformer_weights=None)

In [24]:
# set_params can replace a whole transformer by its name: here the
# 'kernel_pca' entry is set to None (see transformer_list in the output below).
combined.set_params(kernel_pca=None)

FeatureUnion(n_jobs=1,
       transformer_list=[('linear_pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('kernel_pca', None)],
       transformer_weights=None)