In [20]:
from sklearn.pipeline import Pipeline, FeatureUnion
# http://scikit-learn.org/stable/modules/pipeline.html

# Pipeline can be used to chain multiple estimators into one. This is useful as there is often a fixed sequence of
# steps in processing the data, for example feature selection, normalization and classification. Pipeline serves 
# two purposes here:
# Convenience: You only have to call fit and predict once on your data to fit a whole sequence of estimators.
# Joint parameter selection: You can grid search over parameters of all estimators in the pipeline at once.
# All estimators in a pipeline, except the last one, must be transformers (i.e. must have a transform method). 
# The last estimator may be any type (transformer, classifier, etc.).

# FeatureUnion combines several transformer objects into a new transformer that combines their output. 
# A FeatureUnion takes a list of transformer objects. During fitting, each of these is fit to the data 
# independently. For transforming data, the transformers are applied in parallel, and the sample vectors 
# they output are concatenated end-to-end into larger vectors.
# FeatureUnion serves the same purposes as Pipeline - convenience and joint parameter estimation and validation.
# FeatureUnion and Pipeline can be combined to create complex models.
# (A FeatureUnion has no way of checking whether two transformers might produce identical features. 
# It only produces a union when the feature sets are disjoint, and making sure they are is the caller’s 
# responsibility.)
# GridSearchCV: exhaustive search over a parameter grid with cross-validation.
# sklearn.grid_search was deprecated in scikit-learn 0.18 and removed in 0.20;
# the class now lives in sklearn.model_selection. Fall back to the legacy
# location so the notebook still runs on old installations.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC
# C-Support Vector Classification.
# The implementation is based on libsvm. The fit time complexity is more than quadratic in the number of samples,
# which makes it hard to scale to datasets with more than a few tens of thousands of samples.
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
# Principal component analysis (PCA)
# Linear dimensionality reduction using Singular Value Decomposition of the data and keeping only the most 
# significant singular vectors to project the data to a lower dimensional space.
# This implementation uses the scipy.linalg implementation of the singular value decomposition. It only works 
# for dense arrays and is not scalable to large dimensional data.
from sklearn.feature_selection import SelectKBest

In [21]:
# Load the iris dataset; the returned object exposes .data and .target,
# which the next cell unpacks into X and y.
iris = load_iris()
# Bare expression: display the loaded object as the cell output.
iris

 'data': array([[ 5.1,  3.5,  1.4,  0.2],
        [ 4.9,  3. ,  1.4,  0.2],
        [ 4.7,  3.2,  1.3,  0.2],
        [ 4.6,  3.1,  1.5,  0.2],
        [ 5. ,  3.6,  1.4,  0.2],
        [ 5.4,  3.9,  1.7,  0.4],
        [ 4.6,  3.4,  1.4,  0.3],
        [ 5. ,  3.4,  1.5,  0.2],
        [ 4.4,  2.9,  1.4,  0.2],
        [ 4.9,  3.1,  1.5,  0.1],
        [ 5.4,  3.7,  1.5,  0.2],
        [ 4.8,  3.4,  1.6,  0.2],
        [ 4.8,  3. ,  1.4,  0.1],
        [ 4.3,  3. ,  1.1,  0.1],
        [ 5.8,  4. ,  1.2,  0.2],
        [ 5.7,  4.4,  1.5,  0.4],
        [ 5.4,  3.9,  1.3,  0.4],
        [ 5.1,  3.5,  1.4,  0.3],
        [ 5.7,  3.8,  1.7,  0.3],
        [ 5.1,  3.8,  1.5,  0.3],
        [ 5.4,  3.4,  1.7,  0.2],
        [ 5.1,  3.7,  1.5,  0.4],
        [ 4.6,  3.6,  1. ,  0.2],
        [ 5.1,  3.3,  1.7,  0.5],
        [ 4.8,  3.4,  1.9,  0.2],
        [ 5. ,  3. ,  1.6,  0.2],
        [ 5. ,  3.4,  1.6,  0.4],
        [ 5.2,  3.5,  1.5,  0.2],
        [ 5.2,  3.4,  1.4,  0.2],
      

In [22]:
# Feature matrix and class labels, taken from the loaded dataset.
X = iris.data
y = iris.target

In [23]:
# This dataset is way too high-dimensional. Better do PCA:
# (keep only the first two principal components)
pca = PCA(n_components=2)

In [24]:
# Maybe some original features were good, too?
# Univariate selection keeping the single best-scoring original feature (k=1).
selection = SelectKBest(k=1)
# Bare expression: display the configured selector.
selection

SelectKBest(k=1, score_func=<function f_classif at 0x0000000014E0F7B8>)

In [25]:
# Combine both feature extractors into one transformer: the PCA projection
# and the univariate selection each see the data, and their outputs are
# concatenated column-wise.
combined_features = FeatureUnion(
    transformer_list=[
        ("pca", pca),
        ("univ_select", selection),
    ]
)
combined_features

FeatureUnion(n_jobs=1,
       transformer_list=[('pca', PCA(copy=True, n_components=2, whiten=False)), ('univ_select', SelectKBest(k=1, score_func=<function f_classif at 0x0000000014E0F7B8>))],
       transformer_weights=None)

In [26]:
# Use combined features to transform dataset:
# fit the union on (X, y), then pass the same X through the fitted transformers.
X_features = combined_features.fit(X, y).transform(X)
# Bare expression: display the transformed feature matrix.
X_features

array([[-2.68420713, -0.32660731,  1.4       ],
       [-2.71539062,  0.16955685,  1.4       ],
       [-2.88981954,  0.13734561,  1.3       ],
       [-2.7464372 ,  0.31112432,  1.5       ],
       [-2.72859298, -0.33392456,  1.4       ],
       [-2.27989736, -0.74778271,  1.7       ],
       [-2.82089068,  0.08210451,  1.4       ],
       [-2.62648199, -0.17040535,  1.5       ],
       [-2.88795857,  0.57079803,  1.4       ],
       [-2.67384469,  0.1066917 ,  1.5       ],
       [-2.50652679, -0.65193501,  1.5       ],
       [-2.61314272, -0.02152063,  1.6       ],
       [-2.78743398,  0.22774019,  1.4       ],
       [-3.22520045,  0.50327991,  1.1       ],
       [-2.64354322, -1.1861949 ,  1.2       ],
       [-2.38386932, -1.34475434,  1.5       ],
       [-2.6225262 , -0.81808967,  1.3       ],
       [-2.64832273, -0.31913667,  1.4       ],
       [-2.19907796, -0.87924409,  1.7       ],
       [-2.58734619, -0.52047364,  1.5       ],
       [-2.3105317 , -0.39786782,  1.7  

In [27]:
# Support vector classifier with a linear kernel; used as the final pipeline step.
svm = SVC(kernel="linear")
# kernel must be one of 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed' or a callable.

In [28]:
# Chain feature extraction and the classifier into a single estimator.
pipeline = Pipeline(steps=[("features", combined_features), ("svm", svm)])

# Grid over the PCA dimensionality, the number of selected features (k) and
# the SVM penalty C. Keys address nested parameters as
# <step>__<sub-step>__<parameter>.
param_grid = {
    "features__pca__n_components": [1, 2, 3],
    "features__univ_select__k": [1, 2],
    "svm__C": [0.1, 1, 10],
}
param_grid

{'features__pca__n_components': [1, 2, 3],
 'features__univ_select__k': [1, 2],
 'svm__C': [0.1, 1, 10]}

In [29]:
# Exhaustive search over every combination in param_grid; verbose=10 makes
# the search log each individual fit together with its score.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    verbose=10,
)
grid_search.fit(X, y)

Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV] features__pca__n_components=1, svm__C=0.1, features__univ_select__k=1 
[CV]  features__pca__n_components=1, svm__C=0.1, features__univ_select__k=1, score=0.960784 -   0.1s
[CV] features__pca__n_components=1, svm__C=0.1, features__univ_select__k=1 
[CV]  features__pca__n_components=1, svm__C=0.1, features__univ_select__k=1, score=0.901961 -   0.0s
[CV] features__pca__n_components=1, svm__C=0.1, features__univ_select__k=1 
[CV]  features__pca__n_components=1, svm__C=0.1, features__univ_select__k=1, score=0.979167 -   0.0s
[CV] features__pca__n_components=1, svm__C=1, features__univ_select__k=1 
[CV]  features__pca__n_components=1, svm__C=1, features__univ_select__k=1, score=0.941176 -   0.0s
[CV] features__pca__n_components=1, svm__C=1, features__univ_select__k=1 
[CV]  features__pca__n_components=1, svm__C=1, features__univ_select__k=1, score=0.921569 -   0.0s
[CV] features__pca__n_components=1, svm__C=1, features__univ_s

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done   2 jobs       | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done   5 jobs       | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done   8 jobs       | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done  13 jobs       | elapsed:    0.2s



[CV] features__pca__n_components=1, svm__C=1, features__univ_select__k=2 
[CV]  features__pca__n_components=1, svm__C=1, features__univ_select__k=2, score=0.960784 -   0.0s
[CV] features__pca__n_components=1, svm__C=1, features__univ_select__k=2 
[CV]  features__pca__n_components=1, svm__C=1, features__univ_select__k=2, score=0.921569 -   0.0s
[CV] features__pca__n_components=1, svm__C=1, features__univ_select__k=2 
[CV]  features__pca__n_components=1, svm__C=1, features__univ_select__k=2, score=1.000000 -   0.0s
[CV] features__pca__n_components=1, svm__C=10, features__univ_select__k=2 
[CV]  features__pca__n_components=1, svm__C=10, features__univ_select__k=2, score=0.980392 -   0.0s
[CV] features__pca__n_components=1, svm__C=10, features__univ_select__k=2 
[CV]  features__pca__n_components=1, svm__C=10, features__univ_select__k=2, score=0.901961 -   0.0s
[CV] features__pca__n_components=1, svm__C=10, features__univ_select__k=2 
[CV]  features__pca__n_components=1, svm__C=10, feature

[Parallel(n_jobs=1)]: Done  18 jobs       | elapsed:    0.2s
[Parallel(n_jobs=1)]: Done  25 jobs       | elapsed:    0.2s
[Parallel(n_jobs=1)]: Done  32 jobs       | elapsed:    0.3s



[CV] features__pca__n_components=2, svm__C=0.1, features__univ_select__k=2 
[CV]  features__pca__n_components=2, svm__C=0.1, features__univ_select__k=2, score=0.979167 -   0.0s
[CV] features__pca__n_components=2, svm__C=1, features__univ_select__k=2 
[CV]  features__pca__n_components=2, svm__C=1, features__univ_select__k=2, score=1.000000 -   0.0s
[CV] features__pca__n_components=2, svm__C=1, features__univ_select__k=2 
[CV]  features__pca__n_components=2, svm__C=1, features__univ_select__k=2, score=0.960784 -   0.0s
[CV] features__pca__n_components=2, svm__C=1, features__univ_select__k=2 
[CV]  features__pca__n_components=2, svm__C=1, features__univ_select__k=2, score=0.979167 -   0.0s
[CV] features__pca__n_components=2, svm__C=10, features__univ_select__k=2 
[CV]  features__pca__n_components=2, svm__C=10, features__univ_select__k=2, score=0.980392 -   0.0s
[CV] features__pca__n_components=2, svm__C=10, features__univ_select__k=2 
[CV]  features__pca__n_components=2, svm__C=10, featu

[Parallel(n_jobs=1)]: Done  41 jobs       | elapsed:    0.3s
[Parallel(n_jobs=1)]: Done  50 jobs       | elapsed:    0.4s
[Parallel(n_jobs=1)]: Done  54 out of  54 | elapsed:    0.4s finished



[CV]  features__pca__n_components=3, svm__C=1, features__univ_select__k=2, score=1.000000 -   0.0s
[CV] features__pca__n_components=3, svm__C=1, features__univ_select__k=2 
[CV]  features__pca__n_components=3, svm__C=1, features__univ_select__k=2, score=0.960784 -   0.0s
[CV] features__pca__n_components=3, svm__C=1, features__univ_select__k=2 
[CV]  features__pca__n_components=3, svm__C=1, features__univ_select__k=2, score=0.979167 -   0.0s
[CV] features__pca__n_components=3, svm__C=10, features__univ_select__k=2 
[CV]  features__pca__n_components=3, svm__C=10, features__univ_select__k=2, score=1.000000 -   0.0s
[CV] features__pca__n_components=3, svm__C=10, features__univ_select__k=2 
[CV]  features__pca__n_components=3, svm__C=10, features__univ_select__k=2, score=0.921569 -   0.0s
[CV] features__pca__n_components=3, svm__C=10, features__univ_select__k=2 
[CV]  features__pca__n_components=3, svm__C=10, features__univ_select__k=2, score=1.000000 -   0.0s


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('pca', PCA(copy=True, n_components=2, whiten=False)), ('univ_select', SelectKBest(k=1, score_func=<function f_classif at 0x0000000014E0F7B8>))],
       transformer_weights=None)), ('svm', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))]),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'features__pca__n_components': [1, 2, 3], 'svm__C': [0.1, 1, 10], 'features__univ_select__k': [1, 2]},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=10)

In [30]:
# Print the pipeline configured with the best parameter combination found by the search.
print(grid_search.best_estimator_)

Pipeline(steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('pca', PCA(copy=True, n_components=2, whiten=False)), ('univ_select', SelectKBest(k=2, score_func=<function f_classif at 0x0000000014E0F7B8>))],
       transformer_weights=None)), ('svm', SVC(C=1, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))])
