In [2]:
%%markdown
# Pipeline 1

# Pipeline 1

In [1]:
from __future__ import print_function

import numpy as np

from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# For reproducibility
np.random.seed(1000)

nb_samples = 500


if __name__ == '__main__':
    # Create the dataset
    X, Y = make_classification(n_samples=nb_samples, n_informative=15, n_redundant=5, n_classes=2)

    # Create the steps for the pipeline
    pca = PCA(n_components=10)
    scaler = StandardScaler()
    svc = SVC(kernel='poly', gamma=3)

    steps = [
                ('pca', pca),
                ('scaler', scaler),
        ('classifier', svc)
    ]

    # Create the pipeline
    pipeline = Pipeline(steps)

    # Perform a grid search
    param_grid = {
        'pca__n_components': [5, 10, 12, 15, 18, 20],
        'classifier__kernel': ['rbf', 'poly'],
        'classifier__gamma': [0.05, 0.1, 0.2, 0.5],
        'classifier__degree': [2, 3, 5]
    }

    gs = GridSearchCV(pipeline, param_grid)
    gs.fit(X, Y)

    print('Best estimator:')
    print(gs.best_estimator_)

    print('Best score:')
    print(gs.best_score_)

Best estimator:
Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=15, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('classifier', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=2, gamma=0.2, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])
Best score:
0.924


In [3]:
%%markdown
# Pipeline 2

# Pipeline 2

In [4]:
from __future__ import print_function

import numpy as np
import warnings

from sklearn.datasets import load_digits
from sklearn.decomposition import PCA, NMF
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# For reproducibility
np.random.seed(1000)


if __name__ == '__main__':
    warnings.simplefilter("ignore")

    # Load the dataset
    digits = load_digits()

    # Create the steps for the pipeline
    pca = PCA()
    nmf = NMF()
    scaler = StandardScaler()
    kbest = SelectKBest(f_classif)
    lr = LogisticRegression()
    svc = SVC()

    pipeline_steps = [
        ('dimensionality_reduction', pca),
        ('normalization', scaler),
        ('classification', lr)
    ]

    # Create the pipeline
    pipeline = Pipeline(pipeline_steps)

    # Perform a grid search
    pca_nmf_components = [10, 20, 30]

    param_grid = [
        {
            'dimensionality_reduction': [pca],
            'dimensionality_reduction__n_components': pca_nmf_components,
            'classification': [lr],
            'classification__C': [1, 5, 10, 20]
        },
        {
            'dimensionality_reduction': [pca],
            'dimensionality_reduction__n_components': pca_nmf_components,
            'classification': [svc],
            'classification__kernel': ['rbf', 'poly'],
            'classification__gamma': [0.05, 0.1, 0.2, 0.5, 1.0],
            'classification__degree': [2, 3, 5],
            'classification__C': [1, 5, 10, 20]
        },
        {
            'dimensionality_reduction': [nmf],
            'dimensionality_reduction__n_components': pca_nmf_components,
            'classification': [lr],
            'classification__C': [1, 5, 10, 20]
        },
        {
            'dimensionality_reduction': [nmf],
            'dimensionality_reduction__n_components': pca_nmf_components,
            'classification': [svc],
            'classification__kernel': ['rbf', 'poly'],
            'classification__gamma': [0.05, 0.1, 0.2, 0.5, 1.0],
            'classification__degree': [2, 3, 5],
            'classification__C': [1, 5, 10, 20]
        },
        {
            'dimensionality_reduction': [kbest],
            'classification': [svc],
            'classification__kernel': ['rbf', 'poly'],
            'classification__gamma': [0.05, 0.1, 0.2, 0.5, 1.0],
            'classification__degree': [2, 3, 5],
            'classification__C': [1, 5, 10, 20]
        },
    ]

    gs = GridSearchCV(pipeline, param_grid)
    gs.fit(digits.data, digits.target)

    print('Best estimator:')
    print(gs.best_estimator_)

    print('Best score:')
    print(gs.best_score_)
    

Best estimator:
Pipeline(memory=None,
     steps=[('dimensionality_reduction', NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
  n_components=30, random_state=None, shuffle=False, solver='cd',
  tol=0.0001, verbose=0)), ('normalization', StandardScaler(copy=True, with_mean=True, with_std=True)), ('classification'...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])
Best score:
0.9693934335


In [5]:
%%markdown
# Feature Union

# Feature Union

In [6]:
from __future__ import print_function

import numpy as np
import warnings

from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# For reproducibility
np.random.seed(1000)


if __name__ == '__main__':
    warnings.simplefilter("ignore")

    # Load the dataset
    digits = load_digits()

    # Create the steps for a feature union
    steps_fu = [
        ('pca', PCA(n_components=10)),
        ('kbest', SelectKBest(f_classif, k=5)),
    ]

    # Create the steps for the pipeline
    fu = FeatureUnion(steps_fu)
    scaler = StandardScaler()
    svc = SVC(kernel='rbf', C=5.0, gamma=0.05)

    pipeline_steps = [
        ('fu', fu),
        ('scaler', scaler),
        ('classifier', svc)
    ]

    pipeline = Pipeline(pipeline_steps)

    print('Cross-validation score:')
    print(cross_val_score(pipeline, digits.data, digits.target, cv=10).mean())
    

Cross-validation score:
0.965464333604
