In [1]:
from sklearn.datasets import make_classification
from sklearn import linear_model, decomposition
from sklearn.pipeline import Pipeline

In [2]:
# Synthetic two-class dataset: 10,000 samples x 500 features, with 250 of the
# features requested as redundant.  The seed is fixed so that every run of
# the notebook generates exactly the same data.
x, y = make_classification(
    n_classes=2,
    n_samples=10000,
    n_features=500,
    n_redundant=250,
    random_state=42,
)

In [3]:
# PCA -> logistic regression pipeline.
#
# The solver is pinned to 'liblinear' because the grid search below tries both
# the 'l1' and 'l2' penalties.  Relying on the library default is fragile:
# older scikit-learn releases default to liblinear (which handles both), but
# newer releases default to 'lbfgs', which rejects penalty='l1' and would make
# half of the grid fail.
logistic = linear_model.LogisticRegression(solver='liblinear')

# n_components is intentionally left unset here; it is supplied by the grid
# search through the 'pca__n_components' key.
pca = decomposition.PCA()

# Step names ('pca', 'logistic') are the prefixes used in the parameter grid.
pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)])

In [4]:
# Hyper-parameter grid for the search.  Each key is written as
# '<pipeline step>__<parameter>', so the prefix must match one of the step
# names used when the pipeline was built ('pca' and 'logistic').
grid = {
    'pca__n_components': [50, 100, 250],
    'logistic__C': [1e-4, 1.0, 1e4],
    'logistic__penalty': ['l1', 'l2'],
}

In [5]:
# Quick look at the generated label vector (binary: n_classes=2 above).
y

array([0, 1, 1, ..., 1, 1, 1])

In [6]:
# GridSearchCV lives in sklearn.model_selection since scikit-learn 0.18; the
# legacy sklearn.grid_search module was deprecated then and removed in 0.20.
# Prefer the current location and fall back only on very old installations so
# the notebook survives a fresh kernel on either side of that change.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV

In [7]:
# Wrap the pipeline in an exhaustive search over every combination in `grid`.
# All other search options (cv, scoring, n_jobs, ...) are left at their
# library defaults.
estimator = GridSearchCV(estimator=pipe, param_grid=grid)

In [9]:
# Run the search on the full dataset: 3 n_components x 3 C values x 2 penalties
# = 18 parameter combinations.  %time reports the CPU and wall-clock cost so
# readers know how expensive this cell is before re-running it.
%time estimator.fit(x, y)

CPU times: user 1min 25s, sys: 6.59 s, total: 1min 32s
Wall time: 35.8 s


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(steps=[('pca', PCA(copy=True, n_components=None, whiten=False)), ('logistic', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'pca__n_components': [50, 100, 250], 'logistic__penalty': ['l1', 'l2'], 'logistic__C': [0.0001, 1.0, 10000.0]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [10]:
# Best score found across the grid.  The search was built with scoring left
# unset, so this is the pipeline's default score -- presumably classification
# accuracy; confirm against the scikit-learn version in use.
estimator.best_score_

0.89290000000000003

In [11]:
# Parameter combination that achieved the best score.
# NOTE(review): in the recorded run both C (1e-4) and n_components (50) are the
# smallest values offered by the grid, so the true optimum may lie outside the
# searched range -- consider extending the grid downwards.
estimator.best_params_

{'logistic__C': 0.0001, 'logistic__penalty': 'l2', 'pca__n_components': 50}

In [12]:
# The pipeline configured with the winning parameters.  It is available here
# because the search ran with refit enabled (refit=True in the repr printed
# after fitting).
estimator.best_estimator_

Pipeline(steps=[('pca', PCA(copy=True, n_components=50, whiten=False)), ('logistic', LogisticRegression(C=0.0001, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))])