In [6]:
from sklearn.datasets import make_classification
from sklearn import linear_model, decomposition
from sklearn.pipeline import Pipeline

In [7]:
# Synthetic two-class dataset: 10,000 samples with 500 features, 250 of which
# are requested as redundant. The seed is fixed so reruns see the same data.
x, y = make_classification(
    n_samples=10000,
    n_features=500,
    n_redundant=250,
    n_classes=2,
    random_state=42,
)

In [8]:
# PCA and Logistic Regression Pipeline
# The solver is pinned to liblinear because the parameter grid searches both
# 'l1' and 'l2' penalties. liblinear is what the recorded run used by default,
# so results are unchanged, but newer scikit-learn releases default to lbfgs,
# which does not support the 'l1' penalty.
logistic = linear_model.LogisticRegression(solver='liblinear')
# n_components is left unset here; the grid search supplies it per candidate.
pca = decomposition.PCA()
# Step names ('pca', 'logistic') are the prefixes used by the grid keys.
pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)])

In [9]:
# Hyper-parameter candidates for the search. Keys follow the
# '<pipeline step>__<parameter>' convention, giving 3 * 3 * 2 = 18 combinations.
grid = {
    'pca__n_components': [50, 100, 250],
    'logistic__C': [1e-4, 1.0, 1e4],
    'logistic__penalty': ['l1', 'l2'],
}

In [10]:
# Quick look at the generated label array (binary: n_classes=2 above).
y

array([0, 1, 1, ..., 1, 1, 1])

In [11]:
# sklearn.grid_search was deprecated in scikit-learn 0.18 and removed in 0.20.
# sklearn.model_selection is its replacement, and it parallels the
# dklearn.model_selection import used for the Dask comparison.
from sklearn.model_selection import GridSearchCV



In [12]:
# Exhaustive search over `grid`, using the PCA + logistic pipeline as the base
# estimator. All other options are left at their defaults.
estimator = GridSearchCV(estimator=pipe, param_grid=grid)

In [13]:
# Fit every parameter combination with scikit-learn's search and time it;
# this is the baseline for the Dask timing.
%time estimator.fit(x, y)

CPU times: user 1min 29s, sys: 8.5 s, total: 1min 38s
Wall time: 36.1 s


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('logistic', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'pca__n_components': [50, 100, 250], 'logistic__penalty': ['l1', 'l2'], 'logistic__C': [0.0001, 1.0, 10000.0]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [14]:
# Score of the winning parameter combination.
# NOTE(review): scoring=None in the fitted repr, so this is presumably the
# pipeline's default score (accuracy for a classifier) — confirm.
estimator.best_score_

0.8932

In [15]:
# Parameter combination from `grid` that the search selected as best.
estimator.best_params_

{'logistic__C': 0.0001, 'logistic__penalty': 'l2', 'pca__n_components': 50}

In [16]:
# The pipeline configured with the best parameters (refit=True in the repr).
estimator.best_estimator_

Pipeline(steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('logistic', LogisticRegression(C=0.0001, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))])

# Dask Implementation

In [21]:
from dklearn.model_selection import DaskGridSearchCV

In [22]:
# Same pipeline and parameter grid as the scikit-learn search, but driven by
# the Dask-backed implementation so the two runs can be compared directly.
destimator = DaskGridSearchCV(estimator=pipe, param_grid=grid)

In [23]:
# Time the Dask-backed search on the same data for comparison with the
# scikit-learn run.
%time destimator.fit(x, y)

CPU times: user 36.5 s, sys: 4.86 s, total: 41.3 s
Wall time: 7.94 s


DaskGridSearchCV(cache_cv=True, cv=None, error_score='raise',
         estimator=Pipeline(steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('logistic', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
         get=None, iid=True,
         param_grid={'pca__n_components': [50, 100, 250], 'logistic__penalty': ['l1', 'l2'], 'logistic__C': [0.0001, 1.0, 10000.0]},
         refit=True, return_train_score=True, scoring=None)

In [24]:
# Best parameters found by the Dask search; compare with estimator.best_params_.
destimator.best_params_

{'logistic__C': 0.0001, 'logistic__penalty': 'l2', 'pca__n_components': 50}

In [25]:
# Pipeline configured with the Dask search's best parameters (refit=True in the repr).
destimator.best_estimator_

Pipeline(steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('logistic', LogisticRegression(C=0.0001, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))])