# Training on a “large” dataset with the IncrementalSearchCV

In [None]:
import numpy as np
from dask.distributed import Client
from dask_ml.datasets import make_classification

# Start a Dask distributed client using the default configuration.
client = Client()

# Synthetic classification problem: 5,000,000 samples with 20 features,
# produced as a chunked dataset (chunks=100,000) with a fixed seed.
X, y = make_classification(
    n_samples=5_000_000,
    n_features=20,
    chunks=100_000,
    random_state=0,
)


Our underlying estimator is an SGDClassifier.

In [None]:
from sklearn.linear_model import SGDClassifier

# Base estimator handed to the hyperparameter search: an SGD classifier with
# an elastic-net penalty and a fixed seed.
model = SGDClassifier(
    tol=1e-3,
    penalty="elasticnet",
    random_state=0,
)

We also define the distribution of parameters from which we will sample:

In [None]:
# Search space sampled by the hyperparameter search:
#   alpha    -- 1000 log-spaced values from 1e-2 to 1e1
#   l1_ratio -- 1000 evenly spaced values from 0 to 1
#   average  -- both boolean settings
params = dict(
    alpha=np.logspace(-2, 1, num=1000),
    l1_ratio=np.linspace(0, 1, num=1000),
    average=[True, False],
)


Finally, we create many random models in this parameter space and train-and-score them until we find the best one.

In [None]:
from dask_ml.model_selection import IncrementalSearchCV

# Build the search from the estimator (`model`) and the parameter
# distributions (`params`) defined in the earlier cells; the seed is fixed at 0.
search = IncrementalSearchCV(model, params, random_state=0)

# Fit on the Dask arrays. `classes` lists both labels up front
# (presumably required because the estimator is trained incrementally and may
# not see every label in the first chunk -- confirm against the dask-ml docs).
search.fit(X, y, classes=[0, 1])

Note that when we do post-fit tasks like search.score, the underlying estimator’s score method is used. If that method is unable to handle a larger-than-memory Dask Array, we’ll exhaust our machine’s memory. If we plan to use post-estimation features like scoring or prediction, it is recommended to use dask_ml.wrappers.ParallelPostFit.



In [None]:
from dask_ml.wrappers import ParallelPostFit

# Same value ranges as the earlier search space, but every key carries the
# `estimator__` prefix because the SGDClassifier is now nested inside a
# ParallelPostFit wrapper.
params = {
    "estimator__alpha": np.logspace(-2, 1, num=1000),
    "estimator__l1_ratio": np.linspace(0, 1, num=1000),
    "estimator__average": [True, False],
}


# Wrap the same SGDClassifier configuration in ParallelPostFit; the wrapped
# estimator is what the `estimator__*` parameter keys refer to.
model = ParallelPostFit(
    estimator=SGDClassifier(
        tol=1e-3,
        penalty="elasticnet",
        random_state=0,
    )
)


# Rebuild the search around the ParallelPostFit-wrapped model and the
# `estimator__`-prefixed parameter space; the seed is again fixed at 0.
search = IncrementalSearchCV(model, params, random_state=0)

# Fit on the Dask arrays, again declaring both class labels up front.
search.fit(X, y, classes=[0, 1])

In [None]:
# Post-fit scoring on the full Dask arrays; the bare expression's value is
# what the notebook cell displays.
search.score(X, y)