In [1]:
import random

import numpy as np
from dask_searchcv import GridSearchCV as DaskGridSearchCV
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Seed handed to np.random.seed() at the top of the benchmark cells below.
SEED = 42


In [16]:
# Load the line_profiler IPython extension (the source of the %lprun magic).
%load_ext line_profiler

The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


In [2]:
# The four newsgroups this experiment is restricted to.
CATEGORIES = [
    'soc.religion.christian',
    'talk.politics.guns',
    'sci.crypt',
    'sci.electronics',
]

# Fetch only the training split of those groups, then expose the raw
# documents as X and their integer class labels as y.
data = fetch_20newsgroups(categories=CATEGORIES, subset='train')
X = data.data
y = data.target
print(len(X))

2331


In [3]:
def _safe_len(obj):
    """Return len(obj), or 0 when len() raises TypeError (e.g. for None)."""
    try:
        return len(obj)
    except TypeError:
        return 0


# One line per field of the fetched dataset: name, Python type, and size.
for key, value in data.items():
    len_ = _safe_len(value)
    print(key, type(value), len_)

target_names <class 'list'> 4
data <class 'list'> 2331
filenames <class 'numpy.ndarray'> 2331
target <class 'numpy.ndarray'> 2331
DESCR <class 'NoneType'> 0
description <class 'str'> 33


In [4]:
# Count how often each class label occurs among the first 10 targets.
np.bincount(data.target[:10])

array([4, 0, 2, 4])

In [30]:
# Number of documents drawn for the (deliberately small) benchmark sample.
N_DOCS = 200

# Seed *before* sampling so the same N_DOCS documents are selected on every
# Restart & Run All; previously the first seeding only happened in the
# benchmark cells, after the subsample had already been drawn.
np.random.seed(SEED)
idx = np.random.choice(len(X), N_DOCS, replace=False)

# Raw text and labels of the sampled documents.
X_train, y_train = np.array(X)[idx], np.array(y)[idx]

# Tf-idf matrix of the sampled documents. It is bound to X_vec, the name
# every grid-search cell below fits on, instead of overwriting X_train, so
# X_train keeps holding the raw text and the later cells no longer rely on a
# variable left over from an earlier kernel session.
X_vec = TfidfVectorizer().fit_transform(X_train)

In [31]:
# Two-stage estimator tuned by the grid searches below:
#   'reduce' - TruncatedSVD applied to the tf-idf features
#   'clf'    - SGDClassifier fitted on the reduced features
# The step names are what the 'reduce__...' grid keys refer to.
pipe = Pipeline(steps=[
    ('reduce', TruncatedSVD()),
    ('clf', SGDClassifier()),
])

In [29]:
np.random.seed(SEED)

reduce__n_components = [50, 100, 150]
param_grid = dict(reduce__n_components=reduce__n_components)

print(param_grid)
print()
print('scikit-learn:')
%timeit grid = GridSearchCV(pipe, param_grid).fit(X_vec, y_train)
print()
print('dask-searchcv:')
%timeit grid = DaskGridSearchCV(pipe, param_grid).fit(X_vec, y_train)

{'reduce__n_components': [50, 100, 150]}

scikit-learn:
1 loop, best of 3: 1.9 s per loop

dask-searchcv:
1 loop, best of 3: 1.74 s per loop


In [34]:
np.random.seed(SEED)

reduce__n_components = [50, 100, 150] * 2
param_grid = dict(reduce__n_components=reduce__n_components)

print(param_grid)
print()
print('scikit-learn:')
%timeit grid = GridSearchCV(pipe, param_grid).fit(X_vec, y_train)
print()
print('dask-searchcv:')
%timeit grid = DaskGridSearchCV(pipe, param_grid).fit(X_vec, y_train)

{'reduce__n_components': [50, 100, 150, 50, 100, 150]}

scikit-learn:
1 loop, best of 3: 4.33 s per loop

dask-searchcv:
1 loop, best of 3: 2.05 s per loop


In [33]:
np.random.seed(SEED)

reduce__n_components = [500, 1000, 1500]
param_grid = dict(reduce__n_components=reduce__n_components)

print(param_grid)
print()
print('scikit-learn:')
%timeit grid = GridSearchCV(pipe, param_grid).fit(X_vec, y_train)
print()
print('dask-searchcv:')
%timeit grid = DaskGridSearchCV(pipe, param_grid).fit(X_vec, y_train)

{'reduce__n_components': [500, 1000, 1500]}

scikit-learn:
1 loop, best of 3: 11.9 s per loop

dask-searchcv:
1 loop, best of 3: 9.3 s per loop


In [37]:
np.random.seed(SEED)

reduce__n_components = [50, 100, 150]
param_grid = dict(reduce__n_components=reduce__n_components)

print(param_grid)
prun = %prun -r GridSearchCV(pipe, param_grid).fit(X_vec, y_train)

{'reduce__n_components': [50, 100, 150]}
 