## Scalable sklearn with dask

In [1]:
# Check which python executable the shell finds first on PATH; it should be the conda install
# (skip this cell if you only have one Python install, system or otherwise)
! which python

/Users/micheleenharris/miniconda3/bin/python


In [2]:
# Show the interpreter this kernel is running on; it should agree with the
# path printed by `which python` in the previous cell
import sys
sys.executable

'/Users/micheleenharris/miniconda3/bin/python'

### Create toy data

In [2]:
# No dask, just ndarrays
import numpy as np

# Features: 10 samples x 3 columns of uniform random floats in [0, 1)
X_np = np.random.rand(30).reshape(10, 3)
print(X_np.shape)
# Labels: `high` is exclusive in randint, so high=2 is needed to draw from {0, 1}
# (high=1 would make every label 0 and leave the toy data with a single class)
y_np = np.random.randint(low=0, high=2, size=10)
print(y_np.shape)

(10, 3)
(10,)


In [1]:
# dask toy data
import dask.array as da

# Features: 10 x 3 random array split into two row-chunks of 5 rows each
X = da.random.random((10, 3), chunks=(5, 3))
print(X.shape)
# Labels chunked to line up with X's row chunks. `high` is exclusive, so
# high=2 is needed to draw from {0, 1}; high=1 would make every label 0 even
# though the classifier is later told classes=[1, 0]
y = da.random.randint(low=0, high=2, size=10, chunks=(5,))
print(y.shape)

(10, 3)
(10,)


### Stochastic gradient descent
* instantiate
* fit to dask arrays (data and labels)

In [3]:
from sklearn.linear_model import SGDClassifier
# Start from an unfitted classifier with default hyperparameters
sgd = SGDClassifier()
# Fit against the dask arrays and time the call; extra keyword arguments
# (here `classes`) are forwarded to the model's partial_fit
%time sgd = da.learn.fit(sgd, X, y, classes=[1, 0])
sgd  # display the fitted estimator and its parameters

# This passes all of X and y through the classifier sequentially.  Afterwards
# we can use the classifier as normal on in-memory data.

CPU times: user 25.6 ms, sys: 3.58 ms, total: 29.1 ms
Wall time: 51.6 ms


SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)

### Prediction using SGD model

In [5]:
import numpy as np

# The fitted model can be used like any other classifier on in-memory data.
# Only a cell's final expression is echoed, so print this intermediate result
# instead of silently discarding it.
print(sgd.predict(np.random.random((4, 3))))

# Or predict on a larger dataset
# Array dimensions must be integers; 4e9 is a float literal, so convert it
# explicitly rather than relying on dask to accept a float shape.
z = da.random.random((int(4e9), 3), chunks=(1000, 3))
da.learn.predict(sgd, z)

((1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,
  1000,


In [4]:
# Print the built-in documentation for dask.array.learn, the module that
# provides the fit and predict helpers used in this notebook
help(da.learn)

Help on module dask.array.learn in dask.array:

NAME
    dask.array.learn

FUNCTIONS
    fit(model, x, y, get=<function get at 0x10c44dea0>, **kwargs)
        Fit scikit learn model against dask arrays
        
        Model must support the ``partial_fit`` interface for online or batch
        learning.
        
        This method will be called on dask arrays in sequential order.  Ideally
        your rows are independent and identically distributed.
        
        Parameters
        ----------
        model: sklearn model
            Any model supporting partial_fit interface
        x: dask Array
            Two dimensional array, likely tall and skinny
        y: dask Array
            One dimensional array with same chunks as x's rows
        kwargs:
            options to pass to partial_fit
        
        Examples
        --------
        >>> import dask.array as da
        >>> X = da.random.random((10, 3), chunks=(5, 3))
        >>> y = da.random.random(10, chunks=(5,))
 