## Dask-ML and Grid Search with cuML

In [1]:
import numpy as np
from cuml import Ridge as cumlRidge
import dask_ml
import cudf
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
import dask_ml.model_selection as dcv

## Start a Local Dask CUDA Cluster (e.g. on a DGX)

In [2]:
from dask.distributed import Client
from dask_cuda import LocalCUDACluster

# Start one worker per GPU on the local system
cluster = LocalCUDACluster()
# Connect a Dask client to that cluster.
# NOTE(review): presumably this client becomes the default scheduler used by
# the dask_ml grid searches later in the notebook -- confirm.
client = Client(cluster)
# Bare last expression: display the client summary (scheduler address,
# dashboard link, worker count) as the cell output.
client

0,1
Client  Scheduler: tcp://127.0.0.1:33345  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 8  Cores: 8  Memory: 540.96 GB


## Load the scikit-learn Diabetes Dataset

In [3]:
# Load the diabetes dataset bundled with scikit-learn; later cells read its
# `.data`, `.target` and `.feature_names` attributes.
diabetes = datasets.load_diabetes()

In [4]:
# Display the names of the feature columns in `diabetes.data`.
diabetes.feature_names

['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']

In [5]:
# Peek at the first row of the feature matrix (one sample, one value per
# feature name listed above).
diabetes.data[0]

array([ 0.03807591,  0.05068012,  0.06169621,  0.02187235, -0.0442235 ,
       -0.03482076, -0.04340085, -0.00259226,  0.01990842, -0.01764613])

## Fit Data with Ridge Regression

In [6]:
# Split the data into training/testing sets (80% train / 20% test).
# A fixed seed makes the split -- and therefore every fit and score computed
# from it -- reproducible under Restart Kernel -> Run All.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    diabetes.data,
    diabetes.target,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

In [7]:
# Hyperparameters shared by the scikit-learn and cuML Ridge estimators.
solver = "eig"  # solver name matching the value given to the cuML estimator
alpha = np.array([1.0])  # passed as `alpha=` to both Ridge constructors
fit_intercept, normalize = True, False

In [8]:
# Build the scikit-learn and cuML Ridge estimators from the same
# hyperparameters so that their fits can be compared directly.
ridge = linear_model.Ridge(
    alpha=alpha,
    fit_intercept=fit_intercept,
    normalize=normalize,
    solver='cholesky',
)
# Read the cuML solver from the `solver` config variable ("eig") instead of
# repeating the literal, so the config cell stays the single source of truth.
cu_ridge = cumlRidge(
    alpha=alpha,
    fit_intercept=fit_intercept,
    normalize=normalize,
    solver=solver,
)

In [9]:
# data in MB
X_train.nbytes/1e6

0.02824

In [10]:
%%time
# Time a single fit of the scikit-learn Ridge model on the training split.
ridge.fit(X_train, y_train)

CPU times: user 504 ms, sys: 8 ms, total: 512 ms
Wall time: 38.1 ms


Ridge(alpha=array([1.]), copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='cholesky', tol=0.001)

In [11]:
%%time
# Time a single fit of the cuML Ridge model on the same training split.
# NOTE(review): the inputs are the host arrays produced by train_test_split, so
# this timing presumably includes host-to-device transfer and any one-time GPU
# start-up cost -- confirm before comparing against the CPU timing.
cu_ridge.fit(X_train, y_train)

CPU times: user 568 ms, sys: 272 ms, total: 840 ms
Wall time: 872 ms


<cuml.linear_model.ridge.Ridge at 0x7f1b603872b0>

## Hyperparameter Optimization with Dask/Dask-ML

In [15]:
# Search grid: 10 log-spaced alpha values from 1e-3 to 1e-1.
params = {'alpha': np.logspace(-3, -1, 10)}
# Fresh, unfitted estimators for the grid searches. The `alpha` given here is
# only a starting value; the search is driven by the `alpha` grid in `params`.
clf = linear_model.Ridge(
    alpha=alpha,
    fit_intercept=fit_intercept,
    normalize=normalize,
    solver='cholesky',
)
# Read the cuML solver from the `solver` config variable ("eig") instead of
# repeating the literal, so the config cell stays the single source of truth.
cu_clf = cumlRidge(
    alpha=alpha,
    fit_intercept=fit_intercept,
    normalize=normalize,
    solver=solver,
)

In [16]:
%%time
grid = dcv.GridSearchCV(clf, params ,scoring='r2')
grid.fit(X_train, y_train)

CPU times: user 284 ms, sys: 44 ms, total: 328 ms
Wall time: 2.39 s


In [17]:
%%time
cu_grid = dcv.GridSearchCV(cu_clf, params, scoring='r2')
cu_grid.fit(X_train, y_train)

CPU times: user 1.54 s, sys: 216 ms, total: 1.76 s
Wall time: 13.7 s


## Verify Output

In [19]:
# Display the best hyperparameters found by the cuML search (first) and the
# scikit-learn search (second) side by side; matching values indicate that the
# two implementations selected the same alpha from the grid.
cu_grid.best_params_, grid.best_params_

({'alpha': 0.03593813663804626}, {'alpha': 0.03593813663804626})