## Dask ML and Gridsearch with cuML

In [None]:
import numpy as np
from cuml import Ridge as cumlRidge
import dask_ml
import cudf
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split, GridSearchCV
import dask_ml.model_selection as dcv
from librmm_cffi import librmm as rmm

## Start a local Dask CUDA cluster (e.g. on a DGX)

In [None]:
from dask.distributed import Client
from dask_cuda import LocalCUDACluster

# Start one worker per GPU on the local system
cluster = LocalCUDACluster()
# Connect a client to that cluster. The dask_ml searches further down
# presumably pick this client up as their scheduler -- TODO confirm.
client = Client(cluster)
# Bare expression on the last line of the cell so the notebook displays the
# client object.
client

## Use Diabetes Data

In [None]:
# Load the diabetes dataset from sklearn.datasets; later cells read its
# `.data`, `.target` and `.feature_names` attributes.
diabetes = datasets.load_diabetes()

In [None]:
# Display the feature names that come with the dataset.
diabetes.feature_names

In [None]:
# Display the first row of the data array as a sample of what a record looks like.
diabetes.data[0]

## Fit Data with Ridge Regression

In [None]:
# Hold out 20% of the samples for testing; the remaining 80% is used for
# every fit and grid search below.
X_train, X_test, y_train, y_test = train_test_split(
    diabetes.data,
    diabetes.target,
    test_size=0.2,
)

In [None]:
# Settings shared by the scikit-learn and cuML Ridge estimators built below.
alpha = np.array([1.0])  # regularization strength passed as `alpha`
solver = "eig"           # solver name intended for the cuML estimator
normalize = False
fit_intercept = True

In [None]:
# CPU baseline: scikit-learn Ridge with the 'cholesky' solver.
ridge = linear_model.Ridge(
    alpha=alpha,
    fit_intercept=fit_intercept,
    normalize=normalize,
    solver='cholesky',
)
# GPU counterpart: cuML Ridge. The solver comes from the shared `solver`
# setting (currently "eig") instead of a repeated string literal, so changing
# that one variable is enough to switch the cuML solver.
cu_ridge = cumlRidge(
    alpha=alpha,
    fit_intercept=fit_intercept,
    normalize=normalize,
    solver=solver,
)

In [None]:
# Size of the training feature array in megabytes (bytes / 1e6).
X_train.nbytes/1e6

In [None]:
%%time
# Time a single fit of the scikit-learn Ridge model on the training split.
ridge.fit(X_train, y_train)

In [None]:
%%time
# Time a single fit of the cuML Ridge model on the same training split.
# NOTE(review): X_train is the array returned by train_test_split; confirm the
# installed cuML version accepts it directly.
cu_ridge.fit(X_train, y_train)

## Hyperparameter Optimization with Dask/Dask-ML

In [None]:
# Search space: 10 alpha values spaced logarithmically from 1e-3 to 1e-1.
params = {'alpha': np.logspace(-3, -1, 10)}
# Base estimators handed to the grid searches. The `alpha` passed here is only
# the starting value; each search candidate supplies its own from `params`.
clf = linear_model.Ridge(
    alpha=alpha,
    fit_intercept=fit_intercept,
    normalize=normalize,
    solver='cholesky',
)
# Use the shared `solver` setting (currently "eig") rather than repeating the
# literal, keeping the cuML estimators in this notebook configured from one place.
cu_clf = cumlRidge(
    alpha=alpha,
    fit_intercept=fit_intercept,
    normalize=normalize,
    solver=solver,
)

In [None]:
%%timeit
# Benchmark: dask_ml grid search over `params` with the scikit-learn Ridge,
# scored by R^2. Names assigned inside a %%timeit cell are not kept, which is
# why the search is rebuilt in the "Verify Output" cell.
# NOTE(review): no `cv` is passed here while the scikit-learn cells below use
# cv=5 -- confirm both use the same number of folds before comparing timings.
grid = dcv.GridSearchCV(clf, params ,scoring='r2')
grid.fit(X_train, y_train)

In [None]:
%%timeit
# Benchmark: dask_ml grid search over `params` with the cuML Ridge, scored by
# R^2. Names assigned inside a %%timeit cell are not kept afterwards.
# NOTE(review): no `cv` is passed here while the scikit-learn cells below use
# cv=5 -- confirm both use the same number of folds before comparing timings.
cu_grid = dcv.GridSearchCV(cu_clf, params, scoring='r2')
cu_grid.fit(X_train, y_train)

## Verify Output

In [None]:
# Rebuild both dask_ml searches outside of %%timeit so the fitted objects
# remain available for inspection.
grid = dcv.GridSearchCV(estimator=clf, param_grid=params, scoring='r2')
cu_grid = dcv.GridSearchCV(estimator=cu_clf, param_grid=params, scoring='r2')

# Fit the CPU-backed search first, then the GPU-backed one.
grid.fit(X_train, y_train)
cu_grid.fit(X_train, y_train)

# Show the winning alpha from each search side by side (cuML first).
cu_grid.best_params_, grid.best_params_

## Compare with sklearn GridSearchCV

In [None]:
%%timeit
# Benchmark: plain scikit-learn grid search (5-fold CV, R^2 scoring) with the
# scikit-learn Ridge, for comparison against the dask_ml searches above.
# NOTE(review): `iid` was deprecated and later removed from scikit-learn's
# GridSearchCV -- confirm the installed version still accepts it.
sk_grid = GridSearchCV(clf, params ,scoring='r2', cv=5, iid=False)
sk_grid.fit(X_train, y_train)

In [None]:
%%timeit
# Benchmark: plain scikit-learn grid search (5-fold CV, R^2 scoring) driving
# the cuML Ridge estimator.
# NOTE(review): `iid` was deprecated and later removed from scikit-learn's
# GridSearchCV -- confirm the installed version still accepts it.
sk_cu_grid = GridSearchCV(cu_clf, params ,scoring='r2', cv=5, iid=False)
sk_cu_grid.fit(X_train, y_train)

In [None]:
# Rebuild the scikit-learn searches outside of %%timeit so the fitted objects
# remain available, then compare the alpha each one selects.
sk_grid = GridSearchCV(
    estimator=clf,
    param_grid=params,
    scoring='r2',
    cv=5,
    iid=False,
)
sk_grid.fit(X_train, y_train)

sk_cu_grid = GridSearchCV(
    estimator=cu_clf,
    param_grid=params,
    scoring='r2',
    cv=5,
    iid=False,
)
sk_cu_grid.fit(X_train, y_train)

# Winning parameters: cuML-backed search first, scikit-learn-backed second.
sk_cu_grid.best_params_, sk_grid.best_params_