# cuML vs Scikit

In this post we will compare the performance of cuML and scikit-learn on the following models:
- Kmeans
- linear regression
- random forest

Note: These experiments were run on a single node.

In [1]:
import numpy as np

import pandas as pd
import cudf as gd

%matplotlib inline
import matplotlib.pyplot as plt

## Kmeans

A lot of code is directly copied from https://github.com/rapidsai/notebooks/blob/branch-0.10/cuml/kmeans_demo.ipynb

In [2]:
from cuml.datasets import make_blobs
from sklearn.metrics import adjusted_rand_score

from sklearn.cluster import KMeans as skKMeans
from cuml.cluster import KMeans as cumlKMeans

In [3]:
# Size of the synthetic clustering dataset: one million rows, four feature columns.
n_samples, n_features = 1_000_000, 4

##### Generate Data

In [4]:
# Generate the clustering benchmark data with cuML's make_blobs
# (5 centers, fixed seed), sized by n_samples / n_features.
blob_params = dict(
    n_samples=n_samples,
    n_features=n_features,
    centers=5,
    random_state=7,
)
cudf_data_kmeans, cudf_labels_kmeans = make_blobs(**blob_params)

# Wrap the generated matrix and label vector in cuDF containers.
cudf_data_kmeans = gd.DataFrame.from_gpu_matrix(cudf_data_kmeans)
cudf_labels_kmeans = gd.Series(cudf_labels_kmeans)

In [5]:
# pandas copies of the same features and labels, used by the scikit-learn side.
scikit_data_kmeans, scikit_labels_kmeans = (
    cudf_data_kmeans.to_pandas(),
    cudf_labels_kmeans.to_pandas(),
)

##### Model Training

In [6]:
%%time
#scikit kmeans model training

kmeans_sk = skKMeans(n_clusters=5,
                     n_jobs=-1)
kmeans_sk.fit(scikit_data_kmeans)

CPU times: user 2.44 s, sys: 378 ms, total: 2.82 s
Wall time: 1.81 s


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=5, n_init=10, n_jobs=-1, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [7]:
%%time
#cuml kmeans model training


kmeans_cuml = cumlKMeans(n_clusters=5)
kmeans_cuml.fit(cudf_data_kmeans)

CPU times: user 313 ms, sys: 77.3 ms, total: 390 ms
Wall time: 394 ms


KMeans(handle=<cuml.common.handle.Handle object at 0x7fbb35810108>, n_clusters=5, max_iter=300, tol=0.0001, verbose=0, random_state=1, precompute_distances='auto', init='scalable-k-means++', n_init=1, algorithm='auto', n_gpu=1)

##### Comparison of results

In [8]:
%%time
cuml_score = adjusted_rand_score(scikit_labels_kmeans, kmeans_cuml.labels_)
sk_score = adjusted_rand_score(scikit_labels_kmeans, kmeans_sk.labels_)

CPU times: user 4min 40s, sys: 5.23 s, total: 4min 46s
Wall time: 4min 43s


In [9]:
# Largest gap between the two adjusted Rand scores still treated as agreement.
threshold = 1e-4

# Compare the magnitude of the gap. Without abs(), a cuML score far *below*
# the scikit-learn score yields a negative difference, which is always
# smaller than the threshold and would wrongly be reported as a match.
passed = abs(cuml_score - sk_score) < threshold
print('compare kmeans: cuml vs sklearn labels_ are ' + ('equal' if passed else 'NOT equal'))

compare kmeans: cuml vs sklearn labels_ are equal


## Linear Regression

A lot of code has been copied from https://github.com/rapidsai/notebooks/blob/branch-0.10/cuml/linear_regression_demo.ipynb

In [10]:
import os
from sklearn.model_selection import train_test_split

from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error

from cuml.linear_model import LinearRegression as cuLR
from sklearn.linear_model import LinearRegression as skLR

In [11]:
# Size of the synthetic regression dataset: 2**20 rows, 399 feature columns.
n_samples, n_features = 1 << 20, 399

##### Generate Data

In [12]:
%%time
X,y = make_regression(n_samples=n_samples, n_features=n_features, random_state=0)

X = pd.DataFrame(X)
y = pd.Series(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=0)

CPU times: user 35.1 s, sys: 4.96 s, total: 40.1 s
Wall time: 38 s


In [13]:
%%time
X_cudf = gd.DataFrame.from_pandas(X_train)
X_cudf_test = gd.DataFrame.from_pandas(X_test)

y_cudf = gd.Series(y_train.values)

CPU times: user 11 s, sys: 556 ms, total: 11.6 s
Wall time: 9.75 s


##### Model Training

In [14]:
%%time
#scikit linear regression
ols_sk = skLR(fit_intercept=True,
              normalize=True,
              n_jobs=-1)

ols_sk.fit(X_train, y_train)

CPU times: user 46.2 s, sys: 4.31 s, total: 50.5 s
Wall time: 14.9 s


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=True)

In [15]:
%%time
#cuml linear regression
ols_cuml = cuLR(fit_intercept=True,
                normalize=True,
                algorithm='eig')

ols_cuml.fit(X_cudf, y_cudf)

CPU times: user 570 ms, sys: 116 ms, total: 686 ms
Wall time: 686 ms


LinearRegression(algorithm='eig', fit_intercept=True, normalize=True, handle=<cuml.common.handle.Handle object at 0x7fbb2591cbe8>)

##### Evaluation of results

In [17]:
%%time

#scikit evaluation
predict_sk = ols_sk.predict(X_test)

error_sk = mean_squared_error(y_test, predict_sk)

CPU times: user 205 ms, sys: 0 ns, total: 205 ms
Wall time: 101 ms


In [18]:
%%time

#cuml evaluation
predict_cuml = ols_cuml.predict(X_cudf_test).to_array()

error_cuml = mean_squared_error(y_test, predict_cuml)

CPU times: user 473 ms, sys: 15.5 ms, total: 488 ms
Wall time: 275 ms


In [19]:
# Report both mean squared errors, one line per library.
print(
    "SKL MSE(y): %s" % error_sk,
    "CUML MSE(y): %s" % error_cuml,
    sep="\n",
)

SKL MSE(y): 4.891118498467036e-25
CUML MSE(y): 4.275699939922919e-25


## Random Forest

Some of the code has been copied from https://github.com/rapidsai/notebooks/blob/branch-0.10/cuml/random_forest_mnmg_demo.ipynb

In [20]:
from sklearn.metrics import accuracy_score
from sklearn import model_selection, datasets


from cuml.ensemble import RandomForestClassifier as cumlRF
from sklearn.ensemble import RandomForestClassifier as sklRF

In [21]:
# Data parameters: row counts for the train and test partitions; the
# generated dataset holds exactly their sum.
train_size, test_size = 100_000, 1_000
n_samples = train_size + test_size
n_features = 20

# Random Forest building parameters
max_depth, n_bins, n_trees = 12, 16, 1_000

##### Generate Data

In [22]:
# Synthetic 5-class classification problem (seeded), with one cluster per
# class and a third of the features informative.
X, y = datasets.make_classification(
    n_samples=n_samples,
    n_features=n_features,
    n_clusters_per_class=1,
    n_informative=n_features // 3,
    random_state=123,
    n_classes=5,
)

# Labels are cast to 32-bit integers before training.
y = y.astype(np.int32)

# Seed the split as well: with the generator seeded but the split left
# random, the held-out rows (and therefore the reported accuracies) changed
# on every run. test_size is an absolute row count here, not a fraction.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=123,
)

In [24]:
# cuDF copies of the training data: the features pass through a pandas
# DataFrame on the way into cuDF, the labels are wrapped directly in a Series.
X_train_cudf, y_train_cudf = (
    gd.DataFrame.from_pandas(pd.DataFrame(X_train)),
    gd.Series(y_train),
)

##### Model Training

In [25]:
%%time

# Use all avilable CPU cores
skl_model = sklRF(max_depth=max_depth, n_estimators=n_trees, n_jobs=-1)
skl_model.fit(X_train, y_train)

CPU times: user 5min 36s, sys: 438 ms, total: 5min 36s
Wall time: 34.3 s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=12, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [26]:
%%time

cuml_model = cumlRF(max_depth=max_depth, n_estimators=n_trees, n_bins=n_bins)
cuml_model.fit(X_train_cudf, y_train_cudf)

  


CPU times: user 23.1 s, sys: 3.77 s, total: 26.9 s
Wall time: 6.88 s


RandomForestClassifier(n_estimators=1000, max_depth=12, handle=<cuml.common.handle.Handle object at 0x7fbb575960c0>, max_features=1.0, n_bins=16, n_streams=4, split_algo=1, split_criterion=0, min_rows_per_node=2, bootstrap=True, bootstrap_features=False, verbose=False, rows_sample=1.0, max_leaves=-1, quantile_per_tree=False)

##### Evaluation and comparison

In [27]:
# Predict the held-out rows with each trained forest.
skl_y_pred, cuml_y_pred = skl_model.predict(X_test), cuml_model.predict(X_test)

# Due to randomness in the algorithm, you may see slight variation in accuracies
for caption, predicted in (
    ("SKLearn accuracy:  ", skl_y_pred),
    ("CuML accuracy:     ", cuml_y_pred),
):
    print(caption, accuracy_score(y_test, predicted))

SKLearn accuracy:   0.869
CuML accuracy:      0.869
