# cuML vs Scikit

In this post we will compare performance of cuML and scikit-learn on the following models:
- K-means Clustering
- Linear Regression
- Random Forest Classification

Note: These experiments were done on a single Node

In [1]:
import numpy as np

import pandas as pd
import cudf as gd

%matplotlib inline
import matplotlib.pyplot as plt

## Kmeans

A lot of code is directly copied from https://github.com/rapidsai/notebooks/blob/branch-0.10/cuml/kmeans_demo.ipynb

In [2]:
from cuml.datasets import make_blobs
from sklearn.metrics import adjusted_rand_score

from sklearn.cluster import KMeans as skKMeans
from cuml.cluster import KMeans as cumlKMeans

In [3]:
n_samples = 1000000
n_features = 40

##### Generate Data

In [4]:
cudf_data_kmeans, cudf_labels_kmeans = make_blobs(
   n_samples=n_samples, n_features=n_features, centers=5, random_state=7)

cudf_data_kmeans = gd.DataFrame.from_gpu_matrix(cudf_data_kmeans)
cudf_labels_kmeans = gd.Series(cudf_labels_kmeans)

scikit_data_kmeans = cudf_data_kmeans.to_pandas()
scikit_labels_kmeans = cudf_labels_kmeans.to_pandas()

##### Model Training

In [5]:
%%time
#scikit kmeans model training

kmeans_sk = skKMeans(n_clusters=5,
                     n_jobs=-1)
kmeans_sk.fit(scikit_data_kmeans)

CPU times: user 1.35 s, sys: 1.02 s, total: 2.37 s
Wall time: 6.61 s


In [6]:
%%time
#cuml kmeans model training


kmeans_cuml = cumlKMeans(n_clusters=5)
kmeans_cuml.fit(cudf_data_kmeans)

CPU times: user 314 ms, sys: 157 ms, total: 471 ms
Wall time: 472 ms


##### Comparison of results

In [8]:
%%time
cuml_score = adjusted_rand_score(scikit_labels_kmeans, kmeans_cuml.labels_)
sk_score = adjusted_rand_score(scikit_labels_kmeans, kmeans_sk.labels_)

CPU times: user 4min 40s, sys: 5.72 s, total: 4min 46s
Wall time: 4min 46s


In [9]:
threshold = 1e-4

passed = (cuml_score - sk_score) < threshold
print('compare kmeans: cuml vs sklearn labels_ are ' + ('equal' if passed else 'NOT equal'))

compare kmeans: cuml vs sklearn labels_ are equal


## PCA

In [7]:
from cuml import PCA as cumlPCA
from sklearn.decomposition import PCA as skPCA
import time 

In [8]:
n_samples = 1000000
n_features = 40

In [9]:
cudf_data_kmeans, cudf_labels_kmeans = make_blobs(
   n_samples=n_samples, n_features=n_features, centers=5, random_state=7)

cudf_data_kmeans = gd.DataFrame.from_gpu_matrix(cudf_data_kmeans)
cudf_labels_kmeans = gd.Series(cudf_labels_kmeans)

In [10]:
scikit_data_kmeans = cudf_data_kmeans.to_pandas()
scikit_labels_kmeans = cudf_labels_kmeans.to_pandas()

In [11]:
from sklearn.preprocessing import StandardScaler

scikit_data_kmeans = StandardScaler().fit_transform(scikit_data_kmeans)

In [12]:
data_ = StandardScaler().fit_transform(cudf_data_kmeans.to_pandas())
cudf_data_kmeans = gd.from_pandas(pd.DataFrame(data_))

In [15]:
%%time
sk_pca = skPCA(n_components=3)
sk_pca.fit(scikit_data_kmeans)


CPU times: user 30.8 s, sys: 3.1 s, total: 33.9 s
Wall time: 4.49 s


In [16]:
%%time
cuml_pca = cumlPCA(n_components=3)
cuml_pca.fit(scikit_data_kmeans)

CPU times: user 208 ms, sys: 105 ms, total: 314 ms
Wall time: 311 ms


## Linear Regression

A lot of code has been copied from https://github.com/rapidsai/notebooks/blob/branch-0.10/cuml/linear_regression_demo.ipynb

In [17]:
import os
from sklearn.model_selection import train_test_split

from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error

from cuml.linear_model import LinearRegression as cuLR
from sklearn.linear_model import LinearRegression as skLR

In [18]:
n_samples = 1000000
n_features = 40

##### Generate Data

In [19]:
%%time
X,y = make_regression(n_samples=n_samples, n_features=n_features, random_state=0)

X = pd.DataFrame(X)
y = pd.Series(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=0)

CPU times: user 4.3 s, sys: 634 ms, total: 4.93 s
Wall time: 3.31 s


In [20]:
%%time
X_cudf = gd.DataFrame.from_pandas(X_train)
X_cudf_test = gd.DataFrame.from_pandas(X_test)

y_cudf = gd.Series(y_train.values)

CPU times: user 764 ms, sys: 43.1 ms, total: 807 ms
Wall time: 804 ms


##### Model Training

In [21]:
%%time
#scikit linear regression
ols_sk = skLR(fit_intercept=True,
              normalize=True,
              n_jobs=-1)

ols_sk.fit(X_train, y_train)

CPU times: user 3.03 s, sys: 466 ms, total: 3.49 s
Wall time: 1.68 s


In [22]:
%%time
#cuml linear regression
ols_cuml = cuLR(fit_intercept=True,
                normalize=True,
                algorithm='eig')

ols_cuml.fit(X_cudf, y_cudf)

CPU times: user 32.6 ms, sys: 51.7 ms, total: 84.3 ms
Wall time: 79.7 ms


##### Evaluation of results

In [23]:
%%time

#scikit evaluation
predict_sk = ols_sk.predict(X_test)

error_sk = mean_squared_error(y_test, predict_sk)

CPU times: user 42 ms, sys: 0 ns, total: 42 ms
Wall time: 13.2 ms


In [24]:
%%time

#cuml evaluation
predict_cuml = ols_cuml.predict(X_cudf_test).to_array()

error_cuml = mean_squared_error(y_test, predict_cuml)

CPU times: user 35.2 ms, sys: 8.55 ms, total: 43.7 ms
Wall time: 41 ms


In [25]:
print("SKL MSE(y): %s" % error_sk)
print("CUML MSE(y): %s" % error_cuml)

SKL MSE(y): 1.516684016128886e-25
CUML MSE(y): 1.4860955136347414e-25


## Random Forest

Some of the code has been copied from https://github.com/rapidsai/notebooks/blob/branch-0.10/cuml/random_forest_mnmg_demo.ipynb

In [26]:
from sklearn.metrics import accuracy_score
from sklearn import model_selection, datasets


from cuml.ensemble import RandomForestClassifier as cumlRF
from sklearn.ensemble import RandomForestClassifier as sklRF

In [27]:
# Data parameters
train_size = 1000000
test_size = 10000
n_samples = train_size + test_size
n_features = 10

# Random Forest building parameters
max_depth = 12
n_bins = 16
n_trees = 1000

##### Generate Data

In [28]:
X, y = datasets.make_classification(n_samples=n_samples, n_features=n_features,
                                 n_clusters_per_class=1, n_informative=int(n_features / 3),
                                 random_state=123, n_classes=5)
y = y.astype(np.int32)
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=test_size)

In [29]:
X_train_cudf = gd.DataFrame.from_pandas(pd.DataFrame(X_train))
y_train_cudf = gd.Series(y_train)

##### Model Training

In [31]:
%%time

# Use all avilable CPU cores
skl_model = sklRF(max_depth=max_depth, n_estimators=n_trees, n_jobs=-1)
skl_model.fit(X_train, y_train)

CPU times: user 1h 36min 19s, sys: 3.42 s, total: 1h 36min 22s
Wall time: 9min 41s


In [30]:
%%time

cuml_model = cumlRF(max_depth=max_depth, n_estimators=n_trees, n_bins=n_bins)
cuml_model.fit(X_train_cudf, y_train_cudf)

CPU times: user 18 s, sys: 1min 27s, total: 1min 45s
Wall time: 26.9 s


##### Evaluation and comparison

In [29]:
skl_y_pred = skl_model.predict(X_test)
cuml_y_pred = cuml_model.predict(X_test)

# Due to randomness in the algorithm, you may see slight variation in accuracies
print("SKLearn accuracy:  ", accuracy_score(y_test, skl_y_pred))
print("CuML accuracy:     ", accuracy_score(y_test, cuml_y_pred))

SKLearn accuracy:   0.862
CuML accuracy:      0.873
