<a href="https://colab.research.google.com/github/martin-fabbri/colab-notebooks/blob/master/rapids-cuDF-cuml-02-linear-regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# cuML - Linear Regression

In [25]:
# Print the NVIDIA driver/CUDA versions and the GPUs visible to this runtime.
!nvidia-smi

Thu Sep 12 02:38:38 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 430.40       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P8    10W /  70W |      0MiB / 15079MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [2]:
!wget -nc https://github.com/rapidsai/notebooks-extended/raw/master/utils/rapids-colab.sh
!bash rapids-colab.sh

import sys, os

sys.path.append('/usr/local/lib/python3.6/site-packages/')
os.environ['NUMBAPRO_NVVM'] = '/usr/local/cuda/nvvm/lib64/libnvvm.so'
os.environ['NUMBAPRO_LIBDEVICE'] = '/usr/local/cuda/nvvm/libdevice/'

--2019-09-12 01:08:18--  https://github.com/rapidsai/notebooks-extended/raw/master/utils/rapids-colab.sh
Resolving github.com (github.com)... 140.82.118.4
Connecting to github.com (github.com)|140.82.118.4|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://github.com/rapidsai/notebooks-contrib/raw/master/utils/rapids-colab.sh [following]
--2019-09-12 01:08:23--  https://github.com/rapidsai/notebooks-contrib/raw/master/utils/rapids-colab.sh
Reusing existing connection to github.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/rapidsai/notebooks-contrib/master/utils/rapids-colab.sh [following]
--2019-09-12 01:08:23--  https://raw.githubusercontent.com/rapidsai/notebooks-contrib/master/utils/rapids-colab.sh
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.co

### Required Imports

In [33]:
import cudf
import cuml
import gzip
import math
import sklearn
import time
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from math import cos, sin, asin, sqrt, pi, atan2
from numba import cuda
from sklearn.linear_model import LinearRegression as skLinearRegression
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error
from cuml.linear_model import LinearRegression as LinearRegressionGPU

# select a particular GPU to run the notebook
os.environ["CUDA_VISIBLE_DEVICES"]="2"

%matplotlib inline

print('NumPy Version:', np.__version__)
print('Scikit-learn Version:', sklearn.__version__)
print('cuDF Version:', cudf.__version__)
print('cuML Version:', cuml.__version__)

NumPy Version: 1.16.5
Scikit-learn Version: 0.21.3
cuDF Version: 0.10.0a+1233.gf8e8353
cuML Version: 0.10.0a+456.gb96498b


### Helper Functions

In [4]:
# Clone the tutorial repository; the dataset path used below points inside it.
!git clone https://github.com/Quansight/scipy-2019-rapids-tutorial

Cloning into 'scipy-2019-rapids-tutorial'...
remote: Enumerating objects: 245, done.[K
remote: Counting objects: 100% (245/245), done.[K
remote: Compressing objects: 100% (162/162), done.[K
remote: Total 245 (delta 144), reused 170 (delta 82), pack-reused 0[K
Receiving objects: 100% (245/245), 35.10 MiB | 22.41 MiB/s, done.
Resolving deltas: 100% (144/144), done.


In [0]:
# Path of the gzipped NumPy archive inside the cloned tutorial repository.
dataset = 'scipy-2019-rapids-tutorial/cuml/data/mortgage.npy.gz'

# Only warn when the archive is missing; the notebook keeps running.
dataset_found = os.path.exists(dataset)
if not dataset_found:
  print('Unable to find dataset.')

In [0]:
def load_data(nrows, ncols, cached = dataset, seed = None):
  """Sample rows from the cached array and split them 80:20 into train/test.

  Column 4 of the stored array is used as the regression label and is
  excluded from the features; the first `ncols` remaining columns are kept.

  Parameters
  ----------
  nrows : int
      Number of rows to draw (with replacement) from the stored array.
  ncols : int
      Number of feature columns to keep.
  cached : str
      Path of the gzip-compressed `.npy` file to read.
  seed : int, optional
      When given, rows are drawn from a dedicated
      `np.random.RandomState(seed)` so the sample is reproducible. When
      None (default) the global NumPy random state is used.

  Returns
  -------
  (df_X_train, df_X_test, df_y_train, df_y_test) : tuple of pandas.DataFrame
      Columns are named 'fea0', 'fea1', ...; the first int(nrows*0.8) sampled
      rows form the training frames and the rest the testing frames.
  """
  def to_frame(block):
    # One 'fea<i>' column per column of the 2-D array `block`.
    return pd.DataFrame({'fea%d'%i:block[:,i] for i in range(block.shape[1])})

  # split the dataset in a 80:20 split
  train_rows = int(nrows*0.8)
  with gzip.open(cached) as f:
    data = np.load(f)

  # Take the label BEFORE dropping its column from the features. Reading
  # column 4 after the drop would pick up the following column instead, which
  # is also still present among the features (target leakage).
  # NOTE(review): column 4 is assumed to be the target because it is the one
  # column excluded from the features - confirm against the dataset schema.
  y = data[:, 4:5]
  X = np.delete(data, 4, axis=1)

  # randint's upper bound is exclusive, so use the full row count; otherwise
  # the last row could never be drawn.
  rng = np.random if seed is None else np.random.RandomState(seed)
  rindices = rng.randint(0, X.shape[0], nrows)
  X = X[rindices, :ncols]
  y = y[rindices]

  df_y_train = to_frame(y[:train_rows])
  df_y_test = to_frame(y[train_rows:])

  df_X_train = to_frame(X[:train_rows])
  df_X_test = to_frame(X[train_rows:])

  return df_X_train, df_X_test, df_y_train, df_y_test
  

In [26]:
%%time

nrows = 2 **20
ncols = 399

X_train, X_test , y_train, y_test = load_data(nrows, ncols)
print('training data',X_train.shape)
print('training label',y_train.shape)
print('testing data',X_test.shape)
print('testing label',y_test.shape)
print('label',y_test.shape)

[[0.6666667  0.19690022 0.0045045  ... 0.         0.         1.        ]
 [0.6666667  0.19690022 0.00900901 ... 0.         0.         1.        ]
 [0.6666667  0.19690022 0.01351351 ... 0.         0.         1.        ]
 ...
 [0.5555556  0.34976676 0.1981982  ... 0.         0.         1.        ]
 [0.5555556  0.34976676 0.2027027  ... 0.         0.         1.        ]
 [0.5555556  0.34899753 0.2072072  ... 0.         0.         1.        ]]
[[0.       ]
 [0.       ]
 [0.       ]
 ...
 [0.7161238]
 [0.7161238]
 [0.7161238]]
training data (838860, 399)
training label (838860, 1)
testing data (209716, 399)
testing label (209716, 1)
label (209716, 1)
CPU times: user 17.2 s, sys: 35.1 ms, total: 17.3 s
Wall time: 17.3 s


In [30]:
%%time

skols = skLinearRegression(fit_intercept=True, normalize=True)
skols.fit(X_train, y_train)


CPU times: user 32.3 s, sys: 4.54 s, total: 36.8 s
Wall time: 12.3 s


In [34]:
%%time
# calculate the mean squared error of the sklearn linear regression model on the testing dataset
sk_predict = skols.predict(X_test)
error_sk = mean_squared_error(y_test,sk_predict)

CPU times: user 189 ms, sys: 17 ms, total: 205 ms
Wall time: 134 ms


In [35]:
# Copy the pandas training/testing features into cuDF DataFrames.
X_cudf, X_cudf_test = (
    cudf.DataFrame.from_pandas(frame) for frame in (X_train, X_test)
)
# Training labels as a cuDF Series built from the first column of y_train.
y_cudf = cudf.Series(y_train.values[:, 0])

ERROR:Call to cuInit results in CUDA_ERROR_NO_DEVICE


CudaSupportError: ignored