#!conda create -n rapids-0.16 -c rapidsai -c nvidia -c conda-forge -c defaults rapids=0.16 python=3.7 cudatoolkit=11.0

In [None]:
%%bash
# Environment check: show the GPU/driver status, then the CUDA compiler version.
nvidia-smi
nvcc --version

In [None]:
# Import every dependency used by the benchmarks and print its version so
# that a run can be tied to the exact library stack it was measured on.
# Each printed label and version attribute refers to the module imported on
# the same line.
import numpy as np; print('numpy Version:', np.__version__)
import pandas as pd; print('pandas Version:', pd.__version__)
import xgboost as xgb; print('XGBoost Version:', xgb.__version__)
import cudf; print('cudf Version:', cudf.__version__)
import cuml; print('cuml Version:', cuml.__version__)
import gcsfs; print('gcsfs Version:', gcsfs.__version__)
import time
import dask_cudf; print('dask_cudf Version:', dask_cudf.__version__)
import dask; print('dask Version:', dask.__version__)
import dask.dataframe as dask_df

Download the HIGGS dataset and unzip it:
https://archive.ics.uci.edu/ml/datasets/HIGGS

In [None]:
# %%bash
# wget https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz -P /home/jupyter/
# gzip -d /home/jupyter/HIGGS.csv.gz
# ls -lh /home/jupyter/

In [None]:
# Column names for the headerless HIGGS CSV: the target column first,
# followed by 28 zero-padded feature columns (feature-01 .. feature-28).
colnames = ['label']
colnames.extend('feature-%02d' % idx for idx in range(1, 29))

# Dataset location used by every benchmark cell below. To read a local copy
# instead of the Cloud Storage object, point this at '/home/jupyter/HIGGS.csv'.
filname = 'gs://mchrestkha-github-ml-examples/higgs/HIGGS.csv'

## Pandas

In [None]:
# --- pandas ingestion + XGBoost 'hist' training ------------------------------
# Three stages are timed separately: CSV ingestion, DMatrix construction and
# training. The global names (df, X, y, dtrain, param, bst) are reused by the
# later sections, which overwrite them.

# Stage 1: read the headerless CSV into a pandas DataFrame.
start_time = time.time()
df = pd.read_csv(filname, names=colnames, header=None)
elapsed = time.time() - start_time
print("[INFO]: ------ Data Ingestion is completed in {} seconds ---".format(elapsed))

# Stage 2: split features from the target and wrap them in a DMatrix.
start_time = time.time()
feature_cols = df.columns.difference(['label'])  # every column except the target
X = df[feature_cols]
y = df['label']
dtrain = xgb.DMatrix(X, label=y)
elapsed = time.time() - start_time
print("[INFO]: ------ DMatrix is completed in {} seconds ---".format(elapsed))

# Stage 3: 100 boosting rounds of depth-8 trees.
start_time = time.time()
param = {
    'max_depth': 8,
    'objective': 'reg:squarederror',
    'tree_method': 'hist',
}
bst = xgb.train(param, dtrain, num_boost_round=100)
elapsed = time.time() - start_time
print("[INFO]: ------ Training is completed in {} seconds ---".format(elapsed))

## cuDF

In [None]:
# --- cuDF ingestion + XGBoost 'gpu_hist' training ----------------------------
# Same three timed stages as the pandas section, with the CSV parsed by cuDF
# and the trees built with the 'gpu_hist' tree method.

# Stage 1: read the headerless CSV into a cuDF DataFrame.
start_time = time.time()
df = cudf.read_csv(filname, names=colnames, header=None)
elapsed = time.time() - start_time
print("[INFO]: ------ Data Ingestion is completed in {} seconds ---".format(elapsed))

# Stage 2: split features from the target and wrap them in a DMatrix.
start_time = time.time()
feature_cols = df.columns.difference(['label'])  # every column except the target
X = df[feature_cols]
y = df['label']
dtrain = xgb.DMatrix(X, label=y)
elapsed = time.time() - start_time
print("[INFO]: ------ DMatrix is completed in {} seconds ---".format(elapsed))

# Stage 3: 100 boosting rounds of depth-8 trees.
start_time = time.time()
param = {
    'max_depth': 8,
    'objective': 'reg:squarederror',
    'tree_method': 'gpu_hist',
}
bst = xgb.train(param, dtrain, num_boost_round=100)
elapsed = time.time() - start_time
print("[INFO]: ------ Training is completed in {} seconds ---".format(elapsed))

## Dask

In [None]:
from dask.distributed import Client
from dask.distributed import LocalCluster

# Start a local Dask cluster with the default worker/thread layout and
# connect a client to it. To pin the layout explicitly, pass the counts
# instead, e.g. LocalCluster(n_workers=2, threads_per_worker=12).
cluster = LocalCluster()
client = Client(cluster)

# Trailing expression: shows the client summary as the cell output.
client

In [None]:
# --- Dask DataFrame ingestion + distributed XGBoost 'hist' training ----------
# Three stages are timed separately (ingestion, DMatrix construction,
# training), matching the pandas and cuDF sections.
from dask.distributed import wait

# Stage 1: read the headerless CSV into a Dask DataFrame and load it onto the
# workers.
start_time = time.time()
df = dask_df.read_csv(filname, header=None, names=colnames)
df = df.persist()
# With a distributed client, persist() returns as soon as the work has been
# submitted. Block until the partitions are actually in memory, otherwise the
# ingestion time printed below only measures task submission.
wait(df)
print("[INFO]: ------ Data Ingestion is completed in {} seconds ---".format((time.time() - start_time)))

# Stage 2: split features from the target and build the distributed DMatrix.
# The timer is restarted so this figure no longer includes the CSV read.
start_time = time.time()
X = df[df.columns.difference(['label'])]
y = df['label']
dtrain = xgb.dask.DaskDMatrix(client, X, y)

# Drop the notebook's own references to the collections; training only needs
# dtrain from here on.
del df
del X
del y

print("[INFO]: ------ DMatrix is completed in {} seconds ---".format((time.time() - start_time)))

# Stage 3: 100 boosting rounds of depth-8 trees across the cluster.
start_time = time.time()
param = {
    'max_depth': 8,
    'objective': 'reg:squarederror',
    'tree_method': 'hist'
}
# Note: xgb.dask.train returns a dict ({'booster': ..., 'history': ...}),
# not a bare Booster.
bst = xgb.dask.train(client, param, dtrain, num_boost_round=100)
print("[INFO]: ------ Training is completed in {} seconds ---".format((time.time() - start_time)))

## Dask_cuDF

In [None]:
from dask_cuda import LocalCUDACluster
from dask.distributed import Client

# Shut down the client/cluster left over from an earlier section before
# starting a new one; otherwise the old workers keep running (and keep
# holding their memory) next to the CUDA workers.
try:
    client.close()
    cluster.close()
except NameError:
    # Nothing was started in this kernel yet.
    pass

# Create a local CUDA cluster with an explicit worker layout.
# NOTE(review): the worker count is hard-coded to 2, which presumably matches
# a two-GPU machine -- confirm; LocalCUDACluster() with no arguments is the
# alternative that does not fix the count.
num_workders=2
threads_per_worker=12
cluster = LocalCUDACluster(n_workers=num_workders, threads_per_worker=threads_per_worker)

client = Client(cluster)
# Trailing expression: shows the client summary as the cell output.
client

In [None]:
# --- dask_cudf ingestion + distributed XGBoost 'gpu_hist' training -----------
# Three stages are timed separately (ingestion, DMatrix construction,
# training), matching the pandas and cuDF sections.
from dask.distributed import wait

# Stage 1: read the headerless CSV into a dask_cudf DataFrame and load it onto
# the workers.
start_time = time.time()
df = dask_cudf.read_csv(filname, header=None, names=colnames)
df = df.persist()
# With a distributed client, persist() returns as soon as the work has been
# submitted. Block until the partitions are actually in memory, otherwise the
# ingestion time printed below only measures task submission.
wait(df)
print("[INFO]: ------ Data Ingestion is completed in {} seconds ---".format((time.time() - start_time)))

# Stage 2: split features from the target and build the distributed DMatrix.
# The timer is restarted so this figure no longer includes the CSV read.
# DaskDeviceQuantileDMatrix is used here; xgb.dask.DaskDMatrix(client, X, y)
# is the alternative constructor.
start_time = time.time()
X = df[df.columns.difference(['label'])]
y = df['label']
dtrain = xgb.dask.DaskDeviceQuantileDMatrix(client, X, y)

# Drop the notebook's own references to the collections; training only needs
# dtrain from here on.
del df
del X
del y

print("[INFO]: ------ DMatrix is completed in {} seconds ---".format((time.time() - start_time)))

# Stage 3: 100 boosting rounds of depth-8 trees across the cluster.
start_time = time.time()
param = {
    'max_depth': 8,
    'objective': 'reg:squarederror',
    'tree_method': 'gpu_hist'
}
# Note: xgb.dask.train returns a dict ({'booster': ..., 'history': ...}),
# not a bare Booster.
bst = xgb.dask.train(client, param, dtrain, num_boost_round=100)
print("[INFO]: ------ Training is completed in {} seconds ---".format((time.time() - start_time)))

In [None]:
# --- dask_cudf run on a LocalCUDACluster with default settings ---------------
# Repeats the dask_cudf benchmark, but lets LocalCUDACluster() choose the
# worker layout instead of passing n_workers / threads_per_worker explicitly.
from dask_cuda import LocalCUDACluster
from dask.distributed import Client, wait

# Shut down the client/cluster left over from an earlier section first.
# Otherwise two sets of workers share the same GPUs, and the old workers still
# hold the previously persisted dataset while it is loaded a second time.
try:
    client.close()
    cluster.close()
except NameError:
    # Nothing was started in this kernel yet.
    pass

cluster = LocalCUDACluster()
client = Client(cluster)

# Stage 1: read the headerless CSV into a dask_cudf DataFrame and load it onto
# the workers.
start_time = time.time()
df = dask_cudf.read_csv(filname, header=None, names=colnames)
df = df.persist()
# With a distributed client, persist() returns as soon as the work has been
# submitted. Block until the partitions are actually in memory, otherwise the
# ingestion time printed below only measures task submission.
wait(df)
print("[INFO]: ------ Data Ingestion is completed in {} seconds ---".format((time.time() - start_time)))

# Stage 2: split features from the target and build the distributed DMatrix.
# The timer is restarted so this figure no longer includes the CSV read.
# DaskDeviceQuantileDMatrix is used here; xgb.dask.DaskDMatrix(client, X, y)
# is the alternative constructor.
start_time = time.time()
X = df[df.columns.difference(['label'])]
y = df['label']
dtrain = xgb.dask.DaskDeviceQuantileDMatrix(client, X, y)

# Drop the notebook's own references to the collections; training only needs
# dtrain from here on.
del df
del X
del y

print("[INFO]: ------ DMatrix is completed in {} seconds ---".format((time.time() - start_time)))

# Stage 3: 100 boosting rounds of depth-8 trees across the cluster.
start_time = time.time()
param = {
    'max_depth': 8,
    'objective': 'reg:squarederror',
    'tree_method': 'gpu_hist'
}
# Note: xgb.dask.train returns a dict ({'booster': ..., 'history': ...}),
# not a bare Booster.
bst = xgb.dask.train(client, param, dtrain, num_boost_round=100)
print("[INFO]: ------ Training is completed in {} seconds ---".format((time.time() - start_time)))