
# XGBoost on GPU
---



- This is tested on XGboost 0.90 version

In [10]:
!nvidia-smi

Sat Mar 21 09:24:48 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.64.00    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P8     9W /  70W |      0MiB / 15079MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [0]:
import xgboost as xgb

In [5]:
print(xgb.__version__)

0.90


In [0]:
import sys,tempfile, urllib, os
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

In [0]:
from sklearn.datasets import fetch_openml
covtyp = fetch_openml(name='covertype', version=4)


In [8]:
covtyp.data.shape

(581012, 54)

In [9]:
np.unique(covtyp.target)

array(['1', '2', '3', '4', '5', '6', '7'], dtype=object)

In [0]:
cov_df = pd.DataFrame(data= np.c_[covtyp['data'], covtyp['target']],
                     columns= covtyp['feature_names'] + ['target'])

In [12]:
cov_df.memory_usage(index=True).sum()

255645408

# Loading Data

In [13]:
cov_df.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area1,Wilderness_Area2,Wilderness_Area3,Wilderness_Area4,Soil_Type1,Soil_Type2,Soil_Type3,Soil_Type4,Soil_Type5,Soil_Type6,Soil_Type7,Soil_Type8,Soil_Type9,Soil_Type10,Soil_Type11,Soil_Type12,Soil_Type13,Soil_Type14,Soil_Type15,Soil_Type16,Soil_Type17,Soil_Type18,Soil_Type19,Soil_Type20,Soil_Type21,Soil_Type22,Soil_Type23,Soil_Type24,Soil_Type25,Soil_Type26,Soil_Type27,Soil_Type28,Soil_Type29,Soil_Type30,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,target
0,2596,51,3,258,0,510,221,232,148,6279,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5
1,2590,56,2,212,-6,390,220,235,151,6225,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5
2,2804,139,9,268,65,3180,234,238,135,6121,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2
3,2785,155,18,242,118,3090,238,238,122,6211,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2
4,2595,45,2,153,-1,391,220,234,150,6172,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5


In [14]:
print ("Rows     : " ,cov_df.shape[0])
print ("Columns  : " ,cov_df.shape[1])

Rows     :  581012
Columns  :  55


In [15]:
cov_df.target.value_counts()

2    283301
1    211840
3     35754
7     20510
6     17367
5      9493
4      2747
Name: target, dtype: int64

In [0]:
# To use in XGboost we need the data type to be float32
# We have it as objects currently

In [0]:
cov_df.dtypes

In [0]:
for cols in cov_df.columns:
    cov_df[cols] = cov_df[cols].astype(np.float32)

In [22]:
cov_df.dtypes

Elevation                             float32
Aspect                                float32
Slope                                 float32
Horizontal_Distance_To_Hydrology      float32
Vertical_Distance_To_Hydrology        float32
Horizontal_Distance_To_Roadways       float32
Hillshade_9am                         float32
Hillshade_Noon                        float32
Hillshade_3pm                         float32
Horizontal_Distance_To_Fire_Points    float32
Wilderness_Area1                      float32
Wilderness_Area2                      float32
Wilderness_Area3                      float32
Wilderness_Area4                      float32
Soil_Type1                            float32
Soil_Type2                            float32
Soil_Type3                            float32
Soil_Type4                            float32
Soil_Type5                            float32
Soil_Type6                            float32
Soil_Type7                            float32
Soil_Type8                        

In [0]:
cov_df['target'] = cov_df['target'].astype(np.int32)

In [0]:
# We need the values to start from 0 not from 1 for categorical variable
cov_df['target'] = cov_df['target'] - 1

In [0]:
cov_df_x = cov_df.drop(['target'], axis=1, inplace=False)

In [0]:
cov_df_y = pd.DataFrame(cov_df['target'])

In [0]:
X_train, X_test, y_train, y_test = train_test_split(cov_df_x, cov_df_y, test_size=0.2, random_state=31, stratify=cov_df_y)

# XGBoost Benchmarking

In [0]:
# For using XGBoost we need to use the DMatrix data type

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [0]:
# Still we have not loaded the data into GPU
!nvidia-smi

In [0]:
import time

In [0]:
# We are using the XGboost API not the sklearn API
num_rounds = 50
max_depth = 8

params = {
  'colsample_bylevel': 1,
  'colsample_bytree': 1,
  'gamma': 0,
  'learning_rate': 0.1, 
  'random_state': 1010,
  'objective': 'multi:softmax', 
  'num_class': 7,
}

In [0]:
params['tree_method'] = 'hist'       
params['grow_policy'] = 'depthwise'
params['max_depth'] = max_depth
params['max_leaves'] = 0
params['verbosity'] = 0

In [37]:
cpu_result = {}
start_time = time.time()
xgb.train(params, dtrain, evals={(dtest, 'test')}, evals_result=cpu_result, verbose_eval=20)
end_time = time.time()
print("Training Time on CPU = %s" %(end_time - start_time))

[0]	test-merror:0.217275
[9]	test-merror:0.197671
Training Time on CPU = 17.250194311141968


In [0]:
# Doing the same on GPU
# We need to change the parameters to GPU Hist methods
params = {}
max_depth = 8

params['tree_method'] = 'gpu_hist'
params['grow_policy'] = 'depthwise'
params['max_depth'] = max_depth
params['max_leaves'] = 0
params['verbosity'] = 0
params['gpu_id'] = 0
params['updater'] = 'grow_gpu_hist'
params['predictor'] = 'gpu_predictor'

gpu_result = {}
params['updater'] = 'grow_gpu_hist'
params['predictor'] = 'gpu_predictor'


In [40]:
start_time = time.time()
xgb.train(params, dtrain, num_boost_round=50, evals={(dtest, 'test')}, evals_result=gpu_result, verbose_eval=20)
end_time = time.time()
print("Training Time on GPU : %s" %(end_time - start_time))

[0]	test-rmse:1.25674
[20]	test-rmse:0.767745
[40]	test-rmse:0.690947
[49]	test-rmse:0.67123
Training Time on GPU : 1.6855368614196777


In [41]:
!nvidia-smi

Sat Mar 21 09:46:43 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.64.00    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   44C    P0    25W /  70W |    791MiB / 15079MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
+-------