## XGBoostをGPU対応させる方法
公式で推奨されている前処理を行っている。RAMを節約するため、不要になった変数は`del`で削除している。    
欠損値に-999を入れているのは、作成者の好みなのか公式の推奨なのか、調べる必要がある。  
https://www.kaggle.com/xhlulu/ieee-fraud-xgboost-with-gpu-fit-in-40s

In [1]:
import os

import numpy as np
import pandas as pd
from sklearn import preprocessing
import xgboost as xgb

## データ前処理

In [2]:
%%time
train_transaction = pd.read_csv('../competitions/ieee-fraud-detection/input/train_transaction.csv', index_col='TransactionID')
test_transaction = pd.read_csv('../competitions/ieee-fraud-detection/input/test_transaction.csv', index_col='TransactionID')

train_identity = pd.read_csv('../competitions/ieee-fraud-detection/input/train_identity.csv', index_col='TransactionID')
test_identity = pd.read_csv('../competitions/ieee-fraud-detection/input/test_identity.csv', index_col='TransactionID')

sample_submission = pd.read_csv('../competitions/ieee-fraud-detection/input/sample_submission.csv', index_col='TransactionID')

train = train_transaction.merge(train_identity, how='left', left_index=True, right_index=True)
test = test_transaction.merge(test_identity, how='left', left_index=True, right_index=True)

print(train.shape)
print(test.shape)

y_train = train['isFraud'].copy()
del train_transaction, train_identity, test_transaction, test_identity

# Drop target, fill in NaNs
# Sentinel written into every missing cell. The classifier is created with
# missing=-999, so the two values have to stay in sync.
MISSING_VALUE = -999

X_train = train.drop('isFraud', axis=1)
# No .copy() here: `test` is unbound right below and fillna() returns a new
# frame, so duplicating the whole test set would only cost memory.
X_test = test

del train, test

X_train = X_train.fillna(MISSING_VALUE)
X_test = X_test.fillna(MISSING_VALUE)

# Label Encoding
# A column is treated as categorical when it has object dtype in either frame.
categorical_cols = [
    col for col in X_train.columns
    if X_train[col].dtype == 'object' or X_test[col].dtype == 'object'
]

for col in categorical_cols:
    train_values = list(X_train[col].values)
    test_values = list(X_test[col].values)

    # Fit on train + test together so every value seen in either frame gets a code.
    encoder = preprocessing.LabelEncoder()
    encoder.fit(train_values + test_values)

    X_train[col] = encoder.transform(train_values)
    X_test[col] = encoder.transform(test_values)

(590540, 433)
(506691, 432)
CPU times: user 2min 40s, sys: 10.4 s, total: 2min 51s
Wall time: 1min 18s


## tree_methodをgpu_histとする

In [3]:
# Hyper-parameters are gathered in a dict first so they can be inspected or reused.
xgb_params = dict(
    n_estimators=500,
    max_depth=9,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    missing=-999,  # same sentinel the preprocessing wrote into the NaN cells
    random_state=2019,
    tree_method='gpu_hist',  # selects the GPU histogram tree builder
)

clf = xgb.XGBClassifier(**xgb_params)

In [4]:
%time clf.fit(X_train,y_train)

CPU times: user 53.4 s, sys: 17.5 s, total: 1min 10s
Wall time: 1min 8s


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.9, gamma=0,
              learning_rate=0.05, max_delta_step=0, max_depth=9,
              min_child_weight=1, missing=-999, n_estimators=500, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=2019,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=0.9, tree_method='gpu_hist', verbosity=1)