In [None]:
import threading

import numpy as np
import optuna
import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.datasets import make_classification

# Synthetic binary-classification problem: 500,000 rows x 500 features with a
# 75/25 class weighting. The generator is seeded so the data is reproducible.
X, y = make_classification(
    n_samples=500_000, n_features=500,
    weights=[0.75, 0.25], flip_y=0.75,
    random_state=123,
)
X.shape

In [None]:
# In-memory size of the feature matrix, in gigabytes (bytes / 1e9).
"{} GB".format(X.nbytes / 1e9)

In [None]:
# Hold out 25% of the rows for evaluation. The split is seeded (like the data
# generation) so that re-running the notebook reproduces the same partition
# and therefore the same baseline score.
train_x, test_x, train_y, test_y = train_test_split(
    X, y, test_size=0.25, random_state=123)

In [None]:
# Baseline: an XGBoost classifier with default hyperparameters, using every
# available CPU core (n_jobs=-1) and log-loss as the evaluation metric.
bst = xgb.XGBClassifier(n_jobs=-1, eval_metric='logloss', use_label_encoder=False)

# Early stopping watches the metric on the test split and gives up after 100
# rounds without improvement.
# NOTE(review): the same test split is used for early stopping and for the
# score below, so this baseline number is somewhat optimistic.
bst.fit(
    train_x,
    train_y,
    eval_set=[(test_x, test_y)],
    early_stopping_rounds=100,
    verbose=True,
)

score = bst.score(test_x, test_y)
score

In [None]:
class Objective:
    """Optuna objective that tunes an XGBoost classifier, optionally on GPUs.

    The data is split twice with ``split_func``: first into a working set and
    a held-out validation set, then the working set is split again into the
    rows used for fitting and the rows used for early stopping. A trial's
    value is the classifier's ``score`` on the held-out validation set (mean
    accuracy for scikit-learn style classifiers), so the study that consumes
    this objective should *maximize* it.

    Parameters
    ----------
    X, y
        Features and labels, in whatever form ``split_func`` accepts.
    split_func : callable
        Called as ``split_func(X, y, test_size=0.25)``; must return
        ``(train_x, test_x, train_y, test_y)`` (for example scikit-learn's
        ``train_test_split``).
    use_gpus : bool, default False
        When true, every trial is pinned to one GPU and trained with the
        ``gpu_hist`` tree method.
    num_gpus : int or None, default None
        Number of GPUs to spread trials over; ``None`` or ``0`` means 1.
    """

    def __init__(self, X, y, split_func, use_gpus=False, num_gpus=None):
        train_x, valid_x, train_y, valid_y = split_func(X, y, test_size=0.25)
        self.split_func = split_func

        # Held-out data: only ever used to compute the value of a trial.
        self.valid_x = valid_x
        self.valid_y = valid_y

        # Second split: fitting data and early-stopping data.
        (self.obj_train_x, self.obj_test_x,
         self.obj_train_y, self.obj_test_y) = self.split_func(
            train_x, train_y, test_size=0.25)

        self.use_gpus = use_gpus
        self.num_gpus = num_gpus or 1
        # gpu_status[i] is the number of trials currently running on GPU i.
        self.gpu_status = [0] * self.num_gpus
        # Trials may run concurrently (e.g. study.optimize(..., n_jobs > 1)),
        # so every read-modify-write of gpu_status happens under this lock.
        self._gpu_lock = threading.Lock()

    def get_next_gpu(self):
        """Reserve the least-loaded GPU and return its id.

        Ties are broken in favour of the lowest GPU id. The selection and the
        counter increment happen atomically, and the call never blocks: when
        every GPU is busy the one with the fewest running trials is shared.
        """
        with self._gpu_lock:
            next_gpu = min(range(self.num_gpus), key=self.gpu_status.__getitem__)
            self.gpu_status[next_gpu] += 1
        return next_gpu

    def _release_gpu(self, gpu_id):
        """Give back a slot previously reserved with ``get_next_gpu``."""
        with self._gpu_lock:
            self.gpu_status[gpu_id] -= 1

    def __call__(self, trial):
        """Sample hyperparameters, train one model and return its validation score.

        Parameters
        ----------
        trial
            The Optuna trial used to sample ``alpha``, ``max_depth``, ``eta``,
            ``gamma`` and ``grow_policy``.

        Returns
        -------
        The fitted classifier's ``score`` on the held-out validation data.
        """
        params = {
            "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
            "max_depth": trial.suggest_int("max_depth", 1, 24),
            "eta": trial.suggest_float("eta", 1e-8, 1.0, log=True),
            "gamma": trial.suggest_float("gamma", 1e-8, 1.0, log=True),
            "grow_policy": trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"]),
        }

        # Stays None on the CPU path so the cleanup below knows there is
        # nothing to release.
        gpu_id = None
        if self.use_gpus:
            gpu_id = self.get_next_gpu()
            params["gpu_id"] = gpu_id
            params["tree_method"] = "gpu_hist"

        try:
            bst = xgb.XGBClassifier(
                use_label_encoder=False,
                eval_metric='logloss',
                **params)

            bst.fit(self.obj_train_x,
                    self.obj_train_y,
                    early_stopping_rounds=100,
                    eval_set=[(self.obj_test_x, self.obj_test_y)],
                    verbose=False)
            return bst.score(self.valid_x, self.valid_y)
        finally:
            # Always free the slot, even when training fails, so a crashed
            # trial cannot permanently mark a GPU as busy.
            if gpu_id is not None:
                self._release_gpu(gpu_id)

In [None]:
# Tune on a single GPU. The objective returns the classifier's score on
# held-out data (accuracy, higher is better), so the study has to maximize it;
# minimizing would report the worst trial as the "best" one.
objective = Objective(X, y, train_test_split, use_gpus=True, num_gpus=1)
study = optuna.create_study(direction="maximize")

In [None]:
%%time
# Run at most 6 trials one after another (n_jobs=1); the 600-second timeout
# ends the study early if the trials take longer than that.
study.optimize(objective, n_trials=6, timeout=600, n_jobs=1)

In [None]:
# Report how many trials the study holds, then the value and the sampled
# hyperparameters of its best trial.
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial

print(f"  Value: {trial.value}")
print("  Params: ")
for param_name, param_value in trial.params.items():
    print(f"    {param_name}: {param_value}")

# More Data? Use Multiple GPUs
This only works if our data fits onto a single GPU, because each trial trains on one device!

In [None]:
# Same synthetic problem as before, but five times as many rows
# (2,500,000 x 500 features), generated with the same seed.
X, y = make_classification(
    n_samples=2_500_000, n_features=500,
    weights=[0.75, 0.25], flip_y=0.75,
    random_state=123,
)
X.shape

In [None]:
# In-memory size of the larger feature matrix, in gigabytes (bytes / 1e9).
"{} GB".format(X.nbytes / 1e9)

In [None]:
# Spread the trials over two GPUs. The objective returns the classifier's
# score on held-out data (accuracy, higher is better), so the study has to
# maximize it; minimizing would report the worst trial as the "best" one.
objective = Objective(X, y, train_test_split, use_gpus=True, num_gpus=2)
study = optuna.create_study(direction="maximize")

In [None]:
%%time
# Run at most 6 trials with up to 6 of them in flight at once (n_jobs=6); the
# objective decides which GPU each trial uses. The 600-second timeout ends the
# study early if the trials take longer than that.
study.optimize(objective, n_trials=6, timeout=600, n_jobs=6)

In [None]:
# Report how many trials the study holds, then the value and the sampled
# hyperparameters of its best trial.
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial

print(f"  Value: {trial.value}")
print("  Params: ")
for param_name, param_value in trial.params.items():
    print(f"    {param_name}: {param_value}")

# Use real data

In [None]:
import random
import time
from contextlib import contextmanager

import cudf
import cuml
import numpy as np
import optuna
import pandas as pd
import sklearn
import os

from cuml import LogisticRegression
#from cuml.model_selection import train_test_split
#from cuml.metrics import log_loss


In [None]:
import os

# Path of the training CSV: <data_dir>/<file_name>, relative to the directory
# the notebook is run from.
data_dir = "data/"
file_name = 'train.csv'
INPUT_FILE = os.path.join(data_dir, file_name)

In [None]:
N_TRIALS = 150

# Load the raw table and reduce it to a purely numeric float32 frame for
# XGBoost: object-dtype (non-numeric) columns are dropped and missing values
# are replaced with 0.
df = pd.read_csv(INPUT_FILE)
CAT_COLS = df.select_dtypes('object').columns.tolist()
df = df.drop(columns=CAT_COLS).fillna(0).astype("float32")

# Labels come from the "target" column (cast to int32); every other remaining
# column is a feature.
y = df["target"].astype('int32')
X = df.drop(columns=["target"])

In [None]:
# In-memory size of the feature values as a NumPy array, in gigabytes.
"{} GB".format(X.values.nbytes / 1e9)

In [None]:
# Tune on the real data with two GPUs, passing plain NumPy arrays. The
# objective returns the classifier's score on held-out data (accuracy, higher
# is better), so the study has to maximize it; minimizing would report the
# worst trial as the "best" one.
objective = Objective(X.values, y.values, train_test_split, use_gpus=True, num_gpus=2)
study = optuna.create_study(direction="maximize")

In [None]:
%%time
study.optimize(objective, n_trials=150, timeout=600, n_jobs=6)

In [None]:
from IPython.display import Image

In [None]:
# Hyperparameter importances for the study, rendered to a static PNG (via
# kaleido) so the figure is embedded in the notebook as an image.
f = optuna.visualization.plot_param_importances(study)
png_bytes = f.to_image(format="png", engine='kaleido')
Image(png_bytes)

In [None]:
# Objective value per trial over the course of the study, rendered to a
# static PNG (via kaleido) so the figure is embedded as an image.
f = optuna.visualization.plot_optimization_history(study)
png_bytes = f.to_image(format="png", engine='kaleido')
Image(png_bytes)

In [None]:
# Parallel-coordinate view of the five tuned hyperparameters against the
# objective value, rendered to a static PNG (via kaleido).
f = optuna.visualization.plot_parallel_coordinate(
    study,
    params=['max_depth', 'eta', 'gamma', 'grow_policy', 'alpha'],
)
png_bytes = f.to_image(format="png", engine='kaleido')
Image(png_bytes)