In [1]:
import optuna.integration.lightgbm as lgb
import optuna
import itertools
import json
from sklearn.metrics import mean_squared_error
import neptune
import pandas as pd

import warnings

import numpy as np
warnings.filterwarnings("ignore")

In [2]:
def read_data(dataset_path, mode):
    """Load one split of the dataset and separate features from the target.

    The file name is built by plain string concatenation
    (``dataset_path + mode + '.csv'``), so ``dataset_path`` has to end with
    a path separator already.

    Args:
        dataset_path: Prefix (directory) of the CSV files.
        mode: Name of the split, used as the file stem (the notebook calls
            this with ``'train'`` and ``'eval'``).

    Returns:
        A ``(features, target)`` pair: the frame without the ``'y'`` column
        and the ``'y'`` column itself.
    """
    csv_path = dataset_path + mode + '.csv'
    print('Read dataset for ' + mode + ' from file ' + csv_path)
    # 'Unnamed: 0' is presumably the index column written by DataFrame.to_csv;
    # it is not a feature, so it is removed before splitting.
    frame = pd.read_csv(csv_path).drop(columns=['Unnamed: 0'])
    features = frame.drop(columns=['y'])
    target = frame['y']
    return features, target

In [3]:
# Callback object handed to study.optimize() in the optimisation cell further
# down, so that the Optuna study is monitored in Neptune.
import neptunecontrib.monitoring.optuna as opt_utils
neptune_monitor_optuna = opt_utils.NeptuneMonitor()

In [4]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE) in percent.

    Computes ``mean(|(y_true - y_pred) / y_true|) * 100``.

    Args:
        y_true: Array-like of reference values. Used as the denominator, so
            an entry equal to zero produces ``inf``/``nan`` in the result.
        y_pred: Array-like of predictions, broadcastable against ``y_true``.

    Returns:
        The MAPE as a floating-point percentage.
    """
    actual = np.asarray(y_true)
    predicted = np.asarray(y_pred)
    relative_errors = np.abs((actual - predicted) / actual)
    return np.mean(relative_errors) * 100

In [10]:
print('Loading data...')
# Training split first, evaluation split second (each call prints the file it reads).
x_train, y_train = read_data(dataset_path='../data/', mode='train')
x_eval, y_eval = read_data(dataset_path='../data/', mode='eval')

# Select the Neptune project that all experiments of this notebook are logged to.
neptune.init('kowson/OLN')

# Parameter combinations already trained; re-populated from disk by the grid-search cell.
used_params = []

Loading data...
Read dataset for train from file ../data/train.csv
Read dataset for eval from file ../data/eval.csv


In [11]:
print('Preparing LightGBM datasets...')
# The evaluation set references the training set so both share the same dataset construction.
lgb_train = lgb.Dataset(data=x_train, label=y_train)
lgb_eval = lgb.Dataset(data=x_eval, label=y_eval, reference=lgb_train)


Preparing LightGBM datasets...


In [0]:
# Tags attached to every Neptune experiment created by the grid-search cell.
TAGS = ['lightgbm', 'data_v5', '200k_uniform', 'mape', 'relative', 'optimization', 'optuna']

def neptune_monitor():
    """Build a LightGBM training callback that forwards metrics to Neptune.

    Returns:
        A function taking the callback environment ``env`` that LightGBM
        passes after each boosting iteration. For every entry of
        ``env.evaluation_result_list`` it logs the metric value to the
        current Neptune experiment under the name ``'<dataset>_<metric>'``,
        with the iteration number as the x coordinate.
    """
    def callback(env):
        # Each entry is unpacked as (dataset name, metric name, value, <ignored>).
        for dataset_name, metric_name, metric_value, _ in env.evaluation_result_list:
            # log_metric is the non-deprecated spelling of send_metric and is
            # what the rest of this notebook already uses.
            neptune.log_metric(
                '{}_{}'.format(dataset_name, metric_name),
                x=env.iteration,
                y=metric_value,
            )
    return callback


In [0]:
# Hyper-parameter grid for the exhaustive search: every key maps to the list of
# candidate values, and the training cell takes the cartesian product of them.
# 'metric' is a list containing ONE candidate which is itself a list, so each
# experiment receives the full ['l2', 'l1'] metric list.
params_dict = dict(
    boosting_type=['gbdt', 'dart'],
    objective=['regression'],
    metric=[['l2', 'l1']],
    num_leaves=[10, 20, 25, 30, 40, 50, 100],
    num_rounds=[30, 50, 70, 100, 150],
    learning_rate=[0.01, 0.05, 0.1, 0.2, 0.5],
    verbose=[0],
)

In [0]:
print("Training...")
# Resume support: lgb_params.json stores every parameter combination that has
# already been trained. A missing or unparsable file means "nothing done yet".
try:
    with open('lgb_params.json') as data_file:
        used_params = json.load(data_file)
except (FileNotFoundError, json.JSONDecodeError):
    used_params = []

# NOTE(review): `lgb` is imported from optuna.integration.lightgbm at the top of
# the notebook, so `lgb.train` here is Optuna's wrapper rather than plain
# lightgbm.train - confirm that this is intended for a fixed-parameter grid search.
keys, values = zip(*params_dict.items())
for v in itertools.product(*values):
    experiment_params = dict(zip(keys, v))
    if experiment_params in used_params:
        continue  # skip already computed
    neptune.create_experiment(
        name='LightGBM regressor with relative doses',
        params=experiment_params,
        tags=TAGS
    )
    # Always close the Neptune experiment, even when training or evaluation
    # raises or is interrupted; otherwise it stays open and the next
    # create_experiment call stacks a new one on top of it.
    try:
        gbm = lgb.train(
            params=experiment_params,
            train_set=lgb_train,
            valid_sets=[lgb_train, lgb_eval],
            verbose_eval=False,
            early_stopping_rounds=5,
            callbacks=[neptune_monitor()],
        )
        # PREDICT AND EVAL
        y_pred = gbm.predict(x_eval, num_iteration=gbm.best_iteration)
        error = mean_squared_error(y_eval, y_pred) ** 0.5
        print("RMSE of prediction is: {}".format(error))
        neptune.log_text('rmse', str(error))
        error = mean_absolute_percentage_error(y_eval, y_pred)
        print("MAPE of prediction is: {}".format(error))
        neptune.log_text('mape', str(error))
    finally:
        neptune.stop()
    # Only combinations that finished successfully are recorded, and the file
    # is rewritten after each one so an interrupted run can be resumed.
    used_params.append(experiment_params)
    with open('lgb_params.json', 'w') as outfile:
        json.dump(used_params, outfile, sort_keys=True, indent=4)

In [12]:
def objective(trial):
    """Optuna objective: train a LightGBM regressor and return its MAPE.

    Samples ``num_leaves``, ``num_rounds`` and ``learning_rate`` from the
    trial, trains on the module-level ``lgb_train`` dataset with
    ``lgb_eval`` as validation set, predicts on ``x_eval`` and logs the
    resulting MAPE to the current Neptune experiment as ``'mape_error'``.

    Args:
        trial: The Optuna trial used to sample hyper-parameters.

    Returns:
        The mean absolute percentage error on the evaluation split; the
        study in this notebook is created with ``direction='minimize'``.
    """
    param = {
        'objective': 'regression',
        'metric': 'l2',
        # A plain string: the list form is only meaningful in the grid-search
        # dictionary, where every value is a list of candidates.
        'boosting_type': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'num_rounds': trial.suggest_int('num_rounds', 20, 300),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 0.6),
    }
    # NOTE(review): `lgb` is optuna.integration.lightgbm (see the imports), and
    # the recorded output of the optimisation cell shows per-trial
    # 'feature_fraction' tuning progress. That suggests each call runs Optuna's
    # own LightGBM tuner instead of a single training with `param` - confirm,
    # and use plain lightgbm.train here if the nesting is unintended.
    gbm = lgb.train(
        params=param,
        train_set=lgb_train,
        valid_sets=[lgb_train, lgb_eval],
        verbose_eval=False,
        early_stopping_rounds=5,
    )
    # PREDICT AND EVAL
    y_pred = gbm.predict(x_eval, num_iteration=gbm.best_iteration)
    error = mean_absolute_percentage_error(y_eval, y_pred)
    print("MAPE of prediction is: {}".format(error))
    neptune.log_metric('mape_error', error)
    return error

In [13]:
print("Optimizing...")
# One Neptune experiment collects all trials of the study; it is closed by the
# separate neptune.stop() cell at the end of the notebook.
neptune.create_experiment(
    tags=[
        'optimization',
        'optuna',
        'lightgbm',
        'data_v5',
        'relative',
        'uniform_200k',
        'l2',
    ],
    name='Optuna optimization with relative doses',
)
# The objective returns MAPE, so lower is better.
study = optuna.create_study(direction='minimize')
study.optimize(
    objective,
    callbacks=[neptune_monitor_optuna],
    n_trials=100,
)

Optimizing...
https://ui.neptune.ai/kowson/OLN/e/OLN-3150


NVMLError: NVML Shared Library Not Found - GPU usage metrics may not be reported.
feature_fraction, val_score: 1309.494738:  14%|#4        | 1/7 [00:08<00:49,  8.33s/it][I 2020-05-21 14:17:51,032] Finished trial#0 with value: 1309.494738473686 with parameters: {'feature_fraction': 0.8}. Best is trial#0 with value: 1309.494738473686.
feature_fraction, val_score: 1281.024394:  29%|##8       | 2/7 [00:19<00:45,  9.10s/it][I 2020-05-21 14:18:01,907] Finished trial#1 with value: 1281.0243939759787 with parameters: {'feature_fraction': 1.0}. Best is trial#1 with value: 1281.0243939759787.
feature_fraction, val_score: 1281.024394:  29%|##8       | 2/7 [00:19<00:45,  9.10s/it]

KeyboardInterrupt: 

In [0]:
# Close the currently open Neptune experiment - presumably the one created by
# the Optuna optimisation cell, which does not stop it itself.
neptune.stop()