From 83565f01eaf100e21d7c5e9595ca12e9ddf5d19d Mon Sep 17 00:00:00 2001
From: Guolin Ke
Date: Tue, 11 Sep 2018 14:08:43 +0800
Subject: [PATCH] fix RF's bug with MAPE & support RF with multi-class (#1637)

* fix RF's bug with MAPE

* simplify rf's code & support multi-class rf

* fix bug & add test

* add more tests

* Update test_engine.py

* Update test_engine.py
---
 src/boosting/rf.hpp                       | 118 +++++++++++------------
 tests/python_package_test/test_engine.py |  66 ++++++++++++-
 2 files changed, 123 insertions(+), 61 deletions(-)

diff --git a/src/boosting/rf.hpp b/src/boosting/rf.hpp
index 1092a4d97f8..3e3641a1f0e 100644
--- a/src/boosting/rf.hpp
+++ b/src/boosting/rf.hpp
@@ -38,16 +38,16 @@ class RF: public GBDT {
       CHECK(train_data->metadata().init_score() == nullptr);
     }
     // cannot use RF for multi-class.
-    CHECK(num_tree_per_iteration_ == 1);
+    CHECK(num_tree_per_iteration_ == num_class_);
     // not shrinkage rate for the RF
     shrinkage_rate_ = 1.0f;
     // only boosting one time
-    Boosting();
+    GetRFTargets(train_data);
     if (is_use_subset_ && bag_data_cnt_ < num_data_) {
-      size_t total_size = static_cast<size_t>(num_data_) * num_tree_per_iteration_;
-      tmp_grad_.resize(total_size);
-      tmp_hess_.resize(total_size);
+      tmp_grad_.resize(num_data_);
+      tmp_hess_.resize(num_data_);
     }
+    tmp_score_.resize(num_data_, 0.0f);
   }
 
   void ResetConfig(const Config* config) override {
@@ -67,72 +67,80 @@ class RF: public GBDT {
       }
     }
     // cannot use RF for multi-class.
-    CHECK(num_tree_per_iteration_ == 1);
+    CHECK(num_tree_per_iteration_ == num_class_);
     // only boosting one time
-    Boosting();
+    GetRFTargets(train_data);
     if (is_use_subset_ && bag_data_cnt_ < num_data_) {
-      size_t total_size = static_cast<size_t>(num_data_) * num_tree_per_iteration_;
-      tmp_grad_.resize(total_size);
-      tmp_hess_.resize(total_size);
+      tmp_grad_.resize(num_data_);
+      tmp_hess_.resize(num_data_);
     }
+    tmp_score_.resize(num_data_, 0.0f);
   }
 
-  void Boosting() override {
-    if (objective_function_ == nullptr) {
-      Log::Fatal("No object function provided");
+  void GetRFTargets(const Dataset* train_data) {
+    auto label_ptr = train_data->metadata().label();
+    std::fill(hessians_.begin(), hessians_.end(), 1);
+    if (num_tree_per_iteration_ == 1) {
+      OMP_INIT_EX();
+      #pragma omp parallel for schedule(static,1)
+      for (data_size_t i = 0; i < train_data->num_data(); ++i) {
+        OMP_LOOP_EX_BEGIN();
+        double label = label_ptr[i];
+        gradients_[i] = static_cast<score_t>(-label);
+        OMP_LOOP_EX_END();
+      }
+      OMP_THROW_EX();
+    }
+    else {
+      std::fill(gradients_.begin(), gradients_.end(), 0);
+      OMP_INIT_EX();
+      #pragma omp parallel for schedule(static,1)
+      for (data_size_t i = 0; i < train_data->num_data(); ++i) {
+        OMP_LOOP_EX_BEGIN();
+        double label = label_ptr[i];
+        gradients_[i + static_cast<data_size_t>(label) * num_data_] = -1;
+        OMP_LOOP_EX_END();
+      }
+      OMP_THROW_EX();
     }
-    std::vector<double> tmp_score(num_tree_per_iteration_ * num_data_, 0.0f);
-    objective_function_->
-      GetGradients(tmp_score.data(), gradients_.data(), hessians_.data());
+  }
+
+  void Boosting() override {
+
   }
 
   bool TrainOneIter(const score_t* gradients, const score_t* hessians) override {
     // bagging logic
     Bagging(iter_);
-    if (gradients == nullptr || hessians == nullptr) {
-      gradients = gradients_.data();
-      hessians = hessians_.data();
-    }
+    CHECK(gradients == nullptr);
+    CHECK(hessians == nullptr);
+    gradients = gradients_.data();
+    hessians = hessians_.data();
 
     for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
       std::unique_ptr<Tree> new_tree(new Tree(2));
-      if (class_need_train_[cur_tree_id]) {
-        size_t bias = static_cast<size_t>(cur_tree_id)* num_data_;
-
-        auto grad = gradients + bias;
-        auto hess = hessians + bias;
-
-        // need to copy gradients for bagging subset.
-        if (is_use_subset_ && bag_data_cnt_ < num_data_) {
-          for (int i = 0; i < bag_data_cnt_; ++i) {
-            tmp_grad_[bias + i] = grad[bag_data_indices_[i]];
-            tmp_hess_[bias + i] = hess[bag_data_indices_[i]];
-          }
-          grad = tmp_grad_.data() + bias;
-          hess = tmp_hess_.data() + bias;
+      size_t bias = static_cast<size_t>(cur_tree_id)* num_data_;
+      auto grad = gradients + bias;
+      auto hess = hessians + bias;
+
+      // need to copy gradients for bagging subset.
+      if (is_use_subset_ && bag_data_cnt_ < num_data_) {
+        for (int i = 0; i < bag_data_cnt_; ++i) {
+          tmp_grad_[i] = grad[bag_data_indices_[i]];
+          tmp_hess_[i] = hess[bag_data_indices_[i]];
         }
-
-        new_tree.reset(tree_learner_->Train(grad, hess, is_constant_hessian_,
-                                            forced_splits_json_));
+        grad = tmp_grad_.data();
+        hess = tmp_hess_.data();
       }
-
+      new_tree.reset(tree_learner_->Train(grad, hess, is_constant_hessian_,
+                                          forced_splits_json_));
       if (new_tree->num_leaves() > 1) {
+        tree_learner_->RenewTreeOutput(new_tree.get(), objective_function_, tmp_score_.data(),
+                                       num_data_, bag_data_indices_.data(), bag_data_cnt_);
         // update score
         MultiplyScore(cur_tree_id, (iter_ + num_init_iteration_));
-        ConvertTreeOutput(new_tree.get());
         UpdateScore(new_tree.get(), cur_tree_id);
         MultiplyScore(cur_tree_id, 1.0 / (iter_ + num_init_iteration_ + 1));
-      } else {
-        // only add default score one-time
-        if (!class_need_train_[cur_tree_id] && models_.size() < static_cast<size_t>(num_tree_per_iteration_)) {
-          double output = class_default_output_[cur_tree_id];
-          objective_function_->ConvertOutput(&output, &output);
-          new_tree->AsConstantTree(output);
-          train_score_updater_->AddScore(output, cur_tree_id);
-          for (auto& score_updater : valid_score_updater_) {
-            score_updater->AddScore(output, cur_tree_id);
-          }
-        }
       }
       // add model
       models_.push_back(std::move(new_tree));
@@ -169,15 +177,6 @@ class RF: public GBDT {
     }
   }
 
-  void ConvertTreeOutput(Tree* tree) {
-    tree->Shrinkage(1.0f);
-    for (int i = 0; i < tree->num_leaves(); ++i) {
-      double output = tree->LeafOutput(i);
-      objective_function_->ConvertOutput(&output, &output);
-      tree->SetLeafOutput(i, output);
-    }
-  }
-
   void AddValidDataset(const Dataset* valid_data,
                        const std::vector<const Metric*>& valid_metrics) override {
     GBDT::AddValidDataset(valid_data, valid_metrics);
@@ -201,6 +200,7 @@ class RF: public GBDT {
 
   std::vector<score_t> tmp_grad_;
   std::vector<score_t> tmp_hess_;
+  std::vector<double> tmp_score_;
 };
 
 }  // namespace LightGBM
diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py
index d0573754f9b..b3a342974c2 100644
--- a/tests/python_package_test/test_engine.py
+++ b/tests/python_package_test/test_engine.py
@@ -294,6 +294,33 @@ def test_multiclass(self):
         self.assertLess(ret, 0.2)
         self.assertAlmostEqual(evals_result['valid_0']['multi_logloss'][-1], ret, places=5)
 
+    def test_multiclass_rf(self):
+        X, y = load_digits(10, True)
+        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
+        params = {
+            'boosting_type': 'rf',
+            'objective': 'multiclass',
+            'metric': 'multi_logloss',
+            'bagging_freq': 1,
+            'bagging_fraction': 0.6,
+            'feature_fraction': 0.6,
+            'num_class': 10,
+            'num_leaves': 50,
+            'min_data': 1,
+            'verbose': -1
+        }
+        lgb_train = lgb.Dataset(X_train, y_train, params=params)
+        lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, params=params)
+        evals_result = {}
+        gbm = lgb.train(params, lgb_train,
+                        num_boost_round=100,
+                        valid_sets=lgb_eval,
+                        verbose_eval=False,
+                        evals_result=evals_result)
+        ret = multi_logloss(y_test, gbm.predict(X_test))
+        self.assertLess(ret, 0.4)
+        self.assertAlmostEqual(evals_result['valid_0']['multi_logloss'][-1], ret, places=5)
+
     def test_multiclass_prediction_early_stopping(self):
         X, y = load_digits(10, True)
         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
@@ -652,9 +679,44 @@ def test_refit(self):
         }
         lgb_train = lgb.Dataset(X_train, y_train)
         gbm = lgb.train(params, lgb_train,
-                        num_boost_round=20,
-                        verbose_eval=False)
+                        num_boost_round=20)
         err_pred = log_loss(y_test, gbm.predict(X_test))
         new_gbm = gbm.refit(X_test, y_test)
         new_err_pred = log_loss(y_test, new_gbm.predict(X_test))
         self.assertGreater(err_pred, new_err_pred)
+
+    def test_mape_rf(self):
+        X, y = load_boston(True)
+        params = {
+            'boosting_type': 'rf',
+            'objective': 'mape',
+            'verbose': -1,
+            'bagging_freq': 1,
+            'bagging_fraction': 0.8,
+            'feature_fraction': 0.8,
+            'boost_from_average': False
+        }
+        lgb_train = lgb.Dataset(X, y)
+        gbm = lgb.train(params, lgb_train,
+                        num_boost_round=20)
+        pred = gbm.predict(X)
+        pred_mean = pred.mean()
+        self.assertGreater(pred_mean, 20)
+
+    def test_mape_dart(self):
+        X, y = load_boston(True)
+        params = {
+            'boosting_type': 'dart',
+            'objective': 'mape',
+            'verbose': -1,
+            'bagging_freq': 1,
+            'bagging_fraction': 0.8,
+            'feature_fraction': 0.8,
+            'boost_from_average': False
+        }
+        lgb_train = lgb.Dataset(X, y)
+        gbm = lgb.train(params, lgb_train,
+                        num_boost_round=40)
+        pred = gbm.predict(X)
+        pred_mean = pred.mean()
+        self.assertGreater(pred_mean, 18)
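
Note (not part of the patch): a minimal usage sketch of the multi-class random-forest mode that this change enables, assuming a LightGBM build that includes it. The parameter choices mirror test_multiclass_rf above; the random data and the variable names (train_set, booster, proba) are placeholders for illustration only.

    import numpy as np
    import lightgbm as lgb

    # placeholder data: 500 samples, 20 features, 10 classes
    rng = np.random.RandomState(42)
    X = rng.rand(500, 20)
    y = rng.randint(0, 10, size=500)

    params = {
        'boosting_type': 'rf',        # random forest mode, now allowed with num_class > 1
        'objective': 'multiclass',
        'num_class': 10,
        'metric': 'multi_logloss',
        'bagging_freq': 1,            # rf mode requires bagging to be enabled
        'bagging_fraction': 0.6,
        'feature_fraction': 0.6,
        'verbose': -1,
    }
    train_set = lgb.Dataset(X, y, params=params)
    booster = lgb.train(params, train_set, num_boost_round=50)
    proba = booster.predict(X)        # shape (500, 10): per-class probabilities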