add FitByExistingTree.
guolinke committed Mar 25, 2017
1 parent 32ef85d commit 8a6bd5e
Showing 8 changed files with 95 additions and 55 deletions.
7 changes: 6 additions & 1 deletion include/LightGBM/tree_learner.h
@@ -43,6 +43,11 @@ class TreeLearner {
*/
virtual Tree* Train(const score_t* gradients, const score_t* hessians) = 0;

+ /*!
+ * \brief use an existing tree to fit the new gradients and hessians.
+ */
+ virtual Tree* FitByExistingTree(const Tree* old_tree, const score_t* gradients, const score_t* hessians) const = 0;
+
/*!
* \brief Set bagging data
* \param used_indices Used data indices
@@ -55,7 +60,7 @@ class TreeLearner {
* \brief Using last trained tree to predict score then adding to out_score;
* \param out_score output score
*/
- virtual void AddPredictionToScore(double* out_score) const = 0;
+ virtual void AddPredictionToScore(const Tree* tree, double* out_score) const = 0;

TreeLearner() = default;
/*! \brief Disable copy */
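For context, the two virtuals touched in this header split one implicit workflow in two: Train() grows a new tree, the new FitByExistingTree() reuses an already-grown tree's structure and only refits its leaf values from fresh gradients and hessians, and AddPredictionToScore() now receives the tree explicitly instead of relying on a learner-held pointer to the last trained tree. Below is a minimal sketch, not part of the commit, of how a caller might drive such an interface; ToyTree and ToyLearner are hypothetical stand-ins for LightGBM's Tree and TreeLearner, reduced to a single leaf so the example stays self-contained.

```cpp
// Sketch only (not from this commit): a toy learner exercising the updated
// TreeLearner-style interface. "ToyTree" / "ToyLearner" are hypothetical
// stand-ins; a real Tree also stores split structure, thresholds, etc.
#include <cstdio>
#include <memory>
#include <vector>

using score_t = float;

struct ToyTree {
  std::vector<double> leaf_output;  // one output per leaf; structure omitted
  int num_leaves() const { return static_cast<int>(leaf_output.size()); }
};

class ToyLearner {
 public:
  // Grow a (single-leaf) tree from gradients/hessians: a plain Newton step.
  ToyTree* Train(const score_t* g, const score_t* h, int n) const {
    return new ToyTree{{NewtonStep(g, h, n)}};
  }
  // Keep old_tree's structure, refit only the leaf values from new g/h.
  ToyTree* FitByExistingTree(const ToyTree* old_tree, const score_t* g,
                             const score_t* h, int n) const {
    auto* t = new ToyTree(*old_tree);            // copy structure
    for (auto& out : t->leaf_output) out = NewtonStep(g, h, n);
    return t;
  }
  // Add the given tree's training predictions onto out_score.
  void AddPredictionToScore(const ToyTree* tree, double* out_score, int n) const {
    for (int i = 0; i < n; ++i) out_score[i] += tree->leaf_output[0];
  }

 private:
  static double NewtonStep(const score_t* g, const score_t* h, int n) {
    double sg = 0.0, sh = 1e-15;                 // guard against zero hessian
    for (int i = 0; i < n; ++i) { sg += g[i]; sh += h[i]; }
    return -sg / sh;
  }
};

int main() {
  std::vector<score_t> grad = {0.5f, -1.0f, 0.25f}, hess = {1.0f, 1.0f, 1.0f};
  std::vector<double> score(grad.size(), 0.0);
  ToyLearner learner;
  std::unique_ptr<ToyTree> tree(learner.Train(grad.data(), hess.data(), 3));
  // Later, with re-computed gradients, reuse the same structure:
  std::vector<score_t> grad2 = {0.1f, -0.2f, 0.3f};
  std::unique_ptr<ToyTree> refit(
      learner.FitByExistingTree(tree.get(), grad2.data(), hess.data(), 3));
  learner.AddPredictionToScore(refit.get(), score.data(), 3);
  std::printf("refit leaf output = %f\n", refit->leaf_output[0]);
  return 0;
}
```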
2 changes: 1 addition & 1 deletion src/boosting/gbdt.cpp
@@ -452,7 +452,7 @@ void GBDT::UpdateScore(const Tree* tree, const int curr_class) {
#endif
// update training score
if (!is_use_subset_) {
- train_score_updater_->AddScore(tree_learner_.get(), curr_class);
+ train_score_updater_->AddScore(tree_learner_.get(), tree, curr_class);
} else {
train_score_updater_->AddScore(tree, curr_class);
}
10 changes: 5 additions & 5 deletions src/boosting/score_updater.hpp
@@ -70,19 +70,19 @@ class ScoreUpdater {
/*!
* \brief Adding prediction score, only used for training data.
* The training data is partitioned into tree leaves after training
- * Based on which We can get prediction quckily.
+ * Based on which We can get prediction quickly.
* \param tree_learner
* \param curr_class Current class for multiclass training
*/
- inline void AddScore(const TreeLearner* tree_learner, int curr_class) {
- tree_learner->AddPredictionToScore(score_.data() + curr_class * num_data_);
+ inline void AddScore(const TreeLearner* tree_learner, const Tree* tree, int curr_class) {
+ tree_learner->AddPredictionToScore(tree, score_.data() + curr_class * num_data_);
}
/*!
* \brief Using tree model to get prediction number, then adding to scores for parts of data
* Used for prediction of training out-of-bag data
* \param tree Trained tree model
- * \param data_indices Indices of data that will be proccessed
- * \param data_cnt Number of data that will be proccessed
+ * \param data_indices Indices of data that will be processed
+ * \param data_cnt Number of data that will be processed
* \param curr_class Current class for multiclass training
*/
inline void AddScore(const Tree* tree, const data_size_t* data_indices,
80 changes: 47 additions & 33 deletions src/treelearner/feature_histogram.hpp
@@ -8,7 +8,7 @@

#include <cstring>

- namespace LightGBM
+ namespace LightGBM
{

class FeatureMetainfo {
@@ -45,10 +45,10 @@ class FeatureHistogram {
data_ = data;
if (bin_type == BinType::NumericalBin) {
find_best_threshold_fun_ = std::bind(&FeatureHistogram::FindBestThresholdNumerical, this, std::placeholders::_1
- , std::placeholders::_2, std::placeholders::_3, std::placeholders::_4);
+ , std::placeholders::_2, std::placeholders::_3, std::placeholders::_4);
} else {
find_best_threshold_fun_ = std::bind(&FeatureHistogram::FindBestThresholdCategorical, this, std::placeholders::_1
- , std::placeholders::_2, std::placeholders::_3, std::placeholders::_4);
+ , std::placeholders::_2, std::placeholders::_3, std::placeholders::_4);
}
}

@@ -68,12 +68,12 @@ class FeatureHistogram {
}

void FindBestThreshold(double sum_gradient, double sum_hessian, data_size_t num_data,
- SplitInfo* output) {
+ SplitInfo* output) {
find_best_threshold_fun_(sum_gradient, sum_hessian + 2 * kEpsilon, num_data, output);
}

void FindBestThresholdNumerical(double sum_gradient, double sum_hessian, data_size_t num_data,
- SplitInfo* output) {
+ SplitInfo* output) {
double best_sum_left_gradient = NAN;
double best_sum_left_hessian = NAN;
double best_gain = kMinScore;
@@ -82,7 +82,8 @@
double sum_right_gradient = 0.0f;
double sum_right_hessian = kEpsilon;
data_size_t right_count = 0;
- double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian);
+ double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian,
+ meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
double min_gain_shift = gain_shift + meta_->tree_config->min_gain_to_split;
is_splittable_ = false;
const int bias = meta_->bias;
@@ -95,7 +96,7 @@
right_count += data_[t].cnt;
// if data not enough, or sum hessian too small
if (right_count < meta_->tree_config->min_data_in_leaf
- || sum_right_hessian < meta_->tree_config->min_sum_hessian_in_leaf) continue;
+ || sum_right_hessian < meta_->tree_config->min_sum_hessian_in_leaf) continue;
data_size_t left_count = num_data - right_count;
// if data not enough
if (left_count < meta_->tree_config->min_data_in_leaf) break;
@@ -106,8 +107,10 @@

double sum_left_gradient = sum_gradient - sum_right_gradient;
// current split gain
- double current_gain = GetLeafSplitGain(sum_left_gradient, sum_left_hessian)
- + GetLeafSplitGain(sum_right_gradient, sum_right_hessian);
+ double current_gain = GetLeafSplitGain(sum_left_gradient, sum_left_hessian,
+ meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2)
+ + GetLeafSplitGain(sum_right_gradient, sum_right_hessian,
+ meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
// gain with split is worse than without split
if (current_gain <= min_gain_shift) continue;

@@ -126,12 +129,14 @@
if (is_splittable_) {
// update split information
output->threshold = best_threshold;
- output->left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, best_sum_left_hessian);
+ output->left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, best_sum_left_hessian,
+ meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
output->left_count = best_left_count;
output->left_sum_gradient = best_sum_left_gradient;
output->left_sum_hessian = best_sum_left_hessian - kEpsilon;
output->right_output = CalculateSplittedLeafOutput(sum_gradient - best_sum_left_gradient,
- sum_hessian - best_sum_left_hessian);
+ sum_hessian - best_sum_left_hessian,
+ meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
output->right_count = num_data - best_left_count;
output->right_sum_gradient = sum_gradient - best_sum_left_gradient;
output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon;
@@ -142,13 +147,14 @@
}

void FindBestThresholdCategorical(double sum_gradient, double sum_hessian, data_size_t num_data,
- SplitInfo* output) {
+ SplitInfo* output) {
double best_gain = kMinScore;
uint32_t best_threshold = static_cast<uint32_t>(meta_->num_bin);
data_size_t best_left_count = 0;
double best_sum_left_gradient = 0.0f;
double best_sum_left_hessian = 0.0f;
- double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian);
+ double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian,
+ meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
double min_gain_shift = gain_shift + meta_->tree_config->min_gain_to_split;
is_splittable_ = false;
const int bias = meta_->bias;
@@ -158,7 +164,7 @@
for (; t >= t_end; --t) {
// if data not enough, or sum hessian too small
if (data_[t].cnt < meta_->tree_config->min_data_in_leaf
- || data_[t].sum_hessians < meta_->tree_config->min_sum_hessian_in_leaf) continue;
+ || data_[t].sum_hessians < meta_->tree_config->min_sum_hessian_in_leaf) continue;
data_size_t other_count = num_data - data_[t].cnt;
// if data not enough
if (other_count < meta_->tree_config->min_data_in_leaf) continue;
@@ -169,8 +175,10 @@

double sum_other_gradient = sum_gradient - data_[t].sum_gradients;
// current split gain
- double current_gain = GetLeafSplitGain(sum_other_gradient, sum_other_hessian)
- + GetLeafSplitGain(data_[t].sum_gradients, data_[t].sum_hessians + kEpsilon);
+ double current_gain = GetLeafSplitGain(sum_other_gradient, sum_other_hessian,
+ meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2)
+ + GetLeafSplitGain(data_[t].sum_gradients, data_[t].sum_hessians + kEpsilon,
+ meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
// gain with split is worse than without split
if (current_gain <= min_gain_shift) continue;

@@ -199,12 +207,14 @@
data_size_t other_count = num_data - cnt_bin0;
double sum_other_hessian = sum_hessian - sum_bin0_hessian - kEpsilon;
if (cnt_bin0 >= meta_->tree_config->min_data_in_leaf
- && sum_bin0_hessian >= meta_->tree_config->min_sum_hessian_in_leaf
- && other_count >= meta_->tree_config->min_data_in_leaf
- && sum_other_hessian >= meta_->tree_config->min_sum_hessian_in_leaf) {
+ && sum_bin0_hessian >= meta_->tree_config->min_sum_hessian_in_leaf
+ && other_count >= meta_->tree_config->min_data_in_leaf
+ && sum_other_hessian >= meta_->tree_config->min_sum_hessian_in_leaf) {
double sum_other_gradient = sum_gradient - sum_bin0_gradient;
- double current_gain = GetLeafSplitGain(sum_other_gradient, sum_other_hessian)
- + GetLeafSplitGain(sum_bin0_gradient, sum_bin0_hessian + kEpsilon);
+ double current_gain = GetLeafSplitGain(sum_other_gradient, sum_other_hessian,
+ meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2)
+ + GetLeafSplitGain(sum_bin0_gradient, sum_bin0_hessian + kEpsilon,
+ meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
if (current_gain > min_gain_shift) {
is_splittable_ = true;
// better split point
@@ -221,12 +231,14 @@
if (is_splittable_) {
// update split information
output->threshold = best_threshold;
- output->left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, best_sum_left_hessian);
+ output->left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, best_sum_left_hessian,
+ meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
output->left_count = best_left_count;
output->left_sum_gradient = best_sum_left_gradient;
output->left_sum_hessian = best_sum_left_hessian - kEpsilon;
output->right_output = CalculateSplittedLeafOutput(sum_gradient - best_sum_left_gradient,
- sum_hessian - best_sum_left_hessian);
+ sum_hessian - best_sum_left_hessian,
+ meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
output->right_count = num_data - best_left_count;
output->right_sum_gradient = sum_gradient - best_sum_left_gradient;
output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon;
@@ -260,18 +272,17 @@
*/
void set_is_splittable(bool val) { is_splittable_ = val; }

- private:
/*!
* \brief Calculate the split gain based on regularized sum_gradients and sum_hessians
* \param sum_gradients
* \param sum_hessians
* \return split gain
*/
- double GetLeafSplitGain(double sum_gradients, double sum_hessians) const {
+ static double GetLeafSplitGain(double sum_gradients, double sum_hessians, double l1, double l2) {
double abs_sum_gradients = std::fabs(sum_gradients);
- double reg_abs_sum_gradients = std::max(0.0, abs_sum_gradients - meta_->tree_config->lambda_l1);
+ double reg_abs_sum_gradients = std::max(0.0, abs_sum_gradients - l1);
return (reg_abs_sum_gradients * reg_abs_sum_gradients)
- / (sum_hessians + meta_->tree_config->lambda_l2);
+ / (sum_hessians + l2);

}

@@ -281,12 +292,15 @@
* \param sum_hessians
* \return leaf output
*/
- double CalculateSplittedLeafOutput(double sum_gradients, double sum_hessians) const {
+ static double CalculateSplittedLeafOutput(double sum_gradients, double sum_hessians, double l1, double l2) {
double abs_sum_gradients = std::fabs(sum_gradients);
- double reg_abs_sum_gradients = std::max(0.0, abs_sum_gradients - meta_->tree_config->lambda_l1);
+ double reg_abs_sum_gradients = std::max(0.0, abs_sum_gradients - l1);
return -std::copysign(reg_abs_sum_gradients, sum_gradients)
- / (sum_hessians + meta_->tree_config->lambda_l2);
+ / (sum_hessians + l2);
}

+ private:

const FeatureMetainfo* meta_;
/*! \brief sum of gradient of each bin */
HistogramBinEntry* data_;
@@ -346,7 +360,7 @@ class HistogramPool {
void DynamicChangeSize(const Dataset* train_data, const TreeConfig* tree_config, int cache_size, int total_size) {
if (feature_metas_.empty()) {
feature_metas_.resize(train_data->num_features());
- #pragma omp parallel for schedule(static)
+ #pragma omp parallel for schedule(static)
for (int i = 0; i < train_data->num_features(); ++i) {
feature_metas_[i].num_bin = train_data->FeatureNumBin(i);
if (train_data->FeatureBinMapper(i)->GetDefaultBin() == 0) {
@@ -363,7 +377,7 @@
Reset(cache_size, total_size);
pool_.resize(cache_size);
data_.resize(cache_size);
- #pragma omp parallel for schedule(static)
+ #pragma omp parallel for schedule(static)
for (int i = old_cache_size; i < cache_size_; ++i) {
pool_[i].reset(new FeatureHistogram[train_data->num_features()]);
data_[i].resize(num_total_bin);
@@ -382,7 +396,7 @@
}

void ResetConfig(const TreeConfig* tree_config) {
- #pragma omp parallel for schedule(static)
+ #pragma omp parallel for schedule(static)
for (int i = 0; i < static_cast<int>(feature_metas_.size()); ++i) {
feature_metas_[i].tree_config = tree_config;
}
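The substantive change in this file is that GetLeafSplitGain and CalculateSplittedLeafOutput no longer read lambda_l1 / lambda_l2 from meta_->tree_config; they become static helpers that take l1 and l2 explicitly, so code outside FeatureHistogram (such as FitByExistingTree below) can call them without a histogram instance. The following standalone sketch, not part of the commit, restates the two formulas with made-up numbers; the real code additionally applies kEpsilon guards and the min_data_in_leaf / min_sum_hessian_in_leaf constraints.

```cpp
// Sketch of the regularized leaf-output / split-gain math used above.
// Free functions mirroring the now-static helpers; values are illustrative.
#include <algorithm>
#include <cmath>
#include <cstdio>

// gain(G, H) = (max(0, |G| - l1))^2 / (H + l2)
static double LeafSplitGain(double sum_gradients, double sum_hessians,
                            double l1, double l2) {
  double reg = std::max(0.0, std::fabs(sum_gradients) - l1);
  return reg * reg / (sum_hessians + l2);
}

// output(G, H) = -sign(G) * max(0, |G| - l1) / (H + l2)
static double SplittedLeafOutput(double sum_gradients, double sum_hessians,
                                 double l1, double l2) {
  double reg = std::max(0.0, std::fabs(sum_gradients) - l1);
  return -std::copysign(reg, sum_gradients) / (sum_hessians + l2);
}

int main() {
  const double l1 = 0.0, l2 = 1.0;   // hypothetical lambda_l1 / lambda_l2
  const double G = -7.0, H = 10.0;   // gradient/hessian sums on a leaf
  const double Gl = -5.0, Hl = 6.0;  // sums on the proposed left child
  double gain_no_split = LeafSplitGain(G, H, l1, l2);
  double gain_split = LeafSplitGain(Gl, Hl, l1, l2) +
                      LeafSplitGain(G - Gl, H - Hl, l1, l2);
  // A split is kept only if gain_split exceeds gain_no_split plus min_gain_to_split.
  std::printf("no-split %.4f, split %.4f, left output %.4f\n",
              gain_no_split, gain_split, SplittedLeafOutput(Gl, Hl, l1, l2));
  return 0;
}
```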
2 changes: 1 addition & 1 deletion src/treelearner/parallel_tree_learner.h
@@ -103,7 +103,7 @@ class VotingParallelTreeLearner: public SerialTreeLearner {
void ResetConfig(const TreeConfig* tree_config) override;
protected:
void BeforeTrain() override;
- bool BeforeFindBestSplit(int left_leaf, int right_leaf) override;
+ bool BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) override;
void FindBestThresholds() override;
void FindBestSplitsForLeaves() override;
void Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) override;
31 changes: 26 additions & 5 deletions src/treelearner/serial_tree_learner.cpp
@@ -179,8 +179,6 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians
#endif

auto tree = std::unique_ptr<Tree>(new Tree(tree_config_->num_leaves));
- // save pointer to last trained tree
- last_trained_tree_ = tree.get();
// root leaf
int left_leaf = 0;
int cur_depth = 1;
@@ -191,7 +189,7 @@
start_time = std::chrono::steady_clock::now();
#endif
// some initial works before finding best split
- if (BeforeFindBestSplit(left_leaf, right_leaf)) {
+ if (BeforeFindBestSplit(tree.get(), left_leaf, right_leaf)) {
#ifdef TIMETAG
init_split_time += std::chrono::steady_clock::now() - start_time;
#endif
@@ -223,6 +221,29 @@
return tree.release();
}

+ Tree* SerialTreeLearner::FitByExistingTree(const Tree* old_tree, const score_t* gradients, const score_t *hessians) const {
+ auto tree = std::unique_ptr<Tree>(new Tree(*old_tree));
+ CHECK(data_partition_->num_leaves() >= tree->num_leaves());
+ #pragma omp parallel for schedule(static)
+ for (int i = 0; i < data_partition_->num_leaves(); ++i) {
+ data_size_t cnt_leaf_data = 0;
+ auto tmp_idx = data_partition_->GetIndexOnLeaf(i, &cnt_leaf_data);
+ double sum_grad = 0.0f;
+ double sum_hess = 0.0f;
+ for (data_size_t j = 0; j < cnt_leaf_data; ++j) {
+ auto idx = tmp_idx[j];
+ sum_grad += gradients[idx];
+ sum_hess += hessians[idx];
+ }
+ // avoid zero hessians.
+ if (sum_hess <= 0) sum_hess = kEpsilon;
+ double output = FeatureHistogram::CalculateSplittedLeafOutput(sum_grad, sum_hess,
+ tree_config_->lambda_l1, tree_config_->lambda_l2);
+ tree->SetLeafOutput(i, output);
+ }
+ return tree.release();
+ }
+
void SerialTreeLearner::BeforeTrain() {

// reset histogram pool
@@ -305,11 +326,11 @@ void SerialTreeLearner::BeforeTrain() {
}
}

- bool SerialTreeLearner::BeforeFindBestSplit(int left_leaf, int right_leaf) {
+ bool SerialTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) {
// check depth of current leaf
if (tree_config_->max_depth > 0) {
// only need to check left leaf, since right leaf is in same level of left leaf
- if (last_trained_tree_->leaf_depth(left_leaf) >= tree_config_->max_depth) {
+ if (tree->leaf_depth(left_leaf) >= tree_config_->max_depth) {
best_split_per_leaf_[left_leaf].gain = kMinScore;
if (right_leaf >= 0) {
best_split_per_leaf_[right_leaf].gain = kMinScore;
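In words, the added FitByExistingTree copies the old tree, then for every leaf of the current data partition it sums the new gradients and hessians of the rows sitting in that leaf, guards a non-positive hessian sum with kEpsilon, and overwrites the leaf output with the regularized Newton step CalculateSplittedLeafOutput(sum_grad, sum_hess, lambda_l1, lambda_l2). Below is a compact standalone sketch of that per-leaf loop, not the committed code, using a hypothetical flat row-to-leaf mapping in place of LightGBM's DataPartition.

```cpp
// Sketch (not the committed code): refitting leaf outputs from new gradients,
// assuming leaf_of[i] gives the leaf index each training row falls into.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

static double LeafOutput(double g, double h, double l1, double l2) {
  double reg = std::max(0.0, std::fabs(g) - l1);
  return -std::copysign(reg, g) / (h + l2);
}

int main() {
  const double kEps = 1e-15, l1 = 0.0, l2 = 1.0;   // illustrative parameters
  std::vector<int> leaf_of = {0, 0, 1, 1, 1};      // row -> leaf partition
  std::vector<float> grad = {0.4f, 0.6f, -0.9f, -0.3f, -0.3f};
  std::vector<float> hess = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
  int num_leaves = 2;
  std::vector<double> new_leaf_output(num_leaves, 0.0);

  for (int leaf = 0; leaf < num_leaves; ++leaf) {
    double sum_grad = 0.0, sum_hess = 0.0;
    for (size_t i = 0; i < leaf_of.size(); ++i) {
      if (leaf_of[i] != leaf) continue;
      sum_grad += grad[i];
      sum_hess += hess[i];
    }
    if (sum_hess <= 0) sum_hess = kEps;            // avoid zero hessians
    new_leaf_output[leaf] = LeafOutput(sum_grad, sum_hess, l1, l2);
  }
  for (int leaf = 0; leaf < num_leaves; ++leaf)
    std::printf("leaf %d -> %.4f\n", leaf, new_leaf_output[leaf]);
  return 0;
}
```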
14 changes: 7 additions & 7 deletions src/treelearner/serial_tree_learner.h
@@ -38,15 +38,18 @@ class SerialTreeLearner: public TreeLearner {

Tree* Train(const score_t* gradients, const score_t *hessians) override;

+ Tree* FitByExistingTree(const Tree* old_tree, const score_t* gradients, const score_t* hessians) const override;
+
void SetBaggingData(const data_size_t* used_indices, data_size_t num_data) override {
data_partition_->SetUsedDataIndices(used_indices, num_data);
}

- void AddPredictionToScore(double* out_score) const override {
- if (last_trained_tree_->num_leaves() <= 1) { return; }
+ void AddPredictionToScore(const Tree* tree, double* out_score) const override {
+ if (tree->num_leaves() <= 1) { return; }
+ CHECK(tree->num_leaves() <= data_partition_->num_leaves());
#pragma omp parallel for schedule(static)
for (int i = 0; i < data_partition_->num_leaves(); ++i) {
- double output = static_cast<double>(last_trained_tree_->LeafOutput(i));
+ double output = static_cast<double>(tree->LeafOutput(i));
data_size_t cnt_leaf_data = 0;
auto tmp_idx = data_partition_->GetIndexOnLeaf(i, &cnt_leaf_data);
for (data_size_t j = 0; j < cnt_leaf_data; ++j) {
@@ -64,7 +67,7 @@
/*!
* \brief Some initial works before FindBestSplit
*/
- virtual bool BeforeFindBestSplit(int left_leaf, int right_leaf);
+ virtual bool BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf);


/*!
@@ -95,9 +98,6 @@
* \return The number of data in the leaf_idx leaf
*/
inline virtual data_size_t GetGlobalDataCountInLeaf(int leaf_idx) const;

- /*! \brief Last trained decision tree */
- const Tree* last_trained_tree_;
/*! \brief number of data */
data_size_t num_data_;
/*! \brief number of features */
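With the last_trained_tree_ member removed, AddPredictionToScore works off the tree passed in (with a CHECK that its leaf count fits the partition) and still exploits the fact that training rows are already grouped by leaf, so no per-row tree traversal is needed. A hedged sketch of that pattern, not the committed code, with a simplified per-leaf index list standing in for DataPartition:

```cpp
// Sketch (not the committed code): adding a tree's leaf outputs onto training
// scores via a per-leaf index list, mirroring the loop shown above.
#include <cstdio>
#include <vector>

int main() {
  // Hypothetical partition: indices of the rows sitting in each leaf.
  std::vector<std::vector<int>> leaf_indices = {{0, 3}, {1, 2, 4}};
  std::vector<double> leaf_output = {0.25, -0.40};  // outputs of a fitted tree
  std::vector<double> score(5, 0.0);                // running training scores

  for (size_t leaf = 0; leaf < leaf_indices.size(); ++leaf) {
    double out = leaf_output[leaf];
    for (int idx : leaf_indices[leaf])              // no tree traversal needed
      score[idx] += out;
  }
  for (size_t i = 0; i < score.size(); ++i)
    std::printf("score[%zu] = %.2f\n", i, score[i]);
  return 0;
}
```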