improve speed of regression task. (#381)
* reduce the sumup cost of constant hessians.

* fix test.

* fix bug when have weights.

* fix a comment.

* reduce branching.
guolinke committed Apr 5, 2017
1 parent 98ffbb2 commit d4c4d9a
Showing 14 changed files with 335 additions and 75 deletions.
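
The heart of the speedup is the first bullet of the commit message: for objectives whose second derivative is the same for every sample (plain L2 regression without sample weights, for instance), summing hessians per sample into the histograms is wasted work. A minimal standalone sketch of the idea, using illustrative types and function names rather than the LightGBM API:

#include <cstddef>
#include <vector>

// Illustrative histogram entry, mirroring the sum_gradients / sum_hessians / cnt layout.
struct Entry {
  double sum_gradients = 0.0;
  double sum_hessians = 0.0;
  int cnt = 0;
};

// General case: accumulate gradient and hessian for every sample.
void BuildHistogram(const std::vector<int>& bins, const std::vector<double>& grad,
                    const std::vector<double>& hess, std::vector<Entry>& out) {
  for (std::size_t i = 0; i < bins.size(); ++i) {
    out[bins[i]].sum_gradients += grad[i];
    out[bins[i]].sum_hessians += hess[i];
    ++out[bins[i]].cnt;
  }
}

// Constant-hessian case: drop the per-sample hessian adds and recover
// sum_hessians from the bin counts afterwards.
void BuildHistogramConstHess(const std::vector<int>& bins, const std::vector<double>& grad,
                             double const_hess, std::vector<Entry>& out) {
  for (std::size_t i = 0; i < bins.size(); ++i) {
    out[bins[i]].sum_gradients += grad[i];
    ++out[bins[i]].cnt;
  }
  for (Entry& e : out) {
    e.sum_hessians = e.cnt * const_hess;  // one multiply per bin instead of one add per sample
  }
}

The concrete version of this appears below: the gradient-only ConstructHistogram overloads in bin.h and dense_bin.hpp, plus the "fixed hessian" loop in Dataset::ConstructHistograms.
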
25 changes: 25 additions & 0 deletions include/LightGBM/bin.h
@@ -227,6 +227,16 @@ class OrderedBin {
virtual void ConstructHistogram(int leaf, const score_t* gradients,
const score_t* hessians, HistogramBinEntry* out) const = 0;

/*!
* \brief Construct histogram by using this bin
* Note: Unlike Bin, OrderedBin doesn't use ordered gradients and ordered hessians.
* Because it is hard to know the relative index in one leaf for sparse bin, since we skipped zero bins.
* \param leaf Using which leaf's data to construct
* \param gradients Gradients, Note: non-ordered by leaf
* \param out Output Result
*/
virtual void ConstructHistogram(int leaf, const score_t* gradients, HistogramBinEntry* out) const = 0;

/*!
* \brief Split current bin, and perform re-order by leaf
* \param leaf Using which leaf's to split
@@ -323,6 +333,21 @@ class Bin {
const score_t* ordered_gradients, const score_t* ordered_hessians,
HistogramBinEntry* out) const = 0;

/*!
* \brief Construct histogram of this feature,
* Note: We use ordered_gradients and ordered_hessians to improve cache hit chance
* The naive solution is using gradients[data_indices[i]] for data_indices[i] to get gradients, which is not cache friendly, since the access of memory is not continuous.
* ordered_gradients and ordered_hessians are preprocessed, and they are re-ordered by data_indices.
* Ordered_gradients[i] is aligned with data_indices[i]'s gradients (same for ordered_hessians).
* \param data_indices Used data indices in current leaf
* \param num_data Number of used data
* \param ordered_gradients Pointer to gradients, the data_indices[i]-th data's gradient is ordered_gradients[i]
* \param out Output Result
*/
virtual void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data,
const score_t* ordered_gradients, HistogramBinEntry* out) const = 0;

/*!
* \brief Split data according to threshold, if bin <= threshold, will put into left(lte_indices), else put into right(gt_indices)
* \param min_bin min_bin of current used feature
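
The comment above describes the ordered-gradient trick in prose; as a standalone sketch (hypothetical names, assuming a float score type), the gather step is a single pass that turns the later random accesses into sequential reads:

#include <cstddef>
#include <vector>

// Gather gradients once so that ordered_grad[i] belongs to data_indices[i];
// the histogram loop then reads ordered_grad sequentially instead of
// jumping around gradients[] through data_indices[].
void GatherOrderedGradients(const std::vector<int>& data_indices,
                            const std::vector<float>& gradients,
                            std::vector<float>& ordered_grad) {
  ordered_grad.resize(data_indices.size());
  for (std::size_t i = 0; i < data_indices.size(); ++i) {
    ordered_grad[i] = gradients[data_indices[i]];
  }
}

Dataset::ConstructHistograms in this commit does exactly this gather (see src/io/dataset.cpp below), and skips filling the ordered hessian buffer when the hessian is constant.
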
25 changes: 12 additions & 13 deletions include/LightGBM/dataset.h
@@ -386,23 +386,22 @@ class Dataset {

LIGHTGBM_EXPORT void CreateValid(const Dataset* dataset);

void ConstructHistograms(
const std::vector<int8_t>& is_feature_used,
const data_size_t* data_indices, data_size_t num_data,
int leaf_idx,
std::vector<std::unique_ptr<OrderedBin>>& ordered_bins,
const score_t* gradients, const score_t* hessians,
score_t* ordered_gradients, score_t* ordered_hessians,
HistogramBinEntry* histogram_data) const;
void ConstructHistograms(const std::vector<int8_t>& is_feature_used,
const data_size_t* data_indices, data_size_t num_data,
int leaf_idx,
std::vector<std::unique_ptr<OrderedBin>>& ordered_bins,
const score_t* gradients, const score_t* hessians,
score_t* ordered_gradients, score_t* ordered_hessians,
bool is_constant_hessian,
HistogramBinEntry* histogram_data) const;

void FixHistogram(int feature_idx, double sum_gradient, double sum_hessian, data_size_t num_data,
HistogramBinEntry* data) const;

inline data_size_t Split(
int feature,
uint32_t threshold,
data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const {
inline data_size_t Split(int feature,
uint32_t threshold,
data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const {
const int group = feature2group_[feature];
const int sub_feature = feature2subfeature_[feature];
return feature_groups_[group]->Split(sub_feature, threshold, data_indices, num_data, lte_indices, gt_indices);
2 changes: 2 additions & 0 deletions include/LightGBM/objective_function.h
@@ -33,6 +33,8 @@ class ObjectiveFunction {

virtual const char* GetName() const = 0;

virtual bool IsConstantHessian() const { return false; }

ObjectiveFunction() = default;
/*! \brief Disable copy */
ObjectiveFunction& operator=(const ObjectiveFunction&) = delete;
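
For context, a hedged sketch of how an objective might report a constant hessian through the new hook (an illustrative class, not the actual LightGBM regression objective): an unweighted squared-error loss has hessian 1.0 for every sample, while per-sample weights make the hessian vary, which is presumably the weighted case the commit message says was fixed.

// Hypothetical objective sketch: for squared error, gradient = score - label and
// hessian = 1, so the hessian is constant only when no per-sample weights are used.
class SketchL2Objective {  // would derive from LightGBM::ObjectiveFunction
 public:
  explicit SketchL2Objective(const float* weights) : weights_(weights) {}

  void GetGradients(const double* score, const float* label, int num_data,
                    float* gradients, float* hessians) const {
    for (int i = 0; i < num_data; ++i) {
      gradients[i] = static_cast<float>(score[i] - label[i]);
      hessians[i] = 1.0f;
      if (weights_ != nullptr) {
        gradients[i] *= weights_[i];
        hessians[i] = weights_[i];
      }
    }
  }

  bool IsConstantHessian() const { return weights_ == nullptr; }

 private:
  const float* weights_;  // nullptr when the dataset has no sample weights
};
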
3 changes: 2 additions & 1 deletion include/LightGBM/tree_learner.h
@@ -39,9 +39,10 @@ class TreeLearner {
* \brief training tree model on dataset
* \param gradients The first order gradients
* \param hessians The second order gradients
* \param is_constant_hessian True if all hessians share the same value
* \return A trained tree
*/
virtual Tree* Train(const score_t* gradients, const score_t* hessians) = 0;
virtual Tree* Train(const score_t* gradients, const score_t* hessians, bool is_constant_hessian) = 0;

/*!
* \brief use an existing tree to fit the new gradients and hessians.
8 changes: 6 additions & 2 deletions src/boosting/gbdt.cpp
@@ -82,7 +82,11 @@ void GBDT::ResetTrainingData(const BoostingConfig* config, const Dataset* train_
shrinkage_rate_ = new_config->learning_rate;

object_function_ = object_function;

if (object_function_ != nullptr) {
is_constant_hessian_ = object_function_->IsConstantHessian();
} else {
is_constant_hessian_ = false;
}
sigmoid_ = -1.0f;
if (object_function_ != nullptr
&& (std::string(object_function_->GetName()) == std::string("binary")
@@ -408,7 +412,7 @@ bool GBDT::TrainOneIter(const score_t* gradient, const score_t* hessian, bool is
std::unique_ptr<Tree> new_tree(new Tree(2));
if (class_need_train_[curr_class]) {
new_tree.reset(
tree_learner_->Train(gradient + curr_class * num_data_, hessian + curr_class * num_data_));
tree_learner_->Train(gradient + curr_class * num_data_, hessian + curr_class * num_data_, is_constant_hessian_));
}
#ifdef TIMETAG
tree_time += std::chrono::steady_clock::now() - start_time;
1 change: 1 addition & 0 deletions src/boosting/gbdt.h
@@ -345,6 +345,7 @@ class GBDT: public Boosting {
bool boost_from_average_;
std::vector<bool> class_need_train_;
std::vector<double> class_default_output_;
bool is_constant_hessian_;
};

} // namespace LightGBM
141 changes: 95 additions & 46 deletions src/io/dataset.cpp
@@ -401,70 +401,119 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
}
}

void Dataset::ConstructHistograms(
const std::vector<int8_t>& is_feature_used,
const data_size_t* data_indices, data_size_t num_data,
int leaf_idx,
std::vector<std::unique_ptr<OrderedBin>>& ordered_bins,
const score_t* gradients, const score_t* hessians,
score_t* ordered_gradients, score_t* ordered_hessians,
HistogramBinEntry* hist_data) const {
void Dataset::ConstructHistograms(const std::vector<int8_t>& is_feature_used,
const data_size_t* data_indices, data_size_t num_data,
int leaf_idx,
std::vector<std::unique_ptr<OrderedBin>>& ordered_bins,
const score_t* gradients, const score_t* hessians,
score_t* ordered_gradients, score_t* ordered_hessians,
bool is_constant_hessian,
HistogramBinEntry* hist_data) const {

if (leaf_idx < 0 || num_data <= 0 || hist_data == nullptr) {
return;
}
auto ptr_ordered_grad = gradients;
auto ptr_ordered_hess = hessians;
if (data_indices != nullptr && num_data < num_data_) {
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data; ++i) {
ordered_gradients[i] = gradients[data_indices[i]];
ordered_hessians[i] = hessians[data_indices[i]];
if (!is_constant_hessian) {
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data; ++i) {
ordered_gradients[i] = gradients[data_indices[i]];
ordered_hessians[i] = hessians[data_indices[i]];
}
} else {
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data; ++i) {
ordered_gradients[i] = gradients[data_indices[i]];
}
}
ptr_ordered_grad = ordered_gradients;
ptr_ordered_hess = ordered_hessians;
}
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
for (int group = 0; group < num_groups_; ++group) {
OMP_LOOP_EX_BEGIN();
bool is_groud_used = false;
const int f_cnt = group_feature_cnt_[group];
for (int j = 0; j < f_cnt; ++j) {
const int fidx = group_feature_start_[group] + j;
if (is_feature_used[fidx]) {
is_groud_used = true;
break;
if (!is_constant_hessian) {
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
for (int group = 0; group < num_groups_; ++group) {
OMP_LOOP_EX_BEGIN();
bool is_groud_used = false;
const int f_cnt = group_feature_cnt_[group];
for (int j = 0; j < f_cnt; ++j) {
const int fidx = group_feature_start_[group] + j;
if (is_feature_used[fidx]) {
is_groud_used = true;
break;
}
}
if (!is_groud_used) { continue; }
// feature is not used
auto data_ptr = hist_data + group_bin_boundaries_[group];
const int num_bin = feature_groups_[group]->num_total_bin_;
std::memset(data_ptr + 1, 0, (num_bin - 1) * sizeof(HistogramBinEntry));
// construct histograms for smaller leaf
if (ordered_bins[group] == nullptr) {
// if not use ordered bin
feature_groups_[group]->bin_data_->ConstructHistogram(
data_indices,
num_data,
ptr_ordered_grad,
ptr_ordered_hess,
data_ptr);
} else {
// used ordered bin
ordered_bins[group]->ConstructHistogram(leaf_idx,
gradients,
hessians,
data_ptr);
}
OMP_LOOP_EX_END();
}
if (!is_groud_used) { continue; }
// feature is not used
auto data_ptr = hist_data + group_bin_boundaries_[group];
const int num_bin = feature_groups_[group]->num_total_bin_;
std::memset(data_ptr + 1, 0, (num_bin - 1) * sizeof(HistogramBinEntry));
// construct histograms for smaller leaf
if (ordered_bins[group] == nullptr) {
// if not use ordered bin
feature_groups_[group]->bin_data_->ConstructHistogram(
data_indices,
num_data,
ptr_ordered_grad,
ptr_ordered_hess,
data_ptr);
} else {
// used ordered bin
ordered_bins[group]->ConstructHistogram(leaf_idx,
gradients,
hessians,
data_ptr);
OMP_THROW_EX();
} else {
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
for (int group = 0; group < num_groups_; ++group) {
OMP_LOOP_EX_BEGIN();
bool is_groud_used = false;
const int f_cnt = group_feature_cnt_[group];
for (int j = 0; j < f_cnt; ++j) {
const int fidx = group_feature_start_[group] + j;
if (is_feature_used[fidx]) {
is_groud_used = true;
break;
}
}
if (!is_groud_used) { continue; }
// feature is not used
auto data_ptr = hist_data + group_bin_boundaries_[group];
const int num_bin = feature_groups_[group]->num_total_bin_;
std::memset(data_ptr + 1, 0, (num_bin - 1) * sizeof(HistogramBinEntry));
// construct histograms for smaller leaf
if (ordered_bins[group] == nullptr) {
// if not use ordered bin
feature_groups_[group]->bin_data_->ConstructHistogram(
data_indices,
num_data,
ptr_ordered_grad,
data_ptr);
} else {
// used ordered bin
ordered_bins[group]->ConstructHistogram(leaf_idx,
gradients,
data_ptr);
}
// fixed hessian.
for (int i = 0; i < num_bin; ++i) {
data_ptr[i].sum_hessians = data_ptr[i].cnt * hessians[0];
}
OMP_LOOP_EX_END();
}
OMP_LOOP_EX_END();
OMP_THROW_EX();
}
OMP_THROW_EX();
}

void Dataset::FixHistogram(int feature_idx, double sum_gradient, double sum_hessian, data_size_t num_data,
HistogramBinEntry* data) const {
HistogramBinEntry* data) const {
const int group = feature2group_[feature_idx];
const int sub_feature = feature2subfeature_[feature_idx];
const BinMapper* bin_mapper = feature_groups_[group]->bin_mappers_[sub_feature].get();
63 changes: 59 additions & 4 deletions src/io/dense_bin.hpp
@@ -13,7 +13,7 @@ template <typename VAL_T>
class DenseBin;

template <typename VAL_T>
class DenseBinIterator : public BinIterator {
class DenseBinIterator: public BinIterator {
public:
explicit DenseBinIterator(const DenseBin<VAL_T>* bin_data, uint32_t min_bin, uint32_t max_bin, uint32_t default_bin)
: bin_data_(bin_data), min_bin_(static_cast<VAL_T>(min_bin)),
@@ -39,7 +39,7 @@ class DenseBinIterator : public BinIterator {
* Use template to reduce memory cost
*/
template <typename VAL_T>
class DenseBin : public Bin {
class DenseBin: public Bin {
public:
friend DenseBinIterator<VAL_T>;
DenseBin(data_size_t num_data)
@@ -63,8 +63,8 @@ class DenseBin : public Bin {
BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) const override;

void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data,
const score_t* ordered_gradients, const score_t* ordered_hessians,
HistogramBinEntry* out) const override {
const score_t* ordered_gradients, const score_t* ordered_hessians,
HistogramBinEntry* out) const override {
// use 4-way unrolling, will be faster
if (data_indices != nullptr) { // if use part of data
const data_size_t rest = num_data & 0x3;
@@ -129,6 +129,61 @@ class DenseBin : public Bin {
}
}

void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data,
const score_t* ordered_gradients,
HistogramBinEntry* out) const override {
// use 4-way unrolling, will be faster
if (data_indices != nullptr) { // if use part of data
const data_size_t rest = num_data & 0x3;
data_size_t i = 0;
for (; i < num_data - rest; i += 4) {
const VAL_T bin0 = data_[data_indices[i]];
const VAL_T bin1 = data_[data_indices[i + 1]];
const VAL_T bin2 = data_[data_indices[i + 2]];
const VAL_T bin3 = data_[data_indices[i + 3]];

out[bin0].sum_gradients += ordered_gradients[i];
out[bin1].sum_gradients += ordered_gradients[i + 1];
out[bin2].sum_gradients += ordered_gradients[i + 2];
out[bin3].sum_gradients += ordered_gradients[i + 3];

++out[bin0].cnt;
++out[bin1].cnt;
++out[bin2].cnt;
++out[bin3].cnt;
}
for (; i < num_data; ++i) {
const VAL_T bin = data_[data_indices[i]];
out[bin].sum_gradients += ordered_gradients[i];
++out[bin].cnt;
}
} else { // use full data
const data_size_t rest = num_data & 0x3;
data_size_t i = 0;
for (; i < num_data - rest; i += 4) {
const VAL_T bin0 = data_[i];
const VAL_T bin1 = data_[i + 1];
const VAL_T bin2 = data_[i + 2];
const VAL_T bin3 = data_[i + 3];

out[bin0].sum_gradients += ordered_gradients[i];
out[bin1].sum_gradients += ordered_gradients[i + 1];
out[bin2].sum_gradients += ordered_gradients[i + 2];
out[bin3].sum_gradients += ordered_gradients[i + 3];

++out[bin0].cnt;
++out[bin1].cnt;
++out[bin2].cnt;
++out[bin3].cnt;
}
for (; i < num_data; ++i) {
const VAL_T bin = data_[i];
out[bin].sum_gradients += ordered_gradients[i];
++out[bin].cnt;
}
}
}

virtual data_size_t Split(
uint32_t min_bin, uint32_t max_bin, uint32_t default_bin,
uint32_t threshold, data_size_t* data_indices, data_size_t num_data,
