improve speed of regression task. (#381)
* reduce the sumup cost of constant hessians.

* fix test.

* fix bug when have weights.

* fix a comment.

* reduce branching.
guolinke committed Apr 5, 2017
1 parent 98ffbb2 commit d4c4d9a
Showing 14 changed files with 335 additions and 75 deletions.
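
The heart of the speedup is the first bullet of the commit message: for objectives whose second derivative is the same for every sample (plain L2 regression without sample weights, for instance), summing hessians per sample into the histograms is wasted work. A minimal standalone sketch of the idea, using illustrative types and function names rather than the LightGBM API:

#include <cstddef>
#include <vector>

// Illustrative histogram entry, mirroring the sum_gradients / sum_hessians / cnt layout.
struct Entry {
  double sum_gradients = 0.0;
  double sum_hessians = 0.0;
  int cnt = 0;
};

// General case: accumulate gradient and hessian for every sample.
void BuildHistogram(const std::vector<int>& bins, const std::vector<double>& grad,
                    const std::vector<double>& hess, std::vector<Entry>& out) {
  for (std::size_t i = 0; i < bins.size(); ++i) {
    out[bins[i]].sum_gradients += grad[i];
    out[bins[i]].sum_hessians += hess[i];
    ++out[bins[i]].cnt;
  }
}

// Constant-hessian case: drop the per-sample hessian adds and recover
// sum_hessians from the bin counts afterwards.
void BuildHistogramConstHess(const std::vector<int>& bins, const std::vector<double>& grad,
                             double const_hess, std::vector<Entry>& out) {
  for (std::size_t i = 0; i < bins.size(); ++i) {
    out[bins[i]].sum_gradients += grad[i];
    ++out[bins[i]].cnt;
  }
  for (Entry& e : out) {
    e.sum_hessians = e.cnt * const_hess;  // one multiply per bin instead of one add per sample
  }
}

The concrete version of this appears below: the gradient-only ConstructHistogram overloads in bin.h and dense_bin.hpp, plus the "fixed hessian" loop in Dataset::ConstructHistograms.
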
25 changes: 25 additions & 0 deletions include/LightGBM/bin.h
@@ -227,6 +227,16 @@ class OrderedBin {
virtual void ConstructHistogram(int leaf, const score_t* gradients,
const score_t* hessians, HistogramBinEntry* out) const = 0;

/*!
* \brief Construct histogram by using this bin
* Note: Unlike Bin, OrderedBin doesn't use ordered gradients and ordered hessians.
* Because it is hard to know the relative index in one leaf for sparse bin, since we skipped zero bins.
* \param leaf Using which leaf's data to construct
* \param gradients Gradients, Note: non-ordered by leaf
* \param out Output Result
*/
virtual void ConstructHistogram(int leaf, const score_t* gradients, HistogramBinEntry* out) const = 0;

/*!
* \brief Split current bin, and perform re-order by leaf
* \param leaf Using which leaf's to split
@@ -323,6 +333,21 @@ class Bin {
const score_t* ordered_gradients, const score_t* ordered_hessians,
HistogramBinEntry* out) const = 0;

/*!
* \brief Construct histogram of this feature,
* Note: We use ordered_gradients and ordered_hessians to improve cache hit chance
* The naive solution is using gradients[data_indices[i]] for data_indices[i] to get gradients, which is not cache friendly, since the access of memory is not continuous.
* ordered_gradients and ordered_hessians are preprocessed, and they are re-ordered by data_indices.
* Ordered_gradients[i] is aligned with data_indices[i]'s gradients (same for ordered_hessians).
* \param data_indices Used data indices in current leaf
* \param num_data Number of used data
* \param ordered_gradients Pointer to gradients, the data_indices[i]-th data's gradient is ordered_gradients[i]
* \param out Output Result
*/
virtual void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data,
const score_t* ordered_gradients, HistogramBinEntry* out) const = 0;

/*!
* \brief Split data according to threshold, if bin <= threshold, will put into left(lte_indices), else put into right(gt_indices)
* \param min_bin min_bin of current used feature
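
The comment above describes the ordered-gradient trick in prose; as a standalone sketch (hypothetical names, assuming a float score type), the gather step is a single pass that turns the later random accesses into sequential reads:

#include <cstddef>
#include <vector>

// Gather gradients once so that ordered_grad[i] belongs to data_indices[i];
// the histogram loop then reads ordered_grad sequentially instead of
// jumping around gradients[] through data_indices[].
void GatherOrderedGradients(const std::vector<int>& data_indices,
                            const std::vector<float>& gradients,
                            std::vector<float>& ordered_grad) {
  ordered_grad.resize(data_indices.size());
  for (std::size_t i = 0; i < data_indices.size(); ++i) {
    ordered_grad[i] = gradients[data_indices[i]];
  }
}

Dataset::ConstructHistograms in this commit does exactly this gather (see src/io/dataset.cpp below), and skips filling the ordered hessian buffer when the hessian is constant.
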
25 changes: 12 additions & 13 deletions include/LightGBM/dataset.h
@@ -386,23 +386,22 @@ class Dataset {

LIGHTGBM_EXPORT void CreateValid(const Dataset* dataset);

void ConstructHistograms(
const std::vector<int8_t>& is_feature_used,
const data_size_t* data_indices, data_size_t num_data,
int leaf_idx,
std::vector<std::unique_ptr<OrderedBin>>& ordered_bins,
const score_t* gradients, const score_t* hessians,
score_t* ordered_gradients, score_t* ordered_hessians,
HistogramBinEntry* histogram_data) const;
void ConstructHistograms(const std::vector<int8_t>& is_feature_used,
const data_size_t* data_indices, data_size_t num_data,
int leaf_idx,
std::vector<std::unique_ptr<OrderedBin>>& ordered_bins,
const score_t* gradients, const score_t* hessians,
score_t* ordered_gradients, score_t* ordered_hessians,
bool is_constant_hessian,
HistogramBinEntry* histogram_data) const;

void FixHistogram(int feature_idx, double sum_gradient, double sum_hessian, data_size_t num_data,
HistogramBinEntry* data) const;

inline data_size_t Split(
int feature,
uint32_t threshold,
data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const {
inline data_size_t Split(int feature,
uint32_t threshold,
data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const {
const int group = feature2group_[feature];
const int sub_feature = feature2subfeature_[feature];
return feature_groups_[group]->Split(sub_feature, threshold, data_indices, num_data, lte_indices, gt_indices);
2 changes: 2 additions & 0 deletions include/LightGBM/objective_function.h
@@ -33,6 +33,8 @@ class ObjectiveFunction {

virtual const char* GetName() const = 0;

virtual bool IsConstantHessian() const { return false; }

ObjectiveFunction() = default;
/*! \brief Disable copy */
ObjectiveFunction& operator=(const ObjectiveFunction&) = delete;
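
For context, a hedged sketch of how an objective might report a constant hessian through the new hook (an illustrative class, not the actual LightGBM regression objective): an unweighted squared-error loss has hessian 1.0 for every sample, while per-sample weights make the hessian vary, which is presumably the weighted case the commit message says was fixed.

// Hypothetical objective sketch: for squared error, gradient = score - label and
// hessian = 1, so the hessian is constant only when no per-sample weights are used.
class SketchL2Objective {  // would derive from LightGBM::ObjectiveFunction
 public:
  explicit SketchL2Objective(const float* weights) : weights_(weights) {}

  void GetGradients(const double* score, const float* label, int num_data,
                    float* gradients, float* hessians) const {
    for (int i = 0; i < num_data; ++i) {
      gradients[i] = static_cast<float>(score[i] - label[i]);
      hessians[i] = 1.0f;
      if (weights_ != nullptr) {
        gradients[i] *= weights_[i];
        hessians[i] = weights_[i];
      }
    }
  }

  bool IsConstantHessian() const { return weights_ == nullptr; }

 private:
  const float* weights_;  // nullptr when the dataset has no sample weights
};
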
3 changes: 2 additions & 1 deletion include/LightGBM/tree_learner.h
@@ -39,9 +39,10 @@ class TreeLearner {
* \brief training tree model on dataset
* \param gradients The first order gradients
* \param hessians The second order gradients
* \param is_constant_hessian True if all hessians share the same value
* \return A trained tree
*/
virtual Tree* Train(const score_t* gradients, const score_t* hessians) = 0;
virtual Tree* Train(const score_t* gradients, const score_t* hessians, bool is_constant_hessian) = 0;

/*!
* \brief use an existing tree to fit the new gradients and hessians.
8 changes: 6 additions & 2 deletions src/boosting/gbdt.cpp
@@ -82,7 +82,11 @@ void GBDT::ResetTrainingData(const BoostingConfig* config, const Dataset* train_
shrinkage_rate_ = new_config->learning_rate;

object_function_ = object_function;

if (object_function_ != nullptr) {
is_constant_hessian_ = object_function_->IsConstantHessian();
} else {
is_constant_hessian_ = false;
}
sigmoid_ = -1.0f;
if (object_function_ != nullptr
&& (std::string(object_function_->GetName()) == std::string("binary")
@@ -408,7 +412,7 @@ bool GBDT::TrainOneIter(const score_t* gradient, const score_t* hessian, bool is
std::unique_ptr<Tree> new_tree(new Tree(2));
if (class_need_train_[curr_class]) {
new_tree.reset(
tree_learner_->Train(gradient + curr_class * num_data_, hessian + curr_class * num_data_));
tree_learner_->Train(gradient + curr_class * num_data_, hessian + curr_class * num_data_, is_constant_hessian_));
}
#ifdef TIMETAG
tree_time += std::chrono::steady_clock::now() - start_time;
1 change: 1 addition & 0 deletions src/boosting/gbdt.h
@@ -345,6 +345,7 @@ class GBDT: public Boosting {
bool boost_from_average_;
std::vector<bool> class_need_train_;
std::vector<double> class_default_output_;
bool is_constant_hessian_;
};

} // namespace LightGBM
141 changes: 95 additions & 46 deletions src/io/dataset.cpp
@@ -401,70 +401,119 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
}
}

void Dataset::ConstructHistograms(
const std::vector<int8_t>& is_feature_used,
const data_size_t* data_indices, data_size_t num_data,
int leaf_idx,
std::vector<std::unique_ptr<OrderedBin>>& ordered_bins,
const score_t* gradients, const score_t* hessians,
score_t* ordered_gradients, score_t* ordered_hessians,
HistogramBinEntry* hist_data) const {
void Dataset::ConstructHistograms(const std::vector<int8_t>& is_feature_used,
const data_size_t* data_indices, data_size_t num_data,
int leaf_idx,
std::vector<std::unique_ptr<OrderedBin>>& ordered_bins,
const score_t* gradients, const score_t* hessians,
score_t* ordered_gradients, score_t* ordered_hessians,
bool is_constant_hessian,
HistogramBinEntry* hist_data) const {

if (leaf_idx < 0 || num_data <= 0 || hist_data == nullptr) {
return;
}
auto ptr_ordered_grad = gradients;
auto ptr_ordered_hess = hessians;
if (data_indices != nullptr && num_data < num_data_) {
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data; ++i) {
ordered_gradients[i] = gradients[data_indices[i]];
ordered_hessians[i] = hessians[data_indices[i]];
if (!is_constant_hessian) {
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data; ++i) {
ordered_gradients[i] = gradients[data_indices[i]];
ordered_hessians[i] = hessians[data_indices[i]];
}
} else {
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data; ++i) {
ordered_gradients[i] = gradients[data_indices[i]];
}
}
ptr_ordered_grad = ordered_gradients;
ptr_ordered_hess = ordered_hessians;
}
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
for (int group = 0; group < num_groups_; ++group) {
OMP_LOOP_EX_BEGIN();
bool is_groud_used = false;
const int f_cnt = group_feature_cnt_[group];
for (int j = 0; j < f_cnt; ++j) {
const int fidx = group_feature_start_[group] + j;
if (is_feature_used[fidx]) {
is_groud_used = true;
break;
if (!is_constant_hessian) {
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
for (int group = 0; group < num_groups_; ++group) {
OMP_LOOP_EX_BEGIN();
bool is_groud_used = false;
const int f_cnt = group_feature_cnt_[group];
for (int j = 0; j < f_cnt; ++j) {
const int fidx = group_feature_start_[group] + j;
if (is_feature_used[fidx]) {
is_groud_used = true;
break;
}
}
if (!is_groud_used) { continue; }
// feature is not used
auto data_ptr = hist_data + group_bin_boundaries_[group];
const int num_bin = feature_groups_[group]->num_total_bin_;
std::memset(data_ptr + 1, 0, (num_bin - 1) * sizeof(HistogramBinEntry));
// construct histograms for smaller leaf
if (ordered_bins[group] == nullptr) {
// if not use ordered bin
feature_groups_[group]->bin_data_->ConstructHistogram(
data_indices,
num_data,
ptr_ordered_grad,
ptr_ordered_hess,
data_ptr);
} else {
// used ordered bin
ordered_bins[group]->ConstructHistogram(leaf_idx,
gradients,
hessians,
data_ptr);
}
OMP_LOOP_EX_END();
}
if (!is_groud_used) { continue; }
// feature is not used
auto data_ptr = hist_data + group_bin_boundaries_[group];
const int num_bin = feature_groups_[group]->num_total_bin_;
std::memset(data_ptr + 1, 0, (num_bin - 1) * sizeof(HistogramBinEntry));
// construct histograms for smaller leaf
if (ordered_bins[group] == nullptr) {
// if not use ordered bin
feature_groups_[group]->bin_data_->ConstructHistogram(
data_indices,
num_data,
ptr_ordered_grad,
ptr_ordered_hess,
data_ptr);
} else {
// used ordered bin
ordered_bins[group]->ConstructHistogram(leaf_idx,
gradients,
hessians,
data_ptr);
OMP_THROW_EX();
} else {
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
for (int group = 0; group < num_groups_; ++group) {
OMP_LOOP_EX_BEGIN();
bool is_groud_used = false;
const int f_cnt = group_feature_cnt_[group];
for (int j = 0; j < f_cnt; ++j) {
const int fidx = group_feature_start_[group] + j;
if (is_feature_used[fidx]) {
is_groud_used = true;
break;
}
}
if (!is_groud_used) { continue; }
// feature is not used
auto data_ptr = hist_data + group_bin_boundaries_[group];
const int num_bin = feature_groups_[group]->num_total_bin_;
std::memset(data_ptr + 1, 0, (num_bin - 1) * sizeof(HistogramBinEntry));
// construct histograms for smaller leaf
if (ordered_bins[group] == nullptr) {
// if not use ordered bin
feature_groups_[group]->bin_data_->ConstructHistogram(
data_indices,
num_data,
ptr_ordered_grad,
data_ptr);
} else {
// used ordered bin
ordered_bins[group]->ConstructHistogram(leaf_idx,
gradients,
data_ptr);
}
// fixed hessian.
for (int i = 0; i < num_bin; ++i) {
data_ptr[i].sum_hessians = data_ptr[i].cnt * hessians[0];
}
OMP_LOOP_EX_END();
}
OMP_LOOP_EX_END();
OMP_THROW_EX();
}
OMP_THROW_EX();
}

void Dataset::FixHistogram(int feature_idx, double sum_gradient, double sum_hessian, data_size_t num_data,
HistogramBinEntry* data) const {
HistogramBinEntry* data) const {
const int group = feature2group_[feature_idx];
const int sub_feature = feature2subfeature_[feature_idx];
const BinMapper* bin_mapper = feature_groups_[group]->bin_mappers_[sub_feature].get();
63 changes: 59 additions & 4 deletions src/io/dense_bin.hpp
@@ -13,7 +13,7 @@ template <typename VAL_T>
class DenseBin;

template <typename VAL_T>
class DenseBinIterator : public BinIterator {
class DenseBinIterator: public BinIterator {
public:
explicit DenseBinIterator(const DenseBin<VAL_T>* bin_data, uint32_t min_bin, uint32_t max_bin, uint32_t default_bin)
: bin_data_(bin_data), min_bin_(static_cast<VAL_T>(min_bin)),
@@ -39,7 +39,7 @@ class DenseBinIterator : public BinIterator {
* Use template to reduce memory cost
*/
template <typename VAL_T>
class DenseBin : public Bin {
class DenseBin: public Bin {
public:
friend DenseBinIterator<VAL_T>;
DenseBin(data_size_t num_data)
@@ -63,8 +63,8 @@ class DenseBin : public Bin {
BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) const override;

void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data,
const score_t* ordered_gradients, const score_t* ordered_hessians,
HistogramBinEntry* out) const override {
const score_t* ordered_gradients, const score_t* ordered_hessians,
HistogramBinEntry* out) const override {
// use 4-way unrolling, will be faster
if (data_indices != nullptr) { // if use part of data
const data_size_t rest = num_data & 0x3;
@@ -129,6 +129,61 @@ class DenseBin : public Bin {
}
}

void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data,
const score_t* ordered_gradients,
HistogramBinEntry* out) const override {
// use 4-way unrolling, will be faster
if (data_indices != nullptr) { // if use part of data
const data_size_t rest = num_data & 0x3;
data_size_t i = 0;
for (; i < num_data - rest; i += 4) {
const VAL_T bin0 = data_[data_indices[i]];
const VAL_T bin1 = data_[data_indices[i + 1]];
const VAL_T bin2 = data_[data_indices[i + 2]];
const VAL_T bin3 = data_[data_indices[i + 3]];

out[bin0].sum_gradients += ordered_gradients[i];
out[bin1].sum_gradients += ordered_gradients[i + 1];
out[bin2].sum_gradients += ordered_gradients[i + 2];
out[bin3].sum_gradients += ordered_gradients[i + 3];

++out[bin0].cnt;
++out[bin1].cnt;
++out[bin2].cnt;
++out[bin3].cnt;
}
for (; i < num_data; ++i) {
const VAL_T bin = data_[data_indices[i]];
out[bin].sum_gradients += ordered_gradients[i];
++out[bin].cnt;
}
} else { // use full data
const data_size_t rest = num_data & 0x3;
data_size_t i = 0;
for (; i < num_data - rest; i += 4) {
const VAL_T bin0 = data_[i];
const VAL_T bin1 = data_[i + 1];
const VAL_T bin2 = data_[i + 2];
const VAL_T bin3 = data_[i + 3];

out[bin0].sum_gradients += ordered_gradients[i];
out[bin1].sum_gradients += ordered_gradients[i + 1];
out[bin2].sum_gradients += ordered_gradients[i + 2];
out[bin3].sum_gradients += ordered_gradients[i + 3];

++out[bin0].cnt;
++out[bin1].cnt;
++out[bin2].cnt;
++out[bin3].cnt;
}
for (; i < num_data; ++i) {
const VAL_T bin = data_[i];
out[bin].sum_gradients += ordered_gradients[i];
++out[bin].cnt;
}
}
}

virtual data_size_t Split(
uint32_t min_bin, uint32_t max_bin, uint32_t default_bin,
uint32_t threshold, data_size_t* data_indices, data_size_t num_data,
