Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[c++][fix] Support Quantized Training with Categorical Features on CPU #6301

Merged
merged 13 commits into the base branch from the source branch (branch names not captured in this scrape) on
Feb 23, 2024
Merged
1 change: 1 addition & 0 deletions R-package/src/Makevars.in
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ OBJECTS = \
network/linkers_socket.o \
network/network.o \
treelearner/data_parallel_tree_learner.o \
treelearner/feature_histogram.o \
treelearner/feature_parallel_tree_learner.o \
treelearner/gpu_tree_learner.o \
treelearner/gradient_discretizer.o \
Expand Down
1 change: 1 addition & 0 deletions R-package/src/Makevars.win.in
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ OBJECTS = \
network/linkers_socket.o \
network/network.o \
treelearner/data_parallel_tree_learner.o \
treelearner/feature_histogram.o \
treelearner/feature_parallel_tree_learner.o \
treelearner/gpu_tree_learner.o \
treelearner/gradient_discretizer.o \
Expand Down
8 changes: 5 additions & 3 deletions src/io/train_share_states.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,15 +62,17 @@ void MultiValBinWrapper::HistMove(const std::vector<hist_t,
reinterpret_cast<int64_t*>(origin_hist_data_) + hist_move_dest_[i] / 2);
}
} else if (HIST_BITS == 16) {
const int32_t* src = reinterpret_cast<const int32_t*>(hist_buf.data()) + hist_buf.size() / 2 -
static_cast<size_t>(num_bin_aligned_);
if (is_use_subcol_) {
const int32_t* src = reinterpret_cast<const int32_t*>(hist_buf.data()) + hist_buf.size() / 2 -
static_cast<size_t>(num_bin_aligned_);
#pragma omp parallel for schedule(static) num_threads(num_threads_)
for (int i = 0; i < static_cast<int>(hist_move_src_.size()); ++i) {
std::copy_n(src + hist_move_src_[i] / 2, hist_move_size_[i] / 2,
reinterpret_cast<int32_t*>(origin_hist_data_) + hist_move_dest_[i] / 2);
}
} else {
CHECK_EQ(INNER_HIST_BITS, 8);
const int32_t* src = reinterpret_cast<const int32_t*>(hist_buf.data()) + hist_buf.size() / 2;
int32_t* orig_ptr = reinterpret_cast<int32_t*>(origin_hist_data_);
#pragma omp parallel for schedule(static) num_threads(num_threads_)
for (int i = 0; i < num_bin_; ++i) {
Expand Down Expand Up @@ -148,7 +150,7 @@ void MultiValBinWrapper::HistMerge(std::vector<hist_t,
}
}
} else if (HIST_BITS == 16 && INNER_HIST_BITS == 8) {
int32_t* dst = reinterpret_cast<int32_t*>(hist_buf->data()) + hist_buf->size() / 2 - static_cast<size_t>(num_bin_aligned_);
int32_t* dst = reinterpret_cast<int32_t*>(hist_buf->data()) + hist_buf->size() / 2;
std::memset(reinterpret_cast<void*>(dst), 0, num_bin_ * kInt16HistBufferEntrySize);
#pragma omp parallel for schedule(static, 1) num_threads(num_threads_)
for (int t = 0; t < n_bin_block; ++t) {
Expand Down
Loading
Loading