Fix add features (#2754)
* fix subset bug

* typo

* add fixme tag

* bin mapper

* fix test

* fix add_features_from

* Update dataset.cpp

* fix merge bug

* added Python merge code

* added test for add_features

* Update dataset.cpp

* Update src/io/dataset.cpp

* continue implementing

* warn users about categorical features

Co-authored-by: StrikerRUS <nekit94-12@hotmail.com>
Co-authored-by: Nikita Titov <nekit94-08@mail.ru>
3 people committed Oct 26, 2020
1 parent ceb6265 commit 53977f3
Showing 4 changed files with 306 additions and 117 deletions.
97 changes: 58 additions & 39 deletions include/LightGBM/feature_group.h
@@ -1,6 +1,7 @@
/*!
* Copyright (c) 2017 Microsoft Corporation. All rights reserved.
- * Licensed under the MIT License. See LICENSE file in the project root for license information.
+ * Licensed under the MIT License. See LICENSE file in the project root for
+ * license information.
*/
#ifndef LIGHTGBM_FEATURE_GROUP_H_
#define LIGHTGBM_FEATURE_GROUP_H_
@@ -17,7 +18,8 @@ namespace LightGBM {

class Dataset;
class DatasetLoader;
-/*! \brief Using to store data and providing some operations on one feature group*/
+/*! \brief Using to store data and providing some operations on one feature
+ * group*/
class FeatureGroup {
public:
friend Dataset;
@@ -83,13 +85,13 @@ class FeatureGroup {
}

/*!
-  * \brief Constructor from memory
-  * \param memory Pointer of memory
-  * \param num_all_data Number of global data
-  * \param local_used_indices Local used indices, empty means using all data
-  */
+   * \brief Constructor from memory
+   * \param memory Pointer of memory
+   * \param num_all_data Number of global data
+   * \param local_used_indices Local used indices, empty means using all data
+   */
FeatureGroup(const void* memory, data_size_t num_all_data,
-      const std::vector<data_size_t>& local_used_indices) {
+               const std::vector<data_size_t>& local_used_indices) {
const char* memory_ptr = reinterpret_cast<const char*>(memory);
// get is_sparse
is_multi_val_ = *(reinterpret_cast<const bool*>(memory_ptr));
@@ -122,9 +124,11 @@ class FeatureGroup {
for (int i = 0; i < num_feature_; ++i) {
int addi = bin_mappers_[i]->GetMostFreqBin() == 0 ? 0 : 1;
if (bin_mappers_[i]->sparse_rate() >= kSparseThreshold) {
-        multi_bin_data_.emplace_back(Bin::CreateSparseBin(num_data, bin_mappers_[i]->num_bin() + addi));
+        multi_bin_data_.emplace_back(Bin::CreateSparseBin(
+            num_data, bin_mappers_[i]->num_bin() + addi));
      } else {
-        multi_bin_data_.emplace_back(Bin::CreateDenseBin(num_data, bin_mappers_[i]->num_bin() + addi));
+        multi_bin_data_.emplace_back(
+            Bin::CreateDenseBin(num_data, bin_mappers_[i]->num_bin() + addi));
}
multi_bin_data_.back()->LoadFromMemory(memory_ptr, local_used_indices);
memory_ptr += multi_bin_data_.back()->SizesInByte();
@@ -141,18 +145,20 @@
}

/*! \brief Destructor */
-  ~FeatureGroup() {
-  }
+  ~FeatureGroup() {}

/*!
-  * \brief Push one record, will auto convert to bin and push to bin data
-  * \param tid Thread id
-  * \param idx Index of record
-  * \param value feature value of record
-  */
-  inline void PushData(int tid, int sub_feature_idx, data_size_t line_idx, double value) {
+   * \brief Push one record, will auto convert to bin and push to bin data
+   * \param tid Thread id
+   * \param idx Index of record
+   * \param value feature value of record
+   */
+  inline void PushData(int tid, int sub_feature_idx, data_size_t line_idx,
+                       double value) {
uint32_t bin = bin_mappers_[sub_feature_idx]->ValueToBin(value);
-    if (bin == bin_mappers_[sub_feature_idx]->GetMostFreqBin()) { return; }
+    if (bin == bin_mappers_[sub_feature_idx]->GetMostFreqBin()) {
+      return;
+    }
if (bin_mappers_[sub_feature_idx]->GetMostFreqBin() == 0) {
bin -= 1;
}
@@ -184,6 +190,23 @@ class FeatureGroup {
}
}

+  void AddFeaturesFrom(const FeatureGroup* other) {
+    CHECK(is_multi_val_);
+    CHECK(other->is_multi_val_);
+    for (int i = 0; i < other->num_feature_; ++i) {
+      const auto& other_bin_mapper = other->bin_mappers_[i];
+      bin_mappers_.emplace_back(new BinMapper(*other_bin_mapper));
+      auto num_bin = other_bin_mapper->num_bin();
+      if (other_bin_mapper->GetMostFreqBin() == 0) {
+        num_bin -= 1;
+      }
+      num_total_bin_ += num_bin;
+      bin_offsets_.emplace_back(num_total_bin_);
+      multi_bin_data_.emplace_back(other->multi_bin_data_[i]->Clone());
+    }
+    num_feature_ += other->num_feature_;
+  }
+
inline BinIterator* SubFeatureIterator(int sub_feature) {
uint32_t most_freq_bin = bin_mappers_[sub_feature]->GetMostFreqBin();
if (!is_multi_val_) {
@@ -194,14 +217,15 @@
int addi = bin_mappers_[sub_feature]->GetMostFreqBin() == 0 ? 0 : 1;
uint32_t min_bin = 1;
uint32_t max_bin = bin_mappers_[sub_feature]->num_bin() - 1 + addi;
-      return multi_bin_data_[sub_feature]->GetIterator(min_bin, max_bin, most_freq_bin);
+      return multi_bin_data_[sub_feature]->GetIterator(min_bin, max_bin,
+                                                       most_freq_bin);
}
}

inline void FinishLoad() {
if (is_multi_val_) {
OMP_INIT_EX();
-#pragma omp parallel for schedule(guided)
+      #pragma omp parallel for schedule(guided)
for (int i = 0; i < num_feature_; ++i) {
OMP_LOOP_EX_BEGIN();
multi_bin_data_[i]->FinishLoad();
@@ -213,11 +237,6 @@
}
}

-  /*!
-  * \brief Returns a BinIterator that can access the entire feature group's raw data.
-  * The RawGet() function of the iterator should be called for best efficiency.
-  * \return A pointer to the BinIterator object
-  */
inline BinIterator* FeatureGroupIterator() {
if (is_multi_val_) {
return nullptr;
@@ -288,18 +307,18 @@ class FeatureGroup {
}

/*!
-  * \brief From bin to feature value
-  * \param bin
-  * \return FeatureGroup value of this bin
-  */
+   * \brief From bin to feature value
+   * \param bin
+   * \return FeatureGroup value of this bin
+   */
inline double BinToValue(int sub_feature_idx, uint32_t bin) const {
return bin_mappers_[sub_feature_idx]->BinToValue(bin);
}

/*!
-  * \brief Save binary data to file
-  * \param file File want to write
-  */
+   * \brief Save binary data to file
+   * \param file File want to write
+   */
void SaveBinaryToFile(const VirtualFileWriter* writer) const {
writer->AlignedWrite(&is_multi_val_, sizeof(is_multi_val_));
writer->AlignedWrite(&is_sparse_, sizeof(is_sparse_));
@@ -317,8 +336,8 @@
}

/*!
-  * \brief Get sizes in byte of this object
-  */
+   * \brief Get sizes in byte of this object
+   */
size_t SizesInByte() const {
size_t ret = VirtualFileWriter::AlignedSize(sizeof(is_multi_val_)) +
VirtualFileWriter::AlignedSize(sizeof(is_sparse_)) +
@@ -377,8 +396,9 @@ class FeatureGroup {
}
is_multi_val_ = true;
} else {
-    if (force_sparse || (!force_dense && num_feature_ == 1 &&
-        bin_mappers_[0]->sparse_rate() >= kSparseThreshold)) {
+    if (force_sparse ||
+        (!force_dense && num_feature_ == 1 &&
+         bin_mappers_[0]->sparse_rate() >= kSparseThreshold)) {
is_sparse_ = true;
bin_data_.reset(Bin::CreateSparseBin(num_data, num_total_bin_));
} else {
@@ -404,7 +424,6 @@
int num_total_bin_;
};

-
} // namespace LightGBM

-#endif  // LIGHTGBM_FEATURE_GROUP_H_
+#endif  // LIGHTGBM_FEATURE_GROUP_H_
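
To make the bin-offset bookkeeping in the new AddFeaturesFrom concrete, here is a minimal Python sketch (illustrative only, not LightGBM code; all names are hypothetical): a merged feature whose most frequent bin is 0 stores that bin implicitly, so it contributes one bin fewer to the cumulative offsets.

# Hypothetical sketch of the offset math in FeatureGroup::AddFeaturesFrom.
def merge_bin_offsets(bin_offsets, num_total_bin, other_features):
    # other_features: (num_bin, most_freq_bin) pairs for the incoming bin mappers.
    for num_bin, most_freq_bin in other_features:
        if most_freq_bin == 0:
            num_bin -= 1  # bin 0 is implicit and takes no offset slot
        num_total_bin += num_bin
        bin_offsets.append(num_total_bin)
    return bin_offsets, num_total_bin

# Example: absorbing two 4-bin features, one with most_freq_bin == 0
# (counts as 3 bins) and one without (counts as 4):
offsets, total = merge_bin_offsets([1, 5], 5, [(4, 0), (4, 2)])
assert offsets == [1, 5, 8, 12] and total == 12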
70 changes: 70 additions & 0 deletions python-package/lightgbm/basic.py
@@ -1904,6 +1904,76 @@ def add_features_from(self, other):
if self.handle is None or other.handle is None:
raise ValueError('Both source and target Datasets must be constructed before adding features')
_safe_call(_LIB.LGBM_DatasetAddFeaturesFrom(self.handle, other.handle))
+        was_none = self.data is None
+        old_self_data_type = type(self.data).__name__
+        if other.data is None:
+            self.data = None
+        elif self.data is not None:
+            if isinstance(self.data, np.ndarray):
+                if isinstance(other.data, np.ndarray):
+                    self.data = np.hstack((self.data, other.data))
+                elif scipy.sparse.issparse(other.data):
+                    self.data = np.hstack((self.data, other.data.toarray()))
+                elif isinstance(other.data, DataFrame):
+                    self.data = np.hstack((self.data, other.data.values))
+                elif isinstance(other.data, DataTable):
+                    self.data = np.hstack((self.data, other.data.to_numpy()))
+                else:
+                    self.data = None
+            elif scipy.sparse.issparse(self.data):
+                sparse_format = self.data.getformat()
+                if isinstance(other.data, np.ndarray) or scipy.sparse.issparse(other.data):
+                    self.data = scipy.sparse.hstack((self.data, other.data), format=sparse_format)
+                elif isinstance(other.data, DataFrame):
+                    self.data = scipy.sparse.hstack((self.data, other.data.values), format=sparse_format)
+                elif isinstance(other.data, DataTable):
+                    self.data = scipy.sparse.hstack((self.data, other.data.to_numpy()), format=sparse_format)
+                else:
+                    self.data = None
+            elif isinstance(self.data, DataFrame):
+                if not PANDAS_INSTALLED:
+                    raise LightGBMError("Cannot add features to DataFrame type of raw data "
+                                        "without pandas installed")
+                from pandas import concat
+                if isinstance(other.data, np.ndarray):
+                    self.data = concat((self.data, DataFrame(other.data)),
+                                       axis=1, ignore_index=True)
+                elif scipy.sparse.issparse(other.data):
+                    self.data = concat((self.data, DataFrame(other.data.toarray())),
+                                       axis=1, ignore_index=True)
+                elif isinstance(other.data, DataFrame):
+                    self.data = concat((self.data, other.data),
+                                       axis=1, ignore_index=True)
+                elif isinstance(other.data, DataTable):
+                    self.data = concat((self.data, DataFrame(other.data.to_numpy())),
+                                       axis=1, ignore_index=True)
+                else:
+                    self.data = None
+            elif isinstance(self.data, DataTable):
+                if isinstance(other.data, np.ndarray):
+                    self.data = DataTable(np.hstack((self.data.to_numpy(), other.data)))
+                elif scipy.sparse.issparse(other.data):
+                    self.data = DataTable(np.hstack((self.data.to_numpy(), other.data.toarray())))
+                elif isinstance(other.data, DataFrame):
+                    self.data = DataTable(np.hstack((self.data.to_numpy(), other.data.values)))
+                elif isinstance(other.data, DataTable):
+                    self.data = DataTable(np.hstack((self.data.to_numpy(), other.data.to_numpy())))
+                else:
+                    self.data = None
+        else:
+            self.data = None
+        if self.data is None:
+            err_msg = ("Cannot add features from {} type of raw data to "
+                       "{} type of raw data.\n").format(type(other.data).__name__,
+                                                        old_self_data_type)
+            err_msg += ("Set free_raw_data=False when constructing Dataset to avoid this"
+                        if was_none else "Freeing raw data")
+            warnings.warn(err_msg)
+        self.feature_name = self.get_feature_name()
+        warnings.warn("Resetting categorical features.\n"
+                      "You can set new categorical features via ``set_categorical_feature`` method")
+        self.categorical_feature = "auto"
+        self.pandas_categorical = None
+        return self

def _dump_text(self, filename):
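
For context, a minimal usage sketch of the add_features_from merge added above (hypothetical data; assumes both Datasets are constructed with free_raw_data=False so the raw matrices can be stacked as well):

import numpy as np
import lightgbm as lgb

X_main = np.random.rand(100, 3)   # hypothetical feature matrices
X_extra = np.random.rand(100, 2)
y = np.random.rand(100)

d_main = lgb.Dataset(X_main, label=y, free_raw_data=False).construct()
d_extra = lgb.Dataset(X_extra, free_raw_data=False).construct()

d_main.add_features_from(d_extra)
# d_main now holds 5 features; per the warning above, categorical features
# are reset and can be re-specified via set_categorical_feature().
print(d_main.num_feature())  # 5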
