Skip to content

Commit

Permalink
fix bug in feature fraction (#1099)
Browse files Browse the repository at this point in the history
* fix feature fraction

* fix bugs.
  • Loading branch information
guolinke committed Dec 4, 2017
1 parent a957bd6 commit 699d438
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 6 deletions.
10 changes: 9 additions & 1 deletion include/LightGBM/dataset.h
Original file line number Diff line number Diff line change
Expand Up @@ -361,7 +361,15 @@ class Dataset {
/*! \brief Total number of bins over all feature groups (last cumulative group-bin boundary). */
inline uint64_t NumTotalBin() const {
  const uint64_t total_bins = group_bin_boundaries_.back();
  return total_bins;
}

inline std::vector<int> ValidFeatureIndices() const {
std::vector<int> ret;
for (int i = 0; i < num_total_features_; ++i) {
if (used_feature_map_[i] >= 0) {
ret.push_back(i);
}
}
return ret;
}
void ReSize(data_size_t num_data);

void CopySubset(const Dataset* fullset, const data_size_t* used_indices, data_size_t num_used_indices, bool need_meta_data);
Expand Down
14 changes: 9 additions & 5 deletions src/treelearner/serial_tree_learner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian
// initialize data partition
data_partition_.reset(new DataPartition(num_data_, tree_config_->num_leaves));
is_feature_used_.resize(num_features_);
valid_feature_indices_ = train_data_->ValidFeatureIndices();
// initialize ordered gradients and hessians
ordered_gradients_.resize(num_data_);
ordered_hessians_.resize(num_data_);
Expand Down Expand Up @@ -237,16 +238,19 @@ void SerialTreeLearner::BeforeTrain() {
histogram_pool_.ResetMap();

if (tree_config_->feature_fraction < 1) {
int used_feature_cnt = static_cast<int>(train_data_->num_total_features()*tree_config_->feature_fraction);
int used_feature_cnt = static_cast<int>(valid_feature_indices_.size()*tree_config_->feature_fraction);
// at least use one feature
used_feature_cnt = std::max(used_feature_cnt, 1);
// initialize used features
std::memset(is_feature_used_.data(), 0, sizeof(int8_t) * num_features_);
// Get used feature at current tree
auto used_feature_indices = random_.Sample(train_data_->num_total_features(), used_feature_cnt);
int omp_loop_size = static_cast<int>(used_feature_indices.size());
auto sampled_indices = random_.Sample(valid_feature_indices_.size(), used_feature_cnt);
int omp_loop_size = static_cast<int>(sampled_indices.size());
#pragma omp parallel for schedule(static, 512) if (omp_loop_size >= 1024)
for (int i = 0; i < omp_loop_size; ++i) {
int inner_feature_index = train_data_->InnerFeatureIndex(used_feature_indices[i]);
if (inner_feature_index < 0) { continue; }
int used_feature = valid_feature_indices_[sampled_indices[i]];
int inner_feature_index = train_data_->InnerFeatureIndex(used_feature);
CHECK(inner_feature_index >= 0);
is_feature_used_[inner_feature_index] = 1;
}
} else {
Expand Down
1 change: 1 addition & 0 deletions src/treelearner/serial_tree_learner.h
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ class SerialTreeLearner: public TreeLearner {
std::unique_ptr<LeafSplits> smaller_leaf_splits_;
/*! \brief stores best thresholds for all feature for larger leaf */
std::unique_ptr<LeafSplits> larger_leaf_splits_;
/*! \brief indices, in the original (total) feature space, of features usable for training; filled from Dataset::ValidFeatureIndices() at Init and sampled from when feature_fraction < 1 */
std::vector<int> valid_feature_indices_;

#ifdef USE_GPU
/*! \brief gradients of current iteration, ordered for cache optimized, aligned to 4K page */
Expand Down

0 comments on commit 699d438

Please sign in to comment.