add per-feature-penalites (#1449)
* add per-feature-penalites

* fix comment
guolinke committed Jun 14, 2018
1 parent dfbb883 commit 0584065
Showing 9 changed files with 79 additions and 1 deletion.
6 changes: 6 additions & 0 deletions docs/Parameters.rst
@@ -347,6 +347,12 @@ Learning Control Parameters

- you need to specify all features in order. For example, ``mc=-1,0,1`` means decreasing for 1st feature, non-constraint for 2nd feature and increasing for the 3rd feature

- ``feature_contri``, default = ``None``, type = multi-double, aliases: ``fc``, ``fp``, ``feature_penalty``

- used to control a feature's split gain; ``gain[i] = max(0, feature_contri[i]) * gain[i]`` is used in place of the raw split gain of the ``i``-th feature

- you need to specify all features in order (a hypothetical config sketch follows this file's diff)

- ``forcedsplits_filename``, default = ``""``, type = string, aliases: ``fs``, ``forced_splits_filename``, ``forced_splits_file``, ``forced_splits``

- path to a ``.json`` file that specifies splits to force at the top of every decision tree before best-first learning commences
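A minimal sketch of how the new parameter might look in a LightGBM config file (the dataset size and values here are hypothetical; the comma-separated form matches the parser added in src/io/config_auto.cpp below):

    # train.conf (hypothetical three-feature dataset)
    objective = regression
    data = train.txt
    # disable the 1st feature, keep the 2nd unchanged, halve the 3rd's split gain
    feature_contri = 0,1,0.5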
7 changes: 7 additions & 0 deletions include/LightGBM/config.h
@@ -353,6 +353,13 @@ struct Config {
// desc = ``1`` means increasing, ``-1`` means decreasing, ``0`` means non-constraint
// desc = you need to specify all features in order. For example, ``mc=-1,0,1`` means decreasing for 1st feature, non-constraint for 2nd feature and increasing for the 3rd feature
std::vector<int8_t> monotone_constraints;

// type = multi-double
// alias = fc, fp, feature_penalty
// default = None
// desc = used to control a feature's split gain; ``gain[i] = max(0, feature_contri[i]) * gain[i]`` is used in place of the raw split gain of the ``i``-th feature
// desc = you need to specify all features in order
std::vector<double> feature_contri;

// alias = fs, forced_splits_filename, forced_splits_file, forced_splits
// desc = path to a ``.json`` file that specifies splits to force at the top of every decision tree before best-first learning commences
9 changes: 9 additions & 0 deletions include/LightGBM/dataset.h
@@ -443,6 +443,14 @@ class Dataset {
    }
  }

  /*! \brief Get the gain multiplier (penalty) of the i-th inner feature; 1.0 when no penalties are configured */
  inline double FeaturePenalte(int i) const {
    if (feature_penalty_.empty()) {
      return 1.0;
    } else {
      return feature_penalty_[i];
    }
  }

  bool HasMonotone() const {
    if (monotone_types_.empty()) {
      return false;
@@ -605,6 +613,7 @@ class Dataset {
  std::vector<int> group_feature_start_;
  std::vector<int> group_feature_cnt_;
  std::vector<int8_t> monotone_types_;
  std::vector<double> feature_penalty_;
  bool is_finish_load_;
};

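To make the accessor's contract concrete (a neutral factor of 1.0 whenever no penalties were configured), here is a minimal standalone sketch; the Dataset wiring is stripped away, and only the fallback logic mirrors the diff:

    #include <cassert>
    #include <vector>

    // Simplified stand-in for Dataset::FeaturePenalte.
    double FeaturePenalte(const std::vector<double>& feature_penalty, int i) {
      if (feature_penalty.empty()) {
        return 1.0;  // no penalties configured: every feature keeps its full gain
      }
      return feature_penalty[i];
    }

    int main() {
      assert(FeaturePenalte({}, 0) == 1.0);          // empty vector -> neutral factor
      assert(FeaturePenalte({0.5, 1.0}, 0) == 0.5);  // penalized feature
      return 0;
    }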
9 changes: 9 additions & 0 deletions include/LightGBM/utils/array_args.h
@@ -176,6 +176,15 @@ class ArrayArgs {
    return true;
  }

  /*! \brief Check whether every element of array equals t (vacuously true for an empty array) */
  inline static bool CheckAll(const std::vector<VAL_T>& array, VAL_T t) {
    for (size_t i = 0; i < array.size(); ++i) {
      if (array[i] != t) {
        return false;
      }
    }
    return true;
  }

};

} // namespace LightGBM
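One subtlety: CheckAll is vacuously true for an empty vector, which is exactly what lets the dataset code clear an all-default penalty vector and keep the no-penalty fast path. A standalone equivalent of the helper, under that reading:

    #include <cassert>
    #include <vector>

    // Standalone equivalent of ArrayArgs<VAL_T>::CheckAll.
    template <typename T>
    bool CheckAll(const std::vector<T>& array, T t) {
      for (size_t i = 0; i < array.size(); ++i) {
        if (array[i] != t) {
          return false;
        }
      }
      return true;  // also reached for an empty array
    }

    int main() {
      assert(CheckAll<double>({1.0, 1.0, 1.0}, 1.0));   // all neutral: vector can be dropped
      assert(!CheckAll<double>({1.0, 0.5, 1.0}, 1.0));  // a real penalty: vector is kept
      assert(CheckAll<double>({}, 1.0));                // vacuously true
      return 0;
    }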
9 changes: 9 additions & 0 deletions src/io/config_auto.cpp
@@ -57,6 +57,9 @@ std::unordered_map<std::string, std::string> Config::alias_table({
{"topk", "top_k"},
{"mc", "monotone_constraints"},
{"monotone_constraint", "monotone_constraints"},
{"fc", "feature_contri"},
{"fp", "feature_contri"},
{"feature_penalty", "feature_contri"},
{"fs", "forcedsplits_filename"},
{"forced_splits_filename", "forcedsplits_filename"},
{"forced_splits_file", "forcedsplits_filename"},
@@ -172,6 +175,7 @@ std::unordered_set<std::string> Config::parameter_set({
"max_cat_to_onehot",
"top_k",
"monotone_constraints",
"feature_contri",
"forcedsplits_filename",
"verbosity",
"max_bin",
@@ -336,6 +340,10 @@ void Config::GetMembersFromString(const std::unordered_map<std::string, std::str
    monotone_constraints = Common::StringToArray<int8_t>(tmp_str, ',');
  }

  if (GetString(params, "feature_contri", &tmp_str)) {
    feature_contri = Common::StringToArray<double>(tmp_str, ',');
  }

GetString(params, "forcedsplits_filename", &forcedsplits_filename);

GetInt(params, "verbosity", &verbosity);
@@ -523,6 +531,7 @@ std::string Config::SaveMembersToString() const {
str_buf << "[max_cat_to_onehot: " << max_cat_to_onehot << "]\n";
str_buf << "[top_k: " << top_k << "]\n";
str_buf << "[monotone_constraints: " << Common::Join(Common::ArrayCast<int8_t, int>(monotone_constraints),",") << "]\n";
str_buf << "[feature_contri: " << Common::Join(feature_contri,",") << "]\n";
str_buf << "[forcedsplits_filename: " << forcedsplits_filename << "]\n";
str_buf << "[verbosity: " << verbosity << "]\n";
str_buf << "[max_bin: " << max_bin << "]\n";
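Common::StringToArray<double> is LightGBM's own parsing helper; as an illustration of what the new branch produces, here is a hypothetical stand-in with the same comma-splitting behavior (the name StringToDoubleArray is invented for this sketch):

    #include <sstream>
    #include <string>
    #include <vector>

    // Hypothetical equivalent of Common::StringToArray<double>(str, ',').
    std::vector<double> StringToDoubleArray(const std::string& str, char delimiter) {
      std::vector<double> result;
      std::stringstream ss(str);
      std::string token;
      while (std::getline(ss, token, delimiter)) {
        result.push_back(std::stod(token));  // one value per feature
      }
      return result;
    }
    // StringToDoubleArray("0,1,0.5", ',') yields {0.0, 1.0, 0.5}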
25 changes: 24 additions & 1 deletion src/io/dataset.cpp
@@ -306,6 +306,19 @@ void Dataset::Construct(
      monotone_types_.clear();
    }
  }
  if (!io_config.feature_contri.empty()) {
    CHECK(static_cast<size_t>(num_total_features_) == io_config.feature_contri.size());
    feature_penalty_.resize(num_features_);
    // map the per-total-feature values onto the inner (actually used) features,
    // clamping negative contributions to zero
    for (int i = 0; i < num_total_features_; ++i) {
      int inner_fidx = InnerFeatureIndex(i);
      if (inner_fidx >= 0) {
        feature_penalty_[inner_fidx] = std::max(0.0, io_config.feature_contri[i]);
      }
    }
    // an all-neutral vector is dropped so the no-penalty fast path is kept
    if (ArrayArgs<double>::CheckAll(feature_penalty_, 1.0)) {
      feature_penalty_.clear();
    }
  }
}

void Dataset::FinishLoad() {
@@ -350,6 +363,7 @@ void Dataset::CopyFeatureMapperFrom(const Dataset* dataset) {
group_feature_start_ = dataset->group_feature_start_;
group_feature_cnt_ = dataset->group_feature_cnt_;
monotone_types_ = dataset->monotone_types_;
feature_penalty_ = dataset->feature_penalty_;
}

void Dataset::CreateValid(const Dataset* dataset) {
@@ -403,6 +417,7 @@ void Dataset::CreateValid(const Dataset* dataset) {
}
}
monotone_types_ = dataset->monotone_types_;
feature_penalty_ = dataset->feature_penalty_;
}

void Dataset::ReSize(data_size_t num_data) {
Expand Down Expand Up @@ -555,7 +570,8 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
// get size of header
size_t size_of_header = sizeof(num_data_) + sizeof(num_features_) + sizeof(num_total_features_)
+ sizeof(int) * num_total_features_ + sizeof(label_idx_) + sizeof(num_groups_)
+ 3 * sizeof(int) * num_features_ + sizeof(uint64_t) * (num_groups_ + 1) + 2 * sizeof(int) * num_groups_ + sizeof(int8_t) * num_features_;
+ 3 * sizeof(int) * num_features_ + sizeof(uint64_t) * (num_groups_ + 1) + 2 * sizeof(int) * num_groups_ + sizeof(int8_t) * num_features_
+ sizeof(double) * num_features_;
// size of feature names
for (int i = 0; i < num_total_features_; ++i) {
size_of_header += feature_names_[i].size() + sizeof(int);
  if (ArrayArgs<int8_t>::CheckAllZero(monotone_types_)) {
    monotone_types_.clear();
  }
  // always write a full-width penalty vector so the binary header layout stays fixed
  if (feature_penalty_.empty()) {
    ArrayArgs<double>::Assign(&feature_penalty_, 1.0, num_features_);
  }
  writer->Write(feature_penalty_.data(), sizeof(double) * num_features_);
  // drop the materialized all-neutral vector again after writing
  if (ArrayArgs<double>::CheckAll(feature_penalty_, 1.0)) {
    feature_penalty_.clear();
  }
// write feature names
for (int i = 0; i < num_total_features_; ++i) {
int str_len = static_cast<int>(feature_names_[i].size());
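The Construct hunk is the heart of the feature: user-supplied values are indexed by total feature, penalties are stored per inner (actually used) feature, and negative contributions are clamped to zero. A simplified sketch of that remapping, with InnerFeatureIndex replaced by a hypothetical lookup vector:

    #include <algorithm>
    #include <vector>

    // inner_index[i] is the training-time index of total feature i,
    // or -1 if the feature was dropped (e.g. constant or all-missing).
    std::vector<double> RemapPenalties(const std::vector<double>& feature_contri,
                                       const std::vector<int>& inner_index,
                                       int num_inner_features) {
      std::vector<double> penalty(num_inner_features, 1.0);
      for (size_t i = 0; i < feature_contri.size(); ++i) {
        if (inner_index[i] >= 0) {
          // clamping negatives to zero effectively disables the feature
          penalty[inner_index[i]] = std::max(0.0, feature_contri[i]);
        }
      }
      return penalty;
    }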
11 changes: 11 additions & 0 deletions src/io/dataset_loader.cpp
@@ -380,6 +380,17 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b
    dataset->monotone_types_.clear();
  }

  // read the fixed-width penalty block written by SaveBinaryFile and advance the cursor
  const double* tmp_ptr_feature_penalty = reinterpret_cast<const double*>(mem_ptr);
  dataset->feature_penalty_.clear();
  for (int i = 0; i < dataset->num_features_; ++i) {
    dataset->feature_penalty_.push_back(tmp_ptr_feature_penalty[i]);
  }
  mem_ptr += sizeof(double) * (dataset->num_features_);

  // an all-neutral vector is dropped, matching the in-memory representation
  if (ArrayArgs<double>::CheckAll(dataset->feature_penalty_, 1)) {
    dataset->feature_penalty_.clear();
  }

  // get feature names
  dataset->feature_names_.clear();
  // read feature names
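LoadFromBinFile walks a memory-mapped header, so the penalty block is consumed as a fixed-width run of doubles and the cursor is advanced past it. A minimal sketch of that read pattern, assuming the buffer layout matches what SaveBinaryFile wrote:

    #include <vector>

    // Read `count` doubles from a raw header buffer and advance the cursor,
    // mirroring how the binary loader consumes the penalty block.
    std::vector<double> ReadDoubles(const char*& mem_ptr, int count) {
      const double* src = reinterpret_cast<const double*>(mem_ptr);
      std::vector<double> out(src, src + count);
      mem_ptr += sizeof(double) * count;
      return out;
    }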
3 changes: 3 additions & 0 deletions src/treelearner/feature_histogram.hpp
@@ -19,6 +19,7 @@ class FeatureMetainfo {
  int8_t bias = 0;
  uint32_t default_bin;
  int8_t monotone_type;
  /*! \brief gain multiplier of this feature, taken from ``feature_contri`` */
  double penalty;
  /*! \brief pointer of tree config */
  const Config* config;
  BinType bin_type;
@@ -77,6 +78,7 @@ class FeatureHistogram {
    output->default_left = true;
    output->gain = kMinScore;
    find_best_threshold_fun_(sum_gradient, sum_hessian + 2 * kEpsilon, num_data, min_constraint, max_constraint, output);
    // scale the best split gain found for this feature by its penalty
    output->gain *= meta_->penalty;
  }

  void FindBestThresholdNumerical(double sum_gradient, double sum_hessian, data_size_t num_data, double min_constraint, double max_constraint,
@@ -707,6 +709,7 @@ class HistogramPool {
    feature_metas_[i].default_bin = train_data->FeatureBinMapper(i)->GetDefaultBin();
    feature_metas_[i].missing_type = train_data->FeatureBinMapper(i)->missing_type();
    feature_metas_[i].monotone_type = train_data->FeatureMonotone(i);
    feature_metas_[i].penalty = train_data->FeaturePenalte(i);
    if (train_data->FeatureBinMapper(i)->GetDefaultBin() == 0) {
      feature_metas_[i].bias = 1;
    } else {
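Because the multiplier is applied after the best threshold for a feature has been found, it reweights the competition between features rather than the choice of threshold within one. A toy illustration with made-up gains:

    #include <cstdio>

    int main() {
      // hypothetical raw best-split gains for two features
      double gain_f0 = 5.0, gain_f1 = 4.0;
      // feature_contri = 0.5,1 penalizes feature 0
      const double penalty_f0 = 0.5, penalty_f1 = 1.0;
      gain_f0 *= penalty_f0;  // 2.5
      gain_f1 *= penalty_f1;  // 4.0
      // feature 1 now wins the split even though its raw gain was lower
      std::printf("best feature: %d\n", gain_f1 > gain_f0 ? 1 : 0);
      return 0;
    }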
1 change: 1 addition & 0 deletions src/treelearner/voting_parallel_tree_learner.cpp
@@ -70,6 +70,7 @@ void VotingParallelTreeLearner<TREELEARNER_T>::Init(const Dataset* train_data, b
    feature_metas_[i].default_bin = train_data->FeatureBinMapper(i)->GetDefaultBin();
    feature_metas_[i].missing_type = train_data->FeatureBinMapper(i)->missing_type();
    feature_metas_[i].monotone_type = train_data->FeatureMonotone(i);
    feature_metas_[i].penalty = train_data->FeaturePenalte(i);
    if (train_data->FeatureBinMapper(i)->GetDefaultBin() == 0) {
      feature_metas_[i].bias = 1;
    } else {
