Use one-vs-other for small categorical features.
commit c9e123f
Author: Guolin Ke <i@yumumu.me>
Date:   Wed Oct 18 10:00:19 2017 +0800

    change default max_cat_to_onehot

commit 805a5c3
Author: Guolin Ke <i@yumumu.me>
Date:   Tue Oct 17 22:57:18 2017 +0800

    use one hot coding for the small cats
guolinke committed Oct 18, 2017
1 parent afe63f1 commit 087ec47
Showing 5 changed files with 122 additions and 105 deletions.
4 changes: 4 additions & 0 deletions docs/Parameters.rst
@@ -269,6 +269,10 @@ Learning Control Parameters

- L2 regularization in categorical split

- ``max_cat_to_onehot``, default=\ ``4``, type=int

- When the number of categories of one feature is smaller than or equal to ``max_cat_to_onehot``, the one-vs-other split algorithm will be used.

IO Parameters
-------------

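As a usage illustration (not part of this commit), here is a minimal sketch of passing the new parameter through the standard Python API; the toy data and parameter choices are assumptions made for the example:

```python
import numpy as np
import lightgbm as lgb

# A single categorical feature with 3 distinct values: at the new default
# of max_cat_to_onehot=4, its splits go through the one-vs-other algorithm.
X = np.array([0, 1, 2, 0, 1, 2] * 20).reshape(-1, 1)
y = np.array([0, 1, 0, 0, 1, 0] * 20)

params = {
    'objective': 'binary',
    'min_data': 1,
    'verbose': -1,
    'max_cat_to_onehot': 4,  # the default introduced by this commit
}
train_set = lgb.Dataset(X, label=y, categorical_feature=[0])
booster = lgb.train(params, train_set, num_boost_round=5)
```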
3 changes: 2 additions & 1 deletion include/LightGBM/config.h
@@ -229,6 +229,7 @@ struct TreeConfig: public ConfigBase {
int max_cat_threshold = 32;
double cat_l2 = 10;
double cat_smooth = 10;
int max_cat_to_onehot = 4;
LIGHTGBM_EXPORT void Set(const std::unordered_map<std::string, std::string>& params) override;
};

@@ -471,7 +472,7 @@ struct ParameterAlias {
"max_conflict_rate", "poisson_max_delta_step", "gaussian_eta",
"histogram_pool_size", "output_freq", "is_provide_training_metric", "machine_list_filename", "machines",
"zero_as_missing", "init_score_file", "valid_init_score_file", "is_predict_contrib",
"max_cat_threshold", "cat_smooth", "min_data_per_group", "cat_l2"
"max_cat_threshold", "cat_smooth", "min_data_per_group", "cat_l2", "max_cat_to_onehot"
});
std::unordered_map<std::string, std::string> tmp_map;
for (const auto& pair : *params) {
2 changes: 2 additions & 0 deletions src/io/config.cpp
@@ -384,10 +384,12 @@ void TreeConfig::Set(const std::unordered_map<std::string, std::string>& params)
GetDouble(params, "cat_l2", &cat_l2);
GetDouble(params, "cat_smooth", &cat_smooth);
GetInt(params, "min_data_per_group", &min_data_per_group);
GetInt(params, "max_cat_to_onehot", &max_cat_to_onehot);
CHECK(max_cat_threshold > 0);
CHECK(cat_l2 >= 0.0f);
CHECK(cat_smooth >= 1);
CHECK(min_data_per_group > 0);
CHECK(max_cat_to_onehot > 0);
}

void BoostingConfig::Set(const std::unordered_map<std::string, std::string>& params) {
183 changes: 112 additions & 71 deletions src/treelearner/feature_histogram.hpp
@@ -112,78 +112,114 @@ class FeatureHistogram {

double min_gain_shift = gain_shift + meta_->tree_config->min_gain_to_split;
bool is_full_categorical = meta_->missing_type == MissingType::None;
int used_bin = meta_->num_bin - 1;

if (is_full_categorical) ++used_bin;
int used_bin = meta_->num_bin - 1 + is_full_categorical;

std::vector<int> sorted_idx;
for (int i = 0; i < used_bin; ++i) {
if (data_[i].cnt >= meta_->tree_config->cat_smooth) {
sorted_idx.push_back(i);
}
}
used_bin = static_cast<int>(sorted_idx.size());

const double l2 = meta_->tree_config->lambda_l2 + meta_->tree_config->cat_l2;

auto ctr_fun = [this](double sum_grad, double sum_hess) {
return (sum_grad) / (sum_hess + meta_->tree_config->cat_smooth);
};
std::sort(sorted_idx.begin(), sorted_idx.end(),
[this, &ctr_fun](int i, int j) {
return ctr_fun(data_[i].sum_gradients, data_[i].sum_hessians) < ctr_fun(data_[j].sum_gradients, data_[j].sum_hessians);
});

std::vector<int> find_direction(1, 1);
std::vector<int> start_position(1, 0);
find_direction.push_back(-1);
start_position.push_back(used_bin - 1);
const int max_num_cat = std::min(meta_->tree_config->max_cat_threshold, (used_bin + 1) / 2);

is_splittable_ = false;
double l2 = meta_->tree_config->lambda_l2;
bool use_onehot = meta_->num_bin <= meta_->tree_config->max_cat_to_onehot;
int best_threshold = -1;
int best_dir = 1;
for (size_t out_i = 0; out_i < find_direction.size(); ++out_i) {
auto dir = find_direction[out_i];
auto start_pos = start_position[out_i];
data_size_t min_data_per_group = meta_->tree_config->min_data_per_group;
data_size_t cnt_cur_group = 0;
double sum_left_gradient = 0.0f;
double sum_left_hessian = kEpsilon;
data_size_t left_count = 0;
for (int i = 0; i < used_bin && i < max_num_cat; ++i) {
auto t = sorted_idx[start_pos];
start_pos += dir;

sum_left_gradient += data_[t].sum_gradients;
sum_left_hessian += data_[t].sum_hessians;
left_count += data_[t].cnt;
cnt_cur_group += data_[t].cnt;

if (left_count < meta_->tree_config->min_data_in_leaf
|| sum_left_hessian < meta_->tree_config->min_sum_hessian_in_leaf) continue;
data_size_t right_count = num_data - left_count;
if (right_count < meta_->tree_config->min_data_in_leaf || right_count < min_data_per_group) break;

double sum_right_hessian = sum_hessian - sum_left_hessian;
if (sum_right_hessian < meta_->tree_config->min_sum_hessian_in_leaf) break;

if (cnt_cur_group < min_data_per_group) continue;
if (use_onehot) {
for (int t = 0; t < used_bin; ++t) {
// if data not enough, or sum hessian too small
if (data_[t].cnt < meta_->tree_config->min_data_in_leaf
|| data_[t].sum_hessians < meta_->tree_config->min_sum_hessian_in_leaf) continue;
data_size_t other_count = num_data - data_[t].cnt;
// if data not enough
if (other_count < meta_->tree_config->min_data_in_leaf) continue;

cnt_cur_group = 0;
double sum_other_hessian = sum_hessian - data_[t].sum_hessians - kEpsilon;
// if sum hessian too small
if (sum_other_hessian < meta_->tree_config->min_sum_hessian_in_leaf) continue;

double sum_right_gradient = sum_gradient - sum_left_gradient;
double current_gain = GetLeafSplitGain(sum_left_gradient, sum_left_hessian, meta_->tree_config->lambda_l1, l2)
+ GetLeafSplitGain(sum_right_gradient, sum_right_hessian, meta_->tree_config->lambda_l1, l2);
double sum_other_gradient = sum_gradient - data_[t].sum_gradients;
// current split gain
double current_gain = GetLeafSplitGain(sum_other_gradient, sum_other_hessian,
meta_->tree_config->lambda_l1, l2)
+ GetLeafSplitGain(data_[t].sum_gradients, data_[t].sum_hessians + kEpsilon,
meta_->tree_config->lambda_l1, l2);
// gain with split is worse than without split
if (current_gain <= min_gain_shift) continue;

// mark to is splittable
is_splittable_ = true;
// better split point
if (current_gain > best_gain) {
best_left_count = left_count;
best_sum_left_gradient = sum_left_gradient;
best_sum_left_hessian = sum_left_hessian;
best_threshold = i;
best_threshold = t;
best_sum_left_gradient = data_[t].sum_gradients;
best_sum_left_hessian = data_[t].sum_hessians + kEpsilon;
best_left_count = data_[t].cnt;
best_gain = current_gain;
best_dir = dir;
}
}
} else {
for (int i = 0; i < used_bin; ++i) {
if (data_[i].cnt >= meta_->tree_config->cat_smooth) {
sorted_idx.push_back(i);
}
}
used_bin = static_cast<int>(sorted_idx.size());

l2 += meta_->tree_config->cat_l2;

auto ctr_fun = [this](double sum_grad, double sum_hess) {
return (sum_grad) / (sum_hess + meta_->tree_config->cat_smooth);
};
std::sort(sorted_idx.begin(), sorted_idx.end(),
[this, &ctr_fun](int i, int j) {
return ctr_fun(data_[i].sum_gradients, data_[i].sum_hessians) < ctr_fun(data_[j].sum_gradients, data_[j].sum_hessians);
});

std::vector<int> find_direction(1, 1);
std::vector<int> start_position(1, 0);
find_direction.push_back(-1);
start_position.push_back(used_bin - 1);
const int max_num_cat = std::min(meta_->tree_config->max_cat_threshold, (used_bin + 1) / 2);

is_splittable_ = false;
for (size_t out_i = 0; out_i < find_direction.size(); ++out_i) {
auto dir = find_direction[out_i];
auto start_pos = start_position[out_i];
data_size_t min_data_per_group = meta_->tree_config->min_data_per_group;
data_size_t cnt_cur_group = 0;
double sum_left_gradient = 0.0f;
double sum_left_hessian = kEpsilon;
data_size_t left_count = 0;
for (int i = 0; i < used_bin && i < max_num_cat; ++i) {
auto t = sorted_idx[start_pos];
start_pos += dir;

sum_left_gradient += data_[t].sum_gradients;
sum_left_hessian += data_[t].sum_hessians;
left_count += data_[t].cnt;
cnt_cur_group += data_[t].cnt;

if (left_count < meta_->tree_config->min_data_in_leaf
|| sum_left_hessian < meta_->tree_config->min_sum_hessian_in_leaf) continue;
data_size_t right_count = num_data - left_count;
if (right_count < meta_->tree_config->min_data_in_leaf || right_count < min_data_per_group) break;

double sum_right_hessian = sum_hessian - sum_left_hessian;
if (sum_right_hessian < meta_->tree_config->min_sum_hessian_in_leaf) break;

if (cnt_cur_group < min_data_per_group) continue;

cnt_cur_group = 0;

double sum_right_gradient = sum_gradient - sum_left_gradient;
double current_gain = GetLeafSplitGain(sum_left_gradient, sum_left_hessian, meta_->tree_config->lambda_l1, l2)
+ GetLeafSplitGain(sum_right_gradient, sum_right_hessian, meta_->tree_config->lambda_l1, l2);
if (current_gain <= min_gain_shift) continue;
is_splittable_ = true;
if (current_gain > best_gain) {
best_left_count = left_count;
best_sum_left_gradient = sum_left_gradient;
best_sum_left_hessian = sum_left_hessian;
best_threshold = i;
best_gain = current_gain;
best_dir = dir;
}
}
}
}
@@ -201,17 +237,22 @@ class FeatureHistogram {
output->right_sum_gradient = sum_gradient - best_sum_left_gradient;
output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon;
output->gain = best_gain - min_gain_shift;
output->num_cat_threshold = best_threshold + 1;
output->cat_threshold = std::vector<uint32_t>(output->num_cat_threshold);
if (best_dir == 1) {
for (int i = 0; i < output->num_cat_threshold; ++i) {
auto t = sorted_idx[i];
output->cat_threshold[i] = t;
}
if (use_onehot) {
output->num_cat_threshold = 1;
output->cat_threshold = std::vector<uint32_t>(1, static_cast<uint32_t>(best_threshold));
} else {
for (int i = 0; i < output->num_cat_threshold; ++i) {
auto t = sorted_idx[used_bin - 1 - i];
output->cat_threshold[i] = t;
output->num_cat_threshold = best_threshold + 1;
output->cat_threshold = std::vector<uint32_t>(output->num_cat_threshold);
if (best_dir == 1) {
for (int i = 0; i < output->num_cat_threshold; ++i) {
auto t = sorted_idx[i];
output->cat_threshold[i] = t;
}
} else {
for (int i = 0; i < output->num_cat_threshold; ++i) {
auto t = sorted_idx[used_bin - 1 - i];
output->cat_threshold[i] = t;
}
}
}
}
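To make the control flow of the diff above easier to follow, here is a simplified Python sketch of the two branches: the new one-vs-other search used when the bin count is at most ``max_cat_to_onehot``, and the pre-existing sorted many-vs-many search. This is an illustration, not the C++ implementation: the ``BinStats`` container, the ``cfg`` dictionary, and the simplified ``leaf_split_gain`` are assumptions, and several guards from the real code (``min_data_per_group`` grouping, the early ``break`` conditions, and the ``kEpsilon`` bookkeeping) are omitted for brevity.

```python
import math
from dataclasses import dataclass

@dataclass
class BinStats:  # per-category histogram entry (assumed container)
    sum_gradients: float
    sum_hessians: float
    cnt: int

def leaf_split_gain(sum_grad, sum_hess, l1, l2):
    # Simplified form of LightGBM's GetLeafSplitGain: max(|G| - l1, 0)^2 / (H + l2)
    reg = max(0.0, abs(sum_grad) - l1)
    return reg * reg / (sum_hess + l2)

def best_one_vs_other(bins, sum_grad, sum_hess, num_data, cfg):
    """One-vs-other search, used when num_bin <= max_cat_to_onehot.

    Each single category is tried against all remaining ones. Note that
    cat_l2 is NOT applied on this path; only lambda_l2 regularizes."""
    l2 = cfg['lambda_l2']
    best_gain, best_cat = -math.inf, None
    for t, b in enumerate(bins):
        other_cnt = num_data - b.cnt
        other_hess = sum_hess - b.sum_hessians
        if (b.cnt < cfg['min_data_in_leaf'] or other_cnt < cfg['min_data_in_leaf']
                or b.sum_hessians < cfg['min_sum_hessian_in_leaf']
                or other_hess < cfg['min_sum_hessian_in_leaf']):
            continue
        gain = (leaf_split_gain(sum_grad - b.sum_gradients, other_hess,
                                cfg['lambda_l1'], l2)
                + leaf_split_gain(b.sum_gradients, b.sum_hessians,
                                  cfg['lambda_l1'], l2))
        if gain > best_gain:
            best_gain, best_cat = gain, t
    return best_gain, [best_cat]  # the threshold is a single category id

def best_many_vs_many(bins, sum_grad, sum_hess, num_data, cfg):
    """Sorted many-vs-many search (the pre-existing path, now in the else branch).

    Categories are sorted by a smoothed grad/hess ratio and prefixes are
    scanned from both ends, like an ordinary numerical split."""
    l2 = cfg['lambda_l2'] + cfg['cat_l2']  # cat_l2 applies only on this path
    idx = [i for i, b in enumerate(bins) if b.cnt >= cfg['cat_smooth']]
    idx.sort(key=lambda i: bins[i].sum_gradients
             / (bins[i].sum_hessians + cfg['cat_smooth']))
    max_num_cat = min(cfg['max_cat_threshold'], (len(idx) + 1) // 2)
    best_gain, best_set = -math.inf, None
    for direction, start in ((1, 0), (-1, len(idx) - 1)):
        left_grad, left_hess, left_cnt, pos = 0.0, 0.0, 0, start
        for i in range(min(len(idx), max_num_cat)):
            t = idx[pos]
            pos += direction
            left_grad += bins[t].sum_gradients
            left_hess += bins[t].sum_hessians
            left_cnt += bins[t].cnt
            if (left_cnt < cfg['min_data_in_leaf']
                    or left_hess < cfg['min_sum_hessian_in_leaf']
                    or num_data - left_cnt < cfg['min_data_in_leaf']
                    or sum_hess - left_hess < cfg['min_sum_hessian_in_leaf']):
                continue
            gain = (leaf_split_gain(left_grad, left_hess, cfg['lambda_l1'], l2)
                    + leaf_split_gain(sum_grad - left_grad, sum_hess - left_hess,
                                      cfg['lambda_l1'], l2))
            if gain > best_gain:
                # mirror the output construction: the best prefix of sorted
                # categories, taken from the scan's start in its direction
                prefix = [idx[start + d * direction] for d in range(i + 1)]
                best_gain, best_set = gain, prefix
    return best_gain, best_set  # the left leaf's category set
```

One design point visible in both the diff and the sketch: the one-vs-other path skips the ``cat_smooth`` count filter and the ``cat_l2`` penalty, since with so few categories there is little risk of overfitting a long sorted tail.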
35 changes: 2 additions & 33 deletions tests/python_package_test/test_engine.py
@@ -231,6 +231,7 @@ def test_categorical_handle(self):
'min_data_per_group': 1,
'cat_smooth': 1,
'cat_l2': 0,
'max_cat_to_onehot': 1,
'zero_as_missing': True,
'categorical_column': 0
}
@@ -264,39 +265,7 @@ def test_categorical_handle2(self):
'min_data_per_group': 1,
'cat_smooth': 1,
'cat_l2': 0,
'zero_as_missing': False,
'categorical_column': 0
}
evals_result = {}
gbm = lgb.train(params, lgb_train,
num_boost_round=1,
valid_sets=lgb_eval,
verbose_eval=True,
evals_result=evals_result)
pred = gbm.predict(X_train)
np.testing.assert_almost_equal(pred, y)

def test_categorical_handle3(self):
x = [11, np.nan, 11, np.nan, 11, np.nan]
y = [0, 1, 0, 1, 0, 1]

X_train = np.array(x).reshape(len(x), 1)
y_train = np.array(y)
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_train, y_train)

params = {
'objective': 'regression',
'metric': 'auc',
'verbose': -1,
'boost_from_average': False,
'min_data': 1,
'num_leaves': 2,
'learning_rate': 1,
'min_data_in_bin': 1,
'min_data_per_group': 1,
'cat_smooth': 1,
'cat_l2': 0,
'max_cat_to_onehot': 1,
'zero_as_missing': False,
'categorical_column': 0
}
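Note that both remaining tests pin ``max_cat_to_onehot`` to ``1``, which keeps their two-category features on the sorted many-vs-many path; with the new default of ``4``, such small features would take the one-vs-other shortcut instead. A hedged sketch of toggling the two paths on a toy dataset (values chosen only for illustration):

```python
import numpy as np
import lightgbm as lgb

x = [0, 1, 0, 1, 0, 1] * 10
y = [0, 1, 0, 1, 0, 1] * 10
X_train = np.array(x).reshape(len(x), 1)
y_train = np.array(y)

base = {
    'objective': 'regression', 'verbose': -1,
    'min_data': 1, 'num_leaves': 2, 'min_data_in_bin': 1,
    'min_data_per_group': 1, 'cat_smooth': 1, 'cat_l2': 0,
    'categorical_column': 0,
}
for onehot in (1, 4):  # 1 forces the sorted path; 4 allows one-vs-other here
    params = dict(base, max_cat_to_onehot=onehot)
    gbm = lgb.train(params, lgb.Dataset(X_train, y_train), num_boost_round=1)
    print(onehot, gbm.predict(X_train)[:4])
```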
