Skip to content

Commit

Permalink
Fix bug where small values of max_bin cause crash.
Browse files Browse the repository at this point in the history
  • Loading branch information
btrotta committed Jul 31, 2019
1 parent 04a5601 commit fe5c8e2
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 6 deletions.
23 changes: 17 additions & 6 deletions src/io/bin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -177,11 +177,10 @@ namespace LightGBM {
left_cnt = num_distinct_values;
}

if (left_cnt > 0) {
if ((left_cnt > 0) && (max_bin > 1)) {
int left_max_bin = static_cast<int>(static_cast<double>(left_cnt_data) / (total_sample_cnt - cnt_zero) * (max_bin - 1));
left_max_bin = std::max(1, left_max_bin);
bin_upper_bound = GreedyFindBin(distinct_values, counts, left_cnt, left_max_bin, left_cnt_data, min_data_in_bin);
bin_upper_bound.back() = -kZeroThreshold;
}

int right_start = -1;
Expand All @@ -192,16 +191,27 @@ namespace LightGBM {
}
}

if (right_start >= 0) {
int right_max_bin = max_bin - 1 - static_cast<int>(bin_upper_bound.size());
CHECK(right_max_bin > 0);
if (bin_upper_bound.size() == 0) {
if (max_bin > 1) {
bin_upper_bound.push_back(kZeroThreshold);
}
} else {
bin_upper_bound.back() = -kZeroThreshold;
if (max_bin > 2) {
// create zero bin
bin_upper_bound.push_back(kZeroThreshold);
}
}

int right_max_bin = max_bin - static_cast<int>(bin_upper_bound.size());
if ((right_start >= 0) && (right_max_bin > 0)) {
auto right_bounds = GreedyFindBin(distinct_values + right_start, counts + right_start,
num_distinct_values - right_start, right_max_bin, right_cnt_data, min_data_in_bin);
bin_upper_bound.push_back(kZeroThreshold);
bin_upper_bound.insert(bin_upper_bound.end(), right_bounds.begin(), right_bounds.end());
} else {
bin_upper_bound.push_back(std::numeric_limits<double>::infinity());
}
CHECK(bin_upper_bound.size() <= max_bin);
return bin_upper_bound;
}

Expand Down Expand Up @@ -280,6 +290,7 @@ namespace LightGBM {
}
} else if (missing_type_ == MissingType::None) {
bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin);

} else {
bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, min_data_in_bin);
bin_upper_bound_.push_back(NaN);
Expand Down
19 changes: 19 additions & 0 deletions tests/python_package_test/test_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -901,6 +901,25 @@ def test_max_bin_by_feature(self):
est = lgb.train(params, lgb_data, num_boost_round=1)
self.assertEqual(len(np.unique(est.predict(X))), 3)

def test_small_max_bin(self):
np.random.seed(0)
y = np.random.choice([0, 1], 100)
x = np.zeros((100, 1))
x[:30, 0] = -1
x[30:60, 0] = 1
x[60:, 0] = 2
params = {'objective': 'binary',
'seed': 0,
'min_data_in_leaf': 1,
'verbose': -1,
'max_bin': 2}
lgb_x = lgb.Dataset(x, label=y)
est = lgb.train(params, lgb_x, num_boost_round=5)
x[0, 0] = np.nan
params['max_bin'] = 3
lgb_x = lgb.Dataset(x, label=y)
est = lgb.train(params, lgb_x, num_boost_round=5)

def test_refit(self):
X, y = load_breast_cancer(True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
Expand Down

0 comments on commit fe5c8e2

Please sign in to comment.