Handle for missing values (#516)
guolinke committed May 15, 2017
1 parent e8cc6ab commit e984b0d
Showing 18 changed files with 620 additions and 216 deletions.
3 changes: 2 additions & 1 deletion include/LightGBM/bin.h
@@ -360,6 +360,7 @@ class Bin {
* \param min_bin min_bin of current used feature
* \param max_bin max_bin of current used feature
* \param default_bin default bin if bin not in [min_bin, max_bin]
* \param default_bin_for_zero default bin for the zero (missing) bin
* \param threshold The split threshold.
* \param data_indices Used data indices. After this function is called, the less-than-or-equal data indices are stored in this object.
* \param num_data Number of used data
@@ -369,7 +370,7 @@
* \return The number of less than or equal data.
*/
virtual data_size_t Split(uint32_t min_bin, uint32_t max_bin,
uint32_t default_bin, uint32_t threshold,
uint32_t default_bin, uint32_t default_bin_for_zero, uint32_t threshold,
data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices, BinType bin_type) const = 0;

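Note: the extra default_bin_for_zero argument lets the caller decide which bin the zero (missing) rows are treated as when partitioning data at a split. A minimal sketch of the idea, assuming a simple dense layout — SplitWithMissingSketch, row_bins, and zero_bin are illustrative names, not LightGBM's actual DenseBin/SparseBin implementations, which also handle the min_bin/max_bin offsets:

#include <cstdint>
#include <iostream>
#include <vector>

using data_size_t = int32_t;

// Sketch only: route each row to the "<= threshold" or "> threshold" side, first rerouting
// rows that sit in the zero (missing) bin to default_bin_for_zero, so the learner can choose
// which child receives missing values.
data_size_t SplitWithMissingSketch(const std::vector<uint32_t>& row_bins,
                                   uint32_t zero_bin, uint32_t default_bin_for_zero,
                                   uint32_t threshold,
                                   const data_size_t* data_indices, data_size_t num_data,
                                   data_size_t* lte_indices, data_size_t* gt_indices) {
  data_size_t lte_count = 0, gt_count = 0;
  for (data_size_t i = 0; i < num_data; ++i) {
    const data_size_t idx = data_indices[i];
    uint32_t bin = row_bins[idx];
    if (bin == zero_bin) bin = default_bin_for_zero;   // reroute the zero/missing bin
    if (bin <= threshold) lte_indices[lte_count++] = idx;
    else                  gt_indices[gt_count++] = idx;
  }
  return lte_count;  // number of rows routed to the "<= threshold" child
}

int main() {
  // Toy data: 6 rows, bin 2 holds the zero/missing rows; reroute them to bin 5 (beyond the threshold).
  std::vector<uint32_t> row_bins = {1, 2, 4, 2, 6, 3};
  std::vector<data_size_t> data_indices = {0, 1, 2, 3, 4, 5};
  std::vector<data_size_t> lte(6), gt(6);
  data_size_t lte_cnt = SplitWithMissingSketch(row_bins, /*zero_bin=*/2, /*default_bin_for_zero=*/5,
                                               /*threshold=*/4, data_indices.data(), 6,
                                               lte.data(), gt.data());
  std::cout << lte_cnt << " rows go left\n";  // bins 1, 4, 3 -> 3 rows left; the zero rows and bin 6 go right
}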
4 changes: 2 additions & 2 deletions include/LightGBM/dataset.h
@@ -402,12 +402,12 @@ class Dataset {
HistogramBinEntry* data) const;

inline data_size_t Split(int feature,
uint32_t threshold,
uint32_t threshold, uint32_t default_bin_for_zero,
data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const {
const int group = feature2group_[feature];
const int sub_feature = feature2subfeature_[feature];
return feature_groups_[group]->Split(sub_feature, threshold, data_indices, num_data, lte_indices, gt_indices);
return feature_groups_[group]->Split(sub_feature, threshold, default_bin_for_zero, data_indices, num_data, lte_indices, gt_indices);
}

inline int SubFeatureBinOffset(int i) const {
3 changes: 2 additions & 1 deletion include/LightGBM/feature_group.h
@@ -161,13 +161,14 @@ class FeatureGroup {
inline data_size_t Split(
int sub_feature,
uint32_t threshold,
uint32_t default_bin_for_zero,
data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const {

uint32_t min_bin = bin_offsets_[sub_feature];
uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
uint32_t default_bin = bin_mappers_[sub_feature]->GetDefaultBin();
return bin_data_->Split(min_bin, max_bin, default_bin,
return bin_data_->Split(min_bin, max_bin, default_bin, default_bin_for_zero,
threshold, data_indices, num_data, lte_indices, gt_indices, bin_mappers_[sub_feature]->bin_type());
}
/*!
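Note: in FeatureGroup::Split the per-sub-feature bin range comes from bin_offsets_, so min_bin/max_bin select one feature's slice of the shared bin_data_. A toy illustration of that arithmetic; the offset values below are invented:

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Assumed offsets for a group of two bundled features:
  // sub-feature 0 owns bins [1, 10], sub-feature 1 owns bins [11, 25].
  std::vector<uint32_t> bin_offsets = {1, 11, 26};
  for (int sub_feature = 0; sub_feature + 1 < static_cast<int>(bin_offsets.size()); ++sub_feature) {
    uint32_t min_bin = bin_offsets[sub_feature];
    uint32_t max_bin = bin_offsets[sub_feature + 1] - 1;  // same arithmetic as FeatureGroup::Split
    std::cout << "sub-feature " << sub_feature << ": bins [" << min_bin << ", " << max_bin << "]\n";
  }
}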
2 changes: 2 additions & 0 deletions include/LightGBM/meta.h
@@ -19,6 +19,8 @@ const score_t kMinScore = -std::numeric_limits<score_t>::infinity();

const score_t kEpsilon = 1e-15f;

const double kMissingValueRange = 1e-20f;

using ReduceFunction = std::function<void(const char*, char*, int)>;

using PredictFunction =
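Note: kMissingValueRange defines the narrow band around zero that this commit treats as the zero/missing value. A minimal, self-contained check mirroring the numeric test used in tree.h (values in (-kMissingValueRange, kMissingValueRange] count as missing):

#include <iostream>

const double kMissingValueRange = 1e-20;

// Mirrors the numeric DefaultValueForZero test in tree.h.
bool IsZeroOrMissing(double fval) {
  return fval > -kMissingValueRange && fval <= kMissingValueRange;
}

int main() {
  std::cout << IsZeroOrMissing(0.0) << " "      // 1: exact zero is treated as missing
            << IsZeroOrMissing(5e-21) << " "    // 1: inside the band
            << IsZeroOrMissing(1e-3) << "\n";   // 0: an ordinary value
}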
39 changes: 33 additions & 6 deletions include/LightGBM/tree.h
@@ -44,11 +44,15 @@ class Tree {
* \param left_cnt Count of left child
* \param right_cnt Count of right child
* \param gain Split gain
* \param zero_bin bin value for value == 0 (the missing value)
* \param default_bin_for_zero default conversion for the missing value, as a bin
* \param default_value default conversion for the missing value, as a float value
* \return The index of new leaf.
*/
int Split(int leaf, int feature, BinType bin_type, uint32_t threshold, int real_feature,
double threshold_double, double left_value,
double right_value, data_size_t left_cnt, data_size_t right_cnt, double gain);
int Split(int leaf, int feature, BinType bin_type, uint32_t threshold, int real_feature,
double threshold_double, double left_value, double right_value,
data_size_t left_cnt, data_size_t right_cnt, double gain,
uint32_t zero_bin, uint32_t default_bin_for_zero, double default_value);

/*! \brief Get the output of one leaf */
inline double LeafOutput(int leaf) const { return leaf_value_[leaf]; }
@@ -140,6 +144,23 @@
}
}

static double DefaultValueForZero(double fval, double zero, double out) {
if (fval > -zero && fval <= zero) {
return out;
} else {
return fval;
}
}

static uint32_t DefaultValueForZero(uint32_t fval, uint32_t zero, uint32_t out) {
if (fval == zero) {
return out;
} else {
return fval;
}
}


static const char* GetDecisionTypeName(int8_t type) {
if (type == 0) {
return "no_greater";
Expand Down Expand Up @@ -176,7 +197,7 @@ class Tree {
/*! \brief A non-leaf node's right child */
std::vector<int> right_child_;
/*! \brief A non-leaf node's split feature */
std::vector<int> split_feature_inner;
std::vector<int> split_feature_inner_;
/*! \brief A non-leaf node's split feature, the original index */
std::vector<int> split_feature_;
/*! \brief A non-leaf node's split threshold in bin */
@@ -185,6 +206,10 @@
std::vector<double> threshold_;
/*! \brief Decision type, 0 for '<='(numerical feature), 1 for 'is'(categorical feature) */
std::vector<int8_t> decision_type_;
/*! \brief Default values for the NA/zero feature values */
std::vector<double> default_value_;
std::vector<uint32_t> zero_bin_;
std::vector<uint32_t> default_bin_for_zero_;
/*! \brief A non-leaf node's split gain */
std::vector<double> split_gain_;
// used for leaf node
@@ -226,8 +251,9 @@ inline int Tree::GetLeaf(const double* feature_values) const {
int node = 0;
if (has_categorical_) {
while (node >= 0) {
double fval = DefaultValueForZero(feature_values[split_feature_[node]], kMissingValueRange, default_value_[node]);
if (decision_funs[decision_type_[node]](
feature_values[split_feature_[node]],
fval,
threshold_[node])) {
node = left_child_[node];
} else {
@@ -236,8 +262,9 @@ inline int Tree::GetLeaf(const double* feature_values) const {
}
} else {
while (node >= 0) {
double fval = DefaultValueForZero(feature_values[split_feature_[node]], kMissingValueRange, default_value_[node]);
if (NumericalDecision<double>(
feature_values[split_feature_[node]],
fval,
threshold_[node])) {
node = left_child_[node];
} else {
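Note: GetLeaf now replaces a feature value that falls inside the missing band with the node's default_value_ before evaluating the decision, so prediction routes missing values to the same child chosen during training. A standalone sketch of that single step; the one-node tree and its numbers are invented for illustration, not a real serialized model:

#include <iostream>
#include <vector>

const double kMissingValueRange = 1e-20;

// Same logic as Tree::DefaultValueForZero(double, double, double).
double DefaultValueForZero(double fval, double zero, double out) {
  return (fval > -zero && fval <= zero) ? out : fval;
}

int main() {
  // Toy single-split tree: feature 0, threshold 3.5; missing values are pushed to the right
  // child by giving them a default value above the threshold.
  std::vector<int> split_feature = {0};
  std::vector<double> threshold = {3.5};
  std::vector<double> default_value = {10.0};   // > threshold, so missing goes right

  double sample_missing[] = {0.0};              // a raw zero is treated as missing
  double fval = DefaultValueForZero(sample_missing[split_feature[0]],
                                    kMissingValueRange, default_value[0]);
  std::cout << (fval <= threshold[0] ? "left" : "right") << "\n";  // prints "right"
}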
10 changes: 10 additions & 0 deletions include/LightGBM/utils/common.h
@@ -462,6 +462,16 @@ inline static std::vector<int> VectorSize(const std::vector<std::vector<T>>& dat
return ret;
}

inline static double AvoidInf(double x) {
if (x >= std::numeric_limits<double>::max()) {
return std::numeric_limits<double>::max();
} else if (x <= std::numeric_limits<double>::min()) {
return std::numeric_limits<double>::min();
} else {
return x;
}
}

} // namespace Common

} // namespace LightGBM
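Note: Common::AvoidInf clamps a double into the finite range so an infinity is never stored downstream; the specific call sites (e.g. leaf outputs or split gains) are not shown in this hunk, so treat them as assumptions. A short self-contained check of the helper exactly as added above:

#include <iostream>
#include <limits>

// Same body as Common::AvoidInf in this commit.
inline double AvoidInf(double x) {
  if (x >= std::numeric_limits<double>::max()) {
    return std::numeric_limits<double>::max();
  } else if (x <= std::numeric_limits<double>::min()) {
    return std::numeric_limits<double>::min();
  } else {
    return x;
  }
}

int main() {
  double inf = std::numeric_limits<double>::infinity();
  std::cout << AvoidInf(inf) << "\n";   // clamped to the largest finite double instead of inf
  std::cout << AvoidInf(1.5) << "\n";   // ordinary values pass through unchanged
}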
4 changes: 2 additions & 2 deletions src/boosting/gbdt.cpp
@@ -353,7 +353,7 @@ bool GBDT::TrainOneIter(const score_t* gradient, const score_t* hessian, bool is
}
init_score /= num_data_;
std::unique_ptr<Tree> new_tree(new Tree(2));
new_tree->Split(0, 0, BinType::NumericalBin, 0, 0, 0, init_score, init_score, 0, num_data_, -1);
new_tree->Split(0, 0, BinType::NumericalBin, 0, 0, 0, init_score, init_score, 0, num_data_, -1, 0, 0, 0);
train_score_updater_->AddScore(init_score, 0);
for (auto& score_updater : valid_score_updater_) {
score_updater->AddScore(init_score, 0);
@@ -432,7 +432,7 @@ bool GBDT::TrainOneIter(const score_t* gradient, const score_t* hessian, bool is
if (!class_need_train_[cur_tree_id] && models_.size() < static_cast<size_t>(num_tree_per_iteration_)) {
auto output = class_default_output_[cur_tree_id];
new_tree->Split(0, 0, BinType::NumericalBin, 0, 0, 0,
output, output, 0, num_data_, -1);
output, output, 0, num_data_, -1, 0, 0, 0);
train_score_updater_->AddScore(output, cur_tree_id);
for (auto& score_updater : valid_score_updater_) {
score_updater->AddScore(output, cur_tree_id);
193 changes: 122 additions & 71 deletions src/io/bin.cpp
@@ -63,6 +63,76 @@ bool NeedFilter(std::vector<int>& cnt_in_bin, int total_cnt, int filter_cnt, Bin
}
return true;
}
std::vector<double> GreedyFindBin(const double* distinct_values, const int* counts,
int num_distinct_values, int max_bin, int total_cnt, int min_data_in_bin) {
std::vector<double> bin_upper_bound;
if (num_distinct_values <= max_bin) {
bin_upper_bound.clear();
int cur_cnt_inbin = 0;
for (int i = 0; i < num_distinct_values - 1; ++i) {
cur_cnt_inbin += counts[i];
if (cur_cnt_inbin >= min_data_in_bin) {
bin_upper_bound.push_back((distinct_values[i] + distinct_values[i + 1]) / 2);
cur_cnt_inbin = 0;
}
}
cur_cnt_inbin += counts[num_distinct_values - 1];
bin_upper_bound.push_back(std::numeric_limits<double>::infinity());
} else {
if (min_data_in_bin > 0) {
max_bin = std::min(max_bin, static_cast<int>(total_cnt / min_data_in_bin));
max_bin = std::max(max_bin, 1);
}
double mean_bin_size = static_cast<double>(total_cnt) / max_bin;

// mean size for one bin
int rest_bin_cnt = max_bin;
int rest_sample_cnt = static_cast<int>(total_cnt);
std::vector<bool> is_big_count_value(num_distinct_values, false);
for (int i = 0; i < num_distinct_values; ++i) {
if (counts[i] >= mean_bin_size) {
is_big_count_value[i] = true;
--rest_bin_cnt;
rest_sample_cnt -= counts[i];
}
}
mean_bin_size = static_cast<double>(rest_sample_cnt) / rest_bin_cnt;
std::vector<double> upper_bounds(max_bin, std::numeric_limits<double>::infinity());
std::vector<double> lower_bounds(max_bin, std::numeric_limits<double>::infinity());

int bin_cnt = 0;
lower_bounds[bin_cnt] = distinct_values[0];
int cur_cnt_inbin = 0;
for (int i = 0; i < num_distinct_values - 1; ++i) {
if (!is_big_count_value[i]) {
rest_sample_cnt -= counts[i];
}
cur_cnt_inbin += counts[i];
// need a new bin
if (is_big_count_value[i] || cur_cnt_inbin >= mean_bin_size ||
(is_big_count_value[i + 1] && cur_cnt_inbin >= std::max(1.0, mean_bin_size * 0.5f))) {
upper_bounds[bin_cnt] = distinct_values[i];
++bin_cnt;
lower_bounds[bin_cnt] = distinct_values[i + 1];
if (bin_cnt >= max_bin - 1) { break; }
cur_cnt_inbin = 0;
if (!is_big_count_value[i]) {
--rest_bin_cnt;
mean_bin_size = rest_sample_cnt / static_cast<double>(rest_bin_cnt);
}
}
}
++bin_cnt;
// update bin upper bound
bin_upper_bound.resize(bin_cnt);
for (int i = 0; i < bin_cnt - 1; ++i) {
bin_upper_bound[i] = (upper_bounds[i] + lower_bounds[i + 1]) / 2.0f;
}
// last bin upper bound
bin_upper_bound[bin_cnt - 1] = std::numeric_limits<double>::infinity();
}
return bin_upper_bound;
}
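Note: GreedyFindBin has two branches. When the distinct values fit into max_bin, every boundary is the midpoint between neighboring values once min_data_in_bin samples have accumulated, with +inf closing the last bin; otherwise bins are grown toward a mean size, and high-count values are forced into bins of their own. The sketch below re-implements only the first (few-distinct-values) branch on toy data for illustration; it is not a call into the function above:

#include <iostream>
#include <limits>
#include <vector>

// Re-implements only the "few distinct values" branch of GreedyFindBin for illustration.
std::vector<double> FewDistinctBounds(const std::vector<double>& distinct_values,
                                      const std::vector<int>& counts, int min_data_in_bin) {
  std::vector<double> bounds;
  int cur_cnt_inbin = 0;
  for (size_t i = 0; i + 1 < distinct_values.size(); ++i) {
    cur_cnt_inbin += counts[i];
    if (cur_cnt_inbin >= min_data_in_bin) {
      bounds.push_back((distinct_values[i] + distinct_values[i + 1]) / 2);  // midpoint boundary
      cur_cnt_inbin = 0;
    }
  }
  bounds.push_back(std::numeric_limits<double>::infinity());  // last bin is open-ended
  return bounds;
}

int main() {
  // Toy feature with 4 distinct values and min_data_in_bin = 3.
  std::vector<double> values = {1.0, 2.0, 5.0, 9.0};
  std::vector<int> counts = {4, 2, 1, 5};
  for (double b : FewDistinctBounds(values, counts, 3)) std::cout << b << " ";
  std::cout << "\n";  // prints: 1.5 7 inf  (2.0 and 5.0 are merged until 3 samples accumulate)
}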

void BinMapper::FindBin(double* values, int num_sample_values, size_t total_sample_cnt,
int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type) {
@@ -109,81 +179,62 @@ void BinMapper::FindBin(double* values, int num_sample_values, size_t total_samp
std::vector<int> cnt_in_bin;
int num_distinct_values = static_cast<int>(distinct_values.size());
if (bin_type_ == BinType::NumericalBin) {
if (num_distinct_values <= max_bin) {
// use distinct value is enough
bin_upper_bound_.clear();
int cur_cnt_inbin = 0;
for (int i = 0; i < num_distinct_values - 1; ++i) {
cur_cnt_inbin += counts[i];
if (cur_cnt_inbin >= min_data_in_bin) {
bin_upper_bound_.push_back((distinct_values[i] + distinct_values[i + 1]) / 2);
cnt_in_bin.push_back(cur_cnt_inbin);
cur_cnt_inbin = 0;
}
}
cur_cnt_inbin += counts.back();
cnt_in_bin.push_back(cur_cnt_inbin);
bin_upper_bound_.push_back(std::numeric_limits<double>::infinity());
num_bin_ = static_cast<int>(bin_upper_bound_.size());
} else {
if (min_data_in_bin > 0) {
max_bin = std::min(max_bin, static_cast<int>(total_sample_cnt / min_data_in_bin));
max_bin = std::max(max_bin, 1);
bin_upper_bound_.clear();
int left_cnt_data = 0;
int missing_cnt_data = 0;
int right_cnt_data = 0;
for (int i = 0; i < num_distinct_values; ++i) {
if (distinct_values[i] <= -kMissingValueRange) {
left_cnt_data += counts[i];
} else if (distinct_values[i] > kMissingValueRange) {
right_cnt_data += counts[i];
} else {
missing_cnt_data += counts[i];
}
double mean_bin_size = static_cast<double>(total_sample_cnt) / max_bin;
if (zero_cnt > mean_bin_size) {
int non_zero_cnt = num_sample_values;
max_bin = std::min(max_bin, 1 + static_cast<int>(non_zero_cnt / min_data_in_bin));
}

int left_cnt = 0;
for (int i = 0; i < num_distinct_values; ++i) {
if (distinct_values[i] > -kMissingValueRange) {
left_cnt = i;
break;
}
}

if (left_cnt > 0) {
int left_max_bin = static_cast<int>(static_cast<double>(left_cnt_data) / (total_sample_cnt - missing_cnt_data) * (max_bin - 1));
bin_upper_bound_ = GreedyFindBin(distinct_values.data(), counts.data(), left_cnt, left_max_bin, left_cnt_data, min_data_in_bin);
bin_upper_bound_.back() = -kMissingValueRange;
}

int right_start = -1;
for (int i = left_cnt; i < num_distinct_values; ++i) {
if (distinct_values[i] > kMissingValueRange) {
right_start = i;
break;
}
// mean size for one bin
int rest_bin_cnt = max_bin;
int rest_sample_cnt = static_cast<int>(total_sample_cnt);
std::vector<bool> is_big_count_value(num_distinct_values, false);
}

if (right_start >= 0) {
int right_max_bin = max_bin - 1 - static_cast<int>(bin_upper_bound_.size());
auto right_bounds = GreedyFindBin(distinct_values.data() + right_start, counts.data() + right_start,
num_distinct_values - right_start, right_max_bin, right_cnt_data, min_data_in_bin);
bin_upper_bound_.push_back(kMissingValueRange);
bin_upper_bound_.insert(bin_upper_bound_.end(), right_bounds.begin(), right_bounds.end());
} else {
bin_upper_bound_.push_back(std::numeric_limits<double>::infinity());
}

num_bin_ = static_cast<int>(bin_upper_bound_.size());
{
cnt_in_bin.resize(num_bin_, 0);
int i_bin = 0;
for (int i = 0; i < num_distinct_values; ++i) {
if (counts[i] >= mean_bin_size) {
is_big_count_value[i] = true;
--rest_bin_cnt;
rest_sample_cnt -= counts[i];
}
}
mean_bin_size = static_cast<double>(rest_sample_cnt) / rest_bin_cnt;
std::vector<double> upper_bounds(max_bin, std::numeric_limits<double>::infinity());
std::vector<double> lower_bounds(max_bin, std::numeric_limits<double>::infinity());

int bin_cnt = 0;
lower_bounds[bin_cnt] = distinct_values[0];
int cur_cnt_inbin = 0;
for (int i = 0; i < num_distinct_values - 1; ++i) {
if (!is_big_count_value[i]) {
rest_sample_cnt -= counts[i];
}
cur_cnt_inbin += counts[i];
// need a new bin
if (is_big_count_value[i] || cur_cnt_inbin >= mean_bin_size ||
(is_big_count_value[i + 1] && cur_cnt_inbin >= std::max(1.0, mean_bin_size * 0.5f))) {
upper_bounds[bin_cnt] = distinct_values[i];
cnt_in_bin.push_back(cur_cnt_inbin);
++bin_cnt;
lower_bounds[bin_cnt] = distinct_values[i + 1];
if (bin_cnt >= max_bin - 1) { break; }
cur_cnt_inbin = 0;
if (!is_big_count_value[i]) {
--rest_bin_cnt;
mean_bin_size = rest_sample_cnt / static_cast<double>(rest_bin_cnt);
}
}
}
cur_cnt_inbin += counts.back();
cnt_in_bin.push_back(cur_cnt_inbin);
++bin_cnt;
// update bin upper bound
bin_upper_bound_ = std::vector<double>(bin_cnt);
num_bin_ = bin_cnt;
for (int i = 0; i < bin_cnt - 1; ++i) {
bin_upper_bound_[i] = (upper_bounds[i] + lower_bounds[i + 1]) / 2.0f;
if (distinct_values[i] > bin_upper_bound_[i_bin]) {
++i_bin;
}
cnt_in_bin[i_bin] += counts[i];
}
// last bin upper bound
bin_upper_bound_[bin_cnt - 1] = std::numeric_limits<double>::infinity();
}
CHECK(num_bin_ <= max_bin);
} else {
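Note: the reworked FindBin splits the distinct values at -kMissingValueRange and kMissingValueRange, runs GreedyFindBin separately on the negative and positive sides, and reserves a dedicated bin for the zero/missing band in between. One bin is held back for that band and the remaining bins are shared in proportion to each side's sample count; a toy calculation of that allocation, with all counts invented:

#include <iostream>

int main() {
  // Invented sample counts for one feature.
  int left_cnt_data = 300;      // values <= -kMissingValueRange
  int missing_cnt_data = 100;   // values inside the zero/missing band
  int right_cnt_data = 600;     // values > kMissingValueRange
  int total_sample_cnt = left_cnt_data + missing_cnt_data + right_cnt_data;
  int max_bin = 255;

  // Same proportional allocation as FindBin: one bin is reserved for the missing band,
  // the rest are shared between the negative and positive sides by their data share.
  int left_max_bin = static_cast<int>(static_cast<double>(left_cnt_data)
                                      / (total_sample_cnt - missing_cnt_data) * (max_bin - 1));
  std::cout << "negative side gets " << left_max_bin << " bins, "
            << "positive side gets up to " << (max_bin - 1 - left_max_bin) << "\n";
}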
