Skip to content

Commit

Permalink
Better missing value handle (#747)
Browse files Browse the repository at this point in the history
* finish the data loading part

* allow prediction.

* fix bug for decision type.

* finish split finding part

* fix bugs.

* bug fixed. add a test .

* fix pep8 .

* update documents.

* fix test bugs.

* fix a format

* fix import error in python test.

* disable missing handle in categorical features.

* fix a bug.

* add more tests.

* fix pep8

* fix bugs.

* remove the missing handle code for categorical feature.
  • Loading branch information
guolinke committed Jul 30, 2017
1 parent db4374e commit 00cb04a
Show file tree
Hide file tree
Showing 25 changed files with 551 additions and 245 deletions.
3 changes: 3 additions & 0 deletions docs/Parameters.md
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,9 @@ The parameter format is `key1=value1 key2=value2 ... ` . And parameters can be s
* The Threshold of margin in early-stopping prediction.
* `use_missing`, default=`true`, type=bool
* Setting this to `false` disables the special handling of missing values.
* `zero_as_missing`, default=`false`, type=bool
* Setting this to `true` treats all zeros as missing values (including the values not shown in libsvm/sparse matrices).
* Setting this to `false` uses `na` to represent missing values.


## Objective parameters
Expand Down
31 changes: 28 additions & 3 deletions include/LightGBM/bin.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@ enum BinType {
CategoricalBin
};

/*! \brief Which feature value, if any, is treated as the missing value */
enum MissingType {
/*! \brief No value gets missing-value treatment (presumably chosen when use_missing is off - confirm in FindBin) */
None,
/*! \brief Zero is treated as the missing value */
Zero,
/*! \brief NaN is treated as the missing value */
NaN
};

/*! \brief Store data for one histogram bin */
struct HistogramBinEntry {
Expand Down Expand Up @@ -63,6 +68,9 @@ class BinMapper {
if (num_bin_ != other.num_bin_) {
return false;
}
if (missing_type_ != other.missing_type_) {
return false;
}
if (bin_type_ == BinType::NumericalBin) {
for (int i = 0; i < num_bin_; ++i) {
if (bin_upper_bound_[i] != other.bin_upper_bound_[i]) {
Expand All @@ -81,6 +89,8 @@ class BinMapper {

/*! \brief Get number of bins */
inline int num_bin() const { return num_bin_; }
/*! \brief Get the missing type (see MissingType) stored in this mapper */
inline MissingType missing_type() const { return missing_type_; }
/*! \brief True if bin is trivial (contains only one bin) */
inline bool is_trival() const { return is_trival_; }
/*! \brief Sparsity of this bin ( num_zero_bins / num_data ) */
Expand Down Expand Up @@ -129,8 +139,11 @@ class BinMapper {
* \param min_data_in_bin min number of data in one bin
* \param min_split_data
* \param bin_type Type of this bin
* \param use_missing True to enable missing value handle
* \param zero_as_missing True to use zero as missing value
*/
void FindBin(double* values, int num_values, size_t total_sample_cnt, int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type);
void FindBin(double* values, int num_values, size_t total_sample_cnt, int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type,
bool use_missing, bool zero_as_missing);

/*!
* \brief Use specific number of bin to calculate the size of this class
Expand Down Expand Up @@ -173,6 +186,7 @@ class BinMapper {
private:
/*! \brief Number of bins */
int num_bin_;
/*! \brief Missing type used by this mapper (see MissingType) */
MissingType missing_type_;
/*! \brief Store upper bound for each bin */
std::vector<double> bin_upper_bound_;
/*! \brief True if this feature is trival */
Expand Down Expand Up @@ -360,7 +374,8 @@ class Bin {
* \param min_bin min_bin of current used feature
* \param max_bin max_bin of current used feature
* \param default_bin default bin if bin not in [min_bin, max_bin]
* \param default_bin_for_zero defualt bin for the zero(missing) bin
* \param missing_type missing type
* \param default_left missing bin will go to left child
* \param threshold The split threshold.
* \param data_indices Used data indices. After this function is called, the less-than-or-equal data indices are stored in this object.
* \param num_data Number of used data
Expand All @@ -370,7 +385,7 @@ class Bin {
* \return The number of less than or equal data.
*/
virtual data_size_t Split(uint32_t min_bin, uint32_t max_bin,
uint32_t default_bin, uint32_t default_bin_for_zero, uint32_t threshold,
uint32_t default_bin, MissingType missing_type, bool default_left, uint32_t threshold,
data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices, BinType bin_type) const = 0;

Expand Down Expand Up @@ -417,10 +432,20 @@ class Bin {
};

inline uint32_t BinMapper::ValueToBin(double value) const {
if (std::isnan(value)) {
if (missing_type_ == MissingType::NaN) {
return num_bin_ - 1;
} else {
value = 0.0f;
}
}
if (bin_type_ == BinType::NumericalBin) {
// binary search to find bin
int l = 0;
int r = num_bin_ - 1;
if (missing_type_ == MissingType::NaN) {
r -= 1;
}
while (l < r) {
int m = (r + l - 1) / 2;
if (value <= bin_upper_bound_[m]) {
Expand Down
7 changes: 3 additions & 4 deletions include/LightGBM/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,8 @@ struct IOConfig: public ConfigBase {
int pred_early_stop_freq = 10;
/*! \brief Threshold of margin of pred_early_stop */
double pred_early_stop_margin = 10.0f;

/*! \brief Set to true to treat all zero values as missing values */
bool zero_as_missing = false;
/*! \brief Set to false to disable the special handling of missing values */
bool use_missing = true;
LIGHTGBM_EXPORT void Set(const std::unordered_map<std::string, std::string>& params) override;
};

Expand Down Expand Up @@ -219,8 +220,6 @@ struct TreeConfig: public ConfigBase {
int gpu_device_id = -1;
/*! \brief Set to true to use double precision math on GPU (default using single precision) */
bool gpu_use_dp = false;
/*! \brief Set to false to disable the handle of missing values */
bool use_missing = true;
LIGHTGBM_EXPORT void Set(const std::unordered_map<std::string, std::string>& params) override;
};

Expand Down Expand Up @@ -456,7 +455,7 @@ struct ParameterAlias {
"feature_fraction_seed", "enable_bundle", "data_filename", "valid_data_filenames",
"snapshot_freq", "verbosity", "sparse_threshold", "enable_load_from_binary_file",
"max_conflict_rate", "poisson_max_delta_step", "gaussian_eta",
"histogram_pool_size", "output_freq", "is_provide_training_metric", "machine_list_filename"
"histogram_pool_size", "output_freq", "is_provide_training_metric", "machine_list_filename", "zero_as_missing"
});
std::unordered_map<std::string, std::string> tmp_map;
for (const auto& pair : *params) {
Expand Down
4 changes: 2 additions & 2 deletions include/LightGBM/dataset.h
Original file line number Diff line number Diff line change
Expand Up @@ -402,12 +402,12 @@ class Dataset {
HistogramBinEntry* data) const;

inline data_size_t Split(int feature,
uint32_t threshold, uint32_t default_bin_for_zero,
uint32_t threshold, bool default_left,
data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const {
const int group = feature2group_[feature];
const int sub_feature = feature2subfeature_[feature];
return feature_groups_[group]->Split(sub_feature, threshold, default_bin_for_zero, data_indices, num_data, lte_indices, gt_indices);
return feature_groups_[group]->Split(sub_feature, threshold, default_left, data_indices, num_data, lte_indices, gt_indices);
}

inline int SubFeatureBinOffset(int i) const {
Expand Down
5 changes: 3 additions & 2 deletions include/LightGBM/feature_group.h
Original file line number Diff line number Diff line change
Expand Up @@ -161,14 +161,15 @@ class FeatureGroup {
inline data_size_t Split(
int sub_feature,
uint32_t threshold,
uint32_t default_bin_for_zero,
bool default_left,
data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const {

uint32_t min_bin = bin_offsets_[sub_feature];
uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
uint32_t default_bin = bin_mappers_[sub_feature]->GetDefaultBin();
return bin_data_->Split(min_bin, max_bin, default_bin, default_bin_for_zero,
auto missing_type = bin_mappers_[sub_feature]->missing_type();
return bin_data_->Split(min_bin, max_bin, default_bin, missing_type, default_left,
threshold, data_indices, num_data, lte_indices, gt_indices, bin_mappers_[sub_feature]->bin_type());
}
/*!
Expand Down
2 changes: 1 addition & 1 deletion include/LightGBM/meta.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ const score_t kMinScore = -std::numeric_limits<score_t>::infinity();

const score_t kEpsilon = 1e-15f;

const double kMissingValueRange = 1e-20f;
const double kZeroAsMissingValueRange = 1e-20f;

using ReduceFunction = std::function<void(const char*, char*, int)>;

Expand Down
87 changes: 63 additions & 24 deletions include/LightGBM/tree.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
namespace LightGBM {

#define kMaxTreeOutput (100)
/*! \brief Bit 0 of a node's decision_type; its value selects which entry of decision_funs is used for the node */
#define kCategoricalMask (1)
/*! \brief Bit 1 of a node's decision_type; when set, a missing value is converted to the threshold itself so it satisfies '<=' (left side) */
#define kDefaultLeftMask (2)

/*!
* \brief Tree model
Expand Down Expand Up @@ -44,15 +46,13 @@ class Tree {
* \param left_cnt Count of left child
* \param right_cnt Count of right child
* \param gain Split gain
* \param zero_bin bin value for value==0 (missing value)
* \param default_bin default conversion for the missing value, in bin
* \param default_value default conversion for the missing value, in float value
* \param missing_type missing type
* \param default_left default direction for missing value
* \return The index of new leaf.
*/
int Split(int leaf, int feature, BinType bin_type, uint32_t threshold, int real_feature,
double threshold_double, double left_value, double right_value,
data_size_t left_cnt, data_size_t right_cnt, double gain,
uint32_t zero_bin, uint32_t default_bin_for_zero, double default_value);
data_size_t left_cnt, data_size_t right_cnt, double gain, MissingType missing_type, bool default_left);

/*! \brief Get the output of one leaf */
inline double LeafOutput(int leaf) const { return leaf_value_[leaf]; }
Expand Down Expand Up @@ -127,7 +127,7 @@ class Tree {
std::string ToIfElse(int index, bool is_predict_leaf_index);

template<typename T>
static bool CategoricalDecision(T fval, T threshold) {
inline static bool CategoricalDecision(T fval, T threshold) {
if (static_cast<int>(fval) == static_cast<int>(threshold)) {
return true;
} else {
Expand All @@ -136,32 +136,75 @@ class Tree {
}

template<typename T>
static bool NumericalDecision(T fval, T threshold) {
inline static bool NumericalDecision(T fval, T threshold) {
if (fval <= threshold) {
return true;
} else {
return false;
}
}

static double DefaultValueForZero(double fval, double zero, double out) {
if (fval > -zero && fval <= zero) {
return out;
inline static bool IsZero(double fval) {
if (fval > -kZeroAsMissingValueRange && fval <= kZeroAsMissingValueRange) {
return true;
} else {
return fval;
return false;
}
}

static uint32_t DefaultValueForZero(uint32_t fval, uint32_t zero, uint32_t out) {
if (fval == zero) {
return out;
/*!
* \brief Test whether any bit selected by mask is set in decision_type
* \param decision_type Packed per-node flags
* \param mask Bit(s) to test, e.g. kCategoricalMask or kDefaultLeftMask
* \return True if at least one masked bit is set
*/
inline static bool GetDecisionType(int8_t decision_type, int8_t mask) {
  // Compare against zero instead of using '> 0': both operands are signed,
  // so a mask covering the top bit would yield a negative result.
  return (decision_type & mask) != 0;
}

/*!
* \brief Set or clear the bit(s) selected by mask in a packed decision_type
* \param decision_type Packed per-node flags, modified in place
* \param input True to set the masked bit(s), false to clear them
* \param mask Bit(s) to change, e.g. kCategoricalMask or kDefaultLeftMask
*/
inline static void SetDecisionType(int8_t* decision_type, bool input, int8_t mask) {
  if (input) {
    *decision_type = static_cast<int8_t>(*decision_type | mask);
  } else {
    // Clear with the complement of the mask so that every unrelated bit,
    // including the top one, keeps its value.
    *decision_type = static_cast<int8_t>(*decision_type & ~mask);
  }
}

/*!
* \brief Extract the two-bit missing type stored in bits 2-3 of decision_type
* \param decision_type Packed per-node flags
* \return Value in [0, 3] read from bits 2-3
*/
inline static int8_t GetMissingType(int8_t decision_type) {
  const int shifted = decision_type >> 2;
  return static_cast<int8_t>(shifted & 3);
}

/*!
* \brief Store a missing type in decision_type, starting at bit 2
* \param decision_type Packed per-node flags, modified in place; only its two lowest bits are kept
* \param input Missing type value to store (shifted left by two before being merged)
*/
inline static void SetMissingType(int8_t* decision_type, int8_t input) {
  // Keep the two low flag bits, drop everything above them, then merge the new type.
  const int low_flags = (*decision_type) & 3;
  *decision_type = static_cast<int8_t>(low_flags | (input << 2));
}

/*!
* \brief Redirect a missing bin to the side of the split chosen for missing values
* \param fval Bin value of the feature
* \param threshold Split threshold, in bin
* \param decision_type Packed per-node flags (missing type and default-left bit)
* \param default_bin Bin that counts as missing when the stored missing type is 1
* \param max_bin Bin that counts as missing when the stored missing type is 2
* \return threshold (default left) or threshold + 1 (default right) for a missing bin, otherwise fval unchanged
*/
inline static uint32_t ConvertMissingValue(uint32_t fval, uint32_t threshold, int8_t decision_type, uint32_t default_bin, uint32_t max_bin) {
  const uint8_t missing_type = GetMissingType(decision_type);
  const bool hits_default_bin = (missing_type == 1) && (fval == default_bin);
  const bool hits_max_bin = (missing_type == 2) && (fval == max_bin);
  if (!hits_default_bin && !hits_max_bin) {
    return fval;
  }
  // threshold satisfies '<=' (left child); threshold + 1 does not (right child).
  return GetDecisionType(decision_type, kDefaultLeftMask) ? threshold : threshold + 1;
}

/*!
* \brief Redirect a missing feature value to the side of the split chosen for missing values
* \param fval Raw feature value
* \param threshold Split threshold, in feature value
* \param decision_type Packed per-node flags (missing type and default-left bit)
* \return threshold (default left) or a value strictly greater than threshold (default right)
*         for a missing value, otherwise fval (with NaN replaced by zero when the stored missing type is not 2)
*/
inline static double ConvertMissingValue(double fval, double threshold, int8_t decision_type) {
  const uint8_t missing_type = GetMissingType(decision_type);
  // NaN is only kept when the stored missing type is 2; otherwise it is read as zero.
  if (std::isnan(fval) && missing_type != 2) {
    fval = 0.0f;
  }
  const bool is_missing = (missing_type == 1 && IsZero(fval))
                          || (missing_type == 2 && std::isnan(fval));
  if (!is_missing) {
    return fval;
  }
  if (GetDecisionType(decision_type, kDefaultLeftMask)) {
    return threshold;
  }
  // The default-right value must fail 'fval <= threshold' for every threshold.
  // 10 * threshold does not when threshold is zero or negative, so build the
  // value from the magnitude and add one.
  return 10.0 * std::fabs(threshold) + 1.0;
}

static const char* GetDecisionTypeName(int8_t type) {
inline static const char* GetDecisionTypeName(int8_t type) {
if (type == 0) {
return "no_greater";
} else {
Expand Down Expand Up @@ -204,12 +247,8 @@ class Tree {
std::vector<uint32_t> threshold_in_bin_;
/*! \brief A non-leaf node's split threshold in feature value */
std::vector<double> threshold_;
/*! \brief Decision type, 0 for '<='(numerical feature), 1 for 'is'(categorical feature) */
/*! \brief Store the information for categorical feature handling and missing value handling. */
std::vector<int8_t> decision_type_;
/*! \brief Default values for the na/0 feature values */
std::vector<double> default_value_;
std::vector<uint32_t> zero_bin_;
std::vector<uint32_t> default_bin_for_zero_;
/*! \brief A non-leaf node's split gain */
std::vector<double> split_gain_;
// used for leaf node
Expand Down Expand Up @@ -251,8 +290,8 @@ inline int Tree::GetLeaf(const double* feature_values) const {
int node = 0;
if (has_categorical_) {
while (node >= 0) {
double fval = DefaultValueForZero(feature_values[split_feature_[node]], kMissingValueRange, default_value_[node]);
if (decision_funs[decision_type_[node]](
double fval = ConvertMissingValue(feature_values[split_feature_[node]], threshold_[node], decision_type_[node]);
if (decision_funs[GetDecisionType(decision_type_[node], kCategoricalMask)](
fval,
threshold_[node])) {
node = left_child_[node];
Expand All @@ -262,7 +301,7 @@ inline int Tree::GetLeaf(const double* feature_values) const {
}
} else {
while (node >= 0) {
double fval = DefaultValueForZero(feature_values[split_feature_[node]], kMissingValueRange, default_value_[node]);
double fval = ConvertMissingValue(feature_values[split_feature_[node]], threshold_[node], decision_type_[node]);
if (NumericalDecision<double>(
fval,
threshold_[node])) {
Expand Down
10 changes: 5 additions & 5 deletions include/LightGBM/utils/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ inline static const char* Atof(const char* p, double* out) {
std::string tmp_str(p, cnt);
std::transform(tmp_str.begin(), tmp_str.end(), tmp_str.begin(), Common::tolower);
if (tmp_str == std::string("na") || tmp_str == std::string("nan")) {
*out = 0;
*out = NAN;
} else if (tmp_str == std::string("inf") || tmp_str == std::string("infinity")) {
*out = sign * 1e308;
} else {
Expand Down Expand Up @@ -513,10 +513,10 @@ inline static std::vector<int> VectorSize(const std::vector<std::vector<T>>& dat
}

inline static double AvoidInf(double x) {
if (x >= std::numeric_limits<double>::max()) {
return std::numeric_limits<double>::max();
} else if(x <= std::numeric_limits<double>::lowest()) {
return std::numeric_limits<double>::lowest();
if (x >= 1e300) {
return 1e300;
} else if(x <= -1e300) {
return -1e300;
} else {
return x;
}
Expand Down
4 changes: 2 additions & 2 deletions src/boosting/gbdt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -453,7 +453,7 @@ bool GBDT::TrainOneIter(const score_t* gradient, const score_t* hessian, bool is
auto label = train_data_->metadata().label();
double init_score = ObtainAutomaticInitialScore(objective_function_, label, num_data_);
std::unique_ptr<Tree> new_tree(new Tree(2));
new_tree->Split(0, 0, BinType::NumericalBin, 0, 0, 0, init_score, init_score, 0, 0, -1, 0, 0, 0);
new_tree->Split(0, 0, BinType::NumericalBin, 0, 0, 0, init_score, init_score, 0, 0, -1, MissingType::None, true);
train_score_updater_->AddScore(init_score, 0);
for (auto& score_updater : valid_score_updater_) {
score_updater->AddScore(init_score, 0);
Expand Down Expand Up @@ -532,7 +532,7 @@ bool GBDT::TrainOneIter(const score_t* gradient, const score_t* hessian, bool is
if (!class_need_train_[cur_tree_id] && models_.size() < static_cast<size_t>(num_tree_per_iteration_)) {
auto output = class_default_output_[cur_tree_id];
new_tree->Split(0, 0, BinType::NumericalBin, 0, 0, 0,
output, output, 0, 0, -1, 0, 0, 0);
output, output, 0, 0, -1, MissingType::None, true);
train_score_updater_->AddScore(output, cur_tree_id);
for (auto& score_updater : valid_score_updater_) {
score_updater->AddScore(output, cur_tree_id);
Expand Down
2 changes: 1 addition & 1 deletion src/boosting/rf.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ class RF: public GBDT {
double output = class_default_output_[cur_tree_id];
objective_function_->ConvertOutput(&output, &output);
new_tree->Split(0, 0, BinType::NumericalBin, 0, 0, 0,
output, output, 0, 0, -1, 0, 0, 0);
output, output, 0, 0, -1, MissingType::None, true);
train_score_updater_->AddScore(output, cur_tree_id);
for (auto& score_updater : valid_score_updater_) {
score_updater->AddScore(output, cur_tree_id);
Expand Down

0 comments on commit 00cb04a

Please sign in to comment.