Skip to content

Commit

Permalink
Add func to handle sparse testing data (#1045)
Browse files Browse the repository at this point in the history
* first commit

* fix bug

* fix by commits

* fix by commit

* add funcs to IfElse

* fix bug

* fix bug

* fix bug

* change tab to space
  • Loading branch information
ww authored and guolinke committed Nov 15, 2017
1 parent 302f84b commit ba5c745
Show file tree
Hide file tree
Showing 7 changed files with 270 additions and 13 deletions.
12 changes: 12 additions & 0 deletions include/LightGBM/boosting.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

#include <vector>
#include <string>
#include <map>

namespace LightGBM {

Expand Down Expand Up @@ -120,6 +121,10 @@ class LIGHTGBM_EXPORT Boosting {
virtual void PredictRaw(const double* features, double* output,
const PredictionEarlyStopInstance* early_stop) const = 0;

/*!
* \brief Map-based counterpart of PredictRaw for one record
* \param features Feature values of this record, keyed by feature index
* \param output Prediction result for this record
* \param early_stop Early stopping instance passed through to the implementation
*/
virtual void PredictRawByMap(const std::unordered_map<int, double>& features, double* output,
const PredictionEarlyStopInstance* early_stop) const = 0;


/*!
* \brief Prediction for one record, sigmoid transformation will be used if needed
* \param feature_values Feature value on this record
Expand All @@ -129,6 +134,10 @@ class LIGHTGBM_EXPORT Boosting {
virtual void Predict(const double* features, double* output,
const PredictionEarlyStopInstance* early_stop) const = 0;

/*!
* \brief Map-based counterpart of Predict for one record
* \param features Feature values of this record, keyed by feature index
* \param output Prediction result for this record
* \param early_stop Early stopping instance passed through to the implementation
*/
virtual void PredictByMap(const std::unordered_map<int, double>& features, double* output,
const PredictionEarlyStopInstance* early_stop) const = 0;


/*!
* \brief Prediction for one record with leaf index
* \param feature_values Feature value on this record
Expand All @@ -137,6 +146,9 @@ class LIGHTGBM_EXPORT Boosting {
virtual void PredictLeafIndex(
const double* features, double* output) const = 0;

/*!
* \brief Map-based counterpart of PredictLeafIndex for one record
* \param features Feature values of this record, keyed by feature index
* \param output Prediction result for this record
*/
virtual void PredictLeafIndexByMap(
const std::unordered_map<int, double>& features, double* output) const = 0;

/*!
* \brief Feature contributions for the model's prediction of one record
* \param feature_values Feature value on this record
Expand Down
40 changes: 40 additions & 0 deletions include/LightGBM/tree.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include <string>
#include <vector>
#include <memory>
#include <map>
#include <unordered_map>

namespace LightGBM {

Expand Down Expand Up @@ -118,8 +119,11 @@ class Tree {
* \return Prediction result
*/
inline double Predict(const double* feature_values) const;
/*!
* \brief Prediction on one record given as a map from feature index to value
* \param feature_values Feature values of this record, keyed by feature index
* \return Prediction result
*/
inline double PredictByMap(const std::unordered_map<int, double>& feature_values) const;

inline int PredictLeafIndex(const double* feature_values) const;
/*!
* \brief Leaf index reached by one record given as a map from feature index to value
* \param feature_values Feature values of this record, keyed by feature index
* \return Leaf index (0 when the tree has a single leaf)
*/
inline int PredictLeafIndexByMap(const std::unordered_map<int, double>& feature_values) const;


inline void PredictContrib(const double* feature_values, int num_features, double* output);

Expand Down Expand Up @@ -307,13 +311,16 @@ class Tree {
* \return Leaf index
*/
inline int GetLeaf(const double* feature_values) const;
/*!
* \brief Find the leaf for one record given as a map from feature index to value;
*        features absent from the map are evaluated as 0
* \param feature_values Feature values of this record, keyed by feature index
* \return Leaf index
*/
inline int GetLeafByMap(const std::unordered_map<int, double>& feature_values) const;

/*! \brief Serialize one node to json*/
std::string NodeToJSON(int index) const;

/*! \brief Serialize one node to if-else statement*/
std::string NodeToIfElse(int index, bool is_predict_leaf_index) const;

/*! \brief Map-based variant of NodeToIfElse (NOTE(review): definition is not visible here; presumably emits if-else code that reads features from an unordered_map - confirm)*/
std::string NodeToIfElseByMap(int index, bool is_predict_leaf_index) const;

double ExpectedValue() const;

int MaxDepth();
Expand Down Expand Up @@ -440,6 +447,15 @@ inline double Tree::Predict(const double* feature_values) const {
}
}

/*! \brief Output of this tree for one record supplied as a feature-index -> value map */
inline double Tree::PredictByMap(const std::unordered_map<int, double>& feature_values) const {
  // A tree without any split only has its first leaf value to return.
  if (num_leaves_ <= 1) {
    return leaf_value_[0];
  }
  return LeafOutput(GetLeafByMap(feature_values));
}

inline int Tree::PredictLeafIndex(const double* feature_values) const {
if (num_leaves_ > 1) {
int leaf = GetLeaf(feature_values);
Expand All @@ -449,6 +465,15 @@ inline int Tree::PredictLeafIndex(const double* feature_values) const {
}
}

/*! \brief Index of the leaf reached by one record supplied as a feature-index -> value map */
inline int Tree::PredictLeafIndexByMap(const std::unordered_map<int, double>& feature_values) const {
  // With a single leaf there is nothing to traverse; the only leaf has index 0.
  return (num_leaves_ > 1) ? GetLeafByMap(feature_values) : 0;
}

inline void Tree::PredictContrib(const double* feature_values, int num_features, double* output) {
output[num_features] += ExpectedValue();
// Run the recursion with preallocated space for the unique path data
Expand Down Expand Up @@ -484,6 +509,21 @@ inline int Tree::GetLeaf(const double* feature_values) const {
return ~node;
}

/*!
* \brief Walk the tree for one record supplied as a feature-index -> value map
* \param feature_values Feature values keyed by feature index; a feature that is
*        absent from the map is evaluated as 0.0
* \return Leaf index (internal nodes are non-negative, a leaf is encoded as ~leaf)
*/
inline int Tree::GetLeafByMap(const std::unordered_map<int, double>& feature_values) const {
  int node = 0;
  const auto not_found = feature_values.end();
  if (num_cat_ > 0) {
    while (node >= 0) {
      // One hash lookup per node instead of count() followed by at().
      const auto it = feature_values.find(split_feature_[node]);
      node = Decision(it != not_found ? it->second : 0.0, node);
    }
  } else {
    while (node >= 0) {
      const auto it = feature_values.find(split_feature_[node]);
      node = NumericalDecision(it != not_found ? it->second : 0.0, node);
    }
  }
  return ~node;
}


} // namespace LightGBM

#endif // LightGBM_TREE_H_
59 changes: 46 additions & 13 deletions src/application/predictor.hpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
#ifndef LIGHTGBM_PREDICTOR_HPP_
#define LIGHTGBM_PREDICTOR_HPP_

// Thresholds the Predictor uses to pick the map-based prediction path over the dense buffer:
// the map path is taken only when the model has more than MAX_FEATURE features ...
#define MAX_FEATURE 10000
// ... and the ratio (number of features) / (entries stored in the row) exceeds SPARSITY.
#define SPARSITY 100

#include <LightGBM/meta.h>
#include <LightGBM/boosting.h>
#include <LightGBM/utils/text_reader.h>
Expand Down Expand Up @@ -58,16 +61,21 @@ class Predictor {
num_pred_one_row_ = boosting_->NumPredictOneRow(num_iteration, is_predict_leaf_index, is_predict_contrib);
num_feature_ = boosting_->MaxFeatureIdx() + 1;
predict_buf_ = std::vector<std::vector<double>>(num_threads_, std::vector<double>(num_feature_, 0.0f));

predict_buf_map_ = std::vector<std::unordered_map<int, double>>(num_threads_);
if (is_predict_leaf_index) {
// Predict leaf indices for one row; very sparse rows on wide models go through the map-based path.
predict_fun_ = [this](const std::vector<std::pair<int, double>>& features, double* output) {
  int tid = omp_get_thread_num();
  // A row with no stored entries is treated as sparse; testing empty() first
  // also prevents the integer division by zero the ratio would otherwise hit.
  const bool use_map = num_feature_ > MAX_FEATURE &&
    (features.empty() || num_feature_ / static_cast<int>(features.size()) > SPARSITY);
  if (use_map) {
    CopyToPredictMap(tid, features);
    boosting_->PredictLeafIndexByMap(predict_buf_map_[tid], output);
    ClearPredictMap(tid);
  } else {
    CopyToPredictBuffer(predict_buf_[tid].data(), features);
    // get result for leaf index
    boosting_->PredictLeafIndex(predict_buf_[tid].data(), output);
    ClearPredictBuffer(predict_buf_[tid].data(), predict_buf_[tid].size(), features);
  }
};

} else if (is_predict_contrib) {
predict_fun_ = [this](const std::vector<std::pair<int, double>>& features, double* output) {
int tid = omp_get_thread_num();
Expand All @@ -76,21 +84,32 @@ class Predictor {
boosting_->PredictContrib(predict_buf_[tid].data(), output, &early_stop_);
ClearPredictBuffer(predict_buf_[tid].data(), predict_buf_[tid].size(), features);
};

} else {
if (is_raw_score) {
// Predict raw scores for one row; very sparse rows on wide models go through the map-based path.
predict_fun_ = [this](const std::vector<std::pair<int, double>>& features, double* output) {
  int tid = omp_get_thread_num();
  // A row with no stored entries is treated as sparse; testing empty() first
  // also prevents the integer division by zero the ratio would otherwise hit.
  const bool use_map = num_feature_ > MAX_FEATURE &&
    (features.empty() || num_feature_ / static_cast<int>(features.size()) > SPARSITY);
  if (use_map) {
    CopyToPredictMap(tid, features);
    boosting_->PredictRawByMap(predict_buf_map_[tid], output, &early_stop_);
    ClearPredictMap(tid);
  } else {
    CopyToPredictBuffer(predict_buf_[tid].data(), features);
    boosting_->PredictRaw(predict_buf_[tid].data(), output, &early_stop_);
    ClearPredictBuffer(predict_buf_[tid].data(), predict_buf_[tid].size(), features);
  }
};
} else {
// Predict (transformed) scores for one row; very sparse rows on wide models go through the map-based path.
predict_fun_ = [this](const std::vector<std::pair<int, double>>& features, double* output) {
  int tid = omp_get_thread_num();
  // A row with no stored entries is treated as sparse; testing empty() first
  // also prevents the integer division by zero the ratio would otherwise hit.
  const bool use_map = num_feature_ > MAX_FEATURE &&
    (features.empty() || num_feature_ / static_cast<int>(features.size()) > SPARSITY);
  if (use_map) {
    CopyToPredictMap(tid, features);
    boosting_->PredictByMap(predict_buf_map_[tid], output, &early_stop_);
    ClearPredictMap(tid);
  } else {
    CopyToPredictBuffer(predict_buf_[tid].data(), features);
    boosting_->Predict(predict_buf_[tid].data(), output, &early_stop_);
    ClearPredictBuffer(predict_buf_[tid].data(), predict_buf_[tid].size(), features);
  }
};
}
}
Expand Down Expand Up @@ -225,6 +244,19 @@ class Predictor {
}
}

/*!
* \brief Store one row's (feature index, value) pairs in the map buffer of thread tid
* \param tid Index into predict_buf_map_
* \param features Pairs of feature index and value; indices >= num_feature_ are dropped
*/
void CopyToPredictMap(int tid, const std::vector<std::pair<int, double>>& features) {
  for (const auto& entry : features) {
    // Skip indices at or beyond num_feature_.
    if (entry.first >= num_feature_) {
      continue;
    }
    predict_buf_map_[tid][entry.first] = entry.second;
  }
}

/*! \brief Empty the map buffer of thread tid so it can be reused for the next row */
void ClearPredictMap(int tid) {
  auto& row_map = predict_buf_map_[tid];
  row_map.clear();
}

/*! \brief Boosting model */
const Boosting* boosting_;
/*! \brief function for prediction */
Expand All @@ -234,6 +266,7 @@ class Predictor {
int num_pred_one_row_;
int num_threads_;
std::vector<std::vector<double>> predict_buf_;
std::vector<std::unordered_map<int, double>> predict_buf_map_;
};

} // namespace LightGBM
Expand Down
9 changes: 9 additions & 0 deletions src/boosting/gbdt.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include <fstream>
#include <memory>
#include <mutex>
#include <map>

namespace LightGBM {

Expand Down Expand Up @@ -186,11 +187,19 @@ class GBDT: public GBDTBase {
void PredictRaw(const double* features, double* output,
const PredictionEarlyStopInstance* earlyStop) const override;

/*! \brief Map-based counterpart of PredictRaw; features are keyed by feature index */
void PredictRawByMap(const std::unordered_map<int, double>& features, double* output,
const PredictionEarlyStopInstance* early_stop) const override;

void Predict(const double* features, double* output,
const PredictionEarlyStopInstance* earlyStop) const override;

/*! \brief Map-based counterpart of Predict; features are keyed by feature index */
void PredictByMap(const std::unordered_map<int, double>& features, double* output,
const PredictionEarlyStopInstance* early_stop) const override;

void PredictLeafIndex(const double* features, double* output) const override;

/*! \brief Map-based counterpart of PredictLeafIndex; features are keyed by feature index */
void PredictLeafIndexByMap(const std::unordered_map<int, double>& features, double* output) const override;

void PredictContrib(const double* features, double* output,
const PredictionEarlyStopInstance* earlyStop) const override;

Expand Down
63 changes: 63 additions & 0 deletions src/boosting/gbdt_model_text.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,37 @@ std::string GBDT::ModelToIfElse(int num_iteration) const {
str_buf << "}" << std::endl;
str_buf << std::endl;

// PredictRawByMap
// Emit a table of function pointers, one entry "PredictTree<i>ByMap" per used model,
// so the generated PredictRawByMap can dispatch by tree index.
str_buf << "double (*PredictTreeByMapPtr[])(const std::unordered_map<int, double>&) = { ";
for (int i = 0; i < num_used_model; ++i) {
if (i > 0) {
str_buf << " , ";
}
str_buf << "PredictTree" << i << "ByMap";
}
str_buf << " };" << std::endl << std::endl;

// Body of the generated function: zero the output, accumulate every tree's result
// per iteration, and consult the early-stop callback every round_period iterations.
std::stringstream pred_str_buf_map;

pred_str_buf_map << "\t" << "int early_stop_round_counter = 0;" << std::endl;
pred_str_buf_map << "\t" << "std::memset(output, 0, sizeof(double) * num_tree_per_iteration_);" << std::endl;
pred_str_buf_map << "\t" << "for (int i = 0; i < num_iteration_for_pred_; ++i) {" << std::endl;
pred_str_buf_map << "\t\t" << "for (int k = 0; k < num_tree_per_iteration_; ++k) {" << std::endl;
pred_str_buf_map << "\t\t\t" << "output[k] += (*PredictTreeByMapPtr[i * num_tree_per_iteration_ + k])(features);" << std::endl;
pred_str_buf_map << "\t\t" << "}" << std::endl;
pred_str_buf_map << "\t\t" << "++early_stop_round_counter;" << std::endl;
pred_str_buf_map << "\t\t" << "if (early_stop->round_period == early_stop_round_counter) {" << std::endl;
pred_str_buf_map << "\t\t\t" << "if (early_stop->callback_function(output, num_tree_per_iteration_))" << std::endl;
pred_str_buf_map << "\t\t\t\t" << "return;" << std::endl;
pred_str_buf_map << "\t\t\t" << "early_stop_round_counter = 0;" << std::endl;
pred_str_buf_map << "\t\t" << "}" << std::endl;
pred_str_buf_map << "\t" << "}" << std::endl;

// Wrap the body in the GBDT::PredictRawByMap definition.
str_buf << "void GBDT::PredictRawByMap(const std::unordered_map<int, double>& features, double* output, const PredictionEarlyStopInstance* early_stop) const {" << std::endl;
str_buf << pred_str_buf_map.str();
str_buf << "}" << std::endl;
str_buf << std::endl;

// Predict
str_buf << "void GBDT::Predict(const double* features, double *output, const PredictionEarlyStopInstance* early_stop) const {" << std::endl;
str_buf << "\t" << "PredictRaw(features, output, early_stop);" << std::endl;
Expand All @@ -115,6 +146,21 @@ std::string GBDT::ModelToIfElse(int num_iteration) const {
str_buf << "}" << std::endl;
str_buf << std::endl;

// PredictByMap
// Emit GBDT::PredictByMap: it calls PredictRawByMap, then either divides each output by
// num_iteration_for_pred_ (average_output_) or, when an objective function is present,
// applies its ConvertOutput in place.
str_buf << "void GBDT::PredictByMap(const std::unordered_map<int, double>& features, double* output, const PredictionEarlyStopInstance* early_stop) const {" << std::endl;
str_buf << "\t" << "PredictRawByMap(features, output, early_stop);" << std::endl;
str_buf << "\t" << "if (average_output_) {" << std::endl;
str_buf << "\t\t" << "for (int k = 0; k < num_tree_per_iteration_; ++k) {" << std::endl;
str_buf << "\t\t\t" << "output[k] /= num_iteration_for_pred_;" << std::endl;
str_buf << "\t\t" << "}" << std::endl;
str_buf << "\t" << "}" << std::endl;
str_buf << "\t" << "else if (objective_function_ != nullptr) {" << std::endl;
str_buf << "\t\t" << "objective_function_->ConvertOutput(output, output);" << std::endl;
str_buf << "\t" << "}" << std::endl;
str_buf << "}" << std::endl;
str_buf << std::endl;


// PredictLeafIndex
for (int i = 0; i < num_used_model; ++i) {
str_buf << models_[i]->ToIfElse(i, true) << std::endl;
Expand All @@ -136,6 +182,23 @@ std::string GBDT::ModelToIfElse(int num_iteration) const {
str_buf << "\t" << "}" << std::endl;
str_buf << "}" << std::endl;

// PredictLeafIndexByMap
// Emit a table of function pointers, one entry "PredictTree<i>LeafByMap" per used model.
str_buf << "double (*PredictTreeLeafByMapPtr[])(const std::unordered_map<int, double>&) = { ";
for (int i = 0; i < num_used_model; ++i) {
if (i > 0) {
str_buf << " , ";
}
str_buf << "PredictTree" << i << "LeafByMap";
}
str_buf << " };" << std::endl << std::endl;

// Emit GBDT::PredictLeafIndexByMap: one output slot per tree, filled from the table above.
str_buf << "void GBDT::PredictLeafIndexByMap(const std::unordered_map<int, double>& features, double* output) const {" << std::endl;
str_buf << "\t" << "int total_tree = num_iteration_for_pred_ * num_tree_per_iteration_;" << std::endl;
str_buf << "\t" << "for (int i = 0; i < total_tree; ++i) {" << std::endl;
str_buf << "\t\t" << "output[i] = (*PredictTreeLeafByMapPtr[i])(features);" << std::endl;
str_buf << "\t" << "}" << std::endl;
str_buf << "}" << std::endl;

str_buf << "} // namespace LightGBM" << std::endl;

return str_buf.str();
Expand Down

0 comments on commit ba5c745

Please sign in to comment.