Skip to content

Commit

Permalink
Speed up saving and loading model (#1083)
Browse files Browse the repository at this point in the history
* remove protobuf

* add version number

* remove pmml script

* use float for split gain

* fix warnings

* refine the read model logic of gbdt

* fix compile error

* improve decode speed

* fix some bugs

* fix double accuracy problem

* fix bug

* multi-thread save model

* speed up save model to string

* parallel save/load model

* fix some warnings.

* fix warnings.

* fix a bug

* remove debug output

* fix doc

* fix max_bin warning in tests.

* fix max_bin warning

* fix pylint

* clean code for stringToArray

* clean code for TToString

* remove max_bin

* replace "class" with typename
  • Loading branch information
guolinke committed Nov 26, 2017
1 parent 8d016c1 commit 8a5ec36
Show file tree
Hide file tree
Showing 26 changed files with 624 additions and 907 deletions.
3 changes: 0 additions & 3 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ env:
- TASK=if-else
- TASK=sdist PYTHON_VERSION=3.4
- TASK=bdist PYTHON_VERSION=3.5
- TASK=proto
- TASK=gpu METHOD=source
- TASK=gpu METHOD=pip

Expand All @@ -39,8 +38,6 @@ matrix:
env: TASK=pylint
- os: osx
env: TASK=check-docs
- os: osx
env: TASK=proto

before_install:
- test -n $CC && unset CC
Expand Down
12 changes: 0 additions & 12 deletions .travis/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -62,18 +62,6 @@ if [[ ${TASK} == "if-else" ]]; then
exit 0
fi

if [[ ${TASK} == "proto" ]]; then
conda install numpy
source activate test-env
mkdir build && cd build && cmake .. && make lightgbm || exit -1
cd $TRAVIS_BUILD_DIR/tests/cpp_test && ../../lightgbm config=train.conf && ../../lightgbm config=predict.conf output_result=origin.pred || exit -1
cd $TRAVIS_BUILD_DIR && git clone https://github.com/google/protobuf && cd protobuf && ./autogen.sh && ./configure && make && sudo make install && sudo ldconfig
cd $TRAVIS_BUILD_DIR/build && rm -rf * && cmake -DUSE_PROTO=ON .. && make lightgbm || exit -1
cd $TRAVIS_BUILD_DIR/tests/cpp_test && ../../lightgbm config=train.conf model_format=proto && ../../lightgbm config=predict.conf output_result=proto.pred model_format=proto || exit -1
cd $TRAVIS_BUILD_DIR/tests/cpp_test && python test.py || exit -1
exit 0
fi

conda install numpy nose scipy scikit-learn pandas matplotlib pytest

if [[ ${TASK} == "sdist" ]]; then
Expand Down
21 changes: 2 additions & 19 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -124,25 +124,8 @@ file(GLOB SOURCES
src/treelearner/*.cpp
)

if (USE_PROTO)
if(MSVC)
message(FATAL_ERROR "Cannot use proto with MSVC.")
endif(MSVC)
find_package(Protobuf REQUIRED)
PROTOBUF_GENERATE_CPP(PROTO_SRCS PROTO_HDRS proto/model.proto)
include_directories(${PROTOBUF_INCLUDE_DIRS})
include_directories(${CMAKE_CURRENT_BINARY_DIR})
ADD_DEFINITIONS(-DUSE_PROTO)
SET(PROTO_FILES src/proto/gbdt_model_proto.cpp ${PROTO_HDRS} ${PROTO_SRCS})
endif(USE_PROTO)

add_executable(lightgbm src/main.cpp ${SOURCES} ${PROTO_FILES})
add_library(_lightgbm SHARED src/c_api.cpp src/lightgbm_R.cpp ${SOURCES} ${PROTO_FILES})

if (USE_PROTO)
TARGET_LINK_LIBRARIES(lightgbm ${PROTOBUF_LIBRARIES})
TARGET_LINK_LIBRARIES(_lightgbm ${PROTOBUF_LIBRARIES})
endif(USE_PROTO)
add_executable(lightgbm src/main.cpp ${SOURCES})
add_library(_lightgbm SHARED src/c_api.cpp src/lightgbm_R.cpp ${SOURCES})

if(MSVC)
set_target_properties(_lightgbm PROPERTIES OUTPUT_NAME "lib_lightgbm")
Expand Down
15 changes: 0 additions & 15 deletions docs/Installation-Guide.rst
Original file line number Diff line number Diff line change
Expand Up @@ -271,21 +271,6 @@ Following procedure is for the MSVC (Microsoft Visual C++) build.
**Note**: ``C:\local\boost_1_64_0\`` and ``C:\local\boost_1_64_0\lib64-msvc-14.0`` are locations of your Boost binaries. You can also set them as environment variables to avoid the ``Set ...`` commands when building.

Protobuf Support
^^^^^^^^^^^^^^^^

If you want to use protobuf to save and load models, install `protobuf c++ version <https://github.com/google/protobuf/blob/master/src/README.md>`__ first.

Then run cmake with USE_PROTO on, for example:

.. code::
cmake -DUSE_PROTO=ON ..
You can then use ``model_format=proto`` in parameters when save and load models.

**Note**: for Windows users, this is only tested with MinGW.

Docker
^^^^^^

Expand Down
14 changes: 0 additions & 14 deletions docs/Parameters.rst
Original file line number Diff line number Diff line change
Expand Up @@ -335,20 +335,6 @@ IO Parameters

- file name of prediction result in ``prediction`` task

- ``model_format``, default=\ ``text``, type=multi-enum, options=\ ``text``, ``proto``

- format to save and load model

- if ``text``, text string will be used

- if ``proto``, Protocol Buffer binary format will be used

- you can save in multiple formats by joining them with comma, like ``text,proto``. In this case, ``model_format`` will be added as a suffix after ``output_model``

- **Note**: loading with multiple formats is not supported

- **Note**: to use this parameter you need to `build version with Protobuf Support <./Installation-Guide.rst#protobuf-support>`__

- ``pre_partition``, default=\ ``false``, type=bool, alias=\ ``is_pre_partition``

- used for parallel learning (not include feature parallel)
Expand Down
29 changes: 5 additions & 24 deletions include/LightGBM/boosting.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,6 @@
#include <LightGBM/meta.h>
#include <LightGBM/config.h>

#ifdef USE_PROTO
#include "model.pb.h"
#endif // USE_PROTO

#include <vector>
#include <string>
#include <map>
Expand Down Expand Up @@ -198,26 +194,11 @@ class LIGHTGBM_EXPORT Boosting {

/*!
* \brief Restore from a serialized string
* \param model_str The string of model
* \return true if succeeded
*/
virtual bool LoadModelFromString(const std::string& model_str) = 0;

#ifdef USE_PROTO
/*!
* \brief Save model with protobuf
* \param num_iterations Number of model that want to save, -1 means save all
* \param filename Filename that want to save to
*/
virtual void SaveModelToProto(int num_iteration, const char* filename) const = 0;

/*!
* \brief Restore from a serialized protobuf file
* \param filename Filename that want to restore from
* \param buffer The content of model
* \param len The length of buffer
* \return true if succeeded
*/
virtual bool LoadModelFromProto(const char* filename) = 0;
#endif // USE_PROTO
virtual bool LoadModelFromString(const char* buffer, size_t len) = 0;

/*!
* \brief Calculate feature importances
Expand Down Expand Up @@ -283,7 +264,7 @@ class LIGHTGBM_EXPORT Boosting {
/*! \brief Disable copy */
Boosting(const Boosting&) = delete;

static bool LoadFileToBoosting(Boosting* boosting, const std::string& format, const char* filename);
static bool LoadFileToBoosting(Boosting* boosting, const char* filename);

/*!
* \brief Create boosting object
Expand All @@ -293,7 +274,7 @@ class LIGHTGBM_EXPORT Boosting {
* \param filename name of model file, if existing will continue to train from this model
* \return The boosting object
*/
static Boosting* CreateBoosting(const std::string& type, const std::string& format, const char* filename);
static Boosting* CreateBoosting(const std::string& type, const char* filename);

};

Expand Down
3 changes: 1 addition & 2 deletions include/LightGBM/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,6 @@ struct IOConfig: public ConfigBase {
std::string output_result = "LightGBM_predict_result.txt";
std::string convert_model = "gbdt_prediction.cpp";
std::string input_model = "";
std::string model_format = "text";
int verbosity = 1;
int num_iteration_predict = -1;
bool is_pre_partition = false;
Expand Down Expand Up @@ -449,7 +448,7 @@ struct ParameterAlias {
const std::unordered_set<std::string> parameter_set({
"config", "config_file", "task", "device",
"num_threads", "seed", "boosting_type", "objective", "data",
"output_model", "input_model", "output_result", "model_format", "valid_data",
"output_model", "input_model", "output_result", "valid_data",
"is_enable_sparse", "is_pre_partition", "is_training_metric",
"ndcg_eval_at", "min_data_in_leaf", "min_sum_hessian_in_leaf",
"num_leaves", "feature_fraction", "num_iterations",
Expand Down
32 changes: 9 additions & 23 deletions include/LightGBM/tree.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,6 @@

#include <LightGBM/meta.h>
#include <LightGBM/dataset.h>
#ifdef USE_PROTO
#include "model.pb.h"
#endif // USE_PROTO

#include <string>
#include <vector>
Expand All @@ -32,15 +29,9 @@ class Tree {
/*!
* \brief Constructor, from a string
* \param str Model string
* \param used_len Number of characters of str that were consumed
*/
explicit Tree(const std::string& str);
#ifdef USE_PROTO
/*!
* \brief Construtor, from a protobuf object
* \param model_tree Model protobuf object
*/
explicit Tree(const Model_Tree& model_tree);
#endif // USE_PROTO
Tree(const char* str, size_t* used_len);

~Tree();

Expand All @@ -62,7 +53,7 @@ class Tree {
*/
int Split(int leaf, int feature, int real_feature, uint32_t threshold_bin,
double threshold_double, double left_value, double right_value,
data_size_t left_cnt, data_size_t right_cnt, double gain, MissingType missing_type, bool default_left);
int left_cnt, int right_cnt, float gain, MissingType missing_type, bool default_left);

/*!
* \brief Performing a split on tree leaves, with categorical feature
Expand All @@ -82,7 +73,7 @@ class Tree {
*/
int SplitCategorical(int leaf, int feature, int real_feature, const uint32_t* threshold_bin, int num_threshold_bin,
const uint32_t* threshold, int num_threshold, double left_value, double right_value,
data_size_t left_cnt, data_size_t right_cnt, double gain, MissingType missing_type);
int left_cnt, int right_cnt, float gain, MissingType missing_type);

/*! \brief Get the output of one leaf */
inline double LeafOutput(int leaf) const { return leaf_value_[leaf]; }
Expand Down Expand Up @@ -179,11 +170,6 @@ class Tree {
/*! \brief Serialize this object to if-else statement*/
std::string ToIfElse(int index, bool is_predict_leaf_index) const;

#ifdef USE_PROTO
/*! \brief Serialize this object to protobuf object*/
void ToProto(Model_Tree& model_tree) const;
#endif // USE_PROTO

inline static bool IsZero(double fval) {
if (fval > -kZeroAsMissingValueRange && fval <= kZeroAsMissingValueRange) {
return true;
Expand Down Expand Up @@ -304,7 +290,7 @@ class Tree {
}

inline void Split(int leaf, int feature, int real_feature,
double left_value, double right_value, data_size_t left_cnt, data_size_t right_cnt, double gain);
double left_value, double right_value, int left_cnt, int right_cnt, float gain);
/*!
* \brief Find leaf index of which record belongs by features
* \param feature_values Feature value of this record
Expand Down Expand Up @@ -385,25 +371,25 @@ class Tree {
/*! \brief Store the information for categorical feature handle and mising value handle. */
std::vector<int8_t> decision_type_;
/*! \brief A non-leaf node's split gain */
std::vector<double> split_gain_;
std::vector<float> split_gain_;
// used for leaf node
/*! \brief The parent of leaf */
std::vector<int> leaf_parent_;
/*! \brief Output of leaves */
std::vector<double> leaf_value_;
/*! \brief DataCount of leaves */
std::vector<data_size_t> leaf_count_;
std::vector<int> leaf_count_;
/*! \brief Output of non-leaf nodes */
std::vector<double> internal_value_;
/*! \brief DataCount of non-leaf nodes */
std::vector<data_size_t> internal_count_;
std::vector<int> internal_count_;
/*! \brief Depth for leaves */
std::vector<int> leaf_depth_;
double shrinkage_;
};

inline void Tree::Split(int leaf, int feature, int real_feature,
double left_value, double right_value, data_size_t left_cnt, data_size_t right_cnt, double gain) {
double left_value, double right_value, int left_cnt, int right_cnt, float gain) {
int new_node_idx = num_leaves_ - 1;
// update parent info
int parent = leaf_parent_[leaf];
Expand Down

0 comments on commit 8a5ec36

Please sign in to comment.