Skip to content

Commit

Permalink
refine api of constructing from sampling data.
Browse files Browse the repository at this point in the history
  • Loading branch information
guolinke committed Mar 21, 2017
1 parent 4c7f11a commit c060ca7
Show file tree
Hide file tree
Showing 10 changed files with 144 additions and 250 deletions.
3 changes: 2 additions & 1 deletion include/LightGBM/bin.h
Original file line number Diff line number Diff line change
Expand Up @@ -124,13 +124,14 @@ class BinMapper {
/*!
* \brief Construct the feature-value-to-bin mapper according to the feature values
* \param values (Sampled) values of this feature, Note: not include zero.
* \param num_values number of values.
* \param total_sample_cnt total number of sampled rows, equal to the number of (non-zero) values + num_zeros
* \param max_bin The maximal number of bin
* \param min_data_in_bin min number of data in one bin
* \param min_split_data
* \param bin_type Type of this bin
*/
void FindBin(std::vector<double>& values, size_t total_sample_cnt, int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type);
void FindBin(double* values, int num_values, size_t total_sample_cnt, int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type);

/*!
* \brief Use specific number of bin to calculate the size of this class
Expand Down
60 changes: 13 additions & 47 deletions include/LightGBM/c_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@

#include <LightGBM/export.h>

typedef void* ArrayHandle;
typedef void* DatasetHandle;
typedef void* BoosterHandle;

Expand Down Expand Up @@ -53,52 +52,25 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromFile(const char* filename,
DatasetHandle* out);

/*!
* \brief create a empty dataset by sampling matrix, if num_sample_row == num_total_row, will construct this dataset.
* Need call LGBM_DatasetPushRows/LGBM_DatasetPushRowsByCSR after calling this function.
* \param data pointer to the data space
* \param data_type type of data pointer, can be C_API_DTYPE_FLOAT32 or C_API_DTYPE_FLOAT64
* \param num_sample_row number of rows
* \brief create an empty dataset from sampled data.
* \param sample_data sampled data, grouped by the column.
* \param sample_indices indices of sampled data.
* \param ncol number of columns
* \param num_per_col Size of each sampling column
* \param num_sample_row Number of sampled rows
* \param num_total_row number of total rows
* \param parameters additional parameters
* \param out created dataset
* \return 0 when succeed, -1 when failure happens
*/
LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromSampledMat(const void* data,
int data_type,
int32_t num_sample_row,
int32_t ncol,
int32_t num_total_row,
const char* parameters,
DatasetHandle* out);

/*!
* \brief create a empty dataset by sampling CSR data, if num_sample_row == num_total_row, will construct this dataset.
* Need call LGBM_DatasetPushRows/LGBM_DatasetPushRowsByCSR after calling this function.
* \param indptr pointer to row headers
* \param indptr_type type of indptr, can be C_API_DTYPE_INT32 or C_API_DTYPE_INT64
* \param indices findex
* \param data fvalue
* \param data_type type of data pointer, can be C_API_DTYPE_FLOAT32 or C_API_DTYPE_FLOAT64
* \param nindptr number of rows in the matrix + 1
* \param n_sample_elem number of nonzero elements in the matrix
* \param num_col number of columns
* \param num_total_row number of total rows
* \param parameters additional parameters
* \param out created dataset
* \return 0 when succeed, -1 when failure happens
*/
LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromSampledCSR(const void* indptr,
int indptr_type,
const int32_t* indices,
const void* data,
int data_type,
int64_t nindptr,
int64_t n_sample_elem,
int64_t num_col,
int64_t num_total_row,
const char* parameters,
DatasetHandle* out);
LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromSampledColumn(double** sample_data,
int** sample_indices,
int32_t ncol,
const int* num_per_col,
int32_t num_sample_row,
int32_t num_total_row,
const char* parameters,
DatasetHandle* out);

/*!
* \brief create an empty dataset by referencing an existing Dataset
Expand Down Expand Up @@ -769,10 +741,4 @@ catch(std::string& ex) { return LGBM_APIHandleException(ex); } \
catch(...) { return LGBM_APIHandleException("unknown exception"); } \
return 0;

LIGHTGBM_C_EXPORT int LGBM_AllocateArray(int64_t len, int type, ArrayHandle* out);

LIGHTGBM_C_EXPORT int LGBM_CopyToArray(ArrayHandle arr, int type, int64_t start_idx, const void* src, int64_t len);

LIGHTGBM_C_EXPORT int LGBM_FreeArray(ArrayHandle arr, int type);

#endif // LIGHTGBM_C_API_H_
11 changes: 6 additions & 5 deletions include/LightGBM/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -114,19 +114,19 @@ struct IOConfig: public ConfigBase {
std::string label_column = "";
/*! \brief Index or column name of weight, < 0 means not used
* Add a prefix "name:" when using a column name
* Note: when using Index, it dosen't count the label index */
* Note: when using Index, it doesn't count the label index */
std::string weight_column = "";
/*! \brief Index or column name of group/query id, < 0 means not used
* Add a prefix "name:" when using a column name
* Note: when using Index, it dosen't count the label index */
* Note: when using Index, it doesn't count the label index */
std::string group_column = "";
/*! \brief ignored features, separate by ','
* Add a prefix "name:" when using a column name
* Note: when using Index, it dosen't count the label index */
* Note: when using Index, it doesn't count the label index */
std::string ignore_column = "";
/*! \brief specify categorical columns. Note: only integer-typed categorical features are supported
* Add a prefix "name:" when using a column name
* Note: when using Index, it dosen't count the label index */
* Note: when using Index, it doesn't count the label index */
std::string categorical_column = "";
LIGHTGBM_EXPORT void Set(const std::unordered_map<std::string, std::string>& params) override;
};
Expand Down Expand Up @@ -398,7 +398,8 @@ struct ParameterAlias {
{ "topk", "top_k" },
{ "reg_alpha", "lambda_l1" },
{ "reg_lambda", "lambda_l2" },
{ "num_classes", "num_class" }
{ "num_classes", "num_class" },
{ "unbalanced_sets", "is_unbalance" }
});
std::unordered_map<std::string, std::string> tmp_map;
for (const auto& pair : *params) {
Expand Down
3 changes: 2 additions & 1 deletion include/LightGBM/dataset.h
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,8 @@ class Dataset {

void Construct(
std::vector<std::unique_ptr<BinMapper>>& bin_mappers,
const std::vector<std::vector<int>>& sample_non_zero_indices,
int** sample_non_zero_indices,
const int* num_per_col,
size_t total_sample_cnt,
const IOConfig& io_config);

Expand Down
4 changes: 2 additions & 2 deletions include/LightGBM/dataset_loader.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ class DatasetLoader {

LIGHTGBM_EXPORT Dataset* LoadFromFileAlignWithOtherDataset(const char* filename, const Dataset* train_data);

LIGHTGBM_EXPORT Dataset* CostructFromSampleData(std::vector<std::vector<double>>& sample_values,
std::vector<std::vector<int>>& sample_indices,
LIGHTGBM_EXPORT Dataset* CostructFromSampleData(double** sample_values,
int** sample_indices, int num_col, const int* num_per_col,
size_t total_sample_size, data_size_t num_data);

/*! \brief Disable copy */
Expand Down
18 changes: 18 additions & 0 deletions include/LightGBM/utils/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -424,6 +424,24 @@ inline static double ApproximateHessianWithGaussian(const double y, const double
return w * std::exp(-(x - b) * (x - b) / (2.0 * c * c)) * a / (c * std::sqrt(2 * pi));
}

/*!
* \brief Build a C-style array of pointers, one per inner vector of `data`.
*        Entry i points at the storage of data[i] (data[i].data()); no element is copied.
* \param data Vector of vectors whose buffers should be exposed as T**
* \return Array of data.size() pointers allocated with new[]. Nothing in this
*         function releases it, so the caller must delete[] the returned array.
*         The pointers stay valid only while the inner vectors are alive and
*         are not reallocated.
*/
template <typename T>
inline static T** Vector2Ptr(std::vector<std::vector<T>>& data) {
  T** row_ptrs = new T*[data.size()];
  T** slot = row_ptrs;
  for (auto& row : data) {
    *slot = row.data();
    ++slot;
  }
  return row_ptrs;
}

/*!
* \brief Collect the length of every inner vector of `data`.
* \param data Vector of vectors to measure
* \return Vector with one entry per inner vector; entry i is data[i].size()
*         converted to int with static_cast
*/
template <typename T>
inline static std::vector<int> VectorSize(const std::vector<std::vector<T>>& data) {
  std::vector<int> lengths;
  lengths.reserve(data.size());
  for (const auto& inner : data) {
    lengths.push_back(static_cast<int>(inner.size()));
  }
  return lengths;
}

} // namespace Common

} // namespace LightGBM
Expand Down
160 changes: 33 additions & 127 deletions src/c_api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -310,85 +310,27 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromFile(const char* filename,
API_END();
}

/*!
* \brief Create a dataset from a sampled dense matrix.
*        When the sample covers every row (num_sample_row == num_total_row) the call is
*        forwarded to LGBM_DatasetCreateFromMat and that result is returned directly.
*        Otherwise the sampled rows are regrouped by column, keeping only entries whose
*        absolute value is greater than kEpsilon, and handed to
*        DatasetLoader::CostructFromSampleData.
* \param data pointer to the sampled matrix
* \param data_type type tag of the data pointer
* \param num_sample_row number of sampled rows held in data
* \param ncol number of columns
* \param num_total_row number of rows of the full dataset
* \param parameters additional parameters, parsed with ConfigBase::Str2Map
* \param out receives the created dataset
*/
LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromSampledMat(const void* data,
                                                       int data_type,
                                                       int32_t num_sample_row,
                                                       int32_t ncol,
                                                       int32_t num_total_row,
                                                       const char* parameters,
                                                       DatasetHandle* out) {
  // The "sample" is the complete matrix: build the dataset in one go.
  // The literal 1 is forwarded unchanged (presumably the row-major flag -- confirm against the callee).
  if (num_sample_row == num_total_row) {
    return LGBM_DatasetCreateFromMat(data, data_type, num_total_row, ncol, 1, parameters, nullptr, out);
  }
  API_BEGIN();
  auto parsed_params = ConfigBase::Str2Map(parameters);
  IOConfig io_config;
  io_config.Set(parsed_params);
  auto fetch_row = RowFunctionFromDenseMatric(data, num_sample_row, ncol, data_type, 1);
  // Per-column storage: the kept values and the sampled-row index each one came from.
  std::vector<std::vector<double>> col_values(ncol);
  std::vector<std::vector<int>> col_row_idx(ncol);
  for (int row_id = 0; row_id < num_sample_row; ++row_id) {
    auto row_values = fetch_row(row_id);
    for (size_t col = 0; col < row_values.size(); ++col) {
      // Keep the original "> kEpsilon" comparison so NaN entries are skipped exactly as before.
      if (!(std::fabs(row_values[col]) > kEpsilon)) {
        continue;
      }
      col_values[col].emplace_back(row_values[col]);
      col_row_idx[col].emplace_back(row_id);
    }
  }
  DatasetLoader loader(io_config, nullptr, 1, nullptr);
  *out = loader.CostructFromSampleData(col_values, col_row_idx,
                                       num_sample_row,
                                       static_cast<data_size_t>(num_total_row));
  API_END();
}

LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromSampledCSR(const void* indptr,
int indptr_type,
const int32_t* indices,
const void* data,
int data_type,
int64_t nindptr,
int64_t n_sample_elem,
int64_t num_col,
int64_t num_total_row,
const char* parameters,
DatasetHandle* out) {
if (nindptr - 1 == num_total_row) {
return LGBM_DatasetCreateFromCSR(indptr, indptr_type, indices, data,
data_type, nindptr, n_sample_elem, num_col, parameters, nullptr, out);
} else {
API_BEGIN();
auto param = ConfigBase::Str2Map(parameters);
IOConfig io_config;
io_config.Set(param);
auto get_row_fun = RowFunctionFromCSR(indptr, indptr_type, indices, data, data_type, nindptr, n_sample_elem);
int32_t num_sample_row = static_cast<int32_t>(nindptr - 1);
std::vector<std::vector<double>> sample_values(num_col);
std::vector<std::vector<int>> sample_idx(num_col);
for (int i = 0; i < num_sample_row; ++i) {
auto row = get_row_fun(i);
for (std::pair<int, double>& inner_data : row) {
if (static_cast<size_t>(inner_data.first) >= sample_values.size()) {
sample_values.resize(inner_data.first + 1);
sample_idx.resize(inner_data.first + 1);
}
if (std::fabs(inner_data.second) > kEpsilon) {
sample_values[inner_data.first].emplace_back(inner_data.second);
sample_idx[inner_data.first].emplace_back(i);
}
}
}
CHECK(num_col >= static_cast<int>(sample_values.size()));
DatasetLoader loader(io_config, nullptr, 1, nullptr);
*out = loader.CostructFromSampleData(sample_values, sample_idx,
num_sample_row,
static_cast<data_size_t>(num_total_row));
API_END();
}
/*!
* \brief Create a dataset from column-grouped sample data.
*        Parses `parameters` into an IOConfig, builds a DatasetLoader from it and forwards
*        the sample arrays unchanged to DatasetLoader::CostructFromSampleData; the resulting
*        dataset pointer is written to *out.
* \param sample_data sampled values, one array per column
* \param sample_indices indices of the sampled values, one array per column
* \param ncol number of columns
* \param num_per_col number of sampled entries in each column
* \param num_sample_row number of sampled rows
* \param num_total_row number of rows of the full dataset
* \param parameters additional parameters, parsed with ConfigBase::Str2Map
* \param out receives the created dataset
*/
LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromSampledColumn(double** sample_data,
                                                          int** sample_indices,
                                                          int32_t ncol,
                                                          const int* num_per_col,
                                                          int32_t num_sample_row,
                                                          int32_t num_total_row,
                                                          const char* parameters,
                                                          DatasetHandle* out) {
  API_BEGIN();
  auto parsed_params = ConfigBase::Str2Map(parameters);
  IOConfig config;
  config.Set(parsed_params);
  DatasetLoader sample_loader(config, nullptr, 1, nullptr);
  *out = sample_loader.CostructFromSampleData(
      sample_data, sample_indices, ncol, num_per_col,
      num_sample_row,
      static_cast<data_size_t>(num_total_row));
  API_END();
}


LIGHTGBM_C_EXPORT int LGBM_DatasetCreateByReference(const DatasetHandle reference,
int64_t num_total_row,
DatasetHandle* out) {
Expand Down Expand Up @@ -480,7 +422,11 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromMat(const void* data,
}
}
DatasetLoader loader(io_config, nullptr, 1, nullptr);
ret.reset(loader.CostructFromSampleData(sample_values, sample_idx, sample_cnt, nrow));
ret.reset(loader.CostructFromSampleData(Common::Vector2Ptr<double>(sample_values),
Common::Vector2Ptr<int>(sample_idx),
static_cast<int>(sample_values.size()),
Common::VectorSize<double>(sample_values).data(),
sample_cnt, nrow));
} else {
ret.reset(new Dataset(nrow));
ret->CreateValid(
Expand Down Expand Up @@ -539,7 +485,11 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromCSR(const void* indptr,
}
CHECK(num_col >= static_cast<int>(sample_values.size()));
DatasetLoader loader(io_config, nullptr, 1, nullptr);
ret.reset(loader.CostructFromSampleData(sample_values, sample_idx, sample_cnt, nrow));
ret.reset(loader.CostructFromSampleData(Common::Vector2Ptr<double>(sample_values),
Common::Vector2Ptr<int>(sample_idx),
static_cast<int>(sample_values.size()),
Common::VectorSize<double>(sample_values).data(),
sample_cnt, nrow));
} else {
ret.reset(new Dataset(nrow));
ret->CreateValid(
Expand Down Expand Up @@ -593,7 +543,11 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromCSC(const void* col_ptr,
}
}
DatasetLoader loader(io_config, nullptr, 1, nullptr);
ret.reset(loader.CostructFromSampleData(sample_values, sample_idx, sample_cnt, nrow));
ret.reset(loader.CostructFromSampleData(Common::Vector2Ptr<double>(sample_values),
Common::Vector2Ptr<int>(sample_idx),
static_cast<int>(sample_values.size()),
Common::VectorSize<double>(sample_values).data(),
sample_cnt, nrow));
} else {
ret.reset(new Dataset(nrow));
ret->CreateValid(
Expand Down Expand Up @@ -1123,54 +1077,6 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterSetLeafValue(BoosterHandle handle,
API_END();
}


// Allocates an array of `len` elements with new[] and stores the pointer in *out.
// The element type (float, double, int32_t or int64_t) is selected by `type`.
// The elements are not explicitly initialized.
// NOTE: if `type` matches none of the four dtype constants, *out is left untouched
// and no error is raised inside this body.
// NOTE(review): the return value comes from the API_BEGIN/API_END macros -- presumably
// 0 on success and -1 when an exception is caught; confirm against the macro definition.
LIGHTGBM_C_EXPORT int LGBM_AllocateArray(int64_t len, int type, ArrayHandle* out) {
API_BEGIN();
if (type == C_API_DTYPE_FLOAT32) {
*out = new float[len];
} else if (type == C_API_DTYPE_FLOAT64) {
*out = new double[len];
} else if (type == C_API_DTYPE_INT32) {
*out = new int32_t[len];
} else if (type == C_API_DTYPE_INT64) {
*out = new int64_t[len];
}
API_END();
}

/*!
* \brief Copy `len` elements of type T from `src` to `dst` with a raw byte copy (std::memcpy).
* \param dst destination buffer
* \param src source buffer
* \param len number of elements (not bytes) to copy
*/
template<typename T>
void Copy(T* dst, const T* src, int64_t len) {
  const auto byte_count = sizeof(T) * len;
  std::memcpy(dst, src, byte_count);
}

// Copies `len` elements from `src` into the array `arr`, starting at element offset
// `start_idx` of `arr`. Both pointers are reinterpreted as the element type selected
// by `type` and the copy itself is done by Copy<T>.
// No bounds checking is performed here: the caller must ensure `arr` has room for
// start_idx + len elements and that `src` holds at least `len` elements.
// NOTE: if `type` matches none of the four dtype constants, nothing is copied
// and no error is raised inside this body.
LIGHTGBM_C_EXPORT int LGBM_CopyToArray(ArrayHandle arr, int type, int64_t start_idx, const void* src, int64_t len) {
API_BEGIN();
if (type == C_API_DTYPE_FLOAT32) {
Copy<float>(static_cast<float*>(arr) + start_idx, static_cast<const float*>(src), len);
} else if (type == C_API_DTYPE_FLOAT64) {
Copy<double>(static_cast<double*>(arr) + start_idx, static_cast<const double*>(src), len);
} else if (type == C_API_DTYPE_INT32) {
Copy<int32_t>(static_cast<int32_t*>(arr) + start_idx, static_cast<const int32_t*>(src), len);
} else if (type == C_API_DTYPE_INT64) {
Copy<int64_t>(static_cast<int64_t*>(arr) + start_idx, static_cast<const int64_t*>(src), len);
}
API_END();
}

// Releases the array `arr` with delete[] after casting it to the element type
// selected by `type`.
// NOTE(review): `type` presumably has to be the same dtype the array was allocated
// with (e.g. via LGBM_AllocateArray); this body does not verify it.
// NOTE: if `type` matches none of the four dtype constants, nothing is freed
// and no error is raised inside this body.
LIGHTGBM_C_EXPORT int LGBM_FreeArray(ArrayHandle arr, int type) {
API_BEGIN();
if (type == C_API_DTYPE_FLOAT32) {
delete[] static_cast<float*>(arr);
} else if (type == C_API_DTYPE_FLOAT64) {
delete[] static_cast<double*>(arr);
} else if (type == C_API_DTYPE_INT32) {
delete[] static_cast<int32_t*>(arr);
} else if (type == C_API_DTYPE_INT64) {
delete[] static_cast<int64_t*>(arr);
}
API_END();
}

// ---- start of some help functions

std::function<std::vector<double>(int row_idx)>
Expand Down

0 comments on commit c060ca7

Please sign in to comment.