Support creating Dataset from list of matrices (#1474)
slon authored and guolinke committed Jul 7, 2018
1 parent 2e93cda commit 5df9584
Showing 4 changed files with 146 additions and 19 deletions.
21 changes: 21 additions & 0 deletions include/LightGBM/c_api.h
@@ -204,6 +204,27 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromMat(const void* data,
const DatasetHandle reference,
DatasetHandle* out);

/*!
* \brief Create dataset from an array of dense matrices
* \param nmat number of matrices
* \param data pointer to an array of data pointers, one per matrix
* \param data_type type of the data pointers, can be C_API_DTYPE_FLOAT32 or C_API_DTYPE_FLOAT64
* \param nrow pointer to an array holding the number of rows of each matrix
* \param ncol number of columns, which must be the same for all matrices
* \param is_row_major 1 for row-major data, 0 for column-major
* \param parameters additional parameters
* \param reference used to align bin mapper with other dataset, nullptr means isn't used
* \param out created dataset
* \return 0 when succeed, -1 when failure happens
*/
LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromMats(int32_t nmat,
const void** data,
int data_type,
int32_t* nrow,
int32_t ncol,
int is_row_major,
const char* parameters,
const DatasetHandle reference,
DatasetHandle* out);
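
A minimal calling sketch, for illustration only (not part of the patch): two row-major float32 chunks of three columns are combined into one dataset. The chunk contents, the empty parameter string, and the helper name make_chunked_dataset are assumptions for the example; error handling is elided.

#include <LightGBM/c_api.h>
#include <cstdint>
#include <vector>

// Sketch: build one Dataset from two float32 chunks (2 rows + 1 row, 3 columns).
int make_chunked_dataset(DatasetHandle* out) {
  std::vector<float> chunk0 = {1.0f, 2.0f, 3.0f,
                               4.0f, 5.0f, 6.0f};  // 2 rows, row-major
  std::vector<float> chunk1 = {7.0f, 8.0f, 9.0f};  // 1 row
  const void* data[] = {chunk0.data(), chunk1.data()};
  int32_t nrow[] = {2, 1};
  return LGBM_DatasetCreateFromMats(2, data, C_API_DTYPE_FLOAT32, nrow,
                                    /*ncol=*/3, /*is_row_major=*/1,
                                    "", /*reference=*/nullptr, out);
}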

/*!
* \brief Create subset of a dataset
* \param handle handle of full dataset
50 changes: 50 additions & 0 deletions python-package/lightgbm/basic.py
@@ -712,6 +712,8 @@ def _lazy_init(self, data, label=None, reference=None,
self.__init_from_csc(data, params_str, ref_dataset)
elif isinstance(data, np.ndarray):
self.__init_from_np2d(data, params_str, ref_dataset)
elif isinstance(data, list) and len(data) > 0 and all(isinstance(x, np.ndarray) for x in data):
self.__init_from_list_np2d(data, params_str, ref_dataset)
else:
try:
csr = scipy.sparse.csr_matrix(data)
@@ -775,6 +777,54 @@ def __init_from_np2d(self, mat, params_str, ref_dataset):
ref_dataset,
ctypes.byref(self.handle)))

def __init_from_list_np2d(self, mats, params_str, ref_dataset):
"""
Initialize data from a list of 2-D numpy matrices.
"""
ncol = mats[0].shape[1]
nrow = np.zeros((len(mats),), np.int32)
if mats[0].dtype == np.float64:
ptr_data = (ctypes.POINTER(ctypes.c_double) * len(mats))()
else:
ptr_data = (ctypes.POINTER(ctypes.c_float) * len(mats))()

holders = []
type_ptr_data = None

for i, mat in enumerate(mats):
if len(mat.shape) != 2:
raise ValueError('Input numpy.ndarray must be 2 dimensional')

if mat.shape[1] != ncol:
raise ValueError('Input arrays must have same number of columns')

nrow[i] = mat.shape[0]

if mat.dtype == np.float32 or mat.dtype == np.float64:
mats[i] = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False)
else:
# change non-float data to float data, need to copy
mats[i] = np.array(mat.reshape(mat.size), dtype=np.float32)

chunk_ptr_data, chunk_type_ptr_data, holder = c_float_array(mats[i])
if type_ptr_data is not None and chunk_type_ptr_data != type_ptr_data:
raise ValueError('Input chunks must have same type')
ptr_data[i] = chunk_ptr_data
type_ptr_data = chunk_type_ptr_data
holders.append(holder)

self.handle = ctypes.c_void_p()
_safe_call(_LIB.LGBM_DatasetCreateFromMats(
ctypes.c_int(len(mats)),
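# the C API declares data as const void**, so the pointee type of this cast is immaterial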
ctypes.cast(ptr_data, ctypes.POINTER(ctypes.POINTER(ctypes.c_double))),
ctypes.c_int(type_ptr_data),
nrow.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
ctypes.c_int(ncol),
ctypes.c_int(C_API_IS_ROW_MAJOR),
c_str(params_str),
ref_dataset,
ctypes.byref(self.handle)))

def __init_from_csr(self, csr, params_str, ref_dataset):
"""
Initialize data from a CSR matrix.
81 changes: 62 additions & 19 deletions src/c_api.cpp
@@ -475,6 +475,27 @@ int LGBM_DatasetCreateFromMat(const void* data,
const char* parameters,
const DatasetHandle reference,
DatasetHandle* out) {
return LGBM_DatasetCreateFromMats(1,
&data,
data_type,
&nrow,
ncol,
is_row_major,
parameters,
reference,
out);
}


int LGBM_DatasetCreateFromMats(int32_t nmat,
const void** data,
int data_type,
int32_t* nrow,
int32_t ncol,
int is_row_major,
const char* parameters,
const DatasetHandle reference,
DatasetHandle* out) {
API_BEGIN();
auto param = Config::Str2Map(parameters);
Config config;
@@ -483,22 +504,39 @@ int LGBM_DatasetCreateFromMat(const void* data,
omp_set_num_threads(config.num_threads);
}
std::unique_ptr<Dataset> ret;
auto get_row_fun = RowFunctionFromDenseMatric(data, nrow, ncol, data_type, is_row_major);
int32_t total_nrow = 0;
for (int j = 0; j < nmat; ++j) {
total_nrow += nrow[j];
}

std::vector<std::function<std::vector<double>(int row_idx)>> get_row_fun;
for (int j = 0; j < nmat; ++j) {
get_row_fun.push_back(RowFunctionFromDenseMatric(data[j], nrow[j], ncol, data_type, is_row_major));
}

if (reference == nullptr) {
// sample data first
Random rand(config.data_random_seed);
int sample_cnt = static_cast<int>(nrow < config.bin_construct_sample_cnt ? nrow : config.bin_construct_sample_cnt);
auto sample_indices = rand.Sample(nrow, sample_cnt);
int sample_cnt = static_cast<int>(total_nrow < config.bin_construct_sample_cnt ? total_nrow : config.bin_construct_sample_cnt);
auto sample_indices = rand.Sample(total_nrow, sample_cnt);
sample_cnt = static_cast<int>(sample_indices.size());
std::vector<std::vector<double>> sample_values(ncol);
std::vector<std::vector<int>> sample_idx(ncol);

int offset = 0;
int j = 0;
for (size_t i = 0; i < sample_indices.size(); ++i) {
auto idx = sample_indices[i];
auto row = get_row_fun(static_cast<int>(idx));
for (size_t j = 0; j < row.size(); ++j) {
if (std::fabs(row[j]) > kZeroThreshold || std::isnan(row[j])) {
sample_values[j].emplace_back(row[j]);
sample_idx[j].emplace_back(static_cast<int>(i));
while ((idx - offset) >= nrow[j]) {
offset += nrow[j];
++j;
}

auto row = get_row_fun[j](static_cast<int>(idx - offset));
for (size_t k = 0; k < row.size(); ++k) {
if (std::fabs(row[k]) > kZeroThreshold || std::isnan(row[k])) {
sample_values[k].emplace_back(row[k]);
sample_idx[k].emplace_back(static_cast<int>(i));
}
}
}
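
The while loop above converts each globally sampled row index idx into a chunk index j and a chunk-local row idx - offset; it relies on sample_indices being sorted ascending (as rand.Sample produces them here), so offset and j only ever advance. A standalone sketch of that bookkeeping with toy values (illustration only):

#include <cstdint>
#include <cstdio>
#include <vector>

// Toy values: chunks of 4 and 3 rows; global indices {1, 5, 6} resolve to
// (chunk 0, row 1), (chunk 1, row 1) and (chunk 1, row 2).
int main() {
  std::vector<int32_t> nrow = {4, 3};
  std::vector<int> sample_indices = {1, 5, 6};  // must be sorted ascending
  int offset = 0, j = 0;
  for (int idx : sample_indices) {
    while (idx - offset >= nrow[j]) {  // idx lies past the end of chunk j
      offset += nrow[j];
      ++j;
    }
    std::printf("global %d -> chunk %d, local row %d\n", idx, j, idx - offset);
  }
  return 0;
}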
@@ -507,22 +545,27 @@ int LGBM_DatasetCreateFromMat(const void* data,
Common::Vector2Ptr<int>(sample_idx).data(),
static_cast<int>(sample_values.size()),
Common::VectorSize<double>(sample_values).data(),
sample_cnt, nrow));
sample_cnt, total_nrow));
} else {
ret.reset(new Dataset(nrow));
ret.reset(new Dataset(total_nrow));
ret->CreateValid(
reinterpret_cast<const Dataset*>(reference));
}
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
for (int i = 0; i < nrow; ++i) {
OMP_LOOP_EX_BEGIN();
const int tid = omp_get_thread_num();
auto one_row = get_row_fun(i);
ret->PushOneRow(tid, i, one_row);
OMP_LOOP_EX_END();
int32_t start_row = 0;
for (int j = 0; j < nmat; ++j) {
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
for (int i = 0; i < nrow[j]; ++i) {
OMP_LOOP_EX_BEGIN();
const int tid = omp_get_thread_num();
auto one_row = get_row_fun[j](i);
ret->PushOneRow(tid, start_row + i, one_row);
OMP_LOOP_EX_END();
}
OMP_THROW_EX();

start_row += nrow[j];
}
OMP_THROW_EX();
ret->FinishLoad();
*out = ret.release();
API_END();
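
The push phase now runs one parallel loop per chunk, with start_row carrying the global offset so that row i of chunk j lands at position start_row + i of the combined dataset. A toy sketch of that index bookkeeping (illustration only; PushOneRow replaced by a print):

#include <cstdint>
#include <cstdio>
#include <vector>

// Toy sketch: chunks of 2 and 3 rows are appended back to back,
// so chunk 1's rows land at global positions 2, 3, 4.
int main() {
  std::vector<int32_t> nrow = {2, 3};
  int32_t start_row = 0;
  for (size_t j = 0; j < nrow.size(); ++j) {
    for (int32_t i = 0; i < nrow[j]; ++i) {
      std::printf("chunk %zu, local row %d -> global row %d\n",
                  j, i, start_row + i);
    }
    start_row += nrow[j];
  }
  return 0;
}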
13 changes: 13 additions & 0 deletions tests/python_package_test/test_basic.py
@@ -60,3 +60,16 @@ def test(self):
for preds in zip(pred_early_stopping, pred_from_matr):
# scores likely to be different, but prediction should still be the same
self.assertEqual(preds[0] > 0, preds[1] > 0)

def test_chunked_dataset(self):
X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(True), test_size=0.1, random_state=2)

chunk_size = X_train.shape[0] // 10 + 1
X_train = [X_train[i * chunk_size:(i + 1) * chunk_size, :] for i in range(X_train.shape[0] // chunk_size + 1)]
X_test = [X_test[i * chunk_size:(i + 1) * chunk_size, :] for i in range(X_test.shape[0] // chunk_size + 1)]

train_data = lgb.Dataset(X_train, label=y_train, params={"bin_construct_sample_cnt": 100})
valid_data = train_data.create_valid(X_test, label=y_test, params={"bin_construct_sample_cnt": 100})

train_data.construct()
valid_data.construct()
