Skip to content

Commit

Permalink
tree learning with differential features
Browse files Browse the repository at this point in the history
  • Loading branch information
shiyu1994 committed Mar 28, 2024
1 parent 3cdfd83 commit 3703495
Show file tree
Hide file tree
Showing 6 changed files with 78 additions and 134 deletions.
8 changes: 8 additions & 0 deletions include/LightGBM/dataset.h
Expand Up @@ -1062,6 +1062,14 @@ class Dataset {

void CreateCUDAColumnData();

/*! \brief Create differential features for pairwise lambdarank
* \param sample_values sampled values from the file
* \param sample_indices sampled data indices from the file
* \param bin_mappers bin mappers of the original features
* \param filter_cnt filter count for bin finding
* \param num_total_sample_data number of all sampled data
* \param differential_feature_bin_mappers output differential feature bin mappers
*/
void CreatePairwiseRankingDifferentialFeatures(
const std::vector<std::vector<double>>& sample_values,
const std::vector<std::vector<int>>& sample_indices,
Expand Down
16 changes: 0 additions & 16 deletions include/LightGBM/dataset_loader.h
Expand Up @@ -82,22 +82,6 @@ class DatasetLoader {
*/
void CheckCategoricalFeatureNumBin(const std::vector<std::unique_ptr<BinMapper>>& bin_mappers, const int max_bin, const std::vector<int>& max_bin_by_feature) const;

/*! \brief Create the bin mappers of the differential features for pairwise lambdarank
*
* One bin mapper is appended to the output for every original feature whose entry in
* bin_mappers is non-null and of numerical bin type; all other features are skipped.
* Each new mapper is fitted on differences between sampled values of that feature.
* \param sample_values sampled values from the file, indexed by original feature
* \param sample_indices sampled data indices from the file, indexed by original feature
* \param bin_mappers bin mappers of the original features
* \param filter_cnt filter count for bin finding
* \param num_total_sample_data number of all sampled data
* \param differential_feature_bin_mappers output differential feature bin mappers
*/
void CreatePairwiseRankingDifferentialFeatures(
const std::vector<std::vector<double>>& sample_values,
const std::vector<std::vector<int>>& sample_indices,
const std::vector<std::unique_ptr<BinMapper>>& bin_mappers,
const data_size_t filter_cnt,
const data_size_t num_total_sample_data,
std::vector<std::unique_ptr<BinMapper>>* differential_feature_bin_mappers) const;

const Config& config_;
/*! \brief Random generator*/
Random random_;
Expand Down
51 changes: 31 additions & 20 deletions src/io/dataset.cpp
Expand Up @@ -888,12 +888,28 @@ void Dataset::CreatePairWiseRankingData(const Dataset* dataset, const bool is_va

CreatePairwiseRankingDifferentialFeatures(sampled_values_, sampled_indices_, original_bin_mappers, num_total_sampled_data_, &diff_feature_bin_mappers, &diff_original_feature_index, config);

used_feature_map_.clear();
used_feature_map_.reserve(2 * dataset->used_feature_map_.size());
used_feature_map_.insert(used_feature_map_.begin(), dataset->used_feature_map_.begin(), dataset->used_feature_map_.end());

for (int i = 0; i < dataset->num_total_features_; ++i) {
if (dataset->used_feature_map_[i] != -1) {
used_feature_map_.push_back(dataset->used_feature_map_[i] + dataset->num_features_);
} else {
used_feature_map_.push_back(-1);
}
}

std::vector<int> used_diff_features;
for (int diff_feature_index = 0; diff_feature_index < static_cast<int>(diff_feature_bin_mappers.size()); ++diff_feature_index) {
if (!diff_feature_bin_mappers[diff_feature_index]->is_trivial()) {
used_feature_map_.push_back(num_features_);
numeric_feature_map_.push_back(num_features_);
num_numeric_features_ += 1;
num_features_ += 1;
used_diff_features.push_back(diff_feature_index);
} else {
used_feature_map_.push_back(-1);
}
}

Expand Down Expand Up @@ -949,30 +965,24 @@ void Dataset::CreatePairWiseRankingData(const Dataset* dataset, const bool is_va
feature_groups_.emplace_back(new PairwiseRankingDifferentialFeatureGroup(feature_group, dataset->num_data(), 2, metadata_.paired_ranking_item_index_map_size(), metadata_.paired_ranking_item_global_index_map(), diff_bin_mappers, ori_bin_mappers));

group_feature_cnt_.push_back(cur_feature_index - group_feature_start_.back());
num_total_bin += feature_groups_.back()->num_total_bin_;
group_bin_boundaries_.push_back(num_total_bin);
}

num_groups_ += static_cast<int>(diff_feature_groups.size());

feature_groups_.shrink_to_fit();

used_feature_map_.clear();
used_feature_map_.reserve(2 * dataset->used_feature_map_.size());
used_feature_map_.insert(used_feature_map_.begin(), dataset->used_feature_map_.begin(), dataset->used_feature_map_.end());

for (int i = 0; i < dataset->num_total_features_; ++i) {
if (dataset->used_feature_map_[i] != -1) {
used_feature_map_.push_back(dataset->used_feature_map_[i] + dataset->num_features_);
} else {
used_feature_map_.push_back(-1);
}
}

feature_names_.clear();
for (const std::string& feature_name : dataset->feature_names_) {
feature_names_.push_back(feature_name + std::string("_i"));
}
for (const std::string& feature_name : dataset->feature_names_) {
feature_names_.push_back(feature_name + std::string("_j"));
}
for (const int real_feature_index : diff_original_feature_index) {
feature_names_.push_back(dataset->feature_names_[real_feature_index] + std::string("_k"));
}

real_feature_idx_.clear();
for (const int idx : dataset->real_feature_idx_) {
Expand All @@ -981,18 +991,19 @@ void Dataset::CreatePairWiseRankingData(const Dataset* dataset, const bool is_va
for (const int idx : dataset->real_feature_idx_) {
real_feature_idx_.push_back(idx + dataset->num_total_features_);
}
for (const auto& features_in_diff_group : diff_feature_groups) {
for (const int idx : features_in_diff_group) {
real_feature_idx_.push_back(idx + 2 * dataset->num_total_features_);
}
}

num_total_features_ = dataset->num_total_features_ * 2 + static_cast<int>(diff_feature_bin_mappers.size());

forced_bin_bounds_.clear();
forced_bin_bounds_.reserve(dataset->forced_bin_bounds_.size() * 2);
forced_bin_bounds_.reserve(2 * dataset->num_total_features_);
forced_bin_bounds_.insert(forced_bin_bounds_.begin(), dataset->forced_bin_bounds_.begin(), dataset->forced_bin_bounds_.end());
forced_bin_bounds_.insert(forced_bin_bounds_.begin() + dataset->forced_bin_bounds_.size(), dataset->forced_bin_bounds_.begin(), dataset->forced_bin_bounds_.end());

num_total_features_ = dataset->num_total_features_ * 2;
for (const auto& bin_mapper_ref : diff_feature_bin_mappers) {
if (!bin_mapper_ref->is_trivial()) {
num_total_features_ += 1;
}
}
forced_bin_bounds_.resize(num_total_features_);

label_idx_ = dataset->label_idx_;
device_type_ = dataset->device_type_;
Expand Down
65 changes: 0 additions & 65 deletions src/io/dataset_loader.cpp
Expand Up @@ -1568,69 +1568,4 @@ void DatasetLoader::CheckCategoricalFeatureNumBin(
}
}

void DatasetLoader::CreatePairwiseRankingDifferentialFeatures(
const std::vector<std::vector<double>>& sample_values,
const std::vector<std::vector<int>>& sample_indices,
const std::vector<std::unique_ptr<BinMapper>>& bin_mappers,
const data_size_t filter_cnt,
const data_size_t num_total_sample_data,
std::vector<std::unique_ptr<BinMapper>>* differential_feature_bin_mappers) const {
const int num_original_features = static_cast<int>(sample_values.size());
std::vector<int> numerical_feature_indices;
for (int i = 0; i < num_original_features; ++i) {
if (bin_mappers[i] != nullptr && bin_mappers[i]->bin_type() == BinType::NumericalBin) {
numerical_feature_indices.push_back(i);
}
}
const int num_numerical_features = static_cast<int>(numerical_feature_indices.size());
std::vector<std::vector<double>> sampled_differential_values(num_numerical_features);
for (int i = 0; i < num_numerical_features; ++i) {
differential_feature_bin_mappers->push_back(nullptr);
}
const int num_threads = OMP_NUM_THREADS();
#pragma omp parallel for schedule(static) num_threads(num_threads)
for (int i = 0; i < num_numerical_features; ++i) {
const int feature_index = numerical_feature_indices[i];
const data_size_t num_samples_for_feature = static_cast<data_size_t>(sample_values[feature_index].size());
if (config_.zero_as_missing) {
for (int j = 0; j < num_samples_for_feature; ++j) {
const double value = sample_values[feature_index][j];
for (int k = j + 1; k < num_samples_for_feature; ++k) {
const double diff_value = value - sample_values[feature_index][k];
sampled_differential_values[i].push_back(diff_value);
}
}
} else {
CHECK_GT(sample_indices[feature_index].size(), 0);
int cur_pos_j = 0;
for (int j = 0; j < sample_indices[feature_index].back() + 1; ++j) {
double value_j = 0.0;
if (j == sample_indices[feature_index][cur_pos_j]) {
value_j = sample_values[feature_index][cur_pos_j];
++cur_pos_j;
}
int cur_pos_k = 0;
for (int k = 0; k < sample_indices[feature_index].back() + 1; ++k) {
double value_k = 0.0;
if (k == sample_indices[feature_index][cur_pos_k]) {
value_k = sample_values[feature_index][cur_pos_k];
++cur_pos_k;
}
const double diff_value = value_j - value_k;
sampled_differential_values[i].push_back(diff_value);
}
}
}
differential_feature_bin_mappers->operator[](i).reset(new BinMapper());
std::vector<double> forced_upper_bounds;
differential_feature_bin_mappers->operator[](i)->FindBin(
sampled_differential_values[i].data(),
static_cast<int>(sampled_differential_values[i].size()),
static_cast<size_t>(num_total_sample_data * (num_total_sample_data) / 2),
config_.max_bin, config_.min_data_in_bin, filter_cnt, config_.feature_pre_filter,
BinType::NumericalBin, config_.use_missing, config_.zero_as_missing, forced_upper_bounds
);
}
}

} // namespace LightGBM
59 changes: 29 additions & 30 deletions src/io/pairwise_lambdarank_bin.cpp
Expand Up @@ -6,8 +6,31 @@

#include "pairwise_lambdarank_bin.hpp"

#include <algorithm>

namespace LightGBM {

/*!
 * \brief Bin of the differential feature for one paired data point.
 *
 * Looks up the two items of the pair, reads their bins from the unpaired bin storage,
 * maps both bins back to values with the original bin mappers, and bins the difference
 * (first minus second) with a differential bin mapper.
 *
 * NOTE(review): the differential mapper is selected with the FIRST item's sub-feature
 * index only, and that lookup is not guarded: when the index is -1 the value lookup
 * falls back to 0.0, but `diff_bin_mappers_->at(-1)` throws std::out_of_range.
 * NOTE(review): BinToValue receives the bin exactly as stored in unpaired_bin_;
 * presumably that bin still includes the sub-feature offset -- confirm.
 *
 * \param paired_data_index index of the pair in the paired ranking item index map
 * \return bin produced by the differential bin mapper for the value difference
 */
template <typename VAL_T, bool IS_4BIT>
uint32_t DensePairwiseRankingDiffBin<VAL_T, IS_4BIT>::GetBinAt(const data_size_t paired_data_index) const {
  const auto& item_pair = this->paired_ranking_item_index_map_[paired_data_index];
  const data_size_t lhs_row = item_pair.first;
  const data_size_t rhs_row = item_pair.second;
  const uint32_t lhs_bin = static_cast<uint32_t>(this->unpaired_bin_->data(lhs_row));
  const uint32_t rhs_bin = static_cast<uint32_t>(this->unpaired_bin_->data(rhs_row));

  // Index of the last offset that is <= bin, or -1 when the bin lies below every offset.
  const auto sub_feature_of = [this](const uint32_t stored_bin) {
    const auto offsets_begin = bin_offsets_->begin();
    return static_cast<int>(std::upper_bound(offsets_begin, bin_offsets_->end(), stored_bin) - offsets_begin) - 1;
  };
  const int lhs_feature = sub_feature_of(lhs_bin);
  const int rhs_feature = sub_feature_of(rhs_bin);

  // TODO(shiyu1994): better original value, handle nan as missing
  double lhs_value = 0.0;
  if (lhs_feature >= 0) {
    lhs_value = ori_bin_mappers_->at(lhs_feature)->BinToValue(lhs_bin);
  }
  double rhs_value = 0.0;
  if (rhs_feature >= 0) {
    rhs_value = ori_bin_mappers_->at(rhs_feature)->BinToValue(rhs_bin);
  }

  return diff_bin_mappers_->at(lhs_feature)->ValueToBin(lhs_value - rhs_value);
}

// Explicit instantiations for the value-type / 4-bit combinations listed below.
template uint32_t DensePairwiseRankingDiffBin<uint8_t, true>::GetBinAt(const data_size_t paired_data_index) const;
template uint32_t DensePairwiseRankingDiffBin<uint8_t, false>::GetBinAt(const data_size_t paired_data_index) const;
template uint32_t DensePairwiseRankingDiffBin<uint16_t, false>::GetBinAt(const data_size_t paired_data_index) const;
template uint32_t DensePairwiseRankingDiffBin<uint32_t, false>::GetBinAt(const data_size_t paired_data_index) const;

template <typename BIN_TYPE, template<typename> class ITERATOR_TYPE>
void PairwiseRankingBin<BIN_TYPE, ITERATOR_TYPE>::InitStreaming(uint32_t num_thread, int32_t omp_max_threads) {
unpaired_bin_->InitStreaming(num_thread, omp_max_threads);
Expand Down Expand Up @@ -60,22 +83,12 @@ void DensePairwiseRankingBin<VAL_T, IS_4BIT, ITERATOR_TYPE>::ConstructHistogramI
hist_t* grad = out;
hist_t* hess = out + 1;
hist_cnt_t* cnt = reinterpret_cast<hist_cnt_t*>(hess);
const VAL_T* base_data_ptr = reinterpret_cast<const VAL_T*>(this->unpaired_bin_->get_data());
if (USE_PREFETCH) {
const data_size_t pf_offset = 64 / sizeof(VAL_T);
const data_size_t pf_end = end - pf_offset;
for (; i < pf_end; ++i) {
const auto paired_idx = USE_INDICES ? data_indices[i] : i;
const auto idx = this->get_unpaired_index(paired_idx);
const auto paired_pf_idx =
USE_INDICES ? data_indices[i + pf_offset] : i + pf_offset;
const auto pf_idx = this->get_unpaired_index(paired_pf_idx);
if (IS_4BIT) {
PREFETCH_T0(base_data_ptr + (pf_idx >> 1));
} else {
PREFETCH_T0(base_data_ptr + pf_idx);
}
const auto ti = static_cast<uint32_t>(this->unpaired_bin_->data(idx)) << 1;
const auto ti = GetBinAt(paired_idx) << 1;
if (USE_HESSIAN) {
grad[ti] += ordered_gradients[i];
hess[ti] += ordered_hessians[i];
Expand All @@ -87,8 +100,7 @@ void DensePairwiseRankingBin<VAL_T, IS_4BIT, ITERATOR_TYPE>::ConstructHistogramI
}
for (; i < end; ++i) {
const auto paired_idx = USE_INDICES ? data_indices[i] : i;
const auto idx = this->get_unpaired_index(paired_idx);
const auto ti = static_cast<uint32_t>(this->unpaired_bin_->data(idx)) << 1;
const auto ti = GetBinAt(paired_idx) << 1;
if (USE_HESSIAN) {
grad[ti] += ordered_gradients[i];
hess[ti] += ordered_hessians[i];
Expand All @@ -109,22 +121,12 @@ void DensePairwiseRankingBin<VAL_T, IS_4BIT, ITERATOR_TYPE>::ConstructHistogramI
data_size_t i = start;
PACKED_HIST_T* out_ptr = reinterpret_cast<PACKED_HIST_T*>(out);
const int16_t* gradients_ptr = reinterpret_cast<const int16_t*>(ordered_gradients);
const VAL_T* data_ptr_base = reinterpret_cast<const VAL_T*>(this->unpaired_bin_->get_data());
if (USE_PREFETCH) {
const data_size_t pf_offset = 64 / sizeof(VAL_T);
const data_size_t pf_end = end - pf_offset;
for (; i < pf_end; ++i) {
const auto paired_idx = USE_INDICES ? data_indices[i] : i;
const auto paired_pf_idx =
USE_INDICES ? data_indices[i + pf_offset] : i + pf_offset;
const auto idx = this->get_unpaired_index(paired_idx);
const auto pf_idx = this->get_unpaired_index(paired_pf_idx);
if (IS_4BIT) {
PREFETCH_T0(data_ptr_base + (pf_idx >> 1));
} else {
PREFETCH_T0(data_ptr_base + pf_idx);
}
const auto ti = static_cast<uint32_t>(this->unpaired_bin_->data(idx));
const auto ti = GetBinAt(paired_idx) << 1;
const int16_t gradient_16 = gradients_ptr[i];
if (USE_HESSIAN) {
const PACKED_HIST_T gradient_packed = HIST_BITS == 8 ? gradient_16 :
Expand All @@ -139,8 +141,7 @@ void DensePairwiseRankingBin<VAL_T, IS_4BIT, ITERATOR_TYPE>::ConstructHistogramI
}
for (; i < end; ++i) {
const auto paired_idx = USE_INDICES ? data_indices[i] : i;
const auto idx = this->get_unpaired_index(paired_idx);
const auto ti = static_cast<uint32_t>(this->unpaired_bin_->data(idx));
const auto ti = GetBinAt(paired_idx) << 1;
const int16_t gradient_16 = gradients_ptr[i];
if (USE_HESSIAN) {
const PACKED_HIST_T gradient_packed = HIST_BITS == 8 ? gradient_16 :
Expand Down Expand Up @@ -341,8 +342,7 @@ data_size_t DensePairwiseRankingBin<VAL_T, IS_4BIT, ITERATOR_TYPE>::SplitInner(u
if (min_bin < max_bin) {
for (data_size_t i = 0; i < cnt; ++i) {
const data_size_t paired_idx = data_indices[i];
const data_size_t idx = this->get_unpaired_index(paired_idx);
const auto bin = this->unpaired_bin_->data(idx);
const auto bin = GetBinAt(paired_idx);
if ((MISS_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) ||
(MISS_IS_NA && !MFB_IS_NA && bin == maxb)) {
missing_default_indices[(*missing_default_count)++] = paired_idx;
Expand All @@ -368,8 +368,7 @@ data_size_t DensePairwiseRankingBin<VAL_T, IS_4BIT, ITERATOR_TYPE>::SplitInner(u
}
for (data_size_t i = 0; i < cnt; ++i) {
const data_size_t paired_idx = data_indices[i];
const data_size_t idx = this->get_unpaired_index(paired_idx);
const auto bin = this->unpaired_bin_->data(idx);
const auto bin = GetBinAt(paired_idx);
if (MISS_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) {
missing_default_indices[(*missing_default_count)++] = paired_idx;
} else if (bin != maxb) {
Expand Down
13 changes: 10 additions & 3 deletions src/io/pairwise_lambdarank_bin.hpp
Expand Up @@ -453,7 +453,7 @@ class DensePairwiseRankingBin: public PairwiseRankingBin<DenseBin<VAL_T, IS_4BIT
data_size_t* lte_indices,
data_size_t* gt_indices) const override;

private:
protected:
template <bool USE_INDICES, bool USE_PREFETCH, bool USE_HESSIAN>
void ConstructHistogramInner(const data_size_t* data_indices,
data_size_t start, data_size_t end,
Expand All @@ -475,6 +475,11 @@ class DensePairwiseRankingBin: public PairwiseRankingBin<DenseBin<VAL_T, IS_4BIT
const data_size_t* data_indices, data_size_t cnt,
data_size_t* lte_indices,
data_size_t* gt_indices) const;

/*!
 * \brief Bin of a paired data point: translate the paired index to its unpaired
 *        index and read that entry from the unpaired bin storage.
 * \param paired_data_index index of the paired data point
 * \return bin stored in unpaired_bin_ for the corresponding unpaired index
 */
virtual inline uint32_t GetBinAt(const data_size_t paired_data_index) const {
  return this->unpaired_bin_->data(this->get_unpaired_index(paired_data_index));
}
};

template <typename VAL_T, template<typename> class ITERATOR_TYPE>
Expand Down Expand Up @@ -527,7 +532,7 @@ class DensePairwiseRankingDiffBin: public DensePairwiseRankingBin<VAL_T, IS_4BIT
for (int i = 0; i < static_cast<int>(bin_offsets_->size()); ++i) {
if (bin_offsets_->at(i) == min_bin) {
CHECK_GT(i, 0);
sub_feature_index = i - 1;
sub_feature_index = i;
break;
}
}
Expand All @@ -540,6 +545,8 @@ class DensePairwiseRankingDiffBin: public DensePairwiseRankingBin<VAL_T, IS_4BIT
Log::Fatal("get_unpaired_index of DensePairwiseRankingDiffBin should not be called.");
}

inline uint32_t GetBinAt(const data_size_t paired_data_index) const override;

const std::vector<uint32_t>* bin_offsets_;
const std::vector<std::unique_ptr<const BinMapper>>* diff_bin_mappers_;
const std::vector<std::unique_ptr<const BinMapper>>* ori_bin_mappers_;
Expand Down Expand Up @@ -589,7 +596,7 @@ class SparsePairwiseRankingDiffBin: public SparsePairwiseRankingBin<VAL_T, Pairw
for (int i = 0; i < static_cast<int>(bin_offsets_->size()); ++i) {
if (bin_offsets_->at(i) == min_bin) {
CHECK_GT(i, 0);
sub_feature_index = i - 1;
sub_feature_index = i;
break;
}
}
Expand Down

0 comments on commit 3703495

Please sign in to comment.