diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index 8d4d745af63..c295cb8d3d1 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -1062,6 +1062,14 @@ class Dataset { void CreateCUDAColumnData(); + /*! \brief Create differential features for pairwise lambdarank + * \param sample_values sampled values from the file + * \param sample_indices sampled data indices from the file + * \param bin_mappers bin mappers of the original features + * \param filter_cnt filter count for bin finding + * \param num_total_sample_data number of all sampled data + * \param differential_feature_bin_mappers output differential feature bin mappers + */ void CreatePairwiseRankingDifferentialFeatures( const std::vector>& sample_values, const std::vector>& sample_indices, diff --git a/include/LightGBM/dataset_loader.h b/include/LightGBM/dataset_loader.h index 68fba1307bf..73b8e7bfd07 100644 --- a/include/LightGBM/dataset_loader.h +++ b/include/LightGBM/dataset_loader.h @@ -82,22 +82,6 @@ class DatasetLoader { */ void CheckCategoricalFeatureNumBin(const std::vector>& bin_mappers, const int max_bin, const std::vector& max_bin_by_feature) const; - /*! \brief Create differential features for pairwise lambdarank - * \param sample_values sampled values from the file - * \param sample_indices sampled data indices from the file - * \param bin_mappers bin mappers of the original features - * \param filter_cnt filter count for bin finding - * \param num_total_sample_data number of all sampled data - * \param differential_feature_bin_mappers output differential feature bin mapppers - */ - void CreatePairwiseRankingDifferentialFeatures( - const std::vector>& sample_values, - const std::vector>& sample_indices, - const std::vector>& bin_mappers, - const data_size_t filter_cnt, - const data_size_t num_total_sample_data, - std::vector>* differential_feature_bin_mappers) const; - const Config& config_; /*! 
\brief Random generator*/ Random random_; diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index c99eadb82a3..13ba478949a 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -888,12 +888,28 @@ void Dataset::CreatePairWiseRankingData(const Dataset* dataset, const bool is_va CreatePairwiseRankingDifferentialFeatures(sampled_values_, sampled_indices_, original_bin_mappers, num_total_sampled_data_, &diff_feature_bin_mappers, &diff_original_feature_index, config); + used_feature_map_.clear(); + used_feature_map_.reserve(2 * dataset->used_feature_map_.size()); + used_feature_map_.insert(used_feature_map_.begin(), dataset->used_feature_map_.begin(), dataset->used_feature_map_.end()); + + for (int i = 0; i < dataset->num_total_features_; ++i) { + if (dataset->used_feature_map_[i] != -1) { + used_feature_map_.push_back(dataset->used_feature_map_[i] + dataset->num_features_); + } else { + used_feature_map_.push_back(-1); + } + } + std::vector used_diff_features; for (int diff_feature_index = 0; diff_feature_index < static_cast(diff_feature_bin_mappers.size()); ++diff_feature_index) { if (!diff_feature_bin_mappers[diff_feature_index]->is_trivial()) { + used_feature_map_.push_back(num_features_); + numeric_feature_map_.push_back(num_features_); num_numeric_features_ += 1; num_features_ += 1; used_diff_features.push_back(diff_feature_index); + } else { + used_feature_map_.push_back(-1); } } @@ -949,23 +965,14 @@ void Dataset::CreatePairWiseRankingData(const Dataset* dataset, const bool is_va feature_groups_.emplace_back(new PairwiseRankingDifferentialFeatureGroup(feature_group, dataset->num_data(), 2, metadata_.paired_ranking_item_index_map_size(), metadata_.paired_ranking_item_global_index_map(), diff_bin_mappers, ori_bin_mappers)); group_feature_cnt_.push_back(cur_feature_index - group_feature_start_.back()); + num_total_bin += feature_groups_.back()->num_total_bin_; + group_bin_boundaries_.push_back(num_total_bin); } + num_groups_ += 
static_cast(diff_feature_groups.size()); feature_groups_.shrink_to_fit(); - used_feature_map_.clear(); - used_feature_map_.reserve(2 * dataset->used_feature_map_.size()); - used_feature_map_.insert(used_feature_map_.begin(), dataset->used_feature_map_.begin(), dataset->used_feature_map_.end()); - - for (int i = 0; i < dataset->num_total_features_; ++i) { - if (dataset->used_feature_map_[i] != -1) { - used_feature_map_.push_back(dataset->used_feature_map_[i] + dataset->num_features_); - } else { - used_feature_map_.push_back(-1); - } - } - feature_names_.clear(); for (const std::string& feature_name : dataset->feature_names_) { feature_names_.push_back(feature_name + std::string("_i")); @@ -973,6 +980,9 @@ void Dataset::CreatePairWiseRankingData(const Dataset* dataset, const bool is_va for (const std::string& feature_name : dataset->feature_names_) { feature_names_.push_back(feature_name + std::string("_j")); } + for (const int real_feature_index : diff_original_feature_index) { + feature_names_.push_back(dataset->feature_names_[real_feature_index] + std::string("_k")); + } real_feature_idx_.clear(); for (const int idx : dataset->real_feature_idx_) { @@ -981,18 +991,19 @@ void Dataset::CreatePairWiseRankingData(const Dataset* dataset, const bool is_va for (const int idx : dataset->real_feature_idx_) { real_feature_idx_.push_back(idx + dataset->num_total_features_); } + for (const auto& features_in_diff_group : diff_feature_groups) { + for (const int idx : features_in_diff_group) { + real_feature_idx_.push_back(idx + 2 * dataset->num_total_features_); + } + } + + num_total_features_ = dataset->num_total_features_ * 2 + static_cast(diff_feature_bin_mappers.size()); forced_bin_bounds_.clear(); - forced_bin_bounds_.reserve(dataset->forced_bin_bounds_.size() * 2); + forced_bin_bounds_.reserve(2 * dataset->num_total_features_); forced_bin_bounds_.insert(forced_bin_bounds_.begin(), dataset->forced_bin_bounds_.begin(), dataset->forced_bin_bounds_.end()); 
forced_bin_bounds_.insert(forced_bin_bounds_.begin() + dataset->forced_bin_bounds_.size(), dataset->forced_bin_bounds_.begin(), dataset->forced_bin_bounds_.end()); - - num_total_features_ = dataset->num_total_features_ * 2; - for (const auto& bin_mapper_ref : diff_feature_bin_mappers) { - if (!bin_mapper_ref->is_trivial()) { - num_total_features_ += 1; - } - } + forced_bin_bounds_.resize(num_total_features_); label_idx_ = dataset->label_idx_; device_type_ = dataset->device_type_; diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index f7a9311dd47..65ecf38685a 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -1568,69 +1568,4 @@ void DatasetLoader::CheckCategoricalFeatureNumBin( } } -void DatasetLoader::CreatePairwiseRankingDifferentialFeatures( - const std::vector>& sample_values, - const std::vector>& sample_indices, - const std::vector>& bin_mappers, - const data_size_t filter_cnt, - const data_size_t num_total_sample_data, - std::vector>* differential_feature_bin_mappers) const { - const int num_original_features = static_cast(sample_values.size()); - std::vector numerical_feature_indices; - for (int i = 0; i < num_original_features; ++i) { - if (bin_mappers[i] != nullptr && bin_mappers[i]->bin_type() == BinType::NumericalBin) { - numerical_feature_indices.push_back(i); - } - } - const int num_numerical_features = static_cast(numerical_feature_indices.size()); - std::vector> sampled_differential_values(num_numerical_features); - for (int i = 0; i < num_numerical_features; ++i) { - differential_feature_bin_mappers->push_back(nullptr); - } - const int num_threads = OMP_NUM_THREADS(); - #pragma omp parallel for schedule(static) num_threads(num_threads) - for (int i = 0; i < num_numerical_features; ++i) { - const int feature_index = numerical_feature_indices[i]; - const data_size_t num_samples_for_feature = static_cast(sample_values[feature_index].size()); - if (config_.zero_as_missing) { - for (int j = 0; j < 
num_samples_for_feature; ++j) { - const double value = sample_values[feature_index][j]; - for (int k = j + 1; k < num_samples_for_feature; ++k) { - const double diff_value = value - sample_values[feature_index][k]; - sampled_differential_values[i].push_back(diff_value); - } - } - } else { - CHECK_GT(sample_indices[feature_index].size(), 0); - int cur_pos_j = 0; - for (int j = 0; j < sample_indices[feature_index].back() + 1; ++j) { - double value_j = 0.0; - if (j == sample_indices[feature_index][cur_pos_j]) { - value_j = sample_values[feature_index][cur_pos_j]; - ++cur_pos_j; - } - int cur_pos_k = 0; - for (int k = 0; k < sample_indices[feature_index].back() + 1; ++k) { - double value_k = 0.0; - if (k == sample_indices[feature_index][cur_pos_k]) { - value_k = sample_values[feature_index][cur_pos_k]; - ++cur_pos_k; - } - const double diff_value = value_j - value_k; - sampled_differential_values[i].push_back(diff_value); - } - } - } - differential_feature_bin_mappers->operator[](i).reset(new BinMapper()); - std::vector forced_upper_bounds; - differential_feature_bin_mappers->operator[](i)->FindBin( - sampled_differential_values[i].data(), - static_cast(sampled_differential_values[i].size()), - static_cast(num_total_sample_data * (num_total_sample_data) / 2), - config_.max_bin, config_.min_data_in_bin, filter_cnt, config_.feature_pre_filter, - BinType::NumericalBin, config_.use_missing, config_.zero_as_missing, forced_upper_bounds - ); - } -} - } // namespace LightGBM diff --git a/src/io/pairwise_lambdarank_bin.cpp b/src/io/pairwise_lambdarank_bin.cpp index bade0372444..b025d7f6341 100644 --- a/src/io/pairwise_lambdarank_bin.cpp +++ b/src/io/pairwise_lambdarank_bin.cpp @@ -6,8 +6,31 @@ #include "pairwise_lambdarank_bin.hpp" +#include + namespace LightGBM { +template +uint32_t DensePairwiseRankingDiffBin::GetBinAt(const data_size_t paired_data_index) const { + const data_size_t first_data_index = this->paired_ranking_item_index_map_[paired_data_index].first; + const
data_size_t second_data_index = this->paired_ranking_item_index_map_[paired_data_index].second; + const uint32_t first_bin = static_cast(this->unpaired_bin_->data(first_data_index)); + const uint32_t second_bin = static_cast(this->unpaired_bin_->data(second_data_index)); + int first_feature_index = static_cast(std::upper_bound(bin_offsets_->begin(), bin_offsets_->end(), first_bin) - bin_offsets_->begin()) - 1; + int second_feature_index = static_cast(std::upper_bound(bin_offsets_->begin(), bin_offsets_->end(), second_bin) - bin_offsets_->begin()) - 1; + // TODO(shiyu1994): better original value, handle nan as missing + const double first_value = first_feature_index >= 0 ? ori_bin_mappers_->at(first_feature_index)->BinToValue(first_bin) : 0.0; + const double second_value = second_feature_index >= 0 ? ori_bin_mappers_->at(second_feature_index)->BinToValue(second_bin) : 0.0; + const double diff_value = first_value - second_value; + const uint32_t diff_bin = first_feature_index >= 0 ? diff_bin_mappers_->at(first_feature_index)->ValueToBin(diff_value) : 0; // no sub-feature owns first_bin, so there is no mapper to index; at(-1) would throw + return diff_bin; +} + +template uint32_t DensePairwiseRankingDiffBin::GetBinAt(const data_size_t paired_data_index) const; +template uint32_t DensePairwiseRankingDiffBin::GetBinAt(const data_size_t paired_data_index) const; +template uint32_t DensePairwiseRankingDiffBin::GetBinAt(const data_size_t paired_data_index) const; +template uint32_t DensePairwiseRankingDiffBin::GetBinAt(const data_size_t paired_data_index) const; + template class ITERATOR_TYPE> void PairwiseRankingBin::InitStreaming(uint32_t num_thread, int32_t omp_max_threads) { unpaired_bin_->InitStreaming(num_thread, omp_max_threads); @@ -60,22 +83,12 @@ void DensePairwiseRankingBin::ConstructHistogramI hist_t* grad = out; hist_t* hess = out + 1; hist_cnt_t* cnt = reinterpret_cast(hess); - const VAL_T* base_data_ptr = reinterpret_cast(this->unpaired_bin_->get_data()); if 
(USE_PREFETCH) { const data_size_t pf_offset = 64 / sizeof(VAL_T); const data_size_t pf_end = end - pf_offset; for
(; i < pf_end; ++i) { const auto paired_idx = USE_INDICES ? data_indices[i] : i; - const auto idx = this->get_unpaired_index(paired_idx); - const auto paired_pf_idx = - USE_INDICES ? data_indices[i + pf_offset] : i + pf_offset; - const auto pf_idx = this->get_unpaired_index(paired_pf_idx); - if (IS_4BIT) { - PREFETCH_T0(base_data_ptr + (pf_idx >> 1)); - } else { - PREFETCH_T0(base_data_ptr + pf_idx); - } - const auto ti = static_cast(this->unpaired_bin_->data(idx)) << 1; + const auto ti = GetBinAt(paired_idx) << 1; if (USE_HESSIAN) { grad[ti] += ordered_gradients[i]; hess[ti] += ordered_hessians[i]; @@ -87,8 +100,7 @@ void DensePairwiseRankingBin::ConstructHistogramI } for (; i < end; ++i) { const auto paired_idx = USE_INDICES ? data_indices[i] : i; - const auto idx = this->get_unpaired_index(paired_idx); - const auto ti = static_cast(this->unpaired_bin_->data(idx)) << 1; + const auto ti = GetBinAt(paired_idx) << 1; if (USE_HESSIAN) { grad[ti] += ordered_gradients[i]; hess[ti] += ordered_hessians[i]; @@ -109,22 +121,12 @@ void DensePairwiseRankingBin::ConstructHistogramI data_size_t i = start; PACKED_HIST_T* out_ptr = reinterpret_cast(out); const int16_t* gradients_ptr = reinterpret_cast(ordered_gradients); - const VAL_T* data_ptr_base = reinterpret_cast(this->unpaired_bin_->get_data()); if (USE_PREFETCH) { const data_size_t pf_offset = 64 / sizeof(VAL_T); const data_size_t pf_end = end - pf_offset; for (; i < pf_end; ++i) { const auto paired_idx = USE_INDICES ? data_indices[i] : i; - const auto paired_pf_idx = - USE_INDICES ? 
data_indices[i + pf_offset] : i + pf_offset; - const auto idx = this->get_unpaired_index(paired_idx); - const auto pf_idx = this->get_unpaired_index(paired_pf_idx); - if (IS_4BIT) { - PREFETCH_T0(data_ptr_base + (pf_idx >> 1)); - } else { - PREFETCH_T0(data_ptr_base + pf_idx); - } - const auto ti = static_cast(this->unpaired_bin_->data(idx)); + const auto ti = GetBinAt(paired_idx); // raw bin: this variant indexes packed histogram entries const int16_t gradient_16 = gradients_ptr[i]; if (USE_HESSIAN) { const PACKED_HIST_T gradient_packed = HIST_BITS == 8 ? gradient_16 : @@ -139,8 +141,7 @@ void DensePairwiseRankingBin::ConstructHistogramI } for (; i < end; ++i) { const auto paired_idx = USE_INDICES ? data_indices[i] : i; - const auto idx = this->get_unpaired_index(paired_idx); - const auto ti = static_cast(this->unpaired_bin_->data(idx)); + const auto ti = GetBinAt(paired_idx); // raw bin: this variant indexes packed histogram entries const int16_t gradient_16 = gradients_ptr[i]; if (USE_HESSIAN) { const PACKED_HIST_T gradient_packed = HIST_BITS == 8 ? gradient_16 : @@ -341,8 +342,7 @@ data_size_t DensePairwiseRankingBin::SplitInner(u if (min_bin < max_bin) { for (data_size_t i = 0; i < cnt; ++i) { const data_size_t paired_idx = data_indices[i]; - const data_size_t idx = this->get_unpaired_index(paired_idx); - const auto bin = this->unpaired_bin_->data(idx); + const auto bin = GetBinAt(paired_idx); if ((MISS_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) || (MISS_IS_NA && !MFB_IS_NA && bin == maxb)) { missing_default_indices[(*missing_default_count)++] = paired_idx; @@ -368,8 +368,7 @@ data_size_t DensePairwiseRankingBin::SplitInner(u } for (data_size_t i = 0; i < cnt; ++i) { const data_size_t paired_idx = data_indices[i]; - const data_size_t idx = this->get_unpaired_index(paired_idx); - const auto bin = this->unpaired_bin_->data(idx); + const auto bin = GetBinAt(paired_idx); if (MISS_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) { 
missing_default_indices[(*missing_default_count)++] = paired_idx; } else if (bin != maxb) { diff --git a/src/io/pairwise_lambdarank_bin.hpp
b/src/io/pairwise_lambdarank_bin.hpp index 311a25b588e..db9a8f44f48 100644 --- a/src/io/pairwise_lambdarank_bin.hpp +++ b/src/io/pairwise_lambdarank_bin.hpp @@ -453,7 +453,7 @@ class DensePairwiseRankingBin: public PairwiseRankingBin void ConstructHistogramInner(const data_size_t* data_indices, data_size_t start, data_size_t end, @@ -475,6 +475,11 @@ class DensePairwiseRankingBin: public PairwiseRankingBinget_unpaired_index(paired_data_index); + return this->unpaired_bin_->data(idx); + } }; template class ITERATOR_TYPE> @@ -527,7 +532,7 @@ class DensePairwiseRankingDiffBin: public DensePairwiseRankingBin(bin_offsets_->size()); ++i) { if (bin_offsets_->at(i) == min_bin) { CHECK_GT(i, 0); - sub_feature_index = i - 1; + sub_feature_index = i; break; } } @@ -540,6 +545,8 @@ class DensePairwiseRankingDiffBin: public DensePairwiseRankingBin* bin_offsets_; const std::vector>* diff_bin_mappers_; const std::vector>* ori_bin_mappers_; @@ -589,7 +596,7 @@ class SparsePairwiseRankingDiffBin: public SparsePairwiseRankingBin(bin_offsets_->size()); ++i) { if (bin_offsets_->at(i) == min_bin) { CHECK_GT(i, 0); - sub_feature_index = i - 1; + sub_feature_index = i; break; } }