goss.hpp
/*!
* Copyright (c) 2017 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
*/
#ifndef LIGHTGBM_BOOSTING_GOSS_H_
#define LIGHTGBM_BOOSTING_GOSS_H_
#include <LightGBM/boosting.h>
#include <LightGBM/utils/array_args.h>
#include <LightGBM/utils/log.h>
#include <LightGBM/utils/openmp_wrapper.h>
#include <string>
#include <algorithm>
#include <chrono>
#include <cstdio>
#include <fstream>
#include <vector>
#include "gbdt.h"
#include "score_updater.hpp"
namespace LightGBM {
#ifdef TIMETAG
std::chrono::duration<double, std::milli> subset_time;
std::chrono::duration<double, std::milli> re_init_tree_time;
#endif
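/*!
* \brief GOSS (Gradient-based One-Side Sampling) boosting.
*        Each iteration keeps the top_rate fraction of instances with the
*        largest |gradient * hessian| and randomly samples an other_rate
*        fraction of the rest, scaling the sampled instances' gradients and
*        hessians by (1 - top_rate) / other_rate so the estimated
*        information gain stays approximately unbiased.
*/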
class GOSS: public GBDT {
public:
/*!
* \brief Constructor
*/
GOSS() : GBDT() {
}
~GOSS() {
#ifdef TIMETAG
    Log::Info("GOSS::subset costs %f", subset_time.count() * 1e-3);
    Log::Info("GOSS::re_init_tree costs %f", re_init_tree_time.count() * 1e-3);
#endif
}
void Init(const Config* config, const Dataset* train_data, const ObjectiveFunction* objective_function,
const std::vector<const Metric*>& training_metrics) override {
GBDT::Init(config, train_data, objective_function, training_metrics);
ResetGoss();
}
void ResetTrainingData(const Dataset* train_data, const ObjectiveFunction* objective_function,
const std::vector<const Metric*>& training_metrics) override {
GBDT::ResetTrainingData(train_data, objective_function, training_metrics);
ResetGoss();
}
void ResetConfig(const Config* config) override {
GBDT::ResetConfig(config);
ResetGoss();
}
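  /*!
  * \brief Validates the sampling rates and allocates the per-thread
  *        buffers used by Bagging(). When top_rate + other_rate <= 0.5,
  *        a materialized copy of the sampled rows (tmp_subset_) is used
  *        instead of index-based bagging, since copying the small subset
  *        is cheaper than training on scattered indices.
  */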
void ResetGoss() {
CHECK(config_->top_rate + config_->other_rate <= 1.0f);
CHECK(config_->top_rate > 0.0f && config_->other_rate > 0.0f);
if (config_->bagging_freq > 0 && config_->bagging_fraction != 1.0f) {
Log::Fatal("Cannot use bagging in GOSS");
}
Log::Info("Using GOSS");
bag_data_indices_.resize(num_data_);
tmp_indices_.resize(num_data_);
tmp_indice_right_.resize(num_data_);
offsets_buf_.resize(num_threads_);
left_cnts_buf_.resize(num_threads_);
right_cnts_buf_.resize(num_threads_);
left_write_pos_buf_.resize(num_threads_);
right_write_pos_buf_.resize(num_threads_);
is_use_subset_ = false;
if (config_->top_rate + config_->other_rate <= 0.5) {
auto bag_data_cnt = static_cast<data_size_t>((config_->top_rate + config_->other_rate) * num_data_);
bag_data_cnt = std::max(1, bag_data_cnt);
tmp_subset_.reset(new Dataset(bag_data_cnt));
tmp_subset_->CopyFeatureMapperFrom(train_data_);
is_use_subset_ = true;
}
    // use the full data (no bagging) until Bagging() is first called
bag_data_cnt_ = num_data_;
}
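  /*!
  * \brief Runs GOSS on one contiguous chunk [start, start + cnt):
  *        keeps every instance whose summed |gradient * hessian| reaches
  *        the chunk's top-k threshold, randomly keeps (and re-weights)
  *        some of the rest, writes kept indices to buffer and dropped
  *        indices to buffer_right, and returns the number kept.
  */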
data_size_t BaggingHelper(Random& cur_rand, data_size_t start, data_size_t cnt, data_size_t* buffer, data_size_t* buffer_right) {
if (cnt <= 0) {
return 0;
}
std::vector<score_t> tmp_gradients(cnt, 0.0f);
for (data_size_t i = 0; i < cnt; ++i) {
for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
size_t idx = static_cast<size_t>(cur_tree_id) * num_data_ + start + i;
tmp_gradients[i] += std::fabs(gradients_[idx] * hessians_[idx]);
}
}
    data_size_t top_k = static_cast<data_size_t>(cnt * config_->top_rate);
    data_size_t other_k = static_cast<data_size_t>(cnt * config_->other_rate);
    // clamp so that small chunks still keep at least one instance of each
    // kind (this also guards the division below against other_k == 0)
    top_k = std::max(1, top_k);
    other_k = std::max(1, other_k);
    ArrayArgs<score_t>::ArgMaxAtK(&tmp_gradients, 0, static_cast<int>(tmp_gradients.size()), top_k - 1);
    score_t threshold = tmp_gradients[top_k - 1];
    // weight that keeps the gradient sum of the sampled small-gradient
    // instances unbiased: (cnt - top_k) / other_k per chunk
    score_t multiply = static_cast<score_t>(cnt - top_k) / other_k;
data_size_t cur_left_cnt = 0;
data_size_t cur_right_cnt = 0;
data_size_t big_weight_cnt = 0;
for (data_size_t i = 0; i < cnt; ++i) {
score_t grad = 0.0f;
for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
size_t idx = static_cast<size_t>(cur_tree_id) * num_data_ + start + i;
grad += std::fabs(gradients_[idx] * hessians_[idx]);
}
if (grad >= threshold) {
buffer[cur_left_cnt++] = start + i;
++big_weight_cnt;
} else {
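        // sample the below-threshold instances without replacement so that
        // other_k of them end up kept: rest_need more are still needed out
        // of the rest_all remaining small-gradient candidates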
data_size_t sampled = cur_left_cnt - big_weight_cnt;
data_size_t rest_need = other_k - sampled;
data_size_t rest_all = (cnt - i) - (top_k - big_weight_cnt);
double prob = (rest_need) / static_cast<double>(rest_all);
if (cur_rand.NextFloat() < prob) {
buffer[cur_left_cnt++] = start + i;
for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
size_t idx = static_cast<size_t>(cur_tree_id) * num_data_ + start + i;
gradients_[idx] *= multiply;
hessians_[idx] *= multiply;
}
} else {
buffer_right[cur_right_cnt++] = start + i;
}
}
}
return cur_left_cnt;
}
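  /*!
  * \brief Splits the data into one contiguous chunk per thread, runs
  *        BaggingHelper() on each chunk in parallel, then merges the
  *        per-chunk results into bag_data_indices_ (kept indices first,
  *        dropped indices after) using prefix sums of the chunk counts.
  */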
void Bagging(int iter) override {
bag_data_cnt_ = num_data_;
    // do not subsample for the first 1 / learning_rate iterations
if (iter < static_cast<int>(1.0f / config_->learning_rate)) { return; }
const data_size_t min_inner_size = 100;
data_size_t inner_size = (num_data_ + num_threads_ - 1) / num_threads_;
if (inner_size < min_inner_size) { inner_size = min_inner_size; }
OMP_INIT_EX();
#pragma omp parallel for schedule(static, 1)
for (int i = 0; i < num_threads_; ++i) {
OMP_LOOP_EX_BEGIN();
left_cnts_buf_[i] = 0;
right_cnts_buf_[i] = 0;
data_size_t cur_start = i * inner_size;
if (cur_start > num_data_) { continue; }
data_size_t cur_cnt = inner_size;
if (cur_start + cur_cnt > num_data_) { cur_cnt = num_data_ - cur_start; }
Random cur_rand(config_->bagging_seed + iter * num_threads_ + i);
data_size_t cur_left_count = BaggingHelper(cur_rand, cur_start, cur_cnt,
tmp_indices_.data() + cur_start, tmp_indice_right_.data() + cur_start);
offsets_buf_[i] = cur_start;
left_cnts_buf_[i] = cur_left_count;
right_cnts_buf_[i] = cur_cnt - cur_left_count;
OMP_LOOP_EX_END();
}
OMP_THROW_EX();
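    // exclusive prefix sums over the per-chunk counts give each thread its
    // write offset into bag_data_indices_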
data_size_t left_cnt = 0;
left_write_pos_buf_[0] = 0;
right_write_pos_buf_[0] = 0;
for (int i = 1; i < num_threads_; ++i) {
left_write_pos_buf_[i] = left_write_pos_buf_[i - 1] + left_cnts_buf_[i - 1];
right_write_pos_buf_[i] = right_write_pos_buf_[i - 1] + right_cnts_buf_[i - 1];
}
left_cnt = left_write_pos_buf_[num_threads_ - 1] + left_cnts_buf_[num_threads_ - 1];
#pragma omp parallel for schedule(static, 1)
for (int i = 0; i < num_threads_; ++i) {
OMP_LOOP_EX_BEGIN();
if (left_cnts_buf_[i] > 0) {
std::memcpy(bag_data_indices_.data() + left_write_pos_buf_[i],
tmp_indices_.data() + offsets_buf_[i], left_cnts_buf_[i] * sizeof(data_size_t));
}
if (right_cnts_buf_[i] > 0) {
std::memcpy(bag_data_indices_.data() + left_cnt + right_write_pos_buf_[i],
tmp_indice_right_.data() + offsets_buf_[i], right_cnts_buf_[i] * sizeof(data_size_t));
}
OMP_LOOP_EX_END();
}
OMP_THROW_EX();
bag_data_cnt_ = left_cnt;
// set bagging data to tree learner
if (!is_use_subset_) {
tree_learner_->SetBaggingData(bag_data_indices_.data(), bag_data_cnt_);
} else {
// get subset
#ifdef TIMETAG
auto start_time = std::chrono::steady_clock::now();
#endif
tmp_subset_->ReSize(bag_data_cnt_);
tmp_subset_->CopySubset(train_data_, bag_data_indices_.data(), bag_data_cnt_, false);
#ifdef TIMETAG
subset_time += std::chrono::steady_clock::now() - start_time;
#endif
#ifdef TIMETAG
start_time = std::chrono::steady_clock::now();
#endif
tree_learner_->ResetTrainingData(tmp_subset_.get());
#ifdef TIMETAG
re_init_tree_time += std::chrono::steady_clock::now() - start_time;
#endif
}
}
private:
std::vector<data_size_t> tmp_indice_right_;
};
} // namespace LightGBM
#endif // LIGHTGBM_BOOSTING_GOSS_H_
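
For reference, below is a minimal standalone sketch of the sampling rule that BaggingHelper implements, with the multi-tree gradient accumulation, hessian weighting, and threading stripped away. The function GossSample, its parameters, and the sample values are illustrative only, not part of LightGBM; this is a sketch of the technique under those simplifying assumptions, not the library's API.

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <random>
#include <vector>

// Returns the indices kept by GOSS; scales the gradients of the randomly
// sampled small-gradient instances in place.
std::vector<int> GossSample(std::vector<double>* gradients,
                            double top_rate, double other_rate,
                            unsigned seed) {
  const int n = static_cast<int>(gradients->size());
  const int top_k = std::max(1, static_cast<int>(n * top_rate));
  const int other_k = std::max(1, static_cast<int>(n * other_rate));
  // sort indices by |gradient|, descending
  std::vector<int> order(n);
  for (int i = 0; i < n; ++i) order[i] = i;
  std::sort(order.begin(), order.end(), [&](int a, int b) {
    return std::fabs((*gradients)[a]) > std::fabs((*gradients)[b]);
  });
  // always keep the top_k largest-gradient instances
  std::vector<int> kept(order.begin(), order.begin() + top_k);
  // uniformly sample other_k of the rest and up-weight their gradients so
  // the gradient sum stays unbiased in expectation
  std::mt19937 rng(seed);
  std::shuffle(order.begin() + top_k, order.end(), rng);
  const double multiply = static_cast<double>(n - top_k) / other_k;
  for (int i = top_k; i < top_k + other_k && i < n; ++i) {
    (*gradients)[order[i]] *= multiply;
    kept.push_back(order[i]);
  }
  return kept;
}

int main() {
  std::vector<double> grads = {0.9, -0.1, 0.05, 0.8, -0.02,
                               0.3, 0.01, -0.6, 0.2, 0.04};
  std::vector<int> kept = GossSample(&grads, 0.2, 0.1, 42);
  std::printf("kept %zu of %zu instances\n", kept.size(), grads.size());
  return 0;
}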