avoid nan and inf in weight/label/init_score (#2377)
* avoid nan and inf in weight/label/init_score

* use prefix increment
guolinke authored and StrikerRUS committed Sep 7, 2019
1 parent de1f3cb commit 33d0378
Showing 1 changed file with 39 additions and 43 deletions.
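The fixes below funnel every user-supplied weight, label, and init score through Common::AvoidInf. The real helper lives in include/LightGBM/utils/common.h; the sketch below is only a plausible reading of its behavior, and the exact clamp thresholds are an assumption, not copied from the repository.

#include <cmath>

// Hypothetical sketch of the sanitizer this commit relies on: NaN is
// mapped to 0 and infinities are clamped to a large finite value.
inline double AvoidInf(double x) {
  if (std::isnan(x)) {
    return 0.0;
  } else if (x >= 1e300) {
    return 1e300;
  } else if (x <= -1e300) {
    return -1e300;
  } else {
    return x;
  }
}

// e.g. AvoidInf(std::numeric_limits<double>::infinity()) -> 1e300
//      AvoidInf(std::nan(""))                            -> 0.0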
82 changes: 39 additions & 43 deletions src/io/metadata.cpp
@@ -20,7 +20,7 @@ Metadata::Metadata() {
   init_score_load_from_file_ = false;
 }
 
-void Metadata::Init(const char * data_filename, const char* initscore_file) {
+void Metadata::Init(const char* data_filename, const char* initscore_file) {
   data_filename_ = data_filename;
   // for lambdarank, it needs query data for partition data in parallel learning
   LoadQueryBoundaries();
@@ -40,12 +40,8 @@ void Metadata::Init(data_size_t num_data, int weight_idx, int query_idx) {
       Log::Info("Using weights in data file, ignoring the additional weights file");
       weights_.clear();
     }
-    weights_ = std::vector<label_t>(num_data_);
+    weights_ = std::vector<label_t>(num_data_, 0.0f);
     num_weights_ = num_data_;
-#pragma omp parallel for schedule(static)
-    for (data_size_t i = 0; i < num_weights_; ++i) {
-      weights_[i] = 0.0f;
-    }
     weight_load_from_file_ = false;
   }
   if (query_idx >= 0) {
@@ -54,11 +50,7 @@ void Metadata::Init(data_size_t num_data, int weight_idx, int query_idx) {
       query_boundaries_.clear();
     }
     if (!query_weights_.empty()) { query_weights_.clear(); }
-    queries_ = std::vector<data_size_t>(num_data_);
-#pragma omp parallel for schedule(static)
-    for (data_size_t i = 0; i < num_data_; ++i) {
-      queries_[i] = 0;
-    }
+    queries_ = std::vector<data_size_t>(num_data_, 0);
     query_load_from_file_ = false;
   }
 }
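Both hunks above replace a hand-rolled OpenMP zero-fill loop with the std::vector fill constructor, which allocates and value-initializes in a single call. A minimal standalone sketch of the equivalence:

#include <cassert>
#include <vector>

int main() {
  const int num_data = 8;
  // One constructor call replaces allocate-then-parallel-zero:
  std::vector<float> weights(num_data, 0.0f);
  for (float w : weights) assert(w == 0.0f);
  return 0;
}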
@@ -67,16 +59,16 @@ void Metadata::Init(const Metadata& fullset, const data_size_t* used_indices, da
   num_data_ = num_used_indices;
 
   label_ = std::vector<label_t>(num_used_indices);
-#pragma omp parallel for schedule(static)
-  for (data_size_t i = 0; i < num_used_indices; i++) {
+  #pragma omp parallel for schedule(static)
+  for (data_size_t i = 0; i < num_used_indices; ++i) {
     label_[i] = fullset.label_[used_indices[i]];
   }
 
   if (!fullset.weights_.empty()) {
     weights_ = std::vector<label_t>(num_used_indices);
     num_weights_ = num_used_indices;
-#pragma omp parallel for schedule(static)
-    for (data_size_t i = 0; i < num_used_indices; i++) {
+    #pragma omp parallel for schedule(static)
+    for (data_size_t i = 0; i < num_used_indices; ++i) {
       weights_[i] = fullset.weights_[used_indices[i]];
     }
   } else {
@@ -85,12 +77,14 @@
 
   if (!fullset.init_score_.empty()) {
     int num_class = static_cast<int>(fullset.num_init_score_ / fullset.num_data_);
-    init_score_ = std::vector<double>(num_used_indices*num_class);
+    init_score_ = std::vector<double>(static_cast<size_t>(num_used_indices) * num_class);
     num_init_score_ = static_cast<int64_t>(num_used_indices) * num_class;
-#pragma omp parallel for schedule(static)
+    #pragma omp parallel for schedule(static)
     for (int k = 0; k < num_class; ++k) {
-      for (data_size_t i = 0; i < num_used_indices; i++) {
-        init_score_[k*num_data_ + i] = fullset.init_score_[k* fullset.num_data_ + used_indices[i]];
+      const size_t offset_dest = static_cast<size_t>(k) * num_data_;
+      const size_t offset_src = static_cast<size_t>(k) * fullset.num_data_;
+      for (data_size_t i = 0; i < num_used_indices; ++i) {
+        init_score_[offset_dest + i] = fullset.init_score_[offset_src + used_indices[i]];
       }
     }
   } else {
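The static_cast<size_t> in the hunk above matters because data_size_t is a 32-bit integer: the product num_used_indices * num_class would otherwise be computed in 32 bits and could overflow before being used as a vector size or element offset. A small standalone illustration (the row and class counts are made up):

#include <cstdint>
#include <cstdio>

int main() {
  const int32_t num_used_indices = 300000000;  // 3e8 rows (hypothetical)
  const int32_t num_class = 10;
  // 3e9 does not fit in 32 bits; multiplying two int32_t values here
  // would overflow. Widening one operand first does the multiply in
  // 64 bits, so the result is correct:
  const size_t total = static_cast<size_t>(num_used_indices) * num_class;
  std::printf("elements needed: %zu\n", total);  // 3000000000
  return 0;
}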
@@ -137,7 +131,7 @@ void Metadata::PartitionLabel(const std::vector<data_size_t>& used_indices) {
   auto old_label = label_;
   num_data_ = static_cast<data_size_t>(used_indices.size());
   label_ = std::vector<label_t>(num_data_);
-#pragma omp parallel for schedule(static)
+  #pragma omp parallel for schedule(static)
   for (data_size_t i = 0; i < num_data_; ++i) {
     label_[i] = old_label[used_indices[i]];
   }
@@ -208,7 +202,7 @@ void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector<data
     auto old_weights = weights_;
     num_weights_ = num_data_;
     weights_ = std::vector<label_t>(num_data_);
-#pragma omp parallel for schedule(static)
+    #pragma omp parallel for schedule(static)
     for (int i = 0; i < static_cast<int>(used_data_indices.size()); ++i) {
       weights_[i] = old_weights[used_data_indices[i]];
     }
@@ -269,10 +263,12 @@ void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector<data
     int num_class = static_cast<int>(num_init_score_ / num_all_data);
     num_init_score_ = static_cast<int64_t>(num_data_) * num_class;
     init_score_ = std::vector<double>(num_init_score_);
-#pragma omp parallel for schedule(static)
+    #pragma omp parallel for schedule(static)
     for (int k = 0; k < num_class; ++k) {
+      const size_t offset_dest = static_cast<size_t>(k) * num_data_;
+      const size_t offset_src = static_cast<size_t>(k) * num_all_data;
       for (size_t i = 0; i < used_data_indices.size(); ++i) {
-        init_score_[k * num_data_ + i] = old_scores[k * num_all_data + used_data_indices[i]];
+        init_score_[offset_dest + i] = old_scores[offset_src + used_data_indices[i]];
       }
     }
     old_scores.clear();
@@ -297,9 +293,9 @@ void Metadata::SetInitScore(const double* init_score, data_size_t len) {
   if (!init_score_.empty()) { init_score_.clear(); }
   num_init_score_ = len;
   init_score_ = std::vector<double>(len);
-#pragma omp parallel for schedule(static)
+  #pragma omp parallel for schedule(static)
   for (int64_t i = 0; i < num_init_score_; ++i) {
-    init_score_[i] = init_score[i];
+    init_score_[i] = Common::AvoidInf(init_score[i]);
   }
   init_score_load_from_file_ = false;
 }
@@ -314,9 +310,9 @@ void Metadata::SetLabel(const label_t* label, data_size_t len) {
   }
   if (!label_.empty()) { label_.clear(); }
   label_ = std::vector<label_t>(num_data_);
-#pragma omp parallel for schedule(static)
+  #pragma omp parallel for schedule(static)
   for (data_size_t i = 0; i < num_data_; ++i) {
-    label_[i] = label[i];
+    label_[i] = Common::AvoidInf(label[i]);
   }
 }

@@ -334,9 +330,9 @@ void Metadata::SetWeights(const label_t* weights, data_size_t len) {
   if (!weights_.empty()) { weights_.clear(); }
   num_weights_ = num_data_;
   weights_ = std::vector<label_t>(num_weights_);
-#pragma omp parallel for schedule(static)
+  #pragma omp parallel for schedule(static)
   for (data_size_t i = 0; i < num_weights_; ++i) {
-    weights_[i] = weights[i];
+    weights_[i] = Common::AvoidInf(weights[i]);
   }
   LoadQueryWeights();
   weight_load_from_file_ = false;
@@ -351,7 +347,7 @@ void Metadata::SetQuery(const data_size_t* query, data_size_t len) {
     return;
   }
   data_size_t sum = 0;
-#pragma omp parallel for schedule(static) reduction(+:sum)
+  #pragma omp parallel for schedule(static) reduction(+:sum)
   for (data_size_t i = 0; i < len; ++i) {
     sum += query[i];
   }
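The loop above totals the query (group) sizes with an OpenMP reduction; that sum is presumably validated against the number of rows further down. A minimal standalone sketch of the reduction pattern, with made-up sizes (compile with -fopenmp; it also runs correctly serially):

#include <cstdio>
#include <vector>

int main() {
  std::vector<int> query(1000, 5);  // 1000 groups of 5 rows each (hypothetical)
  int sum = 0;
  #pragma omp parallel for schedule(static) reduction(+:sum)
  for (int i = 0; i < static_cast<int>(query.size()); ++i) {
    sum += query[i];  // each thread accumulates privately, then results combine
  }
  std::printf("rows covered by queries: %d\n", sum);  // 5000
  return 0;
}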
@@ -382,11 +378,11 @@ void Metadata::LoadWeights() {
   Log::Info("Loading weights...");
   num_weights_ = static_cast<data_size_t>(reader.Lines().size());
   weights_ = std::vector<label_t>(num_weights_);
-#pragma omp parallel for schedule(static)
+  #pragma omp parallel for schedule(static)
   for (data_size_t i = 0; i < num_weights_; ++i) {
     double tmp_weight = 0.0f;
     Common::Atof(reader.Lines()[i].c_str(), &tmp_weight);
-    weights_[i] = static_cast<label_t>(tmp_weight);
+    weights_[i] = Common::AvoidInf(static_cast<label_t>(tmp_weight));
   }
   weight_load_from_file_ = true;
 }
@@ -413,15 +409,15 @@ void Metadata::LoadInitialScore(const char* initscore_file) {
 
   init_score_ = std::vector<double>(num_init_score_);
   if (num_class == 1) {
-#pragma omp parallel for schedule(static)
+    #pragma omp parallel for schedule(static)
     for (data_size_t i = 0; i < num_line; ++i) {
       double tmp = 0.0f;
       Common::Atof(reader.Lines()[i].c_str(), &tmp);
-      init_score_[i] = static_cast<double>(tmp);
+      init_score_[i] = Common::AvoidInf(static_cast<double>(tmp));
     }
   } else {
     std::vector<std::string> oneline_init_score;
-#pragma omp parallel for schedule(static)
+    #pragma omp parallel for schedule(static)
     for (data_size_t i = 0; i < num_line; ++i) {
       double tmp = 0.0f;
       oneline_init_score = Common::Split(reader.Lines()[i].c_str(), '\t');
@@ -430,7 +426,7 @@ void Metadata::LoadInitialScore(const char* initscore_file) {
       }
       for (int k = 0; k < num_class; ++k) {
        Common::Atof(oneline_init_score[k].c_str(), &tmp);
-        init_score_[k * num_line + i] = static_cast<double>(tmp);
+        init_score_[static_cast<size_t>(k) * num_line + i] = Common::AvoidInf(static_cast<double>(tmp));
      }
    }
  }
@@ -487,21 +483,21 @@ void Metadata::LoadFromMemory(const void* memory) {
 
   if (!label_.empty()) { label_.clear(); }
   label_ = std::vector<label_t>(num_data_);
-  std::memcpy(label_.data(), mem_ptr, sizeof(label_t)*num_data_);
-  mem_ptr += sizeof(label_t)*num_data_;
+  std::memcpy(label_.data(), mem_ptr, sizeof(label_t) * num_data_);
+  mem_ptr += sizeof(label_t) * num_data_;
 
   if (num_weights_ > 0) {
     if (!weights_.empty()) { weights_.clear(); }
     weights_ = std::vector<label_t>(num_weights_);
-    std::memcpy(weights_.data(), mem_ptr, sizeof(label_t)*num_weights_);
-    mem_ptr += sizeof(label_t)*num_weights_;
+    std::memcpy(weights_.data(), mem_ptr, sizeof(label_t) * num_weights_);
+    mem_ptr += sizeof(label_t) * num_weights_;
     weight_load_from_file_ = true;
   }
   if (num_queries_ > 0) {
     if (!query_boundaries_.empty()) { query_boundaries_.clear(); }
     query_boundaries_ = std::vector<data_size_t>(num_queries_ + 1);
-    std::memcpy(query_boundaries_.data(), mem_ptr, sizeof(data_size_t)*(num_queries_ + 1));
-    mem_ptr += sizeof(data_size_t)*(num_queries_ + 1);
+    std::memcpy(query_boundaries_.data(), mem_ptr, sizeof(data_size_t) * (num_queries_ + 1));
+    mem_ptr += sizeof(data_size_t) * (num_queries_ + 1);
     query_load_from_file_ = true;
   }
   LoadQueryWeights();
@@ -520,7 +516,7 @@ void Metadata::SaveBinaryToFile(const VirtualFileWriter* writer) const {
   }
 }
 
-size_t Metadata::SizesInByte() const {
+size_t Metadata::SizesInByte() const {
   size_t size = sizeof(num_data_) + sizeof(num_weights_)
     + sizeof(num_queries_);
   size += sizeof(label_t) * num_data_;
