Force checking the label used in lambdarank.
Force checking the label used in lambdarank.
guolinke committed Jul 21, 2017
1 parent 9bb3b0d commit cf93bfa
Showing 6 changed files with 36 additions and 6 deletions.
2 changes: 2 additions & 0 deletions docs/Parameters.md
@@ -38,6 +38,8 @@ The parameter format is `key1=value1 key2=value2 ... ` . And parameters can be s
* `poisson`, [Poisson regression](https://en.wikipedia.org/wiki/Poisson_regression "Poisson regression")
* `binary`, binary classification application
* `lambdarank`, [lambdarank](https://pdfs.semanticscholar.org/fc9a/e09f9ced555558fdf1e997c0a5411fb51f15.pdf) application
* The label should be of type `int` in lambdarank tasks, and a larger number represents higher relevance (e.g. 0: bad, 1: fair, 2: good, 3: perfect).
* `label_gain` can be used to set the gain (weight) of each `int` label.
* `multiclass`, multi-class classification application, should set `num_class` as well
* `boosting`, default=`gbdt`, type=enum, options=`gbdt`,`rf`,`dart`,`goss`, alias=`boost`,`boosting_type`
* `gbdt`, traditional Gradient Boosting Decision Tree
17 changes: 12 additions & 5 deletions docs/Quick-Start.md
@@ -46,11 +46,18 @@ Some important parameters:
* ```task```, default=```train```, type=enum, options=```train```,```prediction```
* ```train``` for training
* ```prediction``` for prediction.
* ```application```, default=```regression```, type=enum, options=```regression```,```binary```,```lambdarank```,```multiclass```, alias=```objective```,```app```
* ```regression```, regression application
* ```binary```, binary classification application
* ```lambdarank```, lambdarank application
* ```multiclass```, multi-class classification application, should set ```num_class``` as well
* `application`, default=`regression`, type=enum, options=`regression`,`regression_l1`,`huber`,`fair`,`poisson`,`binary`,`lambdarank`,`multiclass`, alias=`objective`,`app`
* `regression`, regression application
* `regression_l2`, L2 loss, alias=`mean_squared_error`,`mse`
* `regression_l1`, L1 loss, alias=`mean_absolute_error`,`mae`
* `huber`, [Huber loss](https://en.wikipedia.org/wiki/Huber_loss "Huber loss - Wikipedia")
* `fair`, [Fair loss](https://www.kaggle.com/c/allstate-claims-severity/discussion/24520)
* `poisson`, [Poisson regression](https://en.wikipedia.org/wiki/Poisson_regression "Poisson regression")
* `binary`, binary classification application
* `lambdarank`, [lambdarank](https://pdfs.semanticscholar.org/fc9a/e09f9ced555558fdf1e997c0a5411fb51f15.pdf) application
* The label should be of type `int` in lambdarank tasks, and a larger number represents higher relevance (e.g. 0: bad, 1: fair, 2: good, 3: perfect).
* `label_gain` can be used to set the gain (weight) of each `int` label.
* `multiclass`, multi-class classification application, should set `num_class` as well
* `boosting`, default=`gbdt`, type=enum, options=`gbdt`,`rf`,`dart`,`goss`, alias=`boost`,`boosting_type`
* `gbdt`, traditional Gradient Boosting Decision Tree
* `rf`, Random Forest
7 changes: 7 additions & 0 deletions include/LightGBM/metric.h
@@ -95,6 +95,13 @@ class DCGCalculator {
  static double CalMaxDCGAtK(data_size_t k,
                             const float* label, data_size_t num_data);

  /*!
  * \brief Check the label range for NDCG and lambdarank
  * \param label Pointer to the label array
  * \param num_data Number of data points
  */
  static void CheckLabel(const float* label, data_size_t num_data);

  /*!
  * \brief Calculate the max DCG score at multiple positions
  * \param ks The positions to evaluate at
14 changes: 13 additions & 1 deletion src/metric/dcg_calculator.cpp
@@ -56,7 +56,6 @@ void DCGCalculator::CalMaxDCG(const std::vector<data_size_t>& ks,
  std::vector<data_size_t> label_cnt(label_gain_.size(), 0);
  // counts for all labels
  for (data_size_t i = 0; i < num_data; ++i) {
    if (static_cast<size_t>(label[i]) >= label_cnt.size()) { Log::Fatal("Label excel %d", label[i]); }
    ++label_cnt[static_cast<int>(label[i])];
  }
  double cur_result = 0.0f;
@@ -127,4 +126,17 @@ void DCGCalculator::CalDCG(const std::vector<data_size_t>& ks, const float* labe
}
}

void DCGCalculator::CheckLabel(const float* label, data_size_t num_data) {
  for (data_size_t i = 0; i < num_data; ++i) {
    float delta = std::fabs(label[i] - static_cast<int>(label[i]));
    if (delta > kEpsilon) {
      Log::Fatal("label should be int type (met %f) for ranking task, \
for the gain of label, please set the label_gain parameter.", label[i]);
    }
    if (static_cast<size_t>(label[i]) >= label_gain_.size() || label[i] < 0) {
      Log::Fatal("label (%d) excel the max range %d", label[i], label_gain_.size());
    }
  }
}

@alexeib commented on Dec 1, 2017, inline on the range check above:

> i am getting errors here like this:
> `lightgbm.basic.LightGBMError: b'label (31) excel the max range 1965'`
> not sure how that's even possible
> in this case, my session/group has 1965 items, each with a unique int label from 0 to 1964

} // namespace LightGBM
1 change: 1 addition & 0 deletions src/metric/rank_metric.hpp
@@ -40,6 +40,7 @@ class NDCGMetric:public Metric {
    num_data_ = num_data;
    // get label
    label_ = metadata.label();
    DCGCalculator::CheckLabel(label_, num_data_);
    // get query boundaries
    query_boundaries_ = metadata.query_boundaries();
    if (query_boundaries_ == nullptr) {
1 change: 1 addition & 0 deletions src/objective/rank_objective.hpp
@@ -47,6 +47,7 @@ class LambdarankNDCG: public ObjectiveFunction {
    num_data_ = num_data;
    // get label
    label_ = metadata.label();
    DCGCalculator::CheckLabel(label_, num_data_);
    // get weights
    weights_ = metadata.weights();
    // get boundaries