Target and Count encodings for categorical features #3234

Closed
wants to merge 140 commits into from

140 commits
3d3d899
support categorical feature converters (CTR and count)
shiyu1994 Sep 7, 2020
02fe28b
merge master into CTR
shiyu1994 Sep 7, 2020
da9a051
sort categorical features in ctr provider (for distributed learning)
shiyu1994 Sep 8, 2020
b8b5747
store convert feature index map in model for prediction
shiyu1994 Sep 10, 2020
5eda890
fix R interface
shiyu1994 Sep 14, 2020
788dc77
Merge branch 'master' into ctr
shiyu1994 Sep 14, 2020
c9ded18
add prior weight
shiyu1994 Sep 17, 2020
dc0a7e2
Merge branch 'ctr' of https://github.com/shiyu1994/LightGBM into ctr
shiyu1994 Sep 17, 2020
271197a
used fold wise prior for training data
shiyu1994 Sep 27, 2020
7cfcd9a
Merge branch 'master' into ctr
shiyu1994 Sep 27, 2020
0d23835
recover prior_weight parameter
shiyu1994 Sep 27, 2020
e631b28
fix memory
shiyu1994 Oct 21, 2020
f25d9e9
remove useless code
shiyu1994 Oct 21, 2020
26ef4b9
fix CTRProvider copy constructor
shiyu1994 Oct 21, 2020
d12ead5
make copy constructor public
shiyu1994 Oct 21, 2020
d6eba9f
copy data pushing functions in copy constructor of CTRProvider
shiyu1994 Oct 21, 2020
f630ce9
fix copy constructor of CTRProvider
shiyu1994 Oct 21, 2020
cf728af
add destructor for CTRProvider
shiyu1994 Oct 21, 2020
19e0153
fix memory leak in CTRProvider of GBDT
shiyu1994 Oct 22, 2020
1e4a923
fix CatConverter Copy function
shiyu1994 Oct 28, 2020
abd6990
add ctr file parser
shiyu1994 Oct 29, 2020
205cc60
remove code changes in dataset
shiyu1994 Nov 18, 2020
ec17a68
wrap data reading functions with CTRProvider
shiyu1994 Nov 19, 2020
e8ec5f0
use ctr csc col iterators
shiyu1994 Nov 22, 2020
ffb3eb1
pad zeros when CTR is used
shiyu1994 Nov 25, 2020
a3ae6ab
wrap parser and row functions for prediction with CTRProvider
shiyu1994 Nov 26, 2020
9b6a157
rename parser.h in include/LightGBM to parser_base.h
shiyu1994 Nov 26, 2020
391c2eb
remove changes in src/main.cpp
shiyu1994 Nov 26, 2020
702c72f
merge LightGBM/master into shiyu1994/ctr
shiyu1994 Nov 26, 2020
a9cddcb
merge src/c_api.cpp from LightGBM/master
shiyu1994 Nov 26, 2020
e5db275
change data_size_t to INDEX_T in TextReader::SampleFromFileWithIndices
shiyu1994 Nov 26, 2020
32344ca
add virtual destructor for CTRProvider::CatConverter
shiyu1994 Nov 26, 2020
7558207
mark SetPrior as override for CatConverter sub-classes
shiyu1994 Nov 26, 2020
28206d3
replace keep_old_cat_method with raw cat_converters option
shiyu1994 Nov 27, 2020
dc70eec
check label function exists after extracting cat converters
shiyu1994 Nov 27, 2020
34ec34e
check can load from binary before constructing ctr provider
shiyu1994 Nov 30, 2020
49c78be
add train.conf before switching branch
shiyu1994 Nov 30, 2020
03f5f9b
update data create function calls in c_api_test/test_.py
shiyu1994 Dec 2, 2020
066e9e9
add simple ctr test
shiyu1994 Dec 2, 2020
dbdb028
fix doc for LGBM_DatasetCreate functions
shiyu1994 Dec 2, 2020
c17a9c6
add doc string for c_float_label in basic.py
shiyu1994 Dec 3, 2020
af3f30b
fix c++ linter problems
shiyu1994 Dec 3, 2020
392d1e2
fix doc string indent for the description of cat_converters in python
shiyu1994 Dec 3, 2020
0916948
fix c++ linter problems and regenerate Parameters.rst
shiyu1994 Dec 3, 2020
3d69501
fix explicit constructor issue
shiyu1994 Dec 3, 2020
d9c6fd0
use count lines when constructing CTRProvider from file
shiyu1994 Dec 22, 2020
c304fac
Merge branch 'master' into ctr
shiyu1994 Dec 22, 2020
08aaf8d
fix corner case bug when subcol is used
shiyu1994 Dec 23, 2020
cee63dc
Merge branch 'ctr' of https://github.com/shiyu1994/LightGBM into ctr
shiyu1994 Dec 23, 2020
84e9951
fix training data fold id bug
shiyu1994 Dec 24, 2020
8aa1146
Merge with LightGBM/master
shiyu1994 Dec 24, 2020
fe3f608
merge master
shiyu1994 Jan 10, 2021
3bef245
remove redundant LGBM_BoosterGetLinear in c_api.h
shiyu1994 Jan 10, 2021
8431ebf
add support of CTR for R package
shiyu1994 Jan 13, 2021
2bf9a69
use template to reduce redundant code
shiyu1994 Jan 18, 2021
74d80d5
remove label type and avoid strange warns when using categorical feat…
shiyu1994 Jan 20, 2021
fa7bb6b
merge master
shiyu1994 Jan 20, 2021
c1c0839
fix python test
shiyu1994 Jan 20, 2021
255f546
fix R test
shiyu1994 Jan 20, 2021
5595c36
fix python Dataset _extract_categorical_info_from_params
shiyu1994 Jan 21, 2021
b5f9302
test where the R test failed (will remove this commit)
shiyu1994 Jan 21, 2021
a4caf2c
replace array with vector when converting R label
shiyu1994 Jan 21, 2021
3a3ef94
test raw cat_converters
shiyu1994 Jan 21, 2021
6cd5cf1
check null of label in R Dataset
shiyu1994 Jan 21, 2021
eb2b5d3
remove R test for CTR
shiyu1994 Jan 21, 2021
8b237db
remove R support for CTR
shiyu1994 Jan 22, 2021
fac5a9d
remove changes in install.libs.R
shiyu1994 Jan 22, 2021
db1eb1b
fix R label type for CTR construction (can be integer)
shiyu1994 Jan 22, 2021
267058b
recover R tests for CTR
shiyu1994 Jan 22, 2021
10402f3
save prior_weight_ in model of CTRProvider
shiyu1994 Jan 22, 2021
742d4aa
check R test output
shiyu1994 Jan 22, 2021
f53824c
directly check CTR output
shiyu1994 Jan 22, 2021
aceb122
use prior_weight_ instead of config_.prior_weight when recovering CTR…
shiyu1994 Jan 22, 2021
abb20b4
try wrong ctr string
shiyu1994 Jan 22, 2021
b08507e
try wrong ctr string
shiyu1994 Jan 22, 2021
2bf9228
test that features are expanded with CTR
shiyu1994 Jan 22, 2021
b434784
skip tests due to CTR inconsistency across platforms
shiyu1994 Jan 22, 2021
fd6d8db
accumulate CTR statistics when sampling from file
shiyu1994 Jan 26, 2021
15b8bb0
Merge branch 'master' into ctr
shiyu1994 Jan 26, 2021
10c97e8
dynamic adjustment of num_original_features_ when load CTRProvider fr…
shiyu1994 Jan 26, 2021
5382b8b
add test for CTR in multi-class tasks
shiyu1994 Jan 26, 2021
0eeef3f
fix the case when categorical_feature is empty but cat_converters is …
shiyu1994 Jan 26, 2021
fd39a9f
rearrange headers in alphabetic order
shiyu1994 Jan 26, 2021
2cfbbfc
fix linter problems
shiyu1994 Jan 26, 2021
ae060c8
Merge branch 'master' into ctr
shiyu1994 Jan 26, 2021
c724fb8
fix two round loading from file
shiyu1994 Jan 26, 2021
d2c9b32
fix two round loading from file
shiyu1994 Jan 26, 2021
94aae86
clean-up code and fix load file bug
shiyu1994 Jan 27, 2021
595a2a4
Merge branch 'master' into ctr
shiyu1994 Jan 27, 2021
e2531f3
fix python linter problem
shiyu1994 Jan 27, 2021
b2260cd
Merge branch 'master' of https://github.com/microsoft/LightGBM into ctr
shiyu1994 Feb 2, 2021
dd41228
rename cat_converters to category_encoders and ctr to target encoding
shiyu1994 Apr 16, 2021
55920c2
remove model files in tests/python_package_test
shiyu1994 Apr 16, 2021
cad72eb
Merge branch 'master' into ctr
shiyu1994 Apr 16, 2021
18ffb3a
Add description for category_encoders
shiyu1994 Apr 16, 2021
e5379e8
Fix format
shiyu1994 Apr 16, 2021
fd6fa1e
remove redundant session name
shiyu1994 Apr 16, 2021
a544a16
Apply suggestions from code review
shiyu1994 Apr 22, 2021
b416916
Add CommonC::UnorderedMapToString
shiyu1994 Apr 22, 2021
a25f323
improve the format of CategoryEncodingProvider and add tests for vali…
shiyu1994 May 25, 2021
cdd96f7
merge master into ctr
shiyu1994 May 25, 2021
75a7fdd
fix R interface
shiyu1994 May 26, 2021
66826a2
increment model version and add backward compatibility
shiyu1994 May 26, 2021
60daa2d
merge LightGBM/master into ctr
shiyu1994 Oct 29, 2021
2d00fad
remove useless files
shiyu1994 Oct 29, 2021
45b208e
use multi_error instead of multi_logloss for test stability
shiyu1994 Oct 29, 2021
e9fd7db
remove comment
shiyu1994 Oct 29, 2021
39f08de
document that add_features_from cannot be used with non-default categ…
shiyu1994 Oct 29, 2021
2ed6a84
add support for category_encoders with monotone constraints
shiyu1994 Nov 1, 2021
e139fd1
ignore category_encoders when labels are not provided
shiyu1994 Nov 1, 2021
e84a330
add check for interaction constraints range in CategoryEncodingProvider
shiyu1994 Nov 9, 2021
17c3f29
comment new category encoding tests
shiyu1994 Nov 9, 2021
51a43e3
pull LightGBM/master into shiyu1994/ctr
shiyu1994 Nov 9, 2021
47edf6d
check whether force splits are specified
shiyu1994 Nov 9, 2021
a24da60
comment out CheckForcedSplitsForCategoryEncoding
shiyu1994 Nov 9, 2021
5f2aac2
recover gbdt.cpp
shiyu1994 Nov 9, 2021
f997fe0
change return type of CategoryEncodingProvider::CheckForcedSplitsForC…
shiyu1994 Nov 10, 2021
67bf9f8
merge master into ctr
shiyu1994 Nov 10, 2021
1116879
remove useless file
shiyu1994 Nov 10, 2021
6f50504
fix python linter errors
shiyu1994 Nov 10, 2021
4e7d8ba
remove white space
shiyu1994 Nov 10, 2021
a4ec01f
keep old C APIs
shiyu1994 Nov 15, 2021
936526d
Merge remote-tracking branch 'shiyu1994/ctr' into ctr
shiyu1994 Nov 15, 2021
4b95942
add blank lines
shiyu1994 Nov 15, 2021
500a426
test only no label data creation APIs in tests/c_api_test/test_.py
shiyu1994 Nov 15, 2021
cc58f19
test data creation APIs in tests/c_api_test/test_.py with label
shiyu1994 Nov 15, 2021
3d23c3b
fix return statements
shiyu1994 Nov 15, 2021
773c59b
fix solaris compatibility
shiyu1994 Nov 15, 2021
9dad619
fix lint errors
shiyu1994 Nov 15, 2021
6510a34
fix solaris compatibility
shiyu1994 Nov 15, 2021
b8d6ff3
fix linter error
shiyu1994 Nov 15, 2021
9fec031
fix conflict label_t definition with Solaris in c_api.cpp
shiyu1994 Nov 16, 2021
1bf6e94
fix return statement
shiyu1994 Nov 16, 2021
bbfc279
Merge remote-tracking branch 'LightGBM/master' into ctr
shiyu1994 Nov 16, 2021
f7370d2
fix conflicts with LightGBM/master
shiyu1994 Nov 16, 2021
fab88db
resolve R code issues
shiyu1994 Nov 17, 2021
880fbf8
Merge branch 'master' of https://github.com/microsoft/LightGBM into ctr
shiyu1994 Nov 17, 2021
8431411
merge master
shiyu1994 Dec 7, 2021
bcea451
merge master
shiyu1994 Dec 7, 2021
3e91f68
add more comments for CategoryEncodingProvider::SyncEncodingStat
shiyu1994 Dec 7, 2021
1 change: 1 addition & 0 deletions R-package/R/aliases.R
@@ -30,6 +30,7 @@
, "use_missing"
, "weight_column"
, "zero_as_missing"
, "category_encoders"
)])
}

14 changes: 12 additions & 2 deletions R-package/R/lgb.Dataset.R
@@ -195,7 +195,6 @@ Dataset <- R6::R6Class(
}

} else {

# Check if more categorical features were output over the feature space
if (max(private$categorical_feature) > length(private$colnames)) {
stop(
@@ -249,18 +248,28 @@
)

} else if (is.matrix(private$raw_data)) {

if (is.null(private$info[["label"]])) {
label <- NULL
} else {
label <- as.numeric(private$info[["label"]])
}
# Are we using a matrix?
handle <- .Call(
LGBM_DatasetCreateFromMat_R
, private$raw_data
, label
, nrow(private$raw_data)
, ncol(private$raw_data)
, params_str
, ref_handle
)

} else if (methods::is(private$raw_data, "dgCMatrix")) {
if (is.null(private$info[["label"]])) {
label <- NULL
} else {
label <- as.numeric(private$info[["label"]])
}
if (length(private$raw_data@p) > 2147483647L) {
stop("Cannot support large CSC matrix")
}
@@ -270,6 +279,7 @@
, private$raw_data@p
, private$raw_data@i
, private$raw_data@x
, label
, length(private$raw_data@p)
, length(private$raw_data@x)
, nrow(private$raw_data)
1 change: 1 addition & 0 deletions R-package/src/Makevars.in
@@ -29,6 +29,7 @@ OBJECTS = \
io/bin.o \
io/config.o \
io/config_auto.o \
io/category_encoding_provider.o \
io/dataset.o \
io/dataset_loader.o \
io/file_io.o \
1 change: 1 addition & 0 deletions R-package/src/Makevars.win.in
@@ -30,6 +30,7 @@ OBJECTS = \
io/bin.o \
io/config.o \
io/config_auto.o \
io/category_encoding_provider.o \
io/dataset.o \
io/dataset_loader.o \
io/file_io.o \
34 changes: 29 additions & 5 deletions R-package/src/lightgbm_R.cpp
@@ -145,6 +145,7 @@ SEXP LGBM_DatasetCreateFromFile_R(SEXP filename,
SEXP LGBM_DatasetCreateFromCSC_R(SEXP indptr,
SEXP indices,
SEXP data,
SEXP label,
SEXP num_indptr,
SEXP nelem,
SEXP num_row,
@@ -155,17 +156,28 @@ SEXP LGBM_DatasetCreateFromCSC_R(SEXP indptr,
const int* p_indptr = INTEGER(indptr);
const int* p_indices = INTEGER(indices);
const double* p_data = REAL(data);
const double* p_label = Rf_isNull(label) ? nullptr : REAL(label);
int64_t nindptr = static_cast<int64_t>(Rf_asInteger(num_indptr));
int64_t ndata = static_cast<int64_t>(Rf_asInteger(nelem));
int64_t nrow = static_cast<int64_t>(Rf_asInteger(num_row));
const char* parameters_ptr = CHAR(PROTECT(Rf_asChar(parameters)));
const float* float_p_label = nullptr;
std::vector<float> float_label_vec;
if (p_label != nullptr) {
float_label_vec.resize(nrow);
#pragma omp parallel for schedule(static) if (nrow >= 1024)
for (int i = 0; i < nrow; ++i) {
float_label_vec[i] = static_cast<float>(p_label[i]);
}
float_p_label = float_label_vec.data();
}
DatasetHandle handle = nullptr;
DatasetHandle ref = nullptr;
if (!Rf_isNull(reference)) {
ref = R_ExternalPtrAddr(reference);
}
CHECK_CALL(LGBM_DatasetCreateFromCSC(p_indptr, C_API_DTYPE_INT32, p_indices,
p_data, C_API_DTYPE_FLOAT64, nindptr, ndata,
CHECK_CALL(LGBM_DatasetCreateFromCSCWithLabel(p_indptr, C_API_DTYPE_INT32, p_indices,
p_data, float_p_label, C_API_DTYPE_FLOAT64, nindptr, ndata,
nrow, parameters_ptr, ref, &handle));
R_SetExternalPtrAddr(ret, handle);
R_RegisterCFinalizerEx(ret, _DatasetFinalizer, TRUE);
Expand All @@ -175,6 +187,7 @@ SEXP LGBM_DatasetCreateFromCSC_R(SEXP indptr,
}

SEXP LGBM_DatasetCreateFromMat_R(SEXP data,
SEXP label,
SEXP num_row,
SEXP num_col,
SEXP parameters,
@@ -184,13 +197,24 @@ SEXP LGBM_DatasetCreateFromMat_R(SEXP data,
int32_t nrow = static_cast<int32_t>(Rf_asInteger(num_row));
int32_t ncol = static_cast<int32_t>(Rf_asInteger(num_col));
double* p_mat = REAL(data);
double* p_label = Rf_isNull(label) ? nullptr : REAL(label);
const float* float_p_label = nullptr;
std::vector<float> float_label_vec;
if (p_label != nullptr) {
float_label_vec.resize(nrow);
#pragma omp parallel for schedule(static) if (nrow >= 1024)
for (int i = 0; i < nrow; ++i) {
float_label_vec[i] = static_cast<float>(p_label[i]);
}
float_p_label = float_label_vec.data();
}
const char* parameters_ptr = CHAR(PROTECT(Rf_asChar(parameters)));
DatasetHandle handle = nullptr;
DatasetHandle ref = nullptr;
if (!Rf_isNull(reference)) {
ref = R_ExternalPtrAddr(reference);
}
CHECK_CALL(LGBM_DatasetCreateFromMat(p_mat, C_API_DTYPE_FLOAT64, nrow, ncol, COL_MAJOR,
CHECK_CALL(LGBM_DatasetCreateFromMatWithLabel(p_mat, float_p_label, C_API_DTYPE_FLOAT64, nrow, ncol, COL_MAJOR,
parameters_ptr, ref, &handle));
R_SetExternalPtrAddr(ret, handle);
R_RegisterCFinalizerEx(ret, _DatasetFinalizer, TRUE);
@@ -926,8 +950,8 @@ SEXP LGBM_DumpParamAliases_R() {
static const R_CallMethodDef CallEntries[] = {
{"LGBM_HandleIsNull_R" , (DL_FUNC) &LGBM_HandleIsNull_R , 1},
{"LGBM_DatasetCreateFromFile_R" , (DL_FUNC) &LGBM_DatasetCreateFromFile_R , 3},
{"LGBM_DatasetCreateFromCSC_R" , (DL_FUNC) &LGBM_DatasetCreateFromCSC_R , 8},
{"LGBM_DatasetCreateFromMat_R" , (DL_FUNC) &LGBM_DatasetCreateFromMat_R , 5},
{"LGBM_DatasetCreateFromCSC_R" , (DL_FUNC) &LGBM_DatasetCreateFromCSC_R , 9},
{"LGBM_DatasetCreateFromMat_R" , (DL_FUNC) &LGBM_DatasetCreateFromMat_R , 6},
{"LGBM_DatasetGetSubset_R" , (DL_FUNC) &LGBM_DatasetGetSubset_R , 4},
{"LGBM_DatasetSetFeatureNames_R" , (DL_FUNC) &LGBM_DatasetSetFeatureNames_R , 2},
{"LGBM_DatasetGetFeatureNames_R" , (DL_FUNC) &LGBM_DatasetGetFeatureNames_R , 1},
4 changes: 4 additions & 0 deletions R-package/src/lightgbm_R.h
@@ -46,6 +46,7 @@ LIGHTGBM_C_EXPORT SEXP LGBM_DatasetCreateFromFile_R(
* \param indptr pointer to row headers
* \param indices findex
* \param data fvalue
* \param label label
* \param num_indptr number of cols in the matrix + 1
* \param nelem number of nonzero elements in the matrix
* \param num_row number of rows
@@ -57,6 +58,7 @@ LIGHTGBM_C_EXPORT SEXP LGBM_DatasetCreateFromCSC_R(
SEXP indptr,
SEXP indices,
SEXP data,
SEXP label,
SEXP num_indptr,
SEXP nelem,
SEXP num_row,
@@ -67,6 +69,7 @@ LIGHTGBM_C_EXPORT SEXP LGBM_DatasetCreateFromCSC_R(
/*!
* \brief create Dataset from dense matrix
* \param data matrix data
* \param label label
* \param num_row number of rows
* \param num_col number columns
* \param parameters additional parameters
@@ -75,6 +78,7 @@ LIGHTGBM_C_EXPORT SEXP LGBM_DatasetCreateFromCSC_R(
*/
LIGHTGBM_C_EXPORT SEXP LGBM_DatasetCreateFromMat_R(
SEXP data,
SEXP label,
SEXP num_row,
SEXP num_col,
SEXP parameters,
107 changes: 107 additions & 0 deletions R-package/tests/testthat/test_basic.R
@@ -2235,6 +2235,113 @@ test_that(paste0("lgb.train() gives same results when using interaction_constrai

})

test_that("Category encoding for R package works", {
# test category_encoders
set.seed(1L)
dtrain <- lgb.Dataset(train$data, label = train$label)
dtest <- lgb.Dataset(test$data, label = test$label, reference = dtrain)
cat_fid <- c(1L, 2L, 3L, 4L)
# ``` category_encoders = "" ``` is equal to ``` category_encoders = "raw" ```
params <- list(objective = "binary", categorical_feature = cat_fid, category_encoders = "")
bst <- lightgbm(
data = dtrain
, params = params
, nrounds = 10L
, verbose = 2L
, valids = list("valid1" = dtest)
)
pred1 <- bst$predict(test$data)

# treat the first 4 features as categorical features
dtrain <- lgb.Dataset(
train$data
, label = train$label
, categorical_feature = cat_fid
, category_encoders = "raw"
)
dtest <- lgb.Dataset(
test$data
, label = test$label
, categorical_feature = cat_fid
, reference = dtrain
)
params <- list(objective = "binary")
bst <- lightgbm(
data = dtrain
, params = params
, nrounds = 10L
, verbose = 2L
, valids = list("valid1" = dtest)
)
pred2 <- bst$predict(test$data)
expect_equal(pred1, pred2)

dtrain <- lgb.Dataset(
train$data
, label = train$label
, categorical_feature = cat_fid
)
dtest <- lgb.Dataset(
test$data
, label = test$label
, categorical_feature = cat_fid
, reference = dtrain
)
params <- list(objective = "binary", category_encoders = "target,count,raw")
bst <- lightgbm(
data = dtrain
, params = params
, nrounds = 10L
, verbose = 2L
, valids = list("valid1" = dtest)
)
pred3 <- bst$predict(test$data)
# one new "count" and "target" feature is added per categorical feature
num_new_cat_features <- length(cat_fid) * 2L
expect_equal(dim(dtrain), c(nrow(train$data), ncol(train$data) + num_new_cat_features))

# test gbdt model with category_encoders
model_file <- tempfile(fileext = ".model")
lgb.save(bst, model_file)
# finalize the booster and destroy it so you know we aren't cheating
bst$finalize()
expect_null(bst$.__enclos_env__$private$handle)
rm(bst)

bst2 <- lgb.load(
filename = model_file
)
pred4 <- predict(bst2, test$data)
expect_equal(pred3, pred4)


# test Dataset binary store with category_encoders
tmp_file <- tempfile(pattern = "lgb.Dataset_Category_Encoding_")
lgb.Dataset.save(
dataset = dtrain
, fname = tmp_file
)
dtrain_read_in <- lgb.Dataset(data = tmp_file)

tmp_file <- tempfile(pattern = "lgb.Dataset_Category_Encoding_2_")
lgb.Dataset.save(
dataset = dtest
, fname = tmp_file
)
dtest_read_in <- lgb.Dataset(data = tmp_file)

bst <- lightgbm(
data = dtrain_read_in
, params = params
, nrounds = 10L
, verbose = 2L
, valids = list("valid1" = dtest_read_in)
)
pred5 <- bst$predict(test$data)
expect_equal(pred3, pred5)
})


context("monotone constraints")

.generate_trainset_for_monotone_constraints_tests <- function(x3_to_categorical) {
1 change: 1 addition & 0 deletions R-package/tests/testthat/test_dataset.R
@@ -235,6 +235,7 @@ test_that("lgb.Dataset: Dataset should be able to construct from matrix and retu
handle <- .Call(
LGBM_DatasetCreateFromMat_R
, rawData
, NULL
, nrow(rawData)
, ncol(rawData)
, lightgbm:::lgb.params2str(params = list())
44 changes: 36 additions & 8 deletions docs/Advanced-Topics.rst
@@ -15,22 +15,50 @@ Missing Value Handle
Categorical Feature Support
---------------------------

- LightGBM offers good accuracy with integer-encoded categorical features. LightGBM applies
`Fisher (1958) <https://www.tandfonline.com/doi/abs/10.1080/01621459.1958.10501479>`_
to find the optimal split over categories as
`described here <./Features.rst#optimal-split-for-categorical-features>`_. This often performs better than one-hot encoding.
- LightGBM offers good accuracy with integer-encoded categorical features. It provides the following approaches for dealing with categorical features:

- Method 1: Apply `Fisher (1958) <https://www.tandfonline.com/doi/abs/10.1080/01621459.1958.10501479>`__ to find the optimal split over categories as `described here <./Features.rst#optimal-split-for-categorical-features>`__.

- Method 2: Encode categorical features as numerical values. LightGBM provides two encoding options:

- **Target encoding**: encode each categorical feature value by the mean of the labels of training data points with that feature value. The encoding easily overfits if the encoded value for a training data point uses that point's own label, so LightGBM randomly divides the training data into folds and, when calculating the target encoding for data in one fold, only considers data in the other folds.

- **Count encoding**: encode each categorical feature value by the number of training data points with that feature value.

These methods often perform better than one-hot encoding.

- Use ``categorical_feature`` to specify the categorical features.
Refer to the parameter ``categorical_feature`` in `Parameters <./Parameters.rst#categorical_feature>`__.

- Categorical features must be encoded as non-negative integers (``int``) less than ``Int32.MaxValue`` (2147483647).
It is best to use a contiguous range of integers started from zero.

- Use ``min_data_per_group``, ``cat_smooth`` to deal with over-fitting (when ``#data`` is small or ``#category`` is large).
- Use ``category_encoders`` to specify the methods used to deal with categorical features. Use

- ``raw`` to indicate method 1.

- ``target[:prior]`` to indicate target encoding in method 2. ``prior`` is a real number used to smooth the encoded values, which are calculated as ``(sum_label + prior * prior_weight) / (count + prior_weight)``. Here ``sum_label`` is the sum of the labels of training data points with the same categorical feature value, ``count`` is the number of training data points with that feature value (i.e. the count encoding value), and ``prior_weight`` is a hyper-parameter. If ``prior`` is omitted, the mean of all training labels is used as the default prior. A small worked sketch of this calculation appears after this list.

- ``count`` to indicate count encoding in method 2.

Note that these methods can be used simultaneously; separate different methods with commas.
For example, ``category_encoders=target:0.5,target,count,raw`` enables splits with method 1 and, in addition, converts each categorical feature into 3 numerical features: the first uses target encoding with prior ``0.5``, the second uses target encoding with the default prior (the mean of the training labels), and the third uses count encoding.
When ``category_encoders`` is empty, ``raw`` is used by default. The number and names of features change when ``category_encoders`` is not ``raw``.
Suppose the original name of a feature is ``NAME``; the naming rules for its target and count encoding features are:

- For the encoder ``target`` (without a user-specified prior), the new feature is named ``NAME_label_mean_prior_target_encoding_<label_mean>``, where ``<label_mean>`` is the mean of all labels in the training set.

- For the encoder ``target:<prior>`` (with a user-specified prior), the new feature is named ``NAME_target_encoding_<prior>``.

- For the encoder ``count``, the new feature is named ``NAME_count_encoding``.

Use ``get_feature_name()`` of the Python Booster class or ``feature_name()`` of the Python Dataset class after training to get the actual feature names used when ``category_encoders`` is set.

- Use ``num_target_encoding_folds`` to specify the number of folds into which the training data is divided when using target encoding.

- Use ``prior_weight`` to specify the weight of the prior in the target encoding calculation. A higher value enforces stronger regularization on the target encoding.

- For a categorical feature with high cardinality (``#category`` is large), it often works best to
treat the feature as numeric, either by simply ignoring the categorical interpretation of the integers or
by embedding the categories in a low-dimensional numeric space.
- When using method 1 (in other words, when ``raw`` is enabled in ``category_encoders``), use ``min_data_per_group`` and ``cat_smooth`` to deal with over-fitting (when ``#data`` is small or ``#category`` is large).
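
- For illustration, the following is a minimal sketch (not code from this PR) of how the fold-wise target encoding described above could be computed in R. The function name ``target_encode``, its arguments, and the default ``prior_weight`` of ``2.0`` are illustrative assumptions, not part of the proposed API.

  .. code-block:: r

     # Sketch of out-of-fold target encoding with a smoothing prior.
     # `x` is an integer-encoded categorical vector, `y` is the label vector, and
     # `folds` assigns each row to a fold (as `num_target_encoding_folds` would).
     target_encode <- function(x, y, folds, prior = mean(y), prior_weight = 2.0) {
       enc <- numeric(length(x))
       for (k in unique(folds)) {
         in_fold <- folds == k
         # statistics come only from the *other* folds, to limit overfitting
         sum_label <- tapply(y[!in_fold], x[!in_fold], sum)
         count <- tapply(y[!in_fold], x[!in_fold], length)
         key <- as.character(x[in_fold])
         num <- sum_label[key] + prior * prior_weight
         den <- count[key] + prior_weight
         # a category unseen in the other folds falls back to the prior
         num[is.na(num)] <- prior * prior_weight
         den[is.na(den)] <- prior_weight
         enc[in_fold] <- num / den
       }
       enc
     }

     # Worked example: a category seen 3 times in the other folds with labels 1, 0, 1,
     # prior = 0.5 and prior_weight = 2 encodes to (2 + 0.5 * 2) / (3 + 2) = 0.6.

  Count encoding simply corresponds to the ``count`` statistic above, used directly as the feature value.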

LambdaRank
----------