Add categorical feature support back.

microsoft · Mar 1, 2017 · ef77806 · ef77806
1 parent d93eb33
commit ef77806
Show file tree

Hide file tree

Showing 41 changed files with 1,097 additions and 296 deletions.
diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE
@@ -12,6 +12,7 @@ export(lgb.Dataset)
 export(lgb.Dataset.construct)
 export(lgb.Dataset.create.valid)
 export(lgb.Dataset.save)
+export(lgb.Dataset.set.categorical)
 export(lgb.Dataset.set.reference)
 export(lgb.cv)
 export(lgb.dump)

diff --git a/R-package/R/lgb.Dataset.R b/R-package/R/lgb.Dataset.R
@@ -12,6 +12,7 @@ Dataset <- R6Class(
                           params              = list(),
                           reference           = NULL,
                           colnames            = NULL,
+                          categorical_feature = NULL,
                           predictor           = NULL,
                           free_raw_data       = TRUE,
                           used_indices        = NULL,
@@ -41,6 +42,7 @@ Dataset <- R6Class(
       private$reference <- reference
       private$colnames  <- colnames
 
+      private$categorical_feature <- categorical_feature
       private$predictor           <- predictor
       private$free_raw_data       <- free_raw_data
       private$used_indices        <- used_indices
@@ -52,6 +54,7 @@ Dataset <- R6Class(
         private$params,
         self,
         private$colnames,
+        private$categorical_feature,
         private$predictor,
         private$free_raw_data,
         NULL,
@@ -73,6 +76,21 @@ Dataset <- R6Class(
       if (is.null(private$colnames) && !is.null(cnames)) {
         private$colnames <- as.character(cnames)
       }
+      # Get categorical feature index
+      if (!is.null(private$categorical_feature)) {
+        if (typeof(private$categorical_feature) == "character") {
+            cate_indices <- as.list(match(private$categorical_feature, private$colnames) - 1)
+            if (sum(is.na(cate_indices)) > 0) {
+              stop("lgb.self.get.handle: supplied an unknown feature in categorical_feature: ", sQuote(private$categorical_feature[is.na(cate_indices)]))
+            }
+          } else {
+            if (max(private$categorical_feature) > length(private$colnames)) {
+              stop("lgb.self.get.handle: supplied a too large value in categorical_feature: ", max(private$categorical_feature), " but only ", length(private$colnames), " features")
+            }
+            cate_indices <- as.list(private$categorical_feature - 1)
+          }
+        private$params$categorical_feature <- cate_indices
+      }
       # Check has header or not
       has_header <- FALSE
       if (!is.null(private$params$has_header) ||
@@ -271,6 +289,7 @@ Dataset <- R6Class(
         private$params,
         self,
         private$colnames,
+        private$categorical_feature,
         private$predictor,
         private$free_raw_data,
         idxset,
@@ -282,7 +301,20 @@ Dataset <- R6Class(
       private$params <- modifyList(private$params, params)
       self
     },
+    set_categorical_feature = function(categorical_feature) {
+      if (identical(private$categorical_feature, categorical_feature)) { return(self) }
+      if (is.null(private$raw_data)) {
+        stop(
+          "set_categorical_feature: cannot set categorical feature after freeing raw data,
+          please set ", sQuote("free_raw_data = FALSE"), " when you construct lgb.Dataset"
+        )
+      }
+      private$categorical_feature <- categorical_feature
+      self$finalize()
+      self
+    },
     set_reference = function(reference) {
+      self$set_categorical_feature(reference$.__enclos_env__$private$categorical_feature)
       self$set_colnames(reference$get_colnames())
       private$set_predictor(reference$.__enclos_env__$private$predictor)
       if (identical(private$reference, reference)) { return(self) }
@@ -316,6 +348,7 @@ Dataset <- R6Class(
     params              = list(),
     reference           = NULL,
     colnames            = NULL,
+    categorical_feature = NULL,
     predictor           = NULL,
     free_raw_data       = TRUE,
     used_indices        = NULL,
@@ -353,6 +386,7 @@ Dataset <- R6Class(
 #' @param params a list of parameters
 #' @param reference reference dataset
 #' @param colnames names of columns
+#' @param categorical_feature categorical features
 #' @param free_raw_data TRUE for need to free raw data after construct
 #' @param info a list of information of the lgb.Dataset object
 #' @param ... other information to pass to \code{info} or parameters pass to \code{params}
@@ -371,6 +405,7 @@ lgb.Dataset <- function(data,
                         params              = list(),
                         reference           = NULL,
                         colnames            = NULL,
+                        categorical_feature = NULL,
                         free_raw_data       = TRUE,
                         info                = list(),
                         ...) {
@@ -379,6 +414,7 @@ lgb.Dataset <- function(data,
     params,
     reference,
     colnames,
+    categorical_feature,
     NULL,
     free_raw_data,
     NULL,
@@ -628,6 +664,29 @@ setinfo.lgb.Dataset <- function(dataset, name, info, ...) {
   dataset$setinfo(name, info)
 }
 
+#' Set categorical feature of \code{lgb.Dataset}
+#'
+#' @param dataset object of class \code{lgb.Dataset}
+#' @param categorical_feature categorical features
+#' @return passed dataset
+#' @examples
+#' \dontrun{
+#'   data(agaricus.train, package='lightgbm')
+#'   train <- agaricus.train
+#'   dtrain <- lgb.Dataset(train$data, label=train$label)
+#'   lgb.Dataset.save(dtrain, 'lgb.Dataset.data')
+#'   dtrain <- lgb.Dataset('lgb.Dataset.data')
+#'   lgb.Dataset.set.categorical(dtrain, 1:2)
+#' }
+#' @rdname lgb.Dataset.set.categorical
+#' @export
+lgb.Dataset.set.categorical <- function(dataset, categorical_feature) {
+  if (!lgb.is.Dataset(dataset)) {
+    stop("lgb.Dataset.set.categorical: input dataset should be an lgb.Dataset object")
+  }
+  dataset$set_categorical_feature(categorical_feature)
+}
+
 #' Set reference of \code{lgb.Dataset}
 #'
 #' If you want to use validation data, you should set reference to training data

diff --git a/R-package/R/lgb.cv.R b/R-package/R/lgb.cv.R
@@ -46,6 +46,9 @@ CVBooster <- R6Class(
 #'        the \code{nfold} and \code{stratified} parameters are ignored.
 #' @param init_model path of model file of \code{lgb.Booster} object, will continue train from this model
 #' @param colnames feature names, if not null, will use this to overwrite the names in dataset
+#' @param categorical_feature list of str or int
+#'        type int represents index,
+#'        type str represents feature names
 #' @param early_stopping_rounds int
 #'        Activates early stopping.
 #'        Requires at least one validation data and one metric
@@ -81,6 +84,7 @@ lgb.cv <- function(params=list(), data, nrounds = 10,
                    folds                 = NULL,
                    init_model            = NULL,
                    colnames              = NULL,
+                   categorical_feature   = NULL,
                    early_stopping_rounds = NULL,
                    callbacks             = list(), ...) {
   addiction_params <- list(...)
@@ -118,6 +122,7 @@ lgb.cv <- function(params=list(), data, nrounds = 10,
   data$update_params(params)
   data$.__enclos_env__$private$set_predictor(predictor)
   if (!is.null(colnames)) { data$set_colnames(colnames) }
+  if (!is.null(categorical_feature)) { data$set_categorical_feature(categorical_feature) }
   data$construct()
 
   if (!is.null(folds)) {

diff --git a/R-package/R/lgb.model.dt.tree.R b/R-package/R/lgb.model.dt.tree.R
@@ -19,6 +19,7 @@
 #'  \item \code{leaf_parent}: ID of the parent node for current leaf (integer)
 #'  \item \code{split_gain}: Split gain of a node
 #'  \item \code{threshold}: Spliting threshold value of a node
+#'  \item \code{decision_type}: Decision type of a node
 #'  \item \code{internal_value}: Node value
 #'  \item \code{internal_count}: The number of observation collected by a node
 #'  \item \code{leaf_value}: Leaf value
@@ -62,14 +63,14 @@ single.tree.parse <- function(lgb_tree) {
   single_tree_dt <- data.table::data.table(tree_index = integer(0),
                                            split_index = integer(0), split_feature = integer(0), node_parent = integer(0),
                                            leaf_index = integer(0), leaf_parent = integer(0),
-                                           split_gain = numeric(0), threshold = numeric(0),
+                                           split_gain = numeric(0), threshold = numeric(0), decision_type = character(0),
                                            internal_value = integer(0), internal_count = integer(0),
                                            leaf_value = integer(0), leaf_count = integer(0))
   pre_order_traversal <- function(tree_node_leaf, parent_index = NA) {
     if (!is.null(tree_node_leaf$split_index)) {
       single_tree_dt <<- data.table::rbindlist(l = list(single_tree_dt,
                                                         c(tree_node_leaf[c("split_index", "split_feature",
-                                                                           "split_gain", "threshold",
+                                                                           "split_gain", "threshold", "decision_type",
                                                                            "internal_value", "internal_count")],
                                                           "node_parent" = parent_index)),
                                                use.names = TRUE, fill = TRUE)

diff --git a/R-package/R/lgb.train.R b/R-package/R/lgb.train.R
@@ -18,6 +18,9 @@
 #' @param eval_freq evalutaion output frequency, only effect when verbose > 0
 #' @param init_model path of model file of \code{lgb.Booster} object, will continue training from this model
 #' @param colnames feature names, if not null, will use this to overwrite the names in dataset
+#' @param categorical_feature list of str or int
+#'        type int represents index,
+#'        type str represents feature names
 #' @param early_stopping_rounds int
 #'        Activates early stopping.
 #'        Requires at least one validation data and one metric
@@ -52,6 +55,7 @@ lgb.train <- function(params = list(), data, nrounds = 10,
                       eval_freq             = 1L,
                       init_model            = NULL,
                       colnames              = NULL,
+                      categorical_feature   = NULL,
                       early_stopping_rounds = NULL,
                       callbacks             = list(), ...) {
   additional_params <- list(...)
@@ -96,6 +100,7 @@ lgb.train <- function(params = list(), data, nrounds = 10,
   data$update_params(params)
   data$.__enclos_env__$private$set_predictor(predictor)
   if (!is.null(colnames)) { data$set_colnames(colnames) }
+  if (!is.null(categorical_feature)) { data$set_categorical_feature(categorical_feature) }
   data$construct()
   vaild_contain_train <- FALSE
   train_data_name     <- "train"

diff --git a/R-package/man/lgb.Dataset.set.categorical.Rd b/R-package/man/lgb.Dataset.set.categorical.Rd
diff --git a/R-package/man/lgb.model.dt.tree.Rd b/R-package/man/lgb.model.dt.tree.Rd
diff --git a/R-package/man/lgb.train.Rd b/R-package/man/lgb.train.Rd
diff --git a/README.md b/README.md
@@ -20,8 +20,7 @@ News
 
 01/08/2017 : Release [**R-package**](./R-package) beta version, welcome to have a try and provide feedback.
 
-12/05/2016 : [deprecated in v2]**Categorical Features as input directly**(without one-hot coding). Experiment on [Expo data](http://stat-computing.org/dataexpo/2009/) shows about 8x speed-up with same accuracy compared with one-hot coding (refer to [categorical log]( https://github.com/guolinke/boosting_tree_benchmarks/blob/master/lightgbm/lightgbm_dataexpo_speed.log) and [one-hot log]( https://github.com/guolinke/boosting_tree_benchmarks/blob/master/lightgbm/lightgbm_dataexpo_onehot_speed.log)).
-For the setting details, please refer to [IO Parameters](./docs/Parameters.md#io-parameters).
+12/05/2016 : **Categorical Features as input directly**(without one-hot coding). Experiment on [Expo data](http://stat-computing.org/dataexpo/2009/) shows about 8x speed-up with same accuracy compared with one-hot coding.
 
 12/02/2016 : Release [**python-package**](./python-package) beta version, welcome to have a try and provide feedback.
 

diff --git a/docs/FAQ.md b/docs/FAQ.md
@@ -20,11 +20,11 @@ LightGBM FAQ
 
 - **Solution 1**: this error should be solved in latest version. If you still meet this error, try to remove lightgbm.egg-info folder in your python-package and reinstall, or check [this thread on stackoverflow](http://stackoverflow.com/questions/18085571/pip-install-error-setup-script-specifies-an-absolute-path).
 
-- **Question 2**: I see error messages like `Cannot get/set label/weight/init_score/group/num_data/num_feature before construct dataset`, but I already contruct dataset by some code like `train = lightgbm.Dataset(X_train, y_train)`, or error messages like `Cannot set predictor/reference after freed raw data, set free_raw_data=False when construct Dataset to avoid this.`.
+- **Question 2**: I see error messages like `Cannot get/set label/weight/init_score/group/num_data/num_feature before construct dataset`, but I already contruct dataset by some code like `train = lightgbm.Dataset(X_train, y_train)`, or error messages like `Cannot set predictor/reference/categorical feature after freed raw data, set free_raw_data=False when construct Dataset to avoid this.`.
 
 - **Solution 2**: Because LightGBM contructs bin mappers to build trees, and train and valid Datasets within one Booster share the same bin mappers, categorical features and feature names etc., the Dataset objects are constructed when contruct a Booster. And if you set free_raw_data=True (default), the raw data (with python data struct) will be freed. So, if you want to:
 
   + get label(or weight/init_score/group) before contruct dataset, it's same as get `self.label`
   + set label(or weight/init_score/group) before contruct dataset, it's same as `self.label=some_label_array`
   + get num_data(or num_feature) before contruct dataset, you can get data with `self.data`, then if your data is `numpy.ndarray`, use some code like `self.data.shape`
-  + set predictor(or reference) after contruct dataset, you should set free_raw_data=False or init a Dataset object with the same raw data
+  + set predictor(or reference/categorical feature) after contruct dataset, you should set free_raw_data=False or init a Dataset object with the same raw data
diff --git a/docs/Parameters.md b/docs/Parameters.md
@@ -150,6 +150,11 @@ The parameter format is ```key1=value1 key2=value2 ... ``` . And parameters can
   * Use number for index, e.g. ```ignore_column=0,1,2``` means column_0, column_1 and column_2 will be ignored.
   * Add a prefix ```name:``` for column name, e.g. ```ignore_column=name:c1,c2,c3``` means c1, c2 and c3 will be ignored.
   * Note: Index start from ```0```. And it doesn't count the label column.
+* ```categorical_feature```, default=```""```, type=string, alias=```categorical_column```,```cat_feature```,```cat_column```
+  * specific categorical features
+  * Use number for index, e.g. ```categorical_feature=0,1,2``` means column_0, column_1 and column_2 are categorical features.
+  * Add a prefix ```name:``` for column name, e.g. ```categorical_feature=name:c1,c2,c3``` means c1, c2 and c3 are categorical features.
+  * Note: Only support categorical with ```int``` type. Index start from ```0```. And it doesn't count the label column.
 * ```predict_raw_score```, default=```false```, type=bool, alias=```raw_score```,```is_predict_raw_score```
   * only used in prediction task
   * Set to ```true``` will only predict the raw scores.