Skip to content

Commit

Permalink
Add categorical feature support back.
Browse files Browse the repository at this point in the history
  • Loading branch information
guolinke committed Mar 1, 2017
1 parent d93eb33 commit ef77806
Show file tree
Hide file tree
Showing 41 changed files with 1,097 additions and 296 deletions.
1 change: 1 addition & 0 deletions R-package/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ export(lgb.Dataset)
export(lgb.Dataset.construct)
export(lgb.Dataset.create.valid)
export(lgb.Dataset.save)
export(lgb.Dataset.set.categorical)
export(lgb.Dataset.set.reference)
export(lgb.cv)
export(lgb.dump)
Expand Down
59 changes: 59 additions & 0 deletions R-package/R/lgb.Dataset.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ Dataset <- R6Class(
params = list(),
reference = NULL,
colnames = NULL,
categorical_feature = NULL,
predictor = NULL,
free_raw_data = TRUE,
used_indices = NULL,
Expand Down Expand Up @@ -41,6 +42,7 @@ Dataset <- R6Class(
private$reference <- reference
private$colnames <- colnames

private$categorical_feature <- categorical_feature
private$predictor <- predictor
private$free_raw_data <- free_raw_data
private$used_indices <- used_indices
Expand All @@ -52,6 +54,7 @@ Dataset <- R6Class(
private$params,
self,
private$colnames,
private$categorical_feature,
private$predictor,
private$free_raw_data,
NULL,
Expand All @@ -73,6 +76,21 @@ Dataset <- R6Class(
if (is.null(private$colnames) && !is.null(cnames)) {
private$colnames <- as.character(cnames)
}
# Translate private$categorical_feature (feature names or one-based indices)
# into zero-based indices and store them in private$params$categorical_feature.
# Nothing is done when no categorical feature was supplied (NULL).
if (!is.null(private$categorical_feature)) {
if (typeof(private$categorical_feature) == "character") {
# Names: look each one up in private$colnames; match() returns one-based
# positions, hence the "- 1"
cate_indices <- as.list(match(private$categorical_feature, private$colnames) - 1)
# match() yields NA for any name that is absent from private$colnames
if (sum(is.na(cate_indices)) > 0) {
stop("lgb.self.get.handle: supplied an unknown feature in categorical_feature: ", sQuote(private$categorical_feature[is.na(cate_indices)]))
}
} else {
# Anything that is not character is treated as one-based indices;
# only the upper bound (number of column names) is checked here
if (max(private$categorical_feature) > length(private$colnames)) {
stop("lgb.self.get.handle: supplied a too large value in categorical_feature: ", max(private$categorical_feature), " but only ", length(private$colnames), " features")
}
cate_indices <- as.list(private$categorical_feature - 1)
}
# Hand the resolved indices over through the params list
private$params$categorical_feature <- cate_indices
}
# Check has header or not
has_header <- FALSE
if (!is.null(private$params$has_header) ||
Expand Down Expand Up @@ -271,6 +289,7 @@ Dataset <- R6Class(
private$params,
self,
private$colnames,
private$categorical_feature,
private$predictor,
private$free_raw_data,
idxset,
Expand All @@ -282,7 +301,20 @@ Dataset <- R6Class(
private$params <- modifyList(private$params, params)
self
},
# Replace the categorical feature specification of this Dataset.
#
# categorical_feature: the new specification; it is stored unchanged in
#   private$categorical_feature.
# Returns self, invisibly, so that calls can be chained without printing
#   the object at the console.
# Raises an error when the specification differs from the stored one but
#   private$raw_data is NULL.
set_categorical_feature = function(categorical_feature) {
  # Nothing to do when the specification is unchanged
  if (identical(private$categorical_feature, categorical_feature)) {
    return(invisible(self))
  }
  if (is.null(private$raw_data)) {
    # The message is built from single-line pieces: a string literal that
    # spans several source lines would embed the line break and the source
    # indentation in the error text
    stop(
      "set_categorical_feature: cannot set categorical feature after freeing raw data, ",
      "please set ", sQuote("free_raw_data = FALSE"), " when you construct lgb.Dataset"
    )
  }
  private$categorical_feature <- categorical_feature
  # NOTE(review): finalize() presumably discards the already constructed
  # handle so that the next construct() uses the new setting - confirm
  self$finalize()
  invisible(self)
},
set_reference = function(reference) {
self$set_categorical_feature(reference$.__enclos_env__$private$categorical_feature)
self$set_colnames(reference$get_colnames())
private$set_predictor(reference$.__enclos_env__$private$predictor)
if (identical(private$reference, reference)) { return(self) }
Expand Down Expand Up @@ -316,6 +348,7 @@ Dataset <- R6Class(
params = list(),
reference = NULL,
colnames = NULL,
categorical_feature = NULL,
predictor = NULL,
free_raw_data = TRUE,
used_indices = NULL,
Expand Down Expand Up @@ -353,6 +386,7 @@ Dataset <- R6Class(
#' @param params a list of parameters
#' @param reference reference dataset
#' @param colnames names of columns
#' @param categorical_feature categorical features
#' @param free_raw_data TRUE to free the raw data after the Dataset is constructed
#' @param info a list of information of the lgb.Dataset object
#' @param ... other information to pass to \code{info} or parameters pass to \code{params}
Expand All @@ -371,6 +405,7 @@ lgb.Dataset <- function(data,
params = list(),
reference = NULL,
colnames = NULL,
categorical_feature = NULL,
free_raw_data = TRUE,
info = list(),
...) {
Expand All @@ -379,6 +414,7 @@ lgb.Dataset <- function(data,
params,
reference,
colnames,
categorical_feature,
NULL,
free_raw_data,
NULL,
Expand Down Expand Up @@ -628,6 +664,29 @@ setinfo.lgb.Dataset <- function(dataset, name, info, ...) {
dataset$setinfo(name, info)
}

#' Set categorical feature of \code{lgb.Dataset}
#'
#' Validates that \code{dataset} is an \code{lgb.Dataset} and forwards
#' \code{categorical_feature} to its \code{set_categorical_feature} method.
#'
#' @param dataset object of class \code{lgb.Dataset}
#' @param categorical_feature categorical features, given either as feature
#'   names (character) or as one-based feature indices (numeric)
#' @return passed dataset
#' @examples
#' \dontrun{
#' data(agaricus.train, package='lightgbm')
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label=train$label)
#' lgb.Dataset.save(dtrain, 'lgb.Dataset.data')
#' dtrain <- lgb.Dataset('lgb.Dataset.data')
#' lgb.Dataset.set.categorical(dtrain, 1:2)
#' }
#' @rdname lgb.Dataset.set.categorical
#' @export
lgb.Dataset.set.categorical <- function(dataset, categorical_feature) {
  if (lgb.is.Dataset(dataset)) {
    # Delegate to the R6 method; its result is the value of this function
    dataset$set_categorical_feature(categorical_feature)
  } else {
    # Anything that is not an lgb.Dataset is rejected up front
    stop("lgb.Dataset.set.categorical: input dataset should be an lgb.Dataset object")
  }
}

#' Set reference of \code{lgb.Dataset}
#'
#' If you want to use validation data, you should set reference to training data
Expand Down
5 changes: 5 additions & 0 deletions R-package/R/lgb.cv.R
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@ CVBooster <- R6Class(
#' the \code{nfold} and \code{stratified} parameters are ignored.
#' @param init_model path of model file of \code{lgb.Booster} object, will continue train from this model
#' @param colnames feature names, if not null, will use this to overwrite the names in dataset
#' @param categorical_feature categorical features, given either as
#' a character vector of feature names
#' or as a numeric vector of one-based feature indices
#' @param early_stopping_rounds int
#' Activates early stopping.
#' Requires at least one validation data and one metric
Expand Down Expand Up @@ -81,6 +84,7 @@ lgb.cv <- function(params=list(), data, nrounds = 10,
folds = NULL,
init_model = NULL,
colnames = NULL,
categorical_feature = NULL,
early_stopping_rounds = NULL,
callbacks = list(), ...) {
addiction_params <- list(...)
Expand Down Expand Up @@ -118,6 +122,7 @@ lgb.cv <- function(params=list(), data, nrounds = 10,
data$update_params(params)
data$.__enclos_env__$private$set_predictor(predictor)
if (!is.null(colnames)) { data$set_colnames(colnames) }
if (!is.null(categorical_feature)) { data$set_categorical_feature(categorical_feature) }
data$construct()

if (!is.null(folds)) {
Expand Down
5 changes: 3 additions & 2 deletions R-package/R/lgb.model.dt.tree.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#' \item \code{leaf_parent}: ID of the parent node for current leaf (integer)
#' \item \code{split_gain}: Split gain of a node
#' \item \code{threshold}: Splitting threshold value of a node
#' \item \code{decision_type}: Decision type of a node
#' \item \code{internal_value}: Node value
#' \item \code{internal_count}: The number of observation collected by a node
#' \item \code{leaf_value}: Leaf value
Expand Down Expand Up @@ -62,14 +63,14 @@ single.tree.parse <- function(lgb_tree) {
single_tree_dt <- data.table::data.table(tree_index = integer(0),
split_index = integer(0), split_feature = integer(0), node_parent = integer(0),
leaf_index = integer(0), leaf_parent = integer(0),
split_gain = numeric(0), threshold = numeric(0),
split_gain = numeric(0), threshold = numeric(0), decision_type = character(0),
internal_value = integer(0), internal_count = integer(0),
leaf_value = integer(0), leaf_count = integer(0))
pre_order_traversal <- function(tree_node_leaf, parent_index = NA) {
if (!is.null(tree_node_leaf$split_index)) {
single_tree_dt <<- data.table::rbindlist(l = list(single_tree_dt,
c(tree_node_leaf[c("split_index", "split_feature",
"split_gain", "threshold",
"split_gain", "threshold", "decision_type",
"internal_value", "internal_count")],
"node_parent" = parent_index)),
use.names = TRUE, fill = TRUE)
Expand Down
5 changes: 5 additions & 0 deletions R-package/R/lgb.train.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@
#' @param eval_freq evaluation output frequency, only takes effect when verbose > 0
#' @param init_model path of model file of \code{lgb.Booster} object, will continue training from this model
#' @param colnames feature names, if not null, will use this to overwrite the names in dataset
#' @param categorical_feature categorical features, given either as
#' a character vector of feature names
#' or as a numeric vector of one-based feature indices
#' @param early_stopping_rounds int
#' Activates early stopping.
#' Requires at least one validation data and one metric
Expand Down Expand Up @@ -52,6 +55,7 @@ lgb.train <- function(params = list(), data, nrounds = 10,
eval_freq = 1L,
init_model = NULL,
colnames = NULL,
categorical_feature = NULL,
early_stopping_rounds = NULL,
callbacks = list(), ...) {
additional_params <- list(...)
Expand Down Expand Up @@ -96,6 +100,7 @@ lgb.train <- function(params = list(), data, nrounds = 10,
data$update_params(params)
data$.__enclos_env__$private$set_predictor(predictor)
if (!is.null(colnames)) { data$set_colnames(colnames) }
if (!is.null(categorical_feature)) { data$set_categorical_feature(categorical_feature) }
data$construct()
vaild_contain_train <- FALSE
train_data_name <- "train"
Expand Down
30 changes: 30 additions & 0 deletions R-package/man/lgb.Dataset.set.categorical.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions R-package/man/lgb.model.dt.tree.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 10 additions & 2 deletions R-package/man/lgb.train.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 1 addition & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,7 @@ News

01/08/2017 : Release [**R-package**](./R-package) beta version, welcome to have a try and provide feedback.

12/05/2016 : [deprecated in v2]**Categorical Features as input directly**(without one-hot coding). Experiment on [Expo data](http://stat-computing.org/dataexpo/2009/) shows about 8x speed-up with same accuracy compared with one-hot coding (refer to [categorical log]( https://github.com/guolinke/boosting_tree_benchmarks/blob/master/lightgbm/lightgbm_dataexpo_speed.log) and [one-hot log]( https://github.com/guolinke/boosting_tree_benchmarks/blob/master/lightgbm/lightgbm_dataexpo_onehot_speed.log)).
For the setting details, please refer to [IO Parameters](./docs/Parameters.md#io-parameters).
12/05/2016 : **Categorical Features as input directly**(without one-hot coding). Experiment on [Expo data](http://stat-computing.org/dataexpo/2009/) shows about 8x speed-up with same accuracy compared with one-hot coding.

12/02/2016 : Release [**python-package**](./python-package) beta version, welcome to have a try and provide feedback.

Expand Down
4 changes: 2 additions & 2 deletions docs/FAQ.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,11 @@ LightGBM FAQ

- **Solution 1**: this error should be solved in latest version. If you still meet this error, try to remove lightgbm.egg-info folder in your python-package and reinstall, or check [this thread on stackoverflow](http://stackoverflow.com/questions/18085571/pip-install-error-setup-script-specifies-an-absolute-path).

- **Question 2**: I see error messages like `Cannot get/set label/weight/init_score/group/num_data/num_feature before construct dataset`, but I have already constructed the dataset with code like `train = lightgbm.Dataset(X_train, y_train)`, or I see error messages like `Cannot set predictor/reference after freed raw data, set free_raw_data=False when construct Dataset to avoid this.`.
- **Question 2**: I see error messages like `Cannot get/set label/weight/init_score/group/num_data/num_feature before construct dataset`, but I have already constructed the dataset with code like `train = lightgbm.Dataset(X_train, y_train)`, or I see error messages like `Cannot set predictor/reference/categorical feature after freed raw data, set free_raw_data=False when construct Dataset to avoid this.`.

- **Solution 2**: Because LightGBM constructs bin mappers to build trees, and the train and valid Datasets within one Booster share the same bin mappers, categorical features, feature names, etc., the Dataset objects are only constructed when a Booster is constructed. If you set free_raw_data=True (the default), the raw data (held in Python data structures) will be freed at that point. So, if you want to:

+ get label (or weight/init_score/group) before constructing the dataset, it's the same as getting `self.label`
+ set label (or weight/init_score/group) before constructing the dataset, it's the same as `self.label=some_label_array`
+ get num_data (or num_feature) before constructing the dataset, you can get the data with `self.data`; then, if your data is a `numpy.ndarray`, use code like `self.data.shape`
+ set predictor (or reference) after constructing the dataset, you should set free_raw_data=False or init a Dataset object with the same raw data
+ set predictor (or reference/categorical feature) after constructing the dataset, you should set free_raw_data=False or init a Dataset object with the same raw data
5 changes: 5 additions & 0 deletions docs/Parameters.md
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,11 @@ The parameter format is ```key1=value1 key2=value2 ... ``` . And parameters can
* Use number for index, e.g. ```ignore_column=0,1,2``` means column_0, column_1 and column_2 will be ignored.
* Add a prefix ```name:``` for column name, e.g. ```ignore_column=name:c1,c2,c3``` means c1, c2 and c3 will be ignored.
* Note: The index starts from ```0```, and it doesn't count the label column.
* ```categorical_feature```, default=```""```, type=string, alias=```categorical_column```,```cat_feature```,```cat_column```
* specify categorical features
* Use number for index, e.g. ```categorical_feature=0,1,2``` means column_0, column_1 and column_2 are categorical features.
* Add a prefix ```name:``` for column name, e.g. ```categorical_feature=name:c1,c2,c3``` means c1, c2 and c3 are categorical features.
* Note: Only categorical features of ```int``` type are supported. The index starts from ```0```, and it doesn't count the label column.
* ```predict_raw_score```, default=```false```, type=bool, alias=```raw_score```,```is_predict_raw_score```
* only used in prediction task
* Set to ```true``` will only predict the raw scores.
Expand Down

0 comments on commit ef77806

Please sign in to comment.