[R-package] Add more examples and bank dataset (#887)
* Add efficient demo for LightGBM

* Add bank dataset and more examples

* Add bank manual
Laurae2 authored and guolinke committed Sep 4, 2017
1 parent af36aaa commit 898c88d
Showing 7 changed files with 256 additions and 1 deletion.
21 changes: 21 additions & 0 deletions R-package/R/lightgbm.R
@@ -96,6 +96,27 @@ NULL
#' rows and 126 variables
NULL

#' Bank Marketing Data Set
#'
#' This data set is originally from the Bank Marketing data set,
#' UCI Machine Learning Repository.
#'
#' It contains only bank.csv, with 10% of the examples and 17 inputs,
#' randomly selected from bank-full.csv (the older version of this
#' dataset, with fewer inputs).
#'
#' @references
#' http://archive.ics.uci.edu/ml/datasets/Bank+Marketing
#'
#' S. Moro, P. Cortez and P. Rita (2014). A Data-Driven Approach to Predict
#' the Success of Bank Telemarketing. Decision Support Systems, 62, 22-31.
#'
#' @docType data
#' @keywords datasets
#' @name bank
#' @usage data(bank)
#' @format A data.table with 4521 rows and 17 variables
NULL

# Various imports
#' @import methods
#' @importFrom R6 R6Class
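A minimal usage sketch for the new dataset (assuming the package is installed with the data bundled): it loads like any other packaged dataset.

# Load the bundled Bank Marketing dataset and inspect it
data(bank, package = "lightgbm")
str(bank)  # 4521 obs. of 17 variables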
Binary file added R-package/data/bank.rda
Binary file not shown.
5 changes: 4 additions & 1 deletion R-package/demo/00Index
@@ -1,7 +1,10 @@
 basic_walkthrough              Basic feature walkthrough
 boost_from_prediction          Boosting from existing prediction
-early_stopping                 Early Stop in training
+categorical_features_prepare   Categorical Feature Preparation
+categorical_features_rules     Categorical Feature Preparation with Rules
 cross_validation               Cross Validation
+early_stopping                 Early Stop in training
+efficient_many_training        Efficiency for Many Model Trainings
 multiclass                     Multiclass training/prediction
 leaf_stability                 Leaf (in)Stability example
 weight_param                   Weight-Parameter adjustment relationship
81 changes: 81 additions & 0 deletions R-package/demo/categorical_features_prepare.R
@@ -0,0 +1,81 @@
# Here we are going to try training a model with categorical features

# Load libraries
library(data.table)
library(lightgbm)

# Load data and look at the structure
#
# Classes 'data.table' and 'data.frame': 4521 obs. of 17 variables:
# $ age : int 30 33 35 30 59 35 36 39 41 43 ...
# $ job : chr "unemployed" "services" "management" "management" ...
# $ marital : chr "married" "married" "single" "married" ...
# $ education: chr "primary" "secondary" "tertiary" "tertiary" ...
# $ default : chr "no" "no" "no" "no" ...
# $ balance : int 1787 4789 1350 1476 0 747 307 147 221 -88 ...
# $ housing : chr "no" "yes" "yes" "yes" ...
# $ loan : chr "no" "yes" "no" "yes" ...
# $ contact : chr "cellular" "cellular" "cellular" "unknown" ...
# $ day : int 19 11 16 3 5 23 14 6 14 17 ...
# $ month : chr "oct" "may" "apr" "jun" ...
# $ duration : int 79 220 185 199 226 141 341 151 57 313 ...
# $ campaign : int 1 1 1 4 1 2 1 2 2 1 ...
# $ pdays : int -1 339 330 -1 -1 176 330 -1 -1 147 ...
# $ previous : int 0 4 1 0 0 3 2 0 0 2 ...
# $ poutcome : chr "unknown" "failure" "failure" "unknown" ...
# $ y : chr "no" "no" "no" "no" ...
data(bank, package = "lightgbm")
str(bank)

# We must now transform the data into a format LightGBM can consume
# For this task, we use lgb.prepare, which converts character and factor
# columns into numeric codes
#
# Classes 'data.table' and 'data.frame': 4521 obs. of 17 variables:
# $ age      : int 30 33 35 30 59 35 36 39 41 43 ...
# $ job      : num 11 8 5 5 2 5 7 10 3 8 ...
# $ marital  : num 2 2 3 2 2 3 2 2 2 2 ...
# $ education: num 1 2 3 3 2 3 3 2 3 1 ...
# $ default  : num 1 1 1 1 1 1 1 1 1 1 ...
# $ balance  : int 1787 4789 1350 1476 0 747 307 147 221 -88 ...
# $ housing  : num 1 2 2 2 2 1 2 2 2 2 ...
# $ loan     : num 1 2 1 2 1 1 1 1 1 2 ...
# $ contact  : num 1 1 1 3 3 1 1 1 3 1 ...
# $ day      : int 19 11 16 3 5 23 14 6 14 17 ...
# $ month    : num 11 9 1 7 9 4 9 9 9 1 ...
# $ duration : int 79 220 185 199 226 141 341 151 57 313 ...
# $ campaign : int 1 1 1 4 1 2 1 2 2 1 ...
# $ pdays    : int -1 339 330 -1 -1 176 330 -1 -1 147 ...
# $ previous : int 0 4 1 0 0 3 2 0 0 2 ...
# $ poutcome : num 4 1 1 4 4 1 2 4 4 1 ...
# $ y        : num 1 1 1 1 1 1 1 1 1 1 ...
bank <- lgb.prepare(data = bank)
str(bank)

# Subtract 1 from the label so it lies in {0, 1}, as required by the binary objective
bank$y <- bank$y - 1

# Data input to LightGBM must be a matrix, without the label
my_data <- as.matrix(bank[, 1:16, with = FALSE])

# Creating the LightGBM dataset with categorical features
# The categorical features must be indexed like in R (1-indexed, not 0-indexed)
lgb_data <- lgb.Dataset(data = my_data,
                        label = bank$y,
                        categorical_feature = c(2, 3, 4, 5, 7, 8, 9, 11, 16))
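
# Optional sanity check: the 1-indexed positions above are exactly the
# character columns of the dataset
print(colnames(my_data)[c(2, 3, 4, 5, 7, 8, 9, 11, 16)])
# "job" "marital" "education" "default" "housing" "loan" "contact" "month" "poutcome"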

# We can now train a model
model <- lgb.train(list(objective = "binary",
                        metric = "l2",
                        min_data = 1,
                        learning_rate = 0.1,
                        min_hessian = 1,
                        max_depth = 2),
                   lgb_data,
                   100,
                   valids = list(train = lgb_data))

# Look for "split_feature": 2 in the dump below
# If present, the first tree split on a categorical feature
lgb.dump(model, num_iteration = 1)
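
# A programmatic variant of the check above (a sketch; it assumes lgb.dump
# returns the model as a single JSON string, as in this package version)
json_dump <- lgb.dump(model, num_iteration = 1)
if (grepl("\"split_feature\":\\s*2", json_dump)) {
  print("Found a first-tree split on a categorical feature")
}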
91 changes: 91 additions & 0 deletions R-package/demo/categorical_features_rules.R
@@ -0,0 +1,91 @@
# Here we are going to try training a model with categorical features

# Load libraries
library(data.table)
library(lightgbm)

# Load data and look at the structure
#
# Classes 'data.table' and 'data.frame': 4521 obs. of 17 variables:
# $ age : int 30 33 35 30 59 35 36 39 41 43 ...
# $ job : chr "unemployed" "services" "management" "management" ...
# $ marital : chr "married" "married" "single" "married" ...
# $ education: chr "primary" "secondary" "tertiary" "tertiary" ...
# $ default : chr "no" "no" "no" "no" ...
# $ balance : int 1787 4789 1350 1476 0 747 307 147 221 -88 ...
# $ housing : chr "no" "yes" "yes" "yes" ...
# $ loan : chr "no" "yes" "no" "yes" ...
# $ contact : chr "cellular" "cellular" "cellular" "unknown" ...
# $ day : int 19 11 16 3 5 23 14 6 14 17 ...
# $ month : chr "oct" "may" "apr" "jun" ...
# $ duration : int 79 220 185 199 226 141 341 151 57 313 ...
# $ campaign : int 1 1 1 4 1 2 1 2 2 1 ...
# $ pdays : int -1 339 330 -1 -1 176 330 -1 -1 147 ...
# $ previous : int 0 4 1 0 0 3 2 0 0 2 ...
# $ poutcome : chr "unknown" "failure" "failure" "unknown" ...
# $ y : chr "no" "no" "no" "no" ...
data(bank, package = "lightgbm")
str(bank)

# Split the dataset in two: one part for training, one for validation
bank_train <- bank[1:4000, ]
bank_test <- bank[4001:4521, ]

# We must now transform the data into a format LightGBM can consume
# For this task, we use lgb.prepare_rules, which converts characters and
# factors to numeric codes and also returns the rules for reuse on new data
#
# Classes 'data.table' and 'data.frame': 521 obs. of 17 variables:
# $ age : int 53 36 58 26 34 55 55 34 41 38 ...
# $ job : num 1 10 10 9 10 2 2 3 3 4 ...
# $ marital : num 1 2 1 3 3 2 2 2 1 1 ...
# $ education: num 2 2 2 2 2 1 2 3 2 2 ...
# $ default : num 1 1 1 1 1 1 1 1 1 1 ...
# $ balance : int 26 191 -123 -147 179 1086 471 105 1588 70 ...
# $ housing : num 2 1 1 1 1 2 2 2 2 1 ...
# $ loan : num 1 1 1 1 1 1 1 1 2 1 ...
# $ contact : num 1 1 1 3 1 1 3 3 3 1 ...
# $ day : int 7 31 5 4 19 6 30 28 20 27 ...
# $ month : num 9 2 2 7 2 9 9 9 7 11 ...
# $ duration : int 56 69 131 95 294 146 58 249 10 255 ...
# $ campaign : int 1 1 2 2 3 1 2 2 8 3 ...
# $ pdays : int 359 -1 -1 -1 -1 272 -1 -1 -1 148 ...
# $ previous : int 1 0 0 0 0 2 0 0 0 1 ...
# $ poutcome : num 1 4 4 4 4 1 4 4 4 3 ...
# $ y : num 1 1 1 1 1 1 1 1 1 2 ...
bank_rules <- lgb.prepare_rules(data = bank_train)
bank_train <- bank_rules$data
bank_test <- lgb.prepare_rules(data = bank_test, rules = bank_rules$rules)$data
str(bank_test)
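
# Optional inspection (a sketch; it assumes rules is a per-column list of
# named level-to-integer mappings, as returned by lgb.prepare_rules)
print(bank_rules$rules[["job"]])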

# Subtract 1 from the labels so they lie in {0, 1}, as required by the binary objective
bank_train$y <- bank_train$y - 1
bank_test$y <- bank_test$y - 1

# Data input to LightGBM must be a matrix, without the label
my_data_train <- as.matrix(bank_train[, 1:16, with = FALSE])
my_data_test <- as.matrix(bank_test[, 1:16, with = FALSE])

# Creating the LightGBM datasets; the categorical features are passed to
# lgb.train via params instead, which avoids repeating them for each dataset
dtrain <- lgb.Dataset(data = my_data_train,
                      label = bank_train$y)
dtest <- lgb.Dataset(data = my_data_test,
                     label = bank_test$y)

# We can now train a model
model <- lgb.train(list(objective = "binary",
                        metric = "l2",
                        min_data = 1,
                        learning_rate = 0.1,
                        min_hessian = 1,
                        max_depth = 2,
                        categorical_feature = c(2, 3, 4, 5, 7, 8, 9, 11, 16)),
                   dtrain,
                   100,
                   valids = list(train = dtrain, valid = dtest))

# Look for "split_feature": 11 in the dump below
# If present, the first tree split on a categorical feature
lgb.dump(model, num_iteration = 1)
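
# A further sketch: the rules can be persisted and reapplied to future
# batches so the encoding stays consistent with training
# (the file name below is illustrative)
saveRDS(bank_rules$rules, file = "bank_rules.rds")
rules_reloaded <- readRDS("bank_rules.rds")
bank_new <- lgb.prepare_rules(data = bank[4001:4521, ], rules = rules_reloaded)$data
str(bank_new)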
34 changes: 34 additions & 0 deletions R-package/demo/efficient_many_training.R
@@ -0,0 +1,34 @@
# Efficient training means training without giving up too much RAM
# When training many models (100+), RAM gets consumed very quickly
# It is therefore essential to have a strategy for dealing with this issue

# More results can be found here: https://github.com/Microsoft/LightGBM/issues/879#issuecomment-326656580
# Quote: "@Laurae2 Thanks for nice easily reproducible example (unlike mine).
# With reset=FALSE you get after 500 iterations (not 1000): OS reports 27GB usage, while R gc() reports 1.5GB.
# Just doing reset=TRUE will already improve things: OS reports 4.6GB.
# Doing reset=TRUE and calling gc() in the loop will have OS 1.3GB. Thanks for the latest tip."

# Load library
library(lightgbm)

# Generate synthetic data of size 1M x 100
set.seed(11111)
x_data <- matrix(rnorm(n = 100000000, mean = 0, sd = 100), nrow = 1000000, ncol = 100)
y_data <- rnorm(n = 1000000, mean = 0, sd = 5)

# Create lgb.Dataset for training
data <- lgb.Dataset(x_data, label = y_data)
data$construct()
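# construct() builds the binned LightGBM Dataset immediately, so the memory
# baseline is fixed before the loop below rather than on first use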

# Loop through 1000 model trainings; watch RAM usage in your task manager
# It should remain roughly constant (increasing at most very slightly)
gbm <- list()

for (i in 1:1000) {
  print(i)
  gbm[[i]] <- lgb.train(params = list(objective = "regression"),
                        data = data,
                        nrounds = 1,
                        reset_data = TRUE)
  gc(verbose = FALSE)
}
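
# For contrast, a sketch of the leaky pattern described in the quote above:
# without reset_data = TRUE, each booster keeps its constructed training data
# attached, so OS-level memory grows with every stored model
# (left commented out on purpose; it can exhaust RAM)
# for (i in 1:1000) {
#   gbm[[i]] <- lgb.train(params = list(objective = "regression"),
#                         data = data,
#                         nrounds = 1)
# }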
25 changes: 25 additions & 0 deletions R-package/man/bank.Rd

(Generated file; diff not rendered.)
