[R-package] Add more examples and bank dataset (#887)
* Add efficient demo for LightGBM
* Add bank dataset and more examples
* Add bank manual
Showing 7 changed files with 256 additions and 1 deletion.
Binary file not shown.
@@ -1,7 +1,10 @@
basic_walkthrough            Basic feature walkthrough
boost_from_prediction        Boosting from existing prediction
early_stopping               Early Stop in training
categorical_feature_prepare  Categorical Feature Preparation
categorical_feature_rules    Categorical Feature Preparation with Rules
cross_validation             Cross Validation
early_stopping               Early Stop in training
efficient_many_training      Efficiency for Many Model Trainings
multiclass                   Multiclass training/prediction
leaf_stability               Leaf (in)Stability example
weight_param                 Weight-Parameter adjustment relationship
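Each entry above is a runnable demo shipped with the package; once lightgbm is installed, any of them can be launched from an R session with base R's demo() function, for example:

demo("categorical_feature_prepare", package = "lightgbm")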
@@ -0,0 +1,81 @@
# Here we are going to try training a model with categorical features

# Load libraries
library(data.table)
library(lightgbm)

# Load data and look at the structure
#
# Classes 'data.table' and 'data.frame': 4521 obs. of 17 variables:
# $ age : int 30 33 35 30 59 35 36 39 41 43 ...
# $ job : chr "unemployed" "services" "management" "management" ...
# $ marital : chr "married" "married" "single" "married" ...
# $ education: chr "primary" "secondary" "tertiary" "tertiary" ...
# $ default : chr "no" "no" "no" "no" ...
# $ balance : int 1787 4789 1350 1476 0 747 307 147 221 -88 ...
# $ housing : chr "no" "yes" "yes" "yes" ...
# $ loan : chr "no" "yes" "no" "yes" ...
# $ contact : chr "cellular" "cellular" "cellular" "unknown" ...
# $ day : int 19 11 16 3 5 23 14 6 14 17 ...
# $ month : chr "oct" "may" "apr" "jun" ...
# $ duration : int 79 220 185 199 226 141 341 151 57 313 ...
# $ campaign : int 1 1 1 4 1 2 1 2 2 1 ...
# $ pdays : int -1 339 330 -1 -1 176 330 -1 -1 147 ...
# $ previous : int 0 4 1 0 0 3 2 0 0 2 ...
# $ poutcome : chr "unknown" "failure" "failure" "unknown" ...
# $ y : chr "no" "no" "no" "no" ...
data(bank, package = "lightgbm")
str(bank)

# We must now transform the data so that LightGBM can fit on it
# For this task, we use lgb.prepare
# The function converts the character columns to numeric so the data can be used for training
#
# Classes 'data.table' and 'data.frame': 4521 obs. of 17 variables:
# $ age : int 30 33 35 30 59 35 36 39 41 43 ...
# $ job : chr "unemployed" "services" "management" "management" ...
# $ marital : chr "married" "married" "single" "married" ...
# $ education: chr "primary" "secondary" "tertiary" "tertiary" ...
# $ default : chr "no" "no" "no" "no" ...
# $ balance : int 1787 4789 1350 1476 0 747 307 147 221 -88 ...
# $ housing : chr "no" "yes" "yes" "yes" ...
# $ loan : chr "no" "yes" "no" "yes" ...
# $ contact : chr "cellular" "cellular" "cellular" "unknown" ...
# $ day : int 19 11 16 3 5 23 14 6 14 17 ...
# $ month : chr "oct" "may" "apr" "jun" ...
# $ duration : int 79 220 185 199 226 141 341 151 57 313 ...
# $ campaign : int 1 1 1 4 1 2 1 2 2 1 ...
# $ pdays : int -1 339 330 -1 -1 176 330 -1 -1 147 ...
# $ previous : int 0 4 1 0 0 3 2 0 0 2 ...
# $ poutcome : chr "unknown" "failure" "failure" "unknown" ...
# $ y : chr "no" "no" "no" "no" ...
bank <- lgb.prepare(data = bank)
str(bank)

# Subtract 1 from the label because it must be 0 or 1
bank$y <- bank$y - 1

# Data input to LightGBM must be a matrix, without the label
my_data <- as.matrix(bank[, 1:16, with = FALSE])

# Creating the LightGBM dataset with categorical features
# The categorical features must be indexed like in R (1-indexed, not 0-indexed)
lgb_data <- lgb.Dataset(data = my_data,
                        label = bank$y,
                        categorical_feature = c(2, 3, 4, 5, 7, 8, 9, 11, 16))

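As a side note (not part of the committed demo), the indices above could also be looked up by column name instead of being hard-coded; a minimal sketch in base R, assuming the column order shown by str(bank) earlier:

# Hypothetical lookup of the categorical columns by name
cat_cols <- c("job", "marital", "education", "default", "housing",
              "loan", "contact", "month", "poutcome")
which(colnames(my_data) %in% cat_cols)  # 2 3 4 5 7 8 9 11 16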
# We can now train a model
model <- lgb.train(list(objective = "binary",
                        metric = "l2",
                        min_data = 1,
                        learning_rate = 0.1,
                        min_hessian = 1,
                        max_depth = 2),
                   lgb_data,
                   100,
                   valids = list(train = lgb_data))

# Try to find split_feature: 2
# If you find it, it means a categorical feature was used in the first tree
lgb.dump(model, num_iteration = 1)
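A rough programmatic version of that check (a sketch, not part of the committed demo, assuming the dump keeps the compact "split_feature":<id> JSON formatting):

json_model <- lgb.dump(model, num_iteration = 1)
cat(json_model)  # inspect the first tree by eye, or:
grepl("\"split_feature\":2", json_model, fixed = TRUE)  # TRUE if feature 2 appears as a split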
@@ -0,0 +1,91 @@
# Here we are going to try training a model with categorical features

# Load libraries
library(data.table)
library(lightgbm)

# Load data and look at the structure
#
# Classes 'data.table' and 'data.frame': 4521 obs. of 17 variables:
# $ age : int 30 33 35 30 59 35 36 39 41 43 ...
# $ job : chr "unemployed" "services" "management" "management" ...
# $ marital : chr "married" "married" "single" "married" ...
# $ education: chr "primary" "secondary" "tertiary" "tertiary" ...
# $ default : chr "no" "no" "no" "no" ...
# $ balance : int 1787 4789 1350 1476 0 747 307 147 221 -88 ...
# $ housing : chr "no" "yes" "yes" "yes" ...
# $ loan : chr "no" "yes" "no" "yes" ...
# $ contact : chr "cellular" "cellular" "cellular" "unknown" ...
# $ day : int 19 11 16 3 5 23 14 6 14 17 ...
# $ month : chr "oct" "may" "apr" "jun" ...
# $ duration : int 79 220 185 199 226 141 341 151 57 313 ...
# $ campaign : int 1 1 1 4 1 2 1 2 2 1 ...
# $ pdays : int -1 339 330 -1 -1 176 330 -1 -1 147 ...
# $ previous : int 0 4 1 0 0 3 2 0 0 2 ...
# $ poutcome : chr "unknown" "failure" "failure" "unknown" ...
# $ y : chr "no" "no" "no" "no" ...
data(bank, package = "lightgbm")
str(bank)

# Split the dataset in two: a training set and a validation set
bank_train <- bank[1:4000, ]
bank_test <- bank[4001:4521, ]

# We must now transform the data so that LightGBM can fit on it
# For this task, we use lgb.prepare_rules
# The function converts the character columns to numeric and also returns the
# encoding rules, so the same mapping can be applied to the validation set
#
# Classes 'data.table' and 'data.frame': 521 obs. of 17 variables:
# $ age : int 53 36 58 26 34 55 55 34 41 38 ...
# $ job : num 1 10 10 9 10 2 2 3 3 4 ...
# $ marital : num 1 2 1 3 3 2 2 2 1 1 ...
# $ education: num 2 2 2 2 2 1 2 3 2 2 ...
# $ default : num 1 1 1 1 1 1 1 1 1 1 ...
# $ balance : int 26 191 -123 -147 179 1086 471 105 1588 70 ...
# $ housing : num 2 1 1 1 1 2 2 2 2 1 ...
# $ loan : num 1 1 1 1 1 1 1 1 2 1 ...
# $ contact : num 1 1 1 3 1 1 3 3 3 1 ...
# $ day : int 7 31 5 4 19 6 30 28 20 27 ...
# $ month : num 9 2 2 7 2 9 9 9 7 11 ...
# $ duration : int 56 69 131 95 294 146 58 249 10 255 ...
# $ campaign : int 1 1 2 2 3 1 2 2 8 3 ...
# $ pdays : int 359 -1 -1 -1 -1 272 -1 -1 -1 148 ...
# $ previous : int 1 0 0 0 0 2 0 0 0 1 ...
# $ poutcome : num 1 4 4 4 4 1 4 4 4 3 ...
# $ y : num 1 1 1 1 1 1 1 1 1 2 ...
bank_rules <- lgb.prepare_rules(data = bank_train)
bank_train <- bank_rules$data
bank_test <- lgb.prepare_rules(data = bank_test, rules = bank_rules$rules)$data
str(bank_test)

# Subtract 1 from the label because it must be 0 or 1
bank_train$y <- bank_train$y - 1
bank_test$y <- bank_test$y - 1

# Data input to LightGBM must be a matrix, without the label
my_data_train <- as.matrix(bank_train[, 1:16, with = FALSE])
my_data_test <- as.matrix(bank_test[, 1:16, with = FALSE])

# Creating the LightGBM datasets
# The categorical features can be passed once to lgb.train instead of to each
# dataset, which avoids repeating the indices
dtrain <- lgb.Dataset(data = my_data_train,
                      label = bank_train$y)
dtest <- lgb.Dataset(data = my_data_test,
                     label = bank_test$y)

# We can now train a model
model <- lgb.train(list(objective = "binary",
                        metric = "l2",
                        min_data = 1,
                        learning_rate = 0.1,
                        min_hessian = 1,
                        max_depth = 2,
                        categorical_feature = c(2, 3, 4, 5, 7, 8, 9, 11, 16)),
                   dtrain,
                   100,
                   valids = list(train = dtrain, valid = dtest))

# Try to find split_feature: 11
# If you find it, it means a categorical feature was used in the first tree
lgb.dump(model, num_iteration = 1)
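Since bank_test was encoded with the training rules, one natural follow-up (a sketch, not part of the committed demo) is to score the validation rows and compute a simple accuracy at a 0.5 threshold:

# Predict probabilities on the raw validation matrix (not the lgb.Dataset)
preds <- predict(model, my_data_test)
# Compare the thresholded predictions against the 0/1 label
mean(as.integer(preds > 0.5) == bank_test$y)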
@@ -0,0 +1,34 @@
# Efficient training means training without giving up too much RAM
# When training many models (100+), RAM is consumed very quickly,
# so it is essential to have a strategy for dealing with this issue

# More results can be found here: https://github.com/Microsoft/LightGBM/issues/879#issuecomment-326656580
# Quote: "@Laurae2 Thanks for nice easily reproducible example (unlike mine).
# With reset=FALSE you get after 500 iterations (not 1000): OS reports 27GB usage, while R gc() reports 1.5GB.
# Just doing reset=TRUE will already improve things: OS reports 4.6GB.
# Doing reset=TRUE and calling gc() in the loop will have OS 1.3GB. Thanks for the latest tip."

# Load library
library(lightgbm)

# Generate synthetic data of size 1M x 100
set.seed(11111)
x_data <- matrix(rnorm(n = 100000000, mean = 0, sd = 100), nrow = 1000000, ncol = 100)
y_data <- rnorm(n = 1000000, mean = 0, sd = 5)

# Create lgb.Dataset for training
data <- lgb.Dataset(x_data, label = y_data)
data$construct()  # build the underlying dataset once, up front

# Train 1000 models in a loop and watch RAM usage in your task manager
# It should remain roughly constant (or increase only very slightly)
gbm <- list()

for (i in 1:1000) {
  print(i)
  gbm[[i]] <- lgb.train(params = list(objective = "regression"),
                        data = data,
                        1,
                        reset_data = TRUE)
  gc(verbose = FALSE)
}
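If the boosters themselves are not needed in memory afterwards, a further option (a sketch of an alternative, not what this demo does) is to write each model to disk with lgb.save() and keep only the files:

for (i in 1:1000) {
  model_i <- lgb.train(params = list(objective = "regression"),
                       data = data,
                       1,
                       reset_data = TRUE)
  lgb.save(model_i, filename = paste0("lgb_model_", i, ".txt"))  # persist, then drop the R handle
  rm(model_i)
  gc(verbose = FALSE)
}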