Commit 3ad9cba
[python] refined examples (#1769)
StrikerRUS authored and henry0312 committed Oct 23, 2018
1 parent 0312ecd commit 3ad9cba
Showing 5 changed files with 58 additions and 58 deletions.
examples/python-guide/README.md (6 changes: 4 additions & 2 deletions)

@@ -32,11 +32,13 @@ Examples include:
- Self-defined eval metric with sklearn interface
- Find best parameters for the model with sklearn's GridSearchCV
- [advanced_example.py](https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/advanced_example.py)
- Construct Dataset
- Set feature names
- Directly use categorical features without one-hot encoding
- Dump model to json format
- Get feature importances
- Save model to file
- Dump model to JSON format
- Get feature names
- Get feature importances
- Load model to predict
- Dump and load model with pickle
- Load model file to continue training
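The list above summarizes advanced_example.py. For orientation, here is a minimal sketch of the Dataset construction those bullets refer to, reusing the example's data paths and marking the example's categorical column (index 21) by its generated name, which lgb.Dataset accepts as well:

import lightgbm as lgb
import pandas as pd

# load the tab-separated example data; column 0 holds the label
df_train = pd.read_csv('../binary_classification/binary.train', header=None, sep='\t')
y_train = df_train[0]
X_train = df_train.drop(0, axis=1)

# set feature names and use the categorical column directly,
# without one-hot encoding
feature_name = ['feature_' + str(col) for col in range(X_train.shape[1])]
lgb_train = lgb.Dataset(X_train, y_train,
                        feature_name=feature_name,
                        categorical_feature=['feature_21'])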
examples/python-guide/advanced_example.py (46 changes: 24 additions & 22 deletions)

@@ -11,17 +11,17 @@
except BaseException:
import pickle

print('Loading data...')
# load or create your dataset
print('Load data...')
df_train = pd.read_csv('../binary_classification/binary.train', header=None, sep='\t')
df_test = pd.read_csv('../binary_classification/binary.test', header=None, sep='\t')
W_train = pd.read_csv('../binary_classification/binary.train.weight', header=None)[0]
W_test = pd.read_csv('../binary_classification/binary.test.weight', header=None)[0]

y_train = df_train[0].values
y_test = df_test[0].values
X_train = df_train.drop(0, axis=1).values
X_test = df_test.drop(0, axis=1).values
y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)

num_train, num_feature = X_train.shape

@@ -45,10 +45,10 @@
'verbose': 0
}

# generate a feature name
# generate feature names
feature_name = ['feature_' + str(col) for col in range(num_feature)]

print('Start training...')
print('Starting training...')
# feature_name and categorical_feature
gbm = lgb.train(params,
lgb_train,
@@ -57,15 +57,16 @@
feature_name=feature_name,
categorical_feature=[21])

print('Finished first 10 rounds...')
# check feature name
print('Finish first 10 rounds...')
print('7th feature name is:', repr(lgb_train.feature_name[6]))
print('7th feature name is:', lgb_train.feature_name[6])

print('Saving model...')
# save model to file
gbm.save_model('model.txt')

print('Dumping model to JSON...')
# dump model to JSON (and save to file)
print('Dump model to JSON...')
model_json = gbm.dump_model()

with open('model.json', 'w+') as f:
@@ -77,14 +78,15 @@
# feature importances
print('Feature importances:', list(gbm.feature_importance()))

print('Loading model to predict...')
# load model to predict
print('Load model to predict')
bst = lgb.Booster(model_file='model.txt')
# can only predict with the best iteration (or the saving iteration)
y_pred = bst.predict(X_test)
# eval with loaded model
print('The rmse of loaded model\'s prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)
print("The rmse of loaded model's prediction is:", mean_squared_error(y_test, y_pred) ** 0.5)

print('Dumping and loading model with pickle...')
# dump model with pickle
with open('model.pkl', 'wb') as fout:
pickle.dump(gbm, fout)
@@ -94,7 +96,7 @@
# can predict with any iteration when loaded in pickle way
y_pred = pkl_bst.predict(X_test, num_iteration=7)
# eval with loaded model
print('The rmse of pickled model\'s prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)
print("The rmse of pickled model's prediction is:", mean_squared_error(y_test, y_pred) ** 0.5)

# continue training
# init_model accepts:
@@ -106,7 +108,7 @@
init_model='model.txt',
valid_sets=lgb_eval)

print('Finish 10 - 20 rounds with model file...')
print('Finished 10 - 20 rounds with model file...')

# decay learning rates
# learning_rates accepts:
@@ -119,7 +121,7 @@
learning_rates=lambda iter: 0.05 * (0.99 ** iter),
valid_sets=lgb_eval)

print('Finish 20 - 30 rounds with decay learning rates...')
print('Finished 20 - 30 rounds with decay learning rates...')

# change other parameters during training
gbm = lgb.train(params,
@@ -129,13 +131,13 @@
valid_sets=lgb_eval,
callbacks=[lgb.reset_parameter(bagging_fraction=[0.7] * 5 + [0.6] * 5)])

print('Finish 30 - 40 rounds with changing bagging_fraction...')
print('Finished 30 - 40 rounds with changing bagging_fraction...')


# self-defined objective function
# f(preds: array, train_data: Dataset) -> grad: array, hess: array
# log likelihood loss
def loglikelood(preds, train_data):
def loglikelihood(preds, train_data):
labels = train_data.get_label()
preds = 1. / (1. + np.exp(-preds))
grad = preds - labels
@@ -155,13 +157,13 @@ def binary_error(preds, train_data):
lgb_train,
num_boost_round=10,
init_model=gbm,
fobj=loglikelood,
fobj=loglikelihood,
feval=binary_error,
valid_sets=lgb_eval)

print('Finish 40 - 50 rounds with self-defined objective function and eval metric...')
print('Finished 40 - 50 rounds with self-defined objective function and eval metric...')

print('Start a new training job...')
print('Starting a new training job...')


# callback
@@ -170,7 +172,7 @@ def callback(env):
lgb_eval_new = lgb.Dataset(X_test, y_test, reference=lgb_train)
if env.iteration - env.begin_iteration == 5:
print('Add a new valid dataset at iteration 5...')
env.model.add_valid(lgb_eval_new, 'new valid')
env.model.add_valid(lgb_eval_new, 'new_valid')
callback.before_iteration = True
callback.order = 0
return callback
@@ -182,4 +184,4 @@ def callback(env):
valid_sets=lgb_train,
callbacks=[reset_metrics()])

print('Finish first 10 rounds with callback function...')
print('Finished first 10 rounds with callback function...')
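A note on the custom objective and metric exercised above: fobj maps (preds, train_data) to per-sample (grad, hess), and feval maps (preds, train_data) to a (name, value, is_higher_better) tuple. The loglikelihood below restates the diff; binary_error's body lies outside the hunks shown, so the version here is an illustrative assumption. With a custom objective, the preds handed to feval are raw scores, hence the 0 threshold:

import numpy as np

def loglikelihood(preds, train_data):
    # gradient and hessian of the log loss w.r.t. the raw scores
    labels = train_data.get_label()
    preds = 1. / (1. + np.exp(-preds))  # sigmoid: raw scores -> probabilities
    grad = preds - labels
    hess = preds * (1. - preds)
    return grad, hess

def binary_error(preds, train_data):
    # illustrative assumption: preds are raw scores, so 0 is the
    # decision threshold (equivalent to probability 0.5)
    labels = train_data.get_label()
    return 'error', np.mean(labels != (preds > 0)), False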
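The inline comments above list what init_model and learning_rates accept (a model file name or a Booster; a list or a callable). A compact sketch combining the two, reusing the example's names:

# resume from the saved model file for 10 more rounds while decaying the
# learning rate each iteration; a list such as [0.05] * 5 + [0.025] * 5
# would work for learning_rates as well
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model='model.txt',
                learning_rates=lambda it: 0.05 * (0.99 ** it),
                valid_sets=lgb_eval)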
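The reset_metrics callback relies on two attributes LightGBM reads from the callback object: before_iteration (run before rather than after each boosting round) and order (its position among the callbacks); env is a CallbackEnv namedtuple exposing the model and iteration counters. A minimal sketch of the same pattern, with simple logging standing in for add_valid:

def log_iteration():
    def callback(env):
        # env carries env.model, env.iteration, env.begin_iteration, etc.
        print('running iteration', env.iteration - env.begin_iteration)
    callback.before_iteration = True  # run before each boosting round
    callback.order = 0                # execution order among callbacks
    return callback

gbm = lgb.train(params,
                lgb_train,
                num_boost_round=5,
                valid_sets=lgb_train,
                callbacks=[log_iteration()])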
examples/python-guide/plot_example.py (22 changes: 11 additions & 11 deletions)

@@ -8,15 +8,15 @@
else:
raise ImportError('You need to install matplotlib for plot_example.py.')

print('Loading data...')
# load or create your dataset
print('Load data...')
df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t')
df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t')

y_train = df_train[0].values
y_test = df_test[0].values
X_train = df_train.drop(0, axis=1).values
X_test = df_test.drop(0, axis=1).values
y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)

# create dataset for lightgbm
lgb_train = lgb.Dataset(X_train, y_train)
@@ -31,29 +31,29 @@

evals_result = {} # to record eval results for plotting

print('Start training...')
print('Starting training...')
# train
gbm = lgb.train(params,
lgb_train,
num_boost_round=100,
valid_sets=[lgb_train, lgb_test],
feature_name=['f' + str(i + 1) for i in range(28)],
feature_name=['f' + str(i + 1) for i in range(X_train.shape[-1])],
categorical_feature=[21],
evals_result=evals_result,
verbose_eval=10)

print('Plot metrics recorded during training...')
print('Plotting metrics recorded during training...')
ax = lgb.plot_metric(evals_result, metric='l1')
plt.show()

print('Plot feature importances...')
print('Plotting feature importances...')
ax = lgb.plot_importance(gbm, max_num_features=10)
plt.show()

print('Plot 84th tree...')  # this tree uses a categorical feature to split
print('Plotting 84th tree...')  # this tree uses a categorical feature to split
ax = lgb.plot_tree(gbm, tree_index=83, figsize=(20, 8), show_info=['split_gain'])
plt.show()

print('Plot 84th tree with graphviz...')
print('Plotting 84th tree with graphviz...')
graph = lgb.create_tree_digraph(gbm, tree_index=83, name='Tree84')
graph.render(view=True)
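For readers unfamiliar with evals_result: lgb.train fills it as a nested dict keyed first by dataset name, then by metric name, with one value per boosting round, and plot_metric reads that structure directly. A short sketch, assuming the default dataset names here are 'training' (the train set) and 'valid_1' (the second entry of valid_sets):

# {dataset_name: {metric_name: [value per round]}}
for name, metrics in evals_result.items():
    print(name, '->', sorted(metrics.keys()))

ax = lgb.plot_metric(evals_result, metric='l1')
plt.show()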
examples/python-guide/simple_example.py (20 changes: 9 additions & 11 deletions)

@@ -4,27 +4,25 @@
import pandas as pd
from sklearn.metrics import mean_squared_error


print('Loading data...')
# load or create your dataset
print('Load data...')
df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t')
df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t')

y_train = df_train[0].values
y_test = df_test[0].values
X_train = df_train.drop(0, axis=1).values
X_test = df_test.drop(0, axis=1).values
y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)

# create dataset for lightgbm
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

# specify your configurations as a dict
params = {
'task': 'train',
'boosting_type': 'gbdt',
'objective': 'regression',
'metric': {'l2', 'auc'},
'metric': {'l2', 'l1'},
'num_leaves': 31,
'learning_rate': 0.05,
'feature_fraction': 0.9,
@@ -33,19 +31,19 @@
'verbose': 0
}

print('Start training...')
print('Starting training...')
# train
gbm = lgb.train(params,
lgb_train,
num_boost_round=20,
valid_sets=lgb_eval,
early_stopping_rounds=5)

print('Save model...')
print('Saving model...')
# save model to file
gbm.save_model('model.txt')

print('Start predicting...')
print('Starting predicting...')
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# eval
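A side note on the early stopping used in this example: when the validation metric fails to improve for early_stopping_rounds consecutive rounds, training halts and the best round is recorded on the booster; passing it to predict restricts scoring to the trees built up to that round. A minimal sketch with the example's names:

print('Best iteration:', gbm.best_iteration)

# score with only the trees up to the best round
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)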
examples/python-guide/sklearn_example.py (22 changes: 10 additions & 12 deletions)

@@ -7,28 +7,27 @@
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

print('Loading data...')
# load or create your dataset
print('Load data...')
df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t')
df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t')

y_train = df_train[0].values
y_test = df_test[0].values
X_train = df_train.drop(0, axis=1).values
X_test = df_test.drop(0, axis=1).values
y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)

print('Start training...')
print('Starting training...')
# train
gbm = lgb.LGBMRegressor(objective='regression',
num_leaves=31,
gbm = lgb.LGBMRegressor(num_leaves=31,
learning_rate=0.05,
n_estimators=20)
gbm.fit(X_train, y_train,
eval_set=[(X_test, y_test)],
eval_metric='l1',
early_stopping_rounds=5)

print('Start predicting...')
print('Starting predicting...')
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
# eval
@@ -45,14 +44,14 @@ def rmsle(y_true, y_pred):
return 'RMSLE', np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y_true), 2))), False


print('Start training with custom eval function...')
print('Starting training with custom eval function...')
# train
gbm.fit(X_train, y_train,
eval_set=[(X_test, y_test)],
eval_metric=rmsle,
early_stopping_rounds=5)

print('Start predicting...')
print('Starting predicting...')
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
# eval
@@ -67,7 +66,6 @@ def rmsle(y_true, y_pred):
}

gbm = GridSearchCV(estimator, param_grid, cv=3)

gbm.fit(X_train, y_train)

print('Best parameters found by grid search are:', gbm.best_params_)
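Two details worth spelling out for the sklearn interface: a custom eval_metric takes (y_true, y_pred), unlike the native API's feval which receives (preds, train_data), and GridSearchCV treats LGBMRegressor like any other estimator. A sketch; the example's actual param_grid falls outside the hunks shown, so the grid below is an assumption:

import numpy as np

# sklearn-style metric: (y_true, y_pred) -> (name, value, is_higher_better)
def rmsle(y_true, y_pred):
    return 'RMSLE', np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y_true), 2))), False

# hypothetical grid for illustration
param_grid = {
    'learning_rate': [0.01, 0.1, 1],
    'n_estimators': [20, 40]
}

gbm = GridSearchCV(lgb.LGBMRegressor(num_leaves=31), param_grid, cv=3)
gbm.fit(X_train, y_train)
print('Best parameters found by grid search are:', gbm.best_params_)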
