Commit 3ad9cba
[python] refined examples (#1769)
StrikerRUS authored and henry0312 committed Oct 23, 2018
1 parent 0312ecd commit 3ad9cba
Showing 5 changed files with 58 additions and 58 deletions.
examples/python-guide/README.md (6 changes: 4 additions & 2 deletions)

@@ -32,11 +32,13 @@ Examples include:
- Self-defined eval metric with sklearn interface
- Find best parameters for the model with sklearn's GridSearchCV
- [advanced_example.py](https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/advanced_example.py)
- Construct Dataset
- Set feature names
- Directly use categorical features without one-hot encoding
- Dump model to json format
- Get feature importances
- Save model to file
- Dump model to JSON format
- Get feature names
- Get feature importances
- Load model to predict
- Dump and load model with pickle
- Load model file to continue training
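The list above summarizes advanced_example.py. For orientation, here is a minimal sketch of the Dataset construction those bullets refer to, reusing the example's data paths and marking the example's categorical column (index 21) by its generated name, which lgb.Dataset accepts as well:

import lightgbm as lgb
import pandas as pd

# load the tab-separated example data; column 0 holds the label
df_train = pd.read_csv('../binary_classification/binary.train', header=None, sep='\t')
y_train = df_train[0]
X_train = df_train.drop(0, axis=1)

# set feature names and use the categorical column directly,
# without one-hot encoding
feature_name = ['feature_' + str(col) for col in range(X_train.shape[1])]
lgb_train = lgb.Dataset(X_train, y_train,
                        feature_name=feature_name,
                        categorical_feature=['feature_21'])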
examples/python-guide/advanced_example.py (46 changes: 24 additions & 22 deletions)

@@ -11,17 +11,17 @@
except BaseException:
import pickle

print('Loading data...')
# load or create your dataset
print('Load data...')
df_train = pd.read_csv('../binary_classification/binary.train', header=None, sep='\t')
df_test = pd.read_csv('../binary_classification/binary.test', header=None, sep='\t')
W_train = pd.read_csv('../binary_classification/binary.train.weight', header=None)[0]
W_test = pd.read_csv('../binary_classification/binary.test.weight', header=None)[0]

y_train = df_train[0].values
y_test = df_test[0].values
X_train = df_train.drop(0, axis=1).values
X_test = df_test.drop(0, axis=1).values
y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)

num_train, num_feature = X_train.shape

@@ -45,10 +45,10 @@
'verbose': 0
}

# generate a feature name
# generate feature names
feature_name = ['feature_' + str(col) for col in range(num_feature)]

print('Start training...')
print('Starting training...')
# feature_name and categorical_feature
gbm = lgb.train(params,
lgb_train,
@@ -57,15 +57,16 @@
feature_name=feature_name,
categorical_feature=[21])

print('Finished first 10 rounds...')
# check feature name
print('Finish first 10 rounds...')
print('7th feature name is:', repr(lgb_train.feature_name[6]))
print('7th feature name is:', lgb_train.feature_name[6])

print('Saving model...')
# save model to file
gbm.save_model('model.txt')

print('Dumping model to JSON...')
# dump model to JSON (and save to file)
print('Dump model to JSON...')
model_json = gbm.dump_model()

with open('model.json', 'w+') as f:
@@ -77,14 +78,15 @@
# feature importances
print('Feature importances:', list(gbm.feature_importance()))

print('Loading model to predict...')
# load model to predict
print('Load model to predict')
bst = lgb.Booster(model_file='model.txt')
# can only predict with the best iteration (or the saving iteration)
y_pred = bst.predict(X_test)
# eval with loaded model
print('The rmse of loaded model\'s prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)
print("The rmse of loaded model's prediction is:", mean_squared_error(y_test, y_pred) ** 0.5)

print('Dumping and loading model with pickle...')
# dump model with pickle
with open('model.pkl', 'wb') as fout:
pickle.dump(gbm, fout)
@@ -94,7 +96,7 @@
# can predict with any iteration when loaded in pickle way
y_pred = pkl_bst.predict(X_test, num_iteration=7)
# eval with loaded model
print('The rmse of pickled model\'s prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)
print("The rmse of pickled model's prediction is:", mean_squared_error(y_test, y_pred) ** 0.5)

# continue training
# init_model accepts:
@@ -106,7 +108,7 @@
init_model='model.txt',
valid_sets=lgb_eval)

print('Finish 10 - 20 rounds with model file...')
print('Finished 10 - 20 rounds with model file...')

# decay learning rates
# learning_rates accepts:
@@ -119,7 +121,7 @@
learning_rates=lambda iter: 0.05 * (0.99 ** iter),
valid_sets=lgb_eval)

print('Finish 20 - 30 rounds with decay learning rates...')
print('Finished 20 - 30 rounds with decay learning rates...')

# change other parameters during training
gbm = lgb.train(params,
@@ -129,13 +131,13 @@
valid_sets=lgb_eval,
callbacks=[lgb.reset_parameter(bagging_fraction=[0.7] * 5 + [0.6] * 5)])

print('Finish 30 - 40 rounds with changing bagging_fraction...')
print('Finished 30 - 40 rounds with changing bagging_fraction...')


# self-defined objective function
# f(preds: array, train_data: Dataset) -> grad: array, hess: array
# log likelihood loss
def loglikelood(preds, train_data):
def loglikelihood(preds, train_data):
labels = train_data.get_label()
preds = 1. / (1. + np.exp(-preds))
grad = preds - labels
@@ -155,13 +157,13 @@ def binary_error(preds, train_data):
lgb_train,
num_boost_round=10,
init_model=gbm,
fobj=loglikelood,
fobj=loglikelihood,
feval=binary_error,
valid_sets=lgb_eval)

print('Finish 40 - 50 rounds with self-defined objective function and eval metric...')
print('Finished 40 - 50 rounds with self-defined objective function and eval metric...')

print('Start a new training job...')
print('Starting a new training job...')


# callback
@@ -170,7 +172,7 @@ def callback(env):
lgb_eval_new = lgb.Dataset(X_test, y_test, reference=lgb_train)
if env.iteration - env.begin_iteration == 5:
print('Add a new valid dataset at iteration 5...')
env.model.add_valid(lgb_eval_new, 'new valid')
env.model.add_valid(lgb_eval_new, 'new_valid')
callback.before_iteration = True
callback.order = 0
return callback
@@ -182,4 +184,4 @@ def callback(env):
valid_sets=lgb_train,
callbacks=[reset_metrics()])

print('Finish first 10 rounds with callback function...')
print('Finished first 10 rounds with callback function...')
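A note on the custom objective and metric exercised above: fobj maps (preds, train_data) to per-sample (grad, hess), and feval maps (preds, train_data) to a (name, value, is_higher_better) tuple. The loglikelihood below restates the diff; binary_error's body lies outside the hunks shown, so the version here is an illustrative assumption. With a custom objective, the preds handed to feval are raw scores, hence the 0 threshold:

import numpy as np

def loglikelihood(preds, train_data):
    # gradient and hessian of the log loss w.r.t. the raw scores
    labels = train_data.get_label()
    preds = 1. / (1. + np.exp(-preds))  # sigmoid: raw scores -> probabilities
    grad = preds - labels
    hess = preds * (1. - preds)
    return grad, hess

def binary_error(preds, train_data):
    # illustrative assumption: preds are raw scores, so 0 is the
    # decision threshold (equivalent to probability 0.5)
    labels = train_data.get_label()
    return 'error', np.mean(labels != (preds > 0)), False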
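The inline comments above list what init_model and learning_rates accept (a model file name or a Booster; a list or a callable). A compact sketch combining the two, reusing the example's names:

# resume from the saved model file for 10 more rounds while decaying the
# learning rate each iteration; a list such as [0.05] * 5 + [0.025] * 5
# would work for learning_rates as well
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model='model.txt',
                learning_rates=lambda it: 0.05 * (0.99 ** it),
                valid_sets=lgb_eval)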
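The reset_metrics callback relies on two attributes LightGBM reads from the callback object: before_iteration (run before rather than after each boosting round) and order (its position among the callbacks); env is a CallbackEnv namedtuple exposing the model and iteration counters. A minimal sketch of the same pattern, with simple logging standing in for add_valid:

def log_iteration():
    def callback(env):
        # env carries env.model, env.iteration, env.begin_iteration, etc.
        print('running iteration', env.iteration - env.begin_iteration)
    callback.before_iteration = True  # run before each boosting round
    callback.order = 0                # execution order among callbacks
    return callback

gbm = lgb.train(params,
                lgb_train,
                num_boost_round=5,
                valid_sets=lgb_train,
                callbacks=[log_iteration()])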
examples/python-guide/plot_example.py (22 changes: 11 additions & 11 deletions)

@@ -8,15 +8,15 @@
else:
raise ImportError('You need to install matplotlib for plot_example.py.')

print('Loading data...')
# load or create your dataset
print('Load data...')
df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t')
df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t')

y_train = df_train[0].values
y_test = df_test[0].values
X_train = df_train.drop(0, axis=1).values
X_test = df_test.drop(0, axis=1).values
y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)

# create dataset for lightgbm
lgb_train = lgb.Dataset(X_train, y_train)
@@ -31,29 +31,29 @@

evals_result = {} # to record eval results for plotting

print('Start training...')
print('Starting training...')
# train
gbm = lgb.train(params,
lgb_train,
num_boost_round=100,
valid_sets=[lgb_train, lgb_test],
feature_name=['f' + str(i + 1) for i in range(28)],
feature_name=['f' + str(i + 1) for i in range(X_train.shape[-1])],
categorical_feature=[21],
evals_result=evals_result,
verbose_eval=10)

print('Plot metrics recorded during training...')
print('Plotting metrics recorded during training...')
ax = lgb.plot_metric(evals_result, metric='l1')
plt.show()

print('Plot feature importances...')
print('Plotting feature importances...')
ax = lgb.plot_importance(gbm, max_num_features=10)
plt.show()

print('Plot 84th tree...')  # this tree uses a categorical feature to split
print('Plotting 84th tree...')  # this tree uses a categorical feature to split
ax = lgb.plot_tree(gbm, tree_index=83, figsize=(20, 8), show_info=['split_gain'])
plt.show()

print('Plot 84th tree with graphviz...')
print('Plotting 84th tree with graphviz...')
graph = lgb.create_tree_digraph(gbm, tree_index=83, name='Tree84')
graph.render(view=True)
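For readers unfamiliar with evals_result: lgb.train fills it as a nested dict keyed first by dataset name, then by metric name, with one value per boosting round, and plot_metric reads that structure directly. A short sketch, assuming the default dataset names here are 'training' (the train set) and 'valid_1' (the second entry of valid_sets):

# {dataset_name: {metric_name: [value per round]}}
for name, metrics in evals_result.items():
    print(name, '->', sorted(metrics.keys()))

ax = lgb.plot_metric(evals_result, metric='l1')
plt.show()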
examples/python-guide/simple_example.py (20 changes: 9 additions & 11 deletions)

@@ -4,27 +4,25 @@
import pandas as pd
from sklearn.metrics import mean_squared_error


print('Loading data...')
# load or create your dataset
print('Load data...')
df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t')
df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t')

y_train = df_train[0].values
y_test = df_test[0].values
X_train = df_train.drop(0, axis=1).values
X_test = df_test.drop(0, axis=1).values
y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)

# create dataset for lightgbm
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

# specify your configurations as a dict
params = {
'task': 'train',
'boosting_type': 'gbdt',
'objective': 'regression',
'metric': {'l2', 'auc'},
'metric': {'l2', 'l1'},
'num_leaves': 31,
'learning_rate': 0.05,
'feature_fraction': 0.9,
@@ -33,19 +31,19 @@
'verbose': 0
}

print('Start training...')
print('Starting training...')
# train
gbm = lgb.train(params,
lgb_train,
num_boost_round=20,
valid_sets=lgb_eval,
early_stopping_rounds=5)

print('Save model...')
print('Saving model...')
# save model to file
gbm.save_model('model.txt')

print('Start predicting...')
print('Starting predicting...')
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# eval
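A side note on the early stopping used in this example: when the validation metric fails to improve for early_stopping_rounds consecutive rounds, training halts and the best round is recorded on the booster; passing it to predict restricts scoring to the trees built up to that round. A minimal sketch with the example's names:

print('Best iteration:', gbm.best_iteration)

# score with only the trees up to the best round
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)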
examples/python-guide/sklearn_example.py (22 changes: 10 additions & 12 deletions)

@@ -7,28 +7,27 @@
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

print('Loading data...')
# load or create your dataset
print('Load data...')
df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t')
df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t')

y_train = df_train[0].values
y_test = df_test[0].values
X_train = df_train.drop(0, axis=1).values
X_test = df_test.drop(0, axis=1).values
y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)

print('Start training...')
print('Starting training...')
# train
gbm = lgb.LGBMRegressor(objective='regression',
num_leaves=31,
gbm = lgb.LGBMRegressor(num_leaves=31,
learning_rate=0.05,
n_estimators=20)
gbm.fit(X_train, y_train,
eval_set=[(X_test, y_test)],
eval_metric='l1',
early_stopping_rounds=5)

print('Start predicting...')
print('Starting predicting...')
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
# eval
@@ -45,14 +44,14 @@ def rmsle(y_true, y_pred):
return 'RMSLE', np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y_true), 2))), False


print('Start training with custom eval function...')
print('Starting training with custom eval function...')
# train
gbm.fit(X_train, y_train,
eval_set=[(X_test, y_test)],
eval_metric=rmsle,
early_stopping_rounds=5)

print('Start predicting...')
print('Starting predicting...')
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
# eval
@@ -67,7 +66,6 @@ def rmsle(y_true, y_pred):
}

gbm = GridSearchCV(estimator, param_grid, cv=3)

gbm.fit(X_train, y_train)

print('Best parameters found by grid search are:', gbm.best_params_)
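Two details worth spelling out for the sklearn interface: a custom eval_metric takes (y_true, y_pred), unlike the native API's feval which receives (preds, train_data), and GridSearchCV treats LGBMRegressor like any other estimator. A sketch; the example's actual param_grid falls outside the hunks shown, so the grid below is an assumption:

import numpy as np

# sklearn-style metric: (y_true, y_pred) -> (name, value, is_higher_better)
def rmsle(y_true, y_pred):
    return 'RMSLE', np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y_true), 2))), False

# hypothetical grid for illustration
param_grid = {
    'learning_rate': [0.01, 0.1, 1],
    'n_estimators': [20, 40]
}

gbm = GridSearchCV(lgb.LGBMRegressor(num_leaves=31), param_grid, cv=3)
gbm.fit(X_train, y_train)
print('Best parameters found by grid search are:', gbm.best_params_)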
