diff --git a/R-package/R/lgb.cv.R b/R-package/R/lgb.cv.R
index 972e4206afc..973fc5e3954 100644
--- a/R-package/R/lgb.cv.R
+++ b/R-package/R/lgb.cv.R
@@ -140,7 +140,7 @@ lgb.cv <- function(params = list(),
     begin_iteration <- predictor$current_iter() + 1
   }
   # Check for number of rounds passed as parameter - in case there are multiple ones, take only the first one
-  n_trees <- c("num_iterations", "num_iteration", "num_tree", "num_trees", "num_round", "num_rounds")
+  n_trees <- c("num_iterations", "num_iteration", "n_iter", "num_tree", "num_trees", "num_round", "num_rounds", "num_boost_round", "n_estimators")
   if (any(names(params) %in% n_trees)) {
     end_iteration <- begin_iteration + params[[which(names(params) %in% n_trees)[1]]] - 1
   } else {
diff --git a/R-package/R/lgb.train.R b/R-package/R/lgb.train.R
index 25cd31ab191..ec1074ba360 100644
--- a/R-package/R/lgb.train.R
+++ b/R-package/R/lgb.train.R
@@ -117,7 +117,7 @@ lgb.train <- function(params = list(),
     begin_iteration <- predictor$current_iter() + 1
   }
   # Check for number of rounds passed as parameter - in case there are multiple ones, take only the first one
-  n_rounds <- c("num_iterations", "num_iteration", "num_tree", "num_trees", "num_round", "num_rounds")
+  n_rounds <- c("num_iterations", "num_iteration", "n_iter", "num_tree", "num_trees", "num_round", "num_rounds", "num_boost_round", "n_estimators")
   if (any(names(params) %in% n_rounds)) {
     end_iteration <- begin_iteration + params[[which(names(params) %in% n_rounds)[1]]] - 1
   } else {
diff --git a/docs/Parameters.rst b/docs/Parameters.rst
index 18b3b645e31..c2703aa6770 100644
--- a/docs/Parameters.rst
+++ b/docs/Parameters.rst
@@ -73,7 +73,7 @@ Core Parameters
 
     - ``tweedie``, Tweedie regression with log-link. It might be useful, e.g., for modeling total loss in insurance, or for any target that might be `tweedie-distributed <https://en.wikipedia.org/wiki/Tweedie_distribution>`__
 
-    - ``binary``, binary `log loss <https://en.wikipedia.org/wiki/Cross_entropy>`__ classification (or logistic regression). Requires labels in {0, 1}; see ``xentropy`` for general probability labels in [0, 1]
+    - ``binary``, binary `log loss <https://en.wikipedia.org/wiki/Cross_entropy>`__ classification (or logistic regression). Requires labels in {0, 1}; see ``cross-entropy`` application for general probability labels in [0, 1]
 
   - multi-class classification application
 
@@ -109,13 +109,13 @@ Core Parameters
 
     - ``goss``, Gradient-based One-Side Sampling
 
-- ``data`` :raw-html:`🔗︎`, default = ``""``, type = string, aliases: ``train``, ``train_data``, ``data_filename``
+- ``data`` :raw-html:`🔗︎`, default = ``""``, type = string, aliases: ``train``, ``train_data``, ``train_data_file``, ``data_filename``
 
   - path of training data, LightGBM will train from this data
 
   - **Note**: can be used only in CLI version
 
-- ``valid`` :raw-html:`🔗︎`, default = ``""``, type = string, aliases: ``test``, ``valid_data``, ``valid_data_file``, ``test_data``, ``valid_filenames``
+- ``valid`` :raw-html:`🔗︎`, default = ``""``, type = string, aliases: ``test``, ``valid_data``, ``valid_data_file``, ``test_data``, ``test_data_file``, ``valid_filenames``
 
   - path(s) of validation/test data, LightGBM will output metrics for these data
 
@@ -123,7 +123,7 @@ Core Parameters
 
   - **Note**: can be used only in CLI version
 
-- ``num_iterations`` :raw-html:`🔗︎`, default = ``100``, type = int, aliases: ``num_iteration``, ``num_tree``, ``num_trees``, ``num_round``, ``num_rounds``, ``num_boost_round``, ``n_estimators``, constraints: ``num_iterations >= 0``
+- ``num_iterations`` :raw-html:`🔗︎`, default = ``100``, type = int, aliases: ``num_iteration``, ``n_iter``, ``num_tree``, ``num_trees``, ``num_round``, ``num_rounds``, ``num_boost_round``, ``n_estimators``, constraints: ``num_iterations >= 0``
 
   - number of boosting iterations
 
@@ -131,17 +131,17 @@ Core Parameters
 
   - **Note**: internally, LightGBM constructs ``num_class * num_iterations`` trees for multi-class classification problems
 
-- ``learning_rate`` :raw-html:`🔗︎`, default = ``0.1``, type = double, aliases: ``shrinkage_rate``, constraints: ``learning_rate > 0.0``
+- ``learning_rate`` :raw-html:`🔗︎`, default = ``0.1``, type = double, aliases: ``shrinkage_rate``, ``eta``, constraints: ``learning_rate > 0.0``
 
   - shrinkage rate
 
   - in ``dart``, it also affects on normalization weights of dropped trees
 
-- ``num_leaves`` :raw-html:`🔗︎`, default = ``31``, type = int, aliases: ``num_leaf``, constraints: ``num_leaves > 1``
+- ``num_leaves`` :raw-html:`🔗︎`, default = ``31``, type = int, aliases: ``num_leaf``, ``max_leaves``, ``max_leaf``, constraints: ``num_leaves > 1``
 
   - max number of leaves in one tree
 
-- ``tree_learner`` :raw-html:`🔗︎`, default = ``serial``, type = enum, options: ``serial``, ``feature``, ``data``, ``voting``, aliases: ``tree``, ``tree_learner_type``
+- ``tree_learner`` :raw-html:`🔗︎`, default = ``serial``, type = enum, options: ``serial``, ``feature``, ``data``, ``voting``, aliases: ``tree``, ``tree_type``, ``tree_learner_type``
 
   - ``serial``, single machine tree learner
 
@@ -153,7 +153,7 @@ Core Parameters
 
   - refer to `Parallel Learning Guide <./Parallel-Learning-Guide.rst>`__ to get more details
 
-- ``num_threads`` :raw-html:`🔗︎`, default = ``0``, type = int, aliases: ``num_thread``, ``nthread``, ``nthreads``
+- ``num_threads`` :raw-html:`🔗︎`, default = ``0``, type = int, aliases: ``num_thread``, ``nthread``, ``nthreads``, ``n_jobs``
 
   - number of threads for LightGBM
 
@@ -177,7 +177,7 @@ Core Parameters
 
   - **Note**: refer to `Installation Guide <./Installation-Guide.rst#build-gpu-version>`__ to build LightGBM with GPU support
 
-- ``seed`` :raw-html:`🔗︎`, default = ``0``, type = int, aliases: ``random_seed``
+- ``seed`` :raw-html:`🔗︎`, default = ``0``, type = int, aliases: ``random_seed``, ``random_state``
 
   - this seed is used to generate other seeds, e.g. ``data_random_seed``, ``feature_fraction_seed``
 
@@ -252,7 +252,7 @@ Learning Control Parameters
 
   - L1 regularization
 
-- ``lambda_l2`` :raw-html:`🔗︎`, default = ``0.0``, type = double, aliases: ``reg_lambda``, constraints: ``lambda_l2 >= 0.0``
+- ``lambda_l2`` :raw-html:`🔗︎`, default = ``0.0``, type = double, aliases: ``reg_lambda``, ``lambda``, constraints: ``lambda_l2 >= 0.0``
 
   - L2 regularization
 
@@ -260,17 +260,17 @@ Learning Control Parameters
 
   - the minimal gain to perform split
 
-- ``drop_rate`` :raw-html:`🔗︎`, default = ``0.1``, type = double, constraints: ``0.0 <= drop_rate <= 1.0``
+- ``drop_rate`` :raw-html:`🔗︎`, default = ``0.1``, type = double, aliases: ``rate_drop``, constraints: ``0.0 <= drop_rate <= 1.0``
 
   - used only in ``dart``
 
-  - dropout rate
+  - dropout rate: a fraction of previous trees to drop during the dropout
 
- ``max_drop`` :raw-html:`🔗︎`, default = ``50``, type = int
 
   - used only in ``dart``
 
-  - max number of dropped trees on one iteration
+  - max number of dropped trees during one boosting iteration
 
   - ``<=0`` means no limit
 
@@ -278,7 +278,7 @@ Learning Control Parameters
 
   - used only in ``dart``
 
-  - probability of skipping drop
+  - probability of skipping the dropout procedure during a boosting iteration
 
 - ``xgboost_dart_mode`` :raw-html:`🔗︎`, default = ``false``, type = bool
 
@@ -350,7 +350,7 @@ Learning Control Parameters
 
   - you need to specify all features in order. For example, ``mc=-1,0,1`` means decreasing for 1st feature, non-constraint for 2nd feature and increasing for the 3rd feature
 
-- ``feature_contri`` :raw-html:`🔗︎`, default = ``None``, type = multi-double, aliases: ``fc``, ``fp``, ``feature_penalty``
+- ``feature_contri`` :raw-html:`🔗︎`, default = ``None``, type = multi-double, aliases: ``feature_contrib``, ``fc``, ``fp``, ``feature_penalty``
 
   - used to control feature's split gain, will use ``gain[i] = max(0, feature_contri[i]) * gain[i]`` to replace the split gain of i-th feature
 
@@ -397,13 +397,13 @@ IO Parameters
 
   - set this to larger value if data is very sparse
 
-- ``histogram_pool_size`` :raw-html:`🔗︎`, default = ``-1.0``, type = double
+- ``histogram_pool_size`` :raw-html:`🔗︎`, default = ``-1.0``, type = double, aliases: ``hist_pool_size``
 
   - max cache size in MB for historical histogram
 
   - ``< 0`` means no limit
 
-- ``data_random_seed`` :raw-html:`🔗︎`, default = ``1``, type = int
+- ``data_random_seed`` :raw-html:`🔗︎`, default = ``1``, type = int, aliases: ``data_seed``
 
   - random seed for data partition in parallel learning (excluding the ``feature_parallel`` mode)
 
@@ -413,7 +413,7 @@ IO Parameters
 
   - **Note**: can be used only in CLI version
 
-- ``snapshot_freq`` :raw-html:`🔗︎`, default = ``-1``, type = int
+- ``snapshot_freq`` :raw-html:`🔗︎`, default = ``-1``, type = int, aliases: ``save_period``
 
   - frequency of saving model file snapshot
 
@@ -431,7 +431,7 @@ IO Parameters
 
   - **Note**: can be used only in CLI version
 
-- ``output_result`` :raw-html:`🔗︎`, default = ``LightGBM_predict_result.txt``, type = string, aliases: ``predict_result``, ``prediction_result``
+- ``output_result`` :raw-html:`🔗︎`, default = ``LightGBM_predict_result.txt``, type = string, aliases: ``predict_result``, ``prediction_result``, ``predict_name``, ``prediction_name``, ``pred_name``, ``name_pred``
 
   - filename of prediction result in ``prediction`` task
 
@@ -644,7 +644,7 @@ Objective Parameters
 
   - used only in ``multi-class`` classification application
 
-- ``is_unbalance`` :raw-html:`🔗︎`, default = ``false``, type = bool, aliases: ``unbalanced_sets``
+- ``is_unbalance`` :raw-html:`🔗︎`, default = ``false``, type = bool, aliases: ``unbalance``, ``unbalanced_sets``
 
   - used only in ``binary`` application
 
@@ -729,7 +729,7 @@ Metric Parameters
 
   - metric(s) to be evaluated on the evaluation sets **in addition** to what is provided in the training arguments
 
-    - ``""`` (empty string or not specific) means that metric corresponding to specified ``objective`` will be used (this is possible only for pre-defined objective functions, otherwise no evaluation metric will be added)
+    - ``""`` (empty string or not specified) means that metric corresponding to specified ``objective`` will be used (this is possible only for pre-defined objective functions, otherwise no evaluation metric will be added)
 
     - ``"None"`` (string, **not** a ``None`` value) means that no metric will be registered, aliases: ``na``
 
diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h
index 5c49d2063a7..20e48218a37 100644
--- a/include/LightGBM/config.h
+++ b/include/LightGBM/config.h
@@ -110,7 +110,7 @@ struct Config {
   // descl2 = ``mape``, `MAPE loss <https://en.wikipedia.org/wiki/Mean_absolute_percentage_error>`__, aliases: ``mean_absolute_percentage_error``
   // descl2 = ``gamma``, Gamma regression with log-link. It might be useful, e.g., for modeling insurance claims severity, or for any target that might be `gamma-distributed <https://en.wikipedia.org/wiki/Gamma_distribution>`__
   // descl2 = ``tweedie``, Tweedie regression with log-link. It might be useful, e.g., for modeling total loss in insurance, or for any target that might be `tweedie-distributed <https://en.wikipedia.org/wiki/Tweedie_distribution>`__
-  // desc = ``binary``, binary `log loss <https://en.wikipedia.org/wiki/Cross_entropy>`__ classification (or logistic regression). Requires labels in {0, 1}; see ``xentropy`` for general probability labels in [0, 1]
+  // desc = ``binary``, binary `log loss <https://en.wikipedia.org/wiki/Cross_entropy>`__ classification (or logistic regression). Requires labels in {0, 1}; see ``cross-entropy`` application for general probability labels in [0, 1]
   // desc = multi-class classification application
   // descl2 = ``multiclass``, `softmax <https://en.wikipedia.org/wiki/Softmax_function>`__ objective function, aliases: ``softmax``
   // descl2 = ``multiclassova``, `One-vs-All <https://en.wikipedia.org/wiki/Multiclass_classification#One-vs.-rest>`__ binary objective function, aliases: ``multiclass_ova``, ``ova``, ``ovr``
@@ -135,33 +135,33 @@ struct Config {
   // desc = ``goss``, Gradient-based One-Side Sampling
   std::string boosting = "gbdt";
 
-  // alias = train, train_data, data_filename
+  // alias = train, train_data, train_data_file, data_filename
   // desc = path of training data, LightGBM will train from this data
   // desc = **Note**: can be used only in CLI version
   std::string data = "";
 
-  // alias = test, valid_data, valid_data_file, test_data, valid_filenames
+  // alias = test, valid_data, valid_data_file, test_data, test_data_file, valid_filenames
   // default = ""
   // desc = path(s) of validation/test data, LightGBM will output metrics for these data
   // desc = support multiple validation data, separated by ``,``
   // desc = **Note**: can be used only in CLI version
   std::vector<std::string> valid;
 
-  // alias = num_iteration, num_tree, num_trees, num_round, num_rounds, num_boost_round, n_estimators
+  // alias = num_iteration, n_iter, num_tree, num_trees, num_round, num_rounds, num_boost_round, n_estimators
   // check = >=0
   // desc = number of boosting iterations
   // desc = **Note**: for Python/R-package, **this parameter is ignored**, use ``num_boost_round`` (Python) or ``nrounds`` (R) input arguments of ``train`` and ``cv`` methods instead
   // desc = **Note**: internally, LightGBM constructs ``num_class * num_iterations`` trees for multi-class classification problems
   int num_iterations = 100;
 
-  // alias = shrinkage_rate
+  // alias = shrinkage_rate, eta
   // check = >0.0
   // desc = shrinkage rate
   // desc = in ``dart``, it also affects on normalization weights of dropped trees
   double learning_rate = 0.1;
 
   // default = 31
-  // alias = num_leaf
+  // alias = num_leaf, max_leaves, max_leaf
   // check = >1
   // desc = max number of leaves in one tree
   int num_leaves = kDefaultNumLeaves;
 
@@ -169,7 +169,7 @@ struct Config {
   // [doc-only]
   // type = enum
   // options = serial, feature, data, voting
-  // alias = tree, tree_learner_type
+  // alias = tree, tree_type, tree_learner_type
   // desc = ``serial``, single machine tree learner
   // desc = ``feature``, feature parallel tree learner, aliases: ``feature_parallel``
   // desc = ``data``, data parallel tree learner, aliases: ``data_parallel``
@@ -177,7 +177,7 @@ struct Config {
   // desc = refer to `Parallel Learning Guide <./Parallel-Learning-Guide.rst>`__ to get more details
   std::string tree_learner = "serial";
 
-  // alias = num_thread, nthread, nthreads
+  // alias = num_thread, nthread, nthreads, n_jobs
   // desc = number of threads for LightGBM
   // desc = ``0`` means default number of threads in OpenMP
   // desc = for the best speed, set this to the number of **real CPU cores**, not the number of threads (most CPUs use `hyper-threading <https://en.wikipedia.org/wiki/Hyper-threading>`__ to generate 2 threads per CPU core)
@@ -197,7 +197,7 @@ struct Config {
   std::string device_type = "cpu";
 
   // [doc-only]
-  // alias = random_seed
+  // alias = random_seed, random_state
   // desc = this seed is used to generate other seeds, e.g. ``data_random_seed``, ``feature_fraction_seed``
   // desc = will be overridden, if you set other seeds
   int seed = 0;
@@ -266,7 +266,7 @@ struct Config {
   // desc = L1 regularization
   double lambda_l1 = 0.0;
 
-  // alias = reg_lambda
+  // alias = reg_lambda, lambda
   // check = >=0.0
   // desc = L2 regularization
   double lambda_l2 = 0.0;
@@ -276,21 +276,22 @@ struct Config {
   // desc = the minimal gain to perform split
   double min_gain_to_split = 0.0;
 
+  // alias = rate_drop
   // check = >=0.0
   // check = <=1.0
   // desc = used only in ``dart``
-  // desc = dropout rate
+  // desc = dropout rate: a fraction of previous trees to drop during the dropout
   double drop_rate = 0.1;
 
   // desc = used only in ``dart``
-  // desc = max number of dropped trees on one iteration
+  // desc = max number of dropped trees during one boosting iteration
   // desc = ``<=0`` means no limit
   int max_drop = 50;
 
   // check = >=0.0
   // check = <=1.0
   // desc = used only in ``dart``
-  // desc = probability of skipping drop
+  // desc = probability of skipping the dropout procedure during a boosting iteration
   double skip_drop = 0.5;
 
   // desc = used only in ``dart``
@@ -355,7 +356,7 @@ struct Config {
   std::vector<int8_t> monotone_constraints;
 
   // type = multi-double
-  // alias = fc, fp, feature_penalty
+  // alias = feature_contrib, fc, fp, feature_penalty
   // default = None
   // desc = used to control feature's split gain, will use ``gain[i] = max(0, feature_contri[i]) * gain[i]`` to replace the split gain of i-th feature
   // desc = you need to specify all features in order
@@ -395,10 +396,12 @@ struct Config {
   // desc = set this to larger value if data is very sparse
   int bin_construct_sample_cnt = 200000;
 
+  // alias = hist_pool_size
   // desc = max cache size in MB for historical histogram
   // desc = ``< 0`` means no limit
   double histogram_pool_size = -1.0;
 
+  // alias = data_seed
   // desc = random seed for data partition in parallel learning (excluding the ``feature_parallel`` mode)
   int data_random_seed = 1;
 
@@ -407,6 +410,7 @@ struct Config {
   // desc = **Note**: can be used only in CLI version
   std::string output_model = "LightGBM_model.txt";
 
+  // alias = save_period
   // desc = frequency of saving model file snapshot
   // desc = set this to positive value to enable this function. For example, the model file will be snapshotted at each iteration if ``snapshot_freq=1``
   // desc = **Note**: can be used only in CLI version
@@ -419,7 +423,7 @@ struct Config {
   // desc = **Note**: can be used only in CLI version
   std::string input_model = "";
 
-  // alias = predict_result, prediction_result
+  // alias = predict_result, prediction_result, predict_name, prediction_name, pred_name, name_pred
   // desc = filename of prediction result in ``prediction`` task
   // desc = **Note**: can be used only in CLI version
   std::string output_result = "LightGBM_predict_result.txt";
@@ -588,7 +592,7 @@ struct Config {
   // desc = used only in ``multi-class`` classification application
   int num_class = 1;
 
-  // alias = unbalanced_sets
+  // alias = unbalance, unbalanced_sets
   // desc = used only in ``binary`` application
   // desc = set this to ``true`` if training data are unbalance
   // desc = **Note**: this parameter cannot be used at the same time with ``scale_pos_weight``, choose only **one** of them
@@ -658,7 +662,7 @@ struct Config {
   // default = ""
   // type = multi-enum
   // desc = metric(s) to be evaluated on the evaluation sets **in addition** to what is provided in the training arguments
-  // descl2 = ``""`` (empty string or not specific) means that metric corresponding to specified ``objective`` will be used (this is possible only for pre-defined objective functions, otherwise no evaluation metric will be added)
+  // descl2 = ``""`` (empty string or not specified) means that metric corresponding to specified ``objective`` will be used (this is possible only for pre-defined objective functions, otherwise no evaluation metric will be added)
   // descl2 = ``"None"`` (string, **not** a ``None`` value) means that no metric will be registered, aliases: ``na``
   // descl2 = ``l1``, absolute loss, aliases: ``mean_absolute_error``, ``mae``, ``regression_l1``
   // descl2 = ``l2``, square loss, aliases: ``mean_squared_error``, ``mse``, ``regression_l2``, ``regression``
diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py
index 4da4b7c4a05..33dda5ed5e6 100644
--- a/python-package/lightgbm/basic.py
+++ b/python-package/lightgbm/basic.py
@@ -660,12 +660,8 @@ def _lazy_init(self, data, label=None, reference=None,
                     warnings.warn('{0} keyword has been found in `params` and will be ignored. '
                                   'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))
         self.predictor = predictor
-        if "verbosity" in params:
-            params.setdefault("verbose", params.pop("verbosity"))
         if silent:
             params["verbose"] = 0
-        elif "verbose" not in params:
-            params["verbose"] = 1
         # get categorical features
         if categorical_feature is not None:
             categorical_indices = set()
@@ -1340,12 +1336,8 @@ def __init__(self, params=None, train_set=None, model_file=None, silent=False):
         self.best_iteration = -1
         self.best_score = {}
         params = {} if params is None else params
-        if "verbosity" in params:
-            params.setdefault("verbose", params.pop("verbosity"))
         if silent:
             params["verbose"] = 0
-        elif "verbose" not in params:
-            params["verbose"] = 1
         if train_set is not None:
             # Training task
             if not isinstance(train_set, Dataset):
diff --git a/python-package/lightgbm/engine.py b/python-package/lightgbm/engine.py
index 0c3131ad90a..21d2678c799 100644
--- a/python-package/lightgbm/engine.py
+++ b/python-package/lightgbm/engine.py
@@ -98,7 +98,8 @@ def train(params, train_set, num_boost_round=100,
         The trained Booster model.
     """
     # create predictor first
-    for alias in ["num_boost_round", "num_iterations", "num_iteration", "num_tree", "num_trees", "num_round", "num_rounds", "n_estimators"]:
+    for alias in ["num_iterations", "num_iteration", "n_iter", "num_tree", "num_trees",
+                  "num_round", "num_rounds", "num_boost_round", "n_estimators"]:
         if alias in params:
             num_boost_round = int(params.pop(alias))
             warnings.warn("Found `{}` in params. Will use it instead of argument".format(alias))
@@ -396,7 +397,8 @@ def cv(params, train_set, num_boost_round=100,
     if not isinstance(train_set, Dataset):
         raise TypeError("Traninig only accepts Dataset object")
 
-    for alias in ["num_boost_round", "num_iterations", "num_iteration", "num_tree", "num_trees", "num_round", "num_rounds", "n_estimators"]:
+    for alias in ["num_iterations", "num_iteration", "n_iter", "num_tree", "num_trees",
+                  "num_round", "num_rounds", "num_boost_round", "n_estimators"]:
         if alias in params:
             warnings.warn("Found `{}` in params. Will use it instead of argument".format(alias))
             num_boost_round = params.pop(alias)
diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
index 6f60c2d3628..ecdf25e7ab3 100644
--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -397,9 +397,6 @@ def fit(self, X, y,
         self._fobj = None
         evals_result = {}
         params = self.get_params()
-        # sklearn interface has another naming convention
-        params.setdefault('seed', params.pop('random_state'))
-        params.setdefault('nthread', params.pop('n_jobs'))
         # user can set verbose with kwargs, it has higher priority
         if 'verbose' not in params and self.silent:
             params['verbose'] = 0
diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp
index 788ff6566d7..3a6e872b7e3 100644
--- a/src/io/config_auto.cpp
+++ b/src/io/config_auto.cpp
@@ -11,13 +11,16 @@ std::unordered_map<std::string, std::string> Config::alias_table({
   {"boost", "boosting"},
   {"train", "data"},
   {"train_data", "data"},
+  {"train_data_file", "data"},
   {"data_filename", "data"},
   {"test", "valid"},
   {"valid_data", "valid"},
   {"valid_data_file", "valid"},
   {"test_data", "valid"},
+  {"test_data_file", "valid"},
   {"valid_filenames", "valid"},
   {"num_iteration", "num_iterations"},
+  {"n_iter", "num_iterations"},
   {"num_tree", "num_iterations"},
   {"num_trees", "num_iterations"},
   {"num_round", "num_iterations"},
@@ -25,14 +28,20 @@ std::unordered_map<std::string, std::string> Config::alias_table({
   {"num_boost_round", "num_iterations"},
   {"n_estimators", "num_iterations"},
   {"shrinkage_rate", "learning_rate"},
+  {"eta", "learning_rate"},
   {"num_leaf", "num_leaves"},
+  {"max_leaves", "num_leaves"},
+  {"max_leaf", "num_leaves"},
   {"tree", "tree_learner"},
+  {"tree_type", "tree_learner"},
   {"tree_learner_type", "tree_learner"},
   {"num_thread", "num_threads"},
   {"nthread", "num_threads"},
   {"nthreads", "num_threads"},
+  {"n_jobs", "num_threads"},
   {"device", "device_type"},
   {"random_seed", "seed"},
+  {"random_state", "seed"},
   {"min_data_per_leaf", "min_data_in_leaf"},
   {"min_data", "min_data_in_leaf"},
   {"min_child_samples", "min_data_in_leaf"},
@@ -53,10 +62,13 @@ std::unordered_map<std::string, std::string> Config::alias_table({
   {"max_leaf_output", "max_delta_step"},
   {"reg_alpha", "lambda_l1"},
   {"reg_lambda", "lambda_l2"},
+  {"lambda", "lambda_l2"},
   {"min_split_gain", "min_gain_to_split"},
+  {"rate_drop", "drop_rate"},
   {"topk", "top_k"},
   {"mc", "monotone_constraints"},
   {"monotone_constraint", "monotone_constraints"},
+  {"feature_contrib", "feature_contri"},
   {"fc", "feature_contri"},
   {"fp", "feature_contri"},
   {"feature_penalty", "feature_contri"},
@@ -66,12 +78,19 @@ std::unordered_map<std::string, std::string> Config::alias_table({
   {"forced_splits", "forcedsplits_filename"},
   {"verbose", "verbosity"},
   {"subsample_for_bin", "bin_construct_sample_cnt"},
+  {"hist_pool_size", "histogram_pool_size"},
+  {"data_seed", "data_random_seed"},
   {"model_output", "output_model"},
   {"model_out", "output_model"},
+  {"save_period", "snapshot_freq"},
   {"model_input", "input_model"},
   {"model_in", "input_model"},
   {"predict_result", "output_result"},
   {"prediction_result", "output_result"},
+  {"predict_name", "output_result"},
+  {"prediction_name", "output_result"},
+  {"pred_name", "output_result"},
+  {"name_pred", "output_result"},
   {"init_score_filename", "initscore_filename"},
   {"init_score_file", "initscore_filename"},
   {"init_score", "initscore_filename"},
@@ -114,6 +133,7 @@ std::unordered_map<std::string, std::string> Config::alias_table({
   {"contrib", "predict_contrib"},
   {"convert_model_file", "convert_model"},
   {"num_classes", "num_class"},
+  {"unbalance", "is_unbalance"},
   {"unbalanced_sets", "is_unbalance"},
{"metrics", "metric"}, {"metric_types", "metric"}, diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index b2334291d12..c3640d2306f 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -179,17 +179,6 @@ def test_feature_importances_type(self): importance_gain_top1 = sorted(importances_gain, reverse=True)[0] self.assertNotEqual(importance_split_top1, importance_gain_top1) - def test_sklearn_backward_compatibility(self): - iris = load_iris() - X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=42) - - # Tests that `seed` is the same as `random_state` - clf_1 = lgb.sklearn.LGBMClassifier(seed=42, subsample=0.6, colsample_bytree=0.8) - clf_2 = lgb.sklearn.LGBMClassifier(random_state=42, subsample=0.6, colsample_bytree=0.8) - y_pred_1 = clf_1.fit(X_train, y_train).predict_proba(X_test) - y_pred_2 = clf_2.fit(X_train, y_train).predict_proba(X_test) - np.testing.assert_allclose(y_pred_1, y_pred_2) - # sklearn <0.19 cannot accept instance, but many tests could be passed only with min_data=1 and min_data_in_bin=1 @unittest.skipIf(not sklearn_at_least_019, 'scikit-learn version is less than 0.19') def test_sklearn_integration(self):