From d83995271bbf319735efccfd0a36c99b10243d7a Mon Sep 17 00:00:00 2001
From: Holden Karau <holden@pigscanfly.ca>
Date: Mon, 22 Jun 2015 22:40:19 -0700
Subject: [PATCH] [SPARK-7781] [MLLIB] gradient boosted trees.train regressor
 missing max bins

Author: Holden Karau <holden@pigscanfly.ca>

Closes #6331 from holdenk/SPARK-7781-GradientBoostedTrees.trainRegressor-missing-max-bins and squashes the following commits:

2894695 [Holden Karau] remove extra blank line
2573e8d [Holden Karau] Update the scala side of the pythonmllibapi and make the test a bit nicer too
3a09170 [Holden Karau] add maxBins to the train method as well
af7f274 [Holden Karau] Add maxBins to GradientBoostedTrees.trainRegressor and correctly mention the default of 32 in other places where it mentioned 100

(cherry picked from commit 164fe2aa44993da6c77af6de5efdae47a8b3958c)
Signed-off-by: Joseph K. Bradley <joseph@databricks.com>
---
 .../mllib/api/python/PythonMLLibAPI.scala     |  4 +++-
 python/pyspark/mllib/tests.py                 |  7 ++++++
 python/pyspark/mllib/tree.py                  | 22 ++++++++++++-------
 3 files changed, 24 insertions(+), 9 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
index 16f3131796709..d1b2c98a547ed 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -685,12 +685,14 @@ private[python] class PythonMLLibAPI extends Serializable {
       lossStr: String,
       numIterations: Int,
       learningRate: Double,
-      maxDepth: Int): GradientBoostedTreesModel = {
+      maxDepth: Int,
+      maxBins: Int): GradientBoostedTreesModel = {
     val boostingStrategy = BoostingStrategy.defaultParams(algoStr)
     boostingStrategy.setLoss(Losses.fromString(lossStr))
     boostingStrategy.setNumIterations(numIterations)
     boostingStrategy.setLearningRate(learningRate)
     boostingStrategy.treeStrategy.setMaxDepth(maxDepth)
+    boostingStrategy.treeStrategy.setMaxBins(maxBins)
     boostingStrategy.treeStrategy.categoricalFeaturesInfo = categoricalFeaturesInfo.asScala.toMap
 
     val cached = data.rdd.persist(StorageLevel.MEMORY_AND_DISK)
diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py
index 7a113f8751ff8..4335143a8dd59 100644
--- a/python/pyspark/mllib/tests.py
+++ b/python/pyspark/mllib/tests.py
@@ -444,6 +444,13 @@ def test_regression(self):
         except ValueError:
             self.fail()
 
+        # Verify that maxBins is being passed through
+        GradientBoostedTrees.trainRegressor(
+            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4, maxBins=32)
+        with self.assertRaises(Exception) as cm:
+            GradientBoostedTrees.trainRegressor(
+                rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4, maxBins=1)
+
 
 class StatTests(MLlibTestCase):
     # SPARK-4023
diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py
index cfcbea573fd22..372b86a7c95d9 100644
--- a/python/pyspark/mllib/tree.py
+++ b/python/pyspark/mllib/tree.py
@@ -299,7 +299,7 @@ def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, numTrees,
                  1 internal node + 2 leaf nodes. (default: 4)
         :param maxBins: maximum number of bins used for splitting
                  features
-                 (default: 100)
+                 (default: 32)
         :param seed: Random seed for bootstrapping and choosing feature
                  subsets.
         :return: RandomForestModel that can be used for prediction
@@ -377,7 +377,7 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, numTrees, featureSubsetSt
                  1 leaf node; depth 1 means 1 internal node + 2 leaf
                  nodes. (default: 4)
         :param maxBins: maximum number of bins used for splitting
-                 features (default: 100)
+                 features (default: 32)
         :param seed: Random seed for bootstrapping and choosing feature
                  subsets.
         :return: RandomForestModel that can be used for prediction
@@ -435,16 +435,17 @@ class GradientBoostedTrees(object):
 
     @classmethod
     def _train(cls, data, algo, categoricalFeaturesInfo,
-               loss, numIterations, learningRate, maxDepth):
+               loss, numIterations, learningRate, maxDepth, maxBins):
         first = data.first()
         assert isinstance(first, LabeledPoint), "the data should be RDD of LabeledPoint"
         model = callMLlibFunc("trainGradientBoostedTreesModel", data, algo, categoricalFeaturesInfo,
-                              loss, numIterations, learningRate, maxDepth)
+                              loss, numIterations, learningRate, maxDepth, maxBins)
         return GradientBoostedTreesModel(model)
 
     @classmethod
     def trainClassifier(cls, data, categoricalFeaturesInfo,
-                        loss="logLoss", numIterations=100, learningRate=0.1, maxDepth=3):
+                        loss="logLoss", numIterations=100, learningRate=0.1, maxDepth=3,
+                        maxBins=32):
         """
         Method to train a gradient-boosted trees model for
         classification.
@@ -467,6 +468,8 @@ def trainClassifier(cls, data, categoricalFeaturesInfo,
         :param maxDepth: Maximum depth of the tree. E.g., depth 0 means
                  1 leaf node; depth 1 means 1 internal node + 2 leaf
                  nodes. (default: 3)
+        :param maxBins: maximum number of bins used for splitting
+                 features (default: 32). DecisionTree requires maxBins >= max categories.
         :return: GradientBoostedTreesModel that can be used for
                    prediction
 
@@ -499,11 +502,12 @@ def trainClassifier(cls, data, categoricalFeaturesInfo,
         [1.0, 0.0]
         """
         return cls._train(data, "classification", categoricalFeaturesInfo,
-                          loss, numIterations, learningRate, maxDepth)
+                          loss, numIterations, learningRate, maxDepth, maxBins)
 
     @classmethod
     def trainRegressor(cls, data, categoricalFeaturesInfo,
-                       loss="leastSquaresError", numIterations=100, learningRate=0.1, maxDepth=3):
+                       loss="leastSquaresError", numIterations=100, learningRate=0.1, maxDepth=3,
+                       maxBins=32):
         """
         Method to train a gradient-boosted trees model for regression.
 
@@ -522,6 +526,8 @@ def trainRegressor(cls, data, categoricalFeaturesInfo,
                  contribution of each estimator. The learning rate
                  should be between in the interval (0, 1].
                  (default: 0.1)
+        :param maxBins: maximum number of bins used for splitting
+                 features (default: 32). DecisionTree requires maxBins >= max categories.
         :param maxDepth: Maximum depth of the tree. E.g., depth 0 means
                  1 leaf node; depth 1 means 1 internal node + 2 leaf
                  nodes.  (default: 3)
@@ -556,7 +562,7 @@ def trainRegressor(cls, data, categoricalFeaturesInfo,
         [1.0, 0.0]
         """
         return cls._train(data, "regression", categoricalFeaturesInfo,
-                          loss, numIterations, learningRate, maxDepth)
+                          loss, numIterations, learningRate, maxDepth, maxBins)
 
 
 def _test():