[SPARK-7781] [MLLIB] gradient boosted trees.train regressor missing max bins

Author: Holden Karau <holden@pigscanfly.ca>

Closes apache#6331 from holdenk/SPARK-7781-GradientBoostedTrees.trainRegressor-missing-max-bins and squashes the following commits:

2894695 [Holden Karau] remove extra blank line
2573e8d [Holden Karau] Update the scala side of the pythonmllibapi and make the test a bit nicer too
3a09170 [Holden Karau] add maxBins to the train method as well
af7f274 [Holden Karau] Add maxBins to GradientBoostedTrees.trainRegressor and correctly mention the default of 32 in other places where it mentioned 100

(cherry picked from commit 164fe2a)
Signed-off-by: Joseph K. Bradley <joseph@databricks.com>
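
For context, a minimal PySpark sketch (not part of this commit) of calling the newly exposed parameter. The SparkContext setup, app name, and toy data are illustrative assumptions; GradientBoostedTrees.trainRegressor, categoricalFeaturesInfo, numIterations, and maxBins come from the patched API:

from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import GradientBoostedTrees

sc = SparkContext("local", "maxBins-sketch")  # hypothetical app name

# Toy data: feature 0 is categorical with 3 categories (values 0, 1, 2),
# so any maxBins >= 3 is acceptable for the categorical split.
data = sc.parallelize([
    LabeledPoint(0.0, [0.0, 1.0]),
    LabeledPoint(1.0, [1.0, 2.0]),
    LabeledPoint(2.0, [2.0, 3.0]),
    LabeledPoint(1.5, [1.0, 4.0]),
])

# Before this patch the Python wrapper always fell back to the Scala-side
# default of 32 bins; after it, maxBins can be passed explicitly.
model = GradientBoostedTrees.trainRegressor(
    data, categoricalFeaturesInfo={0: 3}, numIterations=4, maxBins=32)
print(model.predict([1.0, 2.0]))

sc.stop()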
holdenk authored and nemccarthy committed Jun 25, 2015
1 parent 805b62a commit d839952
Showing 3 changed files with 24 additions and 9 deletions.
4 changes: 3 additions & 1 deletion mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -685,12 +685,14 @@ private[python] class PythonMLLibAPI extends Serializable {
       lossStr: String,
       numIterations: Int,
       learningRate: Double,
-      maxDepth: Int): GradientBoostedTreesModel = {
+      maxDepth: Int,
+      maxBins: Int): GradientBoostedTreesModel = {
     val boostingStrategy = BoostingStrategy.defaultParams(algoStr)
     boostingStrategy.setLoss(Losses.fromString(lossStr))
     boostingStrategy.setNumIterations(numIterations)
     boostingStrategy.setLearningRate(learningRate)
     boostingStrategy.treeStrategy.setMaxDepth(maxDepth)
+    boostingStrategy.treeStrategy.setMaxBins(maxBins)
     boostingStrategy.treeStrategy.categoricalFeaturesInfo = categoricalFeaturesInfo.asScala.toMap

     val cached = data.rdd.persist(StorageLevel.MEMORY_AND_DISK)
7 changes: 7 additions & 0 deletions python/pyspark/mllib/tests.py
@@ -444,6 +444,13 @@ def test_regression(self):
         except ValueError:
             self.fail()

+        # Verify that maxBins is being passed through
+        GradientBoostedTrees.trainRegressor(
+            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4, maxBins=32)
+        with self.assertRaises(Exception) as cm:
+            GradientBoostedTrees.trainRegressor(
+                rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4, maxBins=1)
+

 class StatTests(MLlibTestCase):
     # SPARK-4023
22 changes: 14 additions & 8 deletions python/pyspark/mllib/tree.py
@@ -299,7 +299,7 @@ def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, numTrees,
             1 internal node + 2 leaf nodes. (default: 4)
         :param maxBins: maximum number of bins used for splitting
                         features
-                        (default: 100)
+                        (default: 32)
         :param seed: Random seed for bootstrapping and choosing feature
                      subsets.
         :return: RandomForestModel that can be used for prediction
@@ -377,7 +377,7 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, numTrees, featureSubsetSt
             1 leaf node; depth 1 means 1 internal node + 2 leaf
             nodes. (default: 4)
         :param maxBins: maximum number of bins used for splitting
-                        features (default: 100)
+                        features (default: 32)
         :param seed: Random seed for bootstrapping and choosing feature
                      subsets.
         :return: RandomForestModel that can be used for prediction
@@ -435,16 +435,17 @@ class GradientBoostedTrees(object):

     @classmethod
     def _train(cls, data, algo, categoricalFeaturesInfo,
-               loss, numIterations, learningRate, maxDepth):
+               loss, numIterations, learningRate, maxDepth, maxBins):
         first = data.first()
         assert isinstance(first, LabeledPoint), "the data should be RDD of LabeledPoint"
         model = callMLlibFunc("trainGradientBoostedTreesModel", data, algo, categoricalFeaturesInfo,
-                              loss, numIterations, learningRate, maxDepth)
+                              loss, numIterations, learningRate, maxDepth, maxBins)
         return GradientBoostedTreesModel(model)

     @classmethod
     def trainClassifier(cls, data, categoricalFeaturesInfo,
-                        loss="logLoss", numIterations=100, learningRate=0.1, maxDepth=3):
+                        loss="logLoss", numIterations=100, learningRate=0.1, maxDepth=3,
+                        maxBins=32):
         """
         Method to train a gradient-boosted trees model for
         classification.
@@ -467,6 +468,8 @@ def trainClassifier(cls, data, categoricalFeaturesInfo,
         :param maxDepth: Maximum depth of the tree. E.g., depth 0 means
             1 leaf node; depth 1 means 1 internal node + 2 leaf
             nodes. (default: 3)
+        :param maxBins: maximum number of bins used for splitting
+                        features (default: 32) DecisionTree requires maxBins >= max categories
         :return: GradientBoostedTreesModel that can be used for
                  prediction

@@ -499,11 +502,12 @@ def trainClassifier(cls, data, categoricalFeaturesInfo,
         [1.0, 0.0]
         """
         return cls._train(data, "classification", categoricalFeaturesInfo,
-                          loss, numIterations, learningRate, maxDepth)
+                          loss, numIterations, learningRate, maxDepth, maxBins)

     @classmethod
     def trainRegressor(cls, data, categoricalFeaturesInfo,
-                       loss="leastSquaresError", numIterations=100, learningRate=0.1, maxDepth=3):
+                       loss="leastSquaresError", numIterations=100, learningRate=0.1, maxDepth=3,
+                       maxBins=32):
         """
         Method to train a gradient-boosted trees model for regression.

@@ -522,6 +526,8 @@ def trainRegressor(cls, data, categoricalFeaturesInfo,
             contribution of each estimator. The learning rate
             should be between in the interval (0, 1].
             (default: 0.1)
+        :param maxBins: maximum number of bins used for splitting
+                        features (default: 32) DecisionTree requires maxBins >= max categories
         :param maxDepth: Maximum depth of the tree. E.g., depth 0 means
             1 leaf node; depth 1 means 1 internal node + 2 leaf
             nodes. (default: 3)
@@ -556,7 +562,7 @@ def trainRegressor(cls, data, categoricalFeaturesInfo,
         [1.0, 0.0]
         """
         return cls._train(data, "regression", categoricalFeaturesInfo,
-                          loss, numIterations, learningRate, maxDepth)
+                          loss, numIterations, learningRate, maxDepth, maxBins)


 def _test():
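
Following up on the docstring note added above ("DecisionTree requires maxBins >= max categories"), here is a hedged sketch of that constraint on the classifier path, which gained the same keyword. The app name and toy data are illustrative, and the assumption that the JVM-side failure surfaces in Python as a generic Exception mirrors what the new test in tests.py asserts with maxBins=1:

from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import GradientBoostedTrees

sc = SparkContext("local", "maxBins-constraint-sketch")  # hypothetical app name

# Feature 0 is categorical with 4 categories (values 0..3), so maxBins must be >= 4.
data = sc.parallelize([
    LabeledPoint(0.0, [0.0, 1.0]),
    LabeledPoint(1.0, [1.0, 2.0]),
    LabeledPoint(0.0, [2.0, 3.0]),
    LabeledPoint(1.0, [3.0, 4.0]),
])

# The classifier wrapper now accepts the same keyword; 32 comfortably covers 4 categories.
GradientBoostedTrees.trainClassifier(
    data, categoricalFeaturesInfo={0: 4}, numIterations=4, maxBins=32)

# An undersized maxBins is rejected on the JVM side and surfaces as an Exception.
try:
    GradientBoostedTrees.trainClassifier(
        data, categoricalFeaturesInfo={0: 4}, numIterations=4, maxBins=1)
except Exception as exc:
    print("maxBins too small for the categorical feature: %s" % exc)

sc.stop()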
