Inherit from StreamingLinearAlgorithm

maropu · Jun 25, 2015 · d47cc24 · d47cc24
1 parent 1b4ddd6
commit d47cc24
Show file tree

Hide file tree

Showing 4 changed files with 15 additions and 53 deletions.
diff --git a/docs/mllib-linear-methods.md b/docs/mllib-linear-methods.md
@@ -803,7 +803,7 @@ model.setInitialWeights([0.0, 0.0, 0.0])
 
 Now we register the streams for training and testing and start the job.
 
-{% highlight scala %}
+{% highlight python %}
 model.trainOn(trainingData)
 print(model.predictOnValues(testData.map(lambda lp: (lp.label, lp.features))))
 

diff --git a/python/pyspark/mllib/classification.py b/python/pyspark/mllib/classification.py
@@ -630,10 +630,10 @@ def predictOnValues(self, dstream):
 @inherit_doc
 class StreamingLogisticRegressionWithSGD(StreamingLinearAlgorithm):
     """
-    Run LogisticRegression with SGD on a stream of data.
+    Run LogisticRegression with SGD on a batch of data.
 
     The weights obtained at the end of training a stream are used as initial
-    weights for the next stream.
+    weights for the next batch.
 
     :param stepSize: Step size for each iteration of gradient descent.
     :param numIterations: Number of iterations run for each batch of data.

diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py
@@ -20,6 +20,7 @@
 
 from pyspark import RDD
 from pyspark.streaming.dstream import DStream
+from pyspark.mllib.classification import StreamingLinearAlgorithm
 from pyspark.mllib.common import callMLlibFunc, _py2java, _java2py, inherit_doc
 from pyspark.mllib.linalg import SparseVector, Vectors, _convert_to_vector
 from pyspark.mllib.util import Saveable, Loader
@@ -572,17 +573,16 @@ def train(cls, data, isotonic=True):
 
 
 @inherit_doc
-class StreamingLinearRegressionWithSGD(LinearRegressionModel):
+class StreamingLinearRegressionWithSGD(StreamingLinearAlgorithm):
     """
-    Run LinearRegression with SGD on a stream of data.
+    Run LinearRegression with SGD on a batch of data.
 
     The problem minimized is (1 / n_samples) * (y - weights'X)**2.
-    After training on a stream of data, the weights obtained at the end of
-    training are used as initial weights for the next stream of data.
+    After training on a batch of data, the weights obtained at the end of
+    training are used as initial weights for the next batch.
 
-    :param: stepSize          Step size for each iteration of gradient
-                              descent.
-    :param: numIterations     Total number of iterations run.
+    :param stepSize: Step size for each iteration of gradient descent.
+    :param numIterations: Total number of iterations run.
     :param: miniBatchFraction Fraction of data on which SGD is run for each
                               iteration.
     """
@@ -591,29 +591,8 @@ def __init__(self, stepSize=0.1, numIterations=50, miniBatchFraction=1.0):
         self.numIterations = numIterations
         self.miniBatchFraction = miniBatchFraction
         self._model = None
-
-    def _validate_dstream(self, dstream):
-        if not isinstance(dstream, DStream):
-            raise TypeError(
-                "dstream should be a DStream object, got %s" % type(dstream))
-        if not self._model:
-            raise ValueError(
-                "Model must be intialized using setInitialWeights")
-
-    @property
-    def latestModel(self):
-        """Returns a LinearRegressionModel fit on the latest stream of data.
-
-        The weights and intercepts can be got from the `weights` and
-        `intercept` attributes.
-        """
-        return self._model
-
-    def __repr__(self):
-        if self._model is None:
-            return '(weights=None, intercept=None)'
-        else:
-            return str(self._model)
+        super(StreamingLinearRegressionWithSGD, self).__init__(
+            model=self._model)
 
     def setInitialWeights(self, initialWeights):
         """
@@ -639,23 +618,6 @@ def update(rdd):
 
         dstream.foreachRDD(update)
 
-    def predictOn(self, dstream):
-        """
-        Make predictions on a dstream of Vectors.
-
-        :return: Transformed dstream object.
-        """
-        self._validate_dstream(dstream)
-        return dstream.map(lambda x: self._model.predict(x))
-
-    def predictOnValues(self, dstream):
-        """Make predictions on a keyed dstream where the values are Vectors.
-
-        :return: Transformed dstream object.
-        """
-        self._validate_dstream(dstream)
-        return dstream.mapValues(lambda x: self._model.predict(x))
-
 
 def _test():
     import doctest

diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py
@@ -1201,8 +1201,8 @@ def test_parameter_accuracy(self):
         self.ssc.start()
         self._ssc_wait(t, 10, 0.01)
         self.assertArrayAlmostEqual(
-            slr.latestModel.weights.array, [10., 10.], 1)
-        self.assertAlmostEqual(slr.latestModel.intercept, 0.0, 1)
+            slr.latestModel().weights.array, [10., 10.], 1)
+        self.assertAlmostEqual(slr.latestModel().intercept, 0.0, 1)
 
     def test_parameter_convergence(self):
         """Test that the model parameters improve with streaming data."""
@@ -1219,7 +1219,7 @@ def test_parameter_convergence(self):
         model_weights = []
         input_stream = self.ssc.queueStream(batches)
         input_stream.foreachRDD(
-            lambda x: model_weights.append(slr.latestModel.weights[0]))
+            lambda x: model_weights.append(slr.latestModel().weights[0]))
         t = time()
         slr.trainOn(input_stream)
         self.ssc.start()