Skip to content

Commit

Permalink
Add a test, it fails!
Browse files Browse the repository at this point in the history
Add a test, it fails!

oops, compare with the values that were produced by R/in the comments

murh
  • Loading branch information
holdenk committed Jun 18, 2015
1 parent 5e84a0b commit e2140ba
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,15 @@ class LinearRegression(override val uid: String)
def setRegParam(value: Double): this.type = set(regParam, value)
setDefault(regParam -> 0.0)

/**
 * Set whether the model should fit an intercept term.
 * Default is true.
 *
 * @param value true to fit an intercept, false to force the intercept through 0.0
 * @group setParam
 */
def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value)
setDefault(fitIntercept -> true)


/**
* Set the ElasticNet mixing parameter.
* For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.
Expand Down Expand Up @@ -124,7 +133,7 @@ class LinearRegression(override val uid: String)
val numFeatures = summarizer.mean.size
val yMean = if ($(fitIntercept)) statCounter.mean else 0.0
val yStd = math.sqrt(statCounter.variance)
// look at glmnet6.m L761 maaaybe that has info
// look at glmnet5.m L761 maaaybe that has info

// If the yStd is zero, then the intercept is yMean with zero weights;
// as a result, training is not needed.
Expand Down Expand Up @@ -247,11 +256,11 @@ class LinearRegressionModel private[ml] (
* where \bar{x_i} is the mean of x_i, \hat{x_i} is the standard deviation of x_i,
* \bar{y} is the mean of label, and \hat{y} is the standard deviation of label.
*
* If we are training with intercept disabled (that is forced through 0.0),
* we can use the same equation except \bar{y} and \bar{x_i} are 0 instead of the
* respective means.
* If we are fitting with the intercept disabled (that is, it is forced through 0.0),
* we can use the same equation, except that \bar{y} and \bar{x_i} are set to 0 instead
* of the respective means.
*
* With the intercept, this can be rewritten as
* This can be rewritten as
* {{{
* L = 1/2n ||\sum_i (w_i/\hat{x_i})x_i - \sum_i (w_i/\hat{x_i})\bar{x_i} - y / \hat{y}
* + \bar{y} / \hat{y}||^2
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import org.apache.spark.sql.{DataFrame, Row}
class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {

@transient var dataset: DataFrame = _
@transient var datasetNR: DataFrame = _

/**
* In `LinearRegressionSuite`, we will make sure that the model trained by SparkML
Expand All @@ -35,13 +36,19 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
* import org.apache.spark.mllib.util.LinearDataGenerator
* val data =
* sc.parallelize(LinearDataGenerator.generateLinearInput(6.3, Array(4.7, 7.2), 10000, 42), 2)
* data.map(x=> x.label + ", " + x.features(0) + ", " + x.features(1)).saveAsTextFile("path")
* data.map(x=> x.label + ", " + x.features(0) + ", " + x.features(1)).coalesce(1).saveAsTextFile("path")
* val dataNR =
* sc.parallelize(LinearDataGenerator.generateLinearInput(0.0, Array(4.7, 7.2), 10000, 42), 2)
* dataNR.map(x=> x.label + ", " + x.features(0) + ", " + x.features(1)).coalesce(1).saveAsTextFile("pathNR")
*/
override def beforeAll(): Unit = {
  super.beforeAll()
  // Both datasets share the same generator configuration; only the intercept differs.
  def generate(intercept: Double): DataFrame = sqlContext.createDataFrame(
    sc.parallelize(LinearDataGenerator.generateLinearInput(
      intercept, Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 10000, 42, 0.1), 2))
  dataset = generate(6.3)      // data generated with a true intercept of 6.3
  datasetNR = generate(0.0)    // data generated with no intercept
}

test("linear regression with intercept without regularization") {
Expand Down Expand Up @@ -78,6 +85,48 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
}
}

test("linear regression without intercept without regularization") {
  val trainer = (new LinearRegression).setFitIntercept(false)

  /**
   * Reference coefficients were computed with R's glmnet package on the same data:
   *
   *   library("glmnet")
   *   data <- read.csv("path", header=FALSE, stringsAsFactors=FALSE)
   *   features <- as.matrix(data.frame(as.numeric(data$V2), as.numeric(data$V3)))
   *   label <- as.numeric(data$V1)
   *   weights <- coef(glmnet(features, label, family="gaussian", alpha = 0, lambda = 0))
   *   > weights
   *   3 x 1 sparse Matrix of class "dgCMatrix"
   *   s0
   *   (Intercept) .
   *   as.numeric.data.V2. 4.648385
   *   as.numeric.data.V3. 7.462729
   *
   * NOTE(review): the glmnet call above does not pass `intercept = FALSE`, while this
   * test disables intercept fitting via setFitIntercept(false) — confirm the reference
   * values were produced with a matching R configuration (the commit message suggests
   * this test currently fails).
   *
   * The second pair was produced the same way from the zero-intercept data ("pathNR"):
   *   > weightsNR
   *   3 x 1 sparse Matrix of class "dgCMatrix"
   *   s0
   *   (Intercept) .
   *   as.numeric.dataNR.V2. 4.701019
   *   as.numeric.dataNR.V3. 7.198280
   */
  val expectations = Seq(
    (dataset, Array(4.648385, 7.462729)),
    (datasetNR, Array(4.701019, 7.198280)))

  // Same checks as before, table-driven: intercept must be 0, weights must match R.
  for ((data, expectedWeights) <- expectations) {
    val fitted = trainer.fit(data)
    assert(fitted.intercept ~== 0 relTol 1E-3)
    assert(fitted.weights(0) ~== expectedWeights(0) relTol 1E-3)
    assert(fitted.weights(1) ~== expectedWeights(1) relTol 1E-3)
  }
}


test("linear regression with intercept with L1 regularization") {
val trainer = (new LinearRegression).setElasticNetParam(1.0).setRegParam(0.57)
val model = trainer.fit(dataset)
Expand Down

0 comments on commit e2140ba

Please sign in to comment.