diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index 9de5e341f4b5b..f7bd5008287ce 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala @@ -73,6 +73,15 @@ class LinearRegression(override val uid: String) def setRegParam(value: Double): this.type = set(regParam, value) setDefault(regParam -> 0.0) + /** + * Set if we should fit the intercept + * Default is true. + * @group setParam + */ + def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value) + setDefault(fitIntercept -> true) + + /** * Set the ElasticNet mixing parameter. * For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. @@ -124,7 +133,7 @@ class LinearRegression(override val uid: String) val numFeatures = summarizer.mean.size val yMean = if ($(fitIntercept)) statCounter.mean else 0.0 val yStd = math.sqrt(statCounter.variance) - // look at glmnet6.m L761 maaaybe that has info + // look at glmnet5.m L761 maaaybe that has info // If the yStd is zero, then the intercept is yMean with zero weights; // as a result, training is not needed. @@ -247,11 +256,11 @@ class LinearRegressionModel private[ml] ( * where \bar{x_i} is the mean of x_i, \hat{x_i} is the standard deviation of x_i, * \bar{y} is the mean of label, and \hat{y} is the standard deviation of label. * - * If we are training with intercept disabled (that is forced through 0.0), - * we can use the same equation except \bar{y} and \bar{x_i} are 0 instead of the - * respective means. + * If we fitting the intercept disabled (that is forced through 0.0), + * we can use the same equation except we set \bar{y} and \bar{x_i} to 0 instead + * of the respective means. * - * With the intercept, this can be rewritten as + * This can be rewritten as * {{{ * L = 1/2n ||\sum_i (w_i/\hat{x_i})x_i - \sum_i (w_i/\hat{x_i})\bar{x_i} - y / \hat{y} * + \bar{y} / \hat{y}||^2 diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala index 732e2c42be144..1309b21c248af 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala @@ -26,6 +26,7 @@ import org.apache.spark.sql.{DataFrame, Row} class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { @transient var dataset: DataFrame = _ + @transient var datasetNR: DataFrame = _ /** * In `LinearRegressionSuite`, we will make sure that the model trained by SparkML @@ -35,13 +36,19 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { * import org.apache.spark.mllib.util.LinearDataGenerator * val data = * sc.parallelize(LinearDataGenerator.generateLinearInput(6.3, Array(4.7, 7.2), 10000, 42), 2) - * data.map(x=> x.label + ", " + x.features(0) + ", " + x.features(1)).saveAsTextFile("path") + * data.map(x=> x.label + ", " + x.features(0) + ", " + x.features(1)).coalesce(1).saveAsTextFile("path") + * val dataNR = + * sc.parallelize(LinearDataGenerator.generateLinearInput(0.0, Array(4.7, 7.2), 10000, 42), 2) + * dataNR.map(x=> x.label + ", " + x.features(0) + ", " + x.features(1)).coalesce(1).saveAsTextFile("pathNR") */ override def beforeAll(): Unit = { super.beforeAll() dataset = sqlContext.createDataFrame( sc.parallelize(LinearDataGenerator.generateLinearInput( 6.3, Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 10000, 42, 0.1), 2)) + datasetNR = sqlContext.createDataFrame( + sc.parallelize(LinearDataGenerator.generateLinearInput( + 0.0, Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 10000, 42, 0.1), 2)) } test("linear regression with intercept without regularization") { @@ -78,6 +85,48 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { } } + test("linear regression without intercept without regularization") { + val trainer = (new LinearRegression).setFitIntercept(false) + val model = trainer.fit(dataset) + val modelNR = trainer.fit(datasetNR) + + /** + * Using the following R code to load the data and train the model using glmnet package. + * + * library("glmnet") + * data <- read.csv("path", header=FALSE, stringsAsFactors=FALSE) + * features <- as.matrix(data.frame(as.numeric(data$V2), as.numeric(data$V3))) + * label <- as.numeric(data$V1) + * weights <- coef(glmnet(features, label, family="gaussian", alpha = 0, lambda = 0)) + * > weights + * 3 x 1 sparse Matrix of class "dgCMatrix" + * s0 + * (Intercept) . + * as.numeric.data.V2. 4.648385 + * as.numeric.data.V3. 7.462729 + */ + val weightsR = Array(4.648385, 7.462729) + + assert(model.intercept ~== 0 relTol 1E-3) + assert(model.weights(0) ~== weightsR(0) relTol 1E-3) + assert(model.weights(1) ~== weightsR(1) relTol 1E-3) + /** + * Then again with the data with no intercept: + * > weightsNR + * 3 x 1 sparse Matrix of class "dgCMatrix" + * s0 + * (Intercept) . + * as.numeric.dataNR.V2. 4.701019 + * as.numeric.dataNR.V3. 7.198280 + */ + val weightsRNR = Array(4.701019, 7.198280) + + assert(modelNR.intercept ~== 0 relTol 1E-3) + assert(modelNR.weights(0) ~== weightsRNR(0) relTol 1E-3) + assert(modelNR.weights(1) ~== weightsRNR(1) relTol 1E-3) + } + + test("linear regression with intercept with L1 regularization") { val trainer = (new LinearRegression).setElasticNetParam(1.0).setRegParam(0.57) val model = trainer.fit(dataset)