Skip to content

Commit

Permalink
Add a test, it fails!
Browse files Browse the repository at this point in the history
Add a test, it fails!

oops, compare with the values that were produced by R/in the comments

murh
  • Loading branch information
holdenk committed Jun 18, 2015
1 parent 5e84a0b commit e2140ba
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,15 @@ class LinearRegression(override val uid: String)
def setRegParam(value: Double): this.type = set(regParam, value)
setDefault(regParam -> 0.0)

/**
 * Set whether the model should fit an intercept term.
 * Default is true.
 *
 * @param value true to fit an intercept, false to force the intercept through 0.0
 * @group setParam
 */
def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value)
setDefault(fitIntercept -> true)


/**
* Set the ElasticNet mixing parameter.
* For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.
Expand Down Expand Up @@ -124,7 +133,7 @@ class LinearRegression(override val uid: String)
val numFeatures = summarizer.mean.size
val yMean = if ($(fitIntercept)) statCounter.mean else 0.0
val yStd = math.sqrt(statCounter.variance)
// look at glmnet6.m L761 maaaybe that has info
// look at glmnet5.m L761 maaaybe that has info

// If the yStd is zero, then the intercept is yMean with zero weights;
// as a result, training is not needed.
Expand Down Expand Up @@ -247,11 +256,11 @@ class LinearRegressionModel private[ml] (
* where \bar{x_i} is the mean of x_i, \hat{x_i} is the standard deviation of x_i,
* \bar{y} is the mean of label, and \hat{y} is the standard deviation of label.
*
* If we are training with intercept disabled (that is forced through 0.0),
* we can use the same equation except \bar{y} and \bar{x_i} are 0 instead of the
* respective means.
* If we are fitting with the intercept disabled (that is, it is forced through 0.0),
* we can use the same equation, except that \bar{y} and \bar{x_i} are set to 0 instead
* of the respective means.
*
* With the intercept, this can be rewritten as
* This can be rewritten as
* {{{
* L = 1/2n ||\sum_i (w_i/\hat{x_i})x_i - \sum_i (w_i/\hat{x_i})\bar{x_i} - y / \hat{y}
* + \bar{y} / \hat{y}||^2
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import org.apache.spark.sql.{DataFrame, Row}
class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {

@transient var dataset: DataFrame = _
@transient var datasetNR: DataFrame = _

/**
* In `LinearRegressionSuite`, we will make sure that the model trained by SparkML
Expand All @@ -35,13 +36,19 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
* import org.apache.spark.mllib.util.LinearDataGenerator
* val data =
* sc.parallelize(LinearDataGenerator.generateLinearInput(6.3, Array(4.7, 7.2), 10000, 42), 2)
* data.map(x=> x.label + ", " + x.features(0) + ", " + x.features(1)).saveAsTextFile("path")
* data.map(x=> x.label + ", " + x.features(0) + ", " + x.features(1)).coalesce(1).saveAsTextFile("path")
* val dataNR =
* sc.parallelize(LinearDataGenerator.generateLinearInput(0.0, Array(4.7, 7.2), 10000, 42), 2)
* dataNR.map(x=> x.label + ", " + x.features(0) + ", " + x.features(1)).coalesce(1).saveAsTextFile("pathNR")
*/
override def beforeAll(): Unit = {
  super.beforeAll()
  // Both datasets share the same generator configuration; only the intercept differs.
  def generate(intercept: Double): DataFrame = sqlContext.createDataFrame(
    sc.parallelize(LinearDataGenerator.generateLinearInput(
      intercept, Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 10000, 42, 0.1), 2))
  dataset = generate(6.3)      // data generated with a true intercept of 6.3
  datasetNR = generate(0.0)    // data generated with no intercept
}

test("linear regression with intercept without regularization") {
Expand Down Expand Up @@ -78,6 +85,48 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
}
}

test("linear regression without intercept without regularization") {
  val trainer = (new LinearRegression).setFitIntercept(false)

  /**
   * Reference coefficients were computed with R's glmnet package on the same data:
   *
   *   library("glmnet")
   *   data <- read.csv("path", header=FALSE, stringsAsFactors=FALSE)
   *   features <- as.matrix(data.frame(as.numeric(data$V2), as.numeric(data$V3)))
   *   label <- as.numeric(data$V1)
   *   weights <- coef(glmnet(features, label, family="gaussian", alpha = 0, lambda = 0))
   *   > weights
   *   3 x 1 sparse Matrix of class "dgCMatrix"
   *   s0
   *   (Intercept) .
   *   as.numeric.data.V2. 4.648385
   *   as.numeric.data.V3. 7.462729
   *
   * NOTE(review): the glmnet call above does not pass `intercept = FALSE`, while this
   * test disables intercept fitting via setFitIntercept(false) — confirm the reference
   * values were produced with a matching R configuration (the commit message suggests
   * this test currently fails).
   *
   * The second pair was produced the same way from the zero-intercept data ("pathNR"):
   *   > weightsNR
   *   3 x 1 sparse Matrix of class "dgCMatrix"
   *   s0
   *   (Intercept) .
   *   as.numeric.dataNR.V2. 4.701019
   *   as.numeric.dataNR.V3. 7.198280
   */
  val expectations = Seq(
    (dataset, Array(4.648385, 7.462729)),
    (datasetNR, Array(4.701019, 7.198280)))

  // Same checks as before, table-driven: intercept must be 0, weights must match R.
  for ((data, expectedWeights) <- expectations) {
    val fitted = trainer.fit(data)
    assert(fitted.intercept ~== 0 relTol 1E-3)
    assert(fitted.weights(0) ~== expectedWeights(0) relTol 1E-3)
    assert(fitted.weights(1) ~== expectedWeights(1) relTol 1E-3)
  }
}


test("linear regression with intercept with L1 regularization") {
val trainer = (new LinearRegression).setElasticNetParam(1.0).setRegParam(0.57)
val model = trainer.fit(dataset)
Expand Down

0 comments on commit e2140ba

Please sign in to comment.