Skip to content

Commit

Permalink
Merge branch 'master' of github.com:apache/spark into unsafe_obj
Browse files Browse the repository at this point in the history
  • Loading branch information
Davies Liu committed Jun 23, 2015
2 parents 035501e + a803118 commit 39f09ca
Show file tree
Hide file tree
Showing 26 changed files with 505 additions and 223 deletions.
16 changes: 15 additions & 1 deletion R/pkg/inst/profile/shell.R
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,21 @@
sc <- SparkR::sparkR.init()
assign("sc", sc, envir=.GlobalEnv)
sqlContext <- SparkR::sparkRSQL.init(sc)
sparkVer <- SparkR:::callJMethod(sc, "version")
assign("sqlContext", sqlContext, envir=.GlobalEnv)
cat("\n Welcome to SparkR!")
cat("\n Welcome to")
cat("\n")
cat(" ____ __", "\n")
cat(" / __/__ ___ _____/ /__", "\n")
cat(" _\\ \\/ _ \\/ _ `/ __/ '_/", "\n")
cat(" /___/ .__/\\_,_/_/ /_/\\_\\")
if (nchar(sparkVer) == 0) {
cat("\n")
} else {
cat(" version ", sparkVer, "\n")
}
cat(" /_/", "\n")
cat("\n")

cat("\n Spark context is available as sc, SQL context is available as sqlContext\n")
}
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ import org.apache.spark.Logging
import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.PredictorParams
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.{HasElasticNetParam, HasMaxIter, HasRegParam, HasTol}
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.linalg.BLAS._
Expand All @@ -41,7 +41,8 @@ import org.apache.spark.util.StatCounter
* Params for linear regression.
*/
private[regression] trait LinearRegressionParams extends PredictorParams
with HasRegParam with HasElasticNetParam with HasMaxIter with HasTol
with HasRegParam with HasElasticNetParam with HasMaxIter with HasTol
with HasFitIntercept

/**
* :: Experimental ::
Expand Down Expand Up @@ -72,6 +73,14 @@ class LinearRegression(override val uid: String)
def setRegParam(value: Double): this.type = set(regParam, value)
setDefault(regParam -> 0.0)

/**
* Set if we should fit the intercept
* Default is true.
* @group setParam
*/
def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value)
setDefault(fitIntercept -> true)

/**
* Set the ElasticNet mixing parameter.
* For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.
Expand Down Expand Up @@ -123,6 +132,7 @@ class LinearRegression(override val uid: String)
val numFeatures = summarizer.mean.size
val yMean = statCounter.mean
val yStd = math.sqrt(statCounter.variance)
// look at glmnet5.m L761 maaaybe that has info

// If the yStd is zero, then the intercept is yMean with zero weights;
// as a result, training is not needed.
Expand All @@ -142,7 +152,7 @@ class LinearRegression(override val uid: String)
val effectiveL1RegParam = $(elasticNetParam) * effectiveRegParam
val effectiveL2RegParam = (1.0 - $(elasticNetParam)) * effectiveRegParam

val costFun = new LeastSquaresCostFun(instances, yStd, yMean,
val costFun = new LeastSquaresCostFun(instances, yStd, yMean, $(fitIntercept),
featuresStd, featuresMean, effectiveL2RegParam)

val optimizer = if ($(elasticNetParam) == 0.0 || effectiveRegParam == 0.0) {
Expand Down Expand Up @@ -180,7 +190,7 @@ class LinearRegression(override val uid: String)
// The intercept in R's GLMNET is computed using closed form after the coefficients are
// converged. See the following discussion for detail.
// http://stats.stackexchange.com/questions/13617/how-is-the-intercept-computed-in-glmnet
val intercept = yMean - dot(weights, Vectors.dense(featuresMean))
val intercept = if ($(fitIntercept)) yMean - dot(weights, Vectors.dense(featuresMean)) else 0.0
if (handlePersistence) instances.unpersist()

// TODO: Converts to sparse format based on the storage, but may base on the scoring speed.
Expand Down Expand Up @@ -234,13 +244,18 @@ class LinearRegressionModel private[ml] (
* See this discussion for detail.
* http://stats.stackexchange.com/questions/13617/how-is-the-intercept-computed-in-glmnet
*
* When training with intercept enabled,
* The objective function in the scaled space is given by
* {{{
* L = 1/2n ||\sum_i w_i(x_i - \bar{x_i}) / \hat{x_i} - (y - \bar{y}) / \hat{y}||^2,
* }}}
* where \bar{x_i} is the mean of x_i, \hat{x_i} is the standard deviation of x_i,
* \bar{y} is the mean of label, and \hat{y} is the standard deviation of label.
*
* If we fitting the intercept disabled (that is forced through 0.0),
* we can use the same equation except we set \bar{y} and \bar{x_i} to 0 instead
* of the respective means.
*
* This can be rewritten as
* {{{
* L = 1/2n ||\sum_i (w_i/\hat{x_i})x_i - \sum_i (w_i/\hat{x_i})\bar{x_i} - y / \hat{y}
Expand All @@ -255,6 +270,7 @@ class LinearRegressionModel private[ml] (
* \sum_i w_i^\prime x_i - y / \hat{y} + offset
* }}}
*
*
* Note that the effective weights and offset don't depend on training dataset,
* so they can be precomputed.
*
Expand Down Expand Up @@ -301,6 +317,7 @@ private class LeastSquaresAggregator(
weights: Vector,
labelStd: Double,
labelMean: Double,
fitIntercept: Boolean,
featuresStd: Array[Double],
featuresMean: Array[Double]) extends Serializable {

Expand All @@ -321,7 +338,7 @@ private class LeastSquaresAggregator(
}
i += 1
}
(weightsArray, -sum + labelMean / labelStd, weightsArray.length)
(weightsArray, if (fitIntercept) labelMean / labelStd - sum else 0.0, weightsArray.length)
}

private val effectiveWeightsVector = Vectors.dense(effectiveWeightsArray)
Expand Down Expand Up @@ -404,6 +421,7 @@ private class LeastSquaresCostFun(
data: RDD[(Double, Vector)],
labelStd: Double,
labelMean: Double,
fitIntercept: Boolean,
featuresStd: Array[Double],
featuresMean: Array[Double],
effectiveL2regParam: Double) extends DiffFunction[BDV[Double]] {
Expand All @@ -412,7 +430,7 @@ private class LeastSquaresCostFun(
val w = Vectors.fromBreeze(weights)

val leastSquaresAggregator = data.treeAggregate(new LeastSquaresAggregator(w, labelStd,
labelMean, featuresStd, featuresMean))(
labelMean, fitIntercept, featuresStd, featuresMean))(
seqOp = (c, v) => (c, v) match {
case (aggregator, (label, features)) => aggregator.add(label, features)
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ import org.apache.spark.mllib.tree.loss.Losses
import org.apache.spark.mllib.tree.model.{DecisionTreeModel, GradientBoostedTreesModel, RandomForestModel}
import org.apache.spark.mllib.tree.{DecisionTree, GradientBoostedTrees, RandomForest}
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.util.LinearDataGenerator
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.DataFrame
import org.apache.spark.storage.StorageLevel
Expand Down Expand Up @@ -972,7 +973,7 @@ private[python] class PythonMLLibAPI extends Serializable {
def estimateKernelDensity(
sample: JavaRDD[Double],
bandwidth: Double, points: java.util.ArrayList[Double]): Array[Double] = {
return new KernelDensity().setSample(sample).setBandwidth(bandwidth).estimate(
new KernelDensity().setSample(sample).setBandwidth(bandwidth).estimate(
points.asScala.toArray)
}

Expand All @@ -991,6 +992,35 @@ private[python] class PythonMLLibAPI extends Serializable {
List[AnyRef](model.clusterCenters, Vectors.dense(model.clusterWeights)).asJava
}

/**
* Wrapper around the generateLinearInput method of LinearDataGenerator.
*/
def generateLinearInputWrapper(
intercept: Double,
weights: JList[Double],
xMean: JList[Double],
xVariance: JList[Double],
nPoints: Int,
seed: Int,
eps: Double): Array[LabeledPoint] = {
LinearDataGenerator.generateLinearInput(
intercept, weights.asScala.toArray, xMean.asScala.toArray,
xVariance.asScala.toArray, nPoints, seed, eps).toArray
}

/**
* Wrapper around the generateLinearRDD method of LinearDataGenerator.
*/
def generateLinearRDDWrapper(
sc: JavaSparkContext,
nexamples: Int,
nfeatures: Int,
eps: Double,
nparts: Int,
intercept: Double): JavaRDD[LabeledPoint] = {
LinearDataGenerator.generateLinearRDD(
sc, nexamples, nfeatures, eps, nparts, intercept)
}
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,8 @@ private[mllib] object NumericParser {
}
} else if (token == ")") {
parsing = false
} else if (token.trim.isEmpty){
// ignore whitespaces between delim chars, e.g. ", ["
} else {
// expecting a number
items.append(parseDouble(token))
Expand Down
Loading

0 comments on commit 39f09ca

Please sign in to comment.