diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
index ec6289a1acbe9..9de5e341f4b5b 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
@@ -26,7 +26,7 @@ import org.apache.spark.Logging
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.ml.PredictorParams
 import org.apache.spark.ml.param.ParamMap
-import org.apache.spark.ml.param.shared.{HasElasticNetParam, HasMaxIter, HasRegParam, HasTol}
+import org.apache.spark.ml.param.shared.{HasElasticNetParam, HasFitIntercept, HasMaxIter, HasRegParam, HasTol}
 import org.apache.spark.ml.util.Identifiable
 import org.apache.spark.mllib.linalg.{Vector, Vectors}
 import org.apache.spark.mllib.linalg.BLAS._
@@ -42,7 +42,7 @@ import org.apache.spark.util.StatCounter
  */
 private[regression] trait LinearRegressionParams extends PredictorParams
     with HasRegParam with HasElasticNetParam with HasMaxIter with HasTol
-    with HasOptionalIntercept
+    with HasFitIntercept
 
 /**
  * :: Experimental ::
@@ -122,8 +122,8 @@ class LinearRegression(override val uid: String)
       })
 
     val numFeatures = summarizer.mean.size
-    val yMean = if (hasIntercept) statCounter.mean else 0.0
-    val yStd = if (hasIntercept) math.sqrt(statCounter.variance) else
+    val yMean = if ($(fitIntercept)) statCounter.mean else 0.0
+    val yStd = math.sqrt(statCounter.variance)
       // look at glmnet6.m L761 maaaybe that has info
 
     // If the yStd is zero, then the intercept is yMean with zero weights;
@@ -135,7 +135,11 @@ class LinearRegression(override val uid: String)
       return new LinearRegressionModel(uid, Vectors.sparse(numFeatures, Seq()), yMean)
     }
 
-    val featuresMean = summarizer.mean.toArray
+    val featuresMean = if ($(fitIntercept)) {
+      summarizer.mean.toArray
+    } else {
+      new Array[Double](numFeatures)
+    }
     val featuresStd = summarizer.variance.toArray.map(math.sqrt)
 
     // Since we implicitly do the feature scaling when we compute the cost function
@@ -235,6 +239,7 @@ class LinearRegressionModel private[ml] (
  * See this discussion for detail.
  * http://stats.stackexchange.com/questions/13617/how-is-the-intercept-computed-in-glmnet
  *
+ * The following derivation applies when training with the intercept enabled.
  * The objective function in the scaled space is given by
  * {{{
  * L = 1/2n ||\sum_i w_i(x_i - \bar{x_i}) / \hat{x_i} - (y - \bar{y}) / \hat{y}||^2,
@@ -242,7 +247,11 @@ class LinearRegressionModel private[ml] (
  * where \bar{x_i} is the mean of x_i, \hat{x_i} is the standard deviation of x_i,
  * \bar{y} is the mean of label, and \hat{y} is the standard deviation of label.
  *
- * This can be rewritten as
+ * If we are training without an intercept (that is, the fit is forced through the origin),
+ * we can use the same equation, except that \bar{y} and \bar{x_i} are 0 instead of the
+ * respective means.
+ *
+ * With the intercept, this can be rewritten as
  * {{{
  * L = 1/2n ||\sum_i (w_i/\hat{x_i})x_i - \sum_i (w_i/\hat{x_i})\bar{x_i} - y / \hat{y}
  *     + \bar{y} / \hat{y}||^2
@@ -256,6 +265,7 @@ class LinearRegressionModel private[ml] (
  * \sum_i w_i^\prime x_i - y / \hat{y} + offset
  * }}}
  *
+ *
  * Note that the effective weights and offset don't depend on training dataset,
  * so they can be precomputed.
  *