# Linear Regression

In [1]:
import os

from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression

In [2]:
# Build the configuration explicitly (same master and app name as before) and
# reuse the active SparkContext if one exists. Calling the SparkContext
# constructor a second time in the same kernel raises
# "Cannot run multiple SparkContexts at once", so re-running this cell used to fail.
# Note: when a context is already active, getOrCreate returns it and the
# configuration passed here is not applied.
spark_conf = SparkConf().setMaster("local").setAppName("sqlContext")
sc = SparkContext.getOrCreate(spark_conf)

# SQL entry point used below to read the LIBSVM file into a DataFrame.
sqc = SQLContext(sc)

In [3]:
# Directory holding the MLlib sample data files that ship with a Spark distribution.
# SPARK_HOME takes precedence so the notebook is not tied to one machine; the
# original install location is kept as the fallback when the variable is unset
# or empty.
spark_home = os.environ.get("SPARK_HOME") or "D:/ProgramFiles/Spark/spark-3.0.0-bin-hadoop2.7"

# Normalise Windows back-slashes and any trailing slash so that `path` always
# ends with "/data/mllib/" and file names can be appended with plain `+`.
path = spark_home.replace("\\", "/").rstrip("/") + "/data/mllib/"

In [5]:
# Read the LIBSVM-formatted sample file into a Spark DataFrame.
training_file = path + "sample_linear_regression_data.txt"
libsvm_reader = sqc.read.format("libsvm")
training = libsvm_reader.load(training_file)

# Collect the rows to the driver as a pandas DataFrame so the notebook renders them as a table.
training.toPandas()

Unnamed: 0,label,features
0,-9.490010,"(0.4551273600657362, 0.36644694351969087, -0.3..."
1,0.257782,"(0.8386555657374337, -0.1270180511534269, 0.49..."
2,-4.438870,"(0.5025608135349202, 0.14208069682973434, 0.16..."
3,-19.782763,"(-0.0388509668871313, -0.4166870051763918, 0.8..."
4,-7.966594,"(-0.06195495876886281, 0.6546448480299902, -0...."
...,...,...
496,-0.874349,"(-0.9087681208947878, -0.292625136739453, -0.3..."
497,-5.615144,"(-0.6688289820084299, -0.4623159855015393, 0.0..."
498,-0.182940,"(0.09377558075225512, 0.5774384503027374, -0.7..."
499,18.479681,"(0.9635517137863321, 0.9954507816218203, 0.119..."


In [6]:
# Configure the estimator: at most 10 iterations, regularisation strength 0.3,
# elastic-net mixing parameter 0.8.
lr = LinearRegression(
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)

# Train on the DataFrame loaded above.
lrModel = lr.fit(training)

# Learned parameters of the fitted model.
print("Coefficients: %s" % (lrModel.coefficients,))
print("Intercept: %s" % (lrModel.intercept,))

# Training-set summary: iteration count, objective history, per-row residuals
# and the overall fit metrics.
trainingSummary = lrModel.summary
print("numIterations: %d" % (trainingSummary.totalIterations,))
print("objectiveHistory: %s" % (trainingSummary.objectiveHistory,))
trainingSummary.residuals.show()
print("RMSE: %f" % (trainingSummary.rootMeanSquaredError,))
print("r2: %f" % (trainingSummary.r2,))

Coefficients: [0.0,0.32292516677405936,-0.3438548034562218,1.9156017023458414,0.05288058680386263,0.765962720459771,0.0,-0.15105392669186682,-0.21587930360904642,0.22025369188813426]
Intercept: 0.1598936844239736
numIterations: 7
objectiveHistory: [0.49999999999999994, 0.4967620357443381, 0.4936361664340463, 0.4936351537897608, 0.4936351214177871, 0.49363512062528014, 0.4936351206216114]
+--------------------+
|           residuals|
+--------------------+
|  -9.889232683103197|
|  0.5533794340053554|
|  -5.204019455758823|
| -20.566686715507508|
|    -9.4497405180564|
|  -6.909112502719486|
|  -10.00431602969873|
|   2.062397807050484|
|  3.1117508432954772|
| -15.893608229419382|
|  -5.036284254673026|
|   6.483215876994333|
|  12.429497299109002|
|  -20.32003219007654|
| -2.0049838218725005|
| -17.867901734183793|
|   7.646455887420495|
| -2.2653482182417406|
|-0.10308920436195645|
|  -1.380034070385301|
+--------------------+
only showing top 20 rows

RMSE: 10.189077
r2: 0.022861


In [None]:
# Shut down the SparkContext created at the top of the notebook.
# Cells that use `sc` or `sqc` cannot be re-run after this until the context is created again.
sc.stop()

## Credits & Links

- [Apache Spark MLlib guide: Classification and regression](http://spark.apache.org/docs/2.2.0/ml-classification-regression.html)