In [1]:
from __future__ import print_function

from pyspark.ml.regression import LinearRegression

from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors

In [5]:
# Start (or reuse) the Spark session that drives this notebook.
spark = (
    SparkSession.builder
    .appName("LinearRegression")
    .getOrCreate()
)


def _to_labeled_point(line):
    """Turn one "label,feature" text line into a (label, DenseVector) pair."""
    fields = line.split(",")
    return (float(fields[0]), Vectors.dense(float(fields[1])))


# Read the raw text file as an RDD of lines, then parse every line into the
# (label, features-vector) layout used by the rest of the notebook.
inputLines = spark.sparkContext.textFile("/home/jovyan/work/linear-regression-example/regression.txt")
data = inputLines.map(_to_labeled_point)
data.take(10)

[(-1.74, DenseVector([1.66])),
 (1.24, DenseVector([-1.18])),
 (0.29, DenseVector([-0.4])),
 (-0.13, DenseVector([0.09])),
 (-0.39, DenseVector([0.38])),
 (-1.79, DenseVector([1.73])),
 (0.71, DenseVector([-0.77])),
 (1.39, DenseVector([-1.48])),
 (1.15, DenseVector([-1.43])),
 (0.13, DenseVector([-0.07]))]

In [6]:
# Attach column names to the (label, features) pairs by building a DataFrame
# from the RDD, then preview the first few rows.
colNames = ["label", "features"]
df = spark.createDataFrame(data, schema=colNames)
df.show(n=5)

+-----+--------+
|label|features|
+-----+--------+
|-1.74|  [1.66]|
| 1.24| [-1.18]|
| 0.29|  [-0.4]|
|-0.13|  [0.09]|
|-0.39|  [0.38]|
+-----+--------+
only showing top 5 rows



In [12]:
# Hold out roughly 20% of the rows for testing and train on the other 80%.
# A fixed seed keeps the random split repeatable for the same input data, so
# the fitted model and the reported error do not drift from run to run.
trainTest = df.randomSplit([0.8, 0.2], seed=42)
trainingDF, testDF = trainTest

<class 'list'>
DataFrame[label: double, features: vector]


In [8]:
# Configure the linear regression estimator (nothing is trained yet).
lir = LinearRegression(
    maxIter=10,            # cap on optimizer iterations
    regParam=0.3,          # regularization strength
    elasticNetParam=0.8,   # elastic-net mix: 0 is pure L2, 1 is pure L1
)

In [9]:
# Fit the estimator on the training split; the result is the trained model.
trained_algo = lir.fit(dataset=trainingDF)

In [None]:
# Apply the trained model to the held-out rows to obtain predictions.
prediction_DF = trained_algo.transform(dataset=testDF)

In [None]:
# Preview the first five rows of the prediction DataFrame.
prediction_DF.show(n=5)

In [None]:
# Evaluate the trained model against the held-out test split.
score = trained_algo.evaluate(dataset=testDF)

In [None]:
# Report the root-mean-squared error from the test-set evaluation above.
print(score.rootMeanSquaredError)