In [1]:
import os

import findspark

# Locate the Spark installation before pyspark is imported. SPARK_HOME wins
# when it is set, so the notebook runs on machines with a different layout;
# otherwise fall back to the location this notebook was originally written for.
findspark.init(os.environ.get("SPARK_HOME", "/usr/local/spark"))
import pyspark

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Start a Spark session for this notebook, or reuse one that already exists.
session_builder = SparkSession.builder.appName("Pyspark Linear Regression example")
spark = session_builder.getOrCreate()

In [4]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml import Pipeline
from pyspark.sql.functions import *

In [5]:
# Read the comma-separated file, treating the first line as column names and
# letting Spark infer each column's type.
csv_path = "/home/hduser/Downloads/sharedfolder/linregdata1.csv"
data = spark.read.load(
    csv_path,
    format="csv",
    sep=',',
    inferSchema='true',
    header="true",
)


In [6]:
# Print the leading rows of the loaded data as a quick sanity check.
data.show()

+-----------+--------------+----------------+-----------------+-------------+
|temperature|exhaust_vacuum|ambient_pressure|relative_humidity|energy_output|
+-----------+--------------+----------------+-----------------+-------------+
|       8.34|         40.77|         1010.84|            90.01|       480.48|
|      23.64|         58.49|          1011.4|             74.2|       445.75|
|      29.74|          56.9|         1007.15|            41.91|       438.76|
|      19.07|         49.69|         1007.22|            76.79|       453.09|
|       11.8|         40.66|         1017.13|             97.2|       464.43|
|      13.97|         39.16|         1016.05|             84.6|       470.96|
|       22.1|         71.29|          1008.2|            75.38|       442.35|
|      14.47|         41.76|         1021.98|            78.41|        464.0|
|      31.25|         69.51|         1010.25|            36.83|       428.77|
|       6.77|         38.18|          1017.8|            81.13| 

In [7]:
# Echo the DataFrame itself: the repr lists every column with its inferred type.
data

DataFrame[temperature: double, exhaust_vacuum: double, ambient_pressure: double, relative_humidity: double, energy_output: double]

In [8]:
# Predictor columns, in the order they will be packed into the feature vector.
features = [
    "temperature",
    "exhaust_vacuum",
    "ambient_pressure",
    "relative_humidity",
]

In [9]:
# Put the regression target first, renamed to "label", followed by the predictors.
label_column = col("energy_output").alias("label")
lr_data = data.select(label_column, *features)

In [10]:
# Check that the target now appears as "label" ahead of the predictors (first 5 rows).
lr_data.show(5)

+------+-----------+--------------+----------------+-----------------+
| label|temperature|exhaust_vacuum|ambient_pressure|relative_humidity|
+------+-----------+--------------+----------------+-----------------+
|480.48|       8.34|         40.77|         1010.84|            90.01|
|445.75|      23.64|         58.49|          1011.4|             74.2|
|438.76|      29.74|          56.9|         1007.15|            41.91|
|453.09|      19.07|         49.69|         1007.22|            76.79|
|464.43|       11.8|         40.66|         1017.13|             97.2|
+------+-----------+--------------+----------------+-----------------+
only showing top 5 rows



In [11]:
# Transformer that packs the predictor columns into a single vector column.
vectorAssembler = VectorAssembler(
    inputCols=features,
    outputCol="unscaled_features",
)

In [12]:
# Append the assembled "unscaled_features" vector column to the selected data.
va_data = vectorAssembler.transform(lr_data)

In [13]:
# Spot-check the new vector column on the first 2 rows.
va_data.show(2)

+------+-----------+--------------+----------------+-----------------+--------------------+
| label|temperature|exhaust_vacuum|ambient_pressure|relative_humidity|   unscaled_features|
+------+-----------+--------------+----------------+-----------------+--------------------+
|480.48|       8.34|         40.77|         1010.84|            90.01|[8.34,40.77,1010....|
|445.75|      23.64|         58.49|          1011.4|             74.2|[23.64,58.49,1011...|
+------+-----------+--------------+----------------+-----------------+--------------------+
only showing top 2 rows



In [14]:
# Scaler that reads the assembled vector and writes the result to "features",
# using the estimator's default settings.
standardScaler = StandardScaler(
    inputCol="unscaled_features",
    outputCol="features",
)

In [15]:
# Learn the scaling statistics from the assembled data.
# NOTE(review): this fits on every row, and the train/test split only happens
# afterwards, so statistics from the eventual test rows influence the scaling.
# Consider splitting first and fitting the scaler on the training rows only.
ss_model = standardScaler.fit(va_data)

In [16]:
# Apply the fitted scaler, adding the scaled "features" column.
ss_data = ss_model.transform(va_data)

In [17]:
# Compare the raw and scaled vectors side by side (first 4 rows).
ss_data.show(4)

+------+-----------+--------------+----------------+-----------------+--------------------+--------------------+
| label|temperature|exhaust_vacuum|ambient_pressure|relative_humidity|   unscaled_features|            features|
+------+-----------+--------------+----------------+-----------------+--------------------+--------------------+
|480.48|       8.34|         40.77|         1010.84|            90.01|[8.34,40.77,1010....|[1.11909157444034...|
|445.75|      23.64|         58.49|          1011.4|             74.2|[23.64,58.49,1011...|[3.17210129733451...|
|438.76|      29.74|          56.9|         1007.15|            41.91|[29.74,56.9,1007....|[3.99062151365179...|
|453.09|      19.07|         49.69|         1007.22|            76.79|[19.07,49.69,1007...|[2.55888205330664...|
+------+-----------+--------------+----------------+-----------------+--------------------+--------------------+
only showing top 4 rows



In [18]:
# Hold out roughly 30% of the rows for evaluation. The fixed seed makes the
# split repeatable across kernel restarts, and with it the row counts, fitted
# coefficients and error metrics that follow. Without a seed every run draws
# a different partition.
(training, test) = ss_data.randomSplit([0.7, 0.3], seed=42)

In [19]:
# Number of rows that landed in the training split.
training.count()

6646

In [20]:
# Number of rows that landed in the test split.
test.count()

2922

In [21]:
# Regularised linear regression estimator. The feature and label column names
# are spelled out (they match the estimator's defaults) so the link to the
# columns prepared earlier is visible.
lr = LinearRegression(
    featuresCol="features",
    labelCol="label",
    maxIter=10,
    regParam=0.1,
)

In [22]:
# Fit the regression on the training split only.
lr_model = lr.fit(training)

In [23]:
# Fitted weights, one per entry of the feature vector.
lr_model.coefficients


DenseVector([-14.438, -3.1493, 0.4324, -2.1898])

In [24]:
# Fitted intercept term.
lr_model.intercept

443.1685661200362

In [25]:
# Summary of the fit; the metrics read from it below describe the training
# data, not the held-out test split.
trainingSummary = lr_model.summary

In [26]:
# Root mean squared error from the training summary.
trainingSummary.rootMeanSquaredError

4.545732135762056

In [27]:
# Mean absolute error from the training summary.
trainingSummary.meanAbsoluteError

3.6366708586713754

In [28]:
# Mean squared error from the training summary.
trainingSummary.meanSquaredError


20.663680650099863

In [29]:
# Score the held-out test split; the result gains a "prediction" column.
prediction_df = lr_model.transform(test)

In [30]:
# Show predictions next to the true labels for a visual check.
prediction_df.show()

+------+-----------+--------------+----------------+-----------------+--------------------+--------------------+------------------+
| label|temperature|exhaust_vacuum|ambient_pressure|relative_humidity|   unscaled_features|            features|        prediction|
+------+-----------+--------------+----------------+-----------------+--------------------+--------------------+------------------+
|420.26|      24.27|         63.87|         1018.88|            53.96|[24.27,63.87,1018...|[3.25663699180662...| 446.4195485172035|
|425.14|      29.67|         71.98|         1005.16|            67.75|[29.67,71.98,1005...|[3.98122865871044...|  430.880808487506|
|425.14|      31.93|         72.58|          1006.9|            56.27|[31.93,72.58,1006...|[4.28448368967389...| 428.2022130231028|
|425.17|      32.66|         73.68|         1014.64|            40.88|[32.66,73.68,1014...|[4.38243774834793...|429.38716278980024|
|425.18|      32.84|         68.14|         1003.59|            43.88|[32.84