In [1]:
import findspark
findspark.init('/usr/local/spark')
import pyspark

In [2]:
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.appName("Python Linear Regression example").getOrCreate()

In [4]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml import Pipeline
from pyspark.sql.functions import *

In [5]:
data = spark.read.load("linregdata1.csv", format="csv", sep=",", inferSchema="true", header="true")
data.printSchema()

root
 |-- temperature: double (nullable = true)
 |-- exhaust_vacuum: double (nullable = true)
 |-- ambient_pressure: double (nullable = true)
 |-- relative_humidity: double (nullable = true)
 |-- energy_output: double (nullable = true)



In [6]:
data.describe()

DataFrame[summary: string, temperature: string, exhaust_vacuum: string, ambient_pressure: string, relative_humidity: string, energy_output: string]

In [7]:
features = ["temperature", "exhaust_vacuum", "ambient_pressure", "relative_humidity"]

In [8]:
lr_data = data.select(col("energy_output").alias("label"), *features)
lr_data.printSchema()

root
 |-- label: double (nullable = true)
 |-- temperature: double (nullable = true)
 |-- exhaust_vacuum: double (nullable = true)
 |-- ambient_pressure: double (nullable = true)
 |-- relative_humidity: double (nullable = true)



In [9]:
lr_data.show()

+------+-----------+--------------+----------------+-----------------+
| label|temperature|exhaust_vacuum|ambient_pressure|relative_humidity|
+------+-----------+--------------+----------------+-----------------+
|480.48|       8.34|         40.77|         1010.84|            90.01|
|445.75|      23.64|         58.49|          1011.4|             74.2|
|438.76|      29.74|          56.9|         1007.15|            41.91|
|453.09|      19.07|         49.69|         1007.22|            76.79|
|464.43|       11.8|         40.66|         1017.13|             97.2|
|470.96|      13.97|         39.16|         1016.05|             84.6|
|442.35|       22.1|         71.29|          1008.2|            75.38|
| 464.0|      14.47|         41.76|         1021.98|            78.41|
|428.77|      31.25|         69.51|         1010.25|            36.83|
|484.31|       6.77|         38.18|          1017.8|            81.13|
|435.29|      28.28|         68.67|         1006.36|             69.9|
|451.4

VectorAssembler is a transformer that combines a given list of columns into a single vector column.

In [10]:
vectorAssembler = VectorAssembler(inputCols=features, outputCol="unscaled_features")

In [11]:
va_data = vectorAssembler.transform(lr_data)

In [12]:
va_data.show(truncate=False)

+------+-----------+--------------+----------------+-----------------+---------------------------+
|label |temperature|exhaust_vacuum|ambient_pressure|relative_humidity|unscaled_features          |
+------+-----------+--------------+----------------+-----------------+---------------------------+
|480.48|8.34       |40.77         |1010.84         |90.01            |[8.34,40.77,1010.84,90.01] |
|445.75|23.64      |58.49         |1011.4          |74.2             |[23.64,58.49,1011.4,74.2]  |
|438.76|29.74      |56.9          |1007.15         |41.91            |[29.74,56.9,1007.15,41.91] |
|453.09|19.07      |49.69         |1007.22         |76.79            |[19.07,49.69,1007.22,76.79]|
|464.43|11.8       |40.66         |1017.13         |97.2             |[11.8,40.66,1017.13,97.2]  |
|470.96|13.97      |39.16         |1016.05         |84.6             |[13.97,39.16,1016.05,84.6] |
|442.35|22.1       |71.29         |1008.2          |75.38            |[22.1,71.29,1008.2,75.38]  |
|464.0 |14

StandardScaler transforms a dataset of Vector rows, normalizing each feature to have unit standard deviation or zero mean.
Uses 'withStd' by default i.e. scales the data to unit standard deviation.

In [13]:
standardScaler = StandardScaler(inputCol="unscaled_features", outputCol="features")

In [14]:
ss_model = standardScaler.fit(va_data)

In [15]:
ss_data = ss_model.transform(va_data)

In [16]:
ss_data.show(truncate=False)

+------+-----------+--------------+----------------+-----------------+---------------------------+-----------------------------------------------------------------------------+
|label |temperature|exhaust_vacuum|ambient_pressure|relative_humidity|unscaled_features          |features                                                                     |
+------+-----------+--------------+----------------+-----------------+---------------------------+-----------------------------------------------------------------------------+
|480.48|8.34       |40.77         |1010.84         |90.01            |[8.34,40.77,1010.84,90.01] |[1.1190915744403476,3.208242310929751,170.20993692880273,6.164955008688884]  |
|445.75|23.64      |58.49         |1011.4          |74.2             |[23.64,58.49,1011.4,74.2]  |[3.1721012973345104,4.602651281978933,170.30423233131958,5.082098229582438]  |
|438.76|29.74      |56.9          |1007.15         |41.91            |[29.74,56.9,1007.15,41.91] |[3.99062151365179

In [17]:
(training, test) = ss_data.randomSplit([.7, .3])

In [18]:
training.describe().show()

+-------+------------------+-----------------+------------------+------------------+------------------+
|summary|             label|      temperature|    exhaust_vacuum|  ambient_pressure| relative_humidity|
+-------+------------------+-----------------+------------------+------------------+------------------+
|  count|              6787|             6787|              6787|              6787|              6787|
|   mean| 454.4679416531598|19.61153676145574|54.279319286871676|1013.2649580079581| 73.36396051274491|
| stddev|17.104800476576067|7.452224011402169|12.712821706006975| 5.976528716571987|14.539095522286205|
|    min|            420.26|             1.81|             25.36|            992.89|             25.56|
|    max|            495.76|            37.11|             81.56|            1033.3|            100.16|
+-------+------------------+-----------------+------------------+------------------+------------------+



In [19]:
test.describe().show()

+-------+------------------+------------------+------------------+------------------+-----------------+
|summary|             label|       temperature|    exhaust_vacuum|  ambient_pressure|relative_humidity|
+-------+------------------+------------------+------------------+------------------+-----------------+
|  count|              2781|              2781|              2781|              2781|             2781|
|   mean|454.11380438691145|19.748104998202066| 54.37043869111828|1013.2447285149223|73.17479323984189|
| stddev|16.974822891921228| 7.453533923880616|12.697909515604232|  5.84668460809035|14.75026608961445|
|    min|            425.18|              2.34|             25.36|            993.31|            25.89|
|    max|            495.35|              35.1|             80.18|           1032.88|           100.13|
+-------+------------------+------------------+------------------+------------------+-----------------+



In [20]:
lr = LinearRegression(maxIter=10, regParam=.01)

In [21]:
lr_model = lr.fit(training)

Now that the linear regression model is built we can apply it on the test data using transform method.
Before that we can look at the characteristics of our model i.e. coefficients and other parameters.

In [22]:
lr_model.coefficients

DenseVector([-14.698, -3.0327, 0.3655, -2.3092])

In [23]:
lr_model.intercept

455.34227359104057

In [24]:
trainingSummary = lr_model.summary

In [25]:
trainingSummary.rootMeanSquaredError

4.582782318444667

In [26]:
trainingSummary.meanAbsoluteError

3.637714542125518

In [27]:
trainingSummary.meanSquaredError

21.001893778249077

In [28]:
trainingSummary.r2

0.92820628506889

In [29]:
prediction_df = lr_model.transform(test)

In [30]:
prediction_df.show(truncate=False)

+------+-----------+--------------+----------------+-----------------+---------------------------+---------------------------------------------------------------------------+------------------+
|label |temperature|exhaust_vacuum|ambient_pressure|relative_humidity|unscaled_features          |features                                                                   |prediction        |
+------+-----------+--------------+----------------+-----------------+---------------------------+---------------------------------------------------------------------------+------------------+
|425.18|32.84      |68.14         |1003.59         |43.88            |[32.84,68.14,1003.59,43.88]|[4.406590803911393,5.362021855941948,168.98914823550427,3.005424128222067] |429.13826616081616|
|425.19|31.92      |69.13         |1000.77         |58.91            |[31.92,69.13,1000.77,58.91]|[4.283141853253705,5.439926194617946,168.514303529973,4.034857233216999]   |428.16576832598963|
|425.21|31.12      |67.69     

In [31]:
prediction_df.select("label","prediction").show(truncate=False)

+------+------------------+
|label |prediction        |
+------+------------------+
|425.18|429.13826616081616|
|425.19|428.16576832598963|
|425.21|431.70244872064245|
|425.28|429.88739867714946|
|425.48|430.2824289837105 |
|425.5 |428.83455018495096|
|425.64|433.02549518339026|
|425.68|428.75242548286417|
|425.71|429.95591538965556|
|425.75|430.07927817097726|
|425.82|432.9413927909671 |
|425.89|428.9186701523052 |
|426.1 |429.12965388818736|
|426.15|430.2397100337043 |
|426.22|424.9797145142233 |
|426.25|431.3288773027631 |
|426.3 |424.9159209328144 |
|426.31|429.9240351537835 |
|426.35|432.3810663066821 |
|426.37|428.47579240196154|
+------+------------------+
only showing top 20 rows



In [32]:
from pyspark.ml.evaluation import RegressionEvaluator
eval = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")

In [33]:
rmse = eval.evaluate(prediction_df)
print("RMSE: %.3f" % rmse)

RMSE: 4.495


In [34]:
mse = eval.evaluate(prediction_df, {eval.metricName: "mse"})
print("MSE: %.3f" % mse)

MSE: 20.202


In [35]:
mae = eval.evaluate(prediction_df, {eval.metricName: "mae"})
print("MAE: %.3f" % mae)

MAE: 3.596


In [36]:
r2 = eval.evaluate(prediction_df, {eval.metricName: "r2"})
print("r2: %.3f" %r2)

r2: 0.930
