# Linear Regression using PySpark

In [3]:
import os
# Set before Spark is started so the launcher picks it up:
# run with a local[*] master through the pyspark-shell entry point.
os.environ["PYSPARK_SUBMIT_ARGS"] = "--master local[*] pyspark-shell"

import findspark
# Let findspark locate the Spark installation so `pyspark` can be imported below.
findspark.init()

import pyspark

In [4]:
# Obtain the SparkSession for this notebook (getOrCreate reuses an active one if present)
from pyspark.sql import SparkSession
spark = (
    SparkSession.builder
    .appName('lin_reg')
    .getOrCreate()
)

In [5]:
#import Linear Regression from spark's MLlib
from pyspark.ml.regression import LinearRegression

In [6]:
# Read the CSV into a DataFrame: the first row supplies the column names
# and the column types are inferred from the data.
df = spark.read.csv(
    'Linear_regression_dataset.csv',
    header=True,
    inferSchema=True,
)

In [7]:
# Confirm the dataset dimensions as a (row count, column count) tuple
print((
    df.count(),
    len(df.columns),
))

(1232, 6)


In [8]:
# Print each column's name, inferred type and nullability
df.printSchema()

root
 |-- var_1: integer (nullable = true)
 |-- var_2: integer (nullable = true)
 |-- var_3: integer (nullable = true)
 |-- var_4: double (nullable = true)
 |-- var_5: double (nullable = true)
 |-- output: double (nullable = true)



In [9]:
# Summary statistics (count, mean, stddev, min, max) for every column,
# printed without truncating the cell values
df.describe().show(n=5, truncate=False)

+-------+-----------------+-----------------+------------------+--------------------+--------------------+-------------------+
|summary|var_1            |var_2            |var_3             |var_4               |var_5               |output             |
+-------+-----------------+-----------------+------------------+--------------------+--------------------+-------------------+
|count  |1232             |1232             |1232              |1232                |1232                |1232               |
|mean   |715.0819805194806|715.0819805194806|80.90422077922078 |0.3263311688311693  |0.25927272727272715 |0.39734172077922014|
|stddev |91.5342940441652 |93.07993263118064|11.458139049993724|0.015012772334166148|0.012907228928000298|0.03326689862173776|
|min    |463              |472              |40                |0.277               |0.214               |0.301              |
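|max    |1009             |1103             |116               |0.373               |0.294               |0.491              |
+-------+-----------------+-----------------+------------------+--------------------+--------------------+-------------------+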

In [10]:
# Peek at the first three rows (returned as a list of Row objects)
df.head(3)

[Row(var_1=734, var_2=688, var_3=81, var_4=0.328, var_5=0.259, output=0.418),
 Row(var_1=700, var_2=600, var_3=94, var_4=0.32, var_5=0.247, output=0.389),
 Row(var_1=712, var_2=705, var_3=93, var_4=0.311, var_5=0.247, output=0.417)]

In [11]:
#import corr function from pyspark functions
from pyspark.sql.functions import corr

In [12]:
# Check the correlation between the var_1 input and the output (target) column
df.select(corr('var_1','output')).show()

+-------------------+
|corr(var_1, output)|
+-------------------+
| 0.9187399607627283|
+-------------------+



Next, we combine all input features into a single vector using Spark’s
VectorAssembler. For each row, it produces one vector-valued feature that
holds that row’s input values. So, instead of five separate input columns,
we end up with a single feature vector column.

In [13]:
#import vectorassembler to create dense vectors
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler

In [14]:
# List all column names so the input columns for the feature vector can be chosen
df.columns

['var_1', 'var_2', 'var_3', 'var_4', 'var_5', 'output']

In [15]:
# Configure the assembler that packs the five input columns
# into a single 'features' vector column
vec_assmebler = VectorAssembler(
    inputCols=['var_1', 'var_2', 'var_3', 'var_4', 'var_5'],
    outputCol='features',
)

In [16]:
# Apply the assembler to df; the result carries the original columns
# plus the new 'features' vector column
features_df = vec_assmebler.transform(df)

As we can see below, we now have an additional column (‘features’) that
contains a single dense vector holding all of the inputs.

In [17]:
# Confirm that the 'features' vector column is present in the schema
features_df.printSchema()

root
 |-- var_1: integer (nullable = true)
 |-- var_2: integer (nullable = true)
 |-- var_3: integer (nullable = true)
 |-- var_4: double (nullable = true)
 |-- var_5: double (nullable = true)
 |-- output: double (nullable = true)
 |-- features: vector (nullable = true)



In [18]:
# Show the assembled feature vectors for the first five rows, untruncated
features_df.select('features').show(n=5, truncate=False)

+------------------------------+
|features                      |
+------------------------------+
|[734.0,688.0,81.0,0.328,0.259]|
|[700.0,600.0,94.0,0.32,0.247] |
|[712.0,705.0,93.0,0.311,0.247]|
|[734.0,806.0,69.0,0.315,0.26] |
|[613.0,759.0,61.0,0.302,0.24] |
+------------------------------+
only showing top 5 rows



In [19]:
# Keep only what the model needs: the feature vector and the target column
model_df = features_df.select('features','output')

In [20]:
# Preview the first five rows of the modelling DataFrame, untruncated
model_df.show(n=5, truncate=False)

+------------------------------+------+
|features                      |output|
+------------------------------+------+
|[734.0,688.0,81.0,0.328,0.259]|0.418 |
|[700.0,600.0,94.0,0.32,0.247] |0.389 |
|[712.0,705.0,93.0,0.311,0.247]|0.417 |
|[734.0,806.0,69.0,0.315,0.26] |0.415 |
|[613.0,759.0,61.0,0.302,0.24] |0.378 |
+------------------------------+------+
only showing top 5 rows



In [21]:
# Dimensions of the modelling DataFrame as (row count, column count)
print((
    model_df.count(),
    len(model_df.columns),
))

(1232, 2)


### Split Data - Train & Test sets


In [22]:
# Split the data roughly 70/30 into training and test sets.
# A fixed seed keeps the split -- and therefore every count and metric
# computed from it -- reproducible across "Restart & Run All"; without it
# each run trains and evaluates on a different random sample.
train_df, test_df = model_df.randomSplit([0.7, 0.3], seed=42)

In [23]:
# Dimensions of the training split as (row count, column count)
print((
    train_df.count(),
    len(train_df.columns),
))

(847, 2)


In [24]:
# Dimensions of the test split as (row count, column count)
print((
    test_df.count(),
    len(test_df.columns),
))

(385, 2)


In [25]:
# Summary statistics of the training split
train_df.describe().show()

+-------+------------------+
|summary|            output|
+-------+------------------+
|  count|               847|
|   mean|0.3973494687131049|
| stddev|0.0328662767599641|
|    min|             0.311|
|    max|             0.491|
+-------+------------------+



## Build Linear Regression Model 

In [26]:
# Set up the (not yet fitted) linear regression estimator:
# inputs come from the 'features' vector column, the label is 'output'
lin_Reg = LinearRegression(
    featuresCol='features',
    labelCol='output',
)

In [27]:
# Fit the linear regression estimator on the training split
lr_model = lin_Reg.fit(train_df)

In [28]:
# Intercept term of the fitted model
lr_model.intercept

0.19214760476722678

$y = B_0 + B_1 \times X_1 + B_2 \times X_2 + B_3 \times X_3 + B_4 \times X_4 + B_5 \times X_5$

$B_0$ is the intercept and $B_1$ to $B_5$ are the coefficients.

In [29]:
# Coefficients of the fitted model (one per entry of the input feature vector)
print(lr_model.coefficients)

[0.000344461766975692,5.4590697647819076e-05,0.00018217813785411933,-0.6756913262114124,0.48514719933198297]


In [30]:
# Evaluate the fitted model on the training split. Despite the variable name,
# the result is used below as a metrics summary (meanSquaredError, r2).
training_predictions=lr_model.evaluate(train_df)

In [31]:
# Mean squared error on the training split
training_predictions.meanSquaredError

0.00014556910095739974

In [32]:
# Coefficient of determination (R^2) on the training split
training_predictions.r2

0.8650784782003608

In [33]:
# Evaluate the fitted model on the held-out test split; the result is used
# below for residuals and error metrics
test_results=lr_model.evaluate(test_df)

In [34]:
# Show the first 10 residuals from the test-set evaluation
test_results.residuals.show(10)

+--------------------+
|           residuals|
+--------------------+
|-0.01345140677213058|
|0.007014098603287111|
|-0.00136208151862...|
|0.012969045643907773|
|-0.00586832463586...|
|-0.00497101733159...|
|-0.01112778358444838|
|0.007325719980361078|
|0.011234270318497774|
|-0.00940719116239...|
+--------------------+
only showing top 10 rows



In [35]:
# Coefficient of determination (R^2) on the test split
test_results.r2

0.8775000335274008

In [36]:
# Root mean squared error on the test split
test_results.rootMeanSquaredError

0.011945744867855568

In [37]:
# Mean squared error on the test split
test_results.meanSquaredError

0.00014270082044789766