## Import the Libraries

In [None]:
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
conf = pyspark.SparkConf().setAppName("Linear Regression")
sc = SparkContext.getOrCreate(conf=conf)

## Download the Dataset

In [4]:
!wget https://raw.githubusercontent.com/scikit-learn/scikit-learn/master/sklearn/datasets/data/boston_house_prices.csv

--2020-06-17 20:57:12--  https://raw.githubusercontent.com/scikit-learn/scikit-learn/master/sklearn/datasets/data/boston_house_prices.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.124.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.124.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 34742 (34K) [text/plain]
Saving to: ‘boston_house_prices.csv’


2020-06-17 20:57:13 (585 KB/s) - ‘boston_house_prices.csv’ saved [34742/34742]



##  Load the Data in Spark DataFrame

In [3]:
df = spark.read.format('csv').options(header='true', inferschema='true').load('boston_house_prices.csv')
df.show()

+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+
|   CRIM|  ZN|INDUS|CHAS|  NOX|   RM|  AGE|   DIS|RAD|TAX|PTRATIO|     B|LSTAT|MEDV|
+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+
|0.00632|18.0| 2.31|   0|0.538|6.575| 65.2|  4.09|  1|296|   15.3| 396.9| 4.98|24.0|
|0.02731| 0.0| 7.07|   0|0.469|6.421| 78.9|4.9671|  2|242|   17.8| 396.9| 9.14|21.6|
|0.02729| 0.0| 7.07|   0|0.469|7.185| 61.1|4.9671|  2|242|   17.8|392.83| 4.03|34.7|
|0.03237| 0.0| 2.18|   0|0.458|6.998| 45.8|6.0622|  3|222|   18.7|394.63| 2.94|33.4|
|0.06905| 0.0| 2.18|   0|0.458|7.147| 54.2|6.0622|  3|222|   18.7| 396.9| 5.33|36.2|
|0.02985| 0.0| 2.18|   0|0.458| 6.43| 58.7|6.0622|  3|222|   18.7|394.12| 5.21|28.7|
|0.08829|12.5| 7.87|   0|0.524|6.012| 66.6|5.5605|  5|311|   15.2| 395.6|12.43|22.9|
|0.14455|12.5| 7.87|   0|0.524|6.172| 96.1|5.9505|  5|311|   15.2| 396.9|19.15|27.1|
|0.21124|12.5| 7.87|   0|0.524|5.631|100.0|6.0821|  5|311|   15.2

## Exploring the Dataset  

In [4]:
df.describe()

DataFrame[summary: string, CRIM: string, ZN: string, INDUS: string, CHAS: string, NOX: string, RM: string, AGE: string, DIS: string, RAD: string, TAX: string, PTRATIO: string, B: string, LSTAT: string, MEDV: string]

## Processing the Dataset

In [5]:
vectorAssembler = VectorAssembler(inputCols = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT'], outputCol = 'features')
transformed_df = vectorAssembler.transform(df)
transformed_df.show()


+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+--------------------+
|   CRIM|  ZN|INDUS|CHAS|  NOX|   RM|  AGE|   DIS|RAD|TAX|PTRATIO|     B|LSTAT|MEDV|            features|
+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+--------------------+
|0.00632|18.0| 2.31|   0|0.538|6.575| 65.2|  4.09|  1|296|   15.3| 396.9| 4.98|24.0|[0.00632,18.0,2.3...|
|0.02731| 0.0| 7.07|   0|0.469|6.421| 78.9|4.9671|  2|242|   17.8| 396.9| 9.14|21.6|[0.02731,0.0,7.07...|
|0.02729| 0.0| 7.07|   0|0.469|7.185| 61.1|4.9671|  2|242|   17.8|392.83| 4.03|34.7|[0.02729,0.0,7.07...|
|0.03237| 0.0| 2.18|   0|0.458|6.998| 45.8|6.0622|  3|222|   18.7|394.63| 2.94|33.4|[0.03237,0.0,2.18...|
|0.06905| 0.0| 2.18|   0|0.458|7.147| 54.2|6.0622|  3|222|   18.7| 396.9| 5.33|36.2|[0.06905,0.0,2.18...|
|0.02985| 0.0| 2.18|   0|0.458| 6.43| 58.7|6.0622|  3|222|   18.7|394.12| 5.21|28.7|[0.02985,0.0,2.18...|
|0.08829|12.5| 7.87|   0|0.524|6.012| 66.6|5.5

In [6]:
final_df=transformed_df.select(['features','MEDV'])
splitting_df = final_df.randomSplit([0.8, 0.2])
training_df = splitting_df[0]
testing_df = splitting_df[1]

## Model Implementation and Fitting 

In [9]:
linearregression = LinearRegression\
(featuresCol = 'features', labelCol='MEDV', maxIter=10)

model = linearregression.fit(training_df)
model

LinearRegression_48a6b1b0f3e2714bc7e5

## Model Prediction 

In [29]:
prediction = model.transform(testing_df)
prediction.select("prediction","MEDV","features").show(10)

+------------------+----+--------------------+
|        prediction|MEDV|            features|
+------------------+----+--------------------+
|30.168370914353503|32.7|[0.01301,35.0,1.5...|
|32.107781977964635|31.1|[0.02187,60.0,2.9...|
|  25.5393895881433|28.7|[0.02985,0.0,2.18...|
| 29.60036487940278|34.9|[0.0315,95.0,1.47...|
|33.755567691479506|28.5|[0.03502,80.0,4.9...|
|29.036760911431166|22.0|[0.03537,34.0,6.0...|
|26.171482129673656|24.8|[0.03659,25.0,4.8...|
|27.259133540676046|23.2|[0.03871,52.5,5.3...|
| 20.89884891981159|21.1|[0.03961,0.0,5.19...|
| 26.90435038034048|24.8|[0.04297,52.5,5.3...|
+------------------+----+--------------------+
only showing top 10 rows

