<a href="https://colab.research.google.com/github/panashematsaudza/PySpark-House-Price-Pregression/blob/master/PySpark_Boston_House_Price_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz

In [0]:
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark


In [0]:
import os

# Tell the Spark launcher where the JDK and the unpacked Spark distribution live.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-2.4.5-bin-hadoop2.7",
    }
)

In [0]:
import findspark

# Locate the Spark installation (via SPARK_HOME) so that pyspark can be imported.
findspark.init()

from pyspark.sql import SparkSession

# Get the active session, or start one running locally on all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

In [0]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler

# Read the housing CSV: the first row supplies the column names and Spark
# infers each column's type from the data.
dataset = spark.read.csv(
    'BostonHousing.csv',
    header=True,
    inferSchema=True,
)

In [37]:
# Print the column names and inferred types of the loaded DataFrame
dataset.printSchema()

root
 |-- crim: double (nullable = true)
 |-- zn: double (nullable = true)
 |-- indus: double (nullable = true)
 |-- chas: integer (nullable = true)
 |-- nox: double (nullable = true)
 |-- rm: double (nullable = true)
 |-- age: double (nullable = true)
 |-- dis: double (nullable = true)
 |-- rad: integer (nullable = true)
 |-- tax: integer (nullable = true)
 |-- ptratio: double (nullable = true)
 |-- b: double (nullable = true)
 |-- lstat: double (nullable = true)
 |-- medv: double (nullable = true)



In [38]:
# Predictor columns, in the order they are packed into the feature vector
feature_columns = [
    'crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age',
    'dis', 'rad', 'tax', 'ptratio', 'b', 'lstat',
]

# Combine all predictors into a single vector column named 'Attributes'
assembler = VectorAssembler(inputCols=feature_columns, outputCol='Attributes')
output = assembler.transform(dataset)

# Keep only the model input (feature vector) and the target column (medv)
finalized_data = output.select("Attributes", "medv")
finalized_data.show()

+--------------------+----+
|          Attributes|medv|
+--------------------+----+
|[0.00632,18.0,2.3...|24.0|
|[0.02731,0.0,7.07...|21.6|
|[0.02729,0.0,7.07...|34.7|
|[0.03237,0.0,2.18...|33.4|
|[0.06905,0.0,2.18...|36.2|
|[0.02985,0.0,2.18...|28.7|
|[0.08829,12.5,7.8...|22.9|
|[0.14455,12.5,7.8...|27.1|
|[0.21124,12.5,7.8...|16.5|
|[0.17004,12.5,7.8...|18.9|
|[0.22489,12.5,7.8...|15.0|
|[0.11747,12.5,7.8...|18.9|
|[0.09378,12.5,7.8...|21.7|
|[0.62976,0.0,8.14...|20.4|
|[0.63796,0.0,8.14...|18.2|
|[0.62739,0.0,8.14...|19.9|
|[1.05393,0.0,8.14...|23.1|
|[0.7842,0.0,8.14,...|17.5|
|[0.80271,0.0,8.14...|20.2|
|[0.7258,0.0,8.14,...|18.2|
+--------------------+----+
only showing top 20 rows



In [0]:
# Split into training (80%) and testing (20%) data. The fixed seed keeps the
# split the same when the notebook is re-run, so the fitted model and the
# metrics below stay comparable between runs.
train_data, test_data = finalized_data.randomSplit([0.8, 0.2], seed=42)

regressor = LinearRegression(featuresCol='Attributes', labelCol='medv')

# Fit on the training split. Note that `regressor` is rebound here: from this
# line on it refers to the fitted model, not the unfitted estimator.
regressor = regressor.fit(train_data)



In [0]:
# Evaluate the fitted model on the held-out test split. Despite its name,
# `pred` is the object returned by evaluate(); the scored rows are read from
# its `.predictions` attribute in the cells below.
pred = regressor.evaluate(test_data)

In [41]:
# Display the first rows of the scored test set: feature vector, actual medv, and predicted value
pred.predictions.show()

+--------------------+----+------------------+
|          Attributes|medv|        prediction|
+--------------------+----+------------------+
|[0.01301,35.0,1.5...|32.7|30.389312922155497|
|[0.01432,100.0,1....|31.6| 33.69896935507755|
|[0.01439,60.0,2.9...|29.1|31.438398274365923|
|[0.01501,90.0,1.2...|50.0| 43.86016081349476|
|[0.01951,17.5,1.3...|33.0| 23.73405820595761|
|[0.03041,0.0,5.19...|18.5| 19.42756575950385|
|[0.03306,0.0,5.19...|20.6|22.074845604337217|
|[0.03537,34.0,6.0...|22.0|28.531108147198992|
|[0.03578,20.0,3.3...|45.4| 38.62209896341011|
|[0.03615,80.0,4.9...|27.9|31.988093728155874|
|[0.04544,0.0,3.24...|19.8|20.973572515545346|
|[0.04666,80.0,1.5...|30.3| 32.50592033278573|
|[0.04684,0.0,3.41...|22.6| 26.71418137621392|
|[0.04981,21.0,5.6...|23.4|23.830975345530533|
|[0.05059,0.0,4.49...|23.9|24.372981396710564|
|[0.05083,0.0,5.19...|22.2|22.439665496302418|
|[0.05188,0.0,4.49...|22.5|21.533916841506922|
|[0.05479,33.0,2.1...|28.4| 30.46075854029526|
|[0.05561,70.

In [42]:
# Weights learned by the fitted model for the entries of the feature vector
coeff = regressor.coefficients
# Constant (bias) term of the fitted model
intr = regressor.intercept

# Operands are wrapped in one-element tuples so each is always treated as a
# single value by the % operator.
print("The coefficient of the model is : %a" % (coeff,))
print("The Intercept of the model is : %f" % (intr,))

The coefficient of the model is : DenseVector([-0.1258, 0.0538, 0.0537, 1.9086, -14.3937, 3.7502, 0.01, -1.3239, 0.2903, -0.0133, -0.8937, 0.0086, -0.5759])
The Intercept of the model is : 33.571265


In [43]:
from pyspark.ml.evaluation import RegressionEvaluator

# Named `evaluator` rather than `eval` so the Python builtin eval() is not
# shadowed for the rest of the kernel session. The evaluator compares the
# "medv" label column with the "prediction" column; rmse is its default metric
# and the other metrics are selected per call through a param override.
evaluator = RegressionEvaluator(labelCol="medv", predictionCol="prediction", metricName="rmse")
# Root Mean Square Error
rmse = evaluator.evaluate(pred.predictions)
print("RMSE: %.3f" % rmse)
# Mean Square Error
mse = evaluator.evaluate(pred.predictions, {evaluator.metricName: "mse"})
print("MSE: %.3f" % mse)
# Mean Absolute Error
mae = evaluator.evaluate(pred.predictions, {evaluator.metricName: "mae"})
print("MAE: %.3f" % mae)
# r2 - coefficient of determination
r2 = evaluator.evaluate(pred.predictions, {evaluator.metricName: "r2"})
print("r2: %.3f" % r2)

RMSE: 5.486
MSE: 30.100
MAE: 3.712
r2: 0.672
