In [None]:
#Step 1: Install Dependencies
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.3.0/spark-3.3.0-bin-hadoop3.tgz
!tar xf spark-3.3.0-bin-hadoop3.tgz
!pip install -q findspark

#Step 2: Add environment variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "spark-3.3.0-bin-hadoop3"

#Step 3: Initialize Pyspark
import findspark
findspark.init()

In [None]:
# Build a Spark session against the `local[*]` master and keep a handle on its
# SparkContext.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)
sc = spark.sparkContext
# Bare last expression: the notebook renders the SparkContext summary.
sc

In [None]:
# Upload the dataset from the local machine through the Colab file-upload helper.
# This cell only works inside Google Colab (it imports `google.colab`).
# NOTE(review): a later cell reads "nyc.csv" by name, so the uploaded file
# presumably has to be called exactly that — confirm.
from google.colab import files
uploaded = files.upload()

In [18]:
from pyspark.ml.regression import LinearRegression

In [19]:
from pyspark.sql import SparkSession

# NOTE(review): an earlier cell already builds a session with master
# "local[*]"; getOrCreate() here presumably hands back that same session, so
# this cell may be redundant — confirm before removing.
spark = (
    SparkSession.builder
    .appName("lr_example")
    .getOrCreate()
)

In [39]:
# Read the CSV with its first row as the header and with column types inferred
# from the data.
csv_path = "nyc.csv"
df = spark.read.csv(csv_path, header=True, inferSchema=True)

# Preview the loaded rows.
df.show()

+---+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+
|_c0|   crim|  zn|indus|chas|  nox|   rm|  age|   dis|rad|tax|ptratio| black|lstat|medv|
+---+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+
|  1|0.00632|18.0| 2.31|   0|0.538|6.575| 65.2|  4.09|  1|296|   15.3| 396.9| 4.98|24.0|
|  2|0.02731| 0.0| 7.07|   0|0.469|6.421| 78.9|4.9671|  2|242|   17.8| 396.9| 9.14|21.6|
|  3|0.02729| 0.0| 7.07|   0|0.469|7.185| 61.1|4.9671|  2|242|   17.8|392.83| 4.03|34.7|
|  4|0.03237| 0.0| 2.18|   0|0.458|6.998| 45.8|6.0622|  3|222|   18.7|394.63| 2.94|33.4|
|  5|0.06905| 0.0| 2.18|   0|0.458|7.147| 54.2|6.0622|  3|222|   18.7| 396.9| 5.33|36.2|
|  6|0.02985| 0.0| 2.18|   0|0.458| 6.43| 58.7|6.0622|  3|222|   18.7|394.12| 5.21|28.7|
|  7|0.08829|12.5| 7.87|   0|0.524|6.012| 66.6|5.5605|  5|311|   15.2| 395.6|12.43|22.9|
|  8|0.14455|12.5| 7.87|   0|0.524|6.172| 96.1|5.9505|  5|311|   15.2| 396.9|19.15|27.1|
|  9|0.21124|12.5| 7.

In [21]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [24]:
# List the column names of the loaded DataFrame, to pick feature and label
# columns for the model.
df.columns

['_c0',
 'crim',
 'zn',
 'indus',
 'chas',
 'nox',
 'rm',
 'age',
 'dis',
 'rad',
 'tax',
 'ptratio',
 'black',
 'lstat',
 'medv']

In [40]:
# Predictor columns: everything in the dataset except the `_c0` row index and
# `medv`, which is used as the regression label further down.
feature_columns = [
    "crim", "zn", "indus", "chas", "nox", "rm", "age",
    "dis", "rad", "tax", "ptratio", "black", "lstat",
]

# Packs the predictor columns into a single vector column named "features".
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

In [41]:
# Apply the assembler so each row gains the "features" vector column.
output = assembler.transform(df)

# Show just the assembled vectors to sanity-check them.
features_preview = output.select("features")
features_preview.show()

+--------------------+
|            features|
+--------------------+
|[0.00632,18.0,2.3...|
|[0.02731,0.0,7.07...|
|[0.02729,0.0,7.07...|
|[0.03237,0.0,2.18...|
|[0.06905,0.0,2.18...|
|[0.02985,0.0,2.18...|
|[0.08829,12.5,7.8...|
|[0.14455,12.5,7.8...|
|[0.21124,12.5,7.8...|
|[0.17004,12.5,7.8...|
|[0.22489,12.5,7.8...|
|[0.11747,12.5,7.8...|
|[0.09378,12.5,7.8...|
|[0.62976,0.0,8.14...|
|[0.63796,0.0,8.14...|
|[0.62739,0.0,8.14...|
|[1.05393,0.0,8.14...|
|[0.7842,0.0,8.14,...|
|[0.80271,0.0,8.14...|
|[0.7258,0.0,8.14,...|
+--------------------+
only showing top 20 rows



In [42]:
# Preview the assembled DataFrame: the original columns plus the new
# "features" vector column.
output.show()

+---+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+--------------------+
|_c0|   crim|  zn|indus|chas|  nox|   rm|  age|   dis|rad|tax|ptratio| black|lstat|medv|            features|
+---+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+--------------------+
|  1|0.00632|18.0| 2.31|   0|0.538|6.575| 65.2|  4.09|  1|296|   15.3| 396.9| 4.98|24.0|[0.00632,18.0,2.3...|
|  2|0.02731| 0.0| 7.07|   0|0.469|6.421| 78.9|4.9671|  2|242|   17.8| 396.9| 9.14|21.6|[0.02731,0.0,7.07...|
|  3|0.02729| 0.0| 7.07|   0|0.469|7.185| 61.1|4.9671|  2|242|   17.8|392.83| 4.03|34.7|[0.02729,0.0,7.07...|
|  4|0.03237| 0.0| 2.18|   0|0.458|6.998| 45.8|6.0622|  3|222|   18.7|394.63| 2.94|33.4|[0.03237,0.0,2.18...|
|  5|0.06905| 0.0| 2.18|   0|0.458|7.147| 54.2|6.0622|  3|222|   18.7| 396.9| 5.33|36.2|[0.06905,0.0,2.18...|
|  6|0.02985| 0.0| 2.18|   0|0.458| 6.43| 58.7|6.0622|  3|222|   18.7|394.12| 5.21|28.7|[0.02985,0.0,2.18...|
|  7|0.088

In [43]:
# Keep only what the model needs: the assembled feature vector and the label.
final_data = output.select("features", "medv")

# Fixed seed so the 70/30 split (and every metric computed from it) is the
# same on each Restart & Run All; without it randomSplit draws a fresh split
# on every execution.
SPLIT_SEED = 42
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

In [44]:
# Summary statistics (count, mean, stddev, min, max) for the training split;
# in the recorded output only the `medv` label column is summarised.
train_data.describe().show()

+-------+-----------------+
|summary|             medv|
+-------+-----------------+
|  count|              350|
|   mean|22.51028571428572|
| stddev| 9.20784874051275|
|    min|              5.0|
|    max|             50.0|
+-------+-----------------+



In [45]:
# Summary statistics (count, mean, stddev, min, max) for the held-out test
# split; in the recorded output only the `medv` label column is summarised.
test_data.describe().show()

+-------+------------------+
|summary|              medv|
+-------+------------------+
|  count|               156|
|   mean|22.583333333333332|
| stddev|  9.20236178193356|
|    min|               5.0|
|    max|              50.0|
+-------+------------------+



In [46]:
# Linear regression with `medv` as the label; the features column is not set
# here, so the estimator's default is used.
lr = LinearRegression(labelCol="medv")

# Fit on the training split only.
lrModel = lr.fit(train_data)

# Report the learned coefficients (one per input feature) and the intercept.
fit_summary = "Coefficients: {} Intercept: {}".format(
    lrModel.coefficients,
    lrModel.intercept,
)
print(fit_summary)

Coefficients: [-0.06667085152526049,0.03970968299209699,0.05999600865797583,3.6972308938224807,-20.02946550821388,3.999609200699213,0.0018756266642881631,-1.3091557266626648,0.3315502064974477,-0.012234447959125535,-1.0221210993812573,0.010730925502982257,-0.4891162949765928] Intercept: 35.38235594800484


In [47]:
# Evaluate the fitted model on the held-out split.
test_results = lrModel.evaluate(test_data)

# Show the per-row residuals from that evaluation.
residuals_df = test_results.residuals
residuals_df.show()

+--------------------+
|           residuals|
+--------------------+
|  -5.445150563189781|
|   2.285803809065257|
|   9.438668985005528|
| -1.4141685459797841|
|   4.568754407859522|
|  -2.088445899385839|
| -0.0513932865954132|
|  5.4734419578358455|
| -0.7862393321193366|
|  -5.554554339431046|
|   4.441285101014181|
|  -3.830120285249599|
| -7.1061809275390395|
| -0.9675247514606582|
| -0.2822432102430952|
|  -2.918989226797528|
|-0.16893059257199994|
|  -6.405636720875652|
| -2.9483280168432806|
| -0.7841208801692261|
+--------------------+
only showing top 20 rows



In [48]:
# Simulate scoring new, unlabeled data: keep only the feature vectors from the
# test split and let the model add a "prediction" column.
unlabeled_data = test_data.select("features")
predictions = lrModel.transform(unlabeled_data)

# Preview features alongside their predictions.
predictions.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[0.01096,55.0,2.2...| 27.44515056318978|
|[0.01301,35.0,1.5...|30.414196190934746|
|[0.01381,80.0,0.4...| 40.56133101499447|
|[0.01432,100.0,1....|33.014168545979786|
|[0.01501,90.0,1.2...| 45.43124559214048|
|[0.0187,85.0,4.15...| 25.18844589938584|
|[0.01965,80.0,1.7...|20.151393286595415|
|[0.02177,82.5,2.0...| 36.82655804216415|
|[0.02187,60.0,2.9...|31.886239332119338|
|[0.02498,0.0,1.89...|22.054554339431046|
|[0.02899,40.0,1.2...| 22.15871489898582|
|[0.03466,35.0,6.0...|23.230120285249598|
|[0.03537,34.0,6.0...| 29.10618092753904|
|[0.03659,25.0,4.8...| 25.76752475146066|
|[0.03768,80.0,1.5...|  34.8822432102431|
|[0.04011,80.0,1.5...|36.218989226797525|
|[0.04113,25.0,4.8...|   28.168930592572|
|[0.04294,28.0,15....|27.005636720875653|
|[0.04462,25.0,4.8...| 26.84832801684328|
|[0.04981,21.0,5.6...|24.184120880169225|
+--------------------+------------

In [49]:
# Pull the error metrics from the test-set evaluation, then print them.
rmse = test_results.rootMeanSquaredError
mse = test_results.meanSquaredError

print("RMSE: {}".format(rmse))
print("MSE: {}".format(mse))

RMSE: 4.535422675848368
MSE: 20.57005884859957
