In [1]:
%%bash
apt-get install openjdk-17-jdk-headless -qq > /dev/null

In [2]:
%%bash
wget -q https://archive.apache.org/dist/spark/spark-3.3.0/spark-3.3.0-bin-hadoop3.tgz 
tar xf spark-3.3.0-bin-hadoop3.tgz

In [3]:
%%bash
pip install findspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1


In [4]:
import os
os.environ['JAVA_HOME']='/usr/lib/jvm/java-1.17.0-openjdk-amd64'
os.environ['SPARK_HOME']='/content/spark-3.3.0-bin-hadoop3'

In [5]:
import findspark
findspark.init()

In [6]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master('local[*]').getOrCreate()

In [7]:
df = spark.read.csv('/content/boston.csv',
                    inferSchema=True,header=True)

In [8]:
df.show()

+-------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+----+
|   CRIM|  ZN|INDUS|CHAS|  NOX|   RM|  AGE|   DIS|RAD|  TAX|PTRATIO|     B|LSTAT|MEDV|
+-------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+----+
|0.00632|18.0| 2.31| 0.0|0.538|6.575| 65.2|  4.09|1.0|296.0|   15.3| 396.9| 4.98|24.0|
|0.02731| 0.0| 7.07| 0.0|0.469|6.421| 78.9|4.9671|2.0|242.0|   17.8| 396.9| 9.14|21.6|
|0.02729| 0.0| 7.07| 0.0|0.469|7.185| 61.1|4.9671|2.0|242.0|   17.8|392.83| 4.03|34.7|
|0.03237| 0.0| 2.18| 0.0|0.458|6.998| 45.8|6.0622|3.0|222.0|   18.7|394.63| 2.94|33.4|
|0.06905| 0.0| 2.18| 0.0|0.458|7.147| 54.2|6.0622|3.0|222.0|   18.7| 396.9| 5.33|36.2|
|0.02985| 0.0| 2.18| 0.0|0.458| 6.43| 58.7|6.0622|3.0|222.0|   18.7|394.12| 5.21|28.7|
|0.08829|12.5| 7.87| 0.0|0.524|6.012| 66.6|5.5605|5.0|311.0|   15.2| 395.6|12.43|22.9|
|0.14455|12.5| 7.87| 0.0|0.524|6.172| 96.1|5.9505|5.0|311.0|   15.2| 396.9|19.15|27.1|
|0.21124|12.5| 7.87| 0.0|0.524|5.631|100.0|

In [9]:
df_study = df

In [30]:
from pyspark.ml.feature import VectorAssembler
input_features = ["CRIM","ZN","INDUS","CHAS","NOX","RM","AGE","DIS","RAD","TAX","PTRATIO","B","LSTAT","MEDV"]
df_assembler = VectorAssembler(inputCols=input_features,outputCol='features')
df_cover_type = df_assembler.transform(df_study)
df_cover_type.printSchema()
df_train,df_test = df_cover_type.randomSplit([0.75,0.25])


root
 |-- CRIM: double (nullable = true)
 |-- ZN: double (nullable = true)
 |-- INDUS: double (nullable = true)
 |-- CHAS: double (nullable = true)
 |-- NOX: double (nullable = true)
 |-- RM: double (nullable = true)
 |-- AGE: double (nullable = true)
 |-- DIS: double (nullable = true)
 |-- RAD: double (nullable = true)
 |-- TAX: double (nullable = true)
 |-- PTRATIO: double (nullable = true)
 |-- B: double (nullable = true)
 |-- LSTAT: double (nullable = true)
 |-- MEDV: double (nullable = true)
 |-- features: vector (nullable = true)



In [36]:
from pyspark.ml.regression import LinearRegression

sum = 0
calculateNum = 100

for i in range(calculateNum):
  lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8).setLabelCol("MEDV")
  lrModel = lr.fit(df_train)
  lrPredict = lrModel.transform(df_test)
  trainingSummary = lrModel.summary
  sum += trainingSummary.rootMeanSquaredError

avrgRMSE = sum/calculateNum
print("Average RMSE is:", avrgRMSE)

Average RMSE is: 0.3620950275720272


In [37]:
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

sum = 0

for i in range(calculateNum):
  glr = GeneralizedLinearRegression(family="gaussian", link="identity", maxIter=10, regParam=0.3).setLabelCol("MEDV")
  glrModel = glr.fit(df_train)
  predict = glrModel.transform(df_test)
  evaluator = RegressionEvaluator(
  labelCol="MEDV", predictionCol="prediction", metricName="rmse")
  rmse = evaluator.evaluate(predict)
  sum += rmse

avrgRMSE = sum/calculateNum
print("Average RMSE is:", avrgRMSE)

Average RMSE is: 0.3699083081902787


In [38]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import DecisionTreeRegressor

sum = 0

for i in range(calculateNum):
  inputData = df
  assembler = VectorAssembler(
  inputCols=["CRIM","ZN","INDUS","CHAS","NOX","RM","AGE","DIS","RAD","TAX","PTRATIO","B","LSTAT","MEDV"],
  outputCol="features")
  output = assembler.transform(inputData)

  dt = DecisionTreeRegressor(featuresCol="features", labelCol='MEDV')
  dtModel = dt.fit(df_train)
  predictions = dtModel.transform(df_test)
  evaluator = RegressionEvaluator(
  labelCol="MEDV", predictionCol="prediction", metricName="rmse")
  rmse = evaluator.evaluate(predictions)
  sum += rmse

avrgRMSE = sum/calculateNum
print("Average RMSE is:", avrgRMSE)

Average RMSE is: 0.9766597056395634


In [39]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import RandomForestRegressor

sum = 0

for i in range(calculateNum):
  inputData = df
  assembler = VectorAssembler(
  inputCols=["CRIM","ZN","INDUS","CHAS","NOX","RM","AGE","DIS","RAD","TAX","PTRATIO","B","LSTAT"],
  outputCol="features")
  output = assembler.transform(inputData)

  rf = RandomForestRegressor(featuresCol="features", labelCol='MEDV', numTrees=10)
  rfModel = rf.fit(df_train)
  predictions = rfModel.transform(df_test)
  evaluator = RegressionEvaluator(
  labelCol="MEDV", predictionCol="prediction", metricName="rmse")
  rmse = evaluator.evaluate(predictions)
  sum += rmse

avrgRMSE = sum/calculateNum
print("Average RMSE is:", avrgRMSE)

Average RMSE is: 1.5862399368628237


In [40]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import GBTRegressor

sum = 0

for i in range(calculateNum):
  inputData = df
  assembler = VectorAssembler(
  inputCols=["CRIM","ZN","INDUS","CHAS","NOX","RM","AGE","DIS","RAD","TAX","PTRATIO","B","LSTAT"],
  outputCol="features")
  output = assembler.transform(inputData)

  gbt = GBTRegressor(featuresCol="features", labelCol='MEDV')
  gbtModel = gbt.fit(df_train)
  predictions = gbtModel.transform(df_test)
  evaluator = RegressionEvaluator(
  labelCol="MEDV", predictionCol="prediction", metricName="rmse")
  rmse = evaluator.evaluate(predictions)
  sum += rmse

avrgRMSE = sum/calculateNum
print("Average RMSE is:", avrgRMSE)

Average RMSE is: 1.0064015592174191


In [27]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import AFTSurvivalRegression

sum = 0

for i in range(calculateNum):
  inputData = df
  assembler = VectorAssembler(
  inputCols=["CRIM","ZN","INDUS","CHAS","NOX","RM","AGE","DIS","RAD","TAX","PTRATIO","B","LSTAT"],
  outputCol="features")
  output = assembler.transform(inputData)

  afts = AFTSurvivalRegression(featuresCol="features", labelCol='MEDV')
  aftsModel = afts.fit(df_train)
  predictions = aftsModel.transform(df_test)
  evaluator = RegressionEvaluator(
  labelCol="MEDV", predictionCol="prediction", metricName="rmse")
  rmse = evaluator.evaluate(predictions)
  sum += rmse

avrgRMSE = sum/calculateNum
print("Average RMSE is:", avrgRMSE)

IllegalArgumentException: ignored