In [None]:
from pyspark import SparkContext
from pyspark import SparkFiles
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator, BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

In [None]:
# Start (or reuse) the Spark session that the later cells read data through.
sparkSession = (
    SparkSession.builder
    .appName("LinearRegression")
    .getOrCreate()
)

In [None]:
# Load the housing CSV: the first row supplies the column names and the
# column types are inferred from the data.
housingCsvReader = sparkSession.read.options(header=True, inferSchema=True)
df = housingCsvReader.csv("/content/drive/MyDrive/Colab Notebooks/data/housing.csv")

# Quick look at what was loaded: schema first, then the first five rows.
df.printSchema()
df.show(n=5)

root
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- housing_median_age: double (nullable = true)
 |-- total_rooms: double (nullable = true)
 |-- total_bedrooms: double (nullable = true)
 |-- population: double (nullable = true)
 |-- households: double (nullable = true)
 |-- median_income: double (nullable = true)
 |-- median_house_value: double (nullable = true)
 |-- ocean_proximity: string (nullable = true)

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
|  -122.23|   37.88|              41.0|      880.0|         129.0|     322.0|     126.0|       8.3252|          452600.0|       NEAR B

In [None]:
# Integer code assigned to each known ocean_proximity category.
_OCEAN_PROXIMITY_CODES = {
  "ISLAND": 0,
  "NEAR BAY": 1,
  "NEAR OCEAN": 2,
  "<1H OCEAN": 3,
  "INLAND": 4,
}


def getOceanProximityValue(x):
  """Encode an ocean_proximity category as an integer code.

  Args:
    x: A category name ("ISLAND", "NEAR BAY", "NEAR OCEAN", "<1H OCEAN",
      "INLAND"), or a value that is already an integer code.

  Returns:
    The integer code for a known category name; x itself when it is
    already an int (so encoding an already-encoded value is a no-op);
    None for anything else (unknown names, None, other types). The result
    is therefore always an int or None, never a string.
  """
  if isinstance(x, str):
    # Unknown category names map to None instead of leaking the raw string.
    return _OCEAN_PROXIMITY_CODES.get(x)
  # bool is a subclass of int but is not a category code.
  if isinstance(x, int) and not isinstance(x, bool):
    return x
  return None
#End getOceanProximityValue
# Wrap the Python encoder as a Spark UDF declared to return integers.
getOceanProximityValueUDF = udf(getOceanProximityValue, returnType=IntegerType())

# Replace the string "ocean_proximity" column with its encoded value
# (withColumn overwrites a column when the name already exists).
encodedOceanProximity = getOceanProximityValueUDF(df["ocean_proximity"])
df = df.withColumn("ocean_proximity", encodedOceanProximity)
df.show(n=5)

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
|  -122.23|   37.88|              41.0|      880.0|         129.0|     322.0|     126.0|       8.3252|          452600.0|              1|
|  -122.22|   37.86|              21.0|     7099.0|        1106.0|    2401.0|    1138.0|       8.3014|          358500.0|              1|
|  -122.24|   37.85|              52.0|     1467.0|         190.0|     496.0|     177.0|       7.2574|          352100.0|              1|
|  -122.25|   37.85|              52.0|     1274.0|         235.0|     558.0|     219.0|       5.6431|          341300.0|              1|
|  -122.25|   37.85|              

In [None]:
# Column the regression model will predict.
targetLabel = "median_house_value"

# Every other column is a model input. "features" is excluded as well so that
# re-running this cell (when df already carries the assembled vector) does not
# feed the previous output back in as an input.
featureCols = [name for name in df.columns if name not in (targetLabel, "features")]

# handleInvalid='skip' filters out rows that have a null/NaN in any input
# column instead of raising, so the assembled frame can have fewer rows than df.
assembler = VectorAssembler(inputCols=featureCols, outputCol="features", handleInvalid='skip')

# Drop a stale "features" column first: the assembler refuses to write to an
# output column that already exists. drop() is a no-op when it is absent.
df = assembler.transform(df.drop("features"))
df.show(5)

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+--------------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|            features|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+--------------------+
|  -122.23|   37.88|              41.0|      880.0|         129.0|     322.0|     126.0|       8.3252|          452600.0|              1|[-122.23,37.88,41...|
|  -122.22|   37.86|              21.0|     7099.0|        1106.0|    2401.0|    1138.0|       8.3014|          358500.0|              1|[-122.22,37.86,21...|
|  -122.24|   37.85|              52.0|     1467.0|         190.0|     496.0|     177.0|       7.2574|          352100.0|              1|[-122.24,37.85,52...|
|  -122.25|   37.85|              52.0|     12

In [None]:
# Keep only what the model needs: the assembled feature vector and the label.
modelColumns = ["features", targetLabel]
dataSet = df.select(*modelColumns)
dataSet.show(n=5, truncate=False)

+-----------------------------------------------------------+------------------+
|features                                                   |median_house_value|
+-----------------------------------------------------------+------------------+
|[-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,1.0]    |452600.0          |
|[-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,1.0]|358500.0          |
|[-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,1.0]   |352100.0          |
|[-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,1.0]   |341300.0          |
|[-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,1.0]   |342200.0          |
+-----------------------------------------------------------+------------------+
only showing top 5 rows



In [None]:
# 80/20 train/test split, with an explicit seed supplied to the splitter.
splits = dataSet.randomSplit(weights=[0.8, 0.2], seed=1)
trainData, testData = splits

# Report how many rows landed in each split.
trainRowCount = trainData.count()
testRowCount = testData.count()
print(trainRowCount, testRowCount)

16353 4080


In [None]:
# Configure the linear-regression estimator and fit it on the training split;
# linearRegresionModel ends up bound to the fitted model.
linearRegresionModel = LinearRegression(
    featuresCol="features",
    labelCol=targetLabel,
).fit(trainData)

In [None]:
# Apply the fitted model to the held-out split and preview the first rows.
predictions = linearRegresionModel.transform(testData)
predictions.show(n=5)

+--------------------+------------------+------------------+
|            features|median_house_value|        prediction|
+--------------------+------------------+------------------+
|[-124.26,40.58,52...|          111400.0| 166503.7694119555|
|[-124.19,40.73,21...|           90100.0|173504.17656383477|
|[-124.19,40.77,30...|           69000.0|150920.43276811345|
|[-124.19,40.78,37...|           70000.0|123491.77112032752|
|[-124.18,40.62,35...|          107000.0| 165996.3895347016|
+--------------------+------------------+------------------+
only showing top 5 rows



In [None]:
# One evaluator is reused for every metric; it starts out configured for RMSE.
evaluator = RegressionEvaluator(
    labelCol=targetLabel,
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE) on test data = {rmse:g}")

# Reconfigure the evaluator itself for R2; it stays set to "r2" from here on.
r2 = evaluator.setMetricName("r2").evaluate(predictions)
print(f"R Squared (R2) on test data = {r2:g}")

# The remaining metrics are requested through a per-call param override,
# which does not change the evaluator's own metricName.
mse, mae, var = (
    evaluator.evaluate(predictions, {evaluator.metricName: metric})
    for metric in ("mse", "mae", "var")
)
print(f"Mean Squared Error (MSE) on test data = {mse:g}")
print(f"Mean Absolute Error (MAE) on test data = {mae:g}")
print(f"Explained Variance (var) on test data = {var:g}")


Root Mean Squared Error (RMSE) on test data = 70424.8
R Squared (R2) on test data = 0.640091
Mean Squared Error (MSE) on test data = 4.95965e+09
Mean Absolute Error (MAE) on test data = 51486.1
Explained Variance (var) on test data = 8.72298e+09


In [None]:
# Load the data through the SparkContext: register the CSV with addFile,
# then read it back from the path SparkFiles resolves for that file name.
sparkSession.sparkContext.addFile("/content/drive/MyDrive/Colab Notebooks/data/housing.csv")
distributedCsvPath = SparkFiles.get("housing.csv")
dfHousing = (
    sparkSession.read
    .options(header=True, inferSchema=True)
    .csv(distributedCsvPath)
)

# Inspect the loaded frame: schema, then the first five rows.
dfHousing.printSchema()
dfHousing.show(n=5)

root
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- housing_median_age: double (nullable = true)
 |-- total_rooms: double (nullable = true)
 |-- total_bedrooms: double (nullable = true)
 |-- population: double (nullable = true)
 |-- households: double (nullable = true)
 |-- median_income: double (nullable = true)
 |-- median_house_value: double (nullable = true)
 |-- ocean_proximity: string (nullable = true)

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
|  -122.23|   37.88|              41.0|      880.0|         129.0|     322.0|     126.0|       8.3252|          452600.0|       NEAR B

In [None]:
# Aggregate the whole frame down to a single row holding avg(median_house_value).
# NOTE: despite the "Total Median" wording, this value is the arithmetic mean
# of the median_house_value column.
(avgRow,) = dfHousing.agg({"median_house_value": "avg"}).collect()
totalMedianHouseValue = avgRow[0]
print(f"Total Median House Value: {totalMedianHouseValue}")

Total Median House Value: 206855.81690891474


In [None]:
# Binary label: 1 when the row's median_house_value is above the dataset-wide
# average computed earlier, 0 otherwise.
dfHousing = dfHousing.withColumn(
    'label_high_house_value',
    (dfHousing.median_house_value > totalMedianHouseValue).cast('integer'),
)

# Integer-encoded copy of ocean_proximity; the original string column is kept.
dfHousing = dfHousing.withColumn("ocean_proximity_int", getOceanProximityValueUDF("ocean_proximity"))

# Preview roughly 5% of the rows. Arguments are passed by keyword because the
# positional form sample(0.05, False) is read by PySpark as fraction=0.05 with
# False as the *seed* (i.e. 0), not as withReplacement=False. seed=0 keeps the
# same sample that call produced, now stated explicitly.
dfHousing.sample(withReplacement=False, fraction=0.05, seed=0).show(5)

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+----------------------+-------------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|label_high_house_value|ocean_proximity_int|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+----------------------+-------------------+
|  -122.25|   37.84|              52.0|     3549.0|         707.0|    1551.0|     714.0|       3.6912|          261100.0|       NEAR BAY|                     1|                  1|
|  -122.28|   37.85|              49.0|     1130.0|         244.0|     607.0|     239.0|       2.4597|           93800.0|       NEAR BAY|                     0|                  1|
|   -122.3|   37.81|              48.0|     1455.0|         354.0|     788.0|     332.0|       

In [None]:
# Display the feature-column list built for the regression section earlier in the notebook.
featureCols