In [11]:
from pyspark import SparkContext
from pyspark import SparkFiles
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator, BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import col

In [None]:
# Configure the application name first, then obtain the session
# (getOrCreate reuses an already-running session when one exists).
sessionBuilder = SparkSession.builder.appName("LinearRegression")
sparkSession = sessionBuilder.getOrCreate()

In [13]:
def getOceanProximityValue(x):
  """Translate an ocean_proximity category into its integer code.

  Mapping: "ISLAND" -> 0, "NEAR BAY" -> 1, "NEAR OCEAN" -> 2,
  "<1H OCEAN" -> 3, "INLAND" -> 4.

  Args:
    x: category label (str). An int is passed through unchanged, as the
      original implementation did; None is accepted.

  Returns:
    The integer code for a known category, the input itself when it is
    already an int, and None for anything else (unknown label, None, ...).

  The function is wrapped as a UDF declared with IntegerType, so it must
  never hand back a string: the previous fallback returned the raw label
  for unrecognised categories, which contradicts the declared return type.
  """
  proximityCodes = {
    "ISLAND": 0,
    "NEAR BAY": 1,
    "NEAR OCEAN": 2,
    "<1H OCEAN": 3,
    "INLAND": 4,
  }
  if isinstance(x, str):
    # Unknown labels map to None instead of leaking the string through.
    return proximityCodes.get(x)
  if isinstance(x, int):
    # Already-encoded values keep working exactly as before.
    return x
  return None
#End getOceanProximityValue
# Spark UDF wrapper around getOceanProximityValue; the result column is declared as integer.
getOceanProximityValueUDF = udf(f=getOceanProximityValue, returnType=IntegerType())

In [None]:
# Load the data through the SparkContext: register the CSV with addFile,
# then read the copy that SparkFiles resolves by file name.
housingCsvPath = "/content/drive/MyDrive/Colab Notebooks/data/housing.csv"
sparkSession.sparkContext.addFile(housingCsvPath)

# First row is the header; column types are inferred from the data.
csvReader = sparkSession.read.options(header=True, inferSchema=True)
dfHousing = csvReader.csv(SparkFiles.get("housing.csv"))

dfHousing.printSchema()
dfHousing.show(5)

root
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- housing_median_age: double (nullable = true)
 |-- total_rooms: double (nullable = true)
 |-- total_bedrooms: double (nullable = true)
 |-- population: double (nullable = true)
 |-- households: double (nullable = true)
 |-- median_income: double (nullable = true)
 |-- median_house_value: double (nullable = true)
 |-- ocean_proximity: string (nullable = true)

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
|  -122.23|   37.88|              41.0|      880.0|         129.0|     322.0|     126.0|       8.3252|          452600.0|       NEAR B

In [None]:
# Despite the name, this is the average ("avg") of median_house_value over all rows.
# The aggregation yields a single row with a single column.
avgRow = dfHousing.agg({"median_house_value": "avg"}).first()
totalMedianHouseValue = avgRow[0]
print(f"Total Median House Value: {totalMedianHouseValue}")

Total Median House Value: 206855.81690891474


In [14]:
# Binary target: 1 when median_house_value is above the dataset average, else 0.
dfHousing = dfHousing.withColumn('label_high_house_value', (dfHousing.median_house_value > totalMedianHouseValue).cast('integer'))
# Integer encoding of the ocean_proximity category via the UDF.
dfHousing = dfHousing.withColumn("ocean_proximity_int", getOceanProximityValueUDF("ocean_proximity"))
# DataFrame.sample takes (withReplacement, fraction, seed). The earlier call
# sample(0.05, False) put the fraction first, so False was consumed as the seed
# (int(False) == 0) rather than as withReplacement. Keyword arguments state the
# intent explicitly and keep the same 5% preview without replacement, seed 0.
dfHousing.sample(withReplacement=False, fraction=0.05, seed=0).show(5)

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+----------------------+-------------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|label_high_house_value|ocean_proximity_int|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+----------------------+-------------------+
|  -122.25|   37.84|              52.0|     3549.0|         707.0|    1551.0|     714.0|       3.6912|          261100.0|       NEAR BAY|                     1|                  1|
|  -122.28|   37.85|              49.0|     1130.0|         244.0|     607.0|     239.0|       2.4597|           93800.0|       NEAR BAY|                     0|                  1|
|   -122.3|   37.81|              48.0|     1455.0|         354.0|     788.0|     332.0|       

In [15]:
# Columns that must not be fed to the model as inputs:
#   - ocean_proximity: raw string category (its integer encoding is used instead)
#   - median_house_value / label_high_house_value: the target and the value it is derived from
#   - features: the VectorAssembler output column; it is present in dfHousing if this
#     cell is re-run after the assembler cell, and must not become one of its own inputs
excludedCols = {"ocean_proximity", "median_house_value", "label_high_house_value", "features"}
# Filtering (rather than list.remove) keeps the original column order and makes the
# cell safe to re-run regardless of which derived columns already exist.
featureCols = [columnName for columnName in dfHousing.columns if columnName not in excludedCols]
print(featureCols)

['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'ocean_proximity_int']


In [22]:
# Drop any previous "features" column so the cell can be re-run (drop is a no-op if absent).
dfHousing = dfHousing.drop("features")
# handleInvalid='skip' filters out rows containing invalid (null/NaN) feature values,
# so the assembled frame can have fewer rows than the raw data.
# NOTE(review): confirm how many rows are dropped and whether imputation is preferable.
assembler = VectorAssembler(inputCols=featureCols, outputCol="features", handleInvalid='skip')
dfHousing = assembler.transform(dfHousing)
# DataFrame.sample takes (withReplacement, fraction, seed). The earlier call
# sample(0.05, False) put the fraction first, so False was consumed as the seed
# (int(False) == 0). Keyword arguments make the same sampling explicit.
dfHousing.sample(withReplacement=False, fraction=0.05, seed=0).show(5)


+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+----------------------+-------------------+--------------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|label_high_house_value|ocean_proximity_int|            features|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+----------------------+-------------------+--------------------+
|  -122.25|   37.84|              52.0|     3549.0|         707.0|    1551.0|     714.0|       3.6912|          261100.0|       NEAR BAY|                     1|                  1|[-122.25,37.84,52...|
|  -122.28|   37.85|              49.0|     1130.0|         244.0|     607.0|     239.0|       2.4597|           93800.0|       NEAR BAY|                     0|                  1|[-122.28,37.

In [23]:
# Keep only what the classifier needs: the assembled feature vector and the binary label.
modelColumns = [col("features"), col("label_high_house_value")]
dsHousing = dfHousing.select(*modelColumns)
dsHousing.show(5)

+--------------------+----------------------+
|            features|label_high_house_value|
+--------------------+----------------------+
|[-122.23,37.88,41...|                     1|
|[-122.22,37.86,21...|                     1|
|[-122.24,37.85,52...|                     1|
|[-122.25,37.85,52...|                     1|
|[-122.25,37.85,52...|                     1|
+--------------------+----------------------+
only showing top 5 rows



In [24]:
# 80/20 random split; the fixed seed keeps the partition stable between runs.
splitWeights = [0.8, 0.2]
train, test = dsHousing.randomSplit(weights=splitWeights, seed=42)

trainRows = train.count()
testRows = test.count()
print(f"Train size: {trainRows}")
print(f"Test size: {testRows}")

Train size: 16395
Test size: 4038


In [25]:
# Logistic regression on the assembled feature vector against the binary label;
# all other hyper-parameters stay at their defaults.
logisticRegression = (
  LogisticRegression()
  .setFeaturesCol("features")
  .setLabelCol("label_high_house_value")
)
model = logisticRegression.fit(train)


In [27]:
# Print each fitted coefficient next to the feature it belongs to
# (coefficients follow the order of featureCols used by the assembler).
coeficientes = model.coefficients
for position, weight in enumerate(coeficientes.toArray()):
  print(f"{featureCols[position]}: {weight}")
#End for
print(f"Intercept: {model.intercept}")

longitude: -1.912563274578849
latitude: -1.910560333145928
housing_median_age: 0.03371002670170704
total_rooms: -0.00013435594151665496
total_bedrooms: 0.0026474505430730184
population: -0.001553912198341853
households: 0.0027967800584418195
median_income: 1.226350184629384
ocean_proximity_int: 0.09474929694555863
Intercept: -167.63009369513082


In [28]:
# Score the held-out set; transform adds rawPrediction, probability and prediction columns.
predictions = model.transform(test)
# DataFrame.sample takes (withReplacement, fraction, seed). The earlier call
# sample(0.05, False) put the fraction first, so False was consumed as the seed
# (int(False) == 0). Keyword arguments make the same sampling explicit.
predictions.sample(withReplacement=False, fraction=0.05, seed=0).show(5)

+--------------------+----------------------+--------------------+--------------------+----------+
|            features|label_high_house_value|       rawPrediction|         probability|prediction|
+--------------------+----------------------+--------------------+--------------------+----------+
|[-124.16,40.95,20...|                     0|[3.54023753035539...|[0.97181121970677...|       0.0|
|[-123.53,40.88,20...|                     0|[4.94544097616619...|[0.99293450042843...|       0.0|
|[-122.71,38.91,20...|                     0|[4.61588203847310...|[0.99020346837565...|       0.0|
|[-122.63,38.21,22...|                     1|[-2.6269331702277...|[0.06742503382787...|       1.0|
|[-122.54,37.76,45...|                     1|[-1.2729390427194...|[0.21875455158815...|       1.0|
+--------------------+----------------------+--------------------+--------------------+----------+
only showing top 5 rows



In [32]:
# Ranking quality: area under the ROC curve, computed from the raw scores.
evaluator = BinaryClassificationEvaluator(labelCol="label_high_house_value", rawPredictionCol="rawPrediction")
auc = evaluator.evaluate(predictions, params={evaluator.metricName: 'areaUnderROC'})
print(f"AUC: {auc}")

# Hard-label quality: the same evaluator object is re-pointed at each metric in turn
# (setMetricName returns the evaluator, so the calls can be chained).
# Note that "Precision" and "Recall" below are the class-weighted variants.
evaluator = MulticlassClassificationEvaluator(labelCol="label_high_house_value", predictionCol="prediction")
accuracy = evaluator.setMetricName("accuracy").evaluate(predictions)
print(f"Accuracy: {accuracy}")
f1 = evaluator.setMetricName("f1").evaluate(predictions)
print(f"F1: {f1}")
precision = evaluator.setMetricName("weightedPrecision").evaluate(predictions)
print(f"Precision: {precision}")
recall = evaluator.setMetricName("weightedRecall").evaluate(predictions)
print(f"Recall: {recall}")

AUC: 0.9105547477684895
Accuracy: 0.8325903912828133
F1: 0.8319985205446851
Precision: 0.8319814565362588
Recall: 0.8325903912828132


In [36]:
#Tuning
# Model selection is driven by the binary evaluator (its default metric is used for ranking).
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label_high_house_value")
# Candidate L2/elastic-net regularisation strengths to compare.
paramGrid = ParamGridBuilder().addGrid(logisticRegression.regParam, [0.01, 0.1, 1.0]).build()
# seed fixes the random fold assignment so the selected model is reproducible
# across runs, in line with the seeded train/test split.
crossValidator = CrossValidator(estimator = logisticRegression,
                                estimatorParamMaps = paramGrid,
                                evaluator = evaluator,
                                numFolds = 3,
                                seed = 42)
cvModel = crossValidator.fit(train)

In [38]:
# Re-score the held-out set with the best model found by cross-validation
# and report its AUC. This overwrites the earlier `predictions` frame.
bestModel = cvModel.bestModel
predictions = bestModel.transform(test)
aucParams = {evaluator.metricName : 'areaUnderROC'}
auc = evaluator.evaluate(predictions, aucParams)
print(f"AUC: {auc}")

AUC: 0.8935695100771448
