In [1]:
import os

# Windows-only environment setup.
# HADOOP_HOME must be the parent folder of the 'bin' directory that holds winutils.exe.
HADOOP_HOME_DIR = r"C:\winutils-master\hadoop-3.0.0"
HADOOP_BIN_DIR = HADOOP_HOME_DIR + r"\bin"
os.environ["HADOOP_HOME"] = HADOOP_HOME_DIR

# Append the bin folder to PATH only if it is not already listed, so that
# re-running this cell does not keep growing PATH with duplicate entries.
# ';' is used on purpose: these are Windows paths and Windows' PATH separator.
current_path = os.environ.get("PATH", "")
if HADOOP_BIN_DIR not in current_path.split(";"):
    os.environ["PATH"] = current_path + ";" + HADOOP_BIN_DIR

# (Optional) Provide a JAVA_HOME only when none is set yet; an existing value
# from the user's environment is left untouched.
os.environ.setdefault("JAVA_HOME", r"C:\Program Files\Eclipse Adoptium\jdk-17.0.15.6-hotspot")

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, isnan, isnull, mean, stddev, abs as spark_abs
from pyspark.sql.types import DoubleType
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline

In [3]:
# Create the Spark session for this notebook (or reuse one that already exists),
# running in local mode.
spark = (
    SparkSession.builder
    .appName("DataExploitation")
    .master("local[*]")
    .getOrCreate()
)

In [4]:
# Folder names of the two data zones: the formatted zone is read from,
# the exploitation zone is written to.
formatted_zone, exploitation_zone = "formatted_zone", "exploitation_zone"

In [5]:
# Load the previously formatted dataset from the formatted zone.
# No reader options are passed: "multiline" is a JSON/CSV reader option and
# has no effect when reading Parquet, so it was dropped.
data = spark.read.parquet(f"{formatted_zone}/formatted_data")

In [6]:
# Preview the first 20 rows; long cell values are truncated in the printout.
data.show(n=20, truncate=True)

+---------+--------+-------------------+--------+-----+------+---------+-------+-------+----------+--------+----------+---------+--------------------+--------------+----------------------+---------+------------------+---------+-----------+------------+------------+-----+-----------+-----+------+-----------------+---------------+------------------+------------------+
|bathrooms|distance|           district|exterior|floor|has360|has3DTour|hasLift|hasPlan|hasStaging|hasVideo|  latitude|longitude|        neighborhood|newDevelopment|newDevelopmentFinished|numPhotos|      parkingSpace|    price|priceByArea|propertyCode|propertyType|rooms|showAddress| size|status|topNewDevelopment|neighborhood_id|   Index_RFD_media|    Poblacio_media|
+---------+--------+-------------------+--------+-----+------+---------+-------+-------+----------+--------+----------+---------+--------------------+--------------+----------------------+---------+------------------+---------+-----------+------------+----------

In [7]:
# Clean the numerical columns price, size and priceByArea.
# A value that is null or <= 0 is treated as a probable data error and becomes
# null in the new "<name>_clean" column; any other value is cast to DoubleType
# (the original notebook notes VectorAssembler would convert to that type anyway).
data_features = data
for raw_name in ("price", "size", "priceByArea"):
    raw_value = col(raw_name)
    is_invalid = raw_value.isNull() | (raw_value <= 0)
    data_features = data_features.withColumn(
        f"{raw_name}_clean",
        when(is_invalid, None).otherwise(raw_value.cast(DoubleType())),
    )

In [None]:
# create categorical features for size, income, and rooms
def _bucket_label(column_name, upper_bounds, overflow_label):
    """Build a column expression that maps a numeric column to a bucket label.

    Args:
        column_name: name of the numeric column to bucket.
        upper_bounds: ordered (inclusive upper bound, label) pairs; the first
            pair whose bound is >= the value decides the label.
        overflow_label: label used for values above every bound.

    Returns:
        A Column holding the bucket label, or null when the input value is null.
    """
    value = col(column_name)
    # Null guard first: a comparison against null is never true, so without
    # this branch a missing value would fall through to `otherwise` and be
    # labelled as the largest bucket.
    label = when(value.isNull(), None)
    for upper_bound, bucket in upper_bounds:
        label = label.when(value <= upper_bound, bucket)
    return label.otherwise(overflow_label)


# NOTE(review): the data.show() preview earlier in this notebook lists the income
# column as `Index_RFD_media`, while this cell reads `Index_RFD_average` -
# confirm the column name against the formatted data.
data_features = (
    data_features
    .withColumn(
        "size_category",
        _bucket_label("size_clean", [(50, "small"), (100, "medium"), (150, "large")], "extra_large"),
    )
    .withColumn(
        "income_category",
        _bucket_label(
            "Index_RFD_average",
            [(70, "low_income"), (100, "medium_income"), (130, "high_income")],
            "very_high_income",
        ),
    )
    .withColumn(
        "rooms_category",
        _bucket_label("rooms", [(2, "small"), (4, "medium")], "large"),
    )
)

In [9]:
# Derive a numeric floor: values made only of digits are cast to int;
# every other value (the `otherwise` branch) is mapped to 0.
floor_value = col("floor")
is_plain_number = floor_value.rlike("^\\d+$")
data_features = data_features.withColumn(
    "floor_numeric",
    when(is_plain_number, floor_value.cast("int")).otherwise(0),
)

In [10]:
# Build 0/1 indicator columns for parking, exterior and lift availability.
# has_parking is 1 whenever parkingSpace holds any value; the other two are 1
# only when the source column equals True, and 0 otherwise.
parking_flag = when(col("parkingSpace").isNotNull(), 1).otherwise(0)
exterior_flag = when(col("exterior") == True, 1).otherwise(0)
lift_flag = when(col("hasLift") == True, 1).otherwise(0)

data_features = (
    data_features
    .withColumn("has_parking", parking_flag)
    .withColumn("is_exterior", exterior_flag)
    .withColumn("has_lift_binary", lift_flag)
)

In [11]:
# detect and remove outliers
# All four statistics are computed in a single aggregation so the data is
# scanned once instead of once per column.
outlier_stats = data_features.select(
    mean("price_clean").alias("price_mean"),
    stddev("price_clean").alias("price_std"),
    mean("size_clean").alias("size_mean"),
    stddev("size_clean").alias("size_std"),
).collect()[0]
# Both original names are kept so anything that reads them keeps working.
price_stats = size_stats = outlier_stats

# The aggregates come back as None when there are too few non-null values;
# fail with a clear message instead of a TypeError in the arithmetic below.
undefined_stats = [
    name for name in ("price_mean", "price_std", "size_mean", "size_std")
    if outlier_stats[name] is None
]
if undefined_stats:
    raise ValueError(f"cannot derive outlier bounds, undefined statistics: {undefined_stats}")

# remove outliers (beyond 3 standard deviations)
price_lower = price_stats["price_mean"] - 3 * price_stats["price_std"]
price_upper = price_stats["price_mean"] + 3 * price_stats["price_std"]
size_lower = size_stats["size_mean"] - 3 * size_stats["size_std"]
size_upper = size_stats["size_mean"] + 3 * size_stats["size_std"]

# Keep rows whose price and size lie inside the bounds and whose required
# fields (price, size, income index, neighborhood) are present.
data_cleaned = data_features.filter(
    (col("price_clean").between(price_lower, price_upper)) &
    (col("size_clean").between(size_lower, size_upper)) &
    (col("price_clean").isNotNull()) &
    (col("size_clean").isNotNull()) &
    (col("Index_RFD_average").isNotNull()) &
    (col("neighborhood").isNotNull()))

# Note: this difference counts every dropped row, i.e. statistical outliers
# plus rows removed by the not-null conditions above.
print(f"removed {data.count() - data_cleaned.count()} outliers")

removed 112 outliers


In [12]:
# Cast the remaining numerical columns to DoubleType, for the same reason the
# cleaned price/size columns were cast earlier. Each column is replaced in place.
for numeric_name in ("distance", "numPhotos", "rooms", "bathrooms", "latitude", "longitude"):
    data_cleaned = data_cleaned.withColumn(numeric_name, col(numeric_name).cast(DoubleType()))

In [13]:
# Numerical inputs of the model that already exist as columns.
# NOTE(review): the data.show() preview earlier in this notebook lists
# `Index_RFD_media` / `Poblacio_media`, not the `_average` names used here -
# confirm the column names against the formatted data.
base_numerical_features = [
    "size_clean", "rooms", "bathrooms", "Index_RFD_average", "Poblacio_average",
    "latitude", "longitude", "distance", "numPhotos", "floor_numeric",
    "has_parking", "is_exterior", "has_lift_binary",
]

# Boolean columns get a 0/1 "<name>_int" companion (1 only when the value is True);
# those companions are used as numerical features as well.
boolean_columns = ["has360", "has3DTour", "hasVideo", "hasPlan", "newDevelopment"]
for flag_name in boolean_columns:
    data_cleaned = data_cleaned.withColumn(
        f"{flag_name}_int",
        when(col(flag_name) == True, 1).otherwise(0),
    )
numerical_features = base_numerical_features + [f"{flag_name}_int" for flag_name in boolean_columns]

# Categorical inputs: each one is string-indexed and then one-hot encoded.
categorical_features = [
    "propertyType", "size_category", "income_category", "rooms_category",
    "neighborhood", "status",
]

# One StringIndexer per categorical column, writing "<name>_indexed";
# handleInvalid="keep" is set so invalid values do not raise an error.
indexers = [
    StringIndexer(inputCol=name, outputCol=f"{name}_indexed", handleInvalid="keep")
    for name in categorical_features
]

# One OneHotEncoder per categorical column, turning "<name>_indexed" into "<name>_encoded".
encoded_cols = [f"{name}_encoded" for name in categorical_features]
encoders = [
    OneHotEncoder(inputCol=f"{name}_indexed", outputCol=encoded_name)
    for name, encoded_name in zip(categorical_features, encoded_cols)
]

In [14]:
# Column the model will predict.
target_column = "price_clean"

# Keep only the target plus the raw (not yet encoded) feature columns.
all_features = [target_column, *numerical_features, *categorical_features]

# Replacement values for missing entries in the listed columns.
missing_value_defaults = {
    "floor_numeric": 0,
    "numPhotos": 0,
    "distance": 0.0,
    "status": "unknown",
}
ml_data = data_cleaned.select(all_features).fillna(missing_value_defaults)

print(f"Selected {len(numerical_features)} numerical features and {len(categorical_features)} categorical features")
print(f"ML dataset shape: {ml_data.count()} rows, {len(all_features)} columns")

Selected 18 numerical features and 6 categorical features
ML dataset shape: 3950 rows, 25 columns


In [15]:
# Inputs of the final feature vector: the numerical columns followed by the
# one-hot encoded categorical columns.
all_feature_cols = [*numerical_features, *encoded_cols]

# VectorAssembler packs those columns into the single "features" vector column.
assembler = VectorAssembler(inputCols=all_feature_cols, outputCol="features")

# Stage order: all indexers, then all encoders, then the assembler.
pipeline_stages = [*indexers, *encoders, assembler]
preprocessing_pipeline = Pipeline(stages=pipeline_stages)

In [16]:
# fit and transform the preprocessing pipeline
pipeline_model = preprocessing_pipeline.fit(ml_data)
ml_data_processed = pipeline_model.transform(ml_data)

# select final columns for ML (target, features, and neighborhood for analysis).
# The result is cached because every later action on it or on its splits
# (counts, null checks, writes) would otherwise re-run the whole
# preprocessing pipeline from the source data.
final_ml_data = ml_data_processed.select("price_clean", "features", "neighborhood").cache()

# split data 80/20 for train/test; the fixed seed keeps the split reproducible
train_data, test_data = final_ml_data.randomSplit([0.8, 0.2], seed=42)

print(f"Train set: {train_data.count():,} records")
print(f"Test set: {test_data.count():,} records")

Train set: 3,219 records
Test set: 731 records


In [17]:
# Sanity check: count rows whose target (price_clean) is null in each split.
target_is_null = col("price_clean").isNull()
null_targets_train = train_data.where(target_is_null).count()
null_targets_test = test_data.where(target_is_null).count()

print(f" missing values in target column ->  train: {null_targets_train}, test: {null_targets_test}")

 missing values in target column ->  train: 0, test: 0


In [18]:
# Write the train split, the test split and the full processed dataset as
# Parquet into the exploitation zone, replacing any previous output.
datasets_to_save = (
    ("train_data", train_data),
    ("test_data", test_data),
    ("ml_ready_data", final_ml_data),
)
for folder_name, frame in datasets_to_save:
    frame.write.mode("overwrite").parquet(f"{exploitation_zone}/{folder_name}")

# Save the fitted preprocessing pipeline next to the datasets.
pipeline_model.write().overwrite().save(f"{exploitation_zone}/preprocessing_pipeline")

# Stop the Spark session now that everything has been written.
spark.stop()