In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [2]:
spark = SparkSession.builder.appName("RealEstate_Cleaning").getOrCreate()

df = spark.read.parquet("hdfs://namenode:9000/datalake/bronze/propertyfinder")

In [3]:
df.show(5)
df.printSchema()


+-------+--------------------+--------+--------+--------------------+--------+---------+-------------+-------------+---------+--------------------+--------------------+--------------------+--------------------+------------------+------------------+
|     id|               title|   price|currency|            location|bedrooms|bathrooms|property_type|property_size|furnished|           share_url|         description|           amenities|         listed_date|          latitude|         longitude|
+-------+--------------------+--------+--------+--------------------+--------+---------+-------------+-------------+---------+--------------------+--------------------+--------------------+--------------------+------------------+------------------+
|8091574|2 floors home in ...|18000000|     EGP|Tawny Hyde Park, ...|       3|        3|        Villa|      240 sqm|       NO|https://www.prope...|Club Hills Reside...|Maids Room, Study...|2025-10-15T14:44:26Z|29.957908630371094| 30.91575813293457|
|812

In [4]:
df = df.filter((col("price").isNotNull()) & (col("price") > 0))

In [5]:
df.groupBy("property_type").count().show(
    
)

+---------------+-----+
|  property_type|count|
+---------------+-----+
|      Apartment| 1179|
|      Townhouse|  148|
|         iVilla|   46|
|Hotel Apartment|    5|
|          Villa|  388|
|      Penthouse|   59|
|           Land|    2|
|         Chalet|  155|
|         Palace|    2|
|     Twin House|   60|
|         Duplex|   55|
|       Bungalow|    1|
+---------------+-----+



In [6]:
df.filter((col("property_type") == "Villa") ).show(2)
df.filter((col("property_type") == "iVilla") ).show(2)

+-------+--------------------+--------+--------+--------------------+--------+---------+-------------+-------------+---------+--------------------+--------------------+--------------------+--------------------+------------------+------------------+
|     id|               title|   price|currency|            location|bedrooms|bathrooms|property_type|property_size|furnished|           share_url|         description|           amenities|         listed_date|          latitude|         longitude|
+-------+--------------------+--------+--------+--------------------+--------+---------+-------------+-------------+---------+--------------------+--------------------+--------------------+--------------------+------------------+------------------+
|8091574|2 floors home in ...|18000000|     EGP|Tawny Hyde Park, ...|       3|        3|        Villa|      240 sqm|       NO|https://www.prope...|Club Hills Reside...|Maids Room, Study...|2025-10-15T14:44:26Z|29.957908630371094| 30.91575813293457|
|814

In [7]:
df.filter(lower(col("amenities")).contains("pool")).count()

1404

In [8]:
from pyspark.sql.functions import col, when, lower

# More comprehensive pattern matching
df = df.withColumn("Pool", 
    when(lower(col("amenities")).contains("pool"), 1)
    .when(lower(col("amenities")).contains("swimming"), 1)
    .otherwise(0)
)

df = df.withColumn("Gym", 
    when(lower(col("amenities")).contains("gym"), 1)
    .when(lower(col("amenities")).contains("fitness"), 1)
    .when(lower(col("amenities")).contains("workout"), 1)
    .otherwise(0)
)

df = df.withColumn("Garden", 
    when(lower(col("amenities")).contains("garden"), 1)
    .when(lower(col("amenities")).contains("gardening"), 1)
    .when(lower(col("amenities")).contains("lawn"), 1)
    .otherwise(0)
)

df = df.withColumn("Parking", 
    when(lower(col("amenities")).contains("parking"), 1)
    .when(lower(col("amenities")).contains("garage"), 1)
    .when(lower(col("amenities")).contains("carport"), 1)
    .otherwise(0)
)

df = df.withColumn("Maids_Quarters", 
    when(lower(col("amenities")).contains("maids"), 1)
    .when(lower(col("amenities")).contains("servant"), 1)
    .when(lower(col("amenities")).contains("staff"), 1)
    .otherwise(0)
)
df = df.withColumn("Jacuzzi",
    when(lower(col("amenities")).contains("jacuzzi"),1)
    .otherwise(0)
)
df = df.withColumn("Balcony",
                   when(lower(col("amenities")).contains("balcony"),1)
                   .otherwise(0)
)
df= df.withColumn("Spa",
                  when(lower(col("amenities")).contains("spa"),1)
                  .otherwise(0)
                  )

In [9]:
df.withColumn("price",
              col("price").cast("double"))

DataFrame[id: string, title: string, price: double, currency: string, location: string, bedrooms: string, bathrooms: string, property_type: string, property_size: string, furnished: string, share_url: string, description: string, amenities: string, listed_date: string, latitude: string, longitude: string, Pool: int, Gym: int, Garden: int, Parking: int, Maids_Quarters: int, Jacuzzi: int, Balcony: int, Spa: int]

In [None]:
from pyspark.sql.functions import col, trim, regexp_replace, when, lower

df = df.drop("id", "currency")

df = df.withColumn("property_size",
                   regexp_replace(col("property_size"), "[^0-9.]", "")
                  .cast("double"))

df = df.filter(col("property_size").isNotNull() & (col("property_size") > 0))

df = df.withColumn("price_per_sqm", col("price") / col("property_size"))

df = df.withColumn("amenities", trim(lower(col("amenities"))))

df = df.withColumn("bedrooms", regexp_replace(col("bedrooms"), "[^0-9]", "").cast("int"))
df = df.withColumn("bathrooms", regexp_replace(col("bathrooms"), "[^0-9]", "").cast("int"))

df.show(10)


+--------------------+--------+--------------------+--------+---------+-------------+-------------+---------+--------------------+--------------------+--------------------+--------------------+------------------+------------------+----+---+------+-------+--------------+-------+-------+---+------------------+
|               title|   price|            location|bedrooms|bathrooms|property_type|property_size|furnished|           share_url|         description|           amenities|         listed_date|          latitude|         longitude|Pool|Gym|Garden|Parking|Maids_Quarters|Jacuzzi|Balcony|Spa|     price_per_sqm|
+--------------------+--------+--------------------+--------+---------+-------------+-------------+---------+--------------------+--------------------+--------------------+--------------------+------------------+------------------+----+---+------+-------+--------------+-------+-------+---+------------------+
|2 floors home in ...|18000000|Tawny Hyde Park, ...|       3|        3

In [11]:
df.printSchema()

root
 |-- title: string (nullable = true)
 |-- price: string (nullable = true)
 |-- location: string (nullable = true)
 |-- bedrooms: integer (nullable = true)
 |-- bathrooms: integer (nullable = true)
 |-- property_type: string (nullable = true)
 |-- property_size: double (nullable = true)
 |-- furnished: string (nullable = true)
 |-- share_url: string (nullable = true)
 |-- description: string (nullable = true)
 |-- amenities: string (nullable = true)
 |-- listed_date: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- Pool: integer (nullable = false)
 |-- Gym: integer (nullable = false)
 |-- Garden: integer (nullable = false)
 |-- Parking: integer (nullable = false)
 |-- Maids_Quarters: integer (nullable = false)
 |-- Jacuzzi: integer (nullable = false)
 |-- Balcony: integer (nullable = false)
 |-- Spa: integer (nullable = false)
 |-- price_per_sqm: double (nullable = true)



In [12]:
df.write.mode("overwrite").parquet("hdfs://namenode:9000/datalake/silver/cleaned_propertyfinder")

In [None]:
# hdfs dfs -rm /datalake/silver/cleaned_propertyfinder/_SUCCESS