In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [2]:
# Obtain the Spark session for this notebook (getOrCreate reuses an active one).
spark = (
    SparkSession.builder
    .appName("Silver_transformation_propertyfinder")
    .getOrCreate()
)

In [3]:
# Load the cleaned PropertyFinder parquet data from the silver path.
silver_reader = spark.read
df = silver_reader.parquet("hdfs://namenode:9000/datalake/silver/cleaned_propertyfinder")

# Quick sanity check: preview three rows, then print the schema.
df.show(n=3)
df.printSchema()

+--------------------+--------+--------------------+--------+---------+-------------+-------------+---------+--------------------+--------------------+--------------------+--------------------+------------------+------------------+----+---+------+-------+--------------+-------+-------+---+-----------------+
|               title|   price|            location|bedrooms|bathrooms|property_type|property_size|furnished|           share_url|         description|           amenities|         listed_date|          latitude|         longitude|Pool|Gym|Garden|Parking|Maids_Quarters|Jacuzzi|Balcony|Spa|    price_per_sqm|
+--------------------+--------+--------------------+--------+---------+-------------+-------------+---------+--------------------+--------------------+--------------------+--------------------+------------------+------------------+----+---+------+-------+--------------+-------+-------+---+-----------------+
|2 floors home in ...|18000000|Tawny Hyde Park, ...|       3|        3|  

In [4]:
# Drop three columns, then rename share_url -> link and property_size -> area.
df = (
    df.drop("amenities", "listed_date", "furnished")
    .withColumnRenamed("share_url", "link")
    .withColumnRenamed("property_size", "area")
)


In [5]:
from pyspark.sql.functions import col, regexp_replace

# Clean the listing titles: replace a few known mis-encoded sequences, strip
# characters outside ASCII and the Arabic block (U+0600-U+06FF), trim
# surrounding whitespace, then copy the cleaned title into `description`.
#
# The regex patterns are raw strings so that `\s`, `\-`, `\xNN` and `\uNNNN`
# reach the regex engine verbatim. In a normal string literal `\s` and `\-`
# are invalid Python escape sequences and trigger a SyntaxWarning.
df = (
    df
    # Fix common encoding issues like â€“
    .withColumn("title", regexp_replace(col("title"), "â€“", "-"))
    # NOTE(review): the next two substitutions map every "Ù" / "Ø" to one fixed
    # Arabic letter; the original author marked them as examples. Confirm this
    # is the intended repair before relying on Arabic titles.
    .withColumn("title", regexp_replace(col("title"), "Ù", "ا"))  # example for Arabic letters
    .withColumn("title", regexp_replace(col("title"), "Ø", "ف"))  # example
    # Remove the Unicode replacement character.
    .withColumn("title", regexp_replace(col("title"), r"[\uFFFD]", ""))
    # Remove anything that is neither ASCII nor in the Arabic block.
    .withColumn("title", regexp_replace(col("title"), r"[^\x00-\x7F\u0600-\u06FF\s0-9a-zA-Z\-]", ""))
    # Trim leading/trailing whitespace (any \s character, not only spaces).
    .withColumn("title", regexp_replace(col("title"), r"^\s+|\s+$", ""))
    # Replicate title into description (overwrites the existing description).
    .withColumn("description", col("title"))
)

# Show the first 20 cleaned titles
rows = df.select("title").take(20)
for num, row in enumerate(rows):
    print(num, row["title"])

0 2 floors home in prime location by equal ins 10yrs
1 own your unique unit in with instalments or cash
2 VILLA Fully Finished Ready To Move Prime Location
3 Easy plan, early keys | Delivered in 18 months
4 Twin House READY TO MOVE and FULLY FINISHED
5 Apartment With Prime Location - Special Price
6 Apartment For Sale Fully Finished In Sodic
7 Pay 1.5 milion delivery within 2y installment 10y
8 Apartment with 10 years installs in prime location
9 Unique Apartment in heart of zayed with prime view
10 Cityline Apart  Open view of a breathing City
11 Chalet with amazing sea view for sale in Ain Sokhn
12 apartment 170m fully finished ready to move
13 Apartment for sale, 145 m, Saba Pasha
14 Finished Stand Alone For Sale in Kattameya Hills
15 Ready to move | Pay in Installments | close to Sea
16 Villa Twin house 315m with installment
17 RTM villa with 10 years installment and 10 % DP
18 English Version  Luxury Rooftop Villa | Palm City
19 Apartment sale prime location in Sarai view villas


In [6]:
# Count listings per location and print up to 30 of the resulting rows.
location_counts = df.groupBy("location").count()
locations = location_counts.take(30)
for location_row in locations:
    print(location_row)


Row(location='The Wonder Marq, Mostakbal City Compounds, Mostakbal City - Future City, Cairo', count=3)
Row(location='Rayhanah Avenue, Zahraa El Maadi, Hay El Maadi, Cairo', count=1)
Row(location='Karma Residence, 16th District, Sheikh Zayed City, Giza', count=4)
Row(location='Blue Blue, Al Ain Al Sokhna, Suez', count=3)
Row(location='Eleva, Uptown Cairo, Mokattam, Cairo', count=2)
Row(location='Granville, New Capital City, Cairo', count=3)
Row(location='Solay, 5th Settlement Compounds, The 5th Settlement, New Cairo City, Cairo', count=1)
Row(location='Swan Lake West, 6 October Compounds, 6 October City, Giza', count=3)
Row(location='El Patio Oro, 5th Settlement Compounds, The 5th Settlement, New Cairo City, Cairo', count=25)
Row(location='Al Maqsad, New Capital Compounds, New Capital City, Cairo', count=7)
Row(location='Al Amir Bashtak St., Sporting, Hay Sharq, Alexandria', count=1)
Row(location='Florenza Khamsin Resort, Hurghada Resorts, Hurghada, Red Sea', count=2)
Row(location='The

In [7]:
# Preview three rows of the current DataFrame.
df.show(n=3)

+--------------------+--------+--------------------+--------+---------+-------------+-----+--------------------+--------------------+------------------+------------------+----+---+------+-------+--------------+-------+-------+---+-----------------+
|               title|   price|            location|bedrooms|bathrooms|property_type| area|                link|         description|          latitude|         longitude|Pool|Gym|Garden|Parking|Maids_Quarters|Jacuzzi|Balcony|Spa|    price_per_sqm|
+--------------------+--------+--------------------+--------+---------+-------------+-----+--------------------+--------------------+------------------+------------------+----+---+------+-------+--------------+-------+-------+---+-----------------+
|2 floors home in ...|18000000|Tawny Hyde Park, ...|       3|        3|        Villa|240.0|https://www.prope...|2 floors home in ...|29.957908630371094| 30.91575813293457|   1|  1|     1|      1|             1|      0|      1|  1|          75000.0|
|own

In [8]:
# Cast the coordinate and price columns to double, one column at a time.
for numeric_col in ("longitude", "latitude", "price"):
    df = df.withColumn(numeric_col, col(numeric_col).cast("double"))


In [9]:
# Print the schema so the column types after the preceding casts can be checked.
df.printSchema()

root
 |-- title: string (nullable = true)
 |-- price: double (nullable = true)
 |-- location: string (nullable = true)
 |-- bedrooms: integer (nullable = true)
 |-- bathrooms: integer (nullable = true)
 |-- property_type: string (nullable = true)
 |-- area: double (nullable = true)
 |-- link: string (nullable = true)
 |-- description: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- Pool: integer (nullable = true)
 |-- Gym: integer (nullable = true)
 |-- Garden: integer (nullable = true)
 |-- Parking: integer (nullable = true)
 |-- Maids_Quarters: integer (nullable = true)
 |-- Jacuzzi: integer (nullable = true)
 |-- Balcony: integer (nullable = true)
 |-- Spa: integer (nullable = true)
 |-- price_per_sqm: double (nullable = true)



In [10]:
# Data-quality report: per-column null counts, then the number of exact
# duplicate rows.
# `sum` below is pyspark.sql.functions.sum (via the star import), applied to a
# Column expression; it is not the Python builtin.
null_counts = [
    sum(when(col(c).isNull(), 1).otherwise(0)).alias(c)
    for c in df.columns
]
df.select(null_counts).show()

# Result names deliberately avoid `count`: assigning to `count` would overwrite
# the `count` function imported from pyspark.sql.functions for every later cell.
distinct_rows = df.distinct().count()
total_rows = df.count()
duplicate_rows = total_rows - distinct_rows
print(f"Total Rows: {total_rows}")
print(f"Distinct Rows: {distinct_rows}")
print(f"Duplicate Rows Found: {duplicate_rows}")

+-----+-----+--------+--------+---------+-------------+----+----+-----------+--------+---------+----+---+------+-------+--------------+-------+-------+---+-------------+
|title|price|location|bedrooms|bathrooms|property_type|area|link|description|latitude|longitude|Pool|Gym|Garden|Parking|Maids_Quarters|Jacuzzi|Balcony|Spa|price_per_sqm|
+-----+-----+--------+--------+---------+-------------+----+----+-----------+--------+---------+----+---+------+-------+--------------+-------+-------+---+-------------+
|    0|    0|       0|      46|        2|            0|   0|   0|          0|       0|        0|   0|  0|     0|      0|             0|      0|      0|  0|            0|
+-----+-----+--------+--------+---------+-------------+----+----+-----------+--------+---------+----+---+------+-------+--------------+-------+-------+---+-------------+

Total Rows: 2100
Distinct Rows: 2092
Duplicate Rows Found: 8


In [11]:
# Remove rows with a null bedrooms or bathrooms value, then exact duplicates.
required_columns = ["bedrooms", "bathrooms"]
df = df.na.drop(subset=required_columns).dropDuplicates()

In [12]:
# Tag every row with a constant `source` column.
source_tag = lit("propertyfinder")
df = df.withColumn("source", source_tag)

In [13]:
# Re-run the data-quality report after cleaning: per-column null counts, then
# the number of exact duplicate rows.
# `sum` below is pyspark.sql.functions.sum (via the star import), applied to a
# Column expression; it is not the Python builtin.
null_counts = [
    sum(when(col(c).isNull(), 1).otherwise(0)).alias(c)
    for c in df.columns
]
df.select(null_counts).show()

# Result names deliberately avoid `count`: assigning to `count` would overwrite
# the `count` function imported from pyspark.sql.functions for every later cell.
distinct_rows = df.distinct().count()
total_rows = df.count()
duplicate_rows = total_rows - distinct_rows
print(f"Total Rows: {total_rows}")
print(f"Distinct Rows: {distinct_rows}")
print(f"Duplicate Rows Found: {duplicate_rows}")

+-----+-----+--------+--------+---------+-------------+----+----+-----------+--------+---------+----+---+------+-------+--------------+-------+-------+---+-------------+------+
|title|price|location|bedrooms|bathrooms|property_type|area|link|description|latitude|longitude|Pool|Gym|Garden|Parking|Maids_Quarters|Jacuzzi|Balcony|Spa|price_per_sqm|source|
+-----+-----+--------+--------+---------+-------------+----+----+-----------+--------+---------+----+---+------+-------+--------------+-------+-------+---+-------------+------+
|    0|    0|       0|       0|        0|            0|   0|   0|          0|       0|        0|   0|  0|     0|      0|             0|      0|      0|  0|            0|     0|
+-----+-----+--------+--------+---------+-------------+----+----+-----------+--------+---------+----+---+------+-------+--------------+-------+-------+---+-------------+------+

Total Rows: 2046
Distinct Rows: 2046
Duplicate Rows Found: 0


In [14]:
# Persist the result as parquet at the gold path, replacing any existing output.
gold_writer = df.write
gold_writer.parquet(
    "hdfs://namenode:9000/datalake/gold/propertyfinder_gold",
    mode="overwrite",
)