In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [2]:
spark = SparkSession.builder.appName("Silver_transformation_dubbizle").getOrCreate()

In [3]:
df_alex = spark.read.parquet("hdfs://namenode:9000/datalake/silver/cleaned_dubbizle_alexandria")
df_alex.show(3)
df_alex.printSchema()

+--------------------+--------------------+---------+--------------------+--------+---------+-----+
|         description|                link|    price|            location|bedrooms|bathrooms| area|
+--------------------+--------------------+---------+--------------------+--------+---------+-----+
|شقة للبيع في السر...|https://www.dubiz...|7000000.0|Sidi Beshr, Alexa...|       3|        2|165.0|
|Apartment for sal...|https://www.dubiz...|5700000.0|   Alex West, Agami•|       3|        2|178.0|
|Apartment for sal...|https://www.dubiz...|9800000.0|Sawari, Moharam Bik•|       3|        3|208.0|
+--------------------+--------------------+---------+--------------------+--------+---------+-----+
only showing top 3 rows

root
 |-- description: string (nullable = true)
 |-- link: string (nullable = true)
 |-- price: double (nullable = true)
 |-- location: string (nullable = true)
 |-- bedrooms: integer (nullable = true)
 |-- bathrooms: integer (nullable = true)
 |-- area: double (nullable = t

In [4]:
df_cairo = spark.read.parquet("hdfs://namenode:9000/datalake/silver/cleaned_dubbizle_cairo")
df_cairo.show(3)
df_cairo.printSchema()

+--------------------+--------------------+---------+--------------------+--------+---------+-----+
|         description|                link|    price|            location|bedrooms|bathrooms| area|
+--------------------+--------------------+---------+--------------------+--------+---------+-----+
|A stunning apartm...|https://www.dubiz...|3075000.0|Shorouk City, Cairo•|       3|        2|205.0|
|An opportunity to...|https://www.dubiz...|8300000.0|Noor City, New Ca...|       3|        3|147.0|
|Apartment 3, imme...|https://www.dubiz...|   1.14E7|Villette Compound...|       3|        3|156.0|
+--------------------+--------------------+---------+--------------------+--------+---------+-----+
only showing top 3 rows

root
 |-- description: string (nullable = true)
 |-- link: string (nullable = true)
 |-- price: double (nullable = true)
 |-- location: string (nullable = true)
 |-- bedrooms: integer (nullable = true)
 |-- bathrooms: integer (nullable = true)
 |-- area: double (nullable = t

In [5]:
df_cairo.groupBy("location").count().show()
df_alex.groupBy("location").count().show()

+--------------------+-----+
|            location|count|
+--------------------+-----+
|El Banafseg 2, 1s...|    2|
|Galleria Moon Val...|   29|
|New Andalous, 5th...|    4|
|El Yasmeen 2, 1st...|    3|
|Jewar Compound, 6...|    3|
|Trio Gardens Comp...|   14|
|      EGP 14,430,000|    1|
|Baron City, Katam...|    1|
|El Banafseg 7, 1s...|    4|
|IL Bosco, New Cap...|    2|
|El Lotus, 5th Set...|   48|
|Midtown Condo, Ne...|    1|
|The View Compound...|    1|
|North Rehab, 1st ...|    1|
|Banafsag Omarat, ...|   17|
|Green Hills, Nasr...|    1|
|Katameya Heights ...|    1|
|Armonia, New Capi...|    3|
|El Banafseg 9, 1s...|    1|
|Al Burouj, Shorou...|   25|
+--------------------+-----+
only showing top 20 rows

+--------------------+-----+
|            location|count|
+--------------------+-----+
|Fleming, Alexandria•|   24|
|Sidi Beshr, Alexa...|  110|
|Kafr Abdo, Alexan...|   64|
|Moustafa kamel, S...|    7|
|Sawari, Moharam Bik•|   44|
| Al Hanouvel, Agami•|    8|
|Raml Station, Al

In [6]:
df_alex = df_alex\
    .withColumn("source", lit("dubbizle_Alexandria"))\
    
df_cairo = df_cairo\
    .withColumn("source", lit("dubbizle_Cairo"))

In [7]:
df_alex.show(3)
df_cairo.show(3)

+--------------------+--------------------+---------+--------------------+--------+---------+-----+-------------------+
|         description|                link|    price|            location|bedrooms|bathrooms| area|             source|
+--------------------+--------------------+---------+--------------------+--------+---------+-----+-------------------+
|شقة للبيع في السر...|https://www.dubiz...|7000000.0|Sidi Beshr, Alexa...|       3|        2|165.0|dubbizle_Alexandria|
|Apartment for sal...|https://www.dubiz...|5700000.0|   Alex West, Agami•|       3|        2|178.0|dubbizle_Alexandria|
|Apartment for sal...|https://www.dubiz...|9800000.0|Sawari, Moharam Bik•|       3|        3|208.0|dubbizle_Alexandria|
+--------------------+--------------------+---------+--------------------+--------+---------+-----+-------------------+
only showing top 3 rows

+--------------------+--------------------+---------+--------------------+--------+---------+-----+--------------+
|         descriptio

In [8]:
df= df_cairo.unionByName(df_alex)
df.show(3)
df.printSchema()

+--------------------+--------------------+---------+--------------------+--------+---------+-----+--------------+
|         description|                link|    price|            location|bedrooms|bathrooms| area|        source|
+--------------------+--------------------+---------+--------------------+--------+---------+-----+--------------+
|A stunning apartm...|https://www.dubiz...|3075000.0|Shorouk City, Cairo•|       3|        2|205.0|dubbizle_Cairo|
|An opportunity to...|https://www.dubiz...|8300000.0|Noor City, New Ca...|       3|        3|147.0|dubbizle_Cairo|
|Apartment 3, imme...|https://www.dubiz...|   1.14E7|Villette Compound...|       3|        3|156.0|dubbizle_Cairo|
+--------------------+--------------------+---------+--------------------+--------+---------+-----+--------------+
only showing top 3 rows

root
 |-- description: string (nullable = true)
 |-- link: string (nullable = true)
 |-- price: double (nullable = true)
 |-- location: string (nullable = true)
 |-- bedr

In [9]:
df.select(
    [sum(when(col(c).isNull(), 1).otherwise(0)).alias(c)
     for c in df.columns]
).show()
distinct = df.distinct().count()
count = df.count()
duplicate = count - distinct
print(f"Total Rows: {count}")
print(f"Distinct Rows: {distinct}")
print(f"Duplicate Rows Found: {duplicate}")

+-----------+----+-----+--------+--------+---------+----+------+
|description|link|price|location|bedrooms|bathrooms|area|source|
+-----------+----+-----+--------+--------+---------+----+------+
|          0|   0|    9|       0|      12|        4|   4|     0|
+-----------+----+-----+--------+--------+---------+----+------+

Total Rows: 4346
Distinct Rows: 4346
Duplicate Rows Found: 0


In [10]:
df = df.dropna(subset=[ "price", "area","bedrooms", "bathrooms"])

In [11]:
df = df\
    .withColumn("price", col("price").cast("double"))\
    
df =df.withColumn("price_per_sqm", (col("price") / col("area")).cast("double"))
    

In [12]:
cols_to_add = ["Jacuzzi", "Garden", "Balcony", "Pool", "Parking", "Gym", "Maids_Quarters", "Spa"]

# Add each column with value 0
for col in cols_to_add:
    df = df.withColumn(col, lit(0))

In [13]:
from pyspark.sql.functions import col, regexp_replace
df=df.withColumn("title", col("description"))

In [14]:
from pyspark.sql.functions import col, regexp_replace

df = (
    df
    # Fix common encoding issues like â€“
    .withColumn("title", regexp_replace(col("title"), "â€“", "-"))
    .withColumn("title", regexp_replace(col("title"), "Ù", "ا"))  # example for Arabic letters
    .withColumn("title", regexp_replace(col("title"), "Ø", "ف"))  # example
    .withColumn("title", regexp_replace(col("title"), "[\uFFFD]", ""))  # replacement char
    .withColumn("title", regexp_replace(col("title"), "[^\x00-\x7F\u0600-\u06FF\s0-9a-zA-Z\-]", ""))  # remove other weird chars
    # Trim leading/trailing spaces
    .withColumn("title", regexp_replace(col("title"), "^\s+|\s+$", ""))
    # Replicate title into description
    .withColumn("description", col("title"))
)

# Show first 10 cleaned titles
rows = df.select("title").take(20)
for num, row in enumerate(rows):
    print(num, row["title"])


0 A stunning apartment for sale, 205 m in the most upscale areas of the buildings in the city of Al-Shorouk, with a distinctive location directly on the Suez road.
1 An opportunity to own a 147m apartment in the first Smart City in Egypt, New Heliopolis, overlooking Wide Garden with payment facilitation for up to 13 years
2 Apartment 3, immediate delivery, luxurious finishing, for sale in SODIC Villette, New Cairo, near Palm Hills Compound
3 Apartment for sale in Privado Compound, with a garden view and lakes, area of 178 square meters, close to services
4 Penthouse for sale 237 sqm immediate receipt in Azad On 90th Street Fifth Settlement
5 Fully Finished 3BR Apartment + Maid Room in Hyde Park  Prime Location on Main Road
6 Open View Apartment | Green Spaces | in Sodic Eastown  compound
7 Duplexfor sale, ready to move , in installments, in Mountain View 1 Compound, Fifth Settlement, in front of the Public Prosecutor
8 شقه لقطه للبيع الزيتون غربيه
9 For sale  Ground apartment with gard

In [15]:
rows = df.select("description").take(20)
for num, row in enumerate(rows):
    print(num, row["description"])

0 A stunning apartment for sale, 205 m in the most upscale areas of the buildings in the city of Al-Shorouk, with a distinctive location directly on the Suez road.
1 An opportunity to own a 147m apartment in the first Smart City in Egypt, New Heliopolis, overlooking Wide Garden with payment facilitation for up to 13 years
2 Apartment 3, immediate delivery, luxurious finishing, for sale in SODIC Villette, New Cairo, near Palm Hills Compound
3 Apartment for sale in Privado Compound, with a garden view and lakes, area of 178 square meters, close to services
4 Penthouse for sale 237 sqm immediate receipt in Azad On 90th Street Fifth Settlement
5 Fully Finished 3BR Apartment + Maid Room in Hyde Park  Prime Location on Main Road
6 Open View Apartment | Green Spaces | in Sodic Eastown  compound
7 Duplexfor sale, ready to move , in installments, in Mountain View 1 Compound, Fifth Settlement, in front of the Public Prosecutor
8 شقه لقطه للبيع الزيتون غربيه
9 For sale  Ground apartment with gard

In [16]:
from pyspark.sql.functions import when, col, lower

df = df.withColumn("Pool", 
    when(lower(col("title")).contains("pool"), 1)
    .when(lower(col("title")).contains("swimming"), 1)
    .otherwise(0)
)

df = df.withColumn("Gym", 
    when(lower(col("title")).contains("gym"), 1)
    .when(lower(col("title")).contains("fitness"), 1)
    .when(lower(col("title")).contains("workout"), 1)
    .otherwise(0)
)

df = df.withColumn("Garden", 
    when(lower(col("title")).contains("garden"), 1)
    .when(lower(col("title")).contains("gardening"), 1)
    .when(lower(col("title")).contains("lawn"), 1)
    .otherwise(0)
)

df = df.withColumn("Parking", 
    when(lower(col("title")).contains("parking"), 1)
    .when(lower(col("title")).contains("garage"), 1)
    .when(lower(col("title")).contains("carport"), 1)
    .otherwise(0)
)

df = df.withColumn("Maids_Quarters", 
    when(lower(col("title")).contains("maids"), 1)
    .when(lower(col("title")).contains("servant"), 1)
    .when(lower(col("title")).contains("staff"), 1)
    .otherwise(0)
)

df = df.withColumn("Jacuzzi",
    when(lower(col("title")).contains("jacuzzi"), 1)
    .otherwise(0)
)

df = df.withColumn("Balcony",
    when(lower(col("title")).contains("balcony"), 1)
    .otherwise(0)
)

df = df.withColumn("Spa",
    when(lower(col("title")).contains("spa"), 1)
    .otherwise(0)
)

In [17]:
amenities_list = ["Jacuzzi", "Garden", "Balcony", "Pool", "Parking", "Gym", "Maids_Quarters", "Spa"]

for amenity in amenities_list:
    count = df.filter(col(amenity) == 1).count()
    print(f"{amenity}: {count}")

Jacuzzi: 1
Garden: 484
Balcony: 2
Pool: 55
Parking: 29
Gym: 1
Maids_Quarters: 2
Spa: 36


In [18]:
df.groupBy("location").count().show()

+--------------------+-----+
|            location|count|
+--------------------+-----+
|El Banafseg 2, 1s...|    2|
|Galleria Moon Val...|   29|
|New Andalous, 5th...|    4|
|El Yasmeen 2, 1st...|    3|
|Jewar Compound, 6...|    3|
|Trio Gardens Comp...|   14|
|Baron City, Katam...|    1|
|El Banafseg 7, 1s...|    4|
|IL Bosco, New Cap...|    2|
|El Lotus, 5th Set...|   48|
|Midtown Condo, Ne...|    1|
|The View Compound...|    1|
|North Rehab, 1st ...|    1|
|Banafsag Omarat, ...|   17|
|Green Hills, Nasr...|    1|
|Katameya Heights ...|    1|
|Armonia, New Capi...|    3|
|El Banafseg 9, 1s...|    1|
|Al Burouj, Shorou...|   25|
|      Shubra, Cairo•|    2|
+--------------------+-----+
only showing top 20 rows



In [19]:
from pyspark.sql.functions import col, lower, when

df = df.withColumn(
    "property_type",
    when(
        lower(col("title")).like("%apartment%") | lower(col("title")).like("%apt%") | lower(col("title")).contains("apartm") | col("title").contains("شقة")| col("title").contains("شقه"),
        "Apartment"
    )
    .when(
        lower(col("title")).like("%townhouse%") | lower(col("title")).contains("town") | lower(col("title")).like("%town house%") | col("title").contains("تاون هاوس"),
        "Townhouse"
    )
    .when(
        lower(col("title")).like("%twin house%") | lower(col("title")).like("%twin%") | col("title").contains("توأم"),
        "Twin House"
    )
    .when(
        lower(col("title")).like("%villa%") | col("title").contains("فيلا"),
        "Villa"
    )
    .when(lower(col("title")).like("%ivilla%"), "iVilla")
    .when(lower(col("title")).like("%hotel apartment%"), "Hotel Apartment")
    .when(lower(col("title")).like("%penthouse%") | col("title").contains("بنتهاوس"), "Penthouse")
    .when(lower(col("title")).like("%land%") | col("title").contains("أرض"), "Land")
    .when(lower(col("title")).like("%chalet%") | lower(col("title")).like("%chaleat%") | col("title").contains("شاليه"), "Chalet")
    .when(lower(col("title")).like("%palace%") | col("title").contains("قصر"), "Palace")
    .when(lower(col("title")).like("%mansion%") | col("title").contains("مانشن"), "Mansion")
    .when(lower(col("title")).like("%duplex%") | lower(col("title")).like("%dublex%") | col("title").contains("دوبلكس")|lower(col("title")).like("%duplix%") , "Duplex")
    .when(lower(col("title")).like("%bungalow%") | col("title").contains("بنغل"), "Bungalow")
    .when(lower(col("title")).like("%studio%") | col("title").contains("ستوديو"), "Studio")
    .when(lower(col("title")).like("%roof%") | col("title").contains("روف"), "Roof")
    .when(lower(col("title")).like("%triplex%") | col("title").contains("تريبلكس"), "Triplex")
    .when(lower(col("title")).like("%standalone%") | lower(col("title")).like("%stand-alone%") | lower(col("title")).contains("stand") | col("title").contains("ستندالون"), "Standalone")
    .otherwise("Other")
)



In [20]:
property_type_counts = df.groupBy("property_type").count().orderBy("count", ascending=False)
property_type_counts.show(truncate=False)

+-------------+-----+
|property_type|count|
+-------------+-----+
|Apartment    |3672 |
|Other        |275  |
|Duplex       |149  |
|Penthouse    |80   |
|Villa        |53   |
|Studio       |51   |
|Townhouse    |24   |
|Roof         |14   |
|Land         |9    |
|Palace       |2    |
|Standalone   |1    |
|Triplex      |1    |
+-------------+-----+



In [21]:
df.filter(col("property_type") == "Other").show(12)

+--------------------+--------------------+---------+--------------------+--------+---------+-----+--------------+------------------+-------+------+-------+----+-------+---+--------------+---+--------------------+-------------+
|         description|                link|    price|            location|bedrooms|bathrooms| area|        source|     price_per_sqm|Jacuzzi|Garden|Balcony|Pool|Parking|Gym|Maids_Quarters|Spa|               title|property_type|
+--------------------+--------------------+---------+--------------------+--------+---------+-----+--------------+------------------+-------+------+-------+----+-------+---+--------------+---+--------------------+-------------+
|For sale in Al-Ri...|https://www.dubiz...|6900000.0|Rehab City Compou...|       3|        3|113.0|dubbizle_Cairo| 61061.94690265487|      0|     1|      0|   0|      0|  0|             0|  0|For sale in Al-Ri...|        Other|
|New Egypt, on Al-...|https://www.dubiz...|4350000.0|  Heliopolis, Cairo•|       2|     

In [22]:
df = df.withColumn("longitude", lit(0).cast("double"))
df = df.withColumn("latitude", lit(0).cast("double"))

In [23]:
df.show(3)

+--------------------+--------------------+---------+--------------------+--------+---------+-----+--------------+-----------------+-------+------+-------+----+-------+---+--------------+---+--------------------+-------------+---------+--------+
|         description|                link|    price|            location|bedrooms|bathrooms| area|        source|    price_per_sqm|Jacuzzi|Garden|Balcony|Pool|Parking|Gym|Maids_Quarters|Spa|               title|property_type|longitude|latitude|
+--------------------+--------------------+---------+--------------------+--------+---------+-----+--------------+-----------------+-------+------+-------+----+-------+---+--------------+---+--------------------+-------------+---------+--------+
|A stunning apartm...|https://www.dubiz...|3075000.0|Shorouk City, Cairo•|       3|        2|205.0|dubbizle_Cairo|          15000.0|      0|     0|      0|   0|      0|  0|             0|  0|A stunning apartm...|    Apartment|      0.0|     0.0|
|An opportunity 

In [27]:
df.select("location").show(15)

+--------------------+
|            location|
+--------------------+
|Shorouk City, Cairo•|
|Noor City, New Ca...|
|Villette Compound...|
|  Privado, Madinaty•|
|AZAD Compound, 5t...|
|Hyde Park New Cai...|
|Eastown Compound,...|
|Mountain View 1 C...|
|Hadayeq El Zeitou...|
|Wesal, Shorouk City•|
|Sky Condos Sodic ...|
|Monte Napoleon, M...|
|Maadi View, Shoro...|
|Stone Residence C...|
|Sodic East, New H...|
+--------------------+
only showing top 15 rows



In [25]:
df.show(15)

+--------------------+--------------------+---------+--------------------+--------+---------+-----+--------------+------------------+-------+------+-------+----+-------+---+--------------+---+--------------------+-------------+---------+--------+
|         description|                link|    price|            location|bedrooms|bathrooms| area|        source|     price_per_sqm|Jacuzzi|Garden|Balcony|Pool|Parking|Gym|Maids_Quarters|Spa|               title|property_type|longitude|latitude|
+--------------------+--------------------+---------+--------------------+--------+---------+-----+--------------+------------------+-------+------+-------+----+-------+---+--------------+---+--------------------+-------------+---------+--------+
|A stunning apartm...|https://www.dubiz...|3075000.0|Shorouk City, Cairo•|       3|        2|205.0|dubbizle_Cairo|           15000.0|      0|     0|      0|   0|      0|  0|             0|  0|A stunning apartm...|    Apartment|      0.0|     0.0|
|An opportun

In [25]:
df.select(
    [sum(when(col(c).isNull(), 1).otherwise(0)).alias(c)
     for c in df.columns]
).show()
distinct = df.distinct().count()
count = df.count()
duplicate = count - distinct
print(f"Total Rows: {count}")
print(f"Distinct Rows: {distinct}")
print(f"Duplicate Rows Found: {duplicate}")

+-----------+----+-----+--------+--------+---------+----+------+-------------+-------+------+-------+----+-------+---+--------------+---+-----+-------------+---------+--------+
|description|link|price|location|bedrooms|bathrooms|area|source|price_per_sqm|Jacuzzi|Garden|Balcony|Pool|Parking|Gym|Maids_Quarters|Spa|title|property_type|longitude|latitude|
+-----------+----+-----+--------+--------+---------+----+------+-------------+-------+------+-------+----+-------+---+--------------+---+-----+-------------+---------+--------+
|          0|   0|    0|       0|       0|        0|   0|     0|            0|      0|     0|      0|   0|      0|  0|             0|  0|    0|            0|        0|       0|
+-----------+----+-----+--------+--------+---------+----+------+-------------+-------+------+-------+----+-------+---+--------------+---+-----+-------------+---------+--------+

Total Rows: 4331
Distinct Rows: 4331
Duplicate Rows Found: 0


In [None]:
df.write.mode("overwrite").parquet("hdfs://namenode:9000/datalake/gold/dubbizle_gold")

In [28]:
df.write.mode("overwrite").csv("/data/df_all.csv")