In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [2]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Silver_transformation_fazwaz")
    .getOrCreate()
)

In [None]:
# Read the parquet dataset at the silver-layer path, then preview three rows
# and print the schema.
silver_path = "hdfs://namenode:9000/datalake/silver/cleaned_fazwaz"
df = spark.read.parquet(silver_path)
df.show(n=3)
df.printSchema()

+--------------------+------+--------------------+--------+---------+-----+-------------+--------------------+-------------+--------------------+-------+------+-------+----+-------+---+--------------+---+
|                name| price|               about|bedrooms|bathrooms| size|property_type|            location|price_per_sqm|                link|Jacuzzi|Garden|Balcony|Pool|Parking|Gym|Maids_Quarters|Spa|
+--------------------+------+--------------------+--------+---------+-----+-------------+--------------------+-------------+--------------------+-------+------+-------+----+-------+---+--------------+---+
|6 Bedroom Villa f...|1.96E8|This property is ...|       6|        6|830.0|        Villa|     Al Rehab, Cairo|     236000.0|https://www.fazwa...|      0|     0|      0|   0|      0|  0|             0|  0|
|3 Bedroom Villa f...|1.08E8|This property is ...|       3|        3|360.0|        Villa|     Al Rehab, Cairo|     300000.0|https://www.fazwa...|      0|     1|      1|   0|      1

In [4]:
from pyspark.sql.functions import col,lit,regexp_replace

# Tag every row with a constant `source` value and copy `name` into `title`.
df = (
    df
    .withColumn("source", lit("fazwaz"))
    .withColumn("title", col("name"))
)

# The former `withColumnRenamed("size_sqm", "area")` call was removed from this
# cell: the loaded schema has a `size` column, not `size_sqm`, and
# withColumnRenamed silently does nothing when the source column is missing.
# The `size` -> `area` rename is performed in a later cell.
    

In [5]:
# Count rows per distinct `location` value and display the first rows.
location_counts = df.groupBy("location").count()
location_counts.show()

+--------------------+-----+
|            location|count|
+--------------------+-----+
|6th District, New...|   39|
|El Yasmeen, New C...|    6|
|El Shorouk Compou...|   59|
|Nasr City,Cairo, ...|    4|
|New Capital City,...|    5|
|New Cairo City,Ca...|   10|
|Mostakbal City Co...|   37|
| 5th District, Cairo|    3|
|El Katameya, New ...|    6|
|    Al Narges, Cairo|    3|
|El Shorouk Compou...|   37|
|The 5th Settlemen...| 2341|
|New Capital Compo...|  142|
|North Investors A...|   21|
|The 1st Settlemen...|   46|
|New Capital Compo...|   28|
|Zahraa El Maadi, ...|    4|
|5th District, Sho...|    4|
|El Patio, Shorouk...|    1|
|Mostakbal City Co...|   49|
+--------------------+-----+
only showing top 20 rows



In [6]:
from pyspark.sql.functions import regexp_replace, col

# Clean the `title` column step by step.
# Regex patterns that contain backslashes are written as raw strings so that
# escapes such as \s, \-, \xNN and \uNNNN are handed to the regex engine
# verbatim instead of relying on Python passing unknown escapes through
# (which emits a SyntaxWarning/DeprecationWarning on recent Python versions).
df = (
    df
    # Fix common encoding issues like â€“
    .withColumn("title", regexp_replace(col("title"), "â€“", "-"))
    # NOTE(review): the next two steps replace a single character with a fixed
    # Arabic letter and were marked as "example" mappings. They presumably do
    # not restore mis-decoded Arabic text correctly — confirm before relying
    # on Arabic titles.
    .withColumn("title", regexp_replace(col("title"), "Ù", "ا"))
    .withColumn("title", regexp_replace(col("title"), "Ø", "ف"))
    # Drop the Unicode replacement character.
    .withColumn("title", regexp_replace(col("title"), r"[\uFFFD]", ""))
    # Remove every character outside ASCII and the U+0600-U+06FF range.
    .withColumn("title", regexp_replace(col("title"), r"[^\x00-\x7F\u0600-\u06FF\s0-9a-zA-Z\-]", ""))
    # Trim leading/trailing whitespace.
    .withColumn("title", regexp_replace(col("title"), r"^\s+|\s+$", ""))
)

# Show the first 10 cleaned titles, one per line with its index.
rows = df.select("title").take(10)
for num, row in enumerate(rows):
    print(num, row["title"])


0 6 Bedroom Villa for sale at El Rehab Extension
1 3 Bedroom Villa for sale at El Rehab Extension
2 2 Bedroom Apartment for sale at El Rehab Extension
3 3 Bedroom Apartment for sale at Mivida
4 4 Bedroom Townhouse for sale at Midtown Sky
5 3 Bedroom Penthouse for sale at Villette
6 4 Bedroom Villa for sale at Hyde Park
7 3 Bedroom Apartment for sale at Hyde Park
8 7 Bedroom Villa for sale at Azzar
9 3 Bedroom Apartment for sale at Mivida


In [7]:
# Expose `about` under the name `description`, then discard the `name` column.
df = (
    df
    .withColumnRenamed("about", "description")
    .drop("name")
)


In [8]:
# Fetch up to 30 (location, count) rows to the driver and print each Row in
# full, so long location strings are not truncated as they are by show().
location_rows = df.groupBy("location").count().take(30)

for location_row in location_rows:
    print(location_row)

Row(location='6th District, New Heliopolis,Cairo', count=39)
Row(location='El Yasmeen, New Cairo City,Cairo', count=6)
Row(location='El Shorouk Compounds, Shorouk City,Cairo', count=59)
Row(location='Nasr City,Cairo, Egypt', count=4)
Row(location='New Capital City,Cairo, Egypt', count=5)
Row(location='New Cairo City,Cairo, Egypt', count=10)
Row(location='Mostakbal City Compounds, Cairo', count=37)
Row(location='5th District, Cairo', count=3)
Row(location='El Katameya, New Cairo City,Cairo', count=6)
Row(location='Al Narges, Cairo', count=3)
Row(location='El Shorouk Compounds, Cairo', count=37)
Row(location='The 5th Settlement, New Cairo City,Cairo', count=2341)
Row(location='New Capital Compounds, New Capital City,Cairo', count=142)
Row(location='North Investors Area, Cairo', count=21)
Row(location='The 1st Settlement, Cairo', count=46)
Row(location='New Capital Compounds, Cairo', count=28)
Row(location='Zahraa El Maadi, Hay El Maadi,Cairo', count=4)
Row(location='5th District, Shorouk

In [9]:
# Preview the first three rows of the current DataFrame.
df.show(n=3)

+------+--------------------+--------+---------+-----+-------------+--------------------+-------------+--------------------+-------+------+-------+----+-------+---+--------------+---+------+--------------------+
| price|         description|bedrooms|bathrooms| size|property_type|            location|price_per_sqm|                link|Jacuzzi|Garden|Balcony|Pool|Parking|Gym|Maids_Quarters|Spa|source|               title|
+------+--------------------+--------+---------+-----+-------------+--------------------+-------------+--------------------+-------+------+-------+----+-------+---+--------------+---+------+--------------------+
|1.96E8|This property is ...|       6|        6|830.0|        Villa|     Al Rehab, Cairo|     236000.0|https://www.fazwa...|      0|     0|      0|   0|      0|  0|             0|  0|fazwaz|6 Bedroom Villa f...|
|1.08E8|This property is ...|       3|        3|360.0|        Villa|     Al Rehab, Cairo|     300000.0|https://www.fazwa...|      0|     1|      1|   0|

In [10]:
# Print the current schema (column names, types, nullability).
df.printSchema()

root
 |-- price: double (nullable = true)
 |-- description: string (nullable = true)
 |-- bedrooms: integer (nullable = true)
 |-- bathrooms: integer (nullable = true)
 |-- size: double (nullable = true)
 |-- property_type: string (nullable = true)
 |-- location: string (nullable = true)
 |-- price_per_sqm: double (nullable = true)
 |-- link: string (nullable = true)
 |-- Jacuzzi: integer (nullable = true)
 |-- Garden: integer (nullable = true)
 |-- Balcony: integer (nullable = true)
 |-- Pool: integer (nullable = true)
 |-- Parking: integer (nullable = true)
 |-- Gym: integer (nullable = true)
 |-- Maids_Quarters: integer (nullable = true)
 |-- Spa: integer (nullable = true)
 |-- source: string (nullable = false)
 |-- title: string (nullable = true)



In [11]:
# Discard rows whose `price_per_sqm` is null.
df = df.na.drop(subset=["price_per_sqm"])

In [12]:
# Append `longitude` and `latitude` columns (in that order), both set to the
# constant 0 cast to double.
# NOTE(review): 0.0 is a real coordinate value; if these columns are
# placeholders for unknown positions, null may be more appropriate — confirm
# with downstream consumers.
for coord_name in ("longitude", "latitude"):
    df = df.withColumn(coord_name, lit(0).cast("double"))


In [13]:
# Rename the `size` column to `area`.
df = df.withColumnRenamed("size", "area")

In [14]:
# Use an explicit namespace for Spark SQL functions: the bare names `sum` and
# `count` collide with Python builtins / notebook variables, and assigning to
# `count` would rebind the star-imported pyspark.sql.functions.count.
from pyspark.sql import functions as F

# One output column per input column, holding that column's number of nulls.
null_counts = [
    F.sum(F.when(F.col(c).isNull(), 1).otherwise(0)).alias(c)
    for c in df.columns
]
df.select(null_counts).show()

# Compare total and distinct row counts to detect fully duplicated rows.
distinct_rows = df.distinct().count()
total_rows = df.count()
duplicate_rows = total_rows - distinct_rows
print(f"Total Rows: {total_rows}")
print(f"Distinct Rows: {distinct_rows}")
print(f"Duplicate Rows Found: {duplicate_rows}")

+-----+-----------+--------+---------+----+-------------+--------+-------------+----+-------+------+-------+----+-------+---+--------------+---+------+-----+---------+--------+
|price|description|bedrooms|bathrooms|area|property_type|location|price_per_sqm|link|Jacuzzi|Garden|Balcony|Pool|Parking|Gym|Maids_Quarters|Spa|source|title|longitude|latitude|
+-----+-----------+--------+---------+----+-------------+--------+-------------+----+-------+------+-------+----+-------+---+--------------+---+------+-----+---------+--------+
|    0|          0|       0|        0|   0|            0|       0|            0|   0|      0|     0|      0|   0|      0|  0|             0|  0|     0|    0|        0|       0|
+-----+-----------+--------+---------+----+-------------+--------+-------------+----+-------+------+-------+----+-------+---+--------------+---+------+-----+---------+--------+

Total Rows: 4991
Distinct Rows: 4991
Duplicate Rows Found: 0


In [None]:
# Write the DataFrame as parquet to the gold-layer path, replacing any
# existing data at that location.
gold_path = "hdfs://namenode:9000/datalake/gold/fazwaz_gold"
df.write.parquet(gold_path, mode="overwrite")