In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [2]:
# Create (or reuse) the Spark session shared by every cell below.
# NOTE(review): the app name says "RawToBronze", but this notebook reads from
# the bronze layer and writes to the silver layer — confirm the intended name.
spark = (
    SparkSession.builder
    .appName("RawToBronze_Cleaning_dubbizle")
    .getOrCreate()
)

In [3]:
# Load the Cairo listings from the bronze layer, then preview a few rows
# and the schema (every column is read as a string at this stage).
cairo_bronze_path = "hdfs://namenode:9000/datalake/bronze/dubizzle_all_listings_cairo"
df_cairo = spark.read.parquet(cairo_bronze_path)

df_cairo.show(n=3)
df_cairo.printSchema()

+--------------------+--------------------+--------------+--------------------+--------+---------+-------+
|         description|                link|         price|            location|bedrooms|bathrooms|   area|
+--------------------+--------------------+--------------+--------------------+--------+---------+-------+
|Appartment for sa...|https://www.dubiz...|EGP 12,200,000|Swan Lake Residen...|       2|        2|127 SQM|
|A fully finished ...|https://www.dubiz...| EGP 8,350,000|Sodic East, New H...|       3|        2|160 SQM|
|City Gate – Diar ...|https://www.dubiz...|EGP 13,790,000|City Gate Compoun...|       3|        3|210 SQM|
+--------------------+--------------------+--------------+--------------------+--------+---------+-------+
only showing top 3 rows

root
 |-- description: string (nullable = true)
 |-- link: string (nullable = true)
 |-- price: string (nullable = true)
 |-- location: string (nullable = true)
 |-- bedrooms: string (nullable = true)
 |-- bathrooms: string (n

In [4]:
# Load the Alexandria listings from the bronze layer, then preview a few rows
# and the schema (every column is read as a string at this stage).
alex_bronze_path = "hdfs://namenode:9000/datalake/bronze/dubbizle_alexandria"
df_alex = spark.read.parquet(alex_bronze_path)

df_alex.show(n=3)
df_alex.printSchema()

+--------------------+--------------------+--------------+--------------------+--------+---------+-------+
|         description|                link|         price|            location|bedrooms|bathrooms|   area|
+--------------------+--------------------+--------------+--------------------+--------+---------+-------+
|Apartment for sal...|https://www.dubiz...|EGP 10,600,000|Palm Hills Alexan...|       2|        3|120 SQM|
|Apartment for sal...|https://www.dubiz...| EGP 6,300,000|  Smoha, Alexandria•|       3|        2|155 SQM|
|Own your apartmen...|https://www.dubiz...| EGP 9,360,000|      Murooj, Smoha•|       3|        3|195 SQM|
+--------------------+--------------------+--------------+--------------------+--------+---------+-------+
only showing top 3 rows

root
 |-- description: string (nullable = true)
 |-- link: string (nullable = true)
 |-- price: string (nullable = true)
 |-- location: string (nullable = true)
 |-- bedrooms: string (nullable = true)
 |-- bathrooms: string (n

Remove duplicate rows

In [5]:
# Report how many exact-duplicate rows each dataset contains
# (Alexandria first, then Cairo). A row counts as a duplicate only when
# every column matches another row.
for listings in (df_alex, df_cairo):
    total_rows = listings.count()
    distinct_rows = listings.distinct().count()

    print(f"Total Rows: {total_rows}")
    print(f"Distinct Rows: {distinct_rows}")
    print(f"Duplicate Rows Found: {total_rows - distinct_rows}")

Total Rows: 5850
Distinct Rows: 1735
Duplicate Rows Found: 4115
Total Rows: 4906
Distinct Rows: 2612
Duplicate Rows Found: 2294


In [6]:
# Keep a single copy of each row; rows are compared across all columns.
df_alex, df_cairo = df_alex.dropDuplicates(), df_cairo.dropDuplicates()

In [7]:
# Re-run the duplicate report after de-duplication (Alexandria, then Cairo);
# total and distinct counts are expected to match now.
for deduped in (df_alex, df_cairo):
    total_rows, distinct_rows = deduped.count(), deduped.distinct().count()

    print(f"Total Rows: {total_rows}")
    print(f"Distinct Rows: {distinct_rows}")
    print(f"Duplicate Rows Found: {total_rows - distinct_rows}")

Total Rows: 1735
Distinct Rows: 1735
Duplicate Rows Found: 0
Total Rows: 2612
Distinct Rows: 2612
Duplicate Rows Found: 0


In [8]:
# Count NULLs per column for each dataset (Alexandria first, then Cairo).
# Each frame is checked against its OWN column list: iterating over the other
# frame's columns would skip, or fail on, any column the two do not share.
for listings in (df_alex, df_cairo):
    listings.select(
        [
            # `sum` is pyspark.sql.functions.sum (star-imported), not the builtin.
            sum(when(col(c).isNull(), 1).otherwise(0)).alias(c)
            for c in listings.columns
        ]
    ).show()

+-----------+----+-----+--------+--------+---------+----+
|description|link|price|location|bedrooms|bathrooms|area|
+-----------+----+-----+--------+--------+---------+----+
|          0|   0|    0|       0|       0|        0|   0|
+-----------+----+-----+--------+--------+---------+----+

+-----------+----+-----+--------+--------+---------+----+
|description|link|price|location|bedrooms|bathrooms|area|
+-----------+----+-----+--------+--------+---------+----+
|          0|   1|    1|       1|       1|        1|   1|
+-----------+----+-----+--------+--------+---------+----+



In [9]:
# Discard every row that contains a NULL in any column.
df_cairo = df_cairo.na.drop()
df_alex = df_alex.na.drop()

In [10]:
# Verify that no NULLs remain after dropping incomplete rows
# (Alexandria first, then Cairo). Each frame is checked against its OWN
# column list rather than the other frame's.
for listings in (df_alex, df_cairo):
    listings.select(
        [
            # `sum` is pyspark.sql.functions.sum (star-imported), not the builtin.
            sum(when(col(c).isNull(), 1).otherwise(0)).alias(c)
            for c in listings.columns
        ]
    ).show()

+-----------+----+-----+--------+--------+---------+----+
|description|link|price|location|bedrooms|bathrooms|area|
+-----------+----+-----+--------+--------+---------+----+
|          0|   0|    0|       0|       0|        0|   0|
+-----------+----+-----+--------+--------+---------+----+

+-----------+----+-----+--------+--------+---------+----+
|description|link|price|location|bedrooms|bathrooms|area|
+-----------+----+-----+--------+--------+---------+----+
|          0|   0|    0|       0|       0|        0|   0|
+-----------+----+-----+--------+--------+---------+----+



Cleaning the price and area columns

In [11]:
# Strip everything except digits and dots from `price`
# (e.g. "EGP 12,200,000" -> "12200000"). The column is still a string here.
non_numeric = "[^0-9\\.]"

df_alex = df_alex.withColumn("price", regexp_replace(col("price"), non_numeric, ""))
df_alex.select("price").show(5)

df_cairo = df_cairo.withColumn("price", regexp_replace(col("price"), non_numeric, ""))
df_cairo.select("price").show(5)
df_cairo.printSchema()

+-------+
|  price|
+-------+
|7000000|
|5700000|
|9800000|
|8300000|
|3700000|
+-------+
only showing top 5 rows

+--------+
|   price|
+--------+
| 3075000|
| 8300000|
|11400000|
|14000000|
| 8370000|
+--------+
only showing top 5 rows

root
 |-- description: string (nullable = true)
 |-- link: string (nullable = true)
 |-- price: string (nullable = true)
 |-- location: string (nullable = true)
 |-- bedrooms: string (nullable = true)
 |-- bathrooms: string (nullable = true)
 |-- area: string (nullable = true)



In [12]:
# Reduce `area` to its bare number, e.g. "127 SQM" -> "127".
# The pattern matches the unit case-insensitively and also removes thousands
# separators, so values such as "1,200 Sqm" become "1200" instead of turning
# into NULL when the column is later cast to double.
area_noise = "(?i)sqm|[\\s,]"

df_alex = df_alex.withColumn("area", regexp_replace(col("area"), area_noise, ""))
df_alex.select("area").show(5)

df_cairo = df_cairo.withColumn("area", regexp_replace(col("area"), area_noise, ""))
df_cairo.select("area").show(5)
df_cairo.printSchema()

+----+
|area|
+----+
| 165|
| 178|
| 208|
| 175|
| 165|
+----+
only showing top 5 rows

+----+
|area|
+----+
| 205|
| 147|
| 156|
| 178|
| 237|
+----+
only showing top 5 rows

root
 |-- description: string (nullable = true)
 |-- link: string (nullable = true)
 |-- price: string (nullable = true)
 |-- location: string (nullable = true)
 |-- bedrooms: string (nullable = true)
 |-- bathrooms: string (nullable = true)
 |-- area: string (nullable = true)



In [13]:
# Trim leading/trailing whitespace from the cleaned numeric strings in both
# datasets, then preview the two columns.
for column_name in ("price", "area"):
    df_alex = df_alex.withColumn(column_name, trim(col(column_name)))
for column_name in ("price", "area"):
    df_cairo = df_cairo.withColumn(column_name, trim(col(column_name)))

df_alex.select("area", "price").show(5)
df_cairo.select("area", "price").show(5)


+----+-------+
|area|  price|
+----+-------+
| 165|7000000|
| 178|5700000|
| 208|9800000|
| 175|8300000|
| 165|3700000|
+----+-------+
only showing top 5 rows

+----+--------+
|area|   price|
+----+--------+
| 205| 3075000|
| 147| 8300000|
| 156|11400000|
| 178|14000000|
| 237| 8370000|
+----+--------+
only showing top 5 rows



Fixing data types

In [14]:
# Convert the cleaned string columns of the Alexandria data to numeric types:
# price/area -> double, bedrooms/bathrooms -> integer.
# NOTE(review): strings that cannot be parsed presumably become NULL on cast
# (default non-ANSI behaviour); no null check runs after this step — confirm.
df_alex = (
    df_alex
    .withColumn("price", col("price").cast(DoubleType()))
    .withColumn("area", col("area").cast(DoubleType()))
    .withColumn("bedrooms", col("bedrooms").cast(IntegerType()))
    .withColumn("bathrooms", col("bathrooms").cast(IntegerType()))
)

df_alex.printSchema()

root
 |-- description: string (nullable = true)
 |-- link: string (nullable = true)
 |-- price: double (nullable = true)
 |-- location: string (nullable = true)
 |-- bedrooms: integer (nullable = true)
 |-- bathrooms: integer (nullable = true)
 |-- area: double (nullable = true)



In [15]:
# Convert the cleaned string columns of the Cairo data to numeric types.
# NOTE(review): strings that cannot be parsed presumably become NULL on cast
# (default non-ANSI behaviour); no null check runs after this step — confirm.
numeric_types = {
    "price": "double",
    "area": "double",
    "bedrooms": "integer",
    "bathrooms": "integer",
}
for column_name, spark_type in numeric_types.items():
    df_cairo = df_cairo.withColumn(column_name, col(column_name).cast(spark_type))

df_cairo.printSchema()

root
 |-- description: string (nullable = true)
 |-- link: string (nullable = true)
 |-- price: double (nullable = true)
 |-- location: string (nullable = true)
 |-- bedrooms: integer (nullable = true)
 |-- bathrooms: integer (nullable = true)
 |-- area: double (nullable = true)



In [16]:
# Final sanity check of the Cairo data before writing: a few rows plus the schema.
df_cairo.show(n=3)
df_cairo.printSchema()


+--------------------+--------------------+---------+--------------------+--------+---------+-----+
|         description|                link|    price|            location|bedrooms|bathrooms| area|
+--------------------+--------------------+---------+--------------------+--------+---------+-----+
|A stunning apartm...|https://www.dubiz...|3075000.0|Shorouk City, Cairo•|       3|        2|205.0|
|An opportunity to...|https://www.dubiz...|8300000.0|Noor City, New Ca...|       3|        3|147.0|
|Apartment 3, imme...|https://www.dubiz...|   1.14E7|Villette Compound...|       3|        3|156.0|
+--------------------+--------------------+---------+--------------------+--------+---------+-----+
only showing top 3 rows

root
 |-- description: string (nullable = true)
 |-- link: string (nullable = true)
 |-- price: double (nullable = true)
 |-- location: string (nullable = true)
 |-- bedrooms: integer (nullable = true)
 |-- bathrooms: integer (nullable = true)
 |-- area: double (nullable = t

In [17]:
# Final sanity check of the Alexandria data before writing: a few rows plus the schema.
df_alex.show(n=3)
df_alex.printSchema()

+--------------------+--------------------+---------+--------------------+--------+---------+-----+
|         description|                link|    price|            location|bedrooms|bathrooms| area|
+--------------------+--------------------+---------+--------------------+--------+---------+-----+
|شقة للبيع في السر...|https://www.dubiz...|7000000.0|Sidi Beshr, Alexa...|       3|        2|165.0|
|Apartment for sal...|https://www.dubiz...|5700000.0|   Alex West, Agami•|       3|        2|178.0|
|Apartment for sal...|https://www.dubiz...|9800000.0|Sawari, Moharam Bik•|       3|        3|208.0|
+--------------------+--------------------+---------+--------------------+--------+---------+-----+
only showing top 3 rows

root
 |-- description: string (nullable = true)
 |-- link: string (nullable = true)
 |-- price: double (nullable = true)
 |-- location: string (nullable = true)
 |-- bedrooms: integer (nullable = true)
 |-- bathrooms: integer (nullable = true)
 |-- area: double (nullable = t

In [18]:
# Persist both cleaned datasets to the silver layer as Parquet, replacing any
# output left by a previous run (Alexandria first, then Cairo).
alex_silver_path = "hdfs://namenode:9000/datalake/silver/cleaned_dubbizle_alexandria"
cairo_silver_path = "hdfs://namenode:9000/datalake/silver/cleaned_dubbizle_cairo"

df_alex.write.parquet(alex_silver_path, mode="overwrite")
df_cairo.write.parquet(cairo_silver_path, mode="overwrite")

In [None]:
# Run these two commands in the Hadoop shell to remove the _SUCCESS marker files:
# hdfs dfs -rm /datalake/silver/cleaned_dubbizle_alexandria/_SUCCESS
# hdfs dfs -rm /datalake/silver/cleaned_dubbizle_cairo/_SUCCESS
