In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [5]:
# Obtain the Spark session for this job (an already-active session is reused).
# NOTE(review): the app name says "RawToBronze", but this notebook reads from
# the bronze path and writes to the silver path; confirm whether the label
# should be renamed.
spark = (
    SparkSession.builder
    .appName("RawToBronze_Cleaning_fazwaz")
    .getOrCreate()
)

In [6]:
# Load the combined Fazwaz apartments dataset from the bronze layer (parquet),
# then preview the first three rows and the inferred schema.
df = (
    spark.read
    .format("parquet")
    .load("hdfs://namenode:9000/datalake/bronze/fazwaz_apartments_allcombined")
)
df.show(n=3)
df.printSchema()

+--------+--------------------+-------------+--------------------+--------+---------+----+-------------+--------------------+-------------+--------------------+------------+-----------+--------------+---------------+--------------+-------+------+-------+
| unit_id|                name|        price|               about|bedrooms|bathrooms|size|property_type|            location|price_per_sqm|                link|Private Pool|Private Gym|Private Garden|Covered Parking|Maids Quarters|Jacuzzi|Garden|Balcony|
+--------+--------------------+-------------+--------------------+--------+---------+----+-------------+--------------------+-------------+--------------------+------------+-----------+--------------+---------------+--------------+-------+------+-------+
|U5693108|2 Bedroom Condo f...| EGP2,000,000|This property is ...|       2|        1|  86|        Condo|Nasr City,Cairo, ...|    EGP23,256|https://www.fazwa...|         N/A|        N/A|           N/A|            N/A|           N/A|    

In [7]:
# Count null entries per column: the result is a single row with one count
# for each field. `sum` here is pyspark.sql.functions.sum (pulled in by the
# star import above), not the Python builtin.
df.select(
    *(
        sum(col(field).isNull().cast("int")).alias(field)
        for field in df.columns
    )
).show()

+-------+----+-----+-----+--------+---------+----+-------------+--------+-------------+----+------------+-----------+--------------+---------------+--------------+-------+------+-------+
|unit_id|name|price|about|bedrooms|bathrooms|size|property_type|location|price_per_sqm|link|Private Pool|Private Gym|Private Garden|Covered Parking|Maids Quarters|Jacuzzi|Garden|Balcony|
+-------+----+-----+-----+--------+---------+----+-------------+--------+-------------+----+------------+-----------+--------------+---------------+--------------+-------+------+-------+
|      0|   0|    0|    0|       0|        0|   2|            0|       0|            0|   0|           0|          0|             0|              0|             0|      0|     0|      0|
+-------+----+-----+-----+--------+---------+----+-------------+--------+-------------+----+------------+-----------+--------------+---------------+--------------+-------+------+-------+



In [8]:
# Discard the rows whose `size` is missing (the null count above shows them).
df = df.dropna(how="any", subset=["size"])

In [9]:
# Re-run the per-column null count to confirm that no nulls remain after the
# rows with a missing `size` were dropped. `sum` is the Spark aggregate from
# the star import, not the Python builtin.
df.agg(
    *[
        sum(when(col(column_name).isNull(), 1).otherwise(0)).alias(column_name)
        for column_name in df.columns
    ]
).show()

+-------+----+-----+-----+--------+---------+----+-------------+--------+-------------+----+------------+-----------+--------------+---------------+--------------+-------+------+-------+
|unit_id|name|price|about|bedrooms|bathrooms|size|property_type|location|price_per_sqm|link|Private Pool|Private Gym|Private Garden|Covered Parking|Maids Quarters|Jacuzzi|Garden|Balcony|
+-------+----+-----+-----+--------+---------+----+-------------+--------+-------------+----+------------+-----------+--------------+---------------+--------------+-------+------+-------+
|      0|   0|    0|    0|       0|        0|   0|            0|       0|            0|   0|           0|          0|             0|              0|             0|      0|     0|      0|
+-------+----+-----+-----+--------+---------+----+-------------+--------+-------------+----+------------+-----------+--------------+---------------+--------------+-------+------+-------+



In [10]:
def _amenity_flag(*source_columns):
    """Build a 0/1 integer column from one or more raw amenity columns.

    The result is 1 when at least one of ``source_columns`` equals "mawgood"
    (compared case-insensitively) and 0 otherwise, including when every
    source value is null or any other text such as "N/A".
    """
    is_present = lower(col(source_columns[0])) == "mawgood"
    for extra_column in source_columns[1:]:
        is_present = is_present | (lower(col(extra_column)) == "mawgood")
    return when(is_present, 1).otherwise(0).cast("int")


# Convert the scraped text columns to numeric types and derive the amenity
# flags. Run this cell once per load: it reassigns `df`, and the "Garden" flag
# reads the raw "Garden" text column that it then replaces.
df = (
    df
    # Keep only digits and "." before casting, e.g. "EGP2,000,000" -> 2000000.0.
    .withColumn("price", regexp_replace(col("price"), "[^0-9.]", "").cast("double"))
    .withColumn("price_per_sqm", regexp_replace(col("price_per_sqm"), "[^0-9.]", "").cast("double"))
    .withColumn("size", col("size").cast("double"))
    .withColumn("bedrooms", col("bedrooms").cast("integer"))
    .withColumn("bathrooms", col("bathrooms").cast("integer"))
    .withColumn("Pool", _amenity_flag("Private Pool"))
    # The source already has a "Garden" column, and withColumn replaces it.
    # Deriving the flag from "Private Garden" alone silently discarded the
    # original "Garden" values, so both source columns are combined here.
    .withColumn("Garden", _amenity_flag("Private Garden", "Garden"))
    .withColumn("Parking", _amenity_flag("Covered Parking"))
    .withColumn("Gym", _amenity_flag("Private Gym"))
    .withColumn("Maids_Quarters", _amenity_flag("Maids Quarters"))
    # No source column is read for "Spa"; it is a constant 0
    # (presumably to match a shared target schema; TODO confirm).
    .withColumn("Spa", lit(0).cast("int"))
)


In [11]:
# Convert the remaining raw amenity columns in place: 1 when the value is
# "mawgood" (case-insensitive), 0 for anything else.
for amenity in ("Jacuzzi", "Balcony"):
    df = df.withColumn(
        amenity,
        when(lower(col(amenity)) == "mawgood", 1).otherwise(0).cast("int"),
    )

# Remove the raw text amenity columns listed below.
raw_amenity_columns = [
    "Private Pool",
    "Private Garden",
    "Covered Parking",
    "Private Gym",
    "Maids Quarters",
]
df = df.drop(*raw_amenity_columns)


In [12]:
# Remove the `unit_id` column from the working frame. The duplicate check
# further down therefore compares rows without it.
df = df.drop("unit_id")

In [13]:
# Sanity check: number of listings flagged as having maids quarters.
df.where(col("Maids_Quarters") == 1).count()

1074

In [14]:
# Preview the transformed frame: first three rows, then the column types.
df.show(n=3)
df.printSchema()

+--------------------+---------+--------------------+--------+---------+-----+-------------+--------------------+-------------+--------------------+-------+------+-------+----+-------+---+--------------+---+
|                name|    price|               about|bedrooms|bathrooms| size|property_type|            location|price_per_sqm|                link|Jacuzzi|Garden|Balcony|Pool|Parking|Gym|Maids_Quarters|Spa|
+--------------------+---------+--------------------+--------+---------+-----+-------------+--------------------+-------------+--------------------+-------+------+-------+----+-------+---+--------------+---+
|2 Bedroom Condo f...|2000000.0|This property is ...|       2|        1| 86.0|        Condo|Nasr City,Cairo, ...|      23256.0|https://www.fazwa...|      0|     0|      0|   0|      0|  0|             0|  0|
|3 Bedroom Condo f...|   1.63E7|This property is ...|       3|        1|160.0|        Condo|Nasr City,Cairo, ...|     102000.0|https://www.fazwa...|      0|     0|     

In [15]:
# Measure how many fully identical rows exist before de-duplicating.
total_rows, distinct_rows = df.count(), df.distinct().count()
print(
    f"Total Rows: {total_rows}",
    f"Distinct Rows: {distinct_rows}",
    f"Duplicate Rows Found: {total_rows - distinct_rows}",
    sep="\n",
)

# Drop exact duplicate rows, then repeat the measurement to confirm that
# none remain.
df = df.dropDuplicates()
total_rows, distinct_rows = df.count(), df.distinct().count()
print(
    f"Total Rows after removing duplicates: {total_rows}",
    f"Distinct Rows: {distinct_rows}",
    f"Duplicate Rows Found: {total_rows - distinct_rows}",
    sep="\n",
)


Total Rows: 4999
Distinct Rows: 4992
Duplicate Rows Found: 7
Total Rows after removing duplicates: 4992
Distinct Rows: 4992
Duplicate Rows Found: 0


In [16]:
# Persist the cleaned frame to the silver layer as parquet, replacing any
# output already present at this path.
df.write.parquet(
    "hdfs://namenode:9000/datalake/silver/cleaned_fazwaz",
    mode="overwrite",
)

In [None]:
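# Manual shell step (not executed by this notebook): removes the _SUCCESS
# file from the silver output directory.
# hdfs dfs -rm /datalake/silver/cleaned_fazwaz/_SUCCESS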