In [13]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [14]:
# Create (or reuse) the Spark session for this notebook.
# NOTE(review): the app name says "RawToBronze", but the cells below read from
# the bronze layer and write to the silver layer — confirm whether the name
# should be updated.
spark = (
    SparkSession.builder
    .appName("RawToBronze_Cleaning_bayut")
    .getOrCreate()
)

In [15]:
# Load the bronze-layer Bayut listings and preview the first rows and the schema.
bronze_path = "hdfs://namenode:9000/datalake/bronze/bayutData"
df = spark.read.parquet(bronze_path)
df.show(n=3)
df.printSchema()

+--------------------+--------------------+--------+--------+-----------+------+--------+--------+---------+----------+----------+
|               title|                 url|   price|currency|   location|region|size_sqm|bedrooms|bathrooms|  latitude| longitude|
+--------------------+--------------------+--------+--------+-----------+------+--------+--------+---------+----------+----------+
|Chalet for Sale â€“...|https://www.bayut...|26500000|     EGP|North Coast|Matruh|     130|       2|        2|30.9628937|28.7565892|
|Chalet for sale i...|https://www.bayut...|32000000|     EGP|North Coast|Matruh|     189|       3|        3|30.9628937|28.7565892|
|Apartment ready t...|https://www.bayut...|13000000|     EGP|  New Cairo| Cairo|     191|       3|        3|29.9637831|31.5383442|
+--------------------+--------------------+--------+--------+-----------+------+--------+--------+---------+----------+----------+
only showing top 3 rows

root
 |-- title: string (nullable = true)
 |-- url: stri

In [16]:
# Compare the total row count with the count of fully-distinct rows to see
# how many exact duplicates the bronze data contains.
total_rows = df.count()
distinct_rows = df.dropDuplicates().count()
duplicate_rows = total_rows - distinct_rows

print(f"Total Rows: {total_rows}")
print(f"Distinct Rows: {distinct_rows}")
print(f"Duplicate Rows Found: {duplicate_rows}")

Total Rows: 1903
Distinct Rows: 1903
Duplicate Rows Found: 0


In [17]:
# Per-column null counts in a single aggregation.
# `sum` here is the Spark SQL aggregate brought in by the star import from
# pyspark.sql.functions, not Python's builtin.
null_count_exprs = [
    sum(col(name).isNull().cast("int")).alias(name)
    for name in df.columns
]
df.select(*null_count_exprs).show()

+-----+---+-----+--------+--------+------+--------+--------+---------+--------+---------+
|title|url|price|currency|location|region|size_sqm|bedrooms|bathrooms|latitude|longitude|
+-----+---+-----+--------+--------+------+--------+--------+---------+--------+---------+
|    0|  0|    0|       0|       0|     0|       0|       0|        0|       0|        0|
+-----+---+-----+--------+--------+------+--------+--------+---------+--------+---------+



In [18]:
from functools import reduce
from operator import or_

from pyspark.sql.functions import col, lower, when

# Ordered (label, keywords) rules used to derive `property_type` from the title.
# The first rule whose keyword is a substring of the lower-cased title wins, so
# more specific labels MUST come before the generic label they contain:
#   * "hotel apartment" before "apartm"  (otherwise Hotel Apartment is unreachable)
#   * "ivilla" before "villa"            (otherwise iVilla is unreachable)
# Keywords that were strictly redundant (e.g. "apartment" next to "apartm",
# "townhouse"/"town house" next to "town", "twin house" next to "twin",
# "standalone"/"stand-alone" next to "stand") are omitted; the matched set of
# titles per rule is unchanged.
# NOTE(review): short substrings such as "apt", "town", "land" and "stand" can
# also match unrelated words (e.g. "downtown", "island", "outstanding") —
# confirm these broad matches are intended.
PROPERTY_TYPE_RULES = [
    ("Hotel Apartment", ["hotel apartment"]),
    ("Apartment", ["apartm", "apt"]),
    ("Townhouse", ["town"]),
    ("Twin House", ["twin"]),
    ("iVilla", ["ivilla"]),
    ("Villa", ["villa"]),
    ("Penthouse", ["penthouse"]),
    ("Land", ["land"]),
    ("Chalet", ["chalet", "chaleat"]),
    ("Palace", ["palace"]),
    ("Mansion", ["mansion"]),
    ("Duplex", ["duplex", "dublex"]),
    ("Bungalow", ["bungalow"]),
    ("Studio", ["studio"]),
    ("Roof", ["roof"]),
    ("Triplex", ["triplex"]),
    ("Standalone", ["stand"]),
]

title_lc = lower(col("title"))


def _title_matches(keywords):
    """Return a boolean Column: the lower-cased title contains any of `keywords`."""
    return reduce(or_, [title_lc.contains(keyword) for keyword in keywords])


# Build the chained when(...).when(...)...otherwise("Other") expression from the
# rule table, preserving rule order.
first_label, first_keywords = PROPERTY_TYPE_RULES[0]
property_type_expr = when(_title_matches(first_keywords), first_label)
for label, keywords in PROPERTY_TYPE_RULES[1:]:
    property_type_expr = property_type_expr.when(_title_matches(keywords), label)

# Titles matching no rule (including null titles) fall through to "Other".
df = df.withColumn("property_type", property_type_expr.otherwise("Other"))



In [19]:
import os

# Print the kernel's current working directory.
print(os.getcwd())

# Export the listings whose title matched none of the property-type keywords
# so they can be inspected outside Spark.
is_other = col("property_type") == "Other"
other_apartments = df.filter(is_other)
# other_apartments.show(5)
other_apartments_pdf = other_apartments.toPandas()
other_apartments_pdf.to_csv("/data/other_apartmentss.csv", index=False)



/home/jovyan


In [20]:
# Number of listings that fell through to the "Other" property type.
is_other = col("property_type") == "Other"
df.filter(is_other).count()

97

In [21]:
from pyspark.sql.functions import col, count, trim

# 1. Drop exact duplicate rows.
df = df.dropDuplicates()

# 2. Trim surrounding whitespace from string columns only, in one projection.
#    Non-string columns are passed through untouched so they keep their type
#    (applying trim() to them would implicitly convert them to strings).
string_columns = {name for name, dtype in df.dtypes if dtype == "string"}
df = df.select(
    [
        trim(col(name)).alias(name) if name in string_columns else col(name)
        for name in df.columns
    ]
)

# 3. Drop columns that are entirely null.
#    count(column) counts only non-null values, so a single aggregation gives
#    the non-null count of every column at once instead of one job per column.
non_null_counts = (
    df.select([count(col(name)).alias(name) for name in df.columns])
    .first()
    .asDict()
)
non_empty_cols = [name for name in df.columns if non_null_counts[name] > 0]
df = df.select(non_empty_cols)

# 4. Drop rows that are missing the title or the derived property type.
df = df.na.drop(subset=["title", "property_type"])

In [22]:
# Remove the currency column from the working frame.
# NOTE(review): the preview rows above all show EGP; confirm the column holds a
# single currency before relying on `price` without it.
columns_to_drop = ["currency"]
df = df.drop(*columns_to_drop)

In [23]:
from pyspark.sql.functions import col, regexp_replace, trim, lower
from pyspark.sql.types import DoubleType, IntegerType

# Target type for each numeric column after stripping non-numeric characters.
# NOTE(review): IntegerType tops out at 2,147,483,647; a larger price would
# presumably fail the cast and the row would then be removed by the null-drop
# below — confirm this range is sufficient for the data.
numeric_cols = {
    "price": IntegerType(),
    "size_sqm": DoubleType(),
    "bedrooms": IntegerType(),
    "bathrooms": IntegerType(),
    "latitude": DoubleType(),
    "longitude": DoubleType()
}

# String columns to normalise. All are trimmed; all except the case-sensitive
# ones are also lower-cased for uniformity.
string_cols = ["title", "url",  "location", "region", "property_type"]
# URL paths are case-sensitive, so lower-casing a URL can turn a working link
# into a broken one; only whitespace is removed from these columns.
case_sensitive_cols = {"url"}


def _clean_column(name):
    """Return the cleaned Column expression for `name`, aliased back to `name`."""
    if name in numeric_cols:
        # Keep only digits, '.' and '-' and cast to the target type; values
        # that still are not valid numbers become null.
        digits_only = regexp_replace(col(name), "[^0-9.-]", "")
        return digits_only.cast(numeric_cols[name]).alias(name)
    if name in string_cols:
        trimmed = trim(col(name))
        if name in case_sensitive_cols:
            return trimmed.alias(name)
        return lower(trimmed).alias(name)
    return col(name)


# Apply every column clean-up in a single projection.
df = df.select([_clean_column(name) for name in df.columns])

# Drop rows with invalid critical data (missing property_type, price or location).
df = df.na.drop(subset=["property_type", "price", "location"])

# Normalisation can make previously distinct rows identical, so de-duplicate again.
df = df.dropDuplicates()

# Final column order for the silver table.
df = df.select(
    "title", "url", "price", "location", "region",
    "size_sqm", "bedrooms", "bathrooms", "latitude", "longitude", "property_type"
)

df.printSchema()
df.show(5)


root
 |-- title: string (nullable = true)
 |-- url: string (nullable = true)
 |-- price: integer (nullable = true)
 |-- location: string (nullable = true)
 |-- region: string (nullable = true)
 |-- size_sqm: double (nullable = true)
 |-- bedrooms: integer (nullable = true)
 |-- bathrooms: integer (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- property_type: string (nullable = false)

+--------------------+--------------------+--------+--------------+------+--------+--------+---------+---------------+---------------+-------------+
|               title|                 url|   price|      location|region|size_sqm|bedrooms|bathrooms|       latitude|      longitude|property_type|
+--------------------+--------------------+--------+--------------+------+--------+--------+---------+---------------+---------------+-------------+
|for sale chalet w...|https://www.bayut...| 6150000|    ain sukhna|  suez|   160.0|       4|        3|      2

In [24]:
# Persist the cleaned listings to the silver layer, replacing any previous run's output.
silver_path = "hdfs://namenode:9000/datalake/silver/cleaned_bayut"
df.write.parquet(silver_path, mode="overwrite")

In [25]:
# hdfs dfs -rm /datalake/silver/cleaned_bayut/_SUCCESS