In [0]:
import os
from datetime import datetime

import pandas as pd
from pyspark.sql.functions import sum, col,to_date,rlike,regexp_replace,date_format,date_diff,lit

# Batch date to process, read from the "arrival_date" notebook widget.
# Expected form is yyyyMMdd (the same pattern passed to to_date below).
date_str = dbutils.widgets.get("arrival_date").strip()

# The value is interpolated into a file path below, so accept nothing but
# exactly eight ASCII digits that form a real calendar date. Failing here gives
# a clear message instead of an obscure error further down the notebook.
if len(date_str) != 8 or not (date_str.isascii() and date_str.isdigit()):
    raise ValueError(f"arrival_date must be in yyyyMMdd format, got {date_str!r}")
try:
    datetime.strptime(date_str, "%Y%m%d")
except ValueError as exc:
    raise ValueError(f"arrival_date is not a valid calendar date: {date_str!r}") from exc

# Reference date as a Spark column expression, built from the validated string.
ref_date = to_date(lit(date_str), "yyyyMMdd")

# Load the customers JSON file whose name carries the batch date.
customers_df = spark.read.format("json").load(
    f"/Volumes/zoom_car/default/data/customers/zoom_car_customers_{date_str}.json"
)
customers_df.printSchema()
display(customers_df)

In [0]:
# Data cleaning
# Rows with a null in any of these columns are dropped.
critical_fields = ['customer_id', 'name', 'email']
# Only these exact status values are kept; the match is case-sensitive and
# rows with a null status are filtered out as well.
valid_status = ['active', 'inactive']
# Pragmatic email shape check: a local part of letters, digits and . _ % + -,
# then "@", one or more dot-separated domain labels (letters, digits, hyphens),
# and an alphabetic top-level domain of at least two characters.
# Deliberately broader than a letters-only pattern so that addresses such as
# "john.doe@example.com" or "user@mail.co.uk" are not discarded.
email_regex = r'^[A-Za-z0-9._%+-]+@(?:[A-Za-z0-9-]+\.)+[A-Za-z]{2,}$'

customers_cleaned = (
    customers_df
    .dropna(subset=critical_fields)
    .filter(col("status").isin(valid_status))
    .filter(col("email").rlike(email_regex))
)

# Derived columns:
#   phone_digits    - phone_number with every non-digit character removed.
#   customer_tenure - number of days from signup_date to the batch reference date.
#                     NOTE(review): assumes signup_date is a date or an ISO
#                     yyyy-MM-dd string that Spark can cast to a date - confirm.
customers_transformed = (
    customers_cleaned
    .withColumn("phone_digits", regexp_replace("phone_number", r"\D", ""))
    .withColumn("customer_tenure", date_diff(ref_date, col("signup_date")))
)

# Replace the staging table on every run; overwriteSchema lets the table schema
# change along with the data.
(
    customers_transformed.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("zoom_car.default.staging_customers_delta")
)
print("Table created")
display(customers_transformed)