In [0]:
from pyspark.sql.functions import col, lit, current_timestamp, sum as _sum
from delta.tables import DeltaTable
from pydeequ.checks import Check, CheckLevel
from pydeequ.verification import VerificationSuite, VerificationResult
import os

In [0]:
# Log the Spark version this run sees (taken from the SPARK_VERSION environment variable).
print(os.environ["SPARK_VERSION"])

# The run date is supplied by the job through the 'current_date' Databricks widget.
date_str = dbutils.widgets.get("current_date")

# Location of the customer extract for that run date.
# File naming convention: zoom_car_customer_<yyyymmdd>.json
customer_data = f"dbfs:/FileStore/zoomcar/customer/zoom_car_customer_{date_str}.json"

# Fail fast if the extract cannot be listed: any error from dbutils.fs.ls is
# re-raised as FileNotFoundError, with the original exception kept as its cause.
try:
    dbutils.fs.ls(customer_data)
except Exception as ls_error:
    raise FileNotFoundError(f"File {customer_data} not found!") from ls_error


3.5.0


In [0]:
# Read the customer extract (JSON) located at `customer_data`.
# - multiLine=true lets a single JSON record span several lines of the file.
# - The CSV-only options (header / inferSchema / quote) were dropped: the JSON
#   source does not use them and infers the schema from the data on its own.
customer_df = (
    spark.read
    .format("json")
    .option("multiLine", "true")
    .load(customer_data)
)

# Show the inferred schema and a preview of the raw records.
customer_df.printSchema()
display(customer_df)


root
 |-- customer_id: string (nullable = true)
 |-- email: string (nullable = true)
 |-- name: string (nullable = true)
 |-- phone_number: string (nullable = true)
 |-- signup_date: string (nullable = true)
 |-- status: string (nullable = true)



customer_id,email,name,phone_number,signup_date,status
C001,apayne@example.net,Glenn Jones,618-241-0939x81956,2024-03-05,active
C002,bridgesjennifer@example.net,Michele Munoz,285-336-1207x9689,2024-02-15,inactive
C003,romerogordon@example.net,Valerie Huff,(670)616-8784x73073,2024-07-05,inactive
C004,kiarathompson@example.org,Monica Gill,001-407-657-1726x534,2024-10-09,inactive
C005,matthewjohnson@example.com,Zachary Duncan,851-526-0178x89130,2024-07-17,active
C006,johnwatson@example.org,Jodi Warren,001-785-217-8761x97316,2024-06-09,active
C007,jameslee@example.net,Sarah Rasmussen,(925)374-1133,2024-11-08,active
C008,stevenfoster@example.com,Scott Bryan,001-582-200-7910x726,2024-06-29,inactive
C009,christopher01@example.org,Beth Pham,(322)406-2179,2024-05-15,inactive
C010,xbradley@example.org,Madeline Good,5866919138,2024-03-07,active


In [0]:
from pyspark.sql.functions import to_date  # Function to convert a column to date format
from pyspark.sql.functions import col, to_timestamp  # col: Access DataFrame columns, to_timestamp: Convert to timestamp
from pyspark.sql.functions import when, lit  # when: Conditional expressions, lit: Create literal column values

# Status values a customer record may carry; rows with any other status are removed.
allowed_statuses = ["active", "inactive"]

# Pattern an email address has to match in order to be kept unchanged.
email_regex = r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"

# Emails that match the pattern pass through; every other value (including a
# null email, which does not satisfy the condition) becomes the literal "NA".
email_is_valid = col("email").rlike(email_regex)
cleaned_email = when(email_is_valid, col("email")).otherwise(lit("NA"))

# Cleaning pipeline, applied in this order:
#   1. replace invalid emails with "NA"
#   2. keep only rows whose status is in `allowed_statuses`
#   3. parse 'signup_date' (yyyy-MM-dd) into a date column
#   4. drop rows with a null 'customer_id', 'name' or 'email'
#      (step 1 already maps null emails to "NA", so they are not dropped here)
customer_df = (
    customer_df
    .withColumn("email", cleaned_email)
    .filter(col("status").isin(allowed_statuses))
    .withColumn("signup_date", to_date(col("signup_date"), "yyyy-MM-dd"))
    .na.drop(subset=["customer_id", "name", "email"])
)

# Preview the cleaned records.
display(customer_df)


customer_id,email,name,phone_number,signup_date,status
C001,apayne@example.net,Glenn Jones,618-241-0939x81956,2024-03-05,active
C002,bridgesjennifer@example.net,Michele Munoz,285-336-1207x9689,2024-02-15,inactive
C003,romerogordon@example.net,Valerie Huff,(670)616-8784x73073,2024-07-05,inactive
C004,kiarathompson@example.org,Monica Gill,001-407-657-1726x534,2024-10-09,inactive
C005,matthewjohnson@example.com,Zachary Duncan,851-526-0178x89130,2024-07-17,active
C006,johnwatson@example.org,Jodi Warren,001-785-217-8761x97316,2024-06-09,active
C007,jameslee@example.net,Sarah Rasmussen,(925)374-1133,2024-11-08,active
C008,stevenfoster@example.com,Scott Bryan,001-582-200-7910x726,2024-06-29,inactive
C009,christopher01@example.org,Beth Pham,(322)406-2179,2024-05-15,inactive
C010,xbradley@example.org,Madeline Good,5866919138,2024-03-07,active


In [0]:
# Data-quality gate for the cleaned customer data, built with PyDeequ.
# Check, CheckLevel, VerificationSuite and VerificationResult come from the
# notebook's first import cell.
#
# The chained calls are wrapped in parentheses instead of using backslash
# continuations: a comment placed after a trailing backslash is a SyntaxError,
# whereas comments inside parentheses are fine.
check_customer = (
    Check(spark, CheckLevel.Error, "Customer Data Check")  # constraints are reported at Error level
    .hasSize(lambda x: x > 0)  # the DataFrame must contain at least one row
    .isComplete("customer_id", hint="Customer ID is missing")  # no nulls in 'customer_id'
    .isComplete("name", hint="Customer name is missing")  # no nulls in 'name'
    .isComplete("email", hint="Email address is missing")  # no nulls in 'email'
    .isComplete("phone_number", hint="Phone number is missing")  # no nulls in 'phone_number'
    .isComplete("signup_date", hint="Signup date is missing")  # no nulls in 'signup_date'
    .isContainedIn("status", allowed_statuses, hint="Status value is invalid")  # only allowed statuses
)

# Run the check against the cleaned DataFrame.
customer_dq_check = (
    VerificationSuite(spark)
    .onData(customer_df)
    .addCheck(check_customer)
    .run()
)

# Render the per-constraint results as a DataFrame for inspection.
customer_dq_check_df = VerificationResult.checkResultsAsDataFrame(spark, customer_dq_check)
display(customer_dq_check_df)

# Stop the notebook when the overall verification status is anything but "Success".
if customer_dq_check.status != "Success":
    raise ValueError("Data Quality Checks Failed for Customer Data")




check,check_level,check_status,constraint,constraint_status,constraint_message
Customer Data Check,Error,Success,SizeConstraint(Size(None)),Success,
Customer Data Check,Error,Success,"CompletenessConstraint(Completeness(customer_id,None,None))",Success,
Customer Data Check,Error,Success,"CompletenessConstraint(Completeness(name,None,None))",Success,
Customer Data Check,Error,Success,"CompletenessConstraint(Completeness(email,None,None))",Success,
Customer Data Check,Error,Success,"CompletenessConstraint(Completeness(phone_number,None,None))",Success,
Customer Data Check,Error,Success,"CompletenessConstraint(Completeness(signup_date,None,None))",Success,
Customer Data Check,Error,Success,"ComplianceConstraint(Compliance(status contained in active,inactive,`status` IS NULL OR `status` IN ('active','inactive'),None,List(status),None))",Success,


In [0]:
from pyspark.sql.functions import col, lit, to_date, datediff

# Run date parsed from the 'current_date' widget value (format yyyyMMdd).
run_date = to_date(lit(date_str), "yyyyMMdd")
# Signup date of each customer (format yyyy-MM-dd).
signup_date = to_date(col("signup_date"), "yyyy-MM-dd")

# Rename 'status' to 'customer_status' and add 'days_difference':
# the number of days from the signup date to the run date.
customer_df = (
    customer_df
    .withColumnRenamed("status", "customer_status")
    .withColumn("days_difference", datediff(run_date, signup_date))
)

In [0]:
# Preview the current state of customer_df before it is written to the staging table.
display(customer_df)

customer_id,email,name,phone_number,signup_date,customer_status,days_difference
C001,apayne@example.net,Glenn Jones,618-241-0939x81956,2024-03-05,active,263
C002,bridgesjennifer@example.net,Michele Munoz,285-336-1207x9689,2024-02-15,inactive,282
C003,romerogordon@example.net,Valerie Huff,(670)616-8784x73073,2024-07-05,inactive,141
C004,kiarathompson@example.org,Monica Gill,001-407-657-1726x534,2024-10-09,inactive,45
C005,matthewjohnson@example.com,Zachary Duncan,851-526-0178x89130,2024-07-17,active,129
C006,johnwatson@example.org,Jodi Warren,001-785-217-8761x97316,2024-06-09,active,167
C007,jameslee@example.net,Sarah Rasmussen,(925)374-1133,2024-11-08,active,15
C008,stevenfoster@example.com,Scott Bryan,001-582-200-7910x726,2024-06-29,inactive,147
C009,christopher01@example.org,Beth Pham,(322)406-2179,2024-05-15,inactive,192
C010,xbradley@example.org,Madeline Good,5866919138,2024-03-07,active,261


In [0]:
%sql
-- Create the 'zoom' schema if it does not exist yet.
-- NOTE(review): the name is unqualified, so it resolves against the session's current
-- catalog, while the staging table is written to databricks2.zoom — confirm that
-- databricks2 is the current catalog when this cell runs.
CREATE DATABASE IF NOT EXISTS zoom

In [0]:
# Fully qualified name (catalog.schema.table) of the Delta staging table that
# receives the cleaned customer data. The variable name is kept unchanged so
# any other cell that references it keeps working.
booking_table_name = "databricks2.zoom.zoom_staging_customer_delta"

# Write in append mode: saveAsTable creates the table when it does not exist
# and appends to it otherwise, so no separate existence check is required.
# This also removes the dependency on the private spark.catalog._jcatalog handle.
# NOTE(review): running the notebook twice for the same date appends the same
# rows again — confirm whether the staging table should be de-duplicated.
customer_df.write.format("delta").mode("append").saveAsTable(booking_table_name)


In [0]:
%sql
-- Commented-out cleanup statement; as written, this cell executes nothing.
-- NOTE(review): the statement below targets the `default` schema, while the write cell
-- uses databricks2.zoom.zoom_staging_customer_delta — confirm which table is meant
-- before uncommenting.
--DROP TABLE IF EXISTS databricks2.default.zoom_staging_customer_delta