In [0]:
from pyspark.sql import SparkSession, functions as F
# Reuse the active Spark session if there is one; otherwise create a session
# for the application named "Option_2".
spark = (
    SparkSession.builder
    .appName("Option_2")
    .getOrCreate()
)

In [0]:
# Path of the source CSV file that the Bronze cell loads.
file_path = "/Volumes/workspace/default/my_datas/sample-chocolate-sales-data-all.csv"

In [0]:
# Bronze table: the raw CSV rows stored unchanged as a Delta table.
# Run this cell only after new data has been added to the CSV, because it
# replaces the entire `delta_bronze` table.
bronze_reader = spark.read.format("csv").option("header", "true")
csv_df = bronze_reader.load(file_path)

bronze_writer = csv_df.write.format("delta").mode("overwrite")
bronze_writer.saveAsTable("delta_bronze")

# Number of rows read from the CSV file.
print(csv_df.count())

In [0]:
# Silver table

# STEP 1 — Read the Bronze table
delta_temp_silver = spark.read.table("delta_bronze")

# Checkpoint: this number should equal the row count printed by the Bronze cell.
# A stricter (currently disabled) variant of this check is that
# delta_temp_silver.subtract(spark.read.table("delta_bronze")).count() equals 0.
bronze_row_count = delta_temp_silver.count()
print(bronze_row_count)

In [0]:
# STEP 2 — Explicitly cast ONLY 3 columns
# Every column not listed here keeps the type it has in the Bronze table.
# NOTE(review): if ANSI mode is enabled on this cluster, malformed values may
# raise an error here instead of becoming NULL — confirm before relying on the
# NULL checks in the validation step.
cast_expressions = {
    "Shipdate": F.to_date("Shipdate", "dd-MMM-yy"),
    "Amount": F.col("Amount").cast("double"),
    "Boxes": F.col("Boxes").cast("integer"),
}
delta_temp_silver_cast = delta_temp_silver.withColumns(cast_expressions)

# Checkpoint: only the 3 cast columns changed type.
# DataFrame.dtypes is a list of (column name, type name) pairs taken from the
# schema, so comparing it before and after the cast does not scan the data.
types_before_cast = dict(delta_temp_silver.dtypes)
changed_columns = sorted(
    name
    for name, type_name in delta_temp_silver_cast.dtypes
    if types_before_cast.get(name) != type_name
)
print(changed_columns)
# True when the changed columns are exactly the ones cast above.
print(changed_columns == sorted(cast_expressions))

# Checkpoint: row count matches the Bronze table.
# No separate count is needed: withColumns only replaces column values, it
# never adds or removes rows.

In [0]:
# STEP 3 — Define validation rules (same as Option 1)
# These are unevaluated Column expressions, not columns of a DataFrame:
# nothing is computed and no rows are filtered until a later step attaches
# them to a DataFrame.

# Shipdate is invalid when it is NULL.
invalid_shipdate = F.col("Shipdate").isNull()

# Amount is invalid when it is NULL, NaN, or not strictly positive.
# NaN needs its own test because Spark orders NaN above every other number,
# so `Amount <= 0` alone would let it through.
invalid_amount = (
    F.col("Amount").isNull()
    | F.isnan("Amount")
    | (F.col("Amount") <= 0)
)

# Boxes is invalid when it is NULL or negative.
# There is no NaN test here: Boxes is cast to integer in STEP 2, and an
# integer value can never be NaN.
invalid_boxes = F.col("Boxes").isNull() | (F.col("Boxes") < 0)

# A record is invalid as soon as any single rule flags it.
invalid_list = invalid_amount | invalid_boxes | invalid_shipdate

# Checkpoint
# The rules are boolean expressions & no rows have been dropped.


In [0]:
# STEP 4 — Derive overall record validity
# is_valid_record is the negation of the combined invalid-record rule.
delta_temp_silver_is_record = delta_temp_silver_cast.withColumn(
    "is_valid_record",
    ~invalid_list,
)

print(delta_temp_silver_is_record.count())

# Checkpoint: the new column is present in the DataFrame.
column_exist = "is_valid_record"
print(
    "Column exists"
    if column_exist in delta_temp_silver_is_record.columns
    else "Column does not exist"
)
# Checkpoint: TRUE/FALSE values look reasonable.
# Author's observation: there appear to be no FALSE (invalid) records.

In [0]:
# STEP 5 — Split data (NO WRITE YET)
# All three counts come from one aggregation, so the DataFrame is evaluated
# once instead of once per count.
# F.when(...) without otherwise() yields NULL for rows that do not match, and
# F.count skips NULLs, so each expression counts only its own rows.
record_counts = delta_temp_silver_is_record.agg(
    F.count(F.when(F.col("is_valid_record"), 1)).alias("valid"),
    F.count(F.when(~F.col("is_valid_record"), 1)).alias("invalid"),
    F.count(F.lit(1)).alias("total"),
).first()

valid_records = record_counts["valid"]
print(valid_records)
invalid_records = record_counts["invalid"]
print(invalid_records)

# Checkpoint
# valid_count + invalid_count = total_count
# A row whose is_valid_record is NULL lands in neither bucket, so this prints
# False if any such row exists.
print(record_counts["total"] == valid_records + invalid_records)

In [0]:
# STEP 6 — Write Silver tables (first time only)
is_valid = F.col("is_valid_record")

# Records flagged valid go to the Silver table; records flagged invalid go to
# the quarantine table.
delta_silver_valid = delta_temp_silver_is_record.filter(is_valid)
delta_silver_invalid = delta_temp_silver_is_record.filter(~is_valid)

# Write the valid records first, then the invalid ones, each replacing any
# existing table of the same name.
for records, table_name in (
    (delta_silver_valid, "delta_silver_valid"),
    (delta_silver_invalid, "delta_quarantine"),
):
    records.write.format("delta").mode("overwrite").saveAsTable(table_name)



In [0]:
# Checkpoint: both tables can be read back and their row counts match expectations.
valid_check, invalid_check = (
    spark.read.table(table_name)
    for table_name in ("delta_silver_valid", "delta_quarantine")
)

# Row counts first (valid, then quarantine) ...
for table_df in (valid_check, invalid_check):
    print(table_df.count())

# ... then a 5-row preview of each table in the same order.
for table_df in (valid_check, invalid_check):
    table_df.show(5)

In [0]:
%sql
-- List the tables in the current schema to confirm the tables written above exist.
SHOW TABLES;