In [0]:
from pyspark.sql import SparkSession, functions as F
# Obtain the Spark session for this notebook (an existing session is reused if
# there is one), registered under the application name "Option_2".
spark = (
    SparkSession.builder
    .appName("Option_2")
    .getOrCreate()
)

In [0]:
# Path of the source CSV file that the bronze ingestion step loads.
file_path = "/Volumes/workspace/default/my_datas/sample-chocolate-sales-data-all.csv"

In [0]:
# Bronze table - raw data landed as-is from the CSV.
# Re-run this cell only after new data has been added to the source CSV.

# Read the CSV using its first line as the header; no schema inference is
# requested at this stage.
csv_df = spark.read.options(header="true").csv(file_path)

# Stamp every row with the time of this load.
csv_df_with_metadata = csv_df.withColumn(
    "ingestion_timestamp", F.current_timestamp()
)

# Overwrite the bronze Delta table with this load; mergeSchema is enabled on
# the write.
csv_df_with_metadata.write.saveAsTable(
    "delta_bronze",
    format="delta",
    mode="overwrite",
    mergeSchema="true",
)

In [0]:
%restart_python
# NOTE(review): restarting the Python process discards interpreter state, so
# names defined earlier in the notebook (e.g. `F`, `file_path`) are presumably
# no longer available afterwards - confirm that later cells re-create what
# they need.

In [0]:
# Bronze Table --> Silver Table

# The preceding %restart_python cell resets the Python process, which discards
# the `F` alias imported at the top of the notebook. Re-import it here so this
# cell also works on a clean top-to-bottom run.
# (`spark` is assumed to be re-provided by the Databricks runtime - confirm.)
from pyspark.sql import functions as F

# Read delta_bronze table
delta_temp_silver = spark.read.table("delta_bronze")

# Sanity check: count the rows present in delta_temp_silver but absent from
# delta_bronze.
# NOTE(review): both sides read the same table, so this difference is expected
# to be empty and the line below to print True; it is a set difference, not a
# row-count comparison.
checks_equal = delta_temp_silver.subtract(spark.read.table("delta_bronze")).count()
print(checks_equal == 0)

# Enforce the types of the critical columns:
#   Shipdate -> date, parsed with the pattern dd-MMM-yy
#   Amount   -> double
#   Boxes    -> integer
delta_temp_silver = delta_temp_silver.withColumns(
    {
        "Shipdate": F.to_date("Shipdate", "dd-MMM-yy"),
        "Amount": F.col("Amount").cast("double"),
        "Boxes": F.col("Boxes").cast("integer"),
    }
)

# Report how many rows have a NULL Shipdate after parsing (missing in bronze
# or, presumably, not matching the dd-MMM-yy pattern - confirm), then list them.
delta_temp_silver.select(
    F.sum(F.col("Shipdate").isNull().cast("int")).alias("null_shipdate_count")
).show()

delta_temp_silver.where("Shipdate is NULL").show()