In [30]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (col, when)
from notebookUtils import readCsvAsDataframe

In [3]:
# Reuse the active Spark session if one exists; otherwise start one named "factsales2".
spark = (
    SparkSession.builder
    .appName("factsales2")
    .getOrCreate()
)

In [11]:
# Source data for the validation checks below, read from the raw CSV.
# NOTE(review): readCsvAsDataframe is a project helper; its header/schema
# handling is not visible in this notebook -- confirm before relying on dtypes.
fact_sales_original = readCsvAsDataframe(
    spark,
    "fact_sales_data_v2.csv",
)

In [12]:
# Fact table under validation, read back from its parquet location.
fact_sales_transformed = (
    spark.read
    .parquet("spark-warehouse/fact_sales")
)

In [13]:
# Row-count check: the parquet fact table should hold exactly as many rows
# as the source CSV.
transformed_row_count = fact_sales_transformed.count()
original_row_count = fact_sales_original.count()
print(
    "Data not lost in transformation"
    if transformed_row_count == original_row_count
    else "Some data lost in transformation"
)

Data not lost in transformation


In [20]:
# Unit-price check: a fact row is invalid when UnitPrice is missing or negative.
unit_price = col("UnitPrice")
has_invalid_unit_price = unit_price.isNull() | (unit_price < 0)
fact_sales_invalid_unit_price = (
    fact_sales_transformed
    .where(has_invalid_unit_price)
    .count()
)
# Only failures are reported; a clean table prints nothing.
if fact_sales_invalid_unit_price > 0:
    print("Some Rows have invalid unit price")

In [24]:
# Dimension-key check: collect fact rows where any of ProductKey, StoreKey or
# EmployeeKey is null.
# (The previous chained assignment `nullDf = df = ...` also bound an unused
# global `df`; only `nullDf` is kept.)
nullDf = fact_sales_transformed.filter(
    col("ProductKey").isNull()
    | col("StoreKey").isNull()
    | col("EmployeeKey").isNull()
)
# Only failures are reported; a clean table prints nothing.
if nullDf.count() > 0:
    print("Dimensions not properly configured in transformed fact table")

In [31]:
# Fact-column check: the measures in the transformed table must equal the
# source measures after the same cleaning is applied to the source
# (numeric nulls -> 0, negative UnitPrice -> 0).
fact_columns = ["UnitsSold", "UnitPrice", "Discount", "SaleDate"]

original_facts = (
    fact_sales_original
    .fillna(0)
    .withColumn(
        "UnitPrice",
        when(col("UnitPrice") < 0, 0).otherwise(col("UnitPrice")),
    )
    .select(*fact_columns)
)
transformed_facts = fact_sales_transformed.select(*fact_columns)

# exceptAll is a multiset difference: row order is irrelevant (so no orderBy
# is needed) and it only reports rows of the left side that are missing on the
# right. Run it in both directions so rows that exist only in the transformed
# table are caught as well.
differences = original_facts.exceptAll(transformed_facts)
unexpected_rows = transformed_facts.exceptAll(original_facts)

diff_count = differences.count() + unexpected_rows.count()
if diff_count > 0:
    print("Error: All Column Values dont match")
    # Source rows with no counterpart in the transformed table.
    differences.show()
    # Transformed rows with no counterpart in the source.
    unexpected_rows.show()
else:
    print("Columns match after transformations.")

Columns match after transformations.
