In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (col, sum as sparkSum)

In [2]:
# Obtain the Spark session used by every cell below; getOrCreate() reuses an
# already-running session if there is one.
session_builder = SparkSession.builder.appName("fact_mapping")
spark = session_builder.getOrCreate()

In [3]:
# Source-of-truth fact table, read straight from the CSV extract. The first
# line is treated as the header and column types are inferred from the data.
df_fact_original = spark.read.csv(
    "source/Fact_Transactions.csv",
    header=True,
    inferSchema=True,
)

In [4]:
# Fact table as stored in parquet under spark-warehouse; this is the data the
# checks below compare against the CSV source.
df_fact_transformed = (
    spark.read
    .format("parquet")
    .load("spark-warehouse/fact_transactions")
)

#### Testing for total amount integrity

In [5]:
def _amount_total(frame):
    """Return the sum of the ``Amount`` column of ``frame`` as a Python value.

    A global aggregation yields exactly one row with one column; that single
    cell is unpacked and returned.
    """
    (summary_row,) = frame.agg(sparkSum("Amount")).collect()
    return summary_row[0]


# Totals of Amount in the CSV source and in the parquet table, respectively.
original_sum = _amount_total(df_fact_original)
new_sum = _amount_total(df_fact_transformed)

In [6]:
# Report whether the Amount total is the same in both tables; on a mismatch
# both totals are printed for inspection.
# NOTE(review): this is an exact equality test. If Amount is a floating-point
# column the two sums could differ in the last digits purely because of
# summation order - confirm the column type.
if original_sum == new_sum:
    print("Amount Integrity Maintained")
else:
    print("Sum of Amount should remain unchanged after adding MappingID")
    print(original_sum)
    print(new_sum)

Amount Integrity Maintained


#### Testing whether the fact columns (Amount, TransactionID) match

In [10]:
# Project both tables down to the columns under test, sorted by TransactionID.
fact_columns = ["Amount", "TransactionID"]
original_facts = df_fact_original.select(*fact_columns).orderBy("TransactionID")
transformed_facts = df_fact_transformed.select(*fact_columns).orderBy("TransactionID")

In [11]:
# Compare the two (Amount, TransactionID) projections as multisets in BOTH
# directions. exceptAll is asymmetric: A.exceptAll(B) only returns rows of A
# that have no counterpart in B, so a single call cannot see rows that exist
# only in the transformed table (for example duplicated transactions).
differences = original_facts.exceptAll(transformed_facts)
extra_in_transformed = transformed_facts.exceptAll(original_facts)

# Total number of unmatched rows across both directions.
diff_count = differences.count() + extra_in_transformed.count()
if diff_count > 0:
    print("Error: All Column Values dont match")
    print("Rows only in the original data:")
    differences.show()
    print("Rows only in the transformed data:")
    extra_in_transformed.show()
else:
    print("Columns match after transformations.")

Columns match after transformations.
