### Test Bronze layer

In [1]:
import logging
import os
import unittest

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType

# Root logger: emit INFO and above, each record prefixed with timestamp and level
logging.basicConfig(
    format='%(asctime)s: %(levelname)s: %(message)s',
    level=logging.INFO,
)

class SparkDataTests(unittest.TestCase):
    """Smoke tests for the raw input files of the Bronze layer.

    One Spark session is shared by every test in the class: it is created in
    ``setUpClass`` and stopped in ``tearDownClass``. The files are looked up
    in a ``Bronze`` directory next to the current working directory's parent.
    """

    # Data source name of the spark-excel reader used for ``.xlsx`` files.
    EXCEL_FORMAT = "com.crealytics.spark.excel"

    @classmethod
    def setUpClass(cls):
        """Create the shared Spark session and resolve the Bronze file locations."""
        # Initialize Spark Session (spark-excel is pulled in for the .xlsx file)
        cls.spark = SparkSession.builder \
            .appName("Data Pipeline Testing - Bronze Layer") \
            .config("spark.jars.packages", "com.crealytics:spark-excel_2.12:0.13.1") \
            .getOrCreate()
        logging.info("Spark session initialized for test class.")

        # Directories and file names setup (requires the module-level `import os`)
        cls.current_dir = os.getcwd()
        cls.lloyds_dir = os.path.join(cls.current_dir, '..')
        cls.bronze_dir = os.path.join(cls.lloyds_dir, 'Bronze')
        cls.files = ["locations.xlsx", "NSP21CL_AUG23_UK_LU.csv", "NSPL21_AUG_2023_UK.csv"]
        logging.info(f"Bronze directory set to: {cls.bronze_dir}, with files: {cls.files}")

    def _read_bronze_file(self, file_path):
        """Load one Bronze file into a DataFrame, picking the reader by extension.

        Args:
            file_path: Path of a ``.csv`` or ``.xlsx`` file.

        Returns:
            The DataFrame produced by the matching Spark reader.

        Raises:
            ValueError: If the extension is neither ``.csv`` nor ``.xlsx``.
        """
        if file_path.endswith('.csv'):
            return self.spark.read.csv(file_path, header=True)
        if file_path.endswith('.xlsx'):
            return (
                self.spark.read.format(self.EXCEL_FORMAT)
                .option("header", "true")
                .option("inferSchema", "true")
                .load(file_path)
            )
        # Fail loudly instead of leaving the caller with an unbound or stale
        # DataFrame from a previous file.
        raise ValueError(f"Unsupported Bronze file type: {file_path}")

    def test_read_bronze_files(self):
        """Every configured Bronze file can be read and holds at least one row."""
        logging.info("Starting test: test_read_bronze_files")
        for file_name in self.files:
            # subTest keeps checking the remaining files after one of them fails.
            with self.subTest(file=file_name):
                file_path = os.path.join(self.bronze_dir, file_name)
                logging.info(f"Processing file: {file_name}")
                df = self._read_bronze_file(file_path)

                self.assertIsNotNone(df, f"DataFrame for {file_name} should not be None")
                record_count = df.count()
                self.assertGreater(record_count, 0, f"DataFrame for {file_name} should not be empty")
                logging.info(f"File {file_name} passed with {record_count} records.")

    def test_schema_validation(self):
        """``locations.xlsx`` exposes the expected columns and loads with the expected schema."""
        logging.info("Starting test: test_schema_validation")
        expected_schema = StructType([
            StructField("LocationId", StringType(), True),
            StructField("OrganisationID", StringType(), True),
            StructField("LocationPostCode", StringType(), True),
        ])
        locations_path = os.path.join(self.bronze_dir, "locations.xlsx")

        # Read the workbook header as stored, without imposing a schema, so the
        # column names actually come from the file.
        df_raw = self.spark.read \
            .format(self.EXCEL_FORMAT) \
            .option("header", "true") \
            .load(locations_path)
        missing = [name for name in expected_schema.fieldNames() if name not in df_raw.columns]
        self.assertEqual(missing, [], f"locations.xlsx is missing expected columns: {missing}")

        # NOTE(review): with an explicit .schema(...) the reader is expected to
        # hand back the supplied schema, so this equality check alone cannot
        # catch a mismatched file; the header check above covers that case.
        df_test = self.spark.read \
            .format(self.EXCEL_FORMAT) \
            .option("header", "true") \
            .schema(expected_schema) \
            .load(locations_path)

        self.assertEqual(df_test.schema, expected_schema, "Schema does not match the expected schema")
        logging.info("Schema validation passed.")

    @classmethod
    def tearDownClass(cls):
        """Stop the Spark session shared by the tests."""
        cls.spark.stop()
        logging.info("Spark session stopped.")

# Define a main function to run the tests with enhanced output
def run_tests():
    """Load every test of ``SparkDataTests`` and run it with verbose output."""
    # Collect the test methods of the Bronze-layer test case into one suite
    bronze_suite = unittest.TestLoader().loadTestsFromTestCase(SparkDataTests)
    # verbosity=2 prints one line per test method with its outcome
    unittest.TextTestRunner(verbosity=2).run(bronze_suite)

# Execute the tests as soon as this code runs (module-level call, no __main__ guard)
run_tests()


24/03/14 10:31:39 WARN Utils: Your hostname, Mansurs-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.1.24 instead (on interface en0)
24/03/14 10:31:39 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/usr/local/Cellar/apache-spark/3.5.1/libexec/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/mansurcan/.ivy2/cache
The jars for the packages stored in: /Users/mansurcan/.ivy2/jars
com.crealytics#spark-excel_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-9c141e14-b00f-4a64-854d-f9cf6be37f5d;1.0
	confs: [default]
	found com.crealytics#spark-excel_2.12;0.13.1 in central
	found org.apache.poi#poi;4.1.0 in central
	found commons-codec#commons-codec;1.12 in central
	found org.apache.commons#commons-collections4;4.3 in central
	found org.apache.commons#commons-math3;3.6.1 in central
	found org.apache.poi#poi-ooxml;4.1.0 in central
	found org.apache.poi#poi-ooxml-schemas;4.1.0 in central
	found org.apache.xmlbeans#xmlbeans;3.1.0 in central
	found com.github.virtuald#curvesapi;1.06 in central
	found com.norbitltd#spoiwo_2.12;1.6.0 in central
	found org.scala-lang.modules#scala-xml_2.12;1.2.0 in central
	found joda-time#joda-time;2.9.9 in central
	found org.joda#joda-convert;2.0.1 in central
	found com.monito