### Test Silver layer

In [2]:
import unittest
import logging
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

# Configure logging: emit records at INFO level and above, each formatted as
# "<timestamp>: <level name>: <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s: %(levelname)s: %(message)s')

class SparkSilverLayerTests(unittest.TestCase):
    """Validate the Silver layer CSV tables: column layout and non-emptiness.

    One Spark session is created for the whole class in ``setUpClass`` and
    stopped in ``tearDownClass``. The tables to check, together with the
    schema each one is expected to have, are declared in ``expected_schemas``.
    """

    @classmethod
    def setUpClass(cls):
        """Create the shared Spark session and declare the expected table schemas."""
        # Initialize Spark Session for Testing
        # NOTE(review): the spark-excel package is not used by the CSV test in
        # this class — confirm whether it is still required.
        cls.spark = SparkSession.builder \
            .appName("Data Pipeline Testing - Silver Layer") \
            .config("spark.jars.packages", "com.crealytics:spark-excel_2.12:0.13.1") \
            .getOrCreate()
        logging.info("Spark session for Silver layer testing initialized.")

        # Base directory where CSV files are located
        cls.base_dir = "/Users/mansurcan/Documents/ITC/Lloyds/Silver/"

        # Define the expected schemas for tables (keyed by CSV file stem)
        cls.expected_schemas = {
            "locations_cleaned": StructType([
                StructField("LocationId", StringType(), True),
                StructField("OrganisationID", StringType(), True),
                StructField("LocationPostCode_clean", StringType(), True)
            ]),
            "rural_lookup_full_CL_cleaned": StructType([
                StructField("pcds_clean", StringType(), True),
                StructField("ladnm", StringType(), True)
            ]),
            "lookup_full_cleaned": StructType([
                StructField("pcds_clean", StringType(), True),
                StructField("ru11ind", StringType(), True),
                StructField("lat", DoubleType(), True),
                StructField("long", DoubleType(), True)
            ])
        }

    def test_schema_and_data_validation(self):
        """Check every expected table for matching header columns and at least one row.

        The column check reads the CSV header *without* imposing a schema and
        compares the column names (in order) with the expected field names.
        Comparing ``df.schema`` of a DataFrame that was read with
        ``.schema(expected_schema)`` would always succeed, because Spark simply
        reports the schema it was handed.

        Each table runs in its own ``subTest`` so that a failure in one table
        does not hide the results for the remaining tables.
        """
        logging.info("Starting schema and data validation tests for Silver layer")
        # base_dir already ends with a separator; strip it to avoid "//" in paths.
        base_dir = self.base_dir.rstrip("/")
        for table_name, expected_schema in self.expected_schemas.items():
            with self.subTest(table=table_name):
                path = f"{base_dir}/{table_name}.csv"

                # Validate schema: column names come from the file's own header row.
                header_df = self.spark.read.csv(path, header=True)
                expected_columns = [field.name for field in expected_schema.fields]
                self.assertEqual(
                    header_df.columns,
                    expected_columns,
                    f"Schema for {table_name} does not match the expected schema",
                )
                logging.info(f"Schema validation for {table_name} passed.")

                # Validate data is not empty (read with the expected types applied).
                df = self.spark.read \
                    .schema(expected_schema) \
                    .csv(path, header=True)
                self.assertGreater(df.count(), 0, f"DataFrame for {table_name} is empty")
                logging.info(f"Data loading for {table_name} passed.")

    @classmethod
    def tearDownClass(cls):
        """Stop the Spark session created in ``setUpClass``."""
        cls.spark.stop()
        logging.info("Spark session stopped.")

# Define a main function to run the tests
def run_tests():
    """Load all tests from SparkSilverLayerTests and run them with verbose output."""
    loader = unittest.TestLoader()
    silver_suite = loader.loadTestsFromTestCase(SparkSilverLayerTests)
    unittest.TextTestRunner(verbosity=2).run(silver_suite)

# Execute the tests immediately when this code runs; run_tests() builds the
# suite and runs it with a text test runner.
run_tests()


24/03/14 10:33:24 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/03/14 10:33:24 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
24/03/14 10:33:24 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
24/03/14 10:33:24 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.
24/03/14 10:33:24 WARN Utils: Service 'SparkUI' could not bind on port 4044. Attempting port 4045.
24/03/14 10:33:24 WARN Utils: Service 'SparkUI' could not bind on port 4045. Attempting port 4046.
24/03/14 10:33:24 WARN Utils: Service 'SparkUI' could not bind on port 4046. Attempting port 4047.
24/03/14 10:33:24 WARN Utils: Service 'SparkUI' could not bind on port 4047. Attempting port 4048.
2024-03-14 10:33:25,792: INFO: Spark session for Silver layer testing initialized.
test_schema_and_data_validation (__main__.SparkSilverLayerTests.test_schema_and_data_validation) ... 2024-03-14 10:33:25,793: