# In [0]:
import unittest
from pyspark.ml.feature import StringIndexer, VectorAssembler

class TestPromotionNotebook(unittest.TestCase):
    """Unit tests for the preprocessing steps applied to the promotion data.

    The promotion DataFrame is loaded once per class from Parquet, and each
    test exercises one step: null filling, string indexing, and vector
    assembly.
    """

    # Columns whose nulls are replaced with FILL_VALUE before indexing.
    NULLABLE_COLUMNS = ("feature", "display")
    FILL_VALUE = "NONE"

    @classmethod
    def setUpClass(cls):
        """Load the promotion DataFrame once and share it across all tests."""
        # Path to the Parquet data (Azure Blob storage, per the original note).
        parquet_path = "/mnt/silver/promotion/"

        # NOTE(review): `spark` is neither defined nor imported in this file;
        # presumably it is the SparkSession provided by the notebook runtime
        # -- confirm before running this module outside a notebook.
        cls.df = spark.read.parquet(parquet_path)

    def _filled_df(self):
        """Return the shared DataFrame with nulls in NULLABLE_COLUMNS filled."""
        return self.df.fillna(
            self.FILL_VALUE, subset=list(self.NULLABLE_COLUMNS)
        )

    @staticmethod
    def _fit_indexer(df, input_col, output_col):
        """Fit a StringIndexer for *input_col* on *df* and return the model."""
        indexer = StringIndexer(
            inputCol=input_col,
            outputCol=output_col,
            handleInvalid="keep",
        )
        return indexer.fit(df)

    def test_null_fill(self):
        """Nulls in ``feature`` and ``display`` are replaced by "NONE"."""
        df = self._filled_df()
        for name in self.NULLABLE_COLUMNS:
            with self.subTest(column=name):
                # Count on the cluster rather than collecting the whole
                # column to the driver just to test membership.
                null_rows = df.filter(df[name].isNull()).count()
                self.assertEqual(
                    null_rows, 0, f"{name!r} still contains null values"
                )

                filled_rows = df.filter(df[name] == self.FILL_VALUE).count()
                self.assertGreater(
                    filled_rows,
                    0,
                    f"{name!r} contains no {self.FILL_VALUE!r} values",
                )

    def test_string_indexer(self):
        """StringIndexer adds a numeric index column for each input column."""
        df = self._filled_df()

        # Both models are fitted on the same un-indexed frame, then applied
        # one after the other.
        model_feature = self._fit_indexer(df, "feature", "featureindexed")
        model_display = self._fit_indexer(df, "display", "displayindexed")
        df_indexed = model_display.transform(model_feature.transform(df))

        column_types = dict(df_indexed.dtypes)
        for output_col in ("featureindexed", "displayindexed"):
            with self.subTest(column=output_col):
                self.assertIn(output_col, df_indexed.columns)
                # StringIndexer emits its label indices as doubles.
                self.assertEqual(column_types[output_col], "double")

    def test_vector_assembler(self):
        """VectorAssembler packs ``displayindexed`` into the ``typevec`` column."""
        df = self._filled_df()
        model_display = self._fit_indexer(df, "display", "displayindexed")
        df_indexed = model_display.transform(df)

        assembler = VectorAssembler(
            inputCols=["displayindexed"],
            outputCol="typevec",
        )
        df_assembled = assembler.transform(df_indexed)

        self.assertIn("typevec", df_assembled.columns)

        row = df_assembled.select("typevec").head()
        # head() returns None on an empty DataFrame; fail with a clear message
        # instead of a TypeError from subscripting None.
        self.assertIsNotNone(row, "promotion DataFrame is empty")
        self.assertTrue(hasattr(row["typevec"], "toArray"))
        
# Entry point: run every test in this module when it is executed directly.
if __name__ == "__main__":
    # argv is replaced with a placeholder so unittest does not try to parse
    # the host process's own command-line arguments; exit=False stops
    # unittest from calling sys.exit() once the run finishes.
    unittest.main(
        argv=["first-arg-is-ignored"],
        exit=False,
    )