# In [0]:
import unittest
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

class TestSalesDataProcessing(unittest.TestCase):
    """Unit tests for sales-data feature engineering, run on a local Spark session.

    A single Spark session and a small sample DataFrame with the columns
    ``time``, ``province`` and ``code`` are created once for the whole class
    and shared by the tests.
    """

    @classmethod
    def setUpClass(cls):
        """Start a two-core local Spark session and build the shared sample DataFrame."""
        cls.spark = SparkSession.builder.master("local[2]").appName("SalesTest").getOrCreate()
        # Small stand-in for the real sales DataFrame; rows are (time, province, code).
        data = [
            (11100, 1, 2),
            (11323, 2, 3),
            (11415, 1, 4),
        ]
        columns = ["time", "province", "code"]
        cls.df = cls.spark.createDataFrame(data, columns)

    @classmethod
    def tearDownClass(cls):
        """Stop the Spark session created in ``setUpClass``."""
        cls.spark.stop()

    def test_province_encoding(self):
        """Each ``province<N>`` indicator column is 1 exactly when ``province == N``, else 0."""
        # Replace this inline encoding with the production logic once it is importable.
        df = self.df.withColumn("province1", (col("province") == 1).cast("int")) \
                    .withColumn("province2", (col("province") == 2).cast("int"))
        results = df.select("province", "province1", "province2").collect()
        for row in results:
            self.assertEqual(row["province1"], 1 if row["province"] == 1 else 0)
            self.assertEqual(row["province2"], 1 if row["province"] == 2 else 0)

    def test_hour_extraction_and_cyclic_features(self):
        """``hour`` is ``time // 100`` and ``hoursin``/``hourcos`` are its sin/cos on a 24-unit cycle."""
        # math.sin/math.cos only accept real numbers and raise TypeError on a
        # Spark Column, so the column expressions must use the Spark SQL
        # functions; ``math`` is used only to compute the expected values.
        import math
        from pyspark.sql import functions as F

        HOURS_IN_DAY = 24

        # NOTE(review): with the sample data, time / 100 yields 111, 113 and 114,
        # which are not valid hours of a day — confirm the encoding of ``time``
        # (e.g. HHMM vs HHMMSS) against the production logic.
        df = self.df.withColumn("hour", (col("time") / 100).cast("int"))
        df = df.withColumn("hoursin", F.sin(2 * math.pi * col("hour") / HOURS_IN_DAY)) \
               .withColumn("hourcos", F.cos(2 * math.pi * col("hour") / HOURS_IN_DAY))

        result = df.select("time", "hour", "hoursin", "hourcos").collect()
        for row in result:
            expected_hour = row["time"] // 100
            self.assertEqual(row["hour"], expected_hour)

            self.assertIsInstance(row["hoursin"], float)
            self.assertIsInstance(row["hourcos"], float)

            # Check the actual values, not just their type.
            angle = 2 * math.pi * expected_hour / HOURS_IN_DAY
            self.assertAlmostEqual(row["hoursin"], math.sin(angle), places=9)
            self.assertAlmostEqual(row["hourcos"], math.cos(angle), places=9)

    def test_cyclical_day_encoding(self):
        """Placeholder for the cyclical day-encoding test."""
        # Report the test as skipped rather than silently passing, so the
        # missing coverage stays visible in the test report.
        self.skipTest("cyclical day encoding test is not implemented yet")

    def test_null_imputation(self):
        """``fillna`` replaces nulls with the configured per-column defaults and keeps other values."""
        data_with_nulls = [(None, 1.0), (2.0, None)]
        df_null = self.spark.createDataFrame(data_with_nulls, ["type_indexed", "size_value"])
        fill_values = {"type_indexed": 9999.0, "size_value": 0.0}
        df_filled = df_null.fillna(fill_values)

        filled_rows = [(row["type_indexed"], row["size_value"]) for row in df_filled.collect()]
        for type_indexed, size_value in filled_rows:
            self.assertIsNotNone(type_indexed)
            self.assertIsNotNone(size_value)
        # Order-insensitive comparison: nulls take the fill value, existing
        # values are left untouched.
        self.assertCountEqual(
            filled_rows,
            [(fill_values["type_indexed"], 1.0), (2.0, fill_values["size_value"])],
        )

# Run the tests when this file is executed directly.
# A fixed argv is passed so unittest does not parse the host process's own
# command-line arguments (the first element is only the program name), and
# exit=False stops unittest.main() from calling sys.exit() when the run ends —
# presumably so this can be executed from a notebook cell; confirm.
if __name__ == "__main__":
    unittest.main(argv=['first-arg-is-ignored'], exit=False)
