In [None]:
%py
# PySpark script to generate comprehensive test data for purgo_playground.d_product_revenue_clone table

# from pyspark.sql import SparkSession
# from pyspark.sql.types import StructType, StructField, StringType, LongType, DoubleType, DateType, TimestampType
# from pyspark.sql import Row

# Imports are declared here because the import lines earlier in this cell are
# commented out; without them every pyspark type name below is undefined.
from datetime import date, datetime

from pyspark.sql.types import (
    DateType,
    DoubleType,
    LongType,
    StringType,
    StructField,
    StructType,
    TimestampType,
)

# Layout of the timestamp literals used in the test rows,
# e.g. "2024-01-20T10:00:00.000+0000".
TIMESTAMP_FORMAT = "%Y-%m-%dT%H:%M:%S.%f%z"

TARGET_TABLE = "purgo_playground.d_product_revenue_clone"


def _parse_date(value):
    """Convert an ISO ``YYYY-MM-DD`` string to ``datetime.date`` (None passes through)."""
    return None if value is None else date.fromisoformat(value)


def _parse_timestamp(value):
    """Convert a ``TIMESTAMP_FORMAT`` string to an aware ``datetime`` (None passes through)."""
    return None if value is None else datetime.strptime(value, TIMESTAMP_FORMAT)


def _coerce_rows(raw_rows, schema):
    """Return ``raw_rows`` with date/timestamp strings parsed per ``schema``.

    ``createDataFrame`` verifies Python types against an explicit schema and
    does not accept ``str`` for DateType/TimestampType columns, so the
    human-readable literals are converted to ``date``/``datetime`` objects.
    Columns of any other type are passed through unchanged.
    """
    converters = []
    for field in schema.fields:
        if isinstance(field.dataType, TimestampType):
            converters.append(_parse_timestamp)
        elif isinstance(field.dataType, DateType):
            converters.append(_parse_date)
        else:
            converters.append(None)

    coerced = []
    for raw_row in raw_rows:
        if len(raw_row) != len(converters):
            raise ValueError(
                f"Row {raw_row!r} has {len(raw_row)} values; "
                f"schema expects {len(converters)}"
            )
        coerced.append(
            tuple(
                value if convert is None else convert(value)
                for convert, value in zip(converters, raw_row)
            )
        )
    return coerced


# Define the schema for the d_product_revenue_clone table
schema = StructType([
    StructField("product_id", LongType(), True),
    StructField("product_name", StringType(), True),
    StructField("product_type", StringType(), True),
    StructField("revenue", LongType(), True),
    StructField("country", StringType(), True),
    StructField("customer_id", StringType(), True),
    StructField("purchased_date", DateType(), True),
    StructField("invoice_date", DateType(), True),
    StructField("invoice_number", StringType(), True),
    StructField("is_returned", LongType(), True),
    StructField("customer_satisfaction_score", LongType(), True),
    StructField("product_details", StringType(), True),
    StructField("customer_first_purchased_date", DateType(), True),
    StructField("customer_first_product", StringType(), True),
    StructField("customer_first_revenue", DoubleType(), True),
    StructField("timestamp", TimestampType(), True),
])

# Test data covering various scenarios. Values are positional and follow the
# field order of `schema`; dates and timestamps are written as strings for
# readability and parsed by _coerce_rows before the DataFrame is built.
raw_test_data = [
    # Happy path
    (1, "ProductA", "Type1", 1000, "USA", "C001", "2024-01-15", "2024-01-20", "1234234534", 0, 5, "DetailA", "2023-12-01", "ProductX", 500.0, "2024-01-20T10:00:00.000+0000"),
    (2, "ProductB", "Type2", 2000, "Canada", "C002", "2024-02-10", "2024-02-15", "9876543210", 1, 4, "DetailB", "2023-11-05", "ProductY", 1500.0, "2024-02-15T11:30:00.000+0000"),

    # Edge cases: invoice_number with exactly four digits
    (3, "ProductC", "Type3", 1500, "UK", "C003", "2024-03-05", "2024-03-10", "1000", 0, 3, "DetailC", "2023-10-20", "ProductZ", 750.0, "2024-03-10T09:45:00.000+0000"),

    # Edge cases: invoice_number with less than four digits
    (4, "ProductD", "Type4", 2500, "Germany", "C004", "2024-04-12", "2024-04-18", "123", 1, 2, "DetailD", "2023-09-15", "ProductW", 2000.0, "2024-04-18T14:20:00.000+0000"),

    # Error cases: invoice_number with non-numeric characters
    (5, "ProductE", "Type5", 3000, "France", "C005", "2024-05-22", "2024-05-25", "ABCDEF1234", 0, 1, "DetailE", "2023-08-10", "ProductV", 2500.0, "2024-05-25T16:00:00.000+0000"),

    # NULL handling: some nullable fields are null
    (6, "ProductF", "Type6", None, "Spain", "C006", None, "2024-06-30", "555566667777", 0, 5, None, "2023-07-05", "ProductU", 3000.0, "2024-06-30T12:15:00.000+0000"),

    # Special characters in string fields
    (7, "Prod🛒ctG", "Typ€7", 4000, "Italy", "C007", "2024-07-14", "2024-07-20", "999988887777", 1, 4, "Detail@G", "2023-06-25", "ProductT", 3500.0, "2024-07-20T08:50:00.000+0000"),

    # Multi-byte characters in product_name
    (8, "产品H", "类型8", 5000, "China", "C008", "2024-08-19", "2024-08-25", "112233445566", 0, 3, "DetailH", "2023-05-30", "ProductS", 4000.0, "2024-08-25T13:35:00.000+0000"),

    # Maximum bigint value for revenue
    (9, "ProductI", "Type9", 9223372036854775807, "Japan", "C009", "2024-09-10", "2024-09-15", "777788889999", 0, 5, "DetailI", "2023-04-20", "ProductR", 4500.0, "2024-09-15T17:25:00.000+0000"),

    # Negative revenue value
    (10, "ProductJ", "Type10", -500, "Australia", "C010", "2024-10-05", "2024-10-10", "333344445555", 1, 2, "DetailJ", "2023-03-15", "ProductQ", -1000.0, "2024-10-10T19:40:00.000+0000"),

    # NULL invoice_number
    (11, "ProductK", "Type11", 6000, "Brazil", "C011", "2024-11-18", "2024-11-22", None, 0, 4, "DetailK", "2023-02-10", "ProductP", 5000.0, "2024-11-22T07:55:00.000+0000"),

    # Special characters in customer_id
    (12, "ProductL", "Type12", 7000, "Mexico", "C@012", "2024-12-25", "2024-12-30", "666677778888", 1, 1, "DetailL", "2023-01-05", "ProductO", 5500.0, "2024-12-30T21:10:00.000+0000"),

    # Very long product_name
    (13, "ProductNameThatIsExceptionallyLongToTestBoundaryConditionsAndEnsureProperHandling", "Type13", 8000, "India", "C013", "2025-01-10", "2025-01-15", "444455556666", 0, 5, "DetailM", "2022-12-25", "ProductN", 6000.0, "2025-01-15T05:30:00.000+0000"),

    # Revenue as zero
    (14, "ProductM", "Type14", 0, "Russia", "C014", "2025-02-20", "2025-02-25", "101010101010", 1, 3, "DetailM", "2022-11-20", "ProductM", 0.0, "2025-02-25T23:45:00.000+0000"),

    # High customer satisfaction score
    (15, "ProductN", "Type15", 9000, "South Africa", "C015", "2025-03-30", "2025-04-05", "121212121212", 0, 6, "DetailN", "2022-10-15", "ProductL", 6500.0, "2025-04-05T02:20:00.000+0000"),

    # Low customer satisfaction score
    (16, "ProductO", "Type16", 10000, "Nigeria", "C016", "2025-04-18", "2025-04-22", "131313131313", 1, 0, "DetailO", "2022-09-10", "ProductK", 7000.0, "2025-04-22T14:55:00.000+0000"),

    # Mixed valid and invalid invoice_number
    (17, "ProductP", "Type17", 11000, "Egypt", "C017", "2025-05-25", "2025-05-30", "ABC1234DEF", 0, 4, "DetailP", "2022-08-05", "ProductJ", 7500.0, "2025-05-30T18:35:00.000+0000"),

    # Invoice_number with special characters
    (18, "ProductQ", "Type18", 12000, "Kenya", "C018", "2025-06-12", "2025-06-17", "12#34$6789", 1, 2, "DetailQ", "2022-07-01", "ProductI", 8000.0, "2025-06-17T09:25:00.000+0000"),

    # Invoice_number as empty string
    (19, "ProductR", "Type19", 13000, "Argentina", "C019", "2025-07-07", "2025-07-12", "", 0, 5, "DetailR", "2022-06-20", "ProductH", 8500.0, "2025-07-12T11:15:00.000+0000"),

    # All fields NULL except product_id
    (20, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None),

    # Duplicate invoice_number (same value as product_id 1)
    (21, "ProductS", "Type20", 14000, "Thailand", "C020", "2025-08-19", "2025-08-24", "1234234534", 1, 3, "DetailS", "2022-05-15", "ProductG", 9000.0, "2025-08-24T16:05:00.000+0000"),

    # Invoice_number with spaces
    (22, "ProductT", "Type21", 15000, "Vietnam", "C021", "2025-09-23", "2025-09-28", "  4455667788  ", 0, 4, "DetailT", "2022-04-10", "ProductF", 9500.0, "2025-09-28T07:40:00.000+0000"),

    # Invoice_number with mixed case letters
    (23, "ProductU", "Type22", 16000, "Philippines", "C022", "2025-10-30", "2025-11-04", "AbC1234dEfG", 1, 2, "DetailU", "2022-03-05", "ProductE", 10000.0, "2025-11-04T13:55:00.000+0000"),

    # Invoice_number with Unicode characters
    (24, "ProductV", "Type23", 17000, "Singapore", "C023", "2025-11-15", "2025-11-20", "१२३४५६", 0, 5, "DetailV", "2022-02-01", "ProductD", 10500.0, "2025-11-20T20:30:00.000+0000"),

    # Ordinary row with a ten-digit invoice_number.
    # NOTE(review): this row was previously labelled "Revenue as decimal in
    # string", but revenue is a LongType column and the value is the plain
    # integer 18000 - confirm whether a separate decimal-string case is needed.
    (25, "ProductW", "Type24", 18000, "Netherlands", "C024", "2025-12-05", "2025-12-10", "6677889900", 1, 1, "DetailW", "2022-01-25", "ProductC", 11000.0, "2025-12-10T05:50:00.000+0000"),

    # Future purchase dates
    (26, "ProductX", "Type25", 19000, "Belgium", "C025", "2026-01-20", "2026-01-25", "7788990011", 0, 4, "DetailX", "2021-12-15", "ProductB", 11500.0, "2026-01-25T12:40:00.000+0000"),

    # Past purchase dates
    (27, "ProductY", "Type26", 20000, "Sweden", "C026", "2023-12-01", "2023-12-05", "8899001122", 1, 3, "DetailY", "2021-11-10", "ProductA", 12000.0, "2023-12-05T17:30:00.000+0000"),

    # Timestamp with different time zones (+0500 offset instead of +0000)
    (28, "ProductZ", "Type27", 21000, "Norway", "C027", "2026-02-14", "2026-02-18", "9900112233", 0, 2, "DetailZ", "2021-10-05", "Product9", 12500.0, "2026-02-18T23:59:59.999+0500"),

    # Invoice_number with maximum length
    (29, "ProductAA", "Type28", 22000, "Denmark", "C028", "2026-03-08", "2026-03-12", "12345678901234567890", 1, 1, "DetailAA", "2021-09-01", "Product8", 13000.0, "2026-03-12T03:15:00.000+0000"),

    # Invoice_number with minimum length (empty string)
    (30, "ProductAB", "Type29", 23000, "Finland", "C029", "2026-04-22", "2026-04-27", "", 0, 5, "DetailAB", "2021-08-25", "Product7", 13500.0, "2026-04-27T19:05:00.000+0000"),
]

try:
    test_data = _coerce_rows(raw_test_data, schema)

    # Create DataFrame with test data.
    # NOTE(review): `spark` is not defined in this cell; it is assumed to be
    # the session object supplied by the notebook runtime - confirm.
    df = spark.createDataFrame(test_data, schema)

    # Write the DataFrame to the purgo_playground.d_product_revenue_clone table
    df.write.mode("overwrite").saveAsTable(TARGET_TABLE)

except Exception as e:
    # Report the failure, then re-raise so the cell/job is marked as failed
    # instead of appearing to succeed with no table written.
    print(f"An error occurred during test data generation: {e}")
    raise

# spark.stop()