In [None]:
%py
# PySpark script
# Purpose: Mask last 4 digits of invoice numbers in d_product_revenue_clone table
# Author: Mariana Pineda
# Date: 2025-06-06
# Description: This script creates a clone of the d_product_revenue table in which the last 4 characters of the invoice_number column are replaced with asterisks. Null invoice numbers stay null, and values shorter than 4 characters are left unchanged.

# Import necessary PySpark functions
from pyspark.sql.functions import col, expr, when

# Function to mask invoice numbers
def mask_invoice_numbers(df):
    """
    Replace the trailing four characters of invoice_number with asterisks.

    Null values stay null, and values shorter than four characters are
    passed through unchanged. Every other value keeps its leading
    characters and has its final four replaced by '****'.

    Args:
        df (DataFrame): Input DataFrame; must expose an invoice_number column.

    Returns:
        DataFrame: A new DataFrame whose invoice_number column holds the masked values.
    """
    invoice = col('invoice_number')

    # Conditions are checked in this order: null first, then too-short values
    is_missing = invoice.isNull()
    is_too_short = expr("LENGTH(invoice_number) < 4")

    # Everything except the last four characters, followed by the mask
    masked_value = expr("CONCAT(SUBSTRING(invoice_number, 1, LENGTH(invoice_number) - 4), '****')")

    masking_rule = (
        when(is_missing, None)  # Nulls are kept as nulls
        .when(is_too_short, invoice)  # Short values are kept as they are
        .otherwise(masked_value)
    )
    return df.withColumn('invoice_number', masking_rule)

try:
    # Load the original (unmasked) table
    df_original = spark.table("d_product_revenue")

    # Mask the invoice numbers BEFORE anything is written. Writing the raw data
    # to the clone first and masking it afterwards would (a) leave unmasked
    # invoice numbers in the clone for a while (and, on versioned table formats,
    # possibly in its history), and (b) require overwriting a table that is also
    # the source of the DataFrame being written, which Spark rejects for
    # non-Delta tables.
    df_masked = mask_invoice_numbers(df_original)

    # Drop the clone table if it exists so it is rebuilt from scratch
    spark.sql("DROP TABLE IF EXISTS d_product_revenue_clone")

    # Create the clone directly from the masked data (single write)
    df_masked.write.mode("overwrite").saveAsTable("d_product_revenue_clone")

    # Spot-check the result: display a sample of the stored invoice numbers.
    # Note: this only shows rows for visual inspection; it asserts nothing.
    df_validation = spark.sql("""
        -- Purpose: Validate the masking of invoice numbers
        WITH MaskedData AS (
            SELECT invoice_number
            FROM d_product_revenue_clone
        )
        SELECT *
        FROM MaskedData
    """)

    # Show the results of the validation query
    df_validation.show()

except Exception as e:
    # Report the failure, then re-raise so the notebook/job run is marked as
    # failed instead of silently appearing to succeed.
    print(f"An error occurred: {e}")
    raise