In [0]:
from pyspark.sql.functions import regexp_extract, trim, col, when

# Backfill missing customer names in the raw invoice table using the invoice
# file name, then persist the result as a cleaned Delta table.

# Load the existing invoice table.
# NOTE(review): "raw_inovice_table" looks like a misspelling of "invoice";
# it is left unchanged because it must match the real table name in the
# catalog -- confirm before renaming.
raw_df = spark.table("databricks_catalog.invoice_schema.raw_inovice_table")

# Candidate customer name: capture group 1 of file names shaped like
# "invoice_<customer>_<digits>.pdf".
# regexp_extract yields "" (not NULL) when the pattern does not match, so the
# result has to be checked before it is used as a replacement value.
extracted_name = regexp_extract(
    col("file_name"),
    r"invoice_(.*?)_\d+\.pdf",
    1,
)

# A customer name counts as missing when it is NULL or blank after trimming.
name_is_missing = col("customer_name").isNull() | (
    trim(col("customer_name")) == ""
)

# Only backfill when the extraction actually produced a value. Without this
# guard a NULL customer_name would be overwritten with "" for file names that
# do not match the pattern, hiding the missing data from later NULL checks.
# When file_name is NULL the comparison evaluates to NULL, which `when`
# treats as not satisfied, so the original value is kept in that case too.
can_backfill = name_is_missing & (extracted_name != "")

# Using a column expression directly (instead of a temporary helper column
# that is later dropped) avoids clobbering and dropping any pre-existing
# column that happens to share the helper's name.
clean_df = raw_df.withColumn(
    "customer_name",
    when(can_backfill, extracted_name).otherwise(col("customer_name")),
)

# Save the cleaned data back to a Delta table, replacing any previous contents.
clean_df.write.format("delta").mode("overwrite").saveAsTable(
    "databricks_catalog.invoice_schema.customer_invoice_cleaned"
)

# Render the cleaned DataFrame in the notebook output.
display(clean_df)