In [0]:
from pyspark.sql.functions import coalesce, col, current_timestamp, lit, regexp_extract, when

# Source tables for the gold build. Each DataFrame gets a short alias ("u", "e")
# so that later cells can reference columns as "u.<col>" / "e.<col>".
UNIFIED_TABLE = "silver.unified.unified_companies"
ENRICHED_TABLE = "gold.final.llm_enriched_companies"

unified = spark.read.table(UNIFIED_TABLE).alias("u")
enriched = spark.read.table(ENRICHED_TABLE).alias("e")

In [0]:
# Build the gold projection from the unified companies joined to their LLM
# enrichment. The left join keeps every row of `unified`; columns read from "e"
# are NULL where no enrichment row matched on uen.

# regexp_extract returns "" (not NULL) when the pattern finds no match, so wrap
# it in when(...) to turn "no alphabetic run in the symbol" into NULL, in line
# with how every other missing attribute is represented in this projection.
stock_code = regexp_extract(col("u.stock_symbol"), r"([A-Za-z]+)", 1)

final_gold = (
    unified.join(enriched, col("u.uen") == col("e.uen"), "left")
    .select(
        col("u.uen"),
        col("u.company_name"),
        col("u.website"),
        # Prefer the LLM-normalized industry, fall back to the unified value.
        coalesce(col("e.llm_normalized_industry"), col("u.industry")).alias("industry"),
        col("u.hq_country"),
        col("u.no_of_locations_in_singapore"),
        col("u.linkedin"),
        col("u.facebook"),
        col("u.instagram"),
        col("u.is_it_delisted"),
        col("u.revenue"),
        col("u.founding_year"),
        # Enrichment-only attributes: already NULL when there is no match, so no
        # coalesce(..., lit(None)) wrapper is needed.
        col("e.company_size").alias("company_size"),
        col("e.products_offered").alias("products_offered"),
        col("e.services_offered").alias("services_offered"),
        # Prefer enriched keywords, fall back to the unified ones.
        coalesce(col("e.keywords"), col("u.keywords")).alias("keywords"),
        # Placeholder for a future field.
        # NOTE(review): an untyped lit(None) yields a NullType column; consider
        # .cast(<target type>) once the intended type is confirmed.
        lit(None).alias("number_of_employees"),
        # First run of ASCII letters in the stock symbol; NULL when the symbol is
        # NULL or contains no letters.
        when(stock_code != "", stock_code).alias("stock_exchange_code"),
        current_timestamp().alias("created_at"),
        current_timestamp().alias("updated_at"),
    )
)

In [0]:
# Interactive preview of the joined projection before it is merged.
# The created_at / updated_at values shown here come from this DataFrame's own
# current_timestamp() columns; the merge cell assigns its timestamps with
# separate current_timestamp() expressions instead of reading these columns.
display(final_gold)

In [0]:
# Storage path of the Delta table that the merge cell opens with
# DeltaTable.forPath. The abfss URI points at container "gold" on storage
# account "singaporecomadls".
final_gold_path = "abfss://gold@singaporecomadls.dfs.core.windows.net/final_gold_master"

In [0]:
from delta.tables import DeltaTable

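# To allow schema evolution during the merge, uncomment the line below
# (the Delta setting lives under the `spark.databricks.delta` prefix):
# spark.conf.set("spark.databricks.delta.schema.autoMerge.enabled", "true")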

# Upsert `final_gold` into the Delta table at `final_gold_path`, keyed on uen:
# matched rows get their attributes and updated_at refreshed (created_at is
# left untouched); unmatched rows are inserted with both timestamps set.
delta_target = DeltaTable.forPath(spark, final_gold_path)

# Attributes copied verbatim from the source row. The merge key (uen) and the
# audit timestamps are handled separately below, so both clauses are derived
# from this single list and cannot drift apart.
payload_columns = [
    "company_name",
    "website",
    "industry",
    "hq_country",
    "no_of_locations_in_singapore",
    "linkedin",
    "facebook",
    "instagram",
    "is_it_delisted",
    "revenue",
    "founding_year",
    "company_size",
    "products_offered",
    "services_offered",
    "keywords",
    "number_of_employees",
    "stock_exchange_code",
]
copied_from_source = {name: f"s.{name}" for name in payload_columns}

update_assignments = {
    **copied_from_source,
    "updated_at": "current_timestamp()",
}
insert_assignments = {
    "uen": "s.uen",
    **copied_from_source,
    "created_at": "current_timestamp()",
    "updated_at": "current_timestamp()",
}

# Make the source safe to merge on uen:
#  * NULL = NULL is never true in the merge condition, so rows with a NULL uen
#    would fall into the insert branch again on every run.
#  * Several source rows for one uen make the merge fail on matched keys and
#    insert duplicates on unmatched keys. dropDuplicates keeps one row per uen;
#    which one survives is arbitrary, so fix the upstream tables if duplicates
#    are not expected.
merge_source = (
    final_gold
    .filter(col("uen").isNotNull())
    .dropDuplicates(["uen"])
)

(
    delta_target.alias("t")
    .merge(merge_source.alias("s"), "t.uen = s.uen")
    .whenMatchedUpdate(set=update_assignments)
    .whenNotMatchedInsert(values=insert_assignments)
    .execute()
)


In [0]:
%sql
-- Inspect the master companies table.
-- NOTE(review): the merge writes to the Delta path held in final_gold_path
-- (".../final_gold_master"); confirm gold.final.master_companies is registered
-- at that location, otherwise this query does not reflect the merge result.
SELECT * FROM gold.final.master_companies