# Config

In [None]:
%run <Fundraising_SalesforceNPSP_Config>

# Transformation

### Transform: Source

In [None]:
# Register this source system in the silver Source table.
# The merge is insert-only: if a row with the same SourceId already exists it
# is left untouched (there is no WHEN MATCHED branch).
merge_sql = f"""
MERGE INTO {silver_lakehouse_name}.Source AS target
USING (
    SELECT 
        '{source_id}' AS SourceId,
        '{source_name}' AS Name,
        current_timestamp() AS CreatedDate,
        current_timestamp() AS ModifiedDate
) AS source
ON target.SourceId = source.SourceId
WHEN NOT MATCHED THEN INSERT (
    SourceId, CreatedDate, ModifiedDate, Name
) VALUES (
    source.SourceId, source.CreatedDate, source.ModifiedDate, source.Name
)
"""

result = spark.sql(merge_sql)

# Indexing [0] on an empty result would raise IndexError and abort the notebook
# even though the MERGE itself succeeded, so only read the metrics row when the
# runtime actually returned one.
rows = result.collect()
row = rows[0] if rows else None

if row is not None:
    logging.info(f"✅ Rows processed {row['num_affected_rows']}")
else:
    logging.info("✅ Source merge completed (no merge metrics returned)")

### Transform: Country

In [None]:
from pyspark.sql.functions import col, current_timestamp, lit, udf
from pyspark.sql.types import StringType

def EnrichCountry(df):
    """Build Country rows from the distinct, non-null mailing countries in `df`.

    Args:
        df: DataFrame containing an ``npsp__MailingCountry__c`` column.

    Returns:
        DataFrame with columns CountryId, CreatedDate, ModifiedDate,
        CountryCode, Name, SourceId. CountryId is a freshly generated uuid,
        CountryCode is always NULL (string-typed), and both dates are the
        processing timestamp.
    """
    country_names = (
        df.select(col("npsp__MailingCountry__c").alias("Name"))
        .dropna(subset=["Name"])
        .distinct()
    )

    # Attach the generated key, audit timestamps and source tag in one projection.
    return country_names.select(
        expr("uuid()").alias("CountryId"),
        current_timestamp().alias("CreatedDate"),
        current_timestamp().alias("ModifiedDate"),
        lit(None).cast(StringType()).alias("CountryCode"),
        col("Name"),
        lit(source_id).alias("SourceId"),
    )

# Country lookup: fed from the mailing-country column of bronze Address and
# shaped by EnrichCountry before the merge below runs.
countryTable = CdfTable(
    # --- source ---
    source_lakehouse=bronze_lakehouse_name,
    source_table_name="Address",
    source_primary_key="npsp__MailingCountry__c",
    columns=["npsp__MailingCountry__c"],
    # --- target ---
    target_lakehouse=silver_lakehouse_name,
    target_table_name="Country",
    enrich_func=EnrichCountry,
    # Insert-only merge keyed on the country name; existing rows are never updated.
    merge_sql_template=f"""
    MERGE INTO {silver_lakehouse_name}.Country AS target
    USING latestSnapshot_Address AS source
    ON target.Name = source.Name
    WHEN NOT MATCHED THEN INSERT (
        CountryId, CreatedDate, ModifiedDate, CountryCode, Name, SourceId
    ) VALUES (
        source.CountryId, source.CreatedDate, source.ModifiedDate, source.CountryCode, source.Name, source.SourceId
    )
    """,
)

ProcessCdfTable(countryTable, source_name)

### Transform: CampaignType

In [None]:
def EnrichCampaignType(df):
    """Turn RecordType rows for the Campaign object into CampaignType rows.

    Args:
        df: DataFrame with Id, Name, CreatedDate, LastModifiedDate and
            SobjectType columns.

    Returns:
        DataFrame with columns CampaignTypeId, CreatedDate, ModifiedDate, Name,
        SourceId, SourceSystemId; one row per distinct source Id.
    """
    # Only record types that belong to Campaign and carry a name are relevant.
    campaign_record_types = (
        df.filter(col("SobjectType") == "Campaign")
        .dropna(subset=["Name"])
    )

    unique_types = campaign_record_types.select(
        col("Name"),
        col("CreatedDate"),
        col("LastModifiedDate").alias("ModifiedDate"),
        col("Id").alias("SourceSystemId"),
    ).dropDuplicates(["SourceSystemId"])

    # The surrogate key is generated after de-duplication so each kept row gets one id.
    return unique_types.select(
        expr("uuid()").alias("CampaignTypeId"),
        "CreatedDate",
        "ModifiedDate",
        "Name",
        lit(source_id).alias("SourceId"),
        "SourceSystemId",
    )

# CampaignType: derived from bronze RecordType via EnrichCampaignType.
campaignTypeTable = CdfTable(
    # --- source ---
    source_lakehouse=bronze_lakehouse_name,
    source_table_name="RecordType",
    source_primary_key="Id",
    columns=["Id", "Name", "CreatedDate", "LastModifiedDate", "SobjectType"],
    # --- target ---
    target_lakehouse=silver_lakehouse_name,
    target_table_name="CampaignType",
    enrich_func=EnrichCampaignType,
    # Upsert keyed on the source record id: existing rows get their name and
    # modified date refreshed, unseen ids are inserted.
    merge_sql_template=f"""
    MERGE INTO {silver_lakehouse_name}.CampaignType AS target
    USING latestSnapshot_RecordType AS source
    ON target.SourceSystemId = source.SourceSystemId
    WHEN MATCHED THEN UPDATE SET
        target.ModifiedDate = source.ModifiedDate,
        target.Name = source.Name

    WHEN NOT MATCHED THEN INSERT (
        CampaignTypeId, CreatedDate, ModifiedDate, Name, SourceId, SourceSystemId
    ) VALUES (
        source.CampaignTypeId, source.CreatedDate, source.ModifiedDate,
        source.Name, source.SourceId, source.SourceSystemId
    )
    """,
)

ProcessCdfTable(campaignTypeTable, source_name)

### Transform: Channel

In [None]:
from pyspark.sql.functions import col, current_timestamp, udf
from pyspark.sql.types import StringType

def EnrichChannel(df):
    """Build Channel rows from the distinct, non-null ``Type`` values in `df`.

    Args:
        df: DataFrame containing a ``Type`` column.

    Returns:
        DataFrame with columns ChannelId, CreatedDate, ModifiedDate, Name,
        SourceId. ChannelId is a generated uuid and both dates are the
        processing timestamp.
    """
    channel_names = (
        df.select(col("Type").alias("Name"))
        .distinct()
        .dropna(subset=["Name"])
    )

    return channel_names.select(
        expr("uuid()").alias("ChannelId"),
        current_timestamp().alias("CreatedDate"),
        current_timestamp().alias("ModifiedDate"),
        col("Name"),
        lit(source_id).alias("SourceId"),
    )

# Channel lookup: one row per distinct Campaign.Type value (see EnrichChannel).
channelTable = CdfTable(
    # --- source ---
    source_lakehouse=bronze_lakehouse_name,
    source_table_name="Campaign",
    source_primary_key="Id",
    columns=["Type", "Id"],
    # --- target ---
    target_lakehouse=silver_lakehouse_name,
    target_table_name="Channel",
    enrich_func=EnrichChannel,
    # Insert-only merge keyed on the channel name.
    merge_sql_template=f"""
    MERGE INTO {silver_lakehouse_name}.Channel AS target
    USING latestSnapshot_Campaign AS source
    ON target.Name = source.Name
    WHEN NOT MATCHED THEN INSERT (
        ChannelId, CreatedDate, ModifiedDate, Name, SourceId
    ) VALUES (
        source.ChannelId, source.CreatedDate, source.ModifiedDate, source.Name, source.SourceId
    )
    """,
)

ProcessCdfTable(channelTable, source_name)

### Transform: ConstituentType

In [None]:
from pyspark.sql.functions import col, current_timestamp, lit
from pyspark.sql import Row

def EnrichConstituentType(df_account):
    """Build ConstituentType rows from account types, plus 'Individual' when contacts exist.

    Args:
        df_account: DataFrame containing a ``Type`` column.

    Returns:
        DataFrame with columns ConstituentTypeId, CreatedDate, ModifiedDate,
        Name, SourceId. A synthetic 'Individual' row is included only when the
        bronze Contact table has at least one row.
    """
    type_names = (
        df_account.select(col("Type").alias("Name"))
        .distinct()
        .dropna(subset=["Name"])
    )

    # Contacts have no Type column of their own; they are represented by a
    # synthetic 'Individual' type, added only if there is something to classify.
    bronze_contacts = get_bronze_table("Contact")
    has_contacts = not bronze_contacts.limit(1).isEmpty()
    if has_contacts:
        individual_row = spark.createDataFrame([Row(Name="Individual")])
        type_names = type_names.unionByName(individual_row).dropDuplicates(["Name"])

    return type_names.select(
        expr("uuid()").alias("ConstituentTypeId"),
        current_timestamp().alias("CreatedDate"),
        current_timestamp().alias("ModifiedDate"),
        col("Name"),
        lit(source_id).alias("SourceId"),
    )

# ConstituentType lookup: distinct Account.Type values (plus 'Individual',
# added by EnrichConstituentType when contacts exist).
constituentTypeTable = CdfTable(
    # --- source ---
    source_lakehouse=bronze_lakehouse_name,
    source_table_name="Account",
    source_primary_key="Type",
    columns=["Type"],
    # --- target ---
    target_lakehouse=silver_lakehouse_name,
    target_table_name="ConstituentType",
    enrich_func=EnrichConstituentType,
    # Insert-only merge keyed on the type name.
    merge_sql_template=f"""
    MERGE INTO {silver_lakehouse_name}.ConstituentType AS target
    USING latestSnapshot_Account AS source
    ON target.Name = source.Name
    WHEN NOT MATCHED THEN INSERT (
        ConstituentTypeId, CreatedDate, ModifiedDate, Name, SourceId
    ) VALUES (
        source.ConstituentTypeId, source.CreatedDate, source.ModifiedDate, source.Name, source.SourceId
    )
    """,
)

ProcessCdfTable(constituentTypeTable, source_name)

### Transform: EventType

In [None]:
from pyspark.sql.functions import col, current_timestamp, lit, udf
from pyspark.sql.types import StringType

def EnrichEventType(df):
    """Build EventType rows from the distinct, non-null ``Subject`` values in `df`.

    Args:
        df: DataFrame containing a ``Subject`` column.

    Returns:
        DataFrame with columns EventTypeId, CreatedDate, ModifiedDate, Name,
        SourceId. EventTypeId is a generated uuid and both dates are the
        processing timestamp.
    """
    subjects = (
        df.select(col("Subject").alias("Name"))
        .distinct()
        .dropna(subset=["Name"])
    )

    return subjects.select(
        expr("uuid()").alias("EventTypeId"),
        current_timestamp().alias("CreatedDate"),
        current_timestamp().alias("ModifiedDate"),
        col("Name"),
        lit(source_id).alias("SourceId"),
    )

# EventType lookup: one row per distinct Event.Subject (see EnrichEventType).
eventTypeTable = CdfTable(
    # --- source ---
    source_lakehouse=bronze_lakehouse_name,
    source_table_name="Event",
    source_primary_key="Subject",
    columns=["Subject"],
    # --- target ---
    target_lakehouse=silver_lakehouse_name,
    target_table_name="EventType",
    enrich_func=EnrichEventType,
    # Insert-only merge keyed on (Name, SourceId), so the same subject coming
    # from another source system gets its own row.
    merge_sql_template=f"""
    MERGE INTO {silver_lakehouse_name}.EventType AS target
    USING latestSnapshot_Event AS source
    ON target.Name = source.Name and target.SourceId = source.SourceId
    WHEN NOT MATCHED THEN INSERT (
        EventTypeId, CreatedDate, ModifiedDate, Name, SourceId
    ) VALUES (
        source.EventTypeId, source.CreatedDate, source.ModifiedDate, source.Name, source.SourceId
    )
    """,
)

ProcessCdfTable(eventTypeTable, source_name)

### Transform: OpportunityType

In [None]:
from pyspark.sql.functions import col, current_timestamp, udf, lit
from pyspark.sql.types import StringType

def EnrichOpportunityType(df):
    """Build OpportunityType rows from the distinct, non-null ``Type`` values in `df`.

    Args:
        df: DataFrame containing a ``Type`` column.

    Returns:
        DataFrame with columns OpportunityTypeId, CreatedDate, ModifiedDate,
        Name, SourceId. OpportunityTypeId is a generated uuid and both dates
        are the processing timestamp.
    """
    type_names = (
        df.select(col("Type").alias("Name"))
        .distinct()
        .dropna(subset=["Name"])
    )

    return type_names.select(
        expr("uuid()").alias("OpportunityTypeId"),
        current_timestamp().alias("CreatedDate"),
        current_timestamp().alias("ModifiedDate"),
        col("Name"),
        lit(source_id).alias("SourceId"),
    )

# OpportunityType lookup: one row per distinct Opportunity.Type
# (see EnrichOpportunityType).
opportunityTypeTable = CdfTable(
    # --- source ---
    source_lakehouse=bronze_lakehouse_name,
    source_table_name="Opportunity",
    source_primary_key="Id",
    columns=["Type", "Id"],
    # --- target ---
    target_lakehouse=silver_lakehouse_name,
    target_table_name="OpportunityType",
    enrich_func=EnrichOpportunityType,
    # Insert-only merge keyed on the type name.
    merge_sql_template=f"""
    MERGE INTO {silver_lakehouse_name}.OpportunityType AS target
    USING latestSnapshot_Opportunity AS source
    ON target.Name = source.Name
    WHEN NOT MATCHED THEN INSERT (
        OpportunityTypeId, CreatedDate, ModifiedDate, Name, SourceId
    ) VALUES (
        source.OpportunityTypeId, source.CreatedDate, source.ModifiedDate, source.Name, source.SourceId
    )
    """,
)

ProcessCdfTable(opportunityTypeTable, source_name)

### Transform: Campaign

In [None]:
from pyspark.sql.functions import col, current_timestamp, lit
from pyspark.sql.types import StringType

def EnrichCampaign(df):
    """Shape Campaign rows for silver and resolve each one's CampaignTypeId.

    Args:
        df: DataFrame with Id, RecordTypeId, ActualCost, CreatedDate, EndDate,
            Name and StartDate columns.

    Returns:
        DataFrame with columns CampaignId, CampaignTypeId, ActualCost,
        CreatedDate, EndDate, ModifiedDate, Name, SourceId, SourceSystemId,
        StartDate. CampaignTypeId is NULL when the record type has no silver
        CampaignType row. CreatedDate and ModifiedDate are both replaced with
        the processing timestamp.
    """
    # Lookup from the source RecordType id to the silver surrogate key.
    type_lookup = get_silver_table("CampaignType").select(
        "CampaignTypeId",
        col("SourceSystemId").alias("CampaignTypeSourceSystemId"),
    )
    same_record_type = df["RecordTypeId"] == type_lookup["CampaignTypeSourceSystemId"]

    stamped = (
        df.join(type_lookup, same_record_type, how="left")
        .withColumn("SourceId", lit(source_id))
        # Both audit columns are overridden with the processing time.
        .withColumn("CreatedDate", current_timestamp())
        .withColumn("ModifiedDate", current_timestamp())
        .withColumn("CampaignId", expr("uuid()"))
    )

    return stamped.select(
        "CampaignId",
        "CampaignTypeId",
        "ActualCost",
        "CreatedDate",
        "EndDate",
        "ModifiedDate",
        "Name",
        "SourceId",
        col("Id").alias("SourceSystemId"),
        "StartDate",
    )

# Campaign: bronze Campaign rows shaped by EnrichCampaign and upserted into
# silver on the source record id. ActualCost from the source is stored in the
# silver Cost column.
campaignTable = CdfTable(
    source_table_name="Campaign",
    source_primary_key="Id",
    target_table_name="Campaign",
    columns=[
        "Id", "RecordTypeId", "ActualCost", "CreatedDate", "EndDate",
        "Name", "StartDate"
    ],
    # The MATCHED branch also refreshes CampaignTypeId. Without it, a campaign
    # whose record type changes in the source would keep pointing at its old
    # CampaignType forever. The other merges in this notebook already refresh
    # their foreign keys the same way (Address.CountryId, Account.AddressId).
    merge_sql_template=f"""
    MERGE INTO {silver_lakehouse_name}.Campaign AS target
    USING latestSnapshot_Campaign AS source
    ON target.SourceSystemId = source.SourceSystemId
    WHEN MATCHED THEN UPDATE SET
        target.ModifiedDate = source.ModifiedDate,
        target.CampaignTypeId = source.CampaignTypeId,
        target.Cost = source.ActualCost,
        target.EndDate = source.EndDate,
        target.Name = source.Name,
        target.StartDate = source.StartDate

    WHEN NOT MATCHED THEN INSERT (
        CampaignId, CampaignTypeId, Cost, CreatedDate, EndDate, ModifiedDate, Name, 
        SourceId, SourceSystemId, StartDate
    ) VALUES (
        source.CampaignId, source.CampaignTypeId, source.ActualCost, source.CreatedDate, source.EndDate, source.ModifiedDate, source.Name, 
        source.SourceId, source.SourceSystemId, source.StartDate
        )
    
    """,
    source_lakehouse=bronze_lakehouse_name,
    target_lakehouse=silver_lakehouse_name,
    enrich_func=EnrichCampaign,
    hard_delete=True
)

ProcessCdfTable(campaignTable, source_name)


### Transform: Address

In [None]:
def EnrichAddress(df_addr_src):
    """Shape bronze Address rows for silver and resolve their CountryId.

    Args:
        df_addr_src: DataFrame with Id, the four ``npsp__Mailing*__c`` columns,
            CreatedDate and LastModifiedDate.

    Returns:
        DataFrame with columns AddressId, City, CountryId, CreatedDate,
        ModifiedDate, SourceId, SourceSystemId, State, ZipCode; one row per
        (SourceId, SourceSystemId). CountryId is NULL when the mailing country
        has no matching silver Country name.
    """
    staged = df_addr_src.select(
        lit(None).cast("string").alias("AddressId"),
        col("npsp__MailingCity__c").alias("MailingCity"),
        col("npsp__MailingState__c").alias("MailingState"),
        col("npsp__MailingPostalCode__c").alias("MailingPostalCode"),
        col("npsp__MailingCountry__c").alias("MailingCountry"),
        "CreatedDate",
        "LastModifiedDate",
        col("Id").alias("SourceSystemId"),
        lit(source_id).alias("SourceId"),
    ).withColumn(
        # AddressId starts out NULL, so this assigns a new uuid to every row.
        "AddressId", expr("coalesce(AddressId, uuid())")
    )

    # Translate the country name into the silver Country key.
    countries = get_silver_table("Country").select("CountryId", "Name")
    same_country_name = staged["MailingCountry"] == countries["Name"]
    with_country = staged.join(countries, same_country_name, how="left")

    silver_columns = [
        "AddressId",
        col("MailingCity").alias("City"),
        "CountryId",
        "CreatedDate",
        col("LastModifiedDate").alias("ModifiedDate"),
        "SourceId",
        "SourceSystemId",
        col("MailingState").alias("State"),
        col("MailingPostalCode").alias("ZipCode"),
    ]
    return with_country.select(*silver_columns).dropDuplicates(["SourceId", "SourceSystemId"])

# Address: bronze Address rows shaped by EnrichAddress and upserted into silver.
addressTable = CdfTable(
    # --- source ---
    source_lakehouse=bronze_lakehouse_name,
    source_table_name="Address",
    source_primary_key="Id",
    columns=[
        "Id",
        "npsp__MailingCity__c",
        "npsp__MailingState__c",
        "npsp__MailingPostalCode__c",
        "npsp__MailingCountry__c",
        "CreatedDate",
        "LastModifiedDate",
    ],
    # --- target ---
    target_lakehouse=silver_lakehouse_name,
    target_table_name="Address",
    enrich_func=EnrichAddress,
    hard_delete=True,
    # Upsert keyed on (SourceId, SourceSystemId). AddressId and CreatedDate are
    # only written on insert.
    merge_sql_template=f"""
    MERGE INTO {silver_lakehouse_name}.Address AS tgt
    USING latestSnapshot_Address AS src
    ON  tgt.SourceId      = src.SourceId
    AND tgt.SourceSystemId = src.SourceSystemId
    WHEN MATCHED THEN UPDATE SET
         tgt.ModifiedDate = src.ModifiedDate,
         tgt.City         = src.City,
         tgt.State        = src.State,
         tgt.ZipCode      = src.ZipCode,
         tgt.CountryId    = src.CountryId
    WHEN NOT MATCHED THEN INSERT (
         AddressId, City, CountryId, CreatedDate, ModifiedDate,
         SourceId, SourceSystemId, State, ZipCode
    ) VALUES (
         src.AddressId, src.City, src.CountryId, src.CreatedDate,
         src.ModifiedDate, src.SourceId, src.SourceSystemId, src.State, src.ZipCode
    )
    """,
)

ProcessCdfTable(addressTable, source_name)

### Transform: OpportunityStage


In [None]:
from pyspark.sql.functions import col, current_timestamp, lit, udf
from pyspark.sql.types import StringType

def EnrichOpportunityStage(df):
    """Shape OpportunityStage rows for silver.

    Args:
        df: DataFrame with Id, MasterLabel, CreatedDate and LastModifiedDate.

    Returns:
        DataFrame with columns SourceSystemId, OpportunityStageId, CreatedDate,
        ModifiedDate, SourceId, Name. Rows without a MasterLabel are dropped.
    """
    labelled_stages = df.dropna(subset=["MasterLabel"])

    return labelled_stages.select(
        col("Id").alias("SourceSystemId"),
        expr("uuid()").alias("OpportunityStageId"),
        col("CreatedDate"),
        col("LastModifiedDate").alias("ModifiedDate"),
        lit(source_id).alias("SourceId"),
        col("MasterLabel").alias("Name"),
    )

# OpportunityStage: bronze OpportunityStage rows shaped by
# EnrichOpportunityStage and upserted into silver on the source record id.
opportunityStageTable = CdfTable(
    source_table_name="OpportunityStage",
    source_primary_key="Id",
    target_table_name="OpportunityStage",
    columns=["Id", "MasterLabel", "CreatedDate", "LastModifiedDate"],
    # The MATCHED branch refreshes Name as well as ModifiedDate. Updating only
    # ModifiedDate would leave a stale label in silver after a stage is renamed
    # in the source. This matches the CampaignType merge, which also updates Name.
    merge_sql_template=f"""
    MERGE INTO {silver_lakehouse_name}.OpportunityStage AS target
    USING latestSnapshot_OpportunityStage AS source
    ON target.SourceSystemId = source.SourceSystemId
    WHEN MATCHED THEN UPDATE SET
        target.ModifiedDate = source.ModifiedDate,
        target.Name = source.Name
    WHEN NOT MATCHED THEN INSERT (
        OpportunityStageId, Name, CreatedDate, ModifiedDate, SourceId, SourceSystemId
    ) VALUES (
        source.OpportunityStageId, source.Name, source.CreatedDate, source.ModifiedDate,
        source.SourceId, source.SourceSystemId
    )
    """,
    source_lakehouse=bronze_lakehouse_name,
    target_lakehouse=silver_lakehouse_name,
    enrich_func=EnrichOpportunityStage
)

ProcessCdfTable(opportunityStageTable, source_name)

### Transform: CampaignChannel

In [None]:
from pyspark.sql.functions import col, current_timestamp, lit, monotonically_increasing_id

def EnrichCampaignChannel(df):
    """Build Campaign-to-Channel link rows from bronze Campaign rows.

    Each campaign's ``Id`` is resolved to the silver CampaignId and its
    ``Type`` to the silver ChannelId (Channel.Name holds the Type value).

    Args:
        df: DataFrame with ``Id`` and ``Type`` columns.

    Returns:
        DataFrame with columns ChannelId, CampaignChannelId, CampaignId; one
        row per distinct (CampaignId, ChannelId) pair where both keys resolved.
    """
    channel_df = get_silver_table("Channel").select(col("Name").alias("ChannelName"), "ChannelId")
    silver_campaign_df = get_silver_table("Campaign").select("SourceSystemId", "CampaignId")

    link_keys = (
        df
        .join(
            silver_campaign_df,
            df["Id"] == silver_campaign_df["SourceSystemId"],
            how="left"
        )
        .join(
            channel_df,
            df["Type"] == channel_df["ChannelName"],
            how="left"
        )
        .select("ChannelId", "CampaignId")
        # A link row with a NULL key is meaningless, and it can never satisfy
        # the MERGE's equality match, so it would be deleted and re-inserted
        # with a brand-new uuid on every run. Drop unresolved pairs up front.
        .dropna(subset=["ChannelId", "CampaignId"])
        # The MERGE has no MATCHED branch, so duplicate source pairs would each
        # be inserted; keep exactly one row per pair.
        .dropDuplicates(["ChannelId", "CampaignId"])
    )

    # Generate the surrogate key after filtering so only kept rows get an id.
    new_df = link_keys.select(
        "ChannelId",
        expr("uuid()").alias("CampaignChannelId"),
        "CampaignId"
    )

    return new_df

# CampaignChannel link table: pairs each silver Campaign with the silver
# Channel named after its Type (see EnrichCampaignChannel).
campaignChannelTable = CdfTable(
    # --- source ---
    source_lakehouse=bronze_lakehouse_name,
    source_table_name="Campaign",
    source_primary_key="Id",
    columns=[
        "Id",
        "Type",  # Type is used to match Channel.Name
    ],
    # --- target ---
    target_lakehouse=silver_lakehouse_name,
    target_table_name="CampaignChannel",
    enrich_func=EnrichCampaignChannel,
    # Insert unseen (CampaignId, ChannelId) pairs and delete target rows that
    # the source no longer contains.
    # NOTE(review): the DELETE branch removes every target row absent from
    # latestSnapshot_Campaign. Confirm that view holds the full snapshot
    # rather than only the changed rows.
    merge_sql_template=f"""
    MERGE INTO {silver_lakehouse_name}.CampaignChannel AS target
    USING latestSnapshot_Campaign AS source
    ON target.CampaignId = source.CampaignId AND target.ChannelId = source.ChannelId
    WHEN NOT MATCHED THEN INSERT (
        CampaignChannelId, ChannelId, CampaignId
    ) VALUES (
        source.CampaignChannelId, source.ChannelId, source.CampaignId
    )
    WHEN NOT MATCHED BY SOURCE THEN DELETE
    """,
)

ProcessCdfTable(campaignChannelTable, source_name)
log_merge_metrics(f"{silver_lakehouse_name}.CampaignChannel", "CampaignChannel")

### Transform: EmailEngagement

In [None]:
def EnrichEmailEngagement(df):
    """Shape EmailMessage rows into EmailEngagement rows.

    Args:
        df: DataFrame with Id, CreatedDate, LastModifiedDate, MessageDate,
            Subject and RelatedToId columns.

    Returns:
        DataFrame with columns CampaignId, ChannelId, CreatedDate,
        EmailEngagementId, EmailId, ModifiedDate, SendDate, SourceId,
        SourceSystemId, Subject; one row per (SourceId, SourceSystemId).
        CampaignId is NULL when RelatedToId is not a known campaign, and
        ChannelId is NULL when silver Channel has no 'Email' row.
    """
    # Source campaign id -> silver CampaignId.
    campaigns = get_silver_table("Campaign").select(
        col("CampaignId"),
        col("SourceSystemId").alias("CampaignSourceSystemId"),
    )

    # At most one row: the id of the channel literally named 'Email'.
    email_channel = (
        get_silver_table("Channel")
        .filter(col("Name") == "Email")
        .select(col("ChannelId").alias("EmailChannelId"))
        .limit(1)
    )

    # EmailMessage.RelatedToId -> Campaign.SourceSystemId
    with_campaign = df.join(
        campaigns,
        df["RelatedToId"] == campaigns["CampaignSourceSystemId"],
        "left",
    )

    # Condition-less left join: attaches the single EmailChannelId to every row
    # and still keeps all rows when the 'Email' channel is missing.
    with_channel = with_campaign.join(email_channel, how="left")

    projected = with_channel.select(
        col("CampaignId"),
        col("EmailChannelId").alias("ChannelId"),
        col("CreatedDate"),
        expr("uuid()").alias("EmailEngagementId"),
        col("Id").alias("EmailId"),
        col("LastModifiedDate").alias("ModifiedDate"),
        col("MessageDate").alias("SendDate"),
        lit(source_id).alias("SourceId"),
        col("Id").alias("SourceSystemId"),
        col("Subject"),
    )
    return projected.dropDuplicates(["SourceId", "SourceSystemId"])

# EmailEngagement: bronze EmailMessage rows shaped by EnrichEmailEngagement and
# upserted into silver.
emailEngagementTable = CdfTable(
    # --- source ---
    source_lakehouse=bronze_lakehouse_name,
    source_table_name="EmailMessage",
    source_primary_key="Id",
    columns=[
        "Id",
        "CreatedDate",
        "LastModifiedDate",
        "MessageDate",
        "Subject",
        "RelatedToId",
    ],
    # --- target ---
    target_lakehouse=silver_lakehouse_name,
    target_table_name="EmailEngagement",
    enrich_func=EnrichEmailEngagement,
    hard_delete=True,
    # Upsert keyed on (SourceId, SourceSystemId). EmailEngagementId and
    # CreatedDate are only written on insert.
    merge_sql_template=f"""
    MERGE INTO {silver_lakehouse_name}.EmailEngagement AS target
    USING latestSnapshot_EmailMessage AS source
    ON target.SourceId = source.SourceId AND target.SourceSystemId = source.SourceSystemId
    WHEN MATCHED THEN UPDATE SET
        target.ModifiedDate = source.ModifiedDate,
        target.CampaignId = source.CampaignId,
        target.ChannelId = source.ChannelId,
        target.Subject = source.Subject,
        target.SendDate = source.SendDate,
        target.EmailId = source.EmailId
    WHEN NOT MATCHED THEN INSERT (
        CampaignId, ChannelId, CreatedDate, EmailEngagementId, EmailId, ModifiedDate,
        SendDate, SourceId, SourceSystemId, Subject
    ) VALUES (
        source.CampaignId, source.ChannelId, source.CreatedDate, source.EmailEngagementId, source.EmailId, source.ModifiedDate,
        source.SendDate, source.SourceId, source.SourceSystemId, source.Subject
    )
    """,
)

ProcessCdfTable(emailEngagementTable, source_name)

### Transform: Account

In [None]:
def EnrichAccount(df):
    """Shape bronze Account rows for silver and resolve each account's AddressId.

    The address is resolved in two ways and the first non-NULL result wins:

    1. Direct: Account.Id -> bronze Address.npsp__Household_Account__c ->
       silver Address.SourceSystemId.
    2. Fallback: equality on billing city, state, postal code and country name
       against silver Address (restricted to this source's SourceId).

    Args:
        df: DataFrame with Id, Name, CreatedDate, LastModifiedDate,
            npsp__Matching_Gift_Email__c and the four Billing* columns.

    Returns:
        DataFrame with columns AccountId, AddressId, CreatedDate, Email,
        ModifiedDate, Name, SourceId, SourceSystemId; one row per
        (SourceId, SourceSystemId).
    """
    # Bronze Address: for direct match (address id <-> household account id)
    df_addr_bronze = get_bronze_table("Address").select(
        col("Id").alias("AddressSourceSystemId_bronze"),
        col("npsp__Household_Account__c").alias("HouseholdAccountId_bronze")
    )

    # Silver Address: for translation to AddressId
    df_addr_silver = get_silver_table("Address").select(
        "AddressId", "SourceSystemId", "City", "State", "ZipCode", "CountryId", "SourceId"
    )

    # Silver Country for country name (for fallback matching)
    df_country = get_silver_table("Country").select("CountryId", "Name")

    # -------------------- Direct Match --------------------
    # Both joins are left joins, so every account row survives even when no
    # address is linked. An account with several linked addresses yields several
    # rows here; they are collapsed by the final dropDuplicates.
    df_direct = (
        df
        .join(
            df_addr_bronze,
            df["Id"] == df_addr_bronze["HouseholdAccountId_bronze"],
            how="left"
        )
        .join(
            df_addr_silver.withColumnRenamed("AddressId", "DirectAddressId"),
            df_addr_bronze["AddressSourceSystemId_bronze"] == col("SourceSystemId"),
            how="left"
        )
    )

    # -------------------- Fallback Match --------------------
    # Silver addresses decorated with their country name; columns are renamed
    # with an fb_ prefix so they cannot collide with the account's own columns.
    df_addr_silver_country = (
        df_addr_silver
        .join(df_country, df_addr_silver["CountryId"] == df_country["CountryId"], "left")
        .withColumnRenamed("AddressId", "FallbackAddressId")
        .withColumnRenamed("City", "fb_city")
        .withColumnRenamed("State", "fb_state")
        .withColumnRenamed("ZipCode", "fb_zip")
        .withColumnRenamed("Name", "fb_country_name")
    )

    # Plain equality is used on every field, so an account with a NULL billing
    # field gets no fallback match (NULL == x is never true).
    df_fallback = (
        df
        .join(
            df_addr_silver_country,
            (df["BillingCity"] == col("fb_city")) &
            (df["BillingState"] == col("fb_state")) &
            (df["BillingPostalCode"] == col("fb_zip")) &
            (df["BillingCountry"] == col("fb_country_name")) &
            (lit(source_id) == df_addr_silver_country["SourceId"]),
            how="left"
        )
    )

    from pyspark.sql.functions import coalesce

    # Bring the fallback result next to the direct result (joined back on the
    # account Id) and prefer the direct address when both exist.
    result = (
        df_direct
        .join(
            df_fallback.select(col("Id").alias("fallback_Id"), col("FallbackAddressId")),
            df_direct["Id"] == col("fallback_Id"),
            how="left"
        )
        .withColumn("AddressId", coalesce(col("DirectAddressId"), col("FallbackAddressId")))
    )

    # Final silver shape. dropDuplicates keeps one arbitrary row per account
    # when the joins above fanned out.
    new_df = (
        result.select(
            expr("uuid()").alias("AccountId"),
            col("AddressId"),
            col("CreatedDate"),
            col("npsp__Matching_Gift_Email__c").alias("Email"),
            col("LastModifiedDate").alias("ModifiedDate"),
            col("Name"),
            lit(source_id).alias("SourceId"),
            col("Id").alias("SourceSystemId")
        )
        .dropDuplicates(["SourceId", "SourceSystemId"])
    )

    return new_df

# Account: bronze Account rows shaped by EnrichAccount and upserted into silver.
accountTable = CdfTable(
    # --- source ---
    source_lakehouse=bronze_lakehouse_name,
    source_table_name="Account",
    source_primary_key="Id",
    columns=[
        "Id",
        "Name",
        "CreatedDate",
        "LastModifiedDate",
        "npsp__Matching_Gift_Email__c",
        "BillingCity",
        "BillingState",
        "BillingPostalCode",
        "BillingCountry",
    ],
    # --- target ---
    target_lakehouse=silver_lakehouse_name,
    target_table_name="Account",
    enrich_func=EnrichAccount,
    hard_delete=True,
    # Upsert keyed on (SourceId, SourceSystemId). AccountId and CreatedDate are
    # only written on insert.
    merge_sql_template=f"""
    MERGE INTO {silver_lakehouse_name}.Account AS target
    USING latestSnapshot_Account AS source
    ON target.SourceId = source.SourceId AND target.SourceSystemId = source.SourceSystemId
    WHEN MATCHED THEN UPDATE SET
        target.ModifiedDate = source.ModifiedDate,
        target.Email = source.Email,
        target.AddressId = source.AddressId,
        target.Name = source.Name
    WHEN NOT MATCHED THEN INSERT (
        AccountId, AddressId, CreatedDate, Email, ModifiedDate,
        Name, SourceId, SourceSystemId
    ) VALUES (
        source.AccountId, source.AddressId, source.CreatedDate, source.Email,
        source.ModifiedDate, source.Name, source.SourceId, source.SourceSystemId
    )
    """,
)

ProcessCdfTable(accountTable, source_name)

### Transform: Contact

In [None]:
def EnrichContact(df):
    """Shape bronze Contact rows for silver and resolve each contact's AddressId.

    The address is resolved in two ways and the first non-NULL result wins:

    1. Direct: Contact.npsp__Current_Address__c -> silver Address.SourceSystemId.
    2. Fallback: equality on mailing city, state and postal code against silver
       Address rows that belong to this source (SourceId).

    Args:
        df: DataFrame with Id, FirstName, LastName, CreatedDate,
            LastModifiedDate, MailingCity, MailingState, MailingPostalCode,
            Email, Birthdate and npsp__Current_Address__c columns.

    Returns:
        DataFrame with columns ContactId, AddressId, CreatedDate, ModifiedDate,
        Email, FirstName, LastName, SourceSystemId, BirthDate, SourceId; one
        row per (SourceId, SourceSystemId).
    """
    from pyspark.sql.functions import coalesce

    df_contact = df.alias("c")
    df_address = get_silver_table("Address").select(
        "AddressId", "City", "State", "ZipCode", "SourceId", "SourceSystemId"
    )

    # Two independently named projections of silver Address. The previous
    # implementation built two separate joins and then pulled a column of the
    # fallback DataFrame into the direct one with withColumn. A column of
    # another DataFrame cannot be attached that way, so the "fallback" value
    # was really the direct match again and the fallback never took effect.
    # Joining both lookups onto the same contact rows makes each result a real
    # column of the joined frame.
    df_direct_addr = df_address.select(
        col("AddressId").alias("DirectAddressId"),
        col("SourceSystemId").alias("DirectAddressSourceSystemId"),
    )
    df_fallback_addr = df_address.select(
        col("AddressId").alias("FallbackAddressId"),
        col("City").alias("fb_city"),
        col("State").alias("fb_state"),
        col("ZipCode").alias("fb_zip"),
        col("SourceId").alias("fb_source_id"),
    )

    df_joined = (
        df_contact
        # First try to match by npsp__Current_Address__c -> Address.SourceSystemId
        .join(
            df_direct_addr,
            col("c.npsp__Current_Address__c") == col("DirectAddressSourceSystemId"),
            how="left"
        )
        # Otherwise fall back to the address fields (City, State, Zip, SourceId)
        .join(
            df_fallback_addr,
            (col("c.MailingCity") == col("fb_city")) &
            (col("c.MailingState") == col("fb_state")) &
            (col("c.MailingPostalCode") == col("fb_zip")) &
            (lit(source_id) == col("fb_source_id")),
            how="left"
        )
        # Use the direct match if available, otherwise the fallback
        .withColumn("AddressId", coalesce(col("DirectAddressId"), col("FallbackAddressId")))
    )

    # The fallback join can fan out when several addresses share the same
    # city/state/zip; dropDuplicates keeps one row per contact.
    new_df = df_joined.select(
        expr("uuid()").alias("ContactId"),
        col("AddressId"),
        col("c.CreatedDate"),
        col("c.LastModifiedDate").alias("ModifiedDate"),
        col("c.Email"),
        col("c.FirstName"),
        col("c.LastName"),
        col("c.Id").alias("SourceSystemId"),
        col("c.Birthdate").alias("BirthDate"),
        lit(source_id).alias("SourceId")
    ).dropDuplicates(["SourceId", "SourceSystemId"])

    return new_df


# Contact: bronze Contact rows shaped by EnrichContact and upserted into silver.
contactTable = CdfTable(
    # --- source ---
    source_lakehouse=bronze_lakehouse_name,
    source_table_name="Contact",
    source_primary_key="Id",
    columns=[
        "Id",
        "FirstName",
        "LastName",
        "CreatedDate",
        "LastModifiedDate",
        "MailingCity",
        "MailingState",
        "MailingPostalCode",
        "Email",
        "Birthdate",
        "npsp__Current_Address__c",
    ],
    # --- target ---
    target_lakehouse=silver_lakehouse_name,
    target_table_name="Contact",
    enrich_func=EnrichContact,
    hard_delete=True,
    # Upsert keyed on (SourceId, SourceSystemId). ContactId and CreatedDate are
    # only written on insert.
    merge_sql_template=f"""
    MERGE INTO {silver_lakehouse_name}.Contact AS target
    USING latestSnapshot_Contact AS source
    ON target.SourceId = source.SourceId AND target.SourceSystemId = source.SourceSystemId
    WHEN MATCHED THEN UPDATE SET
        target.ModifiedDate = source.ModifiedDate,
        target.Email        = source.Email,
        target.FirstName    = source.FirstName,
        target.LastName     = source.LastName,
        target.AddressId    = source.AddressId,
        target.BirthDate    = source.BirthDate
    WHEN NOT MATCHED THEN INSERT (
        ContactId, AddressId, CreatedDate, ModifiedDate,
        Email, FirstName, LastName, SourceSystemId, SourceId, BirthDate
    ) VALUES (
        source.ContactId, source.AddressId, source.CreatedDate, source.ModifiedDate,
        source.Email, source.FirstName, source.LastName, source.SourceSystemId, source.SourceId, source.BirthDate
    )
    """,
)

ProcessCdfTable(contactTable, source_name)

### Transform: Constituent

In [None]:
from pyspark.sql.functions import col, lit, expr, when
import pyspark.sql.functions as F
from pyspark.sql import Window
from pyspark.sql import functions as Ff

# Rebuilds silver Constituent as the union of:
#   * one row per silver Contact (typed 'Individual'), and
#   * one row per silver Account (typed by the bronze Account.Type).
# The result is merged on the null-safe (ContactId, AccountId) pair; target rows
# with no counterpart in the staged set are deleted.

# ------------------------------------------------------------------#
# 1.  LOAD SOURCE TABLES
# ------------------------------------------------------------------#
df_contact_silver = get_silver_table("Contact") \
    .select("ContactId", "SourceId")

df_account_silver = get_silver_table("Account") \
    .select("AccountId", "SourceSystemId", "SourceId")

df_account_bronze = get_bronze_table("Account") \
    .select(col("Id").alias("SourceSystemId"), col("Type"))

df_const_type = get_silver_table("ConstituentType") \
    .select("Name", "ConstituentTypeId")

type_map = {r["Name"]: r["ConstituentTypeId"] for r in df_const_type.collect()}

# ------------------------------------------------------------------#
# 2.  CONTACT-BASED CONSTITUENTS
# ------------------------------------------------------------------#
# 'Individual' may be absent from ConstituentType. That is harmless when
# silver Contact is empty, because no row will use the value. When contacts
# exist it is a real data problem, so fail with an actionable message instead
# of a bare KeyError.
individual_type_id = type_map.get("Individual")
if individual_type_id is None and not df_contact_silver.limit(1).isEmpty():
    raise ValueError(
        "ConstituentType 'Individual' is missing but silver Contact has rows; "
        "run the ConstituentType transform before refreshing Constituent."
    )

df_contact_constituents = (
    df_contact_silver
    .withColumn("ConstituentId", expr("uuid()"))
    .withColumn("AccountId", lit(None).cast("string"))
    # Cast keeps the column string-typed even when the id is None.
    .withColumn("ConstituentTypeId", lit(individual_type_id).cast("string"))
)

# ------------------------------------------------------------------#
# 3.  ACCOUNT-BASED CONSTITUENTS
# ------------------------------------------------------------------#

df_account_enriched = (
    df_account_silver
      .join(df_account_bronze, "SourceSystemId")            # adds Type
      .join(df_const_type, df_account_bronze["Type"] == df_const_type["Name"], "left")  # adds ConstituentTypeId
)

df_account_constituents = (
    df_account_enriched
      .withColumn("ConstituentId", expr("uuid()"))
      .withColumn("ContactId", lit(None).cast("string"))
      .select("ConstituentId", "AccountId", "ContactId", "ConstituentTypeId", "SourceId")
)

# ------------------------------------------------------------------#
# 4.  UNION  & TEMP VIEW
# ------------------------------------------------------------------#
df_union = (
    df_contact_constituents
      .select("ConstituentId", "AccountId", "ContactId", "ConstituentTypeId", "SourceId")
      .unionByName(df_account_constituents)
)

# Keep exactly one staged row per (ContactId, AccountId) so the MERGE never
# sees two source rows for the same target row.
window = Window.partitionBy("ContactId", "AccountId").orderBy(F.col("ConstituentId"))
df_union = df_union.withColumn("row_number", F.row_number().over(window)).filter(F.col("row_number") == 1).drop("row_number")


df_union.createOrReplaceTempView("staged_Constituent")

try:
    # ------------------------------------------------------------------#
    # 5.  MERGE INTO SILVER.Constituent
    # ------------------------------------------------------------------#
    # <=> is the null-safe equality operator: each staged row has exactly one
    # of ContactId / AccountId set, and NULL must match NULL on the other.
    spark.sql(f"""
MERGE INTO {silver_lakehouse_name}.Constituent AS tgt
USING staged_Constituent AS src
ON   tgt.ContactId  <=> src.ContactId
AND  tgt.AccountId  <=> src.AccountId
WHEN MATCHED THEN UPDATE SET
      tgt.ConstituentTypeId = src.ConstituentTypeId
WHEN NOT MATCHED THEN INSERT (
      ConstituentId, AccountId, ContactId, ConstituentTypeId
) VALUES (
      src.ConstituentId, src.AccountId, src.ContactId, src.ConstituentTypeId
)
WHEN NOT MATCHED BY SOURCE THEN DELETE
""")

    # ------------------------------------------------------------------#
    # 6.  LOG
    # ------------------------------------------------------------------#
    log_merge_metrics(f"{silver_lakehouse_name}.Constituent", "Constituent")
finally:
    # ------------------------------------------------------------------#
    # 7.  CLEAN UP TEMP VIEW
    # ------------------------------------------------------------------#
    # Runs even when the MERGE fails so the session is not left with a stale view.
    spark.catalog.dropTempView("staged_Constituent")

logging.info("✅ Constituent table refreshed successfully.")


### Transform: ConstituentEmailEngagement

In [None]:
def EnrichConstituentEmailEngagement(df):
    """Shape bronze EmailMessageRelation rows into ConstituentEmailEngagement rows.

    Only "ToAddress" relations whose object type is Account or Contact are
    kept. The silver ConstituentId is resolved through
    RelationId -> Contact.SourceSystemId -> Constituent.ContactId, and the
    silver EmailEngagementId through
    EmailMessageId -> EmailEngagement.SourceSystemId. Every lookup is a left
    join, so unmatched rows survive with null ids.

    Args:
        df: DataFrame exposing RelationType, RelationObjectType, RelationId,
            EmailMessageId, Id, CreatedDate and SystemModstamp.

    Returns:
        DataFrame with one row per (SourceId, SourceSystemId), each carrying a
        freshly generated ConstituentEmailEngagementId.
    """
    def _silver(table_name):
        # All lookups in this function come from the silver lakehouse.
        return spark.read.table(f"{silver_lakehouse_name}.{table_name}")

    is_recipient = col("RelationType") == "ToAddress"
    targets_party = col("RelationObjectType").isin("Account", "Contact")
    relations = df.filter(is_recipient & targets_party)

    contacts = _silver("Contact").select(
        col("ContactId"),
        col("SourceSystemId").alias("ContactSourceSystemId"),
        col("Email"),
    )
    constituents = _silver("Constituent").select("ConstituentId", "ContactId")
    engagements = _silver("EmailEngagement").select(
        col("EmailEngagementId"),
        col("SourceSystemId").alias("EmailMessageId_silver"),
    )

    # RelationId is only ever compared with Contact.SourceSystemId, so
    # Account-typed relations pass the filter but keep a null ConstituentId.
    joined = (
        relations
        .join(contacts, relations["RelationId"] == contacts["ContactSourceSystemId"], "left")
        .join(constituents, contacts["ContactId"] == constituents["ContactId"], "left")
        .join(engagements, relations["EmailMessageId"] == engagements["EmailMessageId_silver"], "left")
    )

    output_columns = [
        expr("uuid()").alias("ConstituentEmailEngagementId"),
        col("ConstituentId"),
        col("CreatedDate"),
        col("Id").alias("SourceSystemId"),
        lit(source_id).alias("SourceId"),
        col("EmailEngagementId"),
        col("EmailMessageId").alias("EmailId"),
        col("SystemModstamp").alias("ModifiedDate"),
        col("CreatedDate").alias("SendDate"),
    ]

    # Lookup joins may fan out; collapse back to one row per source record.
    return joined.select(*output_columns).dropDuplicates(["SourceId", "SourceSystemId"])

# Load definition for silver ConstituentEmailEngagement, sourced from the bronze
# EmailMessageRelation table and shaped by EnrichConstituentEmailEngagement.
constituentEmailEngagementTable = CdfTable(
    source_table_name="EmailMessageRelation",
    source_primary_key="Id",
    target_table_name="ConstituentEmailEngagement",
    # "RelationAddress" is requested here but never read by the enrich function.
    columns=[
        "Id", "RelationId", "RelationType", "RelationObjectType", "RelationAddress", "EmailMessageId", "CreatedDate", "SystemModstamp"
    ],
    # Upsert keyed on (SourceId, SourceSystemId): matched rows refresh
    # ModifiedDate, EmailEngagementId, ConstituentId and SendDate; unmatched
    # rows are inserted. CreatedDate and EmailId are only written on insert.
    # `latestSnapshot_EmailMessageRelation` is presumably a view registered by
    # ProcessCdfTable from the enriched frame -- confirm against its definition.
    merge_sql_template=f"""
    MERGE INTO {silver_lakehouse_name}.ConstituentEmailEngagement AS target
    USING latestSnapshot_EmailMessageRelation AS source
    ON target.SourceId = source.SourceId AND target.SourceSystemId = source.SourceSystemId
    WHEN MATCHED THEN UPDATE SET
        target.ModifiedDate = source.ModifiedDate,
        target.EmailEngagementId = source.EmailEngagementId,
        target.ConstituentId = source.ConstituentId,
        target.SendDate = source.SendDate
    WHEN NOT MATCHED THEN INSERT (
        ConstituentEmailEngagementId, ConstituentId, CreatedDate, SourceSystemId, SourceId,
        SendDate, EmailEngagementId, ModifiedDate, EmailId
    ) VALUES (
        source.ConstituentEmailEngagementId, source.ConstituentId, source.CreatedDate, source.SourceSystemId, source.SourceId,
        source.SendDate, source.EmailEngagementId, source.ModifiedDate, source.EmailId
    )
    """,
    source_lakehouse=bronze_lakehouse_name,
    target_lakehouse=silver_lakehouse_name,
    enrich_func=EnrichConstituentEmailEngagement,
    hard_delete=True
)

# Run the load for this table.
ProcessCdfTable(constituentEmailEngagementTable, source_name)

### Transform: Event

In [None]:
from pyspark.sql.functions import col, lit, expr
from pyspark.sql.types import StringType, DecimalType

def EnrichEvent(df):
    """Shape bronze Event rows into silver Event rows.

    EventSubtype is matched against silver EventType.Name to obtain
    EventTypeId, and every row is stamped with the id of the silver Channel
    named "Events" (null when no such channel exists).

    Args:
        df: DataFrame exposing Id, EventSubtype, CreatedDate,
            LastModifiedDate, Subject and StartDateTime.

    Returns:
        DataFrame with one row per (SourceId, SourceSystemId), each carrying a
        freshly generated EventId.
    """
    event_types = get_silver_table("EventType").select(col("EventTypeId"), col("Name"))

    # At most one row: the channel whose name is exactly "Events".
    events_channel = (
        get_silver_table("Channel")
        .filter(col("Name") == "Events")
        .select(col("ChannelId").alias("EventsChannelId"))
        .limit(1)
    )

    typed = df.join(event_types, df["EventSubtype"] == event_types["Name"], "left")

    # No join condition on purpose: the single channel row (if any) is
    # attached to every event, and the left join keeps events when it is absent.
    with_channel = typed.join(events_channel, how="left")

    projected = with_channel.select(
        expr("uuid()").alias("EventId"),
        col("CreatedDate"),
        col("EventTypeId"),
        col("EventsChannelId").alias("ChannelId"),
        col("LastModifiedDate").alias("ModifiedDate"),
        col("Subject").alias("Name"),
        lit(source_id).alias("SourceId"),
        col("Id").alias("SourceSystemId"),
        col("StartDateTime").alias("StartDate"),
    )

    return projected.dropDuplicates(["SourceId", "SourceSystemId"])


# Load definition for silver Event, sourced from the bronze Event table and
# shaped by EnrichEvent.
eventTable = CdfTable(
    source_table_name="Event",
    source_primary_key="Id",
    target_table_name="Event",
    columns=[
        "Id", "EventSubtype", "CreatedDate", "LastModifiedDate", "Subject", "StartDateTime"
    ],
    # Upsert keyed on (SourceId, SourceSystemId). EventId and CreatedDate are
    # written on insert only; every other column is refreshed on match.
    merge_sql_template=f"""
    MERGE INTO {silver_lakehouse_name}.Event AS target
    USING latestSnapshot_Event AS source
    ON target.SourceId = source.SourceId AND target.SourceSystemId = source.SourceSystemId
    WHEN MATCHED THEN UPDATE SET
        target.ModifiedDate   = source.ModifiedDate,
        target.EventTypeId    = source.EventTypeId,
        target.Name           = source.Name,
        target.StartDate      = source.StartDate,
        target.ChannelId      = source.ChannelId
    WHEN NOT MATCHED THEN INSERT (
        EventId, CreatedDate, EventTypeId, ModifiedDate, Name, SourceId, SourceSystemId, StartDate, ChannelId
    ) VALUES (
        source.EventId, source.CreatedDate,
        source.EventTypeId, source.ModifiedDate, source.Name, source.SourceId, source.SourceSystemId, source.StartDate, source.ChannelId
    )
    """,
    source_lakehouse=bronze_lakehouse_name,
    target_lakehouse=silver_lakehouse_name,
    enrich_func=EnrichEvent,
    hard_delete=True
)

# Run the load for this table.
ProcessCdfTable(eventTable, source_name)

### Transform: ConstituentOpportunityStage

In [None]:
from pyspark.sql.functions import col, expr
import logging

def EnrichConstituentOpportunityStage(df):
    """Derive distinct (ConstituentId, OpportunityStageId) pairs from Opportunity rows.

    StageName is resolved to a silver OpportunityStageId, and the Salesforce
    ContactId / AccountId are translated to silver GUIDs. The constituent is
    then looked up by contact when a contact GUID was found, otherwise by
    account. Row counts are logged after each stage; every count is a
    separate Spark action.

    Args:
        df: DataFrame exposing StageName, ContactId and AccountId.

    Returns:
        DataFrame of unique, fully resolved pairs with a freshly generated
        ConstituentOpportunityStageId; rows with a null ConstituentId or
        OpportunityStageId are dropped.
    """
    # Lookup columns are aliased up front so the joins below are unambiguous.
    constituents = get_silver_table("Constituent").select(
        col("ConstituentId"),
        col("ContactId").alias("Lkp_ContactId"),
        col("AccountId").alias("Lkp_AccountId"),
    )
    stages = get_silver_table("OpportunityStage").select("Name", "OpportunityStageId")
    accounts = get_silver_table("Account").select(
        col("SourceSystemId").alias("AccountSourceSystemId"),
        col("AccountId").alias("AccountGuid"),
    )
    contacts = get_silver_table("Contact").select(
        col("SourceSystemId").alias("ContactSourceSystemId"),
        col("ContactId").alias("ContactGuid"),
    )

    logging.info(f"üîé Constituent rows: {constituents.count()}")
    logging.info(f"üîé OpportunityStage rows: {stages.count()}")
    logging.info(f"üîé Account rows: {accounts.count()}")
    logging.info(f"üîé Contact rows: {contacts.count()}")

    staged = df.join(stages, df["StageName"] == stages["Name"], how="left")
    logging.info(f"üîÑ After join with stage: {staged.count()}")

    # Source-system ids -> silver GUIDs, contact first and then account.
    resolved = (
        staged
        .join(contacts, staged["ContactId"] == contacts["ContactSourceSystemId"], how="left")
        .join(accounts, staged["AccountId"] == accounts["AccountSourceSystemId"], how="left")
    )
    logging.info(f"üîÑ After join with contact and account: {resolved.count()}")

    lookup_only = ("Lkp_ContactId", "Lkp_AccountId")

    # Primary path: rows that resolved to a contact GUID match on contact.
    via_contact = (
        resolved.filter(col("ContactGuid").isNotNull())
        .join(constituents, resolved["ContactGuid"] == constituents["Lkp_ContactId"], how="left")
        .drop(*lookup_only)
    )

    # Fallback path: rows without a contact GUID match on account.
    via_account = (
        resolved.filter(col("ContactGuid").isNull())
        .join(constituents, resolved["AccountGuid"] == constituents["Lkp_AccountId"], how="left")
        .drop(*lookup_only)
    )

    combined = via_contact.unionByName(via_account)

    logging.info(f"üîÑ After join with constituent: {combined.count()}")

    fully_resolved = col("ConstituentId").isNotNull() & col("OpportunityStageId").isNotNull()

    return (
        combined
        .select(
            expr("uuid()").alias("ConstituentOpportunityStageId"),
            col("ConstituentId"),
            col("OpportunityStageId"),
        )
        .dropDuplicates(["ConstituentId", "OpportunityStageId"])
        .filter(fully_resolved)
    )


# Load definition for silver ConstituentOpportunityStage, derived from bronze
# Opportunity rows by EnrichConstituentOpportunityStage.
constituentOpportunityStageTable = CdfTable(
    source_table_name="Opportunity",
    source_primary_key="Id",
    target_table_name="ConstituentOpportunityStage",
    columns=["Id", "AccountId", "ContactId", "StageName"],
    # Insert-only on the natural key (ConstituentId, OpportunityStageId); target
    # rows with no counterpart in the source view are deleted.
    # NOTE(review): if latestSnapshot_Opportunity only holds the rows changed
    # since the last run, the NOT MATCHED BY SOURCE branch would delete every
    # pair that did not change -- confirm what ProcessCdfTable puts in the view.
    merge_sql_template=f"""
    MERGE INTO {silver_lakehouse_name}.ConstituentOpportunityStage AS target
    USING latestSnapshot_Opportunity AS source
    ON target.ConstituentId = source.ConstituentId AND target.OpportunityStageId = source.OpportunityStageId
    WHEN NOT MATCHED THEN INSERT (
        ConstituentOpportunityStageId, ConstituentId, OpportunityStageId
    ) VALUES (
        source.ConstituentOpportunityStageId, source.ConstituentId, source.OpportunityStageId
    )
    WHEN NOT MATCHED BY SOURCE THEN DELETE
    """,
    source_lakehouse=bronze_lakehouse_name,
    target_lakehouse=silver_lakehouse_name,
    enrich_func=EnrichConstituentOpportunityStage,

)

# Run the load, then report the merge metrics for the target table.
ProcessCdfTable(constituentOpportunityStageTable, source_name)
log_merge_metrics(
    f"{silver_lakehouse_name}.ConstituentOpportunityStage",
    "ConstituentOpportunityStage"
)

### Transform: Letter

In [None]:
from pyspark.sql.functions import col, expr, lit, current_timestamp, when

def EnrichLetter(df):
    """Shape bronze Task rows with Subject "Send Letter" into silver Letter rows.

    Resolves WhoId to a constituent (via silver Contact), WhatId to a silver
    campaign, and maps TaskSubtype "Task" to the channel named "Direct Mail".
    All lookups are left joins, so unresolved references become nulls.
    CreatedDate and ModifiedDate are set to the processing timestamp.

    Args:
        df: DataFrame exposing Id, WhoId, WhatId, Subject, TaskSubtype and
            CompletedDateTime.

    Returns:
        DataFrame with one row per (SourceId, SourceSystemId), each carrying a
        freshly generated LetterId.
    """
    campaigns = get_silver_table("Campaign").select(
        col("CampaignId").alias("CampaignId_silver"),
        col("SourceSystemId").alias("CampaignSourceSystemId"),
    )
    contacts = get_silver_table("Contact").select(
        col("ContactId").alias("ContactId_silver"),
        col("SourceSystemId").alias("ContactSourceSystemId"),
    )
    constituents = get_silver_table("Constituent").select(
        "ConstituentId",
        col("ContactId").alias("ConstituentContactId"),
    )
    channels = get_silver_table("Channel").select(
        col("ChannelId").alias("ChannelId_silver"),
        col("Name").alias("ChannelName"),
    )

    letters = df.filter(col("Subject") == "Send Letter")

    # WhoId -> Contact -> Constituent, WhatId -> Campaign, then derive the
    # channel name (null for any subtype other than "Task").
    linked = (
        letters
        .join(contacts, letters["WhoId"] == contacts["ContactSourceSystemId"], "left")
        .join(constituents, contacts["ContactId_silver"] == constituents["ConstituentContactId"], "left")
        .join(campaigns, letters["WhatId"] == campaigns["CampaignSourceSystemId"], "left")
        .withColumn(
            "ResolvedChannelName",
            when(col("TaskSubtype") == "Task", lit("Direct Mail")),
        )
    )

    with_channel = linked.join(
        channels,
        linked["ResolvedChannelName"] == channels["ChannelName"],
        "left",
    )

    projected = with_channel.select(
        expr("uuid()").alias("LetterId"),
        col("CampaignId_silver").alias("CampaignId"),
        col("ChannelId_silver").alias("ChannelId"),
        col("ConstituentId"),
        col("CompletedDateTime").alias("SentDate"),
        current_timestamp().alias("CreatedDate"),
        col("Subject"),
        current_timestamp().alias("ModifiedDate"),
        lit(source_id).alias("SourceId"),
        col("Id").alias("SourceSystemId"),
    )

    return projected.dropDuplicates(["SourceId", "SourceSystemId"])


# Load definition for silver Letter, sourced from the bronze Task table and
# shaped by EnrichLetter (which keeps only "Send Letter" tasks).
letterTable = CdfTable(
    source_table_name="Task",
    source_primary_key="Id",
    target_table_name="Letter",
    columns=[
        "Id", "WhoId", "WhatId", "Subject", "TaskSubtype", "CompletedDateTime"
    ],
    # Upsert keyed on (SourceId, SourceSystemId). LetterId and CreatedDate are
    # written on insert only; every other column is refreshed on match.
    merge_sql_template=f"""
    MERGE INTO {silver_lakehouse_name}.Letter AS target
    USING latestSnapshot_Task AS source
    ON target.SourceId = source.SourceId AND target.SourceSystemId = source.SourceSystemId
    WHEN MATCHED THEN UPDATE SET
        target.ModifiedDate = source.ModifiedDate,
        target.SentDate     = source.SentDate,
        target.Subject      = source.Subject,
        target.CampaignId   = source.CampaignId,
        target.ChannelId    = source.ChannelId,
        target.ConstituentId= source.ConstituentId
    WHEN NOT MATCHED THEN INSERT (
        LetterId, CampaignId, ChannelId, ConstituentId, CreatedDate,
        Subject, ModifiedDate, SourceId, SourceSystemId, SentDate
    ) VALUES (
        source.LetterId, source.CampaignId, source.ChannelId, source.ConstituentId, source.CreatedDate,
        source.Subject, source.ModifiedDate, source.SourceId, source.SourceSystemId, source.SentDate
    )
    """,
    source_lakehouse=bronze_lakehouse_name,
    target_lakehouse=silver_lakehouse_name,
    enrich_func=EnrichLetter,
    hard_delete=True
)

# Run the load for this table.
ProcessCdfTable(letterTable, source_name)

### Transform: Opportunity

In [None]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, lit, broadcast
from pyspark.sql.functions import coalesce

def TransformOpportunity(df):
    """Shape bronze Opportunity rows into silver Opportunity rows.

    Resolves stage, type, campaign, contact and account references to their
    silver ids with left joins. The constituent is then attached by contact
    GUID when one was found, otherwise by account GUID; the two branches are
    joined separately and unioned rather than joined once on an OR condition.

    Args:
        df: DataFrame exposing Id, Name, StageName, Type, CampaignId,
            ContactId, AccountId, CloseDate, CreatedDate, ExpectedRevenue and
            LastModifiedDate.

    Returns:
        DataFrame with one row per (SourceSystemId, SourceId), each carrying a
        freshly generated OpportunityId.
    """
    campaigns = get_silver_table("Campaign").select(
        col("CampaignId").alias("CampaignGuid"),
        col("SourceSystemId").alias("CampaignSfdcId"),
    )
    contacts = get_silver_table("Contact").select(
        col("ContactId").alias("ContactGuid"),
        col("SourceSystemId").alias("ContactSfdcId"),
    )
    accounts = get_silver_table("Account").select(
        col("AccountId").alias("AccountGuid"),
        col("SourceSystemId").alias("AccountSfdcId"),
    )
    constituents = get_silver_table("Constituent").select(
        col("ConstituentId"),
        col("ContactId").alias("Lkp_ContactId"),
        col("AccountId").alias("Lkp_AccountId"),
    )
    stages = get_silver_table("OpportunityStage").select(
        col("Name").alias("StageName_lookup"),
        "OpportunityStageId",
    )
    opportunity_types = get_silver_table("OpportunityType").select(
        col("Name").alias("TypeName_lookup"),
        "OpportunityTypeId",
    )

    # Reference joins; the stage and type lookups are broadcast.
    base = df.join(broadcast(stages), df["StageName"] == stages["StageName_lookup"], how="left")
    base = base.join(broadcast(opportunity_types), df["Type"] == opportunity_types["TypeName_lookup"], how="left")
    base = base.join(campaigns, df["CampaignId"] == campaigns["CampaignSfdcId"], how="left")
    base = base.join(contacts, df["ContactId"] == contacts["ContactSfdcId"], how="left")
    base = base.join(accounts, df["AccountId"] == accounts["AccountSfdcId"], how="left")

    # Branch A: a contact GUID was resolved -> match the constituent on contact.
    has_contact = base.filter(col("ContactGuid").isNotNull())
    via_contact = has_contact.join(
        constituents,
        has_contact["ContactGuid"] == constituents["Lkp_ContactId"],
        "left",
    )

    # Branch B: no contact GUID -> fall back to matching on account.
    no_contact = base.filter(col("ContactGuid").isNull())
    via_account = no_contact.join(
        constituents,
        no_contact["AccountGuid"] == constituents["Lkp_AccountId"],
        "left",
    )

    enriched = via_contact.unionByName(via_account)

    projected = enriched.select(
        expr("uuid()").alias("OpportunityId"),
        col("CampaignGuid").alias("CampaignId"),
        col("CloseDate"),
        col("ConstituentId"),
        col("CreatedDate"),
        col("ExpectedRevenue"),
        col("LastModifiedDate").alias("ModifiedDate"),
        lit(source_id).alias("SourceId"),
        col("Id").alias("SourceSystemId"),
        col("OpportunityStageId"),
        col("OpportunityTypeId"),
        col("Name").alias("OpportunityName"),
    )

    return projected.dropDuplicates(["SourceSystemId", "SourceId"])


# Load definition for silver Opportunity, sourced from the bronze Opportunity
# table and shaped by TransformOpportunity.
opportunityTable = CdfTable(
    source_table_name="Opportunity",
    source_primary_key="Id",
    target_table_name="Opportunity",
    # "Amount", "RecordTypeId", "SystemModstamp" and
    # "npe03__Recurring_Donation__c" are requested here but are not read by
    # TransformOpportunity.
    columns=[
        "Id", "AccountId", "Amount", "CampaignId", "CloseDate", "ContactId",
        "CreatedDate", "ExpectedRevenue", "LastModifiedDate", "Name",
        "RecordTypeId", "StageName", "SystemModstamp", "Type", "npe03__Recurring_Donation__c"
    ],
    # Upsert keyed on (SourceSystemId, SourceId). OpportunityId and CreatedDate
    # are written on insert only; every other column is refreshed on match.
    merge_sql_template=f"""
    MERGE INTO {silver_lakehouse_name}.Opportunity AS target
    USING latestSnapshot_Opportunity AS source
    ON target.SourceSystemId = source.SourceSystemId AND target.SourceId = source.SourceId
    WHEN MATCHED THEN UPDATE SET
        target.ModifiedDate = source.ModifiedDate,
        target.CampaignId = source.CampaignId,
        target.CloseDate = source.CloseDate,
        target.ConstituentId = source.ConstituentId,
        target.ExpectedRevenue = source.ExpectedRevenue,
        target.OpportunityStageId = source.OpportunityStageId,
        target.OpportunityTypeId = source.OpportunityTypeId,
        target.OpportunityName = source.OpportunityName  

    WHEN NOT MATCHED THEN INSERT (
        OpportunityId, CampaignId, CloseDate,
        ConstituentId, CreatedDate, ExpectedRevenue, ModifiedDate,
        SourceId, SourceSystemId, OpportunityStageId, OpportunityTypeId, OpportunityName
    ) VALUES (
        source.OpportunityId, source.CampaignId, source.CloseDate,
        source.ConstituentId, source.CreatedDate, source.ExpectedRevenue, source.ModifiedDate,
        source.SourceId, source.SourceSystemId, source.OpportunityStageId, source.OpportunityTypeId, source.OpportunityName
    )
    """,
    source_lakehouse=bronze_lakehouse_name,
    target_lakehouse=silver_lakehouse_name,
    enrich_func=TransformOpportunity,
    hard_delete=True
)

# Run the load for this table.
ProcessCdfTable(opportunityTable, source_name)

### Transform: Participation

In [None]:
from pyspark.sql.functions import col, expr, lit
from datetime import datetime, timezone

def TransformParticipation(df):
    """Shape bronze VolunteerHours rows into silver Participation rows.

    Every row is tagged with the "Volunteering" ParticipationType; that type
    row is appended to the silver ParticipationType table first when it does
    not exist yet. The volunteer contact is resolved to a constituent through
    silver Contact.SourceSystemId (left join, so unresolved contacts yield a
    null ConstituentId).

    Args:
        df: DataFrame exposing Id, GW_Volunteers__Contact__c,
            GW_Volunteers__Start_Date__c, GW_Volunteers__End_Date__c,
            GW_Volunteers__Hours_Worked__c, CreatedDate and LastModifiedDate.

    Returns:
        DataFrame with one row per (SourceId, SourceSystemId), each carrying a
        freshly generated ParticipationId.
    """
    # Imported locally because this cell does not import uuid itself.
    import uuid

    constituent_df = get_silver_table("Constituent") \
        .select("ConstituentId", "ContactId")

    contact_df = get_silver_table("Contact") \
        .select("ContactId", "SourceSystemId")

    contact_constituent_df = contact_df.join(
        constituent_df,
        on="ContactId",
        how="left"
    )

    # Look up the "Volunteering" type with a single Spark action; first()
    # returns None when no such row exists.
    existing_type = (
        get_silver_table("ParticipationType")
        .filter(col("Name") == "Volunteering")
        .select("ParticipationTypeId")
        .first()
    )

    if existing_type is not None:
        participation_type_id = existing_type[0]
    else:
        # Seed the type row. The id is generated here so it can be reused
        # below without reading the table back, and one timestamp is shared so
        # CreatedDate and ModifiedDate are identical.
        participation_type_id = str(uuid.uuid4())
        now = datetime.now(timezone.utc)
        new_type_df = spark.createDataFrame(
            [(participation_type_id, "Volunteering", source_id, "Volunteering", now, now)],
            ["ParticipationTypeId", "Name", "SourceId", "SourceSystemId", "CreatedDate", "ModifiedDate"],
        )
        new_type_df.write.format("delta").mode("append").saveAsTable(f"{silver_lakehouse_name}.ParticipationType")

    # Volunteer contact (source-system id) -> silver contact -> constituent.
    df = df.join(contact_constituent_df, df["GW_Volunteers__Contact__c"] == contact_constituent_df["SourceSystemId"], "left")

    new_df = (
        df.select(
            expr("uuid()").alias("ParticipationId"),
            lit(True).alias("AttendedEvent"),
            col("ConstituentId"),
            col("CreatedDate"),
            col("GW_Volunteers__End_Date__c").alias("EndDate"),
            col("GW_Volunteers__Hours_Worked__c").alias("Hours"),
            col("LastModifiedDate").alias("ModifiedDate"),
            lit(participation_type_id).alias("ParticipationTypeId"),
            lit(source_id).alias("SourceId"),
            col("Id").alias("SourceSystemId"),
            col("GW_Volunteers__Start_Date__c").alias("StartDate")
        )
        .dropDuplicates(["SourceId", "SourceSystemId"])
    )

    return new_df


# Load definition for silver Participation, sourced from the bronze
# VolunteerHours table and shaped by TransformParticipation.
participationTable = CdfTable(
    source_table_name="VolunteerHours",
    source_primary_key="Id",
    target_table_name="Participation",
    columns=[
        "Id",
        "GW_Volunteers__Contact__c",
        "GW_Volunteers__Start_Date__c",
        "GW_Volunteers__End_Date__c",
        "GW_Volunteers__Hours_Worked__c",
        "GW_Volunteers__Volunteer_Campaign__c",
        "CreatedDate",
        "LastModifiedDate",
        "SystemModstamp"
    ],
    # Upsert keyed on (SourceId, SourceSystemId). ConstituentId is refreshed on
    # match, as in the other constituent-linked MERGE templates in this file:
    # otherwise a row first loaded before its constituent existed (null
    # ConstituentId), or whose volunteer contact later changed, would keep a
    # stale link forever.
    merge_sql_template=f"""
    MERGE INTO {silver_lakehouse_name}.Participation AS target
    USING latestSnapshot_VolunteerHours AS source
    ON target.SourceId = source.SourceId AND target.SourceSystemId = source.SourceSystemId
    WHEN MATCHED THEN UPDATE SET
        target.ModifiedDate = source.ModifiedDate,
        target.AttendedEvent = source.AttendedEvent,
        target.ConstituentId = source.ConstituentId,
        target.Hours = source.Hours,
        target.EndDate = source.EndDate,
        target.StartDate = source.StartDate

    WHEN NOT MATCHED THEN INSERT (
        ParticipationId, AttendedEvent, ConstituentId, CreatedDate,
        EndDate, Hours, ModifiedDate, ParticipationTypeId,
        SourceId, SourceSystemId, StartDate
    ) VALUES (
        source.ParticipationId, source.AttendedEvent, source.ConstituentId, source.CreatedDate,
        source.EndDate, source.Hours, source.ModifiedDate, source.ParticipationTypeId,
        source.SourceId, source.SourceSystemId, source.StartDate
    )
    """,
    source_lakehouse=bronze_lakehouse_name,
    target_lakehouse=silver_lakehouse_name,
    enrich_func=TransformParticipation,
    hard_delete=True
)

# Run the load for this table.
ProcessCdfTable(participationTable, source_name)

### Transform: Phonecall

In [None]:
from pyspark.sql.functions import col, expr, lit, current_timestamp, when

def EnrichPhoneCall(df):
    """Shape bronze Task rows with TaskSubtype "Call" into silver Phonecall rows.

    Resolves WhoId to a constituent (via silver Contact), WhatId to a silver
    campaign, and attaches the channel named "Phone Call". All lookups are
    left joins, so unresolved references become nulls. CreatedDate and
    ModifiedDate are set to the processing timestamp.

    Args:
        df: DataFrame exposing Id, WhoId, WhatId, Description, TaskSubtype and
            CompletedDateTime.

    Returns:
        DataFrame with one row per (SourceId, SourceSystemId), each carrying a
        freshly generated PhonecallId.
    """
    campaigns = get_silver_table("Campaign").select(
        col("CampaignId").alias("CampaignId_silver"),
        col("SourceSystemId").alias("CampaignSourceSystemId"),
    )
    contacts = get_silver_table("Contact").select(
        col("ContactId").alias("ContactId_silver"),
        col("SourceSystemId").alias("ContactSourceSystemId"),
    )
    constituents = get_silver_table("Constituent").select(
        col("ConstituentId"),
        col("ContactId").alias("ConstituentContactId"),
    )
    channels = get_silver_table("Channel").select(
        col("ChannelId").alias("ChannelId_silver"),
        col("Name").alias("ChannelName"),
    )

    calls = df.filter(col("TaskSubtype") == "Call")

    # WhoId -> Contact -> Constituent, WhatId -> Campaign, then derive the
    # channel name from the subtype.
    linked = (
        calls
        .join(contacts, calls["WhoId"] == contacts["ContactSourceSystemId"], "left")
        .join(constituents, contacts["ContactId_silver"] == constituents["ConstituentContactId"], "left")
        .join(campaigns, calls["WhatId"] == campaigns["CampaignSourceSystemId"], "left")
        .withColumn(
            "ResolvedChannelName",
            when(col("TaskSubtype") == "Call", lit("Phone Call")),
        )
    )

    with_channel = linked.join(
        channels,
        linked["ResolvedChannelName"] == channels["ChannelName"],
        "left",
    )

    projected = with_channel.select(
        expr("uuid()").alias("PhonecallId"),
        col("CampaignId_silver").alias("CampaignId"),
        col("ChannelId_silver").alias("ChannelId"),
        col("ConstituentId"),
        col("CompletedDateTime").alias("CallDate"),
        current_timestamp().alias("CreatedDate"),
        col("Description"),
        current_timestamp().alias("ModifiedDate"),
        lit(source_id).alias("SourceId"),
        col("Id").alias("SourceSystemId"),
    )

    return projected.dropDuplicates(["SourceId", "SourceSystemId"])


# Load definition for silver Phonecall, sourced from the bronze Task table and
# shaped by EnrichPhoneCall (which keeps only TaskSubtype == "Call").
phonecallTable = CdfTable(
    source_table_name="Task",
    source_primary_key="Id",
    target_table_name="Phonecall",
    # "Subject" is requested here but is not read by EnrichPhoneCall.
    columns=[
        "Id", "WhoId", "WhatId", "Subject", "Description", "TaskSubtype", "CompletedDateTime"
    ],
    # Upsert keyed on (SourceId, SourceSystemId). PhonecallId and CreatedDate
    # are written on insert only; every other column is refreshed on match.
    merge_sql_template=f"""
    MERGE INTO {silver_lakehouse_name}.Phonecall AS target
    USING latestSnapshot_Task AS source
    ON target.SourceId = source.SourceId AND target.SourceSystemId = source.SourceSystemId
    WHEN MATCHED THEN UPDATE SET
        target.ModifiedDate   = source.ModifiedDate,
        target.CallDate       = source.CallDate,
        target.CampaignId     = source.CampaignId,
        target.ChannelId      = source.ChannelId,
        target.ConstituentId  = source.ConstituentId,
        target.Description    = source.Description
    WHEN NOT MATCHED THEN INSERT (
        PhonecallId, CampaignId, ChannelId, ConstituentId, CreatedDate,
        Description, ModifiedDate, SourceId, SourceSystemId, CallDate
    ) VALUES (
        source.PhonecallId, source.CampaignId, source.ChannelId, source.ConstituentId, source.CreatedDate,
        source.Description, source.ModifiedDate, source.SourceId, source.SourceSystemId, source.CallDate
    )
    """,
    source_lakehouse=bronze_lakehouse_name,
    target_lakehouse=silver_lakehouse_name,
    enrich_func=EnrichPhoneCall,
    hard_delete=True
)

# Run the load for this table.
ProcessCdfTable(phonecallTable, source_name)


### Transform: Transaction

In [None]:
def EnrichTransaction(df):
    """Shape bronze Opportunity rows into silver Transaction rows.

    Only opportunities whose bronze RecordType name is "Donation", "Grant" or
    "In Kind" are kept. Account, contact and campaign references are resolved
    to silver ids with left joins; the channel is found by matching the bronze
    campaign Type against silver Channel.Name. The constituent found through
    the contact is preferred, with the one found through the account as the
    fallback.

    Args:
        df: DataFrame exposing Id, Name, AccountId, ContactId, Amount,
            CampaignId, RecordTypeId, CreatedDate, LastModifiedDate, CloseDate
            and npe03__Recurring_Donation__c.

    Returns:
        DataFrame with one row per (SourceId, SourceSystemId), each carrying a
        freshly generated TransactionId and a null OpportunityId.
    """
    def _read(lakehouse, table_name):
        return spark.read.table(f"{lakehouse}.{table_name}")

    def _constituents_as(id_suffix, key_suffix):
        # The same Constituent lookup is joined twice, so each copy gets its
        # own column names.
        return (
            constituents
            .withColumnRenamed("ConstituentId", f"ConstituentId_{id_suffix}")
            .withColumnRenamed("ContactId_constituent", f"ContactId_constituent_{key_suffix}")
            .withColumnRenamed("AccountId_constituent", f"AccountId_constituent_{key_suffix}")
        )

    # --- lookups -----------------------------------------------------------
    accounts = _read(silver_lakehouse_name, "Account").select("AccountId", "SourceSystemId")
    contacts = _read(silver_lakehouse_name, "Contact").select("ContactId", "SourceSystemId")
    constituents = _read(silver_lakehouse_name, "Constituent").select(
        col("ConstituentId"),
        col("AccountId").alias("AccountId_constituent"),
        col("ContactId").alias("ContactId_constituent"),
    )
    silver_campaigns = _read(silver_lakehouse_name, "Campaign").select("CampaignId", "SourceSystemId")
    channels = _read(silver_lakehouse_name, "Channel").select("ChannelId", col("Name").alias("ChannelName"))
    bronze_campaigns = _read(bronze_lakehouse_name, "Campaign").select(
        col("Id").alias("CampaignId_bronze"),
        col("Type").alias("CampaignType_bronze"),
    )
    record_types = _read(bronze_lakehouse_name, "RecordType").select(
        col("Id").alias("RecordTypeId_bronze"),
        col("Name").alias("RecordTypeName_bronze"),
    )

    # --- restrict to the allowed record types ------------------------------
    opportunities = df.withColumnRenamed("Name", "OpportunityName")
    donations = opportunities.join(
        record_types,
        opportunities["RecordTypeId"] == record_types["RecordTypeId_bronze"],
        "left",
    ).filter(col("RecordTypeName_bronze").isin(["Donation", "Grant", "In Kind"]))

    # --- source-system ids -> silver account / contact GUIDs ---------------
    with_account = donations.join(
        accounts.withColumnRenamed("AccountId", "AccountId_silver"),
        donations["AccountId"] == accounts["SourceSystemId"],
        "left",
    )
    with_parties = with_account.join(
        contacts.withColumnRenamed("ContactId", "ContactId_silver"),
        donations["ContactId"] == contacts["SourceSystemId"],
        "left",
    )

    # --- constituent via contact, and separately via account ---------------
    with_constituents = (
        with_parties
        .join(
            _constituents_as("Contact", "contact"),
            col("ContactId_silver") == col("ContactId_constituent_contact"),
            "left",
        )
        .join(
            _constituents_as("Account", "account"),
            col("AccountId_silver") == col("AccountId_constituent_account"),
            "left",
        )
    )

    # --- campaign (silver id), campaign type (bronze), channel -------------
    with_campaign = with_constituents.join(
        silver_campaigns.withColumnRenamed("CampaignId", "CampaignId_silver"),
        with_constituents["CampaignId"] == silver_campaigns["SourceSystemId"],
        "left",
    )
    with_campaign_type = with_campaign.join(
        bronze_campaigns,
        with_campaign["CampaignId"] == bronze_campaigns["CampaignId_bronze"],
        "left",
    )
    enriched = with_campaign_type.join(
        channels.withColumnRenamed("ChannelId", "ChannelId_silver"),
        with_campaign_type["CampaignType_bronze"] == channels["ChannelName"],
        "left",
    )

    # Recurring when the recurring-donation reference is present and non-empty.
    recurring_ref = col("npe03__Recurring_Donation__c")
    is_recurring = recurring_ref.isNotNull() & (recurring_ref != "")

    projected = enriched.select(
        expr("uuid()").alias("TransactionId"),
        col("Amount"),
        col("CampaignId_silver").alias("CampaignId"),
        col("ChannelId_silver").alias("ChannelId"),
        coalesce(col("ConstituentId_Contact"), col("ConstituentId_Account")).alias("ConstituentId"),
        col("CreatedDate"),
        is_recurring.alias("IsRecurring"),
        col("LastModifiedDate").alias("ModifiedDate"),
        col("OpportunityName").alias("Name"),
        expr("null").cast("string").alias("OpportunityId"),
        lit(source_id).alias("SourceId"),
        col("Id").alias("SourceSystemId"),
        col("CloseDate").alias("TransactionDate"),
    )

    return projected.dropDuplicates(["SourceId", "SourceSystemId"])

# ---- CdfTable ----
# Load definition for silver Transaction, sourced from the bronze Opportunity
# table and shaped by EnrichTransaction.
transactionTable = CdfTable(
    source_table_name="Opportunity",
    source_primary_key="Id",
    target_table_name="Transaction",
    # "Type" is requested here but is not read by EnrichTransaction, which
    # takes the campaign type from the bronze Campaign table instead.
    columns=[
        "Id", "Name", "AccountId", "ContactId", "Amount", "CampaignId", "RecordTypeId",
        "CreatedDate", "LastModifiedDate", "Type", "CloseDate", "npe03__Recurring_Donation__c"
    ],
    # Upsert keyed on (SourceId, SourceSystemId). TransactionId, CreatedDate and
    # OpportunityId are written on insert only; the rest is refreshed on match.
    merge_sql_template=f"""
    MERGE INTO {silver_lakehouse_name}.Transaction AS target
    USING latestSnapshot_Opportunity AS source
    ON target.SourceId = source.SourceId AND target.SourceSystemId = source.SourceSystemId
    WHEN MATCHED THEN UPDATE SET
        target.ModifiedDate     = source.ModifiedDate,
        target.Amount           = source.Amount,
        target.CampaignId       = source.CampaignId,
        target.ChannelId        = source.ChannelId,
        target.ConstituentId    = source.ConstituentId,
        target.Name             = source.Name,
        target.TransactionDate  = source.TransactionDate,
        target.IsRecurring      = source.IsRecurring
    WHEN NOT MATCHED THEN INSERT (
        TransactionId, Amount, CampaignId, ChannelId, ConstituentId, CreatedDate, IsRecurring,
        ModifiedDate, Name, OpportunityId, SourceId, SourceSystemId, TransactionDate
    ) VALUES (
        source.TransactionId, source.Amount, source.CampaignId, source.ChannelId, source.ConstituentId, source.CreatedDate,
        source.IsRecurring, source.ModifiedDate, source.Name, source.OpportunityId, source.SourceId,
        source.SourceSystemId, source.TransactionDate
    )
    """,
    source_lakehouse=bronze_lakehouse_name,
    target_lakehouse=silver_lakehouse_name,
    enrich_func=EnrichTransaction,
    hard_delete=True
)

# Run the load for this table.
ProcessCdfTable(transactionTable, source_name)

### Transform: SoftCredit

In [None]:
def EnrichSoftCredit(df):
    """Shape OpportunityContactRole rows into silver SoftCredit rows.

    Only rows whose ``Role`` equals "Soft Credit" are kept. Each remaining
    row is left-joined to three silver lookups:

    * Transaction, on ``OpportunityId`` == Transaction ``SourceSystemId``,
      which supplies ``TransactionId`` and ``Amount``;
    * Contact, on ``ContactId`` == Contact ``SourceSystemId``, which supplies
      the silver contact id;
    * Constituent, on that silver contact id, which supplies
      ``ConstituentId``.

    All joins are left joins, so a row without a match keeps nulls in the
    columns that lookup would have provided.

    Args:
        df: DataFrame of OpportunityContactRole rows. It must expose the
            columns read here: Id, OpportunityId, ContactId, CreatedDate,
            LastModifiedDate and Role.

    Returns:
        DataFrame with the columns SoftCreditId, ConstituentId, CreatedDate,
        ModifiedDate, SourceId, SourceSystemId, TransactionId and Amount,
        de-duplicated on (SourceId, SourceSystemId).
    """
    # Only keep rows where Role == "Soft Credit"
    soft_credits = df.filter(col("Role") == "Soft Credit")

    # Lookup columns are aliased up front so the joined frame never carries
    # two columns called SourceSystemId or ContactId.
    transactions = get_silver_table("Transaction").select(
        col("TransactionId"),
        col("SourceSystemId").alias("TransactionSourceSystemId"),  # Opportunity.Id
        col("Amount"),
    )
    contacts = get_silver_table("Contact").select(
        col("ContactId").alias("ContactId_silver"),
        col("SourceSystemId").alias("ContactSourceSystemId"),
    )
    constituents = get_silver_table("Constituent").select(
        col("ConstituentId"),
        col("ContactId").alias("ConstituentContactId"),
    )

    joined = (
        soft_credits
        # OCR.OpportunityId -> Transaction.SourceSystemId
        .join(
            transactions,
            soft_credits["OpportunityId"] == transactions["TransactionSourceSystemId"],
            "left",
        )
        # OCR.ContactId -> Contact.SourceSystemId
        .join(
            contacts,
            soft_credits["ContactId"] == contacts["ContactSourceSystemId"],
            "left",
        )
        # silver Contact.ContactId -> Constituent.ContactId
        .join(
            constituents,
            contacts["ContactId_silver"] == constituents["ConstituentContactId"],
            "left",
        )
    )

    new_df = (
        joined
        .select(
            expr("uuid()").alias("SoftCreditId"),
            col("ConstituentId"),
            col("CreatedDate"),
            col("LastModifiedDate").alias("ModifiedDate"),
            lit(source_id).alias("SourceId"),
            # Key each soft credit by the OpportunityContactRole's own Id.
            # Keying by OpportunityId collapsed every soft credit on the same
            # opportunity into a single row, both in the dropDuplicates below
            # and in any merge keyed on (SourceId, SourceSystemId).
            # NOTE(review): rows already stored with an OpportunityId as
            # SourceSystemId will no longer match that key - presumably the
            # silver SoftCredit table needs a one-off rebuild; confirm.
            col("Id").alias("SourceSystemId"),
            col("TransactionId"),
            col("Amount"),
        )
        # Guards against fan-out when a lookup matches more than one row.
        .dropDuplicates(["SourceId", "SourceSystemId"])
    )

    return new_df


# Pipeline definition: bronze "OpportunityContactRole" rows are enriched by
# EnrichSoftCredit and merged into the silver "SoftCredit" table.
softCreditTable = CdfTable(
    source_lakehouse=bronze_lakehouse_name,
    source_table_name="OpportunityContactRole",
    source_primary_key="Id",
    target_lakehouse=silver_lakehouse_name,
    target_table_name="SoftCredit",
    enrich_func=EnrichSoftCredit,
    hard_delete=True,
    # OpportunityContactRole columns handed to the pipeline.
    columns=[
        "Id",
        "OpportunityId",
        "ContactId",
        "CreatedDate",
        "LastModifiedDate",
        "Role",
    ],
    # Upsert keyed on (SourceId, SourceSystemId). On a match the statement
    # leaves SoftCreditId and CreatedDate untouched; they are only written
    # when a new row is inserted.
    merge_sql_template=f"""
    MERGE INTO {silver_lakehouse_name}.SoftCredit AS target
    USING latestSnapshot_OpportunityContactRole AS source
    ON target.SourceId = source.SourceId AND target.SourceSystemId = source.SourceSystemId
    WHEN MATCHED THEN UPDATE SET
        target.ModifiedDate   = source.ModifiedDate,
        target.ConstituentId  = source.ConstituentId,
        target.Amount         = source.Amount,
        target.TransactionId  = source.TransactionId
    WHEN NOT MATCHED THEN INSERT (
        SoftCreditId, Amount, ConstituentId, CreatedDate,
        ModifiedDate, SourceId, SourceSystemId, TransactionId
    ) VALUES (
        source.SoftCreditId, source.Amount, source.ConstituentId, source.CreatedDate,
        source.ModifiedDate, source.SourceId, source.SourceSystemId, source.TransactionId
    )
    """,
)

# Run the pipeline for this table.
ProcessCdfTable(softCreditTable, source_name)

### UpdateSourceSystemIdMapping

In [None]:
from pyspark.sql.functions import col, broadcast
from datetime import datetime, timezone
import logging

def UpdateSourceSystemIdMapping(mapping_table_full_name: str) -> None:
    """Append missing (SourceId, SourceSystemId, SourceTable) rows to the mapping table.

    Every managed table in the silver lakehouse that has the columns
    ``SourceId``, ``SourceSystemId`` and ``<TableName>Id`` is scanned. Rows
    whose (SourceId, SourceSystemId, SourceTable) combination is not yet in
    the mapping table are appended, with ``<TableName>Id`` stored as
    ``SilverRecordId``.

    Processing is best-effort per table: an error in one table is logged
    with its traceback and the loop moves on to the next table. Tables that
    failed are listed in a closing warning instead of the success message.

    Args:
        mapping_table_full_name: Name of the mapping table, passed as-is to
            ``spark.table`` and ``saveAsTable``.
    """
    # Imported here so the function does not rely on an earlier notebook cell
    # having imported `lit`; a missing name would otherwise be swallowed by
    # the per-table handler below and every table would silently be skipped.
    from pyspark.sql.functions import lit

    logging.info(f"üîç Loading existing records from {mapping_table_full_name}...")
    existing_mappings = (
        spark.table(mapping_table_full_name)
        .select("SourceId", "SourceSystemId", "SourceTable")
        .distinct()
        .persist()
    )

    required_columns = {"SourceId", "SourceSystemId"}
    failed_tables = []

    try:
        logging.info(f"üìã Listing all tables in {silver_lakehouse_name} lakehouse...")
        tables = [
            t.name for t in spark.catalog.listTables(silver_lakehouse_name)
            if t.tableType.lower() == "managed"
        ]

        logging.info(f"‚úÖ Found {len(tables)} tables for analysis.")

        for table_name in tables:
            try:
                df = get_silver_table(table_name)
                schema_fields = [f.name for f in df.schema.fields]
                id_column = f"{table_name}Id"

                # Skip tables that cannot contribute a mapping row.
                if not required_columns.issubset(schema_fields):
                    continue
                if id_column not in schema_fields:
                    continue

                logging.info(f"üîÑ Processing table: {table_name}")

                df_ids = (
                    df.select(
                        col("SourceId"),
                        col("SourceSystemId"),
                        col(id_column).alias("SilverRecordId"),
                    )
                    .dropna(subset=["SourceId", "SourceSystemId"])
                    .dropDuplicates(["SourceId", "SourceSystemId"])
                    .withColumn("SourceTable", lit(table_name))
                )

                # Anti-join keeps only the combinations not yet mapped.
                df_new = df_ids.join(
                    broadcast(existing_mappings),
                    on=["SourceId", "SourceSystemId", "SourceTable"],
                    how="left_anti"
                )

                count_new = df_new.count()
                if count_new > 0:
                    logging.info(f"‚ûï Inserting {count_new} new records into {mapping_table_full_name}")
                    df_new.write.format("delta").mode("append").saveAsTable(mapping_table_full_name)
                else:
                    logging.info("0Ô∏è‚É£ No new records to insert.")

            except Exception as e:
                # Deliberately best-effort: remember the failure, keep the
                # traceback in the log, and continue with the next table.
                failed_tables.append(table_name)
                logging.warning(f"‚õî Error while processing table {table_name}: {e}", exc_info=True)
    finally:
        # Release the cached snapshot even if listing the tables fails.
        existing_mappings.unpersist()

    if failed_tables:
        logging.warning(
            f"SourceSystemIdMapping update finished with errors in "
            f"{len(failed_tables)} table(s): {', '.join(failed_tables)}"
        )
    else:
        logging.info("‚úÖ Done. SourceSystemIdMapping is up to date.")

# [TEMPORARY SOLUTION] Truncate table
# The mapping table is emptied and then rebuilt on every run. Order matters:
# the table name is resolved first, the table is truncated, and only then is
# it repopulated. Because the table is empty when UpdateSourceSystemIdMapping
# starts, its anti-join against existing mappings filters nothing out and all
# qualifying silver rows are appended again.
mapping_table_full_name = get_full_table_name(silver_lakehouse_name, "SourceSystemIdMapping")
spark.sql(f"TRUNCATE TABLE {mapping_table_full_name}")
UpdateSourceSystemIdMapping(mapping_table_full_name)